class CBETA::CharCount

Public Class Methods

new(xml_root) click to toggle source
# File lib/cbeta/char_count.rb, line 2
# Sets up a counter over a tree of XML files.
#
# @param xml_root [String] base directory; stat_all lists it and
#   stat_canon joins canon names onto it
def initialize(xml_root)
  @xml_root = xml_root
  # Totals keyed by work ID; entries are created lazily by stat_file.
  @result = Hash.new
end

Public Instance Methods

char_count(canon=nil) click to toggle source
# File lib/cbeta/char_count.rb, line 7
# Runs the count and hands back the accumulated totals.
#
# @param canon [String, nil] name of one canon folder under the XML root;
#   when nil every canon folder is processed instead
# @return [Hash] work ID => character count accumulated on this instance
def char_count(canon=nil)
  if canon.nil?
    stat_all
  else
    stat_canon(canon)
  end
  @result
end

Private Instance Methods

handle_node(e) click to toggle source
# File lib/cbeta/char_count.rb, line 15
# Dispatches a single XML node to the matching handler.
#
# Comment nodes are ignored, text nodes go to handle_text, and element
# nodes are routed by tag name.
#
# @param e [Object] node responding to comment?, text? and name
# @return [Object, nil] whatever the selected handler returns; nil for
#   ignored nodes
def handle_node(e)
  return if e.comment?
  return handle_text(e) if e.text?

  case e.name
  when 'foreign', 'mulu', 'rdg', 'reg', 'sic'
    # These elements are skipped together with everything beneath them.
    nil
  when 'g'
    # A <g> element always counts as one character; its children are not visited.
    @result[@work] += 1
  when 'note'
    handle_note(e)
  when 't'
    handle_t(e)
  else
    traverse(e)
  end
end
handle_note(e) click to toggle source
# File lib/cbeta/char_count.rb, line 28
# Descends into a note only when its 'place' attribute is 'inline' or
# 'interlinear'; every other note is left uncounted.
#
# @param e [Object] note element supporting e['place']
# @return [Object, nil] result of traverse, or nil when the note is skipped
def handle_note(e)
  return unless %w(inline interlinear).include?(e['place'])

  traverse(e)
end
handle_t(e) click to toggle source
# File lib/cbeta/char_count.rb, line 34
# Descends into a <t> element unless it carries a 'place' attribute whose
# value contains the substring 'foot'.
#
# @param e [Object] element supporting has_attribute? and e['place']
# @return [Object, nil] result of traverse, or nil when the element is skipped
def handle_t(e)
  foot_placed = e.has_attribute?('place') && e['place'].include?('foot')
  traverse(e) unless foot_placed
end
handle_text(e) click to toggle source
# File lib/cbeta/char_count.rb, line 41
# Adds the length of a text node to the running total of the current work.
#
# Nothing is counted when the chomped text is empty or when the node's
# parent element is named 'app'.
#
# @param e [Object] text node responding to content and parent
# @return [Integer, nil] the updated total for the current work, or nil
#   when the node was skipped
def handle_text(e)
  text = e.content.chomp
  return if text.empty? || e.parent.name == 'app'

  # CBETA XML contains superfluous line breaks inside running text;
  # strip them so they are not counted as characters.
  @result[@work] += text.delete("\n\r").size
end
stat_all() click to toggle source
# File lib/cbeta/char_count.rb, line 52
# Processes every canon folder found directly under the XML root, in
# sorted order. Entries starting with '.' and the 'schema' entry are skipped.
#
# @return [Array<String>] the sorted directory listing that was iterated
def stat_all
  names = Dir.entries(@xml_root).sort
  names.each do |name|
    skip = name.start_with?('.') || name == 'schema'
    stat_canon(name) unless skip
  end
end
stat_canon(canon) click to toggle source
# File lib/cbeta/char_count.rb, line 60
# Processes every volume folder of one canon, in sorted order, after
# printing a progress line. Entries starting with '.' are skipped.
#
# @param canon [String, nil] canon folder name under the XML root; nil is a no-op
# @return [Array<String>, nil] the sorted listing that was iterated, or nil
#   when canon is nil
def stat_canon(canon)
  return if canon.nil?

  puts 'stat canon: ' + canon
  canon_dir = File.join(@xml_root, canon)
  volumes = Dir.entries(canon_dir).sort
  volumes.each do |vol|
    stat_vol(File.join(canon_dir, vol)) unless vol.start_with?('.')
  end
end
stat_file(fn) click to toggle source
# File lib/cbeta/char_count.rb, line 71
# Counts the characters of one XML file into the total of its work.
#
# The work ID is the file's basename without '.xml', with the volume digits
# and the 'n' separator removed when the name has the form
# <capital letter><2-3 digits>n<rest> (e.g. 'T01n0001' becomes 'T0001').
# Every ID beginning with 'T0220' is folded into the single work 'T0220'.
#
# @param fn [String] path of the XML file
# @return [Object] whatever traverse returns for the document body
def stat_file(fn)
  work_id = File.basename(fn, '.xml').sub(/^([A-Z])\d{2,3}n(.*)$/, '\1\2')
  work_id = 'T0220' if work_id.start_with?('T0220')
  @work = work_id

  # First file of a work: announce it and start its total at zero.
  unless @result.key?(@work)
    puts "stat work: #{@work}"
    @result[@work] = 0
  end

  traverse(CBETA.open_xml(fn).at_xpath('/TEI/text/body'))
end
stat_vol(vol_folder) click to toggle source
# File lib/cbeta/char_count.rb, line 85
# Processes every file of one volume folder, in sorted order. Entries
# starting with '.' are skipped.
#
# @param vol_folder [String] path of the volume directory
# @return [Array<String>] the sorted directory listing that was iterated
def stat_vol(vol_folder)
  files = Dir.entries(vol_folder).sort
  files.each do |name|
    next if name.start_with?('.')

    stat_file(File.join(vol_folder, name))
  end
end
traverse(e) click to toggle source
# File lib/cbeta/char_count.rb, line 93
# Visits each direct child of a node, in order, via handle_node.
#
# @param e [Object] node responding to children
# @return [Object] whatever each returns on e.children
def traverse(e)
  e.children.each do |child|
    handle_node(child)
  end
end