class Suika::Tagger

Tagger is a class that tokenizes Japanese text.

@example

require 'suika'

tagger = Suika::Tagger.new
tagger.parse('すもももももももものうち').each { |token| puts token }

# すもも  名詞,一般,*,*,*,*,すもも,スモモ,スモモ
# も      助詞,係助詞,*,*,*,*,も,モ,モ
# もも    名詞,一般,*,*,*,*,もも,モモ,モモ
# も      助詞,係助詞,*,*,*,*,も,モ,モ
# もも    名詞,一般,*,*,*,*,もも,モモ,モモ
# の      助詞,連体化,*,*,*,*,の,ノ,ノ
# うち    名詞,非自立,副詞可能,*,*,*,うち,ウチ,ウチ

Constants

DICTIONARY_KEY
DICTIONARY_PATH
INT_MAX

Attributes

trie[R]

Public Class Methods

new() click to toggle source

Create a new tagger by loading the built-in binary dictionary.

# File lib/suika/tagger.rb, line 26
# Create a new tagger by loading the built-in binary dictionary.
#
# The dictionary file is checked against DICTIONARY_KEY before it is
# unmarshalled, so a modified or corrupted file is rejected up front.
#
# @raise [IOError] if the SHA1 digest of the dictionary file differs from DICTIONARY_KEY.
def initialize
  actual_digest = Digest::SHA1.file(DICTIONARY_PATH).to_s
  raise IOError, 'SHA1 digest of dictionary file does not match.' if DICTIONARY_KEY != actual_digest

  # The dictionary is a gzip-compressed Marshal dump.
  payload = Zlib::GzipReader.open(DICTIONARY_PATH, &:read)
  @sysdic = Marshal.load(payload)

  # Rebuild the double-array trie from the array stored under :trie.
  @trie = DartsClone::DoubleArray.new
  @trie.set_array(@sysdic[:trie])
end

Public Instance Methods

inspect() click to toggle source
# File lib/suika/tagger.rb, line 84
# Return the same string as #to_s.
#
# NOTE(review): presumably overridden so that the default inspect output does
# not dump the whole loaded dictionary held in @sysdic -- confirm with the author.
#
# @return [String]
def inspect
  to_s
end
parse(sentence) click to toggle source

Parse the given sentence.

@param sentence [String] Japanese text to be parsed.
@return [Array<String>]

# File lib/suika/tagger.rb, line 37
# Parse the given sentence.
#
# Builds a lattice of candidate words over the sentence (dictionary words found
# through the trie plus unknown-word candidates derived from character types)
# and hands it to #viterbi.
#
# @param sentence [String] Japanese text to be parsed.
# @return [Array<String>] the result of #viterbi for the built lattice.
def parse(sentence)
  terminal = sentence.length
  lattice = Lattice.new(terminal)
  start = 0

  while start < terminal
    # Default advance: jump to the end unless a shorter candidate is found below.
    step = terminal - start

    # Known words: dictionary entries that are prefixes of the remaining text.
    result = trie.common_prefix_search(sentence[start..-1] || '')
    words, indices = result
    if !result.empty? && !words.empty?
      step = INT_MAX
      words.each_with_index do |surface, idx|
        features[indices[idx]].each do |el|
          lattice.insert(start, start + surface.length, surface, false,
                         el[0].to_i, el[1].to_i, el[2].to_i, el[3..-1])
        end
        step = [step, surface.length].min
      end
    end

    # Unknown word: starts as the single character at the current position.
    head = sentence[start] || ''
    char_cate = CharDef.char_category(head)
    char_type = CharDef.char_type(head)
    word = head.dup
    if char_cate[:invoke]
      span = char_cate[:group] ? CharDef::MAX_GROUPING_SIZE : char_cate[:length]
      unk_terminal = [start + span, terminal].min
      # Extend the candidate while the following characters share the same type.
      (start + 1...unk_terminal).each do |pos|
        ch = sentence[pos] || ''
        break unless char_type == CharDef.char_type(ch)

        word << ch
      end
    end

    word_end = start + word.length
    unknowns[char_type].each do |el|
      lattice.insert(start, word_end, word, true, el[0].to_i, el[1].to_i, el[2].to_i, el[3..-1])
    end

    # Advance by the shortest candidate inserted at this position.
    step = [step, word.length].min
    start += step
  end

  viterbi(lattice)
end

Private Instance Methods

connect_cost(r_id, l_id) click to toggle source
# File lib/suika/tagger.rb, line 106
# Look up the connection cost stored in the dictionary for a pair of ids.
#
# @param r_id [Integer] first index into the :concosts table (callers pass the left node's right_id).
# @param l_id [Integer] second index into the :concosts table (callers pass the right node's left_id).
# @return the entry at @sysdic[:concosts][r_id][l_id].
def connect_cost(r_id, l_id)
  cost_row = @sysdic[:concosts][r_id]
  cost_row[l_id]
end
features() click to toggle source
# File lib/suika/tagger.rb, line 98
# Feature table of the loaded dictionary.
#
# #parse indexes this table with the values returned by the trie search and
# iterates over the entries found there.
#
# @return the object stored under :features in @sysdic.
def features
  @sysdic[:features]
end
unknowns() click to toggle source
# File lib/suika/tagger.rb, line 102
# Unknown-word table of the loaded dictionary.
#
# #parse indexes this table with a character type from CharDef and iterates
# over the entries found there.
#
# @return the object stored under :unknowns in @sysdic.
def unknowns
  @sysdic[:unknowns]
end
viterbi(lattice) click to toggle source
# File lib/suika/tagger.rb, line 110
# Find the minimum-cost path through the lattice and format it as token strings.
#
# The forward pass stores, on every node, the cheapest accumulated cost
# (min_cost) and the predecessor that achieves it (min_prev). The backward
# pass then follows min_prev links from the EOS node back to the start.
#
# @param lattice [Lattice] lattice whose end_nodes[0][0] is the BOS sentinel and
#   whose begin_nodes[-1][0] is the EOS sentinel.
# @return [Array<String>] one "surface\tattr1,attr2,..." string per node on the
#   best path, in sentence order, without the BOS/EOS sentinels.
def viterbi(lattice)
  bos = lattice.end_nodes[0][0]
  bos.min_cost = 0
  bos.min_prev = nil

  # Forward pass: for each node starting at position n, pick the cheapest
  # predecessor among the nodes ending at position n.
  (lattice.length + 1).times do |n|
    lattice.begin_nodes[n].each do |rnode|
      rnode.min_cost = INT_MAX
      rnode.min_prev = nil
      lattice.end_nodes[n].each do |lnode|
        cost = lnode.min_cost + connect_cost(lnode.right_id, rnode.left_id) + rnode.cost
        next unless cost < rnode.min_cost

        rnode.min_cost = cost
        rnode.min_prev = lnode
      end
    end
  end

  # Backward pass: the walk starts at EOS's predecessor, so EOS itself is never
  # visited; only the BOS sentinel has to be filtered out.
  eos = lattice.begin_nodes[-1][0]
  node = eos.min_prev
  res = []
  until node.nil?
    # Compare by identity rather than by surface text: a real token whose
    # surface happens to be "BOS" or "EOS" must not be dropped from the result.
    res.push("#{node.surface}\t#{node.attrs.join(',')}") unless node.equal?(bos)
    node = node.min_prev
  end

  res.reverse
end