class RNlp::Tokenize

it copes only with Japanese

Public Instance Methods

tokenize(input) click to toggle source
# File lib/r_nlp/tokenize.rb, line 7
def tokenize(input)
  natto = Natto::MeCab.new
  # array for token
  token = Array.new
  # make morphological analysis
  natto.parse(input) do |n|
    # word surface and word speech tag
    surface = n.surface
    tag = n.feature.split(',')[0]
    # 単語が(.||。)の場合は['。', '記号']をpush, それ以外の場合は単語の表出系と品詞タグをpush
    if tag == '助動詞'
      token[token.size-1][0] += surface
    else
      (surface != nil) ? token.push([surface, tag]) : token.push(['。', '記号']) if(surface != '。' && surface != '.')
    end
  end
  if token[token.size-1][0] == '。'
    token.pop
  end
  return token
end