class TinyClassifier::Tokenizer

Constants

TOKENIZERS

Attributes

type[RW]

Public Class Methods

new(params = nil)
# File lib/tiny-classifier/tokenizer.rb, line 22
# Builds a tokenizer.
#
# params - optional Hash-like object; only its :type entry is read.
#
# @type ends up as params[:type] when that is truthy; otherwise any value
# already held is kept, and :none is used when there is none.
def initialize(params = nil)
  @type = params[:type] if params
  # Fall back to the pass-through type when nothing usable was supplied.
  @type = :none unless @type
end

Public Instance Methods

tokenize(input)
# File lib/tiny-classifier/tokenizer.rb, line 29
# Tokenizes +input+ according to the configured @type.
#
# The type is compared case-insensitively via its string form, so :mecab,
# "mecab" and "MeCab" all select the MeCab tokenizer. Any other type
# (including nil) returns +input+ unchanged.
def tokenize(input)
  kind = @type.to_s.downcase
  return tokenize_by_mecab(input) if kind == "mecab"

  input
end

Private Instance Methods

tokenize_by_mecab(input)
# File lib/tiny-classifier/tokenizer.rb, line 39
# Splits +input+ with Natto::MeCab and keeps only the terms whose feature
# string starts with 名詞 (noun), 形容詞 (adjective) or 動詞 (verb).
#
# Returns the kept surface forms joined by single spaces, with leading and
# trailing whitespace stripped.
def tokenize_by_mecab(input)
  # Loaded lazily so the gem is only needed when this tokenizer is used.
  require "natto"

  content_words = []
  Natto::MeCab.new.parse(input) do |node|
    content_words << node.surface if /\A(名詞|形容詞|動詞)/.match?(node.feature)
  end
  content_words.join(" ").strip
end