class Company::Mapping::BasicTokenizer

Public Class Methods

new(ignorePunctuation = true, ignoreCase = true)
# File lib/company/mapping/document_utils/basic_tokenizer.rb, line 7
# Creates a tokenizer and records its two options.
#
# @param ignorePunctuation [Boolean] stored in @doIgnorePunctuation; when
#   truthy, #tokenize does not emit characters that are neither word
#   characters nor whitespace.
# @param ignoreCase [Boolean] stored in @doIgnoreCase; when truthy, input
#   text is downcased before it is tokenized.
def initialize(ignorePunctuation = true, ignoreCase = true)
  @doIgnorePunctuation = ignorePunctuation
  @doIgnoreCase = ignoreCase
end

Public Instance Methods

to_s()
# File lib/company/mapping/document_utils/basic_tokenizer.rb, line 11
# Describes the tokenizer and its current option values.
#
# @return [String] text of the form
#   "{BasicTokenizer: (IgnoresPunctuation: <flag>, IgnoresCase: <flag>)}"
def to_s
  punctuation_flag = @doIgnorePunctuation
  case_flag = @doIgnoreCase
  "{BasicTokenizer: (IgnoresPunctuation: #{punctuation_flag}, IgnoresCase: #{case_flag})}"
end
tokenize(text)
# File lib/company/mapping/document_utils/basic_tokenizer.rb, line 15
# Splits +text+ into an array of tokens.
#
# The input is first passed through #tranform (string conversion and, when
# case is ignored, downcasing). Each maximal run of word characters (+\w+)
# becomes one token. Whitespace is always discarded. Every other character
# is emitted as its own single-character token unless punctuation is being
# ignored.
#
# The previous hand-written scanner advanced the index one extra position
# after finishing a word, which silently dropped a punctuation character
# immediately following a word (e.g. the "," in "hello, world") when
# punctuation was supposed to be kept. Scanning with a regular expression
# avoids the manual index bookkeeping altogether.
#
# @param text [#to_s] the text to tokenize; nil yields an empty array
# @return [Array<String>] tokens in order of appearance
def tokenize(text)
  text = tranform(text)
  # Word runs only, or word runs plus any single non-word, non-space char.
  pattern = @doIgnorePunctuation ? /\w+/ : /\w+|[^\w\s]/
  text.scan(pattern)
end

Private Instance Methods

tranform(text)
# File lib/company/mapping/document_utils/basic_tokenizer.rb, line 41
# Normalizes input before tokenization: converts it to a String and, when
# the tokenizer ignores case, lowercases it.
#
# @param text [#to_s] value to normalize
# @return [String] the string form of +text+, downcased if @doIgnoreCase
#   is truthy
def tranform(text)
  string = text.to_s
  return string unless @doIgnoreCase

  string.downcase
end