class Company::Mapping::BasicTokenizer
Public Class Methods
new(ignorePunctuation = true, ignoreCase = true)
click to toggle source
# File lib/company/mapping/document_utils/basic_tokenizer.rb, line 7
#
# Builds a tokenizer and records its two option flags.
#
# ignorePunctuation:: stored in @doIgnorePunctuation (defaults to true).
# ignoreCase::        stored in @doIgnoreCase (defaults to true).
def initialize(ignorePunctuation = true, ignoreCase = true)
  @doIgnorePunctuation = ignorePunctuation
  @doIgnoreCase = ignoreCase
end
Public Instance Methods
to_s()
click to toggle source
# File lib/company/mapping/document_utils/basic_tokenizer.rb, line 11
#
# Returns a human-readable description of this tokenizer that shows the
# current values of the punctuation and case flags.
def to_s
  punctuation_flag = @doIgnorePunctuation
  case_flag = @doIgnoreCase
  "{BasicTokenizer: (IgnoresPunctuation: #{punctuation_flag}, IgnoresCase: #{case_flag})}"
end
tokenize(text)
click to toggle source
# File lib/company/mapping/document_utils/basic_tokenizer.rb, line 15
#
# Splits +text+ into an array of string tokens.
#
# The input is first passed through #tranform (string conversion and,
# when @doIgnoreCase is set, downcasing). Then:
# * whitespace (/\s/) separates tokens and is never emitted;
# * each maximal run of word characters (/\w+/) becomes one token;
# * every other character becomes its own single-character token,
#   unless @doIgnorePunctuation is set, in which case it is dropped.
#
# Returns a new Array of Strings (empty when there is nothing to emit).
def tokenize(text)
  # A single scan replaces the manual index loop. That loop advanced the
  # index one extra time after each word, silently dropping a punctuation
  # character that directly followed a word even when punctuation was
  # supposed to be kept.
  pattern = @doIgnorePunctuation ? /\w+/ : /\w+|[^\w\s]/
  tranform(text).scan(pattern)
end
Private Instance Methods
tranform(text)
click to toggle source
# File lib/company/mapping/document_utils/basic_tokenizer.rb, line 41
#
# Converts +text+ to a String via #to_s and, when @doIgnoreCase is set,
# returns it downcased; otherwise returns the converted string as is.
def tranform(text)
  string = text.to_s
  return string unless @doIgnoreCase

  string.downcase
end