module Spellchecker::Tokenizer
Constants
- BLANK_REGEXP
- DOT
- LINEBREAK
- NULL_POS
- NULL_TOKEN
- PAIR_POST
- PAIR_PRE
- PRE_N_POST
- SIMPLE_POST
- SIMPLE_PRE
- SPLITTABLES
- SPLITTABLES_REGEXP
- WORD_REGEXP
Public Instance Methods
call(str)
click to toggle source
Tokenizes the given string. (The source disables the RuboCop cops Metrics/AbcSize, Metrics/MethodLength and Metrics/PerceivedComplexity for this method.) @param str [String] string to be tokenized. @return [Spellchecker::Tokenizer::List]
# File lib/spellchecker/tokenizer.rb, line 30
# Walks +str+ character by character and collects tokens, each built from its
# text and the index of its first character.
#
# Blank characters end the current token and are dropped. A splittable
# character becomes a token of its own unless it sits between two word
# characters (then it stays inside the surrounding token) or is a DOT that
# opens a new token directly before a word character (then it starts that
# token). A LINEBREAK is always emitted as its own token.
#
# @param str [String] string to be tokenized.
# @return [Spellchecker::Tokenizer::List]
def call(str)
  chars = str.chars
  list = Tokenizer::List.new
  buffer = []
  pos = 0

  chars.each_with_index do |char, i|
    if char.match?(BLANK_REGEXP)
      list << Token.new(buffer.join, pos) unless buffer.empty?
      buffer.clear
    elsif splitable?(char)
      next_is_word = word_char?(chars[i + 1])
      # At index 0 there is no previous character. Without this guard
      # chars[i - 1] becomes chars[-1], i.e. the LAST character of the string,
      # and a leading splittable would be glued to the following word whenever
      # the string happens to end in a word character.
      prev_is_word = i.positive? && word_char?(chars[i - 1])

      if buffer.empty? && char == DOT && next_is_word
        pos = i
        buffer << char
      elsif !prev_is_word || !next_is_word || char == LINEBREAK
        list << Token.new(buffer.join, pos) unless buffer.empty?
        list << Token.new(char, i)
        buffer.clear
      else
        buffer << char
      end
    else
      pos = i if buffer.empty?
      buffer << char
    end
  end

  # Flush whatever is still pending once the input is exhausted.
  list << Token.new(buffer.join, pos) unless buffer.empty?
  list
end
splitable?(char)
click to toggle source
@param char [String] @return [Boolean]
# File lib/spellchecker/tokenizer.rb, line 73
# Tells whether +char+ matches SPLITTABLES_REGEXP or equals LINEBREAK.
#
# @param char [String]
# @return [Boolean]
def splitable?(char)
  return true if SPLITTABLES_REGEXP.match?(char)

  char == LINEBREAK
end
word_char?(char)
click to toggle source
@param char [String, nil] @return [Boolean, nil] nil when char is nil
# File lib/spellchecker/tokenizer.rb, line 79
# Tells whether +char+ matches WORD_REGEXP. A nil +char+ yields nil, so
# callers can pass an out-of-range lookup directly.
#
# @param char [String, nil]
# @return [Boolean, nil]
def word_char?(char)
  return if char.nil?

  char.match?(WORD_REGEXP)
end