module Spellchecker::Tokenizer

Constants

BLANK_REGEXP
DOT
LINEBREAK
NULL_POS
NULL_TOKEN
PAIR_POST
PAIR_PRE
PRE_N_POST
SIMPLE_POST
SIMPLE_PRE
SPLITTABLES
SPLITTABLES_REGEXP
WORD_REGEXP

Public Instance Methods

call(str) click to toggle source

Tokenizes a string. (The source comment also carries the linter directive `rubocop:disable Metrics/AbcSize, Metrics/MethodLength, Metrics/PerceivedComplexity`.)

@param str [String] string to be tokenized.

@return [Spellchecker::Tokenizer::List]

# File lib/spellchecker/tokenizer.rb, line 30
# Splits +str+ into tokens by walking it one character at a time.
#
# Rules, checked in this order for every character:
# * a character matching BLANK_REGEXP ends the pending token and is dropped;
# * a splittable character (see #splitable?) is
#   - kept as the first character of a new token when it is a DOT, no token
#     is pending and the next character is a word character,
#   - emitted as a token of its own when it is a LINEBREAK or is not
#     surrounded by word characters on both sides,
#   - otherwise kept inside the pending token;
# * any other character is appended to the pending token.
#
# Each token is built with Token.new(text, index), where index is the
# position in +str+ of the token's first character.
#
# @param str [String] string to be tokenized.
# @return [Spellchecker::Tokenizer::List] the tokens, in order of appearance.
def call(str)
  chars = str.chars
  list = Tokenizer::List.new
  acc = []
  pos = 0

  chars.each_with_index do |char, i|
    if char.match?(BLANK_REGEXP)
      list << Token.new(acc.join, pos) unless acc.empty?
      acc.clear
    elsif splitable?(char)
      # The first character has no predecessor. Indexing with i - 1 at i == 0
      # would wrap around to the LAST character of the string.
      prev_char = i.zero? ? nil : chars[i - 1]
      next_is_wordchar = word_char?(chars[i + 1])

      if acc.empty? && char == DOT && next_is_wordchar
        pos = i
        acc << char
      elsif !word_char?(prev_char) || !next_is_wordchar || char == LINEBREAK
        list << Token.new(acc.join, pos) unless acc.empty?
        list << Token.new(char, i)

        acc.clear
      else
        # Keep the start index correct even if nothing is pending yet.
        pos = i if acc.empty?
        acc << char
      end
    else
      pos = i if acc.empty?
      acc << char
    end
  end

  # Flush whatever is still pending once the input is exhausted.
  list << Token.new(acc.join, pos) unless acc.empty?

  list
end
splitable?(char) click to toggle source

@param char [String] @return [Boolean]

# File lib/spellchecker/tokenizer.rb, line 73
# Reports whether +char+ is a candidate for being split off as its own token.
#
# @param char [String] character to test.
# @return [Boolean] true when +char+ equals LINEBREAK or matches
#   SPLITTABLES_REGEXP, false otherwise.
def splitable?(char)
  return true if char == LINEBREAK

  SPLITTABLES_REGEXP.match?(char)
end
word_char?(char) click to toggle source

@param char [String, nil] @return [Boolean, nil] nil when char is nil

# File lib/spellchecker/tokenizer.rb, line 79
# Reports whether +char+ matches WORD_REGEXP.
#
# @param char [String, nil] character to test; nil is accepted so callers
#   can pass an out-of-range lookup directly.
# @return [Boolean, nil] result of the match, or nil when +char+ is nil.
def word_char?(char)
  return if char.nil?

  char.match?(WORD_REGEXP)
end