class Flashtext::KeywordProcessor

Attributes

_keyword[RW]
_white_space_chars[RW]
case_sensitive[RW]
keyword_trie_hash[RW]
word_boundaries[RW]

Public Class Methods

new(case_sensitive = false)
# File lib/flashtext/keyword_processor.rb, line 7
# Sets up an empty keyword processor.
#
# @param case_sensitive [Boolean] stored as-is in the +case_sensitive+
#   attribute; defaults to false.
def initialize(case_sensitive = false)
  # Sentinel key under which a trie node stores the clean name of the keyword
  # that ends at that node.
  self._keyword = '_keyword_'
  # The control characters must be double-quoted: inside single quotes Ruby
  # keeps "\t", "\n" and "\a" as a literal backslash followed by a letter, so
  # the set would never contain an actual tab, newline or bell character.
  self._white_space_chars = Set.new(['.', "\t", "\n", "\a", ' ', ','])
  self.keyword_trie_hash = {}
  self.case_sensitive = case_sensitive
  # Characters treated as part of a word (digits, ASCII letters, underscore).
  self.word_boundaries = Set.new("0".upto("9").to_a + "A".upto("Z").to_a + "a".upto("z").to_a + ["_"])
end

Public Instance Methods

add_keyword(keyword, clean_name = nil)
# File lib/flashtext/keyword_processor.rb, line 15
# Inserts a keyword into the trie, one nested hash per character, and stores
# its clean name under the +_keyword+ sentinel key of the final node.
#
# @param keyword [String, nil] text to insert; downcased first unless the
#   processor is case sensitive.
# @param clean_name [String, nil] value stored for the keyword; defaults to
#   the keyword as given (before any downcasing).
# @return [String, nil] the stored clean name, or nil when nothing was added.
def add_keyword(keyword, clean_name = nil)
  # An empty string is truthy in Ruby, so without this guard an empty keyword
  # would write its clean name onto the trie root and make the root itself
  # look like the end of a keyword.
  return nil if !keyword || keyword.empty?

  clean_name ||= keyword
  keyword = keyword.downcase unless case_sensitive

  node = keyword_trie_hash
  # Walk down the trie, creating child nodes for characters not seen before.
  keyword.each_char { |char| node = (node[char] ||= {}) }
  node[_keyword] = clean_name
end
add_keywords_from_hash(keyword_hash)
# File lib/flashtext/keyword_processor.rb, line 35
# Registers many keywords at once from a hash that maps each clean name to an
# array of keywords. Keys and keywords are converted with +to_s+ before being
# passed to +add_keyword+.
#
# @param keyword_hash [Hash] clean name => Array of keywords.
# @return [Hash] the hash that was passed in.
# @raise [ArgumentError] if +keyword_hash+ is not a Hash, or if one of its
#   values is not an Array. Entries iterated before the offending one have
#   already been added by then.
def add_keywords_from_hash(keyword_hash)
  raise ArgumentError, "#{keyword_hash} is not hash. argument expected: Hash" unless keyword_hash.instance_of?(Hash)

  keyword_hash.each do |clean_name, keywords|
    # Report the offending value itself. Looking up the literal string
    # 'clean_name' in the hash (as was done before) almost always gives nil
    # and therefore an empty, useless message.
    raise ArgumentError, "#{keywords.inspect} is not array. expected: Array" unless keywords.instance_of?(Array)

    keywords.each { |keyword| add_keyword(keyword.to_s, clean_name.to_s) }
  end
end
extract_keywords(sentence)
# File lib/flashtext/keyword_processor.rb, line 45
# Scans a sentence and collects the clean names of the keywords found in it,
# in order of appearance. When several keywords start at the same position the
# longest one wins.
#
# @param sentence [String, nil] text to scan; downcased first unless the
#   processor is case sensitive.
# @return [Array] clean names of the matched keywords; empty for a nil or
#   empty sentence.
def extract_keywords(sentence)
  keywords_extracted = []
  # The guard has to return explicitly; as a bare expression it was a no-op
  # and a nil sentence went on to raise NoMethodError below.
  return keywords_extracted unless sentence

  sentence = sentence.downcase unless case_sensitive
  current_hash = keyword_trie_hash

  sequence_end_pos = 0
  idx = 0
  sentence_len = sentence.length

  while idx < sentence_len
    char = sentence[idx]
    if !word_boundaries.member?(char)
      # `char` is not a word character: a keyword may end right here, or a
      # multi-word keyword may continue through this character.
      if current_hash.has_key?(_keyword) || current_hash.has_key?(char)
        longest_sequence_found = nil
        is_longer_seq_found = false

        if current_hash.has_key?(_keyword)
          longest_sequence_found = current_hash[_keyword]
          sequence_end_pos = idx
        end

        # Look ahead from this position for a longer keyword.
        if current_hash.has_key?(char)
          current_hash_continued = current_hash[char]

          idy = idx + 1
          while idy < sentence_len
            inner_char = sentence[idy]
            if !word_boundaries.member?(inner_char) && current_hash_continued.has_key?(_keyword)
              # A longer keyword ends at this non-word character; keep
              # updating so that the longest candidate is retained.
              longest_sequence_found = current_hash_continued[_keyword]
              sequence_end_pos = idy
              is_longer_seq_found = true
            end
            break unless current_hash_continued.has_key?(inner_char)

            current_hash_continued = current_hash_continued[inner_char]
            idy += 1
          end
          # The look-ahead ran off the end of the sentence while on a keyword.
          if idy == sentence_len && current_hash_continued.has_key?(_keyword)
            longest_sequence_found = current_hash_continued[_keyword]
            sequence_end_pos = idy
            is_longer_seq_found = true
          end
          # Resume the outer scan where the longer keyword ended.
          idx = sequence_end_pos if is_longer_seq_found
        end
        current_hash = keyword_trie_hash # restart matching from the trie root
        keywords_extracted << longest_sequence_found if longest_sequence_found
      else
        current_hash = keyword_trie_hash
      end
    elsif current_hash.has_key?(char)
      # The current trie node has a child for this word character: descend.
      current_hash = current_hash[char]
    else
      # This word cannot match any keyword: restart from the trie root and
      # skip ahead to the next non-word character (or the end of the sentence).
      current_hash = keyword_trie_hash
      idx += 1 while idx < sentence_len && word_boundaries.member?(sentence[idx])
    end
    # The sentence ends while sitting on a complete keyword.
    if idx + 1 >= sentence_len && current_hash.has_key?(_keyword)
      keywords_extracted << current_hash[_keyword]
    end
    idx += 1
  end
  keywords_extracted
end
replace_keywords(sentence)
# File lib/flashtext/keyword_processor.rb, line 134
# Returns a copy of the sentence in which every keyword found in the trie is
# replaced by its clean name. Text that does not match is copied from the
# original sentence, so its casing is preserved. When several keywords start
# at the same position the longest one wins.
#
# @param sentence [String, nil] text to process; matching is done on a
#   downcased copy unless the processor is case sensitive.
# @return [String, nil] the rewritten sentence; a nil or empty sentence is
#   returned unchanged.
def replace_keywords(sentence)
  return sentence if sentence.nil? || sentence.empty?

  new_sentence = ""
  original_sentence = sentence
  sentence = sentence.downcase unless case_sensitive
  current_word = ""          # original text buffered since the last flush
  current_hash = keyword_trie_hash
  current_white_space = ""   # non-word character that ended the match
  sequence_end_pos = 0
  idx = 0
  sentence_len = sentence.length

  while idx < sentence_len
    char = sentence[idx]
    current_word += original_sentence[idx]

    if !word_boundaries.member?(char)
      # `char` is not a word character: a keyword may end right here, or a
      # multi-word keyword may continue through this character.
      current_white_space = char
      if current_hash.has_key?(_keyword) || current_hash.has_key?(char)
        longest_sequence_found = nil
        is_longer_seq_found = false
        if current_hash.has_key?(_keyword)
          longest_sequence_found = current_hash[_keyword]
          sequence_end_pos = idx
        end

        # Look ahead from this position for a longer keyword.
        if current_hash.has_key?(char)
          current_hash_continued = current_hash[char]
          current_word_continued = current_word
          idy = idx + 1
          while idy < sentence_len
            inner_char = sentence[idy]
            current_word_continued += original_sentence[idy]
            if !word_boundaries.member?(inner_char) && current_hash_continued.has_key?(_keyword)
              # A longer keyword ends at this non-word character.
              current_white_space = inner_char
              longest_sequence_found = current_hash_continued[_keyword]
              sequence_end_pos = idy
              is_longer_seq_found = true
            end
            break unless current_hash_continued.has_key?(inner_char)

            current_hash_continued = current_hash_continued[inner_char]
            idy += 1
          end
          # The look-ahead ran off the end of the sentence while on a keyword;
          # there is no trailing character to re-emit in that case.
          if idy == sentence_len && current_hash_continued.has_key?(_keyword)
            current_white_space = ""
            longest_sequence_found = current_hash_continued[_keyword]
            sequence_end_pos = idy
            is_longer_seq_found = true
          end
          if is_longer_seq_found
            # Resume the outer scan where the longer keyword ended.
            idx = sequence_end_pos
            current_word = current_word_continued
          end
        end
        current_hash = keyword_trie_hash
        # Emit the clean name plus the character that ended the keyword, or
        # the buffered original text when nothing matched.
        new_sentence += if longest_sequence_found
                          longest_sequence_found + current_white_space
                        else
                          current_word
                        end
        current_word = ''
        current_white_space = ''
      else
        current_hash = keyword_trie_hash
        new_sentence += current_word
        current_word = ''
        current_white_space = ''
      end
    elsif current_hash.has_key?(char)
      # The current trie node has a child for this word character: descend.
      current_hash = current_hash[char]
    else
      # This word cannot match any keyword: copy the rest of it, up to and
      # including the next non-word character, straight to the output.
      current_hash = keyword_trie_hash
      idy = idx + 1
      while idy < sentence_len
        char = sentence[idy]
        current_word += original_sentence[idy]
        break unless word_boundaries.member?(char)

        idy += 1
      end
      idx = idy
      new_sentence += current_word
      current_word = ""
      current_white_space = ""
    end
    if idx + 1 >= sentence_len
      if current_hash.has_key?(_keyword)
        # The sentence ends while sitting on a complete keyword.
        new_sentence += current_hash[_keyword]
      else
        # The sentence ends on a partial trie match (a keyword prefix). Flush
        # the buffered text, otherwise the final word is silently dropped.
        # The buffer is empty whenever a branch above already flushed it.
        new_sentence += current_word
      end
    end
    idx += 1
  end
  new_sentence
end