class LevenshteinComparator

Constants

ASCII_REGEXP_MAPPING
STOP_WORDS

Attributes

cleanified_strings[RW]

Public Class Methods

clean(s) click to toggle source
# File lib/levenshtein_comparator.rb, line 125
# Runs +s+ through the full clean-up pipeline and returns the result.
#
# The steps are applied in this order:
# 1. decode_html_entities
# 2. remove_parenthesis
# 3. remove_featuring
# 4. unaccent
def self.clean(s)
  decoded = decode_html_entities(s)
  without_brackets = remove_parenthesis(decoded)
  without_credit = remove_featuring(without_brackets)
  unaccent(without_credit)
end
decode_html_entities(s) click to toggle source
# File lib/levenshtein_comparator.rb, line 101
# Decodes +s+ with a freshly built HTMLEntities decoder and returns whatever
# HTMLEntities#decode returns.
def self.decode_html_entities(s)
  decoder = HTMLEntities.new
  decoder.decode(s)
end
new(s) click to toggle source
# File lib/levenshtein_comparator.rb, line 76
# Builds a comparator for +s+: the string is turned into a word list by the
# class-level to_array and stored through the cleanified_strings writer.
def initialize(s)
  words = self.class.to_array(s)
  self.cleanified_strings = words
end
remove_featuring(s) click to toggle source
# File lib/levenshtein_comparator.rb, line 85
# Removes a "feat. ..." / "featuring ..." credit from +s+.
#
# Everything from the marker ("feat. " or "featuring ", first letter in either
# case) up to the end of the line is dropped, then surrounding whitespace is
# stripped. The marker has to start at a word boundary so that a word which
# merely ends in "feat" (for instance "Defeat. ") is not cut in half.
#
# s - the String to clean; it is not modified.
#
# Returns a new String without the featuring credit.
def self.remove_featuring(s)
  s.gsub(/\b[fF]eat(?:\.|uring) .*/, '').strip
end
remove_parenthesis(s) click to toggle source
# File lib/levenshtein_comparator.rb, line 80
# Strips bracketed text from +s+ and trims the result.
#
# The match is greedy: on a given line it runs from the first "(" or "[" to
# the last ")" or "]", so anything sitting between two bracketed groups is
# removed as well.
#
# Returns a new String; +s+ itself is left untouched.
def self.remove_parenthesis(s)
  s.gsub(/([\(\[].*[\)\]])/, '').strip
end
remove_stop_words(a) click to toggle source
# File lib/levenshtein_comparator.rb, line 105
# Returns a new array holding the elements of +a+ that are not listed in
# STOP_WORDS. Order and duplicates of the surviving elements are preserved;
# +a+ itself is not modified.
def self.remove_stop_words(a)
  a.reject { |word| STOP_WORDS.include?(word) }
end
to_array(s) click to toggle source

Cut the string into an array of words. Two words separated by a dash (-) are treated as one word if either of them is only one character long, and as two words otherwise.

# File lib/levenshtein_comparator.rb, line 113
# Cuts +s+ into an array of normalised words.
#
# The string is first passed through clean. A dash joining two words counts as
# a separator only when both neighbours are at least two characters long;
# otherwise the dash is simply dropped and the two parts form a single word
# ("Hip-Hop" gives "hip", "hop" while "Jay-Z" gives "jayz"). Each word is then
# reduced to its ASCII letters and digits and downcased. Words shorter than two
# characters are discarded unless they contain a digit, and the remaining list
# is filtered by remove_stop_words.
#
# s - the String to split.
#
# Returns whatever remove_stop_words returns for the resulting word list.
def self.to_array(s)
  cleaned = self.clean(s)

  # Look-arounds consume only the dash itself, so every dash in a chain such
  # as "one-two-three" is examined (a pattern that also consumes the words on
  # both sides would skip the second dash).
  words = cleaned.gsub(/(?<=\w{2})-(?=\w{2})/, ' ').split.map do |w|
    w.gsub(/[^A-Za-z0-9]/, '').downcase
  end

  # Drop empty and one-letter leftovers, but keep lone digits.
  words = words.reject { |w| w.length < 2 && w !~ /\d/ }

  self.remove_stop_words(words)
end
unaccent(s) click to toggle source
# File lib/levenshtein_comparator.rb, line 97
# Non-destructive counterpart of unaccent!: the work is done on a duplicate of
# +s+, so the caller's string is left as it was.
#
# Returns whatever unaccent! returns for that duplicate.
def self.unaccent(s)
  copy = s.dup
  unaccent!(copy)
end
unaccent!(s) click to toggle source
# File lib/levenshtein_comparator.rb, line 90
# Applies every entry of ASCII_REGEXP_MAPPING to +s+ in place: for each
# (pattern, replacement) pair, all matches of the pattern are substituted with
# the replacement.
#
# Returns +s+ itself (the same, now modified, object).
def self.unaccent!(s)
  ASCII_REGEXP_MAPPING.each { |pattern, replacement| s.gsub!(pattern, replacement) }
  s
end

Public Instance Methods

compare(pattern) click to toggle source
# File lib/levenshtein_comparator.rb, line 135
# Checks a guess against the words that are still left in cleanified_strings.
#
# +pattern+ is split into words with the class-level to_array. Every remaining
# word that is matched by one of the guessed words is removed from
# cleanified_strings (so the receiver is modified), and each guessed word can
# account for at most one remaining word.
#
# Matching rules for a (guess, word) pair:
# * the word contains a digit          -> the two must be equal
# * both are longer than 4 characters  -> Levenshtein distance of at most 2
# * both are longer than 2 characters  -> Levenshtein distance of at most 1
# * otherwise                          -> the two must be equal
#
# Returns :ko when this call matched nothing, :ok when no word is left to
# find, and :almost when something matched but words remain.
def compare(pattern)
  guesses = self.class.to_array(pattern)
  size_before = cleanified_strings.size

  cleanified_strings.delete_if do |word|
    has_digit = word =~ /\d+/

    position = guesses.index do |guess|
      shortest = [guess.length, word.length].min
      if has_digit || shortest <= 2
        guess == word
      else
        Levenshtein.distance(guess, word) <= (shortest > 4 ? 2 : 1)
      end
    end

    # Consume the guess so it cannot match a second word; the removed guess
    # (truthy) or nil is what tells delete_if whether to drop the word.
    guesses.delete_at(position) if position
  end

  return :ko if cleanified_strings.size == size_before

  cleanified_strings.empty? ? :ok : :almost
end