class Autosuggest

Constants

LETTERS

from blog.lojic.com/2008/09/04/how-to-write-a-spelling-corrector-in-ruby/

VERSION

Public Class Methods

new(top_queries) click to toggle source
# File lib/autosuggest.rb, line 13
def initialize(top_queries)
  @top_queries = top_queries
  @concepts = {}
  @words = Set.new
  @non_duplicates = Set.new
  @blocked_words = Set.new
  @blacklisted_words = Set.new
  @preferred_queries = {}
  @profane_words = Set.new(Obscenity::Base.blacklist)
end

Public Instance Methods

add_concept(name, values) click to toggle source
# File lib/autosuggest.rb, line 24
def add_concept(name, values)
  @concepts[name] = Set.new(values.compact.uniq.map(&:downcase))
end
blacklist_words(words) click to toggle source
# File lib/autosuggest.rb, line 58
def blacklist_words(words)
  warn "[autosuggest] blacklist_words is deprecated. Use block_words instead."
  words.each do |word|
    @blacklisted_words << word.downcase
  end
end
block_words(words) click to toggle source
# File lib/autosuggest.rb, line 52
def block_words(words)
  words.each do |word|
    @blocked_words << word.downcase
  end
end
not_duplicates(pairs) click to toggle source
# File lib/autosuggest.rb, line 46
def not_duplicates(pairs)
  pairs.each do |pair|
    @non_duplicates << pair.map(&:downcase).sort
  end
end
parse_words(phrases, options = {}) click to toggle source
# File lib/autosuggest.rb, line 28
def parse_words(phrases, options = {})
  min = options[:min] || 1

  word_counts = Hash.new(0)
  phrases.each do |phrase|
    words = tokenize(phrase)
    words.each do |word|
      word_counts[word] += 1
    end
  end

  word_counts.select { |_, c| c >= min }.each do |word, _|
    @words << word
  end

  word_counts
end
prefer(queries) click to toggle source
# File lib/autosuggest.rb, line 65
def prefer(queries)
  queries.each do |query|
    @preferred_queries[normalize_query(query)] ||= query
  end
end
pretty_suggestions() click to toggle source
# File lib/autosuggest.rb, line 145
def pretty_suggestions
  str = "%-30s   %5s   %s\n" % %w(Query Score Notes)
  suggestions.each do |suggestion|
    str << "%-30s   %5d   %s\n" % [suggestion[:query], suggestion[:score], suggestion[:notes].join(", ")]
  end
  str
end
suggestions() click to toggle source
# File lib/autosuggest.rb, line 71
def suggestions
  stemmed_queries = {}
  added_queries = Set.new
  @top_queries.sort_by { |_query, count| -count }.map do |query, count|
    query = query.to_s

    # TODO do not ignore silently
    next if query.length < 2

    stemmed_query = normalize_query(query)

    # get preferred term
    preferred_query = @preferred_queries[stemmed_query]
    if preferred_query && preferred_query != query
      original_query, query = query, preferred_query
    end

    # exclude duplicates
    duplicate = stemmed_queries[stemmed_query]
    stemmed_queries[stemmed_query] ||= query

    # also detect possibly misspelled duplicates
    # TODO use top query as duplicate
    if !duplicate && query.length > 4
      edits(query).each do |edited_query|
        if added_queries.include?(edited_query)
          duplicate = edited_query
          break
        end
      end
    end
    if duplicate && @non_duplicates.include?([duplicate, query].sort)
      duplicate = nil
    end
    added_queries << query unless duplicate

    # find concepts
    concepts = []
    @concepts.each do |name, values|
      concepts << name if values.include?(query)
    end

    # exclude misspellings that are not brands
    misspelling = @words.any? && misspellings?(query)

    profane = blocked?(query, @profane_words)
    blocked = blocked?(query, @blocked_words)
    blacklisted = blocked?(query, @blacklisted_words)

    notes = []
    notes << "duplicate of #{duplicate}" if duplicate
    notes.concat(concepts)
    notes << "misspelling" if misspelling
    notes << "profane" if profane
    notes << "blocked" if blocked
    notes << "blacklisted" if blacklisted
    notes << "originally #{original_query}" if original_query

    result = {
      query: query,
      original_query: original_query,
      score: count,
      duplicate: duplicate,
      concepts: concepts,
      misspelling: misspelling,
      profane: profane,
      blocked: blocked
    }
    result[:blacklisted] = blacklisted if @blacklisted_words.any?
    result[:notes] = notes
    result
  end
end

Protected Instance Methods

blocked?(query, blocked_words) click to toggle source
# File lib/autosuggest.rb, line 164
def blocked?(query, blocked_words)
  recurse(tokenize(query)).each do |terms|
    return true if terms.any? { |t| blocked_words.include?(t) }
  end
  false
end
edits(word) click to toggle source
# File lib/autosuggest.rb, line 195
def edits(word)
  n = word.length
  deletion = (0...n).collect { |i| word[0...i] + word[i + 1..-1] }
  transposition = (0...n - 1).collect { |i| word[0...i] + word[i + 1, 1] + word[i, 1] + word[i + 2..-1] }
  alteration = []
  n.times { |i| LETTERS.each_byte { |l| alteration << word[0...i] + l.chr + word[i + 1..-1] } }
  insertion = []
  (n + 1).times { |i| LETTERS.each_byte { |l| insertion << word[0...i] + l.chr + word[i..-1] } }
  deletion + transposition + alteration + insertion
end
misspellings?(query) click to toggle source
# File lib/autosuggest.rb, line 155
def misspellings?(query)
  recurse(tokenize(query)).each do |terms|
    if terms.all? { |t| @concepts.any? { |_, values| values.include?(t) } || @words.include?(t) }
      return false
    end
  end
  true
end
normalize_query(query) click to toggle source
# File lib/autosuggest.rb, line 206
def normalize_query(query)
  tokenize(query.to_s.gsub("&", "and")).map { |q| Lingua.stemmer(q) }.sort.join
end
recurse(words) click to toggle source
# File lib/autosuggest.rb, line 171
def recurse(words)
  if words.size == 1
    [words]
  else
    result = [[words.join(" ")]]
    i = 0
    while i < words.size - 1
      recurse(words[0..i]).each do |v1|
        recurse(words[i + 1..-1]).each do |v2|
          result << v1 + v2
        end
      end
      i += 1
    end
    result.uniq
  end
end
tokenize(str) click to toggle source
# File lib/autosuggest.rb, line 189
def tokenize(str)
  str.to_s.downcase.split(" ")
end