module Runestone::Corpus

Public Class Methods

add(*words) click to toggle source
# File lib/runestone/corpus.rb, line 3
  # Inserts the given words into the +runestone_corpus+ table.
  #
  # Each word is passed through +Runestone.normalize+ and then quoted by the
  # connection before being placed in the statement. Rows that conflict with
  # existing ones are ignored (+ON CONFLICT DO NOTHING+).
  #
  # words - zero or more words to add.
  #
  # Returns nil without touching the database when no words are given,
  # otherwise whatever the connection's +execute+ returns.
  def self.add(*words)
    return if words.empty?

    connection = Runestone::Model.connection
    quoted_words = words.map do |word|
      connection.quote(Runestone.normalize(word))
    end

    # Joining on "),(" produces one single-column VALUES row per word.
    connection.execute(<<-SQL)
      INSERT INTO runestone_corpus ( word )
      VALUES (#{quoted_words.join('),(')})
      ON CONFLICT DO NOTHING
    SQL
  end
similar_words(*words) click to toggle source
# File lib/runestone/corpus.rb, line 14
  # Finds corpus words that are near-matches of the given words.
  #
  # Only words for which +typo_tolerance+ returns a value greater than zero
  # are looked up. Each such word is sent to the database together with its
  # downcased form and its tolerance. A corpus word is reported for a token
  # when it:
  #
  # * matches the downcased token with the SQL +%+ operator (presumably the
  #   pg_trgm similarity operator -- confirm the extension is installed),
  # * is not equal to the downcased token, and
  # * has a +levenshtein+ distance to the downcased token that does not
  #   exceed the token's tolerance.
  #
  # Repeated input words are queried only once, and each suggestion is listed
  # at most once per token.
  #
  # words - zero or more words to find alternatives for.
  #
  # Returns a Hash mapping each original word (as given) to an Array of
  # similar corpus words. Words without any suggestion have no key. Returns
  # an empty Hash when no word qualifies for a lookup.
  def self.similar_words(*words)
    conn = Runestone::Model.connection

    # One VALUES tuple per distinct word that tolerates at least one typo.
    tuples = words.uniq.each_with_object([]) do |word, rows|
      tolerance = typo_tolerance(word)
      next unless tolerance > 0

      rows << "#{conn.quote(word)}, #{conn.quote(word.downcase)}, #{tolerance}"
    end
    return {} if tuples.empty?

    result = conn.execute(<<-SQL)
      WITH  tokens (token, token_downcased, typo_tolerance) AS (VALUES (#{tuples.join('), (')}))
      SELECT token, word, levenshtein(runestone_corpus.word, tokens.token_downcased)
      FROM tokens
      JOIN runestone_corpus ON runestone_corpus.word % tokens.token_downcased
      WHERE
        runestone_corpus.word != tokens.token_downcased
        AND levenshtein(runestone_corpus.word, tokens.token_downcased) <= tokens.typo_tolerance
    SQL

    lut = {}
    result.each_row do |token, corpus_word, _distance|
      # Drop the characters ( ) : | ! & * from the corpus word
      # (presumably because they are tsquery operators -- confirm).
      suggestion = corpus_word.delete('():|!&*')

      # Stripping can leave nothing, or just the token itself; neither is a
      # useful alternative.
      next if suggestion.empty? || suggestion == token

      suggestions = (lut[token] ||= [])
      # Distinct corpus words can collapse to the same stripped form.
      suggestions << suggestion unless suggestions.include?(suggestion)
    end
    lut
  end
typo_tolerance(word) click to toggle source
# File lib/runestone/corpus.rb, line 42
# Looks up the typo tolerance for a word based on its length.
#
# Walks +Runestone.typo_tolerances+ in order and picks the first entry whose
# value reports the word's length as a member; the key of that entry is the
# tolerance.
#
# word - the word to look up; must respond to +length+.
#
# Returns the matching key, or 0 when no entry matches (or the matching key
# is nil/false).
def self.typo_tolerance(word)
  Runestone.typo_tolerances.each do |tolerance, lengths|
    return tolerance || 0 if lengths.member?(word.length)
  end

  0
end