class Reckon::CosineSimilarity

Constants

DocumentInfo

Public Class Methods

new(options) click to toggle source
# File lib/reckon/cosine_similarity.rb, line 18
# Builds an empty similarity index.
#
# options is kept as-is in @options; this method does not inspect it.
def initialize(options)
  @options = options
  # Start with two empty hashes; other methods reach them through
  # @docs.tokens and @docs.accounts.
  @docs = DocumentInfo.new({}, {})
end

Public Instance Methods

add_document(account, doc) click to toggle source
# File lib/reckon/cosine_similarity.rb, line 23
# Records the tokens of doc against account.
#
# For every [token, count] pair produced by tokenize(doc), the count is added
# to both lookup directions kept in @docs:
#   @docs.tokens[token][account]   and   @docs.accounts[account][token]
def add_document(account, doc)
  tokens = tokenize(doc)
  LOGGER.info "doc tokens: #{tokens}"

  tokens.each do |token, count|
    # Inner hashes default to 0 so a first occurrence can be incremented directly.
    per_account = (@docs.tokens[token] ||= Hash.new(0))
    per_account[account] += count

    per_token = (@docs.accounts[account] ||= Hash.new(0))
    per_token[token] += count
  end
end
find_similar(query) click to toggle source

Find the accounts whose documents are most similar to the query.

# File lib/reckon/cosine_similarity.rb, line 37
# Ranks the known accounts by how closely their recorded tokens match query.
#
# Only accounts returned by docs_to_check(query) are scored. Returns an array
# of { similarity:, account: } hashes, limited to similarities greater than
# zero and ordered from most to least similar.
def find_similar(query)
  LOGGER.info "find_similar #{query}"

  candidates = docs_to_check(query).map do |account|
    [account, tfidf(@docs.accounts[account])]
  end

  query_scores = tfidf(tokenize(query))

  scored = candidates.map do |account, doc_scores|
    {
      similarity: calc_similarity(query_scores, doc_scores),
      account: account
    }
  end

  # Drop non-matches, then sort descending by negating the sort key.
  positive = scored.select { |entry| entry[:similarity] > 0 }
  suggestions = positive.sort_by { |entry| -entry[:similarity] }

  LOGGER.info "most similar accounts: #{suggestions}"

  suggestions
end

Private Instance Methods

calc_similarity(query, doc) click to toggle source

Cosine similarity is used to compare how similar 2 documents are. Returns a float between 1 and -1, where 1 is exactly the same and -1 is exactly opposite.

See en.wikipedia.org/wiki/Cosine_similarity: cos(theta) = (A . B) / (||A|| ||B||), where A . B is the “dot product” and ||A|| is the magnitude of A.

The vectors A and B have one component for each unique term found in q and d combined.

For example, when q = “big red balloon” and d = “small green balloon”, the variables are (big, red, balloon, small, green), a = (1,1,1,0,0) and b = (0,0,1,1,1).

query and doc are hashes of token => tf/idf score

# File lib/reckon/cosine_similarity.rb, line 95
# Cosine similarity between two score hashes (token => tf/idf score):
#
#   cos(theta) = (A . B) / (||A|| ||B||)
#
# Both hashes are projected onto the union of their keys; a token missing
# from one side contributes 0 to that side's vector.
#
# Returns a Float. When either vector has zero magnitude (for example an
# empty hash) the cosine is undefined; 0.0 is returned in that case instead
# of the NaN that dividing by a zero norm would produce.
def calc_similarity(query, doc)
  tokens = Set.new(query.keys + doc.keys)

  a = Vector.elements(tokens.map { |t| query[t] || 0 }, false)
  b = Vector.elements(tokens.map { |t| doc[t] || 0 }, false)

  norm = a.magnitude * b.magnitude
  # Guard the division: a zero norm means there is nothing to compare.
  return 0.0 if norm.zero?

  a.inner_product(b) / norm
end
calc_tf_idf(token_count, num_words_in_doc, df, num_docs) click to toggle source
# File lib/reckon/cosine_similarity.rb, line 104
# Computes the tf-idf weight of a single term.
#
#   tf(t, d) = token_count / num_words_in_doc
#   idf(t)   = log(num_docs / (1 + df)) + 1    (smoothed idf, see
#              https://en.wikipedia.org/wiki/Tf%E2%80%93idf#Inverse_document_frequency_2)
#
# df is the number of documents that contain the term. Returns tf * idf as a
# Float.
def calc_tf_idf(token_count, num_words_in_doc, df, num_docs)
  # Float conversion keeps both ratios from being truncated by integer division.
  term_frequency = token_count / num_words_in_doc.to_f
  doc_ratio = num_docs.to_f / (1 + df)
  inverse_doc_frequency = Math.log(doc_ratio) + 1

  term_frequency * inverse_doc_frequency
end
docs_to_check(query) click to toggle source
# File lib/reckon/cosine_similarity.rb, line 60
# Collects the accounts worth scoring for query: every account that has at
# least one of the query's tokens recorded in @docs.tokens.
#
# Returns a Set of accounts (empty when no token is known).
def docs_to_check(query)
  matching = Set.new

  tokenize(query).each do |entry|
    # entry is a [token, count] pair; only the token is needed here.
    accounts_for_token = @docs.tokens[entry[0]]
    matching.merge(accounts_for_token.keys) if accounts_for_token
  end

  matching
end
mk_tokens(str) click to toggle source
# File lib/reckon/cosine_similarity.rb, line 123
# Splits str into lower-case tokens.
#
# Semicolons are turned into spaces and apostrophes are removed before
# splitting, so "don't" yields "dont". Any run of characters other than a-z,
# 0-9 and "." separates tokens; empty fragments are dropped.
def mk_tokens(str)
  normalized = str.downcase.tr(';', ' ').tr("'", '')
  fragments = normalized.split(/[^a-z0-9.]+/)
  # A leading separator leaves an empty first fragment; filter it out.
  fragments.reject { |fragment| fragment.empty? }
end
tfidf(tokens) click to toggle source
# File lib/reckon/cosine_similarity.rb, line 66
# Scores every token of one document with calc_tf_idf.
#
# tokens is enumerated as (token, count) pairs. Returns a hash of
# token => score.
def tfidf(tokens)
  # NOTE(review): this is the number of (token, count) entries, not the sum of
  # the counts, although calc_tf_idf names the parameter num_words_in_doc —
  # confirm this is intended.
  entry_count = tokens.length
  num_docs = @docs.accounts.length

  tokens.each_with_object({}) do |(token, count), scores|
    # Document frequency: how many accounts have this token (0 if unseen).
    doc_frequency = @docs.tokens[token]&.length&.to_f || 0
    scores[token] = calc_tf_idf(count, entry_count, doc_frequency, num_docs)
  end
end
tokenize(str) click to toggle source
# File lib/reckon/cosine_similarity.rb, line 117
# Counts how often each token from mk_tokens(str) occurs.
#
# Returns an array of [token, count] pairs, ordered by first appearance.
def tokenize(str)
  counts = Hash.new(0)
  mk_tokens(str).each { |token| counts[token] += 1 }
  counts.to_a
end