class Reckon::CosineSimilarity
Constants
- DocumentInfo
Public Class Methods
new(options)
click to toggle source
# File lib/reckon/cosine_similarity.rb, line 18 def initialize(options) @docs = DocumentInfo.new({}, {}) @options = options end
Public Instance Methods
add_document(account, doc)
click to toggle source
# File lib/reckon/cosine_similarity.rb, line 23 def add_document(account, doc) tokens = tokenize(doc) LOGGER.info "doc tokens: #{tokens}" tokens.each do |n| (token, count) = n @docs.tokens[token] ||= Hash.new(0) @docs.tokens[token][account] += count @docs.accounts[account] ||= Hash.new(0) @docs.accounts[account][token] += count end end
find_similar(query)
click to toggle source
find most similar documents to query
# File lib/reckon/cosine_similarity.rb, line 37 def find_similar(query) LOGGER.info "find_similar #{query}" accounts = docs_to_check(query).map do |a| [a, tfidf(@docs.accounts[a])] end q = tfidf(tokenize(query)) suggestions = accounts.map do |a, d| { similarity: calc_similarity(q, d), account: a } end.select { |n| n[:similarity] > 0 }.sort_by { |n| -n[:similarity] } LOGGER.info "most similar accounts: #{suggestions}" return suggestions end
Private Instance Methods
calc_similarity(query, doc)
click to toggle source
Cosine similarity is used to compare how similar 2 documents are. Returns a float between 1 and -1, where 1 is exactly the same and -1 is exactly opposite.
see en.wikipedia.org/wiki/Cosine_similarity cos(theta) = (A . B) / (||A|| ||B||) where A . B is the “dot product” and ||A|| is the magnitude of A
The variables A and B are the set of unique terms in q and d.
For example, when q = “big red balloon” and d =“small green balloon” then the variables are (big,red,balloon,small,green) and a = (1,1,1,0,0) and b = (0,0,1,1,1).
query and doc are hashes of token => tf/idf score
# File lib/reckon/cosine_similarity.rb, line 95 def calc_similarity(query, doc) tokens = Set.new(query.keys + doc.keys) a = Vector.elements(tokens.map { |n| query[n] || 0 }, false) b = Vector.elements(tokens.map { |n| doc[n] || 0 }, false) return a.inner_product(b) / (a.magnitude * b.magnitude) end
calc_tf_idf(token_count, num_words_in_doc, df, num_docs)
click to toggle source
# File lib/reckon/cosine_similarity.rb, line 104 def calc_tf_idf(token_count, num_words_in_doc, df, num_docs) # tf(t,d) = count of t in d / number of words in d tf = token_count / num_words_in_doc.to_f # smooth idf weight # see https://en.wikipedia.org/wiki/Tf%E2%80%93idf#Inverse_document_frequency_2 # df(t) = num of documents with term t in them # idf(t) = log(N/(1 + df )) + 1 idf = Math.log(num_docs.to_f / (1 + df)) + 1 tf * idf end
docs_to_check(query)
click to toggle source
# File lib/reckon/cosine_similarity.rb, line 60 def docs_to_check(query) return tokenize(query).reduce(Set.new) do |corpus, t| corpus.union(Set.new(@docs.tokens[t[0]]&.keys)) end end
mk_tokens(str)
click to toggle source
# File lib/reckon/cosine_similarity.rb, line 123 def mk_tokens(str) str.downcase.tr(';', ' ').tr("'", '').split(/[^a-z0-9.]+/).reject(&:empty?) end
tfidf(tokens)
click to toggle source
# File lib/reckon/cosine_similarity.rb, line 66 def tfidf(tokens) scores = {} tokens.each do |t, n| scores[t] = calc_tf_idf( n, tokens.length, @docs.tokens[t]&.length&.to_f || 0, @docs.accounts.length ) end return scores end
tokenize(str)
click to toggle source
# File lib/reckon/cosine_similarity.rb, line 117 def tokenize(str) mk_tokens(str).each_with_object(Hash.new(0)) do |n, memo| memo[n] += 1 end.to_a end