module Simhash
Constants
- HASHBITS
- OPTIONS
- VERSION
Public Instance Methods
generate(str, options = {})
click to toggle source
# File lib/simhash2.rb, line 19
# Builds a simhash for +str+ by tokenizing it on whitespace and delegating
# to generate_from_tokens.
#
# The whitespace split is the tokenization step; swap it out here if
# shingles should be used as tokens instead of single words.
#
# @param str [String] text to fingerprint
# @param options [Hash] passed through unchanged to generate_from_tokens
# @return whatever generate_from_tokens returns for the resulting tokens
def generate(str, options = {})
  tokens = str.split(/\s+/)
  generate_from_tokens(tokens, options)
end
generate_from_tokens(tokens, options = {})
click to toggle source
# File lib/simhash2.rb, line 25
# Computes a simhash from a list of tokens.
#
# Every token that survives filter_tokens (called with OPTIONS merged with
# the caller's +options+) is hashed with simple_string_hash. For each of the
# HASHBITS bit positions a counter goes up when the token hash has that bit
# set and down when it does not. The result has bit i set whenever counter i
# ended up at zero or above.
#
# @param tokens [Array<String>] raw tokens
# @param options [Hash] overrides merged on top of OPTIONS
# @return [Integer] the simhash
def generate_from_tokens(tokens, options = {})
  bit_masks = Array.new(HASHBITS) { |bit| 1 << bit }
  weights = Array.new(HASHBITS, 0)

  filter_tokens(tokens, OPTIONS.merge(options)) do |token|
    token_hash = simple_string_hash(token, HASHBITS)
    # Debugging aid: warn "simple_string_hash (for: #{token.inspect}): #{token_hash.inspect}"
    bit_masks.each_with_index do |mask, bit|
      weights[bit] += (token_hash & mask).zero? ? -1 : 1
    end
  end

  # A counter of exactly zero still sets its bit.
  weights.each_with_index.reduce(0) do |fingerprint, (weight, bit)|
    weight >= 0 ? fingerprint + (1 << bit) : fingerprint
  end
end
hamming_distance(simhash1, simhash2)
click to toggle source
# File lib/simhash2.rb, line 45
# Counts the bit positions in which two simhashes differ.
#
# Both arguments are coerced with +to_i+ before being XOR-ed; the number of
# '1' characters in the binary rendering of the XOR is the distance.
#
# @param simhash1 [#to_i]
# @param simhash2 [#to_i]
# @return [Integer] number of differing bits
def hamming_distance(simhash1, simhash2)
  differing_bits = simhash1.to_i ^ simhash2.to_i
  differing_bits.to_s(2).count('1')
end
hash_similarity(left, right)
click to toggle source
# File lib/simhash2.rb, line 49
# Turns the Hamming distance between two simhashes into a similarity score:
# one minus the fraction of the HASHBITS positions that differ, so two equal
# hashes score 1.0.
#
# @param left [#to_i] first simhash
# @param right [#to_i] second simhash
# @return [Float] similarity score
def hash_similarity(left, right)
  distance = hamming_distance(left, right).to_f
  1.0 - (distance / HASHBITS)
end
similarity(string1, string2, options = {})
click to toggle source
# File lib/simhash2.rb, line 15
# Scores how alike two strings are by generating a simhash for each (with the
# same +options+) and comparing the two hashes with hash_similarity.
#
# @param string1 [String]
# @param string2 [String]
# @param options [Hash] passed to generate for both strings
# @return the value returned by hash_similarity for the two hashes
def similarity(string1, string2, options = {})
  left_hash = generate(string1, options)
  right_hash = generate(string2, options)
  hash_similarity(left_hash, right_hash)
end
Private Instance Methods
filter_tokens(tokens, options, &block)
click to toggle source
# File lib/simhash2.rb, line 69
# Normalizes raw tokens and either yields them or writes them back into
# +tokens+.
#
# Each token is downcased and every run of /\W+/ characters is removed.
# NOTE: Ruby's \W treats only [a-zA-Z0-9_] as word characters, so non-ASCII
# letters are removed as well.
# A token is then dropped when it is shorter than options[:min_token_length]
# or is listed in options[:stop_words]. Survivors are stemmed when
# options[:stemming] is truthy, and the list is de-duplicated when
# options[:unique] is truthy.
#
# @param tokens [Array<String>] raw tokens; overwritten in place when no
#   block is given, left untouched otherwise
# @param options [Hash] :min_token_length (treated as 0 when absent),
#   :stop_words, :stemming, :unique
# @yield [token] each normalized token, in order
# @return [Array<String>] a new array of normalized tokens when a block is
#   given, otherwise +tokens+ itself after being overwritten
def filter_tokens(tokens, options, &block)
  # A missing minimum used to blow up on `Integer < nil`; treat it as "no minimum".
  min_length = options[:min_token_length] || 0
  stop_words = options[:stop_words]

  altered_tokens = []
  tokens.each do |raw|
    # gsub always returns a String, so no nil check is needed afterwards.
    normalized = raw.downcase.gsub(/\W+/, '')
    next if normalized.length < min_length
    next if stop_words && stop_words.include?(normalized)

    # String#stem is not core Ruby; presumably provided by a stemmer library
    # loaded elsewhere - confirm before enabling :stemming.
    altered_tokens << (options[:stemming] ? normalized.stem : normalized)
  end
  altered_tokens.uniq! if options[:unique]

  return altered_tokens.each(&block) if block_given?

  tokens.replace(altered_tokens)
end
simple_string_hash(str, length)
click to toggle source
# File lib/simhash2.rb, line 55
# Hashes +str+ to a non-negative Integer using a multiply-and-XOR pass over
# its bytes.
#
# The accumulator starts at the first byte shifted left by 7; for every byte
# it is multiplied by 1_000_003, XOR-ed with the byte and masked to +length+
# bits. Finally the byte length of the string is XOR-ed in (after the last
# masking step).
#
# The accumulator can never become negative, so the former
# "replace -1 with -2" step was unreachable and has been removed.
#
# @param str [String] token to hash
# @param length [Integer] number of bits the accumulator is masked to
# @return [Integer] the hash, or 0 for the empty string
def simple_string_hash(str, length)
  return 0 if str.empty?

  multiplier = 1_000_003
  mask = (1 << length) - 1

  # getbyte/bytesize avoid materializing the whole byte array just to read
  # its first element and its size.
  seed = str.getbyte(0) << 7
  mixed = str.each_byte.reduce(seed) do |acc, byte|
    ((acc * multiplier) ^ byte) & mask
  end

  mixed ^ str.bytesize
end