module Emiler
Constants
- COMPANY_NAME_STOP_WORDS
- INEXACT_MATCH_COEFFICIENT
- RAISE_ON_MALFORMED_EMAIL
- VERSION
Public Class Methods
similarity(item1, item2, type: :email)
click to toggle source
# File lib/emiler.rb, line 34
#
# Compares two items and reports how similar they are.
#
# Both items are normalised first (converted to a string, stripped and
# downcased). The result always carries the raw distance reported by
# JW::MATCHER under +:jw+, merged with whatever the type-specific handler
# (+similarity_<type>+) returns.
#
# @param item1 [#to_s] first item to compare
# @param item2 [#to_s] second item to compare
# @param type  [Symbol] selects the handler; when no private method named
#   +similarity_<type>+ exists, +similarity_default+ is used instead
# @return [Hash] +:jw+ plus the keys produced by the selected handler
def similarity(item1, item2, type: :email)
  handler = :"similarity_#{type}"
  # Unknown types silently fall back to the stub handler.
  handler = :similarity_default unless private_methods.include?(handler)

  left, right = [item1, item2].map { |item| item.to_s.strip.downcase }

  { jw: JW::MATCHER.distance(left, right) }.merge(send(handler, left, right))
end
Private Class Methods
similarity_company_name(c1, c2)
click to toggle source
similarity for company names
# File lib/emiler.rb, line 48
#
# Similarity for company names.
#
# Three outcomes are possible:
# * the names are identical: +full+ is 1.0 with one 1.0 distance per word;
# * the names are identical once the words listed in
#   COMPANY_NAME_STOP_WORDS are dropped: +full+ is halfway between
#   INEXACT_MATCH_COEFFICIENT and 1.0;
# * otherwise every word of one name is compared with every word of the
#   other via JW::MATCHER, and the best distances are folded into a
#   weighted average. +result+ is always +false+ on this path.
#
# @param c1 [String] first company name (already normalised by the caller)
# @param c2 [String] second company name
# @return [Hash] always contains +:full+ and +:result+; the remaining keys
#   depend on which of the three paths was taken
def similarity_company_name(c1, c2)
  if c1 == c2 # exact match
    word_count = c1.split(/\s+/).size
    return { full: 1.0, distances: [1.0] * word_count, matches: word_count, result: true }
  end

  c1, c2 = [c1, c2].map do |name|
    name.split(/\s+/).reject { |word| COMPANY_NAME_STOP_WORDS.include?(word) }
  end

  # match without stopwords
  return { full: 1.0 - (1.0 - INEXACT_MATCH_COEFFICIENT) / 2.0, name: 1.0, result: true } if c1 == c2

  # All pairwise word distances, best first.
  dists = c1.product(c2).map { |(w1, w2)| JW::MATCHER.distance(w1, w2) }.sort.reverse

  # Only as many distances as the shorter name has words take part in the
  # average; each next one weighs linearly less than the previous one.
  count = [c1.size, c2.size].min
  weighted = dists.take(count).map.with_index { |e, i| e * (1.0 - i.to_f / count) / count }

  # Seeded reduce: when one name has no words left (empty, or stop words
  # only) there is nothing to sum and +full+ must be 0.0 rather than nil.
  average = weighted.reduce(0.0, :+)

  { full: average, distances: dists, matches: dists.count(1.0), result: false }
end
similarity_default(*)
click to toggle source
Stub for unknown types; returns `{ result: nil }` so that `similarity` reports only the Jaro-Winkler distance together with a nil `result`.
# File lib/emiler.rb, line 43
#
# Handler used when no type-specific similarity method is available.
# Accepts and ignores any arguments.
#
# @return [Hash] always +{ result: nil }+
def similarity_default(*_ignored)
  { result: nil }
end
similarity_email(e1, e2)
click to toggle source
Similarity for emails. (The method is preceded in the source by a `rubocop:disable Metrics/AbcSize` directive.)
# File lib/emiler.rb, line 106
#
# Similarity for emails.
#
# The local part (name) and the domain are scored separately and then
# combined: the domain score is weighted by (1 - INEXACT_MATCH_COEFFICIENT)
# and the name score by INEXACT_MATCH_COEFFICIENT.
#
# Each part scores
# * 1.0 when both sides are identical,
# * INEXACT_MATCH_COEFFICIENT on a partial match (domains: same
#   next-to-last dot-separated label; names: at least one common run of
#   lowercase letters),
# * otherwise half of INEXACT_MATCH_COEFFICIENT scaled by the JW::MATCHER
#   distance.
#
# @param e1 [String] first email (already normalised by the caller)
# @param e2 [String] second email
# @return [Hash] +:full+, +:name+, +:domain+ and +:result+; +:result+ is
#   true when +:full+ reaches INEXACT_MATCH_COEFFICIENT squared. For input
#   that does not split into exactly two parts on "@", JW::DUMMY is
#   returned instead.
# @raise [MalformedEmailError] for such input when RAISE_ON_MALFORMED_EMAIL
#   is set
def similarity_email(e1, e2)
  return { full: 1.0, name: 1.0, domain: 1.0, result: true } if e1 == e2

  em1, em2 = [e1, e2].map { |e| e.split('@') }
  if em1.size != 2 || em2.size != 2
    raise MalformedEmailError.new(e1, e2) if RAISE_ON_MALFORMED_EMAIL

    return JW::DUMMY
  end

  name1, domain1 = em1
  name2, domain2 = em2

  # Next-to-last label of each domain; nil for a domain without a dot.
  label1, label2 = [domain1, domain2].map { |d| d.split('.')[-2] }

  domain =
    if domain1 == domain2 # exact domain match
      1.0
    elsif label1 && label1 == label2
      # The nil guard matters: two different dotless hosts both yield nil
      # and must not be treated as sharing a label.
      INEXACT_MATCH_COEFFICIENT
    else
      INEXACT_MATCH_COEFFICIENT / 2.0 * JW::MATCHER.distance(domain1, domain2)
    end

  name =
    if name1 == name2 # exact match
      1.0
    elsif !(name1.scan(/[a-z]+/) & name2.scan(/[a-z]+/)).empty?
      INEXACT_MATCH_COEFFICIENT
    else
      INEXACT_MATCH_COEFFICIENT / 2.0 * JW::MATCHER.distance(name1, name2)
    end

  full = domain * (1.0 - INEXACT_MATCH_COEFFICIENT) + name * INEXACT_MATCH_COEFFICIENT
  { full: full, name: name, domain: domain,
    result: full >= INEXACT_MATCH_COEFFICIENT * INEXACT_MATCH_COEFFICIENT }
end
similarity_phone(p1, p2)
click to toggle source
similarity for phone numbers
# File lib/emiler.rb, line 67
#
# Similarity for phone numbers.
#
# Each argument may hold several numbers separated by "," or ";". Every
# number is reduced to its digits, given a prefix chosen by digit count,
# and handed to Phoner::Phone.parse; numbers that fail to parse are
# dropped. All pairs across the two lists are then compared: a pair only
# counts when country code, area code and everything but the last two
# digits of the number agree, and is scored by how many of those last two
# digits match.
#
# @param p1 [String] first phone number (or list of numbers)
# @param p2 [String] second phone number (or list of numbers)
# @return [Hash] +:full+ (best score, 0.0 when no pair counts),
#   +:distances+ (all scores, best first) and +:result+ (whether the best
#   score reaches INEXACT_MATCH_COEFFICIENT; nil when no pair counts)
def similarity_phone(p1, p2)
  return { full: 1.0, distances: [1.0], result: true } if p1 == p2 # exact match

  first_set, second_set = [p1, p2].map do |list|
    parsed = list.split(/[,;]/).map do |raw|
      digits = raw.delete('^0-9')
      dialable =
        case digits.length
        when 0..6 then digits
        when 7 then "+3493#{digits}" # consider Barcelona
        when 8..9 then "+34#{digits}" # consider Spain
        else "+#{digits}"
        end

      begin
        Phoner::Phone.parse(dialable)
      rescue StandardError # e.g. Phoner::CountryCodeError
        nil
      end
    end
    parsed.compact
  end

  comparable = first_set.product(second_set).select do |(a, b)|
    a.country_code == b.country_code &&
      a.area_code == b.area_code &&
      a.number[0...-2] == b.number[0...-2]
  end

  scores = comparable.map do |(a, b)|
    if a.number[-2..-1] == b.number[-2..-1]
      1.0
    elsif a.number[-2] == b.number[-2]
      0.9
    else
      0.8
    end
  end

  dists = scores.sort.reverse
  best = dists.first

  { full: best || 0.0, distances: dists, result: best && best >= INEXACT_MATCH_COEFFICIENT }
end