module Edits::Jaro
Implements Jaro
similarity algorithm.
Public Class Methods
distance(str1, str2)
click to toggle source
Calculate Jaro
distance
`Dj = 1 - Sj`
@example
Edits::Jaro.distance("information", "informant") # => 0.09764309764309764
@param (see distance) @return [Float] distance, from 0.0 (identical) to 1.0 (distant)
# File lib/edits/jaro.rb, line 43 def self.distance(str1, str2) 1.0 - similarity(str1, str2) end
similarity(seq1, seq2)
click to toggle source
Calculate Jaro
similarity
`Sj = 1/3 * ((m / |A|) + (m / |B|) + ((m - t) / m))`
Where `m` is matches and `t` is transposes
@example
Edits::Jaro.similarity("information", "informant") # => 0.9023569023569024
@param seq1 [String, Array] @param seq2 [String, Array] @return [Float] similarity, from 0.0 (none) to 1.0 (identical)
# File lib/edits/jaro.rb, line 20 def self.similarity(seq1, seq2) return 1.0 if seq1 == seq2 return 0.0 if seq1.empty? || seq2.empty? seq1 = seq1.codepoints if seq1.is_a? String seq2 = seq2.codepoints if seq2.is_a? String m, t = jaro_matches(seq1, seq2) return 0.0 if m == 0 m = m.to_f ((m / seq1.length) + (m / seq2.length) + ((m - t) / m)) / 3 end
Private Class Methods
jaro_matches(seq1, seq2)
click to toggle source
Calculate number of Jaro
matches and transpositions
@param (see distance) @return [(Integer, Integer)] matches and transpositions
# File lib/edits/jaro.rb, line 51 def self.jaro_matches(seq1, seq2) seq1, seq2 = seq2, seq1 if seq1.length > seq2.length # search range: (max(|A|, |B|) / 2) - 1 range = (seq2.length / 2) - 1 range = 0 if range.negative? seq1_flags = Array.new(seq1.length, false) seq2_flags = Array.new(seq2.length, false) matches = 0 last2 = seq2.length - 1 # Pass 1: # - determine number of matches # - initialize transposition flags seq1.length.times do |i| min_bound = i >= range ? i - range : 0 max_bound = (i + range) <= last2 ? (i + range) : last2 min_bound.upto(max_bound) do |j| next unless seq2_flags[j] != true && seq2[j] == seq1[i] seq2_flags[j] = true seq1_flags[i] = true matches += 1 break end end return [0, 0] if matches == 0 transposes = 0 j = 0 # Pass 2: determine number of half-transpositions seq1.length.times do |i| # find a match in first string next unless seq1_flags[i] == true # go to location of next match on second string j += 1 until seq2_flags[j] # transposition if not the current match transposes += 1 if seq1[i] != seq2[j] j += 1 end # half-transpositions -> transpositions transposes /= 2 [matches, transposes] end