class Phonetic::Metaphone2
The Double Metaphone
phonetic encoding algorithm is the second generation of the Metaphone
algorithm. Its original implementation was described by Lawrence Philips in the June 2000 issue of C/C++ Users Journal.
This implementation is based on the PHP implementation by Stephen Woodbridge and contains modifications of the algorithm by Kevin Atkinson. @see swoodbridge.com/DoubleMetaPhone/
PHP implementation by Stephen Woodbridge
@see aspell.net/metaphone/dmetaph.cpp
C++ implementation with modifications by Kevin Atkinson
@example
Phonetic::DoubleMetaphone.encode('czerny') # => ['SRN', 'XRN']
Phonetic::DoubleMetaphone.encode('dumb') # => ['TM', 'TM']
Phonetic::DoubleMetaphone.encode('edgar') # => ['ATKR', 'ATKR']
# or use alias:
Phonetic::Metaphone2.encode('czerny') # => ['SRN', 'XRN']
Phonetic::Metaphone2.encode('dumb') # => ['TM', 'TM']
Phonetic::Metaphone2.encode('edgar') # => ['ATKR', 'ATKR']
Constants
- START_OF_WORD_MAP
Public Class Methods
encode(str, options = { size: 4 })
click to toggle source
# File lib/phonetic/double_metaphone.rb, line 101 def self.encode(str, options = { size: 4 }) encode_word(str, options) end
encode_word(word, options = { size: 4 })
click to toggle source
Encode word to its Double Metaphone
code.
# File lib/phonetic/double_metaphone.rb, line 55
# Encode a word to its Double Metaphone code.
#
# The word is stripped, upcased and padded with five spaces, then walked
# character by character. Each per-letter encoder appends to +code+ and
# returns how many characters to advance.
#
# @param word [String] the word to encode
# @param options [Hash] :size limits the length of each code (4 when missing)
# @return the value of code.results(code_size)
def self.encode_word(word, options = { size: 4 })
  code_size = options[:size] || 4
  w = word.strip.upcase
  code = Code.new
  i = 0
  len = w.size
  # pad the original string so that we can index beyond the edge of the word
  w += ' ' * 5
  i += encode_start_of_word(w, code)
  # stop once the word is consumed or both codes have reached the size limit
  while i < len && (code.first.size < code_size || code.last.size < code_size)
    case w[i]
    when 'A', 'E', 'I', 'O', 'U', 'Y'
      # vowels are skipped here
      i += 1
    when 'B'
      # "-mb", e.g. "dumb", already skipped over...
      i += gen_encode(w, i, 'P', 'P', code)
    when 'Ç', 'ç'
      code.add 'S', 'S'
      i += 1
    when 'F', 'K', 'N'
      i += gen_encode(w, i, w[i], w[i], code)
    when 'Ñ', 'ñ'
      code.add 'N', 'N'
      i += 1
    when 'P'
      i += encode_p(w, i, len, code)
    when 'Q'
      i += gen_encode(w, i, 'K', 'K', code)
    when 'V'
      i += gen_encode(w, i, 'F', 'F', code)
    when 'C', 'D', 'G', 'H', 'J', 'L', 'M', 'R', 'S', 'T', 'W', 'X', 'Z'
      # letters with their own encode_<letter> method
      i += char_encode(w, i, len, code)
    else
      i += 1
    end
  end
  code.results(code_size)
end
Private Class Methods
c_germanic?(w, i)
click to toggle source
# File lib/phonetic/double_metaphone.rb, line 512
# Tests the six characters starting two before +i+ for the germanic
# "-ACH-" patterns. Always false when +i+ is 0 or 1.
#
# @param w [String] word being encoded
# @param i [Integer] index of the 'C' under inspection
# @return [Integer, nil, false] match offset, nil on no match, false if i <= 1
def self.c_germanic?(w, i)
  return false unless i > 1

  window = w[i - 2, 6]
  window =~ /(^[^AEIOUY]ACH[^IE])|([BM]ACHER)/
end
ch_germanic_or_greek?(w, i, len)
click to toggle source
# File lib/phonetic/double_metaphone.rb, line 517 def self.ch_germanic_or_greek?(w, i, len) # germanic, greek, or otherwise 'ch' for 'kh' sound w[0, 4] =~ /^(V[AO]N\s|SCH)/ || # 'architect but not 'arch', 'orchestra', 'orchid' i > 1 && w[i - 2, 6] =~ /ORCHES|ARCHIT|ORCHID/ || (w[i + 2] =~ /[TS]/) || (i > 0 && w[i - 1] =~ /[AOUE]/ || i == 0) && # e.g., 'wachtler', 'wechsler', but not 'tichner' (w[i + 2] =~ /[LRNMBHFVW ]/ || i + 2 >= len) end
char_encode(w, i, len, code)
click to toggle source
# File lib/phonetic/double_metaphone.rb, line 124
# Dispatches to the class method named "encode_<letter>", where <letter>
# is the downcased character at +i+.
#
# @param w [String] word being encoded
# @param i [Integer] index of the current character
# @param len [Integer] word length, forwarded to the encoder
# @param code [#add] code accumulator, forwarded to the encoder
# @return whatever the selected encoder returns
def self.char_encode(w, i, len, code)
  handler = "encode_#{w[i].downcase}"
  send(handler, w, i, len, code)
end
encode_c(w, i, len, code)
click to toggle source
# File lib/phonetic/double_metaphone.rb, line 128
# Encodes the letter 'C' at index +i+. Branches are tried in order and
# the first match wins.
#
# @param w [String] word being encoded
# @param i [Integer] index of the 'C'
# @param len [Integer] word length (passed on to encode_ch)
# @param code [#add] receives the primary/secondary code pair
# @return [Integer] number of characters to advance
def self.encode_c(w, i, len, code)
  pair = w[i, 2]
  if c_germanic?(w, i)
    # various germanic
    code.add 'K', 'K'
    2
  elsif pair == 'CH'
    1 + encode_ch(w, i, len, code)
  elsif pair == 'CZ' && !(i > 1 && w[i - 2, 4] == 'WICZ')
    # e.g. 'czerny'
    code.add 'S', 'X'
    2
  elsif w[i + 1, 3] == 'CIA'
    # e.g. 'focaccia'
    code.add 'X', 'X'
    3
  elsif pair == 'CC' && !(i == 1 && w[0] == 'M')
    # double 'C', but not if e.g. 'McClellan'
    2 + encode_cc(w, i, code)
  elsif pair =~ /C[KGQ]/
    code.add 'K', 'K'
    2
  elsif w[i, 3] =~ /CI[OEA]/
    # italian vs. english
    code.add 'S', 'X'
    2
  elsif pair =~ /C[IEY]/
    code.add 'S', 'S'
    2
  else
    code.add 'K', 'K'
    if w[i + 1, 2] =~ /\s[CQG]/
      # name sent in 'mac caffrey', 'mac gregor'
      3
    elsif w[i + 1] =~ /[CKQ]/ && w[i + 1, 2] !~ /C[EI]/
      2
    else
      1
    end
  end
end
encode_cc(w, i, code)
click to toggle source
# File lib/phonetic/double_metaphone.rb, line 426
# Encodes a double 'C' starting at index +i+.
#
# @param w [String] word being encoded
# @param i [Integer] index of the first 'C'
# @param code [#add] receives the primary/secondary code pair
# @return [Integer] 1 when the pair is followed by I, E or H (but not 'HU'),
#   otherwise 0
def self.encode_cc(w, i, code)
  # 'bellocchio' but not 'bacchus'
  softened = w[i + 2, 1] =~ /[IEH]/ && w[i + 2, 2] != 'HU'
  unless softened
    # Pierce's rule
    code.add 'K', 'K'
    return 0
  end

  if (i == 1 && w[i - 1] == 'A') || w[i - 1, 5] =~ /UCCEE|UCCES/
    # 'accident', 'accede', 'succeed'
    code.add 'KS', 'KS'
  else
    # 'bacci', 'bertucci', other italian
    code.add 'X', 'X'
  end
  1
end
encode_ch(w, i, len, code)
click to toggle source
# File lib/phonetic/double_metaphone.rb, line 404
# Encodes a 'CH' pair at index +i+; the first matching rule wins.
#
# @param w [String] word being encoded
# @param i [Integer] index of the 'C'
# @param len [Integer] word length (passed on to ch_germanic_or_greek?)
# @param code [#add] receives the primary/secondary code pair
# @return [Integer] always 1
def self.encode_ch(w, i, len, code)
  if w[i, 4] == 'CHIA'
    # italian 'chianti'
    code.add 'K', 'K'
  elsif i > 0 && w[i, 4] == 'CHAE'
    # find 'michael'
    code.add 'K', 'X'
  elsif ch_germanic_or_greek?(w, i, len)
    # germanic, greek, or otherwise 'ch' for 'kh' sound
    code.add 'K', 'K'
  elsif i == 0
    code.add 'X', 'X'
  elsif w[0, 2] == 'MC'
    # e.g., "McHugh"
    code.add 'K', 'K'
  else
    code.add 'X', 'K'
  end
  1
end
encode_d(w, i, len, code)
click to toggle source
# File lib/phonetic/double_metaphone.rb, line 170
# Encodes the letter 'D' at index +i+.
#
# @param w [String] word being encoded
# @param i [Integer] index of the 'D'
# @param len [Integer] word length (not used here)
# @param code [#add] receives the primary/secondary code pair
# @return [Integer] number of characters to advance (1, 2 or 3)
def self.encode_d(w, i, len, code)
  following = w[i + 1]
  if w[i + 1, 2] =~ /G[IEY]/
    # e.g. 'edge'
    code.add 'J', 'J'
    3
  elsif following == 'G'
    # e.g. 'edgar'
    code.add 'TK', 'TK'
    2
  else
    code.add 'T', 'T'
    # a following T or D is consumed together with this D
    following =~ /[TD]/ ? 2 : 1
  end
end
encode_g(w, i, len, code)
click to toggle source
# File lib/phonetic/double_metaphone.rb, line 190
# Encodes the letter 'G' at index +i+.
#
# @param w [String] word being encoded
# @param i [Integer] index of the 'G'
# @param len [Integer] word length (not used here)
# @param code [#add] receives the primary/secondary code pair
# @return [Integer] 2 for every special case; in the fallback case 2 when
#   the next character is also 'G', otherwise 1
def self.encode_g(w, i, len, code)
  following = w[i + 1]
  if following == 'H'
    encode_gh(w, i, code)
  elsif following == 'N'
    encode_gn(w, i, code)
  elsif w[i + 1, 2] == 'LI' && !slavo_germanic?(w)
    # 'tagliaro'
    code.add 'KL', 'L'
  elsif g_ger_or_gy?(w, i)
    # -ger-, -gy-
    code.add 'K', 'J'
  elsif g_italian?(w, i)
    if w[0, 4] =~ /^(V[AO]N\s|SCH)/ || w[i + 1, 2] == 'ET'
      code.add 'K', 'K'
    elsif w[i + 1, 4] =~ /IER\s/
      code.add 'J', 'J'
    else
      code.add 'J', 'K'
    end
  else
    code.add 'K', 'K'
    return following == 'G' ? 2 : 1
  end
  2
end
encode_gh(w, i, code)
click to toggle source
# File lib/phonetic/double_metaphone.rb, line 445
# Encodes a 'GH' pair whose 'G' sits at index +i+. May add nothing at all.
#
# @param w [String] word being encoded
# @param i [Integer] index of the 'G'
# @param code [#add] receives the primary/secondary code pair
# @return the result of code.add when a code is added, otherwise nil
def self.encode_gh(w, i, code)
  return code.add('K', 'K') if i > 0 && !vowel?(w[i - 1])

  # Parker's rule (with some further refinements)
  silent = (i > 1 && w[i - 2] =~ /[BHD]/) || # e.g., 'hugh'
           (i > 2 && w[i - 3] =~ /[BHD]/) || # e.g., 'bough'
           (i > 3 && w[i - 4] =~ /[BH]/)     # e.g., 'broughton'
  return if silent

  if i > 2 && w[i - 3, 3] =~ /[CGLRT].U/
    # e.g., 'laugh', 'McLaughlin', 'cough', 'gough', 'rough', 'tough'
    code.add 'F', 'F'
  elsif i > 0 && w[i - 1] != 'I'
    code.add 'K', 'K'
  end
end
encode_gn(w, i, code)
click to toggle source
# File lib/phonetic/double_metaphone.rb, line 461
# Encodes a 'GN' pair whose 'G' sits at index +i+.
#
# @param w [String] word being encoded
# @param i [Integer] index of the 'G'
# @param code [#add] receives the primary/secondary code pair
# @return the result of code.add
def self.encode_gn(w, i, code)
  primary, secondary =
    if i == 1 && vowel?(w[0]) && !slavo_germanic?(w)
      %w[KN N]
    elsif w[i + 2, 2] != 'EY' && w[i + 1] != 'Y' && !slavo_germanic?(w)
      # not e.g. 'cagney'
      # NOTE(review): the only visible caller reaches this method when
      # w[i + 1] == 'N', which makes the 'Y' test always true - confirm intent.
      %w[N KN]
    else
      %w[KN KN]
    end
  code.add primary, secondary
end
encode_h(w, i, len, code)
click to toggle source
# File lib/phonetic/double_metaphone.rb, line 218
# Encodes the letter 'H' at index +i+; it is kept only between two vowels.
#
# @param w [String] word being encoded
# @param i [Integer] index of the 'H'
# @param len [Integer] word length (not used here)
# @param code [#add] receives the primary/secondary code pair
# @return [Integer] 2 when 'H' was added, otherwise 1
def self.encode_h(w, i, len, code)
  between_vowels = i > 0 && vowel?(w[i - 1]) && vowel?(w[i + 1])
  return 1 unless between_vowels

  code.add 'H', 'H'
  2
end
encode_j(w, i, len, code)
click to toggle source
# File lib/phonetic/double_metaphone.rb, line 228
# Encodes the letter 'J' at index +i+.
#
# @param w [String] word being encoded
# @param i [Integer] index of the 'J'
# @param len [Integer] word length; len - 1 is treated as the last index
# @param code [#add] receives the primary/secondary code pair
# @return [Integer] 1 on the spanish path; otherwise 2 when the next
#   character is also 'J', else 1
def self.encode_j(w, i, len, code)
  san_prefix = w[0, 4] =~ /SAN\s/
  # obvious spanish, 'jose', 'san jacinto'
  if w[i, 4] == 'JOSE' || san_prefix
    if (i == 0 && w[i + 4] == ' ') || san_prefix
      code.add 'H', 'H'
    else
      code.add 'J', 'H'
    end
    return 1
  end

  # 'JOSE' at i was ruled out above, so i == 0 alone decides this branch
  if i == 0
    code.add 'J', 'A' # Yankelovich/Jankelowicz
  elsif j_spanish_pron?(w, i)
    # spanish pron. of e.g. 'bajador'
    code.add 'J', 'H'
  elsif i == len - 1
    code.add 'J', ''
  elsif w[i + 1] !~ /[LTKSNMBZ]/ && !(i > 0 && w[i - 1] =~ /[SKL]/)
    code.add 'J', 'J'
  end
  w[i + 1] == 'J' ? 2 : 1
end
encode_l(w, i, len, code)
click to toggle source
# File lib/phonetic/double_metaphone.rb, line 257
# Encodes the letter 'L' at index +i+; a doubled 'L' is consumed as one.
#
# @param w [String] word being encoded
# @param i [Integer] index of the 'L'
# @param len [Integer] word length (passed on to ll_spanish?)
# @param code [#add] receives the primary/secondary code pair
# @return [Integer] 2 for 'LL', otherwise 1
def self.encode_l(w, i, len, code)
  unless w[i + 1] == 'L'
    code.add 'L', 'L'
    return 1
  end

  if ll_spanish?(w, i, len)
    # spanish e.g. 'cabrillo', 'gallegos'
    code.add 'L', ''
  else
    code.add 'L', 'L'
  end
  2
end
encode_m(w, i, len, code)
click to toggle source
# File lib/phonetic/double_metaphone.rb, line 273
# Encodes the letter 'M' at index +i+.
#
# @param w [String] word being encoded
# @param i [Integer] index of the 'M'
# @param len [Integer] word length (not used here)
# @param code [#add] receives the primary/secondary code pair
# @return [Integer] 2 when the next character is skipped too, otherwise 1
def self.encode_m(w, i, len, code)
  # 'dumb', 'thumb' (silent B), or a doubled M
  swallow_next = (i > 0 && w[i - 1, 5] =~ /UMB( |ER)/) || w[i + 1] == 'M'
  code.add 'M', 'M'
  swallow_next ? 2 : 1
end
encode_p(w, i, len, code)
click to toggle source
# File lib/phonetic/double_metaphone.rb, line 281
# Encodes the letter 'P' at index +i+.
#
# @param w [String] word being encoded
# @param i [Integer] index of the 'P'
# @param len [Integer] word length (not used here)
# @param code [#add] receives the primary/secondary code pair
# @return [Integer] 2 when the following H, P or B is consumed, otherwise 1
def self.encode_p(w, i, len, code)
  following = w[i + 1]
  if following == 'H'
    code.add 'F', 'F'
    return 2
  end

  code.add 'P', 'P'
  # also account for "campbell", "raspberry"
  following =~ /[PB]/ ? 2 : 1
end
encode_r(w, i, len, code)
click to toggle source
# File lib/phonetic/double_metaphone.rb, line 294
# Encodes the letter 'R' at index +i+.
#
# @param w [String] word being encoded
# @param i [Integer] index of the 'R'
# @param len [Integer] word length; len - 1 is passed on as the last index
# @param code [#add] receives the primary/secondary code pair
# @return [Integer] 2 for a doubled 'R', otherwise 1
def self.encode_r(w, i, len, code)
  # french e.g. 'rogier', but exclude 'hochmeier'
  primary = r_french?(w, i, len - 1) ? '' : 'R'
  code.add primary, 'R'
  return 2 if w[i + 1] == 'R'

  1
end
encode_s(w, i, len, code)
click to toggle source
# File lib/phonetic/double_metaphone.rb, line 305
# Encodes the letter 'S' at index +i+; the first matching rule wins.
#
# @param w [String] word being encoded
# @param i [Integer] index of the 'S'
# @param len [Integer] word length; len - 1 is passed on as the last index
# @param code [#add] receives the primary/secondary code pair
# @return [Integer] number of characters to advance
def self.encode_s(w, i, len, code)
  pair = w[i, 2]
  if i > 0 && w[i - 1, 3] =~ /[IY]SL/
    # special cases 'island', 'isle', 'carlisle', 'carlysle': nothing added
    1
  elsif pair == 'SH'
    1 + encode_sh(w, i, code)
  elsif w[i, 3] =~ /SI[OA]/
    # italian & armenian
    if slavo_germanic?(w)
      code.add 'S', 'S'
    else
      code.add 'S', 'X'
    end
    3
  elsif pair == 'SZ'
    # -sz- in slavic language altho in hungarian it is pronounced 's'
    code.add 'S', 'X'
    2
  elsif pair == 'SC'
    1 + encode_sc(w, i, code)
  else
    if s_french?(w, i, len - 1)
      code.add '', 'S'
    else
      code.add 'S', 'S'
    end
    w[i + 1] =~ /[SZ]/ ? 2 : 1
  end
end
encode_sc(w, i, code)
click to toggle source
# File lib/phonetic/double_metaphone.rb, line 482
# Encodes an 'SC' pair whose 'S' sits at index +i+.
#
# @param w [String] word being encoded
# @param i [Integer] index of the 'S'
# @param code [#add] receives the primary/secondary code pair
# @return [Integer] always 2
def self.encode_sc(w, i, code)
  third = w[i + 2]
  if third == 'H'
    # Schlesinger's rule
    after_sch = w[i + 3, 2]
    if after_sch =~ /OO|UY|E[DM]/
      # dutch origin, e.g. 'school', 'schooner'
      code.add 'SK', 'SK'
    elsif after_sch =~ /E[RN]/
      # 'schermerhorn', 'schenker'
      code.add 'X', 'SK'
    elsif i == 0 && !vowel?(w[3]) && w[3] != 'W'
      code.add 'X', 'S'
    else
      code.add 'X', 'X'
    end
  elsif third =~ /[IEY]/
    code.add 'S', 'S'
  else
    code.add 'SK', 'SK'
  end
  2
end
encode_sh(w, i, code)
click to toggle source
# File lib/phonetic/double_metaphone.rb, line 472
# Encodes an 'SH' pair whose 'S' sits at index +i+.
#
# @param w [String] word being encoded
# @param i [Integer] index of the 'S'
# @param code [#add] receives the primary/secondary code pair
# @return [Integer] always 1
def self.encode_sh(w, i, code)
  # germanic
  germanic = w[i + 1, 4] =~ /H(EIM|OEK|OL[MZ])/
  germanic ? code.add('S', 'S') : code.add('X', 'X')
  1
end
encode_start_of_word(w, code)
click to toggle source
# File lib/phonetic/double_metaphone.rb, line 107
# Applies the first START_OF_WORD_MAP entry whose pattern matches +w+.
# Each entry's value is indexed as [primary, secondary, advance].
#
# @param w [String] word being encoded
# @param code [#add] receives the primary/secondary code pair
# @return the third element of the matching entry, or 0 when none matches
def self.encode_start_of_word(w, code)
  hit = START_OF_WORD_MAP.find { |pattern, _| w =~ pattern }
  return 0 unless hit

  entry = hit[1]
  code.add entry[0], entry[1]
  entry[2]
end
encode_t(w, i, len, code)
click to toggle source
# File lib/phonetic/double_metaphone.rb, line 338
# Encodes the letter 'T' at index +i+.
#
# @param w [String] word being encoded
# @param i [Integer] index of the 'T'
# @param len [Integer] word length (not used here)
# @param code [#add] receives the primary/secondary code pair
# @return [Integer] number of characters to advance (1, 2 or 3)
def self.encode_t(w, i, len, code)
  if w[i, 4] =~ /^(TION|TIA|TCH)/
    code.add 'X', 'X'
    3
  elsif w[i, 2] == 'TH' || w[i, 3] == 'TTH'
    if w[i + 2, 2] =~ /[OA]M/ || w[0, 4] =~ /^(VAN |VON |SCH)/
      # special case 'thomas', 'thames' or germanic
      code.add 'T', 'T'
    else
      code.add '0', 'T'
    end
    2
  else
    code.add 'T', 'T'
    w[i + 1] =~ /[TD]/ ? 2 : 1
  end
end
encode_w(w, i, len, code)
click to toggle source
# File lib/phonetic/double_metaphone.rb, line 358
# Encodes the letter 'W' at index +i+. May add nothing.
#
# @param w [String] word being encoded
# @param i [Integer] index of the 'W'
# @param len [Integer] word length; len - 1 is treated as the last index
# @param code [#add] receives the primary/secondary code pair
# @return [Integer] 2 for 'WR', 4 for 'WICZ'/'WITZ', otherwise 1
def self.encode_w(w, i, len, code)
  # can also be in middle of word
  if w[i, 2] == 'WR'
    code.add 'R', 'R'
    return 2
  end

  last = len - 1
  # Arnow should match Arnoff
  if i == last && i > 0 && vowel?(w[i - 1]) ||
     i > 0 && w[i - 1, 5] =~ /[EO]WSK[IY]/ ||
     w[0, 3] == 'SCH'
    code.add '', 'F'
    1
  elsif w[i, 4] =~ /WICZ|WITZ/
    # polish e.g. 'filipowicz'
    code.add 'TS', 'FX'
    4
  else
    1
  end
end
encode_x(w, i, len, code)
click to toggle source
# File lib/phonetic/double_metaphone.rb, line 380
# Encodes the letter 'X' at index +i+.
#
# @param w [String] word being encoded
# @param i [Integer] index of the 'X'
# @param len [Integer] word length; len - 1 is passed on as the last index
# @param code [#add] receives the primary/secondary code pair
# @return [Integer] 2 when the next character is 'C' or 'X', otherwise 1
def self.encode_x(w, i, len, code)
  # french e.g. breaux
  silent = x_french?(w, i, len - 1)
  code.add 'KS', 'KS' unless silent
  return 2 if w[i + 1] =~ /[CX]/

  1
end
encode_z(w, i, len, code)
click to toggle source
# File lib/phonetic/double_metaphone.rb, line 386
# Encodes the letter 'Z' at index +i+.
#
# @param w [String] word being encoded
# @param i [Integer] index of the 'Z'
# @param len [Integer] word length (not used here)
# @param code [#add] receives the primary/secondary code pair
# @return [Integer] 2 for 'ZH' or 'ZZ', otherwise 1
def self.encode_z(w, i, len, code)
  following = w[i + 1]
  # chinese pinyin e.g. 'zhao'
  if following == 'H'
    code.add 'J', 'J'
    return 2
  end

  if w[i + 1, 2] =~ /Z[OIA]/ || slavo_germanic?(w) && i > 0 && w[i - 1] != 'T'
    code.add 'S', 'TS'
  else
    code.add 'S', 'S'
  end
  following == 'Z' ? 2 : 1
end
g_ger_or_gy?(w, i)
click to toggle source
# File lib/phonetic/double_metaphone.rb, line 528
# Tests for the -ger- / -gy- pattern after the 'G' at index +i+, with the
# exclusions listed in the final expression.
#
# @param w [String] word being encoded
# @param i [Integer] index of the 'G'
# @return [true, false, nil] true only when every condition holds
def self.g_ger_or_gy?(w, i)
  # -ger-, -gy-
  ger_or_gy = w[i + 1, 2] =~ /^(ER|Y)/
  not_anger_word = w[0, 6] !~ /[DRM]ANGER/
  after_e_or_i = i > 0 && w[i - 1] =~ /[EI]/
  inside_rgy_ogy = i > 0 && w[i - 1, 3] =~ /[RO]GY/
  ger_or_gy && not_anger_word && !after_e_or_i && !inside_rgy_ogy
end
g_italian?(w, i)
click to toggle source
# File lib/phonetic/double_metaphone.rb, line 536
# Tests whether the 'G' at index +i+ is followed by E, I or Y, or sits
# inside 'AGGI' / 'OGGI'.
#
# @param w [String] word being encoded
# @param i [Integer] index of the 'G'
# @return [Integer, false, nil] truthy match offset when either test hits
def self.g_italian?(w, i)
  # italian e.g, 'biaggi'
  soft_follower = w[i + 1] =~ /[EIY]/
  soft_follower || (i > 0 && w[i - 1, 4] =~ /[AO]GGI/)
end
gen_encode(w, i, primary, secondary, code)
click to toggle source
# File lib/phonetic/double_metaphone.rb, line 119
# Adds the given code pair and reports whether the character at +i+ is
# doubled, so the caller can skip the repeat.
#
# @param w [String] word being encoded
# @param i [Integer] index of the current character
# @param primary [String] primary code to add
# @param secondary [String] secondary code to add
# @param code [#add] receives the code pair
# @return [Integer] 2 when w[i + 1] equals w[i], otherwise 1
def self.gen_encode(w, i, primary, secondary, code)
  code.add primary, secondary
  doubled = w[i + 1] == w[i]
  doubled ? 2 : 1
end
j_spanish_pron?(w, i)
click to toggle source
# File lib/phonetic/double_metaphone.rb, line 541
# Tests for the spanish pronunciation of 'J' at index +i+: a vowel before
# it, 'A' or 'O' after it, and a word that is not slavo-germanic.
#
# @param w [String] word being encoded
# @param i [Integer] index of the 'J'
# @return a truthy value when all conditions hold; false when i is 0
def self.j_spanish_pron?(w, i)
  return false unless i > 0

  # spanish pron. of e.g. 'bajador'
  vowel?(w[i - 1]) && !slavo_germanic?(w) && w[i + 1] =~ /[AO]/
end
ll_spanish?(w, i, len)
click to toggle source
# File lib/phonetic/double_metaphone.rb, line 546
# Tests whether the 'LL' starting at index +i+ matches the spanish
# patterns, e.g. 'cabrillo', 'gallegos'.
#
# @param w [String] word being encoded
# @param i [Integer] index of the first 'L'
# @param len [Integer] word length; len - 1 is treated as the last index
# @return a truthy value when either pattern matches, otherwise nil/false
def self.ll_spanish?(w, i, len)
  last = len - 1
  # 'LL' three characters from the end, inside ILLO / ILLA / ALLE
  near_end = i == len - 3 && i > 0 && w[i - 1, 4] =~ /ILL[OA]|ALLE/
  # word ends in AS / OS, or in A / O
  spanish_ending = (last > 0 && w[last - 1, 2] =~ /[AO]S/) || w[last] =~ /[AO]/
  inside_alle = i > 0 && w[i - 1, 4] == 'ALLE'
  near_end || (spanish_ending && inside_alle)
end
r_french?(w, i, last)
click to toggle source
# File lib/phonetic/double_metaphone.rb, line 554
# Tests for a final 'R' preceded by 'IE' (french e.g. 'rogier'), excluding
# slavo-germanic words and stems like 'hochmeier'.
#
# @param w [String] word being encoded
# @param i [Integer] index of the 'R'
# @param last [Integer] index of the last character of the word
# @return [Boolean] false whenever +i+ is not the last index
def self.r_french?(w, i, last)
  return false unless i == last

  # french e.g. 'rogier', but exclude 'hochmeier'
  !slavo_germanic?(w) &&
    i > 1 &&
    w[i - 2, 2] == 'IE' &&
    !(i > 3 && w[i - 4, 2] =~ /M[EA]/)
end
s_french?(w, i, last)
click to toggle source
# File lib/phonetic/double_metaphone.rb, line 561
# Tests for a final 'S' preceded by 'AI' or 'OI'.
#
# @param w [String] word being encoded
# @param i [Integer] index of the 'S'
# @param last [Integer] index of the last character of the word
# @return [Integer, nil, false] match offset, nil on no match, false when
#   +i+ is not the last index or is below 2
def self.s_french?(w, i, last)
  return false unless i == last && i > 1

  # french e.g. 'resnais', 'artois'
  w[i - 2, 2] =~ /[AO]I/
end
slavo_germanic?(w)
click to toggle source
# File lib/phonetic/double_metaphone.rb, line 504
# Tests whether the word contains 'W', 'K', 'CZ' or 'WITZ' anywhere.
#
# @param w [String] word being encoded
# @return [Integer, nil] offset of the first match, or nil when none
def self.slavo_germanic?(w)
  /W|K|CZ|WITZ/ =~ w
end
vowel?(c)
click to toggle source
# File lib/phonetic/double_metaphone.rb, line 508
# Tests whether +c+ contains one of A, E, I, O, U, Y (upper case only).
#
# @param c [String, nil] character to test
# @return [Integer, nil] offset of the first vowel, or nil when none
def self.vowel?(c)
  /[AEIOUY]/ =~ c
end
x_french?(w, i, last)
click to toggle source
# File lib/phonetic/double_metaphone.rb, line 566
# Tests for a final 'X' preceded by IAU / EAU or by AU / OU.
#
# @param w [String] word being encoded
# @param i [Integer] index of the 'X'
# @param last [Integer] index of the last character of the word
# @return a truthy match offset when a pattern hits; false or nil otherwise
def self.x_french?(w, i, last)
  return false unless i == last

  # french e.g. breaux
  three_before = i > 2 && w[i - 3, 3] =~ /[IE]AU/
  three_before || i > 1 && w[i - 2, 2] =~ /[AO]U/
end