class TrieTree
Trie tree class
Constants
- ALPHA
- CYRILLIC
- DEFAULT
char type github.com/buruzaemon/natto/wiki/Node-Parsing-char_type#appendix-d-node-parsing-and-char_type
- GREEK
- HIRAGANA
- KANJI
- KANJINUMERIC
- KATAKANA
- NUMERIC
- SPACE
- SYMBOL
Public Class Methods
new()
click to toggle source
# File lib/trie_suggest.rb, line 20 def initialize @root = TrieNode.new('') @romaji_dictionary = {} # you need install natto https://github.com/buruzaemon/natto/ @nm = Natto::MeCab.new end
Public Instance Methods
add_word(keyword, score)
click to toggle source
# File lib/trie_suggest.rb, line 37 def add_word(keyword, score) return if keyword.blank? return if score < 10 keyword = normalize(keyword) return if keyword.to_s.empty? romaji = to_romaji(keyword) if @romaji_dictionary[romaji].nil? || @romaji_dictionary[romaji][1] < score @romaji_dictionary[romaji] = [keyword, score] end romaji += '+' current_node = @root for i in 0..romaji.size - 1 char = romaji.slice(0, i + 1) idx = romaji[i].ord current_node.next[idx] = TrieNode.new(char) if current_node.next[idx].nil? current_node.next[idx].score += score current_node = current_node.next[idx] end end
levenshtein(word1, word2)
click to toggle source
# File lib/trie_suggest.rb, line 119 def levenshtein(word1, word2) columns = word1.size + 1 rows = word2.size + 1 current_row = [0] for column in 1..columns current_row << current_row[column - 1] + 1 end for row in 1..rows previous_row = current_row current_row = [ previous_row[0] + 1 ] for column in 1..columns insert_cost = current_row[column - 1] + 1 delete_cost = previous_row[column] + 1 if word1[column - 1] != word2[row - 1] replace_cost = previous_row[ column - 1 ] + 1 else replace_cost = previous_row[ column - 1 ] end current_row << [insert_cost, delete_cost, replace_cost].min end end return current_row[-1] end
match(keyword)
click to toggle source
# File lib/trie_suggest.rb, line 90 def match(keyword) return '' if keyword.blank? || keyword == '+' keyword = normalize(keyword) romaji = to_romaji(keyword) current_node = @root for i in 0..romaji.size - 1 idx = romaji[i].ord return '' if current_node.next[idx].nil? current_node = current_node.next[idx] end return '' if current_node.next['+'.ord].nil? @romaji_dictionary[current_node.val][0] end
spellcheck(keyword, max_cost = 3)
click to toggle source
# File lib/trie_suggest.rb, line 105 def spellcheck(keyword, max_cost = 3) return [] if keyword.blank? || keyword == '+' keyword += '+' current_node = @root current_row = [] for i in 0..keyword.size current_row[i] = i end res = [] # recursively search each branch of the trie current_node.next.map { |node| search_recursive(node[1], node[1].val[-1], keyword, current_row, res, max_cost) } end
suggest(keyword)
click to toggle source
# File lib/trie_suggest.rb, line 58 def suggest(keyword) return [] if keyword.blank? || keyword == '+' res = [] keyword = normalize(keyword) romaji = to_romaji(keyword) current_node = @root # search keyword olog(keyword.length) for i in 0..romaji.size - 1 idx = romaji[i].ord return res if current_node.next[idx].nil? current_node = current_node.next[idx] end # Traversal node next_level = current_node.next.sort_by { |r| r[1].score }.reverse until next_level.empty? node = next_level.shift if node[1].val.last == '+' next if @romaji_dictionary[node[1].val.chop].nil? sufrace = @romaji_dictionary[node[1].val.chop][0] res << [sufrace, node[1].score] return res if res.size > 9 else next if node[1].next.empty? next_level << node[1].next.sort_by { |r| r[1].score }.reverse[0] next_level = next_level.sort_by { |r| r[1].score }.reverse next_level = next_level[0..9] end end res end
Private Instance Methods
normalize(string)
click to toggle source
# File lib/trie_suggest.rb, line 173 def normalize(string) return '' if string.blank? # stringの長さはtrie treeの深さを決めるので、検索走査のスピードも影響を及ぼす、変更する場合は慎重に return '' if string.size > 30 string.scrub! string.unicode_normalize! string = string.tr('0-9a-zA-Z', '0-9a-zA-Z').downcase string.gsub(/[^-’[^\p{P}]]|’$|’”$/, '') end
search_recursive(node, letter, keyword, previous_row, res, max_cost)
click to toggle source
# File lib/trie_suggest.rb, line 147 def search_recursive(node, letter, keyword, previous_row, res, max_cost) current_row = [previous_row[0] + 1] for column in 1..keyword.size insert_cost = current_row[column - 1] + 1 delete_cost = previous_row[column] + 1 replace_cost = if keyword[column - 1] != letter previous_row[column - 1] + 1 else previous_row[column - 1] end current_row << [insert_cost, delete_cost, replace_cost].min end if current_row[-1] != 0 && current_row[-1] <= max_cost && letter == '+' && !@romaji_dictionary[node.val.chop].nil? res << [@romaji_dictionary[node.val.chop][0], node.score, current_row[-1]] end if current_row.min <= max_cost node.next.map { |next_node| search_recursive(next_node[1], next_node[1].val[-1], keyword, current_row, res, max_cost) } end res end
to_romaji(string)
click to toggle source
# File lib/trie_suggest.rb, line 183 def to_romaji(string) string = string.delete(' ') string = to_yomi(string) if string =~ /\p{Han}/ Romaji.kana2romaji(string) end
to_yomi(string)
click to toggle source
# File lib/trie_suggest.rb, line 189 def to_yomi(string) @nm.enum_parse(string).map do |node| if node.char_type.in?([KANJI, HIRAGANA, KATAKANA, KANJINUMERIC]) node.feature.split(',')[8] else node.surface end end.join end