class Bm25::Parser
Public Class Methods
new()
click to toggle source
# File lib/bm25/parser.rb, line 7
# Starts the parser with empty state: no source text, no per-document
# entries, no IDF table, and a total word count of zero.
def initialize
  @base_document, @docs, @idf_map = '', [], {}
  @all_word_length = 0
end
Public Instance Methods
create_data()
click to toggle source
# File lib/bm25/parser.rb, line 14
# Runs the scoring pipeline over the current state: builds the per-document
# entries, then the IDF table, and returns whatever get_dataset produces.
def create_data
  create_docs
  create_idf_map
  get_dataset
end
create_docs()
click to toggle source
# File lib/bm25/parser.rb, line 37
# Splits @base_document into documents and appends one entry per document
# to @docs.
#
# Each entry is a Hash with:
#   :document     - the document text
#   :words        - { word => { count:, tf: } }; count is the number of times
#                   the word's text occurs in the document, tf is
#                   count / number of words in the document
#   :words_length - number of words returned for the document
#   :dl           - words_length divided by the average number of words per
#                   document (0.0 when that average is not positive)
#
# Reads @all_word_length as the total word count. Returns the document list
# produced by Bm25::Utils.separate_document.
def create_docs
  doc_list = Bm25::Utils.separate_document(@base_document)

  # Computed once, with float division: integer division truncated the
  # average (and could make it 0, turning :dl into Infinity or NaN).
  average_word_length = @all_word_length.to_f / doc_list.length

  doc_list.each do |d|
    total_words = Bm25::Utils.separate_words(d)

    word_map = total_words.uniq.each_with_object({}) do |w, map|
      # Word count: literal occurrences of the word in the document text.
      # NOTE(review): this is a substring count, not a token count - confirm
      # that is intended for the tokenizer in use.
      count = d.scan(/#{Regexp.escape(w)}/).length
      map[w] = { count: count, tf: count.to_f / total_words.length }
    end

    # `> 0` is also false for NaN, so an empty input never yields NaN here.
    dl = average_word_length > 0 ? total_words.length / average_word_length : 0.0

    @docs << {
      document: d,
      words: word_map,
      words_length: total_words.length,
      dl: dl
    }
  end
end
create_idf_map()
click to toggle source
# File lib/bm25/parser.rb, line 65
# Fills @idf_map with one entry per distinct word found in @docs:
#
#   word => { df: number of documents whose :words contain the word,
#             idf: ln(N / df) + 1 }   # N = number of documents
#
# Returns the array of distinct words in first-seen order.
def create_idf_map
  # Single pass over the documents to count document frequency per word.
  doc_freq = Hash.new(0)
  @docs.each do |d|
    d[:words].each_key { |word| doc_freq[word] += 1 }
  end

  # Float so N / df is not truncated before taking the log.
  doc_count = @docs.length.to_f

  words = doc_freq.keys
  words.each do |word|
    # df is always >= 1 here because every word was collected from @docs.
    df = doc_freq[word]
    @idf_map[word] = { df: df, idf: Math.log(doc_count / df) + 1 }
  end
end
execute(document)
click to toggle source
# File lib/bm25/parser.rb, line 21
# Entry point: resets the parser state for +document+, builds the dataset
# with create_data and returns the result of get_important_keyword on it.
#
# document - the text to analyse; must respond to #empty?.
#
# Raises RuntimeError (message asks the caller to pass some text) when
# +document+ is nil or empty.
def execute(document)
  raise '文字を渡してください' if document.nil? || document.empty?

  # Reset everything a previous run may have left behind.
  @idf_map = {}
  @docs = []
  @base_document = document
  @all_word_length = Bm25::Utils.separate_words(document).length

  get_important_keyword(create_data)
end
get_dataset()
click to toggle source
# File lib/bm25/parser.rb, line 83
# Scores every word of every entry in @docs.
#
# For word i in document j:
#
#   bm25 = [ TF(i,j) * IDF(i) * (K1 + 1) ] / [ K1 * (1 - b + b * NDL(j)) + TF(i,j) ]
#
# where TF comes from the entry's :words, IDF from @idf_map and NDL from the
# entry's :dl.
#
# k1 - term-frequency saturation constant (default 1.2).
# b  - length-normalisation constant (default 0.75).
#
# Returns an Array with one Hash per document:
#   { document: text,
#     words: [{ word:, tf:, idf:, tfidf:, bm25: }, ...] }  # highest bm25 first
def get_dataset(k1: 1.2, b: 0.75)
  @docs.map do |d|
    # Depends only on the document, so compute it once per document.
    length_norm = k1 * (1 - b + (b * d[:dl]))

    scored = d[:words].map do |word, stats|
      tf = stats[:tf]
      idf = @idf_map[word][:idf]
      tfidf = idf * tf
      {
        word: word,
        tf: tf,
        idf: idf,
        tfidf: tfidf,
        # The numerator is a product; it was previously written as a sum.
        bm25: (tfidf * (k1 + 1)) / (length_norm + tf)
      }
    end

    { document: d[:document], words: scored.sort_by { |w| -w[:bm25] } }
  end
end
get_important_keyword(dataset)
click to toggle source
# File lib/bm25/parser.rb, line 108
# Adds up each word's :bm25 score across all documents in +dataset+ and ranks
# the words by that total.
#
# dataset - Array of Hashes whose :words value is an Array of Hashes carrying
#           :word and :bm25 (the shape returned by get_dataset).
#
# Returns an Array of [word, total_bm25] pairs, highest total first.
def get_important_keyword(dataset)
  totals = Hash.new(0)
  dataset.each do |data|
    data[:words].each { |entry| totals[entry[:word]] += entry[:bm25] }
  end

  totals.sort { |(_, left), (_, right)| right <=> left }
end