class Bm25::Parser

Public Class Methods

new() click to toggle source
# File lib/bm25/parser.rb, line 7
# Sets up an empty parser: no source text, no analysed documents, no IDF
# entries and a total word count of zero.
def initialize
  @all_word_length = 0
  @idf_map = {}
  @docs = []
  @base_document = ''
end

Public Instance Methods

create_data() click to toggle source
# File lib/bm25/parser.rb, line 14
# Runs the analysis steps in order (create_docs, then create_idf_map) and
# returns whatever get_dataset produces.
def create_data
  create_docs
  create_idf_map
  get_dataset
end
create_docs() click to toggle source
# File lib/bm25/parser.rb, line 37
# Splits @base_document into documents and appends one statistics record per
# document to @docs.
#
# Each record contains:
#   :document     - the document text
#   :words        - word => { count:, tf: }; count is the number of times the
#                   word occurs in the document text, tf is count divided by
#                   the number of words in the document
#   :words_length - number of words in the document
#   :dl           - document word count divided by the average word count per
#                   document (0.0 when that average is zero)
#
# Reads @all_word_length, which must already hold the word total of the whole
# text.
def create_docs
  doc_list = Bm25::Utils.separate_document(@base_document)

  # Same for every document, so computed once. Float division keeps the
  # fractional part of the average instead of truncating it.
  average_word_length = @all_word_length.to_f / doc_list.length

  doc_list.each do |d|
    total_words = Bm25::Utils.separate_words(d)
    word_map = {}
    total_words.each do |w|
      # A repeated word already has its statistics; skip the rescan.
      next if word_map.key?(w)

      # Word count
      count = d.scan(/#{Regexp.escape(w)}/).length
      word_map[w] = {
        count: count,
        tf: count.to_f / total_words.length
      }
    end
    @docs << {
      document: d,
      words: word_map,
      words_length: total_words.length,
      # Guard against a zero average so dl never becomes NaN or Infinity.
      dl: average_word_length.zero? ? 0.0 : total_words.length / average_word_length
    }
  end
end
create_idf_map() click to toggle source
# File lib/bm25/parser.rb, line 65
# Fills @idf_map with one entry per distinct word found in @docs:
#   word => { df:, idf: }
# df is the number of documents containing the word and
# idf is log(document count / df) + 1.
#
# @return [Array] the distinct words, in first-seen order
def create_idf_map
  # Each document lists a word at most once in its :words hash, so counting
  # keys across documents yields the document frequency in a single pass.
  document_frequency = Hash.new(0)
  @docs.each do |d|
    d[:words].each_key { |word| document_frequency[word] += 1 }
  end

  # Float so the ratio is not truncated by integer division before the log.
  total_docs = @docs.length.to_f

  document_frequency.keys.each do |word|
    df = document_frequency[word]
    @idf_map[word] = {
      df: df,
      idf: Math.log(total_docs / df) + 1
    }
  end
end
execute(document) click to toggle source
# File lib/bm25/parser.rb, line 21
# Analyses +document+ and returns its words ranked by score.
#
# Clears the state left by any earlier run, stores the text, records its total
# word count, then passes the result of create_data to get_important_keyword.
#
# @param document [String] the text to analyse
# @return whatever get_important_keyword returns for the generated dataset
# @raise [RuntimeError] when +document+ is nil or empty
def execute(document)
  # nil is rejected with the same error as an empty string rather than
  # failing with NoMethodError on nil.length.
  raise '文字を渡してください' if document.nil? || document.length < 1

  @all_word_length = 0
  @idf_map = {}
  @docs = []

  @base_document = document
  @all_word_length = Bm25::Utils.separate_words(document).length

  get_important_keyword(create_data)
end
get_dataset() click to toggle source
# File lib/bm25/parser.rb, line 83
# Scores every word of every document in @docs using the IDF values held in
# @idf_map.
#
# Formula:
#   [ TF(i,j) * IDF(i) * (K1 + 1) ] / [ K1 * (1 - b + (b * NDL(j))) + TF(i,j) ]
# where NDL(j) is the document's :dl value.
#
# @param k1 [Numeric] term-frequency saturation parameter
# @param b [Numeric] length-normalisation parameter
# @return [Array<Hash>] one { document:, words: } hash per document; :words is
#   an array of { word:, tf:, idf:, tfidf:, bm25: } sorted by :bm25, highest
#   first
def get_dataset(k1: 1.2, b: 0.75)
  @docs.map do |d|
    # The length-normalisation term depends only on the document.
    length_norm = k1 * (1 - b + (b * d[:dl]))

    scored_words = d[:words].map do |word, stats|
      idf = @idf_map[word][:idf]
      tfidf = idf * stats[:tf]
      {
        word: word,
        tf: stats[:tf],
        idf: idf,
        tfidf: tfidf,
        # The numerator is a product, as in the formula above.
        bm25: (tfidf * (k1 + 1)) / (length_norm + stats[:tf])
      }
    end

    {
      document: d[:document],
      words: scored_words.sort_by { |w| -w[:bm25] }
    }
  end
end
get_important_keyword(dataset) click to toggle source
# File lib/bm25/parser.rb, line 108
# Adds up each word's :bm25 score across every entry of +dataset+ and returns
# [word, total] pairs ordered from the highest total to the lowest.
def get_important_keyword(dataset)
  totals = dataset.each_with_object({}) do |entry, acc|
    entry[:words].each do |scored|
      word = scored[:word]
      score = scored[:bm25]
      acc[word] = acc[word] ? acc[word] + score : score
    end
  end

  totals.sort { |left, right| right.last <=> left.last }
end