class TeRex::Classifier::Bayes

Attributes

category_counts[R]
messages[R]
total_words[R]

Public Class Methods

new(*categories) click to toggle source

categories = [{:tag => “Thing1”, :msg => “Thing1 message”}, {:tag => “Thing2”, :msg => “Thing2 message”}] initialize({:tag => “Refund”, :msg => “You'll get a refund”}, {:tag => “Nonrefund”, :msg => “You won't get a refund”})

# File lib/te_rex/bayes.rb, line 12
def initialize(*categories)
  @clasif = Hash.new
  @messages = Hash.new
  categories.each {|cat| @clasif[TeRex::Format.category_term(cat[:tag])] = Hash.new}
  categories.each {|cat| @messages[cat[:tag]] = cat[:msg]}
  @total_words = 0
  @category_counts = Hash.new(0)
end

Public Instance Methods

categories() click to toggle source
# File lib/te_rex/bayes.rb, line 57
def categories
  @classif.keys.collect {|c| c.to_s}
end
classifications(text) click to toggle source
# File lib/te_rex/bayes.rb, line 33
def classifications(text)
  score = Hash.new
  training_count = @category_counts.values.inject {|x,y| x+y}.to_f

  @clasif.each do |category, category_words|
    score[category.to_s] = 0
    total = category_words.values.inject(0) {|sum, element| sum+element}
    BayesData.index_frequency(text).each do |word, count|
      s = category_words.has_key?(word) ? category_words[word] : 0.1
      score[category.to_s] += Math.log(s/total.to_f)
    end

    k = @category_counts.has_key?(category) ? @category_counts[category] : 0.1
    score[category.to_s] += Math.log(k/training_count)
  end

  score
end
classify(text) click to toggle source
# File lib/te_rex/bayes.rb, line 52
def classify(text)
  tag = (classifications(text).sort_by{|a| -a[1]})[0][0]
  [tag, @messages[tag]]
end
train(ctgry, text) click to toggle source
# File lib/te_rex/bayes.rb, line 21
def train(ctgry, text)
  category = TeRex::Format.category_term(ctgry)
  @category_counts[category] += 1

  BayesData.index_frequency(text).each do |word, count|
    @clasif[category][word] ||= 0
    @clasif[category][word] += count

    @total_words += count
  end
end
training_description() click to toggle source
# File lib/te_rex/bayes.rb, line 61
def training_description
  max_threshold = (@total_words/self.category_counts.keys.count).to_f
  tmp = []
  @clasif.each_pair do |term,val|
    cc = self.category_counts[term]
    train_ratio = (@total_words/cc).to_f
    tmp << [(train_ratio >= max_threshold), term, "description" => {"training_ratio" => "#{train_ratio}", "threshold" => "#{max_threshold}", "category_counts" => "#{cc}", "total_words" => "#{@total_words}"}]
  end
  tmp
end
under_trained?() click to toggle source
# File lib/te_rex/bayes.rb, line 72
def under_trained?
  training_description.select {|ut| ut.first == true}
end