class Yanbi::Bayes

Public Class Methods

default(*categories) click to toggle source
# File lib/bayes/bayes.rb, line 34
# Convenience constructor: builds a classifier whose document class is WordBag.
#
# @param categories [Array] category names, forwarded unchanged to the constructor
# @return the newly built classifier
def self.default(*categories)
  new(WordBag, *categories)
end
load(fname) click to toggle source
# File lib/bayes/bayes.rb, line 38
# Restores a classifier from the YAML file "<fname>.obj" (the naming used by #save).
#
# NOTE(security): this deserializes arbitrary Ruby objects from the file.
# Only load files that come from a trusted source.
#
# @param fname [String] file path without the ".obj" extension
# @return the deserialized classifier
# @raise [LoadError] if the file does not hold an instance of this class
def self.load(fname)
  data = File.read(fname + ".obj")

  # Psych 4 (bundled with Ruby 3.1+) turned YAML.load into a safe load that
  # refuses custom classes, so a saved classifier can no longer be read with
  # it. unsafe_load keeps the original behaviour; older Psych versions do not
  # define it, and there plain load is already the unrestricted loader.
  c = YAML.respond_to?(:unsafe_load) ? YAML.unsafe_load(data) : YAML.load(data)

  raise LoadError, "#{fname}.obj does not contain a #{self}" unless c.is_a? self
  c
end
new(klass, *categories) click to toggle source
# File lib/bayes/bayes.rb, line 18
# Sets up an empty classifier for the given categories.
#
# @param klass [#to_s] document class; only the last "::" segment of its name
#   is stored, and #newdoc later resolves that name under the Yanbi namespace
# @param categories [Array<#to_sym>] two or more category names
# @raise [ArgumentError] if fewer than two categories are given
def initialize(klass, *categories)
  raise ArgumentError, 'at least two categories are required' unless categories.size > 1

  # Store the categories as symbols: the count tables below are keyed by
  # symbol, and scoring looks them up using the entries of @categories.
  # Keeping raw strings here made scoring fail with nil lookups.
  @categories = categories.map(&:to_sym)
  @category_counts = {}
  @document_counts = {}
  @category_sizes = {}

  @categories.each do |cat|
    @category_counts[cat] = {}
    @document_counts[cat] = 0
  end

  @bag_class = klass.to_s.split('::').last
end

Public Instance Methods

classify(document) click to toggle source
# File lib/bayes/bayes.rb, line 62
# Picks the category with the highest score for a document.
#
# @param document [#empty?] a document accepted by #scores
# @return the best-scoring category, or nil when the document is empty
def classify(document)
  return if document.empty?

  top_entry = scores(document).max_by { |_category, weight| weight }
  top_entry.first
end
classify_raw(text) click to toggle source
# File lib/bayes/bayes.rb, line 78
# Classifies raw text: wraps it with #newdoc, then delegates to #classify.
#
# @param text the raw input handed to #newdoc
# @return whatever #classify returns for the wrapped document
def classify_raw(text)
  document = newdoc(text)
  classify(document)
end
newdoc(doc) click to toggle source
# File lib/bayes/bayes.rb, line 95
# Wraps input in the document class configured for this classifier.
#
# @param doc the value passed to the document class constructor
# @return a new instance of the class named by @bag_class, resolved under Yanbi
def newdoc(doc)
  bag = Yanbi.const_get(@bag_class)
  bag.new(doc)
end
save(name) click to toggle source
# File lib/bayes/bayes.rb, line 44
# Serializes this classifier as YAML into the file "<name>.obj".
#
# @param name [String] destination path without the ".obj" extension
def save(name)
  path = name + ".obj"
  File.open(path, 'w') { |io| YAML.dump(self, io) }
end
scores(document) click to toggle source
# File lib/bayes/bayes.rb, line 68
# Computes the score of a document for every known category.
#
# @param document the document handed to #score
# @return [Hash] category => score, in the order of @categories
def scores(document)
  @categories.each_with_object({}) do |category, result|
    result[category] = score(category, document)
  end
end
scores_raw(text) click to toggle source
# File lib/bayes/bayes.rb, line 82
# Scores raw text: wraps it with #newdoc, then delegates to #scores.
#
# @param text the raw input handed to #newdoc
# @return whatever #scores returns for the wrapped document
def scores_raw(text)
  document = newdoc(text)
  scores(document)
end
set_significance(cutoff, category=nil) click to toggle source
# File lib/bayes/bayes.rb, line 86
# Drops every word whose count is below +cutoff+ and refreshes the cached
# category size afterwards.
#
# @param cutoff [Numeric] words counted fewer times than this are removed
# @param category [#to_sym, nil] a single category to prune; nil prunes all
# @return [Array] the categories that were processed
def set_significance(cutoff, category=nil)
  targets = if category.nil? then @categories else [category] end

  targets.each do |name|
    key = name.to_sym
    @category_counts[key].delete_if { |_word, count| count < cutoff }
    @category_sizes[key] = category_size(key)
  end
end
train(category, document) click to toggle source
# File lib/bayes/bayes.rb, line 50
# Records one training document for a category.
#
# Each distinct word of the document is counted once, no matter how often it
# occurs in that document.
#
# @param category [#to_sym] the category the document belongs to
# @param document [#words] the document whose words are counted
# @return [Integer] the updated cached size of the category
def train(category, document)
  key = category.to_sym
  @document_counts[key] += 1

  tally = @category_counts[key]
  document.words.uniq.each { |term| tally[term] = (tally[term] || 0) + 1 }

  @category_sizes[key] = category_size(key)
end
train_raw(category, text) click to toggle source
# File lib/bayes/bayes.rb, line 74
# Trains on raw text: wraps it with #newdoc, then delegates to #train.
#
# @param category the category passed through to #train
# @param text the raw input handed to #newdoc
# @return whatever #train returns
def train_raw(category, text)
  document = newdoc(text)
  train(category, document)
end

Private Instance Methods

category_size(cat) click to toggle source
# File lib/bayes/bayes.rb, line 112
# Adds up all word counts stored for a category.
#
# @param cat [Symbol] key into @category_counts
# @return [Integer] the total count, 0 when the category has no words
def category_size(cat)
  @category_counts[cat].values.inject(0) { |total, count| total + count }.to_i
end
score(cat, document) click to toggle source
# File lib/bayes/bayes.rb, line 101
# Log-score of a document for one category: the sum of the per-word log
# values from #word_prob plus the log of the category's share of all
# training documents.
#
# @param cat [Symbol] key into @document_counts
# @param document [#words] the document to score; duplicate words count once
# @return [Float] the score (higher means more likely)
def score(cat, document)
  total_docs = @document_counts.values.reduce(:+).to_f

  # Seed the fold with 0.0 so a document without words contributes nothing
  # instead of producing nil (which made the addition below raise).
  # Adding 0.0 first leaves the result for non-empty documents unchanged.
  document_prob = document.words.uniq.map { |word| word_prob(cat, word) }.reduce(0.0, :+)

  document_prob + Math.log(@document_counts[cat] / total_docs)
end
word_prob(cat, word) click to toggle source
# File lib/bayes/bayes.rb, line 107
# Natural log of a word's weight within a category: its stored count divided
# by the cached category size. A word with no stored count uses 0.1 instead.
#
# @param cat [Symbol] key into @category_counts and @category_sizes
# @param word the word to look up
# @return [Float] the log of count / category size
def word_prob(cat, word)
  occurrences = @category_counts[cat].fetch(word, 0.1).to_f
  Math.log(occurrences / @category_sizes[cat])
end