class Yanbi::Bayes
Public Class Methods
default(*categories)
click to toggle source
# File lib/bayes/bayes.rb, line 34
# Convenience constructor: builds a classifier for the given +categories+
# with WordBag passed as the document class.
def self.default(*categories)
  new(WordBag, *categories)
end
load(fname)
click to toggle source
# File lib/bayes/bayes.rb, line 38
# Restores a classifier from a YAML file.
#
# fname:: path of the file without its ".obj" extension (appended here).
#
# Returns the deserialized object. Raises LoadError when the file does not
# contain an instance of this class.
#
# SECURITY: this deserializes arbitrary Ruby objects from YAML. Only load
# files that come from a trusted source.
def self.load(fname)
  data = File.read(fname + ".obj")
  # Psych 4 (Ruby 3.1+) turned YAML.load into a safe load that rejects
  # custom classes and symbols, so a dumped classifier cannot be read with
  # it. Use unsafe_load where it exists and fall back to load on older Psych.
  c = YAML.respond_to?(:unsafe_load) ? YAML.unsafe_load(data) : YAML.load(data)
  raise LoadError unless c.is_a? self
  c
end
new(klass, *categories)
click to toggle source
# File lib/bayes/bayes.rb, line 18
# Sets up an empty classifier.
#
# klass::      document class; only the last segment of its name is stored
#              (as a String) in @bag_class.
# categories:: two or more category names (anything responding to to_sym).
#
# Raises ArgumentError when fewer than two categories are given.
def initialize(klass, *categories)
  raise ArgumentError, 'at least two categories are required' unless categories.size > 1

  # Store the names as symbols: the count tables below are keyed by symbol,
  # and the stored list is later used as-is to look those tables up.
  @categories = categories.map(&:to_sym)
  @category_counts = {}
  @document_counts = {}
  @category_sizes = {}

  @categories.each do |cat|
    @category_counts[cat] = {}
    @document_counts[cat] = 0
  end

  @bag_class = klass.to_s.split('::').last
end
Public Instance Methods
classify(document)
click to toggle source
# File lib/bayes/bayes.rb, line 62
# Returns the category with the highest score for +document+, or nil when
# the document reports itself as empty.
def classify(document)
  return if document.empty?

  winner = scores(document).max_by { |_category, weight| weight }
  winner.first
end
classify_raw(text)
click to toggle source
# File lib/bayes/bayes.rb, line 78
# Wraps +text+ in a document via #newdoc and classifies it.
def classify_raw(text)
  document = newdoc(text)
  classify(document)
end
newdoc(doc)
click to toggle source
# File lib/bayes/bayes.rb, line 95
# Instantiates the document class named by @bag_class (looked up under the
# Yanbi namespace) with +doc+ as its only argument.
def newdoc(doc)
  bag = Yanbi.const_get(@bag_class)
  bag.new(doc)
end
save(name)
click to toggle source
# File lib/bayes/bayes.rb, line 44
# Serializes this classifier as YAML into the file "<name>.obj".
def save(name)
  path = name + ".obj"
  File.open(path, 'w') { |out| YAML.dump(self, out) }
end
scores(document)
click to toggle source
# File lib/bayes/bayes.rb, line 68
# Returns a Hash mapping every entry of @categories to its score for
# +document+.
def scores(document)
  @categories.each_with_object({}) do |category, result|
    result[category] = score(category, document)
  end
end
scores_raw(text)
click to toggle source
# File lib/bayes/bayes.rb, line 82
# Wraps +text+ in a document via #newdoc and returns its per-category scores.
def scores_raw(text)
  document = newdoc(text)
  scores(document)
end
set_significance(cutoff, category=nil)
click to toggle source
# File lib/bayes/bayes.rb, line 86
# Removes every word whose count is below +cutoff+ and refreshes the cached
# category size afterwards.
#
# cutoff::   minimum count a word must have to be kept.
# category:: a single category to prune; when nil, all categories are pruned.
#
# Returns the list of categories that were processed.
# Raises ArgumentError when +category+ is not one of the known categories.
def set_significance(cutoff, category=nil)
  targets = category.nil? ? @categories : [category]
  targets.each do |name|
    cat = name.to_sym
    counts = @category_counts[cat]
    # Fail with a clear message instead of a NoMethodError on nil.
    raise ArgumentError, "unknown category: #{name.inspect}" if counts.nil?

    counts.reject! { |_word, count| count < cutoff }
    @category_sizes[cat] = category_size(cat)
  end
end
train(category, document)
click to toggle source
# File lib/bayes/bayes.rb, line 50
# Records one training document for +category+.
#
# Each distinct word of document.words is counted once per document, the
# category's document counter is incremented, and the cached category size
# is recomputed.
def train(category, document)
  cat = category.to_sym
  @document_counts[cat] += 1

  tallies = @category_counts[cat]
  document.words.uniq.each do |word|
    tallies[word] = (tallies[word] || 0) + 1
  end

  @category_sizes[cat] = category_size(cat)
end
train_raw(category, text)
click to toggle source
# File lib/bayes/bayes.rb, line 74
# Wraps +text+ in a document via #newdoc and trains +category+ with it.
def train_raw(category, text)
  document = newdoc(text)
  train(category, document)
end
Private Instance Methods
category_size(cat)
click to toggle source
# File lib/bayes/bayes.rb, line 112
# Sums all word counts stored for +cat+; a category without any words
# yields 0.
def category_size(cat)
  total = @category_counts[cat].values.inject { |sum, count| sum + count }
  total.to_i
end
score(cat, document)
click to toggle source
# File lib/bayes/bayes.rb, line 101
# Computes the log-score of +document+ for category +cat+: the sum of
# word_prob over the document's distinct words plus the log of the share of
# training documents that belong to +cat+.
def score(cat, document)
  total_docs = @document_counts.values.reduce(:+).to_f
  # Seed the sum with 0.0 so a document without words scores as the
  # category share alone instead of failing on nil + Float.
  document_prob = document.words.uniq.map { |word| word_prob(cat, word) }.reduce(0.0, :+)
  document_prob + Math.log(@document_counts[cat] / total_docs)
end
word_prob(cat, word)
click to toggle source
# File lib/bayes/bayes.rb, line 107
# Returns the log of (count of +word+ in +cat+) / (cached size of +cat+).
# Words that were never counted for the category use 0.1 in place of a
# count.
#
# When the category has no cached size yet, or its size is 0, the result is
# -Infinity (the log of a zero probability). Dividing by a zero size would
# otherwise produce +Infinity, and a missing size would raise a TypeError.
def word_prob(cat, word)
  size = @category_sizes[cat].to_i
  return -Float::INFINITY if size.zero?

  counts = @category_counts[cat]
  count = counts.key?(word) ? counts[word].to_f : 0.1
  Math.log(count / size)
end