class BayesNaiveJdp::Classifier

Public Class Methods

new() click to toggle source
# File lib/bayes_naive_jdp.rb, line 6
def initialize
        @custom_tokenizer = nil
        clear
end

Public Instance Methods

classify(document) click to toggle source
# File lib/bayes_naive_jdp.rb, line 19
def classify(document)
        tokens = tokenize(document)
        scores = {}
        
        @labels.each do |label, label_frequency|
                log_sum = 0         #      use logs to avoid floating point errors... underflow in particular
                tokens.each do |token|
                        token_freq = @tokens[token].values.inject(0) { |sum, count| sum + count }
                        if token_freq > 0
                                token_prob = @tokens[token][label].to_f / label_frequency
                                token_inverse_prob = (token_freq - @tokens[token][label]).to_f / (@training_set_size - label_frequency)
                                wordicity = token_prob / (token_prob + token_inverse_prob)
                                
                                # pull harder toward neutral (0.5) if we have a small sample size
                                # (by averaging adjustment_weight fake 0.5 scores and our actual scores)
                                adjustment_weight = 1
                                wordicity = (adjustment_weight * 0.5 + token_freq * wordicity) / (adjustment_weight + token_freq)

                                # avoid breaking log
                                wordicity = 0.01 if wordicity == 0
                                wordicity = 0.00 if wordicity == 1
                                log_sum += Math.log(1 - wordicity) - Math.log(wordicity)
                        end
                end
                scores[label] = 1 / (1 + Math.exp(log_sum));
        end

        winner = scores.max_by { |k, v| v }
        { 
                :winner => {:classification => winner[0], :confidence => winner[1] },
                :all_scores => scores
        }
end
tokenizer(&block) click to toggle source

supply a custom tokenizer as a block: String => [String]

# File lib/bayes_naive_jdp.rb, line 54
def tokenizer(&block)
        @custom_tokenizer = block
end
train(document, label) click to toggle source
# File lib/bayes_naive_jdp.rb, line 11
def train(document, label)
        @training_set_size += 1
        label_seen(label)
        tokenize(document).each do |token|
                token_seen(token, label)
        end
end

Protected Instance Methods

clear() click to toggle source
# File lib/bayes_naive_jdp.rb, line 60
def clear
        @labels = Hash.new(0)
        @tokens = Hash.new({})
        @training_set_size = 0
end
label_seen(label) click to toggle source
# File lib/bayes_naive_jdp.rb, line 73
def label_seen(label)
        @labels[label] += 1
end
token_seen(token, label) click to toggle source
# File lib/bayes_naive_jdp.rb, line 77
def token_seen(token, label)
        @tokens[token] = Hash.new(0) unless @tokens.has_key? token
        @tokens[token][label] += 1
end
tokenize(document) click to toggle source
# File lib/bayes_naive_jdp.rb, line 66
def tokenize(document)
        return @custom_tokenizer.call(document) if @custom_tokenizer && @custom_tokenizer.lambda?
                
        #    default tokenizer: strip punctuation, splits words, and take unique occurrences
        document.downcase.gsub(/[^a-zA-Z 0-9]/, '').split(' ').uniq
end