class ClassifierReborn::Bayes

Constants

CategoryNotFoundError

Attributes

threshold[RW]

Retrieve the current threshold value

Public Class Methods

new(*args) click to toggle source

The class can be created with one or more categories, each of which will be initialized and given a training method. E.g.,

b = ClassifierReborn::Bayes.new 'Interesting', 'Uninteresting', 'Spam'

Options available are:

language:         'en'                    Used to select language specific stop words
auto_categorize:  false                   When true, enables ability to dynamically declare a category; the default is true if no initial categories are provided
enable_threshold: false                   When true, enables a threshold requirement for classifition
threshold:        0.0                     Default threshold, only used when enabled
enable_stemmer:   true                    When false, disables word stemming
stopwords:        nil                     Accepts path to a text file or an array of words, when supplied, overwrites the default stopwords; assign empty string or array to disable stopwords
backend:          BayesMemoryBackend.new  Alternatively, BayesRedisBackend.new for persistent storage
# File lib/classifier-reborn/bayes.rb, line 27
def initialize(*args)
  @initial_categories = []
  options = { language:         'en',
              enable_threshold: false,
              threshold:        0.0,
              enable_stemmer:   true,
              backend:          BayesMemoryBackend.new
            }
  args.flatten.each do |arg|
    if arg.is_a?(Hash)
      options.merge!(arg)
    else
      @initial_categories.push(arg)
    end
  end

  unless options.key?(:auto_categorize)
    options[:auto_categorize] = @initial_categories.empty? ? true : false
  end

  @language            = options[:language]
  @auto_categorize     = options[:auto_categorize]
  @enable_threshold    = options[:enable_threshold]
  @threshold           = options[:threshold]
  @enable_stemmer      = options[:enable_stemmer]
  @backend             = options[:backend]

  populate_initial_categories

  if options.key?(:stopwords)
    custom_stopwords options[:stopwords]
  end
end

Public Instance Methods

add_category(category) click to toggle source

Allows you to add categories to the classifier. For example:

b.add_category "Not spam"

WARNING: Adding categories to a trained classifier will result in an undertrained category that will tend to match more criteria than the trained selective categories. In short, try to initialize your categories at initialization.

# File lib/classifier-reborn/bayes.rb, line 238
def add_category(category)
  category = CategoryNamer.prepare_name(category)
  @backend.add_category(category)
end
Also aliased as: append_category
append_category(category)
Alias for: add_category
categories() click to toggle source

Provides a list of category names For example:

b.categories
=>   ["This", "That", "The other"]
# File lib/classifier-reborn/bayes.rb, line 218
def categories
  category_keys.collect(&:to_s)
end
category_keys() click to toggle source

Provides a list of category keys as symbols For example:

b.categories
=>   [:This, :That, :"The other"]
# File lib/classifier-reborn/bayes.rb, line 226
def category_keys
  @backend.category_keys
end
classifications(text) click to toggle source

Returns the scores in each category the provided text. E.g.,

b.classifications "I hate bad words and you"
=>  {"Uninteresting"=>-12.6997928013932, "Interesting"=>-18.4206807439524}

The largest of these scores (the one closest to 0) is the one picked out by classify

# File lib/classifier-reborn/bayes.rb, line 121
def classifications(text)
  score = {}
  word_hash = Hasher.word_hash(text, @language, @enable_stemmer)
  if word_hash.empty?
    category_keys.each do |category|
      score[category.to_s] = Float::INFINITY
    end
    return score
  end
  category_keys.each do |category|
    score[category.to_s] = 0
    total = (@backend.category_word_count(category) || 1).to_f
    word_hash.each do |word, _count|
      s = @backend.word_in_category?(category, word) ? @backend.category_word_frequency(category, word) : 0.1
      score[category.to_s] += Math.log(s / total)
    end
    # now add prior probability for the category
    s = @backend.category_has_trainings?(category) ? @backend.category_training_count(category) : 0.1
    score[category.to_s] += Math.log(s / @backend.total_trainings.to_f)
  end
  score
end
classify(text) click to toggle source

Return the classification without the score

# File lib/classifier-reborn/bayes.rb, line 153
def classify(text)
  result, score = classify_with_score(text)
  result = nil if score < @threshold || score == Float::INFINITY if threshold_enabled?
  result
end
classify_with_score(text) click to toggle source

Returns the classification of the provided text, which is one of the categories given in the initializer along with the score. E.g.,

b.classify "I hate bad words and you"
=>  ['Uninteresting', -4.852030263919617]
# File lib/classifier-reborn/bayes.rb, line 148
def classify_with_score(text)
  (classifications(text).sort_by { |a| -a[1] })[0]
end
disable_threshold() click to toggle source

Dynamically disable threshold for classify results

# File lib/classifier-reborn/bayes.rb, line 171
def disable_threshold
  @enable_threshold = false
end
enable_threshold() click to toggle source

Dynamically enable threshold for classify results

# File lib/classifier-reborn/bayes.rb, line 166
def enable_threshold
  @enable_threshold = true
end
method_missing(name, *args) click to toggle source

Provides training and untraining methods for the categories specified in Bayes#new For example:

b = ClassifierReborn::Bayes.new 'This', 'That', 'the_other'
b.train_this "This text"
b.train_that "That text"
b.untrain_that "That text"
b.train_the_other "The other text"
Calls superclass method
# File lib/classifier-reborn/bayes.rb, line 202
def method_missing(name, *args)
  cleaned_name = name.to_s.gsub(/(un)?train_([\w]+)/, '\2')
  category = CategoryNamer.prepare_name(cleaned_name)
  if category_keys.include?(category)
    args.each { |text| eval("#{Regexp.last_match(1)}train(category, text)") }
  elsif name.to_s =~ /(un)?train_([\w]+)/
    raise StandardError, "No such category: #{category}"
  else
    super # raise StandardError, "No such method: #{name}"
  end
end
reset() click to toggle source
# File lib/classifier-reborn/bayes.rb, line 245
def reset
  @backend.reset
  populate_initial_categories
end
stemmer_disabled?() click to toggle source

Is word stemming disabled?

# File lib/classifier-reborn/bayes.rb, line 191
def stemmer_disabled?
  !@enable_stemmer
end
stemmer_enabled?() click to toggle source

Is word stemming enabled?

# File lib/classifier-reborn/bayes.rb, line 186
def stemmer_enabled?
  @enable_stemmer
end
threshold_disabled?() click to toggle source

is threshold processing disabled?

# File lib/classifier-reborn/bayes.rb, line 181
def threshold_disabled?
  !@enable_threshold
end
threshold_enabled?() click to toggle source

Is threshold processing enabled?

# File lib/classifier-reborn/bayes.rb, line 176
def threshold_enabled?
  @enable_threshold
end
train(category, text) click to toggle source

Provides a general training method for all categories specified in Bayes#new For example:

b = ClassifierReborn::Bayes.new 'This', 'That', 'the_other'
b.train :this, "This text"
b.train "that", "That text"
b.train "The other", "The other text"
# File lib/classifier-reborn/bayes.rb, line 67
def train(category, text)
  word_hash = Hasher.word_hash(text, @language, @enable_stemmer)
  return if word_hash.empty?
  category = CategoryNamer.prepare_name(category)

  # Add the category dynamically or raise an error
  unless category_keys.include?(category)
    if @auto_categorize
      add_category(category)
    else
      raise CategoryNotFoundError, "Cannot train; category #{category} does not exist"
    end
  end

  word_hash.each do |word, count|
    @backend.update_category_word_frequency(category, word, count)
    @backend.update_category_word_count(category, count)
    @backend.update_total_words(count)
  end
  @backend.update_total_trainings(1)
  @backend.update_category_training_count(category, 1)
end
untrain(category, text) click to toggle source

Provides a untraining method for all categories specified in Bayes#new Be very careful with this method.

For example:

b = ClassifierReborn::Bayes.new 'This', 'That', 'the_other'
b.train :this, "This text"
b.untrain :this, "This text"
# File lib/classifier-reborn/bayes.rb, line 97
def untrain(category, text)
  word_hash = Hasher.word_hash(text, @language, @enable_stemmer)
  return if word_hash.empty?
  category = CategoryNamer.prepare_name(category)
  word_hash.each do |word, count|
    next if @backend.total_words < 0
    orig = @backend.category_word_frequency(category, word) || 0
    @backend.update_category_word_frequency(category, word, -count)
    if @backend.category_word_frequency(category, word) <= 0
      @backend.delete_category_word(category, word)
      count = orig
    end

    @backend.update_category_word_count(category, -count) if @backend.category_word_count(category) >= count
    @backend.update_total_words(-count)
  end
  @backend.update_total_trainings(-1)
  @backend.update_category_training_count(category, -1)
end

Private Instance Methods

custom_stopwords(stopwords) click to toggle source

Overwrites the default stopwords for current language with supplied list of stopwords or file

# File lib/classifier-reborn/bayes.rb, line 259
def custom_stopwords(stopwords)
  unless stopwords.is_a?(Enumerable)
    if stopwords.strip.empty?
      stopwords = []
    elsif File.exist?(stopwords)
      stopwords = File.read(stopwords).force_encoding("utf-8").split
    else
      return # Do not overwrite the default
    end
  end
  Hasher::STOPWORDS[@language] = Set.new stopwords
end
populate_initial_categories() click to toggle source
# File lib/classifier-reborn/bayes.rb, line 252
def populate_initial_categories
  @initial_categories.each do |c|
    add_category(c)
  end
end