class Splam::Ngram

Public Class Methods

new(site_id=nil) click to toggle source
# File lib/splam/ngram.rb, line 26
def initialize site_id=nil
  @site_id = site_id
end
trigram(text) click to toggle source
# File lib/splam/ngram.rb, line 3
def self.trigram text
  # this won't be utf-8 happy. Oh well!
  words = text.gsub("'", "").split(/\W/)
  hash = Hash.new 0
  i = 0
  while (i < words.length)
    tri = []
    count = 0
    while ((words.length > i + count) && (tri.length < 3))
      word = words[i + count]
      if word && word != ""
        tri << words[i + count]
      end
      count += 1
    end
    if tri.length == 3
      hash[tri.join(' ')] += 1
    end
    i += 1
  end
  hash
end

Public Instance Methods

compare(text) click to toggle source
# File lib/splam/ngram.rb, line 48
def compare text
  tri = self.class.trigram(text)
  score = 0
  spam = 0
  
  ham_key = @site_id ? "ham-#{@site_id}" : "ham"
  spam_key = @site_id ? "spam-#{@site_id}" : "spam"

  @ham_tri = Hash.new 0
  @spam_tri = Hash.new 0

  tri.each do |key,value|
    next if key.nil? || key.strip == ""
    hmatch = REDIS.hget(ham_key, key).to_i #  ham_tri[key]
    smatch = REDIS.hget(spam_key, key).to_i  # spam_tri[key]

    if hmatch > 0 && smatch > 0
      # tri appears in both
      # ignore.
      next
    end
    if hmatch > 0
      score += hmatch + value
    elsif smatch > 0
      spam += smatch + value
    end
  end
  [score, spam]
end
train(words, spam = false, retrain = false) click to toggle source

Train the temporary corpus with your data

# File lib/splam/ngram.rb, line 31
def train words, spam = false, retrain = false
  if words.is_a?(String)
    words = self.class.trigram(words)
  end
  words.each do |word,value|
    key = spam ? "spam" : "ham"
    REDIS.hincrby key, word, value
    REDIS.hincrby "#{key}-#{@site_id}", word, value if @site_id
    if retrain
      # Remove phrases from existing corpus
      key = spam ? "ham" : "spam"
      REDIS.hincrby key, word, -value
      REDIS.hincrby "#{key}-#{@site_id}", word, -value if @site_id
    end
  end
end