class Rlid::NaiveBayesModels

> prova di una stringa molto lunga lunghissima davvero lunga yyyy default = 10 ita(99.97) : cat(0.026) : spa(0.0023) default = 1 ita(99.995) : cat(0.0045) : por(0.00019) default = 0.1 ita(99.9990) : cat(0.00086) : rum(3.7e-05)

Constants

CUTOFF

Number of top n-grams kept for each language.

FILENAME
MAX_STRING_LENGTH
OTHER

Special feature key under which the counts of all n-grams outside the stored set are aggregated.

Attributes

default_count[RW]
n[RW]

Public Class Methods

generate_models(file=nil, n=3) click to toggle source
# File lib/rlid/models/naive_bayes_models.rb, line 34
# Trains a fresh set of models and serializes it into the data directory.
#
# file:: name of the file inside DATA_DIRECTORY (defaults to FILENAME)
# n::    ngram length handed to the constructor (default 3)
#
# Prints progress messages to standard output.
def self.generate_models(file=nil, n=3)
  file ||= FILENAME
  path = "#{DATA_DIRECTORY}/#{file}"
  models = NaiveBayesModels.new(n)
  puts "Training started.."
  models.train
  # Marshal produces binary data: open the file in binary mode so that no
  # newline or encoding conversion can alter the dump on any platform.
  File.open(path, "wb") do |f|
    f.write Marshal.dump(models)
    puts "Models saved to #{path}"
  end
end
load(file=nil) click to toggle source
# File lib/rlid/models/naive_bayes_models.rb, line 45
# Restores a set of models previously saved into the data directory.
#
# file:: name of the file inside DATA_DIRECTORY (defaults to FILENAME)
#
# Returns the object stored in the file.
def self.load(file=nil)
  file ||= FILENAME
  # The dump is binary: read it with binread so that no newline or encoding
  # conversion is applied to the bytes.
  # NOTE(review): Marshal.load can instantiate arbitrary objects; only use
  # this on model files that come from a trusted source.
  Marshal.load(File.binread("#{DATA_DIRECTORY}/#{file}"))
end
new(n=3) click to toggle source
# File lib/rlid/models/naive_bayes_models.rb, line 29
# Creates an untrained set of models.
#
# n:: length of the ngrams the models work on (default 3)
#
# The count assigned to ngrams a language has never seen (default_count)
# starts at 1 and can be changed through its accessor.
def initialize(n=3)
  @n = n
  @default_count = 1
end

Public Instance Methods

probabilities(string) { |lang, prob| ... } click to toggle source
# File lib/rlid/models/naive_bayes_models.rb, line 50
# Yields, for every trained language, the product of the frequencies of the
# ngrams found in +string+.
#
# string:: the text to classify; anything that is not a String is rejected
#
# Yields +lang+ and +prob+ once per language. When no block is given an
# Enumerator over the same pairs is returned instead of failing half way
# through with a LocalJumpError.
#
# NOTE(review): InvalidArgument is not defined in the visible source;
# presumably it is provided elsewhere in the project - verify.
def probabilities(string)
  raise InvalidArgument unless string.is_a?(String)
  return enum_for(:probabilities, string) unless block_given?

  # Only the beginning of the string is analysed. The inclusive range keeps
  # MAX_STRING_LENGTH + 1 characters.
  # NOTE(review): possibly an off-by-one; left unchanged to preserve results.
  sample = string[0..MAX_STRING_LENGTH]
  @ngram_frequency.keys.each do |lang|
    prob = 1
    sample.each_ngram(@n) do |ngram|
      prob *= frequency_of(lang, ngram)
    end
    yield lang, prob
  end
end
probabilities_h(string) click to toggle source

Returns a hash mapping each language to the probability computed for the given string.

# File lib/rlid/models/naive_bayes_models.rb, line 67
# Collects the pairs yielded by #probabilities into a hash.
#
# string:: the text to classify
#
# Returns a hash of the form { lang => prob }.
def probabilities_h(string)
  {}.tap do |by_language|
    probabilities(string) { |lang, prob| by_language[lang] = prob }
  end
end
train() click to toggle source
# File lib/rlid/models/naive_bayes_models.rb, line 76
# Builds the per-language frequency tables from the ngram counts returned by
# get_ngram_counts.
#
# After the call the following instance variables are populated:
# @stored_ngrams::          the set returned by top_ngrams
# @ngram_frequency::        ngram_frequency[lang][ngram] = count; the counts
#                           of ngrams outside @stored_ngrams are summed
#                           under the OTHER key
# @total_ngrams_found::     total_ngrams_found[lang] = sum of all counts
# @total_ngrams_not_found:: total_ngrams_not_found[lang] = number of stored
#                           ngrams that never occurred for that language
#
# A pseudo language keyed by Language::NO_LANGUAGE_CODE is added at the end;
# it only has an OTHER entry. Progress is printed to standard output.
def train
  ngram_counts = get_ngram_counts
  # ngrams for which we want to store information (all languages)
  @stored_ngrams = top_ngrams(ngram_counts)

  puts "- processing ngrams"
  @ngram_frequency = {}
  @total_ngrams_found = {}
  @total_ngrams_not_found = {}

  ngram_counts.each do |lang, counts|
    frequencies = @ngram_frequency[lang] = Hash.new(0)
    @total_ngrams_found[lang] = 0
    counts.each do |ngram, count|
      if @stored_ngrams.include?(ngram)
        frequencies[ngram] = count
      else
        frequencies[OTHER] += count
      end
      @total_ngrams_found[lang] += count
    end

    @total_ngrams_not_found[lang] = (@stored_ngrams - frequencies.keys).size

    puts_info(lang)
  end

  # Add the "no language" pseudo model: its OTHER count is 1.5 times the
  # largest OTHER count among the real languages (integer arithmetic).
  # All three tables use the same key, so that total_ngrams and frequency_of
  # can always find the entries belonging to this pseudo language.
  no_language = Language::NO_LANGUAGE_CODE
  other_count = @ngram_frequency.values.map { |freq| freq[OTHER] }.max * 3 / 2
  @total_ngrams_found[no_language] = other_count
  @ngram_frequency[no_language] = { OTHER => other_count }
  @total_ngrams_not_found[no_language] = @stored_ngrams.size
end

Protected Instance Methods

frequency_of(lang, ngram) click to toggle source
# File lib/rlid/models/naive_bayes_models.rb, line 123
# Relative frequency of +ngram+ in the model of +lang+.
#
# Ngrams outside @stored_ngrams are looked up under the OTHER key. When the
# language has no entry for the resulting key, @default_count is used as its
# count. The count is divided by total_ngrams(lang) as a float.
def frequency_of(lang, ngram)
  key = @stored_ngrams.include?(ngram) ? ngram : OTHER
  # fetch ignores the hash's own default value, so a missing key always
  # falls back to @default_count
  count = @ngram_frequency[lang].fetch(key, @default_count)
  count.to_f / total_ngrams(lang)
end
total_ngrams(lang) click to toggle source
# File lib/rlid/models/naive_bayes_models.rb, line 119
# Denominator used by frequency_of for +lang+: the total count of ngrams
# found for the language plus @default_count for every stored ngram that was
# not found for it.
def total_ngrams(lang)
  default_mass = @total_ngrams_not_found[lang] * @default_count
  @total_ngrams_found[lang] + default_mass
end

Private Instance Methods

get_ngram_counts() click to toggle source

Gets the n-gram counts of every language and returns a hash such that ngram_counts[lang][ngram] = count.

# File lib/rlid/models/naive_bayes_models.rb, line 150
# Counts the ngrams of every corpus file handed out by
# Language.each_file("corpus").
#
# Returns a hash such that ngram_counts[lang][ngram] = count; ngrams that
# never occurred read as 0. Prints one progress line per language.
#
# The selection of the top ngrams is not done here: train derives
# @stored_ngrams from the returned counts through top_ngrams, so sorting
# every table in this method as well was duplicated work.
def get_ngram_counts
  ngram_counts = {}
  Language.each_file("corpus") do |file, lang|
    puts "- I'm learning #{lang}"
    counts = Hash.new(0) # unseen ngrams start from a count of 0
    file.read.each_ngram(@n) do |ngram|
      counts[ngram] += 1
    end
    ngram_counts[lang] = counts
  end
  ngram_counts
end
puts_info(lang) click to toggle source
# File lib/rlid/models/naive_bayes_models.rb, line 138
# Prints a one-line summary of the model of +lang+: the total number of
# ngrams, the percentage covered by default counts and the percentage that
# fell under OTHER. A default count of 1 is assumed for the computation.
def puts_info(lang)
  missing = @total_ngrams_not_found[lang]
  tot = @total_ngrams_found[lang] + missing
  # share of the total, as a percentage rounded to one decimal
  percent = lambda { |part| (100.0 * part / tot).round(1) }
  d = percent.call(missing)
  o = percent.call(@ngram_frequency[lang][OTHER])
  puts "  #{lang} processed tot:#{tot}, default:#{d}%, other:#{o}%"
end
top_ngrams(ngram_counts) click to toggle source

Extracts the top n-grams of every language and returns their union as a set.

# File lib/rlid/models/naive_bayes_models.rb, line 168
# Union of the most frequent ngrams of every language.
#
# ngram_counts:: hash such that ngram_counts[lang][ngram] = count
#
# For each language the ngrams are ordered by decreasing count and the first
# CUTOFF of them are kept. Returns a Set with the kept ngrams of all
# languages.
def top_ngrams(ngram_counts)
  ngram_counts.values.each_with_object(Set.new) do |counts, selected|
    # [ngram, count] pairs, highest count first
    ranked = counts.sort { |a, b| b[1] <=> a[1] }
    selected.merge(ranked[0...CUTOFF].map { |pair| pair[0] })
  end
end