class Rlid::NaiveBayesModels
Example:
> prova di una stringa molto lunga lunghissima davvero lunga yyyy
  default = 10:  ita(99.97) : cat(0.026) : spa(0.0023)
  default = 1:   ita(99.995) : cat(0.0045) : por(0.00019)
  default = 0.1: ita(99.9990) : cat(0.00086) : rum(3.7e-05)
Constants
- CUTOFF
Number of top n-grams kept for every language.
- FILENAME
- MAX_STRING_LENGTH
- OTHER
Special feature under which the counts of all n-grams that are not stored are accumulated.
Attributes
default_count[RW]
n[RW]
Public Class Methods
generate_models(file=nil, n=3)
click to toggle source
# File lib/rlid/models/naive_bayes_models.rb, line 34
#
# Trains a new NaiveBayesModels instance and saves it, serialized with
# Marshal, inside DATA_DIRECTORY.
#
# file:: name of the output file inside DATA_DIRECTORY; FILENAME is used
#        when nil.
# n::    n-gram size passed to NaiveBayesModels.new.
#
# Progress messages are printed to standard output.
def self.generate_models(file=nil, n=3)
  file ||= FILENAME
  path = "#{DATA_DIRECTORY}/#{file}"
  models = NaiveBayesModels.new(n)
  puts "Training started.."
  models.train
  # Marshal produces binary data: open the file in binary mode so that no
  # newline or encoding translation can alter the dump.
  File.open(path, "wb") do |f|
    f.write Marshal.dump(models)
  end
  puts "Models saved to #{path}"
end
load(file=nil)
click to toggle source
# File lib/rlid/models/naive_bayes_models.rb, line 45
#
# Reads a Marshal dump from DATA_DIRECTORY and returns the deserialized
# object.
#
# file:: name of the file inside DATA_DIRECTORY; FILENAME is used when nil.
#
# NOTE(review): Marshal.load can instantiate arbitrary objects, so only
# model files coming from a trusted source should be loaded.
def self.load(file=nil)
  file ||= FILENAME
  # The dump is binary data: read it without newline/encoding translation.
  Marshal.load(File.binread("#{DATA_DIRECTORY}/#{file}"))
end
new(n=3)
click to toggle source
# File lib/rlid/models/naive_bayes_models.rb, line 29
#
# Sets up an instance without any trained data.
#
# n:: n-gram size, stored in @n (3 when omitted).
#
# @default_count is initialized to 1.
def initialize(n=3)
  @n = n
  @default_count = 1
end
Public Instance Methods
probabilities(string) { |lang, prob| ... }
click to toggle source
# File lib/rlid/models/naive_bayes_models.rb, line 50
#
# For every language in @ngram_frequency, multiplies together the values
# of frequency_of(lang, ngram) over the n-grams (of size @n) of +string+
# and yields the language with the resulting product.
#
# string:: text to score; only its first MAX_STRING_LENGTH characters are
#          considered.
#
# Yields +lang+, +prob+ once per language. The product starts at 1, so a
# text without n-grams yields 1 for every language. When no block is
# given, an Enumerator over the same pairs is returned.
#
# Raises InvalidArgument when +string+ is not a String.
# NOTE(review): InvalidArgument is not defined in the visible source;
# confirm that the project defines it.
def probabilities(string)
  raise InvalidArgument unless string.is_a? String
  return enum_for(:probabilities, string) unless block_given?

  # Exclusive range: 0..MAX_STRING_LENGTH would keep one character more
  # than the maximum. The slice does not depend on the language, so it is
  # computed once.
  text = string[0...MAX_STRING_LENGTH]
  @ngram_frequency.keys.each do |lang|
    prob = 1
    text.each_ngram(@n) do |ngram|
      prob *= frequency_of(lang, ngram)
    end
    yield lang, prob
  end
end
probabilities_h(string)
click to toggle source
Returns a hash that maps each language to the probability yielded for it by probabilities.
# File lib/rlid/models/naive_bayes_models.rb, line 67
#
# Gathers the pairs yielded by #probabilities into a Hash.
#
# string:: text handed unchanged to #probabilities.
#
# Returns a Hash whose keys are the yielded languages and whose values are
# the corresponding probabilities.
def probabilities_h(string)
  {}.tap do |by_language|
    probabilities(string) { |lang, prob| by_language[lang] = prob }
  end
end
train()
click to toggle source
# File lib/rlid/models/naive_bayes_models.rb, line 76
#
# Builds the per-language statistics from the counts returned by
# get_ngram_counts. After the call these instance variables are set:
#
# @stored_ngrams::          the result of top_ngrams for those counts.
# @ngram_frequency::        ngram_frequency[lang][ngram] = count for stored
#                           ngrams; the counts of all other ngrams are
#                           summed under OTHER.
# @total_ngrams_found::     total_ngrams_found[lang] = sum of all counts
#                           seen for lang.
# @total_ngrams_not_found:: total_ngrams_not_found[lang] = number of stored
#                           ngrams that have no entry for lang.
#
# An extra entry keyed by Language::NO_LANGUAGE_CODE is added at the end:
# its only frequency is OTHER, set to 3/2 (integer arithmetic) of the
# largest OTHER count among the trained languages, and every stored ngram
# counts as not found for it.
#
# Prints progress to standard output and calls puts_info for each language.
def train
  ngram_counts = get_ngram_counts
  # ngrams for which we want to store information (all languages)
  @stored_ngrams = top_ngrams(ngram_counts)
  puts "- processing ngrams"
  @ngram_frequency = {}
  @total_ngrams_found = {}
  @total_ngrams_not_found = {}

  ngram_counts.each do |lang, counts|
    @ngram_frequency[lang] = Hash.new(0)
    @total_ngrams_found[lang] = 0
    counts.each do |ngram, count|
      if @stored_ngrams.include?(ngram)
        @ngram_frequency[lang][ngram] = count
      else
        @ngram_frequency[lang][OTHER] += count
      end
      @total_ngrams_found[lang] += count
    end
    not_found = (@stored_ngrams - @ngram_frequency[lang].keys).size
    @total_ngrams_not_found[lang] = not_found
    puts_info(lang)
  end

  # Add the "no language" entry. The same key is used in all three hashes,
  # so that lookups by the keys of @ngram_frequency always find the
  # matching totals.
  no_language = Language::NO_LANGUAGE_CODE
  other_count = @ngram_frequency.values.map { |freqs| freqs[OTHER] }.max * 3 / 2 # (* 1.5)
  @total_ngrams_found[no_language] = other_count
  @ngram_frequency[no_language] = { OTHER => other_count }
  @total_ngrams_not_found[no_language] = @stored_ngrams.size
end
Protected Instance Methods
frequency_of(lang, ngram)
click to toggle source
# File lib/rlid/models/naive_bayes_models.rb, line 123
#
# Relative frequency of +ngram+ for +lang+.
#
# An ngram that is not in @stored_ngrams is looked up as OTHER. When the
# language has no entry for the resulting key, @default_count is used as
# its count. The count is divided, as a Float, by total_ngrams(lang).
def frequency_of(lang, ngram)
  key = @stored_ngrams.include?(ngram) ? ngram : OTHER
  count = @ngram_frequency[lang].fetch(key, @default_count)
  count.to_f / total_ngrams(lang)
end
total_ngrams(lang)
click to toggle source
# File lib/rlid/models/naive_bayes_models.rb, line 119
#
# Denominator used for the frequencies of +lang+: the total count found
# for the language plus @default_count for each of its not-found ngrams.
def total_ngrams(lang)
  default_mass = @total_ngrams_not_found[lang] * @default_count
  @total_ngrams_found[lang] + default_mass
end
Private Instance Methods
get_ngram_counts()
click to toggle source
Gets all n-gram counts and returns a hash having: ngram_counts[lang][ngram] = count
# File lib/rlid/models/naive_bayes_models.rb, line 150
#
# Counts the n-grams (of size @n) of every file yielded by
# Language.each_file("corpus").
#
# Returns a Hash with counts[lang][ngram] = number of occurrences; the
# inner hashes default to 0 for missing ngrams.
#
# As a side effect @stored_ngrams is reset and filled with the first
# CUTOFF ngrams of each language, ranked by descending count. Progress is
# printed to standard output.
def get_ngram_counts
  @stored_ngrams = Set.new
  counts_by_lang = {}
  Language.each_file("corpus") do |file, lang|
    puts "- I'm learning #{lang}"
    tally = Hash.new(0) # unseen ngrams start at 0
    counts_by_lang[lang] = tally
    file.read.each_ngram(@n) { |ngram| tally[ngram] += 1 }
    # rank the [ngram, count] pairs by descending count
    ranked = tally.to_a.sort { |a, b| b[1] <=> a[1] }
    @stored_ngrams.merge(ranked[0...CUTOFF].map { |pair| pair[0] })
  end
  counts_by_lang
end
puts_info(lang)
click to toggle source
# File lib/rlid/models/naive_bayes_models.rb, line 138
#
# Prints a one-line summary for +lang+: the total (found + not found,
# i.e. assuming a default count of 1), the percentage of that total made
# up by not-found ngrams and the percentage made up by the OTHER count,
# both rounded to one decimal.
def puts_info(lang)
  missing = @total_ngrams_not_found[lang]
  tot = @total_ngrams_found[lang] + missing
  d = (100.0 * missing / tot).round(1)
  o = (100.0 * @ngram_frequency[lang][OTHER] / tot).round(1)
  puts " #{lang} processed tot:#{tot}, default:#{d}%, other:#{o}%"
end
top_ngrams(ngram_counts)
click to toggle source
Extracts the top CUTOFF n-grams of every language and returns them all together as a single Set.
# File lib/rlid/models/naive_bayes_models.rb, line 168
#
# Collects, for each per-language hash in +ngram_counts+, the first CUTOFF
# ngrams ranked by descending count.
#
# ngram_counts:: Hash whose values are hashes of ngram => count.
#
# Returns a Set with the union of the selected ngrams.
def top_ngrams(ngram_counts)
  ngram_counts.values.each_with_object(Set.new) do |counts, kept|
    # rank the [ngram, count] pairs by descending count
    ranked = counts.to_a.sort { |a, b| b[1] <=> a[1] }
    kept.merge(ranked[0...CUTOFF].map { |pair| pair[0] })
  end
end