class Zipfy::Zipf
Constants
- TAB
Attributes
distribution[R]
total[R]
zipf_constant[R]
Public Instance Methods
calculate_std_dev_from_reg()
click to toggle source
Requires calculate_zipfness to have been called first, so that every entry has its zip_number set.
# File lib/zipfy.rb, line 101
# Average absolute difference between each entry's zip_number and the
# ideal Zipf value 1/rank.
#
# Despite the method name this is a mean absolute deviation, not a
# standard deviation. Rank is derived from position in @distribution:
# the last element is treated as rank 1 and the first as rank +length+.
#
# Every entry's zip_number must already be set (calculate_zipfness
# assigns it); this method does not compute it.
#
# Returns a Float, or 0.0 when the distribution is empty.
def calculate_std_dev_from_reg
  length = @distribution.length
  # Nothing to compare against; avoids 0.0 / 0.0, which would yield NaN.
  return 0.0 if length.zero?

  sum = 0.0
  @distribution.each_with_index do |wd, i|
    theoretical = 1.0 / (length - i) # ideal value for this position's rank
    sum += (theoretical - wd.zip_number).abs
  end
  sum / length
end
calculate_zipf_constant()
click to toggle source
# File lib/zipfy.rb, line 87
# Computes and stores the scaling constant for the current distribution.
#
# The constant is total / (frequency of the last entry of @distribution).
# sort_distribution orders entries by ascending frequency, so after
# create_distribution the last entry is the most common word (rank 1).
# Multiplying that word's share of the corpus (frequency / total) by the
# constant gives exactly 1, i.e. the ideal Zipf value 1/rank for rank 1.
#
# Returns the constant as a Float; it is also stored in @zipf_constant.
def calculate_zipf_constant
  top_frequency = @distribution.last.frequency.to_f
  @zipf_constant = @total.to_f / top_frequency
end
calculate_zipfness()
click to toggle source
# File lib/zipfy.rb, line 58
# Assigns a zip_number to every entry of @distribution.
#
# zip_number = frequency * zipf_constant / total. The factor
# zipf_constant / total is identical for all entries, so it is computed
# once up front. If @zipf_constant has not been set yet it is obtained by
# calling calculate_zipf_constant first.
#
# Returns an Array of the assigned zip numbers, in distribution order.
def calculate_zipfness
  calculate_zipf_constant unless @zipf_constant
  scale = @zipf_constant / @total.to_f
  # map (not each) is deliberate: the list of assigned values is the return value.
  @distribution.map { |wd| wd.zip_number = scale * wd.frequency }
end
create_distribution(words)
click to toggle source
Sets the instance variable distribution to the frequency distribution of the given set of words.
# File lib/zipfy.rb, line 38
# Builds the frequency distribution for +words+.
#
# Resets @distribution, @total and @zipf_constant, counts how often each
# word occurs, wraps every (word, count) pair in a WordData and finally
# calls sort_distribution.
#
# words - a collection of words responding to +each+.
#
# Returns whatever sort_distribution returns.
def create_distribution(words)
  @distribution = []
  @total = 0
  @zipf_constant = nil # a previously computed constant no longer applies

  counts = Hash.new(0)
  words.each do |word|
    counts[word] += 1
    @total += 1
  end

  counts.each { |word, count| @distribution << WordData.new(word, count) }
  sort_distribution
end
load_file(file_path)
click to toggle source
Parses a file into an array of words
# File lib/zipfy.rb, line 22
# Reads the file at +file_path+ and returns its words.
#
# The content is split on whitespace, commas and hyphens; every token then
# has all non-word characters (regex \W) removed and is lower-cased.
# Tokens that end up empty are dropped.
#
# Returns an Array of Strings.
def load_file(file_path)
  raw_tokens = File.open(file_path, 'r') { |f| f.read.to_s.split(/[\s,-]/) }
  cleaned = raw_tokens.map { |token| token.gsub(/\W+/, '').downcase }
  cleaned.reject { |token| token == "" }
end
puts_distribution()
click to toggle source
# File lib/zipfy.rb, line 71
# Prints the distribution to standard output, one entry per line:
# word, frequency, rank (length - index) and zip_number, separated by TAB.
#
# A header is emitted first through +underline+ (defined outside this
# view; presumably it prints the text underlined -- confirm).
#
# Returns @distribution (the result of each_with_index).
def puts_distribution
  underline(" " * 6 + "Word Distirbution" + " " * 6)
  length = @distribution.length
  @distribution.each_with_index do |wd, i|
    # Longer words get fewer TABs so the columns roughly line up. Clamp to
    # one TAB: without it, words of 32+ characters lost their separator and
    # words of 40+ characters made `TAB * tabs` raise ArgumentError.
    tabs = [4 - wd.word.length / 8, 1].max
    puts "#{wd.word}#{TAB * tabs}#{wd.frequency}#{TAB}#{length - i}#{TAB}#{wd.zip_number}"
  end
end
save_dist_to_file(file_path, overwrite_file = false)
click to toggle source
# File lib/zipfy.rb, line 80
# Guards against clobbering an existing file.
#
# file_path      - path the distribution is meant to be saved to.
# overwrite_file - when true, an existing file at +file_path+ is allowed
#                  to be overwritten (default: false).
#
# If +file_path+ already exists and overwriting was not requested, prints
# a message and terminates the process via +exit+. Otherwise returns nil.
#
# NOTE(review): this method only performs the overwrite check; no code
# here actually writes the distribution to disk.
def save_dist_to_file(file_path, overwrite_file = false)
  # Refuse only when the file exists AND the caller did not ask to
  # overwrite. File.exist? replaces File.exists?, removed in Ruby 3.2.
  if File.exist?(file_path) && !overwrite_file
    puts "Requires --force to overwrite existing file '#{file_path}'"
    exit
  end
end
sort_distribution()
click to toggle source
# File lib/zipfy.rb, line 54
# Replaces @distribution with a copy ordered by ascending frequency
# (each frequency is coerced with to_i for the comparison), so the most
# frequent entry ends up last.
#
# Returns the newly sorted Array.
def sort_distribution
  ascending = @distribution.sort_by { |entry| entry.frequency.to_i }
  @distribution = ascending
end