class Zipfy::Zipf

Constants

TAB

Attributes

distribution[R]
total[R]
zipf_constant[R]

Public Instance Methods

calculate_std_dev_from_reg() click to toggle source

Zipfness must be calculated first (see calculate_zipfness).

# File lib/zipfy.rb, line 101
# Returns the mean absolute deviation between each entry's zip_number and
# the ideal Zipf value 1/rank.
#
# Rank is derived from position: the entry at index i is given rank
# (length - i), so the last entry of @distribution has rank 1. Every entry
# must already carry a numeric zip_number (see calculate_zipfness).
def calculate_std_dev_from_reg
    entry_count = @distribution.length

    gaps = @distribution.each_with_index.map do |entry, index|
        expected = 1.0 / (entry_count - index).to_f
        (expected - entry.zip_number).abs
    end

    # Plain left-to-right accumulation, then average over all entries.
    gaps.reduce(0.0) { |running, gap| running + gap } / entry_count.to_f
end
calculate_zipf_constant() click to toggle source
# File lib/zipfy.rb, line 87
# Computes, stores (in @zipf_constant) and returns the Zipf constant.
#
# The last entry of @distribution is treated as the rank-1 (most common)
# word, whose Zipf value should be 1/1 = 1. The constant is the factor that
# scales that word's share of the text up to 1:
#
#   (frequency of last entry / total words) * constant = 1
#   constant = total words / frequency of last entry
def calculate_zipf_constant
    top_entry = @distribution.last
    @zipf_constant = @total.to_f / top_entry.frequency.to_f
end
calculate_zipfness() click to toggle source
# File lib/zipfy.rb, line 58
# Assigns a zip_number to every entry of @distribution and returns the
# assigned values as an array, in distribution order.
#
# zip_number = frequency * zipf_constant / total. The Zipf constant is
# computed on demand when it has not been set yet.
def calculate_zipfness
    calculate_zipf_constant unless @zipf_constant

    # Per-occurrence weight shared by every entry: zipf_constant / total.
    scale = @zipf_constant / @total.to_f

    @distribution.map { |entry| entry.zip_number = scale * entry.frequency }
end
create_distribution(words) click to toggle source

Sets the instance variable distribution to the frequency distribution of the given set of words.

# File lib/zipfy.rb, line 38
# Rebuilds the word-frequency distribution from +words+.
#
# Resets @distribution, @total and @zipf_constant, counts one occurrence
# per element yielded by words.each, stores one WordData(word, count) per
# distinct word, and finally sorts via sort_distribution.
#
# words - collection of words; only #each is required.
#
# Returns whatever sort_distribution returns.
def create_distribution(words)
    @zipf_constant = nil
    @total = 0
    @distribution = []

    occurrences = Hash.new(0)
    words.each do |word|
        occurrences[word] += 1
        @total += 1
    end

    occurrences.each_pair do |word, count|
        @distribution << WordData.new(word, count)
    end

    sort_distribution
end
load_file(file_path) click to toggle source

Parses a file into an array of words

# File lib/zipfy.rb, line 22
# Reads the file at +file_path+ and returns its words as an array of
# lowercase strings.
#
# The text is split on whitespace, commas and hyphens; every remaining
# non-word character is stripped from each token; tokens that end up empty
# are dropped.
#
# file_path - path of the file to read.
def load_file(file_path)
    raw_tokens = File.read(file_path).split(/[\s,-]/)
    cleaned = raw_tokens.map { |token| token.gsub(/\W+/, '').downcase }
    cleaned.reject { |token| token == "" }
end
puts_distribution() click to toggle source
# File lib/zipfy.rb, line 71
# Prints the distribution to standard output, one line per entry:
# word, frequency, rank and zip_number, separated by TAB.
#
# A header is emitted first through +underline+. Rank is derived from
# position: the entry at index i gets rank (length - i).
def puts_distribution
    underline " "*6 + "Word Distirbution" + " "*6
    length = @distribution.length
    @distribution.each_with_index do |wd, i|
        # Shorter words get more padding tabs so the columns line up. Always
        # keep at least one tab: without the clamp, words of 32-39 characters
        # got no separator at all and words of 40+ characters produced a
        # negative count, which makes String#* raise ArgumentError.
        tabs = [4 - wd.word.length / 8, 1].max
        puts "#{wd.word}#{TAB * tabs}#{wd.frequency}#{TAB}#{length - i}#{TAB}#{wd.zip_number}"
    end
end
save_dist_to_file(file_path, overwrite_file = false) click to toggle source
# File lib/zipfy.rb, line 80
# Guards against clobbering an existing file before saving the distribution.
#
# When +file_path+ already exists and +overwrite_file+ is not set, prints a
# hint about --force and terminates the process via exit. In every other
# case it returns nil. No data is written by this method itself.
#
# file_path      - destination path to check.
# overwrite_file - true when overwriting an existing file is allowed
#                  (default: false).
def save_dist_to_file(file_path, overwrite_file = false)
    # Only an existing file that we are NOT allowed to overwrite is an error.
    # File.exist? is used because File.exists? was removed in Ruby 3.2.
    return unless File.exist?(file_path) && !overwrite_file

    puts "Requires --force to overwrite existing file '#{file_path}'"
    exit
end
sort_distribution() click to toggle source
# File lib/zipfy.rb, line 54
# Replaces @distribution with a copy ordered by ascending frequency
# (frequencies are compared as integers), so the most frequent entry ends
# up last. Returns the newly sorted array.
def sort_distribution
    ascending = @distribution.sort_by do |entry|
        entry.frequency.to_i
    end
    @distribution = ascending
end