class Baobab::Dataset

Represents a dataset or a subset thereof. It is an array of hashes in which every hash contains the same keys.

Public Class Methods

from_json(filename) click to toggle source
# File lib/baobab/dataset.rb, line 14
# Builds a dataset from the JSON document stored at +filename+.
# The file is read in full, parsed as JSON, and the parsed value is
# passed to +new+ unchanged.
def self.from_json(filename)
    parsed = JSON.parse(File.read(filename))
    new(parsed)
end
new(data) click to toggle source

Receives an array of hashes. All hashes must contain the same keys.

# File lib/baobab/dataset.rb, line 8
# Fills the new dataset by appending every element of +data+, in order,
# to the receiver via +<<+.
def initialize(data)
    data.each { |entry| self << entry }
end

Public Instance Methods

attribute_names(class_var) click to toggle source
# File lib/baobab/dataset.rb, line 19
# Returns the column names of the dataset with +class_var+ filtered out,
# i.e. every column except the one compared equal (==) to +class_var+.
def attribute_names(class_var)
    column_names.reject { |column| column == class_var }
end
column_names() click to toggle source

Returns an array of the attribute names in the dataset. Careful: it’s empty on an empty set.

# File lib/baobab/dataset.rb, line 25
# Returns the keys of the first row, which serve as the dataset's
# column names.
#
# An empty dataset has no first row, so an empty array is returned
# instead of calling +keys+ on +nil+ (which raised NoMethodError).
def column_names
    return [] if empty?
    self[0].keys
end
column_values(attribute) click to toggle source

Returns an array of the distinct values that an attribute takes in the dataset. Careful: it’s empty on an empty set.

# File lib/baobab/dataset.rb, line 31
# Collects row[attribute] for every row and returns the distinct
# values in order of first appearance.
def column_values(attribute)
    values = map { |row| row[attribute] }
    values.uniq
end
entropy(class_var) click to toggle source
# File lib/baobab/dataset.rb, line 46
# Computes the probability of each distinct value of +class_var+ in this
# dataset and passes those probabilities, splatted, to Shannon.entropy,
# whose result is returned.
def entropy(class_var)
    probabilities = column_values(class_var).map do |class_val|
        probability(class_var, class_val)
    end
    Shannon.entropy(*probabilities)
end
probability(var, val) click to toggle source

Evaluates the probability that var equals val in this dataset, i.e. the fraction of rows whose var is val. Can also be used on subsets.

# File lib/baobab/dataset.rb, line 56
# Returns the fraction of rows whose +var+ entry equals +val+, as a Float.
#
# An empty dataset yields 0.0; the guard also avoids dividing by zero.
# The result is always a Float (the empty case used to return Integer 0).
def probability(var, val)
    total = count
    return 0.0 if total.zero?
    count { |row| row[var] == val }.fdiv(total)
end
subset(conditions) click to toggle source

Gets the subset of rows that satisfy all of the given conditions. Keys must be of the same type as in the dataset (be careful with symbols).

# File lib/baobab/dataset.rb, line 37
# Returns a new Dataset holding the rows for which every (var, val) pair
# in +conditions+ satisfies row[var] == val. With no conditions, every
# row is kept.
def subset(conditions)
    matching = select do |row|
        conditions.all? { |var, val| row[var] == val }
    end
    Dataset.new(matching)
end
validate() click to toggle source
# File lib/baobab/dataset.rb, line 64
# Checks that the dataset is non-empty and that every row's key list
# equals the first row's key list (compared as arrays, so key order
# matters).
#
# Raises RuntimeError 'Dataset is empty' or 'Dataset is inconsistent';
# returns nil when the dataset passes.
def validate
    raise 'Dataset is empty' if empty?

    expected_keys = self[0].keys
    each do |row|
        raise 'Dataset is inconsistent' unless expected_keys == row.keys
    end
    nil
end