class Baobab::DecisionTreeNode

Attributes

children[RW]

A list of nodes

conditions[R]

Accumulates the conditions down the tree

entropy[R]
next_attribute[RW]
probability[R]

The conditional probability that variable=value

tree[R]

The decision tree

value[R]

The value for the variable, a string

variable[R]

The variable on which this operates, a string

Public Class Methods

new(tree, parent, variable, value, entropy, conditions=nil) click to toggle source
# File lib/baobab/node.rb, line 27
def initialize tree, parent, variable, value, entropy, conditions=nil
    @tree = tree
    @parent = parent
    @variable = variable
    @value = value
    @entropy = entropy
    @conditions = conditions ? conditions : {}
    @children = Array.new
    @subset = nil
end

Public Instance Methods

build_subtree(recursive=true) click to toggle source
# File lib/baobab/node.rb, line 71
def build_subtree recursive=true
    if self.try_finish
        return
    end
    subset = self.subset
    subset_count = self.subset.count
    entropies = self.entropies
    inf_gain = entropies.each.with_object({}) do |(a, e), o|
        o[a] = @entropy - e
    end
    max_attr, max_gain = inf_gain.max_by{|v, g| g}
    self.next_attribute = max_attr
    @children += self.tree.dataset.column_values(max_attr).map do |value|
        conditions = self.full_conditions
        DecisionTreeNode.new(
            tree=@tree, parent=self,
            attribute=max_attr, value=value,
            entropy=entropies[max_attr], conditions=conditions
        )
    end
    if recursive
        @children.each do |c|
            c.build_subtree
        end
    end
end
clear() click to toggle source
# File lib/baobab/node.rb, line 51
def clear
    @subset = nil
end
entropies() click to toggle source

Returns a hash of {attribute, entropy} given that we divide the dataset on attribute.

# File lib/baobab/node.rb, line 176
def entropies
    self.pending_attrs.each.with_object({}) do |a, o|
        values = @tree.dataset.column_values(a)
        val_probabilities = values.each.with_object({}) do |v, o|
            o[v] = subset.probability a, v
        end
        val_entropies = values.each.with_object({}) do |v, o|
            o[v] = subset.subset({a => v}).entropy(self.tree.class_var)
        end
        o[a] = values.reduce(0) do |memo, v|
            memo + val_entropies[v] * val_probabilities[v]
        end
    end
end
full_conditions() click to toggle source

Returns a subset selected on all the parent node’s conditions plus this node’s attribute and its value.

# File lib/baobab/node.rb, line 63
def full_conditions
    if @variable
        conditions.merge({@variable => @value})
    else
        conditions
    end
end
most_common_value() click to toggle source

Returns the most common class value in the dataset.

# File lib/baobab/node.rb, line 162
def most_common_value
    class_var = @tree.class_var
    class_values = @tree.dataset.column_values(class_var)
    count = class_values.each.with_object({}) do |val, o|
        o[val] = 0
    end
    self.subset.each.with_object(count) do |row, o|
        count[row[class_var]] += 1
    end
    count.max_by{|v, c| c}[0]
end
pending_attrs() click to toggle source
# File lib/baobab/node.rb, line 55
def pending_attrs
    @tree.dataset.column_names.reject do |name|
        @tree.class_var == name or @conditions.include? name
    end
end
subset() click to toggle source
# File lib/baobab/node.rb, line 43
def subset
    unless @subset.nil?
       @subset
    else
        @subset = @tree.dataset.subset(self.full_conditions)
    end
end
to_s() click to toggle source
# File lib/baobab/node.rb, line 38
def to_s
    s = @variable ? "#{@variable} => #{@value}" : "ROOT"
    s += " (#{@entropy.round(3)})"
end
try_finish() click to toggle source

Checks whether any of three is true:

  • All elements in the subset belong to the same class value: a leaf with

this value is created.

  • There are no more attributes to be selected: a leaf with the most common

class value is selected

  • There are no more rows in the dataset: a leaf with the most common class

value in the parent node is created.

# File lib/baobab/node.rb, line 105
def try_finish
    var = self.tree.class_var
    val = (
        self.try_finish_single_value_class or
        self.try_finish_empty_subset or
        self.try_finish_no_more_attributes
        )
    if val
        @next_attribute = @tree.class_var
        self.children << DecisionTreeNode.new(
            tree=@tree, parent=self,
            variable=@tree.class_var, value=val,
            entropy=0, conditions=self.full_conditions
            )
    else
        false
    end
end
try_finish_empty_subset() click to toggle source

If the subset is empty, returns the most common value in the parent node’s subset.

# File lib/baobab/node.rb, line 153
def try_finish_empty_subset
    if self.subset.empty?
        @parent.most_common_value
    else
        nil
    end
end
try_finish_no_more_attributes() click to toggle source

If there are not more attributes, returns the most common class value; else, nil.

# File lib/baobab/node.rb, line 143
def try_finish_no_more_attributes
    if self.pending_attrs.empty? then
        self.most_common_value
    else
        nil
    end
end
try_finish_single_value_class() click to toggle source

If all class values are the same, returns that value; else, nil.

# File lib/baobab/node.rb, line 125
def try_finish_single_value_class
    if self.subset.any?
        v0 = self.subset[0][@tree.class_var]
        self.subset.slice(1...-1).reduce(v0) do |memo, row|
            if memo != row[@tree.class_var]
                return nil
            else
                memo
            end
        end
    else
        nil
    end

end