class Aggregate

Implements aggregate statistics and maintains a configurable histogram for a set of given samples. Convenient for tracking high-throughput data.

Constants

VERSION

Attributes

count[R]

The current number of samples

max[R]

The maximum sample value

min[R]

The minimum sample value

outliers_high[R]

The number of samples falling above the highest valued histogram bucket

outliers_low[R]

The number of samples falling below the lowest valued histogram bucket

sum[R]

The sum of all samples

Public Class Methods

new(low=nil, high=nil, width=nil) click to toggle source

Create a new Aggregate that maintains a binary logarithmic histogram by default. Specifying values for low, high, and width configures the aggregate to maintain a linear histogram with (high - low)/width buckets

# File lib/aggregate.rb, line 29
# Create a new Aggregate.
#
# When low, high and width are all given, a linear histogram is maintained:
# samples in the range [low, high) are bucketed in multiples of width.
# If any of the three is nil, a binary logarithmic histogram is used instead.
#
# low   - smallest value covered by the linear histogram (inclusive)
# high  - upper bound of the linear histogram (exclusive)
# width - size of each linear bucket; must be positive and must evenly
#         divide (high - low)
#
# Raises ArgumentError when the linear specification is inconsistent.
def initialize(low=nil, high=nil, width=nil)
  @count = 0
  @sum = 0.0
  @sum2 = 0.0
  @outliers_low = 0
  @outliers_high = 0

  if low.nil? || high.nil? || width.nil?
    # Logarithmic histogram: @width stays nil, which is what linear? tests
    @low = 1
    @width = nil
    @high = to_bucket(@@LOG_BUCKETS - 1)
  else
    # Validate linear specification
    if high <= low
      raise ArgumentError, "High bucket must be > Low bucket"
    end

    # A zero width would divide by zero in the modulo check below and a
    # negative width would produce a negative bucket count
    unless width > 0
      raise ArgumentError, "Histogram width must be > 0"
    end

    if high - low < width
      raise ArgumentError, "Histogram width must be <= histogram range"
    end

    if 0 != (high - low).modulo(width)
      raise ArgumentError, "Histogram range (high - low) must be a multiple of width"
    end

    @low = low
    @high = high
    @width = width
  end

  # Initialize all buckets to 0
  @buckets = Array.new(bucket_count, 0)
end

Public Instance Methods

<<(data) click to toggle source

Include a sample in the aggregate

# File lib/aggregate.rb, line 68
# Include a sample in the aggregate.
#
# Updates min/max, the sample count and the running sums, then increments
# the histogram bucket for the sample unless outlier? reports it as out of
# range.
def <<(data)
  # The very first sample seeds both extremes; later ones only widen them
  first_sample = @count.zero?
  @max = data if first_sample || data > @max
  @min = data if first_sample || data < @min

  # Running totals (sum and sum of squares)
  @count += 1
  @sum += data
  @sum2 += data * data

  # Out-of-range samples are only tallied by outlier?, never bucketed
  return nil if outlier?(data)

  @buckets[to_index(data)] += 1
end
each() { |to_bucket(index), count| ... } click to toggle source

Iterate through each bucket in the histogram regardless of its contents

# File lib/aggregate.rb, line 187
# Iterate through every bucket in the histogram, empty or not, yielding
# the bucket's value (via to_bucket) followed by its sample count.
def each
  slots = @buckets
  slots.each_index { |slot| yield(to_bucket(slot), slots[slot]) }
end
each_nonzero() { |to_bucket(index), count| ... } click to toggle source

Iterate through only the buckets in the histogram that contain samples

# File lib/aggregate.rb, line 195
# Iterate through only those buckets that hold at least one sample,
# yielding the bucket's value (via to_bucket) followed by its sample count.
def each_nonzero
  @buckets.each_with_index do |tally, slot|
    next if tally == 0

    yield(to_bucket(slot), tally)
  end
end
mean() click to toggle source

The current average of all samples

# File lib/aggregate.rb, line 89
# The current average of all samples: the running sum divided by the
# number of samples seen so far.
def mean
  total = @sum
  samples = @count
  total / samples
end
skip_row(value_width) click to toggle source

We denote empty buckets with a '~'

# File lib/aggregate.rb, line 149
# Build the row that stands in for a run of empty buckets: a blank value
# column of the given width followed by a '~'.
def skip_row(value_width)
  template = "%#{value_width}s ~\n"
  format(template, " ")
end
std_dev() click to toggle source

Calculate the standard deviation

# File lib/aggregate.rb, line 94
# Calculate the sample standard deviation from the running count, sum and
# sum of squares.
#
# Returns a Float. With fewer than two samples the value is undefined and
# NaN is returned.
def std_dev
  return Float::NAN if @count < 2

  samples = @count.to_f
  total = @sum.to_f
  variance = (@sum2.to_f - ((total * total) / samples)) / (samples - 1)

  # Rounding in the sum-of-squares formula can leave a tiny negative
  # variance for (nearly) identical samples; Math.sqrt would raise on it
  variance = 0.0 if variance < 0
  Math.sqrt(variance)
end
to_s(columns=nil) click to toggle source

Generate a pretty-printed ASCII representation of the histogram

# File lib/aggregate.rb, line 107
# Generate a pretty-printed ASCII representation of the histogram.
#
# columns - total width of the output in characters; defaults to 80 and
#           may not be smaller than 80.
#
# Returns the rendered table as a String, or "Empty histogram" when no
# bucket holds a sample.
# Raises ArgumentError if columns is less than 80.
def to_s(columns=nil)
  # Default to an 80 column terminal, don't support < 80 for now
  columns = 80 if columns.nil?
  raise ArgumentError, "columns must be >= 80" if columns < 80

  # Collect [index, bucket value, count] for every non-empty bucket while
  # tracking the largest count and the total number of bucketed samples
  disp_buckets = []
  max_count = 0
  total = 0
  @buckets.each_with_index do |count, idx|
    next if 0 == count
    max_count = count if count > max_count
    total += count
    disp_buckets << [idx, to_bucket(idx), count]
  end

  # XXX: Better to print just header --> footer
  return "Empty histogram" if disp_buckets.empty?

  value_str = "value"
  count_str = "count"
  total_str = "Total"

  # Size the value column for the widest displayed value (a negative low
  # bucket can be wider than the last one) and for both labels
  value_width = disp_buckets.map { |_idx, value, _count| value.to_s.length }.max
  value_width = [value_width, value_str.length, total_str.length].max
  count_width = [total.to_s.length, count_str.length].max
  max_bar_width = columns - (value_width + " |".length + "| ".length + count_width)

  # Determine the value of a '@'
  weight = [max_count.to_f / max_bar_width.to_f, 1.0].max

  # Horizontal rule shared by header and footer
  rule = "-" * [max_bar_width, 0].max
  # Row that denotes one or more skipped empty buckets with a '~'
  gap_row = format("%#{value_width}s ~\n", " ")

  # Format the header
  histogram = format("%#{value_width}s |%s| %#{count_width}s\n", value_str, rule, count_str)

  # Emit one row per non-empty bucket, with a gap row wherever bucket
  # indexes are not consecutive
  prev_index = disp_buckets.first[0] - 1
  disp_buckets.each do |idx, value, count|
    histogram << gap_row unless prev_index == idx - 1
    prev_index = idx

    bar = ("@" * (count / weight).to_i).ljust(max_bar_width)
    histogram << format("%#{value_width}d |%s| %#{count_width}d\n", value, bar, count)
  end

  # End the table; trailing empty buckets also get a gap row
  histogram << gap_row if disp_buckets.last[0] != bucket_count - 1
  histogram << format("%#{value_width}s |%s| %#{count_width}d\n", total_str, rule, total)
end

Private Instance Methods

bucket_count() click to toggle source
# File lib/aggregate.rb, line 218
# Number of buckets in the histogram: (high - low) / width for a linear
# histogram, otherwise the fixed logarithmic bucket count.
def bucket_count
  return @@LOG_BUCKETS unless linear?

  span = @high - @low
  span / @width
end
linear?() click to toggle source
# File lib/aggregate.rb, line 203
# True when the histogram is linear, i.e. a bucket width has been set.
def linear?
  !@width.nil?
end
log2( x ) click to toggle source
# File lib/aggregate.rb, line 282
# Natural logarithm of x divided by the class-level @@LOG2_DIVEDEND
# constant (presumably Math.log(2), giving a base-2 logarithm - confirm
# against the constant's definition).
def log2(x)
  natural_log = Math.log(x)
  natural_log / @@LOG2_DIVEDEND
end
outlier?(data) click to toggle source
# File lib/aggregate.rb, line 207
# Check whether a sample falls outside the histogram range [@low, @high)
# and, if so, count it against the matching outlier tally.
#
# Returns false for an in-range sample; otherwise returns the updated
# outlier counter (which is truthy).
def outlier?(data)
  below = data < @low
  return false unless below || data >= @high

  below ? (@outliers_low += 1) : (@outliers_high += 1)
end
right_bucket?(index, data) click to toggle source
# File lib/aggregate.rb, line 234
# Test whether data belongs in the linear bucket at index, i.e. whether it
# lies at or above that bucket's value and below the value one width higher.
#
# Raises (bare raise) when the histogram is not linear.
def right_bucket?(index, data)
  # Check invariant: only linear histograms have fixed-width buckets
  raise unless linear?

  lower = to_bucket(index)
  return false unless lower <= data

  data < lower + @width
end
to_bucket(index) click to toggle source
# File lib/aggregate.rb, line 226
# Translate a bucket index into the value that bucket starts at:
# @low plus index widths for a linear histogram, 2**index otherwise.
def to_bucket(index)
  linear? ? @low + (index * @width) : 2**index
end
to_index(data) click to toggle source

A data point is added to the bucket whose value is less than or equal to the data point, provided the data point is also strictly less than the value represented by the next bucket

# File lib/aggregate.rb, line 265
def to_index (data)

  # basic case is simple
  return log2(data).to_i if !linear?

  # Search for the right bucket in the linear case
  @buckets.each_with_index do |count, idx|
    return idx if right_bucket?(idx, data)
  end
  #find_bucket(0, bucket_count-1, data)

  #Should not get here
  raise "#{data}"
end