class Splashy::Buckets

Public Class Methods

new( wanted_distribution, wanted_count=nil ) click to toggle source

wanted_distribution - A Hash of desired distributions:

{ :a => 0.2, :b => 0.5, :c => 0.3 }

wanted_count - (optional) Maximum total elements to be selected.

If omitted, the maximum-size set is selected.
# File lib/splashy/buckets.rb, line 7
# Public: Set up one empty Bucket per key of the wanted distribution.
#
# wanted_distribution - A Hash of bucket names to the fraction each bucket
#                       should contribute, e.g.
#                       { :a => 0.2, :b => 0.5, :c => 0.3 }.
#                       The values must sum to 1.0.
# wanted_count        - (optional) Maximum total elements to be selected;
#                       stored as-is in @wanted_count (default: nil).
#
# Raises ArgumentError if the distribution values do not sum to 1.0.
def initialize( wanted_distribution, wanted_count=nil )
  total = wanted_distribution.values.inject(0){ |m,v| m + v }
  # Compare with a tolerance rather than `==`: decimal fractions are not
  # exact in binary floating point, so e.g. ten buckets of 0.1 add up to
  # 0.9999999999999999 and would be rejected by an exact comparison.
  # Written as `unless <=` so that a NaN total is still rejected.
  unless ( total - 1.0 ).abs <= 1e-9
    raise ArgumentError.new( "Distribution must sum to 1.0" )
  end
  @wanted_distribution = wanted_distribution
  @wanted_count = wanted_count
  @buckets = {}
  @wanted_distribution.each_key do |bucket_name|
    @buckets[bucket_name] = Bucket.new( bucket_name )
  end
  @total_count = 0
end

Protected Class Methods

elements_count( hash ) click to toggle source

Protected

hash - Hash of Objects that respond to `count` (usually Arrays).

Returns count of all elements in the Hash's Array values.

# File lib/splashy/buckets.rb, line 187
# Protected: Count every element held in a Hash of collections.
#
# hash - Hash whose values respond to `count` (usually Arrays).
#
# Returns the sum of `count` over all of the Hash's values (0 when the
# Hash is empty).
def self.elements_count( hash )
  total = 0
  hash.values.each { |collection| total += collection.count }
  total
end

Public Instance Methods

add( bucket_name, element ) click to toggle source

Public: Add a single element to a bucket.

# File lib/splashy/buckets.rb, line 45
# Public: Add a single element to a bucket.
#
# bucket_name - Name of the target bucket; must be a key of the wanted
#               distribution.
# element     - The Object appended to that bucket.
#
# Returns the updated running total of added elements.
# Raises ArgumentError if the bucket name has no entry in the distribution.
def add( bucket_name, element )
  share = @wanted_distribution[bucket_name]
  raise ArgumentError, "#{bucket_name.inspect} is not a valid bucket." unless share

  @buckets[bucket_name] << element
  @total_count += 1
end
fill( bucket_name = nil, &block ) click to toggle source

Public: Put elements into buckets with a block.

bucket_name - If supplied, all yielded elements will be added to that

bucket.

&block - A block that returns (if `bucket_name` is not supplied) an

Array: [bucket_name, element]. If `bucket_name` is supplied, only
the element needs to be returned.

Examples

fill { |total_count| [bucket_name, element] }
fill( :bucket_name ) { |total_count| element }
# File lib/splashy/buckets.rb, line 32
# Public: Put elements into buckets with a block.
#
# The block is yielded the current total element count and is called
# repeatedly until it returns nil or false.
#
# bucket_name - If supplied, every yielded element is added to that bucket.
# block       - Returns the element when `bucket_name` is supplied,
#               otherwise an Array of [bucket_name, element].
#
# Returns nil.
def fill( bucket_name = nil, &block )
  while ( yielded = yield( @total_count ) )
    # Without a fixed bucket the block's value already is the argument list.
    arguments = bucket_name ? [bucket_name, yielded] : yielded
    add( *arguments )
  end
end
neediest_buckets() click to toggle source

Array of the buckets that need more elements to match the desired distribution, sorted descending by how much more they need.

# File lib/splashy/buckets.rb, line 85
# Public: Rank the buckets by how far they are below their wanted share.
#
# Returns an Array of bucket names, ordered by needed multiplier from
# highest to lowest.
def neediest_buckets
  needs = needed_multipliers( _select_all, @wanted_distribution ).to_a
  ranked = needs.sort { |left, right| right.last <=> left.last } # descending
  ranked.map { |pair| pair.first }
end
satisfied?() click to toggle source

Public

Returns true if the conditions (distribution and, optionally, count) are satisfied enough to do a final selection of elements.

# File lib/splashy/buckets.rb, line 57
# Public: Check whether a final selection can be made.
#
# Returns true if `assert_satisfied!` passes, false if it raises a
# DistributionUnsatisfiedError.
def satisfied?
  assert_satisfied!
  true
rescue DistributionUnsatisfiedError
  false
end
select( opts = {} ) click to toggle source

Public: Return a distribution of elements based on the desired distribution. If a satisfactory distribution is not possible, a DistributionUnsatisfiedError is raised.

Returns a Hash of elements matching the desired distribution, keyed by the bucket names.

# File lib/splashy/buckets.rb, line 72
# Public: Return a selection of elements based on the desired distribution.
#
# opts - Hash of options (default: {}):
#        :random - Passed on to `_select_wanted` to choose random rather
#                  than ordered elements (default: false).
#
# Returns a Hash of selected elements, keyed by the bucket names.
# Raises DistributionUnsatisfiedError (via `assert_satisfied!`) if a
# satisfactory distribution is not possible.
def select( opts = {} )
  assert_satisfied!
  randomly = { :random => false }.merge( opts )[:random]
  picked = _select_wanted( randomly )

  return picked unless @wanted_count

  # Sometimes we need to fudge by a few to meet the `@wanted_count`
  trim( picked, @wanted_count )
end

Protected Instance Methods

_select_all() click to toggle source

Protected

Returns Hash of all bucket elements, keyed by bucket name.

# File lib/splashy/buckets.rb, line 96
# Protected: Collect the full contents of every bucket.
#
# Returns a Hash of each bucket's `elements`, keyed by bucket name.
def _select_all
  @buckets.values.each_with_object( {} ) do |bucket, by_name|
    by_name[bucket.name] = bucket.elements
  end
end
_select_wanted( randomly = false ) click to toggle source

Protected

Returns Hash of bucket elements, matching the wanted distribution as closely as possible.

# File lib/splashy/buckets.rb, line 107
# Protected: Pick elements from each bucket in proportion to its wanted
# share of the estimated final count.
#
# randomly - When truthy, use each bucket's `random_elements`; otherwise
#            use `elements` (default: false).
#
# Returns a Hash of the chosen elements, keyed by bucket name.
def _select_wanted( randomly = false )
  target_total = estimated_final_count

  @buckets.values.each_with_object( {} ) do |bucket, picks|
    share = ( target_total * @wanted_distribution[bucket.name] ).round
    share = 1 if share < 1 # Ensure every bucket has at least one element
    picks[bucket.name] = randomly ? bucket.random_elements( share ) : bucket.elements( share )
  end
end
assert_satisfied!() click to toggle source

Protected

Raises a DistributionUnsatisfiedError if we can't meet the wanted distribution or count (or both).

# File lib/splashy/buckets.rb, line 207
# Protected: Verify that a final selection is possible.
#
# The checks run in this order: at least one element per bucket overall,
# no empty bucket, and - only when a wanted count was given - enough
# elements in total and a large enough estimated final count.
#
# Returns nil when every check passes.
# Raises DistributionUnsatisfiedError if we can't meet the wanted
# distribution or count (or both).
def assert_satisfied!
  if @wanted_distribution.size > @total_count
    raise DistributionUnsatisfiedError, "Not enough elements (#{@total_count})."
  end

  empty_names = []
  @buckets.each { |name, bucket| empty_names << name if bucket.empty? }
  unless empty_names.empty?
    raise DistributionUnsatisfiedError,
          "The following buckets are empty: #{empty_names.join(', ')}."
  end

  return unless @wanted_count

  if @wanted_count > @total_count
    raise DistributionUnsatisfiedError,
          "Not enough elements (#{@total_count}) to satisfy your desired count (#{@wanted_count})."
  end

  return unless estimated_final_count < @wanted_count

  raise DistributionUnsatisfiedError,
        "Distribution prevents the satisfaction of your desired count (#{@wanted_count})."
end
estimated_final_count() click to toggle source

Protected

Returns projected final number of elements that will be returned to satisfy the requirements. If this is less than `@wanted_count`, if supplied, we can't meet the requirements.

# File lib/splashy/buckets.rb, line 196
# Protected: Project how many elements the final selection will contain.
#
# The projection is the limiter bucket's count divided by its wanted share,
# rounded up, and capped at `@wanted_count` when one was supplied.
#
# Returns the projected final number of elements.
def estimated_final_count
  limiter = limiter_bucket
  # go upward here to avoid missing elements in low-quantity situations
  projected = ( limiter.count / @wanted_distribution[limiter.name] ).ceil
  return projected unless @wanted_count

  [@wanted_count, projected].min
end
limiter_bucket() click to toggle source

Protected

Return the Bucket that is the current limiter in the distribution. In other words, this bucket is limiting the total size of the final selection.

# File lib/splashy/buckets.rb, line 241
# Protected: Find the Bucket that is the current limiter in the
# distribution, i.e. the one limiting the total size of the final selection.
#
# Returns the Bucket with the smallest "count / wanted share" ratio. When
# several buckets share the smallest ratio, the first one in `@buckets`
# order is returned.
def limiter_bucket
  # Smallest value of "count / desired percent" is the limiter. `min_by`
  # finds it in a single pass and always keeps the first minimum, whereas
  # sorting the whole list leaves the winner among ties unspecified.
  @buckets.values.min_by do |bucket|
    bucket.count / @wanted_distribution[bucket.name]
  end
end
needed_multipliers( current_selections, wanted_distribution ) click to toggle source

Protected

current_selections - Hash of element Arrays, keyed by bucket name.

wanted_distribution - The wanted distribution as a Hash of percentage Floats.

Returns Hash of multipliers needed for each bucket to reach its current wanted distribution.

# File lib/splashy/buckets.rb, line 165
# Protected: Work out how far each bucket is from its wanted share.
#
# current_selections  - Hash of element Arrays, keyed by bucket name.
# wanted_distribution - The wanted distribution as a Hash of percentage
#                       Floats.
#
# Returns a Hash, keyed by bucket name, of the wanted share minus the
# bucket's current share of all selected elements.
def needed_multipliers( current_selections, wanted_distribution )
  grand_total = self.class.elements_count( current_selections )

  current_selections.keys.each_with_object( {} ) do |name, needs|
    target_share = wanted_distribution[name]
    actual_share = current_selections[name].size.to_f / grand_total
    actual_share = 0 if actual_share.nan? # 0.0 / 0 when nothing is selected
    needs[name] = actual_share > 0 ? target_share - actual_share : target_share
  end
end
trim( selected, size ) click to toggle source

Protected: Trim a given Hash of Arrays – keyed by bucket names – until it satisfies @wanted_count.

selected - A Hash of selected elements, keyed by the bucket names. All

values must be Arrays (or respond to `size`).

size - The desired total size of `selected`.

# File lib/splashy/buckets.rb, line 128
# Protected: Trim a given Hash of Arrays - keyed by bucket names - until its
# total element count no longer exceeds `size`.
#
# selected - A Hash of selected elements, keyed by the bucket names. The
#            values are popped in place.
# size     - The desired total size of `selected`.
#
# Returns the same `selected` Hash, trimmed.
# Raises ArgumentError if `size` is nil.
def trim( selected, size )
  raise ArgumentError, "Can't trim to a nil size" unless size

  while self.class.elements_count( selected ) > size
    # The best trim candidate gives up its last element.
    victim = trim_candidates( selected, @wanted_distribution ).first
    selected[victim].pop
  end

  selected
end
trim_candidates( current_selections, wanted_distribution ) click to toggle source

Protected

current_selections - Hash of element Arrays, keyed by bucket name.

wanted_distribution - The wanted distribution as a Hash of percentage Floats.

Returns Array of bucket names for buckets that are good trim candidates, ordered by best candidates first.

# File lib/splashy/buckets.rb, line 146
# Protected: List the buckets that are good candidates for trimming.
#
# current_selections  - Hash of element Arrays, keyed by bucket name.
# wanted_distribution - The wanted distribution as a Hash of percentage
#                       Floats.
#
# Returns an Array of bucket names ordered by needed multiplier ascending
# (best trim candidates first). Buckets whose `@buckets` entry has a count
# of zero are left out.
def trim_candidates( current_selections, wanted_distribution )
  multipliers = self.needed_multipliers( current_selections, wanted_distribution ).to_a
  # Can't trim empty buckets. `select` returns a new Array instead of
  # modifying its receiver, so the filtered list has to be kept and used.
  trimmable = multipliers.select do |bucket_name, _multiplier|
    @buckets[bucket_name].count != 0
  end
  return trimmable if trimmable.empty?
  trimmable.sort! { |a, b| a[1] <=> b[1] } # Sort on multiplier ascending
  trimmable.map{ |bucket_name, _multiplier| bucket_name }
end