class FlatKit::FieldStats

Collect stats on a single field. We may not know the field's data type to start with, so values are collected until the guess threshold is reached, and the stats are then calculated based on the data type determined by the guess.

Constants

ALL_STATS
CARDINALITY_STATS
CORE_STATS
DEFAULT_GUESS_THRESHOLD
EXPORT_FIELDS

Attributes

field_type[R]
name[R]
type_counts[R]

Public Class Methods

new(name:, stats_to_collect: CORE_STATS, type: ::FlatKit::FieldType::GuessType, guess_threshold: DEFAULT_GUESS_THRESHOLD) click to toggle source
# File lib/flat_kit/field_stats.rb, line 46
# Create a stats collector for a single field.
#
# name:             stored as the field name (exposed by the +name+ reader)
# stats_to_collect: a single collection set or an Array of them; every entry
#                   must be a member of ALL_STATS
# type:             a Class whose direct superclass is ::FlatKit::FieldType.
#                   The default, GuessType, leaves the collectors unset until
#                   the guess is resolved.
# guess_threshold:  number of buffered values at which +update+ resolves the
#                   guess
#
# Raises ArgumentError for an unknown collection set or an invalid type.
def initialize(name:, stats_to_collect: CORE_STATS, type: ::FlatKit::FieldType::GuessType, guess_threshold: DEFAULT_GUESS_THRESHOLD)
  @name              = name
  @field_type        = type
  @guess_threshold   = guess_threshold
  @type_counts       = Hash.new(0)
  @out_of_type_count = 0
  @values            = []
  @stats             = nil
  @length_stats      = nil
  @stats_to_collect  = [stats_to_collect].flatten

  @stats_to_collect.each do |collection_set|
    next if ALL_STATS.include?(collection_set)
    raise ArgumentError, "#{collection_set} is not a valid stats collection set, must be one of #{ALL_STATS.map(&:to_s).join(", ")}"
  end
  raise ArgumentError, "type: must be FieldType subclasses - not #{type}" unless type.is_a?(Class) && (type.superclass == ::FlatKit::FieldType)

  # An explicitly supplied type never passes through resolve_guess (it returns
  # early once the type is determined), so the collectors have to be built
  # here; otherwise the first update would call #update on a nil @stats.
  return if type == ::FlatKit::FieldType::GuessType

  @stats = ::FlatKit::StatType.for(type).new(collecting_frequencies: collecting_frequencies?)
  return unless type == ::FlatKit::FieldType::StringType

  # String fields additionally track statistics on the value lengths.
  @length_stats = ::FlatKit::StatType::NumericalStats.new(collecting_frequencies: collecting_frequencies?)
end

Public Instance Methods

collecting_frequencies?() click to toggle source
# File lib/flat_kit/field_stats.rb, line 82
# True when CARDINALITY_STATS is among the requested collection sets.
def collecting_frequencies?
  requested_sets = @stats_to_collect
  requested_sets.include?(CARDINALITY_STATS)
end
count() click to toggle source
# File lib/flat_kit/field_stats.rb, line 90
# Number of values recorded by the underlying stats collector.
def count
  collected = stats
  collected.count
end
field_type_determined?() click to toggle source
# File lib/flat_kit/field_stats.rb, line 64
# True once the field type is anything other than the GuessType placeholder.
def field_type_determined?
  placeholder = ::FlatKit::FieldType::GuessType
  @field_type != placeholder
end
frequencies() click to toggle source
# File lib/flat_kit/field_stats.rb, line 126
# Value frequencies from the stats collector; nil unless frequencies are
# being collected.
def frequencies
  return unless collecting_frequencies?

  stats.frequencies
end
length_frequencies() click to toggle source
# File lib/flat_kit/field_stats.rb, line 158
# Frequencies of the recorded string lengths.
#
# Returns nil when frequencies are not being collected or when the field has
# no length statistics (they are only created for fields resolved as
# StringType).
def length_frequencies
  return unless collecting_frequencies?

  # Length stats are only built when the type guess is resolved, and update
  # defers that until the guess threshold is reached; resolve a pending guess
  # here so a small string field still reports its lengths. Skipped when
  # nothing has been counted, so an empty collector keeps answering nil.
  resolve_guess unless type_counts.empty?
  @length_stats.frequencies if @length_stats
end
max() click to toggle source
# File lib/flat_kit/field_stats.rb, line 94
# Maximum from the stats collector, or nil when the collector does not
# respond to +max+.
def max
  collected = stats
  return unless collected.respond_to?(:max)

  collected.max
end
max_length() click to toggle source
# File lib/flat_kit/field_stats.rb, line 134
# Largest recorded string length.
#
# Returns nil when the field has no length statistics (they are only created
# for fields resolved as StringType).
def max_length
  # Length stats are only built when the type guess is resolved, and update
  # defers that until the guess threshold is reached; resolve a pending guess
  # here so a small string field still reports its lengths. Skipped when
  # nothing has been counted, so an empty collector keeps answering nil.
  resolve_guess unless type_counts.empty?
  @length_stats.max if @length_stats
end
mean() click to toggle source
# File lib/flat_kit/field_stats.rb, line 98
# Mean from the stats collector, or nil when the collector does not respond
# to +mean+.
def mean
  collected = stats
  return unless collected.respond_to?(:mean)

  collected.mean
end
mean_length() click to toggle source
# File lib/flat_kit/field_stats.rb, line 138
# Mean of the recorded string lengths.
#
# Returns nil when the field has no length statistics (they are only created
# for fields resolved as StringType).
def mean_length
  # Length stats are only built when the type guess is resolved, and update
  # defers that until the guess threshold is reached; resolve a pending guess
  # here so a small string field still reports its lengths. Skipped when
  # nothing has been counted, so an empty collector keeps answering nil.
  resolve_guess unless type_counts.empty?
  @length_stats.mean if @length_stats
end
min() click to toggle source
# File lib/flat_kit/field_stats.rb, line 102
# Minimum from the stats collector, or nil when the collector does not
# respond to +min+.
def min
  collected = stats
  return unless collected.respond_to?(:min)

  collected.min
end
min_length() click to toggle source
# File lib/flat_kit/field_stats.rb, line 130
# Smallest recorded string length.
#
# Returns nil when the field has no length statistics (they are only created
# for fields resolved as StringType).
def min_length
  # Length stats are only built when the type guess is resolved, and update
  # defers that until the guess threshold is reached; resolve a pending guess
  # here so a small string field still reports its lengths. Skipped when
  # nothing has been counted, so an empty collector keeps answering nil.
  resolve_guess unless type_counts.empty?
  @length_stats.min if @length_stats
end
mode() click to toggle source
# File lib/flat_kit/field_stats.rb, line 114
# Mode from the stats collector; nil unless frequencies are being collected.
def mode
  return unless collecting_frequencies?

  stats.mode
end
mode_length() click to toggle source
# File lib/flat_kit/field_stats.rb, line 146
# Mode of the recorded string lengths.
#
# Returns nil when frequencies are not being collected or when the field has
# no length statistics (they are only created for fields resolved as
# StringType).
def mode_length
  return unless collecting_frequencies?

  # Length stats are only built when the type guess is resolved, and update
  # defers that until the guess threshold is reached; resolve a pending guess
  # here so a small string field still reports its lengths. Skipped when
  # nothing has been counted, so an empty collector keeps answering nil.
  resolve_guess unless type_counts.empty?
  @length_stats.mode if @length_stats
end
null_count() click to toggle source
# File lib/flat_kit/field_stats.rb, line 162
# Number of values whose best-guess type was FieldType::NullType.
def null_count
  null_type = FieldType::NullType
  type_counts[null_type]
end
null_percent() click to toggle source
# File lib/flat_kit/field_stats.rb, line 174
# Null values as a percentage of total_count, truncated (not rounded) to two
# decimal places. Returns the Integer 0 when total_count is zero.
def null_percent
  total = total_count
  return 0 if total.zero?

  ratio = null_count.to_f / total
  (ratio * 100.0).truncate(2)
end
out_of_type_count() click to toggle source
# File lib/flat_kit/field_stats.rb, line 170
# Number of values that could not be coerced to the field type
# (incremented by update_stats on a coercion failure).
def out_of_type_count
  @out_of_type_count
end
stddev() click to toggle source
# File lib/flat_kit/field_stats.rb, line 106
# Standard deviation from the stats collector, or nil when the collector
# does not respond to +stddev+.
def stddev
  collected = stats
  return unless collected.respond_to?(:stddev)

  collected.stddev
end
stddev_length() click to toggle source
# File lib/flat_kit/field_stats.rb, line 142
# Standard deviation of the recorded string lengths.
#
# Returns nil when the field has no length statistics (they are only created
# for fields resolved as StringType).
def stddev_length
  # Length stats are only built when the type guess is resolved, and update
  # defers that until the guess threshold is reached; resolve a pending guess
  # here so a small string field still reports its lengths. Skipped when
  # nothing has been counted, so an empty collector keeps answering nil.
  resolve_guess unless type_counts.empty?
  @length_stats.stddev if @length_stats
end
sum() click to toggle source
# File lib/flat_kit/field_stats.rb, line 110
# Sum from the stats collector, or nil when the collector does not respond
# to +sum+.
def sum
  collected = stats
  return unless collected.respond_to?(:sum)

  collected.sum
end
to_hash() click to toggle source
# File lib/flat_kit/field_stats.rb, line 188
# Export the stats as a Hash keyed by each entry of EXPORT_FIELDS, where the
# value is the result of calling the method of that name on this object.
# Any pending type guess is resolved first.
def to_hash
  resolve_guess

  EXPORT_FIELDS.each_with_object({}) do |field, exported|
    exported[field] = send(field)
  end
end
total_count() click to toggle source
# File lib/flat_kit/field_stats.rb, line 166
# Values recorded by the stats collector plus the values that failed
# coercion to the field type.
def total_count
  in_type = stats.count
  in_type + @out_of_type_count
end
type() click to toggle source
# File lib/flat_kit/field_stats.rb, line 86
# The +type_name+ of the current field type. Reads @field_type as-is; it
# does not resolve a pending guess.
def type
  current_type = @field_type
  current_type.type_name
end
unique_count() click to toggle source
# File lib/flat_kit/field_stats.rb, line 118
# Unique-value count from the stats collector; nil unless frequencies are
# being collected.
def unique_count
  return unless collecting_frequencies?

  stats.unique_count
end
unique_count_lengths() click to toggle source
# File lib/flat_kit/field_stats.rb, line 150
# Number of distinct string lengths recorded.
#
# Returns nil when frequencies are not being collected or when the field has
# no length statistics (they are only created for fields resolved as
# StringType).
def unique_count_lengths
  return unless collecting_frequencies?

  # Length stats are only built when the type guess is resolved, and update
  # defers that until the guess threshold is reached; resolve a pending guess
  # here so a small string field still reports its lengths. Skipped when
  # nothing has been counted, so an empty collector keeps answering nil.
  resolve_guess unless type_counts.empty?
  @length_stats.unique_count if @length_stats
end
unique_values() click to toggle source
# File lib/flat_kit/field_stats.rb, line 122
# Unique values from the stats collector; nil unless frequencies are being
# collected.
def unique_values
  return unless collecting_frequencies?

  stats.unique_values
end
unique_values_lengths() click to toggle source
# File lib/flat_kit/field_stats.rb, line 154
# The distinct string lengths recorded.
#
# Returns nil when frequencies are not being collected or when the field has
# no length statistics (they are only created for fields resolved as
# StringType).
def unique_values_lengths
  return unless collecting_frequencies?

  # Length stats are only built when the type guess is resolved, and update
  # defers that until the guess threshold is reached; resolve a pending guess
  # here so a small string field still reports its lengths. Skipped when
  # nothing has been counted, so an empty collector keeps answering nil.
  resolve_guess unless type_counts.empty?
  @length_stats.unique_values if @length_stats
end
unknown_count() click to toggle source
# File lib/flat_kit/field_stats.rb, line 179
# Number of values whose best-guess type was FieldType::UnknownType.
def unknown_count
  unknown_type = FieldType::UnknownType
  type_counts[unknown_type]
end
unknown_percent() click to toggle source
# File lib/flat_kit/field_stats.rb, line 183
# Unknown-typed values as a percentage of total_count, truncated (not
# rounded) to two decimal places. Returns the Integer 0 when total_count is
# zero.
def unknown_percent
  total = total_count
  return 0 if total.zero?

  ratio = unknown_count.to_f / total
  (ratio * 100.0).truncate(2)
end
update(value) click to toggle source
# File lib/flat_kit/field_stats.rb, line 68
# Record one value for this field.
#
# The value's best-guess type is always counted. Once the field type is
# determined the value goes straight into the stats; until then it is
# buffered, and the guess is resolved as soon as the buffer reaches the
# guess threshold.
def update(value)
  update_type_count(value)
  return update_stats(value) if field_type_determined?

  @values << value
  resolve_guess if @values.size >= @guess_threshold
end

Private Instance Methods

length_stats() click to toggle source
# File lib/flat_kit/field_stats.rb, line 205
# The length-statistics collector, after resolving any pending type guess.
# The ivar is read only after resolve_guess, since that is what assigns it.
def length_stats
  resolve_guess
  @length_stats
end
resolve_guess() click to toggle source
# File lib/flat_kit/field_stats.rb, line 227
# Settle the field type from the type counts gathered so far, build the
# matching stats collectors, and replay the buffered values into them.
# Does nothing once the field type is determined.
#
# NOTE: with no counted values max_by returns nil, so the field type
# becomes nil here.
def resolve_guess
  return if field_type_determined?

  most_seen_type, _times_seen = type_counts.max_by { |_type, seen| seen }
  @field_type = most_seen_type
  @stats = StatType.for(@field_type).new(collecting_frequencies: collecting_frequencies?)

  # Only string fields additionally track statistics on value lengths.
  if @field_type == ::FlatKit::FieldType::StringType
    @length_stats = ::FlatKit::StatType::NumericalStats.new(collecting_frequencies: collecting_frequencies?)
  end

  @values.each { |buffered| update_stats(buffered) }
  @values.clear
end
stats() click to toggle source
# File lib/flat_kit/field_stats.rb, line 200
# The stats collector, after resolving any pending type guess.
# The ivar is read only after resolve_guess, since that is what assigns it.
def stats
  resolve_guess
  @stats
end
update_stats(value) click to toggle source
# File lib/flat_kit/field_stats.rb, line 210
# Coerce the value to the field type and feed it to the collectors.
#
# A value that fails coercion only bumps the out-of-type counter. Otherwise
# the coerced value is recorded and, when length stats exist, so is the
# length of its string form.
def update_stats(value)
  coerced = @field_type.coerce(value)

  if coerced == FieldType::CoerceFailure
    @out_of_type_count += 1
    nil
  else
    @stats.update(coerced)
    @length_stats.update(coerced.to_s.length) if @length_stats
  end
end
update_type_count(value) click to toggle source
# File lib/flat_kit/field_stats.rb, line 221
# Count the value under its best-guess type and return that type.
def update_type_count(value)
  FieldType.best_guess(value).tap do |guessed_type|
    type_counts[guessed_type] += 1
  end
end