class FlatKit::FieldStats
Collect stats on a single field. We may not know what the field's data type is to start with, so collect a batch of values until the guess threshold is reached, and then calculate stats based upon the data type determined by the guess.
Constants
- ALL_STATS
- CARDINALITY_STATS
- CORE_STATS
- DEFAULT_GUESS_THRESHOLD
- EXPORT_FIELDS
Attributes
field_type[R]
name[R]
type_counts[R]
Public Class Methods
new(name:, stats_to_collect: CORE_STATS, type: ::FlatKit::FieldType::GuessType, guess_threshold: DEFAULT_GUESS_THRESHOLD)
click to toggle source
# File lib/flat_kit/field_stats.rb, line 46
# Build a stats collector for a single named field.
#
# name             - stored as the field name.
# stats_to_collect - one collection set or an array of them; every entry must
#                    be a member of ALL_STATS.
# type             - a direct subclass of ::FlatKit::FieldType. With the default
#                    GuessType the real type is chosen later from observed values.
# guess_threshold  - number of buffered values at which #update resolves the guess.
#
# Raises ArgumentError for an unknown collection set or an invalid type.
def initialize(name:, stats_to_collect: CORE_STATS, type: ::FlatKit::FieldType::GuessType, guess_threshold: DEFAULT_GUESS_THRESHOLD)
  @name = name
  @field_type = type
  @guess_threshold = guess_threshold
  @type_counts = Hash.new(0)
  @out_of_type_count = 0
  @values = []
  @stats = nil
  @length_stats = nil
  @stats_to_collect = [stats_to_collect].flatten

  @stats_to_collect.each do |collection_set|
    next if ALL_STATS.include?(collection_set)

    raise ArgumentError, "#{collection_set} is not a valid stats collection set, must be one of #{ALL_STATS.map { |s| s.to_s }.join(", ") }"
  end

  unless type.is_a?(Class) && (type.superclass == ::FlatKit::FieldType)
    raise ArgumentError, "type: must be FieldType subclasses - not #{type}"
  end

  # With an explicit type there is no guess to resolve, and resolve_guess
  # returns early for determined types, so the collectors must be created here.
  # Otherwise @stats would still be nil on the first #update or #count.
  return unless field_type_determined?

  @stats = StatType.for(@field_type).new(collecting_frequencies: collecting_frequencies?)
  if @field_type == ::FlatKit::FieldType::StringType
    @length_stats = ::FlatKit::StatType::NumericalStats.new(collecting_frequencies: collecting_frequencies?)
  end
end
Public Instance Methods
collecting_frequencies?()
click to toggle source
# File lib/flat_kit/field_stats.rb, line 82
# True when CARDINALITY_STATS is among the requested collection sets.
def collecting_frequencies?
  @stats_to_collect.member?(CARDINALITY_STATS)
end
count()
click to toggle source
# File lib/flat_kit/field_stats.rb, line 90
# Count reported by the value collector (resolving a pending guess first).
def count
  collector = stats
  collector.count
end
field_type_determined?()
click to toggle source
# File lib/flat_kit/field_stats.rb, line 64
# True once the field type is anything other than the GuessType placeholder.
def field_type_determined?
  still_guessing = (@field_type == ::FlatKit::FieldType::GuessType)
  !still_guessing
end
frequencies()
click to toggle source
# File lib/flat_kit/field_stats.rb, line 126
# Frequencies from the value collector; nil unless frequencies are collected.
def frequencies
  return unless collecting_frequencies?

  stats.frequencies
end
length_frequencies()
click to toggle source
# File lib/flat_kit/field_stats.rb, line 158
# Frequencies of value lengths; nil when frequencies are not collected or the
# field has no length collector.
def length_frequencies
  return unless collecting_frequencies?

  # Go through the accessor (which resolves a pending guess) rather than
  # testing the ivar, so the answer does not depend on call order.
  length_stats&.frequencies
end
max()
click to toggle source
# File lib/flat_kit/field_stats.rb, line 94
# Maximum from the value collector, or nil when the collector has no #max.
def max
  collector = stats
  collector.max if collector.respond_to?(:max)
end
max_length()
click to toggle source
# File lib/flat_kit/field_stats.rb, line 134
# Maximum value length, or nil when the field has no length collector.
def max_length
  # Go through the accessor (which resolves a pending guess) rather than
  # testing the ivar, so the answer does not depend on call order.
  length_stats&.max
end
mean()
click to toggle source
# File lib/flat_kit/field_stats.rb, line 98
# Mean from the value collector, or nil when the collector has no #mean.
def mean
  collector = stats
  collector.mean if collector.respond_to?(:mean)
end
mean_length()
click to toggle source
# File lib/flat_kit/field_stats.rb, line 138
# Mean value length, or nil when the field has no length collector.
def mean_length
  # Go through the accessor (which resolves a pending guess) rather than
  # testing the ivar, so the answer does not depend on call order.
  length_stats&.mean
end
min()
click to toggle source
# File lib/flat_kit/field_stats.rb, line 102
# Minimum from the value collector, or nil when the collector has no #min.
def min
  collector = stats
  collector.min if collector.respond_to?(:min)
end
min_length()
click to toggle source
# File lib/flat_kit/field_stats.rb, line 130
# Minimum value length, or nil when the field has no length collector.
def min_length
  # Go through the accessor (which resolves a pending guess) rather than
  # testing the ivar, so the answer does not depend on call order.
  length_stats&.min
end
mode()
click to toggle source
# File lib/flat_kit/field_stats.rb, line 114
# Mode from the value collector; nil unless frequencies are collected.
def mode
  return unless collecting_frequencies?

  stats.mode
end
mode_length()
click to toggle source
# File lib/flat_kit/field_stats.rb, line 146
# Mode of value lengths; nil when frequencies are not collected or the field
# has no length collector.
def mode_length
  return unless collecting_frequencies?

  # Go through the accessor (which resolves a pending guess) rather than
  # testing the ivar, so the answer does not depend on call order.
  length_stats&.mode
end
null_count()
click to toggle source
# File lib/flat_kit/field_stats.rb, line 162
# Number of observed values whose best-guess type was NullType.
def null_count
  null_type = FieldType::NullType
  type_counts[null_type]
end
null_percent()
click to toggle source
# File lib/flat_kit/field_stats.rb, line 174
# null_count as a percentage of total_count, truncated to two decimals.
# Returns 0 when total_count is zero.
def null_percent
  total = total_count
  return 0 if total.zero?

  share = null_count.to_f / total
  (share * 100.0).truncate(2)
end
out_of_type_count()
click to toggle source
# File lib/flat_kit/field_stats.rb, line 170
# Number of values that could not be coerced to the field type
# (incremented by update_stats on a coercion failure).
def out_of_type_count
  @out_of_type_count
end
stddev()
click to toggle source
# File lib/flat_kit/field_stats.rb, line 106
# Standard deviation from the value collector, or nil when it has no #stddev.
def stddev
  collector = stats
  collector.stddev if collector.respond_to?(:stddev)
end
stddev_length()
click to toggle source
# File lib/flat_kit/field_stats.rb, line 142
# Standard deviation of value lengths, or nil when the field has no length collector.
def stddev_length
  # Go through the accessor (which resolves a pending guess) rather than
  # testing the ivar, so the answer does not depend on call order.
  length_stats&.stddev
end
sum()
click to toggle source
# File lib/flat_kit/field_stats.rb, line 110
# Sum from the value collector, or nil when the collector has no #sum.
def sum
  collector = stats
  collector.sum if collector.respond_to?(:sum)
end
to_hash()
click to toggle source
# File lib/flat_kit/field_stats.rb, line 188
# Resolve any pending guess, then return a Hash mapping each EXPORT_FIELDS
# entry to the result of calling the method of that name on this object.
def to_hash
  resolve_guess
  EXPORT_FIELDS.each_with_object({}) do |field, exported|
    exported[field] = send(field)
  end
end
total_count()
click to toggle source
# File lib/flat_kit/field_stats.rb, line 166
# Collector count plus the number of values that failed coercion.
def total_count
  # Read the collector first: resolving a pending guess replays buffered
  # values and may bump @out_of_type_count.
  in_type = stats.count
  in_type + @out_of_type_count
end
type()
click to toggle source
# File lib/flat_kit/field_stats.rb, line 86
# The type_name of the current field type. Does not resolve a pending guess.
def type
  current_type = @field_type
  current_type.type_name
end
unique_count()
click to toggle source
# File lib/flat_kit/field_stats.rb, line 118
# Unique-value count from the collector; nil unless frequencies are collected.
def unique_count
  return unless collecting_frequencies?

  stats.unique_count
end
unique_count_lengths()
click to toggle source
# File lib/flat_kit/field_stats.rb, line 150
# Number of distinct value lengths; nil when frequencies are not collected or
# the field has no length collector.
def unique_count_lengths
  return unless collecting_frequencies?

  # Go through the accessor (which resolves a pending guess) rather than
  # testing the ivar, so the answer does not depend on call order.
  length_stats&.unique_count
end
unique_values()
click to toggle source
# File lib/flat_kit/field_stats.rb, line 122
# Unique values from the collector; nil unless frequencies are collected.
def unique_values
  return unless collecting_frequencies?

  stats.unique_values
end
unique_values_lengths()
click to toggle source
# File lib/flat_kit/field_stats.rb, line 154
# Distinct value lengths; nil when frequencies are not collected or the field
# has no length collector.
def unique_values_lengths
  return unless collecting_frequencies?

  # Go through the accessor (which resolves a pending guess) rather than
  # testing the ivar, so the answer does not depend on call order.
  length_stats&.unique_values
end
unknown_count()
click to toggle source
# File lib/flat_kit/field_stats.rb, line 179
# Number of observed values whose best-guess type was UnknownType.
def unknown_count
  unknown_type = FieldType::UnknownType
  type_counts[unknown_type]
end
unknown_percent()
click to toggle source
# File lib/flat_kit/field_stats.rb, line 183
# unknown_count as a percentage of total_count, truncated to two decimals.
# Returns 0 when total_count is zero.
def unknown_percent
  total = total_count
  return 0 if total.zero?

  share = unknown_count.to_f / total
  (share * 100.0).truncate(2)
end
update(value)
click to toggle source
# File lib/flat_kit/field_stats.rb, line 68
# Record one observed value. The per-type tally is always updated. Once the
# field type is determined the value goes straight to the collectors; until
# then it is buffered, and the guess is resolved when the buffer reaches the
# guess threshold.
def update(value)
  update_type_count(value)
  return update_stats(value) if field_type_determined?

  @values << value
  resolve_guess if @values.size >= @guess_threshold
end
Private Instance Methods
length_stats()
click to toggle source
# File lib/flat_kit/field_stats.rb, line 205
# The length collector, after resolving any pending guess. resolve_guess only
# creates one for StringType fields, so this may be nil.
def length_stats
  resolve_guess

  @length_stats
end
resolve_guess()
click to toggle source
# File lib/flat_kit/field_stats.rb, line 227
# Settle a still-guessed field on the type with the highest tally, create the
# collectors for it, and replay the buffered values through them. Does nothing
# once the field type is determined.
def resolve_guess
  return if field_type_determined?

  # max_by yields [type, tally] pairs; only the winning type is kept.
  winner, _winning_tally = type_counts.max_by { |_type, tally| tally }
  @field_type = winner

  stat_class = StatType.for(@field_type)
  @stats = stat_class.new(collecting_frequencies: collecting_frequencies?)
  if @field_type == ::FlatKit::FieldType::StringType
    @length_stats = ::FlatKit::StatType::NumericalStats.new(collecting_frequencies: collecting_frequencies?)
  end

  @values.each { |buffered| update_stats(buffered) }
  @values.clear
end
stats()
click to toggle source
# File lib/flat_kit/field_stats.rb, line 200
# The value collector, after resolving any pending guess.
def stats
  resolve_guess

  @stats
end
update_stats(value)
click to toggle source
# File lib/flat_kit/field_stats.rb, line 210
# Coerce the value to the field type and feed it to the collectors. A value
# that fails coercion only bumps the out-of-type counter, and nil is returned.
def update_stats(value)
  converted = @field_type.coerce(value)
  if converted == FieldType::CoerceFailure
    @out_of_type_count += 1
    return
  end

  @stats.update(converted)
  # Lengths are tracked only when a length collector exists.
  @length_stats&.update(converted.to_s.length)
end
update_type_count(value)
click to toggle source
# File lib/flat_kit/field_stats.rb, line 221
# Tally the best-guess type for the value and return that type.
def update_type_count(value)
  FieldType.best_guess(value).tap { |guess| type_counts[guess] += 1 }
end