module Embulk::Guess::SchemaGuess

Constants

FALSE_STRINGS

When a value matches a false string, the lookup returns 'true'.

TRUE_STRINGS

taken from CsvParserPlugin.TRUE_STRINGS

TYPE_COALESCE

Public Class Methods

from_array_records(column_names, samples) click to toggle source
# File lib/embulk/guess/schema_guess.rb, line 26
# Builds an Embulk::Schema from column names and sample rows.
#
# The types guessed by types_from_array_records(samples) are paired
# positionally with column_names; one Embulk::Column is created per guessed
# type. A TimestampTypeMatch type additionally contributes its format.
def from_array_records(column_names, samples)
  guessed_types = types_from_array_records(samples)
  typed_names = guessed_types.zip(column_names)

  columns = typed_names.map do |guessed, column_name|
    attributes = {name: column_name, type: guessed.to_sym}
    case guessed
    when TimestampTypeMatch
      # Only timestamp columns carry a format attribute.
      attributes[:format] = guessed.format
    end
    Embulk::Column.new(attributes)
  end

  Embulk::Schema.new(columns)
end
from_hash_records(array_of_hash) click to toggle source
# File lib/embulk/guess/schema_guess.rb, line 16
# Guesses a schema from records given as hashes.
#
# array_of_hash:: an Array of Hash records. A single Hash is treated as one
#                 record and nil as no records.
#
# Column names (and their order) are taken from the keys of the first record
# only; each record is then flattened into a row of values in that order and
# handed to from_array_records, whose result is returned.
#
# Raises RuntimeError when there are no records to guess from.
def from_hash_records(array_of_hash)
  # Kernel#Array would explode a lone Hash into [key, value] pairs, which then
  # fails on `.keys`; wrap a single record explicitly instead.
  records = array_of_hash.is_a?(Hash) ? [array_of_hash] : Array(array_of_hash)
  raise "SchemaGuess Can't guess schema from no records" if records.empty?

  column_names = records.first.keys
  samples = records.map { |record| column_names.map { |name| record[name] } }
  from_array_records(column_names, samples)
end
types_from_array_records(samples) click to toggle source

TODO this method will be private once guess/csv is refactored

# File lib/embulk/guess/schema_guess.rb, line 37
# Guesses one type per column position from sample rows.
#
# Every value of every row is passed through guess_type and the guesses are
# grouped by the value's index within its row; each group is then collapsed
# into a single type with merge_types. Returns the merged types in column
# order.
def types_from_array_records(samples)
  guesses_by_column = []

  samples.each do |row|
    row.each_with_index do |cell, position|
      guesses_by_column[position] ||= []
      guesses_by_column[position].push(guess_type(cell))
    end
  end

  guesses_by_column.map do |guesses|
    merge_types(guesses)
  end
end

Private Class Methods

guess_type(str) click to toggle source
# File lib/embulk/guess/schema_guess.rb, line 47
# Guesses the type of a single sample value.
#
# str:: the sample; a Hash or Array is reported as "json" directly, anything
#       else is converted with to_s before the checks below.
#
# The checks run in this order and the first hit wins:
# 1. found in TRUE_STRINGS or FALSE_STRINGS  -> "boolean"
# 2. TimeFormatGuess.guess recognises it     -> TimestampTypeMatch wrapping the sample
# 3. exact integer literal                   -> "long"
# 4. floating point literal                  -> "double"
# 5. empty string                            -> nil (no information)
# 6. parseable by JSON.parse                 -> "json"
# 7. otherwise                               -> "string"
def guess_type(str)
  return "json" if str.is_a?(Hash) || str.is_a?(Array)

  str = str.to_s

  return "boolean" if TRUE_STRINGS[str] || FALSE_STRINGS[str]

  return TimestampTypeMatch.new(str) if TimeFormatGuess.guess(str)

  # Round-tripping through to_i accepts only a canonical integer spelling.
  return "long" if str.to_i.to_s == str

  # A regular expression is used to make a better suggestion for the double type. It is based on Guava 21.0's
  # regular expression in Doubles#fpPattern(), with the following differences:
  # * It intentionally rejects float values that start with "0", like "001.0" or "010.01". "0.1" is ok.
  # * It doesn't support hexadecimal representation. This could be improved later.
  # The pattern is anchored with \A and \z so the WHOLE value must be a float literal; ^ and $ match at every
  # line boundary and would classify a multi-line string as double if any one of its lines looked like a float.
  if str =~ /\A[+-]?(NaN|Infinity|([1-9]\d*|0)(\.\d+)([eE][+-]?\d+)?[fFdD]?)\z/
    return "double"
  end

  return nil if str.empty?

  begin
    JSON.parse(str)
    return "json"
  rescue StandardError
    # Not parseable as JSON; fall through to the generic string type.
  end

  "string"
end
merge_type(type1, type2) click to toggle source
# File lib/embulk/guess/schema_guess.rb, line 120
# Combines two guessed types into one.
#
# Equal types are returned unchanged, and a nil on either side yields the
# other side. Otherwise the sorted pair is looked up in TYPE_COALESCE, and
# "string" is used when the lookup gives nothing.
def merge_type(type1, type2)
  return type1 if type1 == type2
  return type1 || type2 if type1.nil? || type2.nil?

  # Sorting makes the lookup independent of argument order.
  lookup_key = [type1, type2].sort
  TYPE_COALESCE[lookup_key] || "string"
end
merge_types(types) click to toggle source
# File lib/embulk/guess/schema_guess.rb, line 86
# Collapses all guesses for one column into a single type.
#
# The guesses are folded pairwise with merge_type, starting from nil; a
# column with no usable guess falls back to "string". When the merged result
# is a TimestampTypeMatch, the formats of all TimestampTypeMatch guesses
# (nil formats dropped) are passed to TimeFormatGuess.guess, and a new
# TimestampTypeMatch built from that result is returned.
def merge_types(types)
  merged = types.inject(nil) { |acc, candidate| merge_type(acc, candidate) } || "string"
  return merged unless merged.is_a?(TimestampTypeMatch)

  timestamp_guesses = types.select { |candidate| candidate.is_a?(TimestampTypeMatch) }
  formats = timestamp_guesses.map { |match| match.format }.compact
  TimestampTypeMatch.new(TimeFormatGuess.guess(formats))
end