class IOStreams::Tabular

Common handling for efficiently processing tabular data such as CSV, spreadsheet or other tabular files on a line by line basis.

Tabular consists of a table of data where the first row is usually the header, and subsequent rows are the data elements.

Tabular applies the header information to every row of data when as_hash is called.

Example using the default CSV parser:

tabular = Tabular.new
tabular.parse_header("first field,Second,thirD")
# => ["first field", "Second", "thirD"]

tabular.cleanse_header!
# => ["first_field", "second", "third"]

tabular.record_parse("1,2,3")
# => {"first_field"=>"1", "second"=>"2", "third"=>"3"}

tabular.record_parse([1,2,3])
# => {"first_field"=>1, "second"=>2, "third"=>3}

tabular.render([5,6,9])
# => "5,6,9"

tabular.render({"third"=>"3", "first_field"=>"1" })
# => "1,,3"

Attributes

format[R]
header[R]
parser[R]

Public Class Methods

deregister_format(format) click to toggle source

De-Register a file format

Returns [Symbol] the format removed, or nil if the format was not registered

Example:

register_extension(:xls)
# File lib/io_streams/tabular.rb, line 181
def self.deregister_format(format)
  raise(ArgumentError, "Invalid format #{format.inspect}") unless format.to_s =~ /\A\w+\Z/

  @formats.delete(format.to_sym)
end
format_from_file_name(file_name) click to toggle source

Returns the registered format that will be used for the supplied file name.

# File lib/io_streams/tabular.rb, line 196
def self.format_from_file_name(file_name)
  file_name.to_s.split(".").reverse_each { |ext| return ext.to_sym if @formats.include?(ext.to_sym) }
  nil
end
new(format: nil, file_name: nil, format_options: nil, default_format: :csv, **args) click to toggle source

Parse a delimited data source.

Parameters

format: [Symbol]
  :csv, :hash, :array, :json, :psv, :fixed

file_name: [IOStreams::Path | String]
  When `:format` is not supplied the file name can be used to infer the required format.
  Optional. Default: nil

format_options: [Hash]
  Any specialized format specific options. For example, `:fixed` format requires the file definition.

columns [Array<String>]
  The header columns when the file does not include a header row.
  Note:
    It is recommended to keep all columns as strings to avoid any issues when persistence
    with MongoDB when it converts symbol keys to strings.

allowed_columns [Array<String>]
  List of columns to allow.
  Default: nil ( Allow all columns )
  Note:
    When supplied any columns that are rejected will be returned in the cleansed columns
    as nil so that they can be ignored during processing.

required_columns [Array<String>]
  List of columns that must be present, otherwise an Exception is raised.

skip_unknown [true|false]
  true:
    Skip columns not present in the `allowed_columns` by cleansing them to nil.
    #as_hash will skip these additional columns entirely as if they were not in the file at all.
  false:
    Raises Tabular::InvalidHeader when a column is supplied that is not in the whitelist.

default_format: [Symbol]
  When the format is not supplied, and the format cannot be inferred from the supplied file name
  then this default format will be used.
  Default: :csv
  Set to nil to force it to raise an exception when the format is undefined.
# File lib/io_streams/tabular.rb, line 90
def initialize(format: nil, file_name: nil, format_options: nil, default_format: :csv, **args)
  @header = Header.new(**args)
  @format = file_name && format.nil? ? self.class.format_from_file_name(file_name) : format
  @format ||= default_format
  raise(UnknownFormat, "The format cannot be inferred from the file name: #{file_name}") unless @format

  klass   = self.class.parser_class(@format)
  @parser = format_options ? klass.new(**format_options) : klass.new
end
parser_class(format) click to toggle source

Returns the parser class for the registered format.

# File lib/io_streams/tabular.rb, line 202
def self.parser_class(format)
  @formats[format.nil? ? nil : format.to_sym] ||
    raise(ArgumentError, "Unknown Tabular Format: #{format.inspect}")
end
register_format(format, parser) click to toggle source

Register a format and the parser class for it.

Example:

register_format(:csv, IOStreams::Tabular::Parser::Csv)
# File lib/io_streams/tabular.rb, line 169
def self.register_format(format, parser)
  raise(ArgumentError, "Invalid format #{format.inspect}") unless format.to_s =~ /\A\w+\Z/

  @formats[format.to_sym] = parser
end
registered_formats() click to toggle source

Returns [Array<Symbol>] the list of registered formats

# File lib/io_streams/tabular.rb, line 188
def self.registered_formats
  @formats.keys
end

Public Instance Methods

cleanse_header!() click to toggle source

Returns [Array<String>] the cleansed columns

# File lib/io_streams/tabular.rb, line 160
def cleanse_header!
  header.cleanse!
  header.columns
end
header?() click to toggle source

Returns [true|false] whether a header is still required in order to parse or render the current format.

# File lib/io_streams/tabular.rb, line 101
def header?
  parser.requires_header? && IOStreams::Utils.blank?(header.columns)
end
parse_header(line) click to toggle source

Returns [Array] the header row/line after parsing and cleansing. Returns `nil` if the row/line is blank, or a header is not required for the supplied format (:json, :hash).

Notes:

  • Call `header?` first to determine if the header should be parsed first.

  • The header columns are set after parsing the row, but the header is not cleansed.

# File lib/io_streams/tabular.rb, line 116
def parse_header(line)
  return if IOStreams::Utils.blank?(line) || !parser.requires_header?

  header.columns = parser.parse(line)
end
record_parse(line) click to toggle source

Returns [Hash<String,Object>] the line as a hash. Returns nil if the line is blank.

# File lib/io_streams/tabular.rb, line 124
def record_parse(line)
  line = row_parse(line)
  header.to_hash(line) if line
end
render(row) click to toggle source

Renders the output row

# File lib/io_streams/tabular.rb, line 138
def render(row)
  return if IOStreams::Utils.blank?(row)

  parser.render(row, header)
end
render_header() click to toggle source

Returns [String] the header rendered for the output format Return nil if no header is required.

# File lib/io_streams/tabular.rb, line 146
def render_header
  return unless requires_header?

  if IOStreams::Utils.blank?(header.columns)
    raise(
      Errors::MissingHeader,
      "Header columns must be set before attempting to render a header for format: #{format.inspect}"
    )
  end

  parser.render(header.columns, header)
end
requires_header?() click to toggle source

Returns [true|false] whether a header row show be rendered on output.

# File lib/io_streams/tabular.rb, line 106
def requires_header?
  parser.requires_header?
end
row_parse(line) click to toggle source

Returns [Array] the row/line as a parsed Array of values. Returns nil if the row/line is blank.

# File lib/io_streams/tabular.rb, line 131
def row_parse(line)
  return if IOStreams::Utils.blank?(line)

  parser.parse(line)
end