class Fech::MapGenerator

Helper class that generates mapping hashes from source CSV data. It is needed only to rebuild rendered_maps.rb when the source data changes; it is not used by the main gem.

rake fech:maps

Constants

BASE_ROW_TYPES
FILING_VERSIONS
PAPER_BASE_ROW_TYPES
PAPER_FILING_VERSIONS
ROW_TYPE_MATCHERS

Attributes

map[RW]

Public Class Methods

convert_header_file_to_row_files(source_dir) click to toggle source

Goes through all version header summary files and generates row map files for each type of row inside them.

# File lib/fech/map_generator.rb, line 74
# Goes through all version header summary files and generates a row map
# template file for each type of row found inside them.
#
# For every version in FILING_VERSIONS the summary file is rewritten in place
# with invalid UTF-8 sequences removed, then parsed row by row. Fields listed
# in the ignore file are dropped. A template is only written for row types
# whose target file (see write_row_map_file) already exists.
#
# source_dir - directory containing the 'headers' and 'rows' sub-directories
#              that the path helper methods point into.
def self.convert_header_file_to_row_files(source_dir)
  data = {}
  hybrid_data = {}

  # Block form closes the ignore list as soon as it has been read.
  ignored_fields = File.open(ignored_fields_file(source_dir)) do |f|
    f.readlines.map { |l| l.strip }
  end

  # Create a hash of data with an entry for each row type found in the source
  # version summary files. Each row has an entry for each version map that
  # exists for it. If maps for two different versions are identical, they
  # are combined.
  FILING_VERSIONS.each do |version|
    filepath = version_summary_file(source_dir, version)

    # Clean the source files by removing unparseable characters
    raw = File.open(filepath) { |f| f.read }
    if RUBY_VERSION < "1.9.3"
      require 'iconv'
      ic = Iconv.new('UTF-8//IGNORE', 'UTF-8')
      valid_string = ic.iconv(raw << ' ')[0..-2]
    else
      # Round-trip through UTF-16 so that invalid byte sequences are dropped
      valid_string = (raw << ' ')[0..-2].encode!('UTF-16', 'UTF-8', :invalid => :replace, :replace => '')
      valid_string = valid_string.encode!('UTF-8', 'UTF-16')
    end
    # The handle must be closed (and therefore flushed) before the file is
    # parsed again below, otherwise the reader can see a truncated file.
    File.open(filepath, 'w') { |f| f.write(valid_string) }

    Fech::Csv.foreach(filepath) do |row|
      # A blank line has no row type to file the map under; skip it.
      next if row.first.nil?

      # Each row of a version summary file contains the ordered list of
      # column names.
      data[row.first] ||= {}
      hybrid_data[row.first] ||= {}
      row_version_data = remove_ignored_fields(row, ignored_fields)

      # Check the maps for this row type in already-processed versions.
      # If this map is identical to a previous map, tack this version on to
      # to it instead of creating a new one.
      # NOTE(review): hybrid_data keeps earlier combined keys, so once a third
      # version matches, the update below re-adds the older combined entry
      # next to the new one - confirm whether that is intended.
      data[row.first][version] = row_version_data
      data[row.first].each do |k, v|
        # skip the row we just added
        next if k == version
        if v == row_version_data
          # Create the new hybrid entry
          hybrid_data[row.first]["#{k}|#{version}"] = row_version_data

          # Delete the old entry, and the one for this version only
          data[row.first].delete(k)
          data[row.first].delete(version)
        end
      end
      data[row.first].update(hybrid_data[row.first])
    end
  end

  # Go through each row type and create a base map management file that
  # will serve as a template for organizing which fields are the same
  # between versions. This file will need to then be arranged by hand to
  # clean up the data. Each row will represent a column across versions,
  # each column a unique map for that row for one or more versions.
  data.each do |row_type, row_data|
    file_path = write_row_map_file(source_dir, row_type)
    # File.exists? was removed in Ruby 3.2; File.exist? works everywhere.
    next unless File.exist?(file_path)
    File.open(file_path, 'w') do |f|
      f.write('canonical')

      to_transpose = []
      row_data.sort.reverse.each do |version, version_data|
        # One line of "^version" followed by 1-based positions, then one line
        # of the field descriptions themselves.
        to_transpose << ["^#{version}", version_data.each_with_index.collect {|x, idx| idx+1}].flatten
        to_transpose << [nil, version_data].flatten
      end

      # standardize row size
      max_size = to_transpose.max { |r1, r2| r1.size <=> r2.size }.size
      to_transpose.each { |r| r[max_size - 1] ||= nil }
      transposed = to_transpose.transpose

      transposed.each do |transposed_data|
        transposed_data.collect! {|x| x.to_s.gsub(/\r/, ' ')}
        canonical = transposed_data[1] # first description
        if canonical
          # Derive a snake_case name from the description and prepend it
          canonical = canonical.gsub(/\{.*\}/, "").gsub(/[ -\.\/\(\)]/, "_").gsub(/_+/, "_").gsub(/(_$)|(^_)/, "").downcase
          transposed_data = [canonical, transposed_data].flatten
        end
        f.write(transposed_data.join(','))
        f.write("\n")
      end
    end
  end

end
dump_row_maps_to_ruby(source_dir, file_path) click to toggle source

Generates the mapping for each row type in BASE_ROW_TYPES and writes them out to a file for inclusion in the gem.

# File lib/fech/map_generator.rb, line 165
# Renders the map of every row type in BASE_ROW_TYPES as Ruby source and
# writes it to +file_path+ as the Fech::RENDERED_MAPS constant.
#
# source_dir - directory handed to generate_row_map_from_file.
# file_path  - file to (over)write with the generated Ruby code.
def self.dump_row_maps_to_ruby(source_dir, file_path)
  preamble = [
    "# Generated automatically by Fech::MapGenerator.\n\n",
    "# RENDERED_MAPS contains an entry for each supported row type, which in turn:\n",
    "#   contain an entry for each distinct map between a row's labels and the\n",
    "#   indexes where their values can be found.\n",
    "module Fech\n",
    "  RENDERED_MAPS = {\n"
  ]

  File.open(file_path, 'w') do |out|
    preamble.each { |line| out.write(line) }

    BASE_ROW_TYPES.each do |row_type|
      out.write("    \"#{ROW_TYPE_MATCHERS[row_type].source}\" => {\n")

      row_map = generate_row_map_from_file(source_dir, row_type)
      # Highest version key first
      row_map.sort_by { |pair| pair.first }.reverse.each do |version_key, fields|
        rendered = fields.map do |field|
          # Drop any leading digits (and one underscore); blanks become nil
          label = field.to_s.gsub(/^\d+_?/, "")
          label == "" ? "nil" : ":#{label}"
        end
        out.write("      \'#{version_key}' => [#{rendered.join(', ')}],\n")
      end

      out.write("    },\n")
    end

    out.write("  }\n")
    out.write("end")
  end
end
generate_row_map_from_file(source_dir, row_type) click to toggle source

For a given row type, parses its source file and returns a mapping object for it.

# File lib/fech/map_generator.rb, line 187
# For a given row type, parses its row map CSV (located via row_map_file) and
# returns a mapping object for it.
#
# The row whose first cell is "canonical" (case-insensitive) names the
# versions: every non-empty cell after the first is a version label, and its
# column holds that version's 1-based field positions. In every other row the
# first cell is the canonical field name; rows with an empty first cell and
# blank lines are skipped.
#
# source_dir - directory handed to row_map_file.
# row_type   - row type name handed to row_map_file.
#
# Returns a Hash of version label => Array in which the canonical name (as a
# Symbol) sits at index (position - 1); positions never mentioned stay nil.
def self.generate_row_map_from_file(source_dir, row_type)
  versions = []
  version_indexes = []
  data = {}
  # File.read closes the handle; the file is small enough to slurp.
  text = File.read(row_map_file(source_dir, row_type))
  # If the file contains any carriage return, split on those; leftover "\n"
  # characters are stripped from each cell below.
  split_char = text.index(/\r/) ? /\r/ : /\n/
  rows = text.split(split_char).map { |line| line.split(',') }
  rows.each do |row|
    row = row.map { |cell| cell.gsub("\n", "") }
    # A blank line splits into an empty row - nothing to map.
    next if row.empty?

    if row.first.downcase == "canonical"
      versions = row[1..-1].uniq.reject { |cell| cell.empty? }
      row.each_with_index { |cell, ind| version_indexes << ind unless cell.empty? }
      # Drop the index of the "canonical" cell itself
      version_indexes.slice!(0, 1)
      versions.each { |version| data[version] = [] }

    elsif row.first.size > 0
      canonical = row.first

      versions.zip(version_indexes).each do |version, row_index|
        index = row[row_index]
        # Empty or missing cells convert to 0 and are ignored
        data[version][index.to_i - 1] = canonical.to_sym if index.to_i > 0
      end
    end
  end

  data
end
ignored_fields_file(source_dir) click to toggle source
# File lib/fech/map_generator.rb, line 231
# Path of the ignore list: <source_dir>/headers/ignore.csv.
def self.ignored_fields_file(source_dir)
  segments = [source_dir, 'headers', 'ignore.csv']
  File.join(*segments)
end
remove_ignored_fields(row, ignore) click to toggle source

Removes both the row type from the beginning of the row and any fields marked as “ignore” in sources/headers/ignore.csv.

# File lib/fech/map_generator.rb, line 222
# Strips the leading row type from +row+, drops nil cells, and removes every
# field that appears in +ignore+.
#
# row    - Array whose first element is the row type, followed by field names.
# ignore - collection of field names to discard (anything answering include?).
#
# Returns a new Array of the remaining field names; an empty row yields [].
def self.remove_ignored_fields(row, ignore)
  # row[1..-1] is nil when row is empty, so fall back to an empty list
  fields = (row[1..-1] || []).compact # strip off the row type
  fields.reject { |f| ignore.include?(f) }
end
row_map_file(source_dir, row_type) click to toggle source
# File lib/fech/map_generator.rb, line 227
# Path of the row map CSV for +row_type+: <source_dir>/<row_type>.csv.
def self.row_map_file(source_dir, row_type)
  csv_name = row_type + '.csv'
  File.join(source_dir, csv_name)
end
version_summary_file(source_dir, version) click to toggle source
# File lib/fech/map_generator.rb, line 235
# Path of the header summary CSV for +version+:
# <source_dir>/headers/<version>.csv.
def self.version_summary_file(source_dir, version)
  csv_name = version + '.csv'
  File.join(source_dir, 'headers', csv_name)
end
write_row_map_file(source_dir, row_type) click to toggle source
# File lib/fech/map_generator.rb, line 239
# Path the generated row map template for +row_type+ is written to:
# <source_dir>/rows/<row_type>.csv.
def self.write_row_map_file(source_dir, row_type)
  csv_name = row_type + '.csv'
  File.join(source_dir, 'rows', csv_name)
end