class Fech::MapGenerator
Helper class that generates mapping hashes from source CSV data. It is needed to rebuild rendered_maps.rb when the source data changes; it is not used by the main gem.
rake fech:maps
Constants
- BASE_ROW_TYPES
- FILING_VERSIONS
- PAPER_BASE_ROW_TYPES
- PAPER_FILING_VERSIONS
- ROW_TYPE_MATCHERS
Attributes
map[RW]
Public Class Methods
convert_header_file_to_row_files(source_dir)
click to toggle source
Goes through all version header summary files and generates row map files for each type of row inside them.
# File lib/fech/map_generator.rb, line 74
# Goes through all version header summary files and generates row map files
# for each type of row inside them.
#
# Side effects visible in this method:
# * every version summary file is rewritten in place with invalid byte
#   sequences stripped;
# * every row map file that ALREADY EXISTS is overwritten; row types with no
#   existing file are skipped.
#
# source_dir - directory containing the header and row map files (passed
#              through to the path helper methods).
def self.convert_header_file_to_row_files(source_dir)
  data = {}
  hybrid_data = {}
  # File.readlines closes the file for us (File.open(...).readlines leaked it).
  ignored_fields = File.readlines(ignored_fields_file(source_dir)).map { |l| l.strip }

  # Create a hash of data with an entry for each row type found in the source
  # version summary files. Each row has an entry for each version map that
  # exists for it. If maps for two different versions are identical, they
  # are combined.
  FILING_VERSIONS.each do |version|
    filepath = version_summary_file(source_dir, version)

    # Clean the source files by removing unparseable characters
    raw = File.read(filepath)
    if RUBY_VERSION < "1.9.3"
      require 'iconv'
      ic = Iconv.new('UTF-8//IGNORE', 'UTF-8')
      valid_string = ic.iconv(raw << ' ')[0..-2]
    else
      valid_string = (raw << ' ')[0..-2].encode!('UTF-16', 'UTF-8', :invalid => :replace, :replace => '')
      valid_string = valid_string.encode!('UTF-8', 'UTF-16')
    end
    # The block form closes (and therefore flushes) the file before it is
    # read back below; a bare open(...).write(...) left the data buffered.
    File.open(filepath, 'w') { |f| f.write(valid_string) }

    Fech::Csv.foreach(filepath) do |row|
      # Each row of a version summary file contains the ordered list of
      # column names.
      data[row.first] ||= {}
      hybrid_data[row.first] ||= {}
      row_version_data = remove_ignored_fields(row, ignored_fields)

      # Check the maps for this row type in already-processed versions.
      # If this map is identical to a previous map, tack this version on
      # to it instead of creating a new one.
      data[row.first][version] = row_version_data
      data[row.first].each do |k, v|
        # skip the row we just added
        next if k == version

        if v == row_version_data
          # Create the new hybrid entry
          hybrid_data[row.first]["#{k}|#{version}"] = row_version_data

          # Delete the old entry, and the one for this version only
          data[row.first].delete(k)
          data[row.first].delete(version)
        end
      end
      # NOTE(review): hybrid_data entries are never removed, so every hybrid
      # key created so far is merged back in here - confirm that is intended.
      data[row.first].update(hybrid_data[row.first])
    end
  end

  # Go through each row type and create a base map management file that
  # will serve as a template for organizing which fields are the same
  # between versions. This file will need to then be arranged by hand to
  # clean up the data. Each row will represent a column across versions,
  # each column a unique map for that row for one or more versions.
  data.each do |row_type, row_data|
    file_path = write_row_map_file(source_dir, row_type)
    # File.exists? was removed in Ruby 3.2; File.exist? works everywhere.
    next unless File.exist?(file_path)

    File.open(file_path, 'w') do |f|
      f.write('canonical')

      to_transpose = []
      row_data.sort.reverse.each do |version, version_data|
        # One column of 1-based positions headed by "^<version>", followed
        # by one column holding the field names themselves.
        to_transpose << ["^#{version}", version_data.each_with_index.collect {|x, idx| idx+1}].flatten
        to_transpose << [nil, version_data].flatten
      end

      # standardize row size
      max_size = to_transpose.max { |r1, r2| r1.size <=> r2.size }.size
      to_transpose.each { |r| r[max_size - 1] ||= nil }
      transposed = to_transpose.transpose

      transposed.each do |transposed_data|
        transposed_data.collect! {|x| x.to_s.gsub(/\r/, ' ')}
        canonical = transposed_data[1] # first description
        if canonical
          canonical = canonical.gsub(/\{.*\}/, "").gsub(/[ -\.\/\(\)]/, "_").gsub(/_+/, "_").gsub(/(_$)|(^_)/, "").downcase
          transposed_data = [canonical, transposed_data].flatten
        end
        f.write(transposed_data.join(','))
        f.write("\n")
      end
    end
  end
end
dump_row_maps_to_ruby(source_dir, file_path)
click to toggle source
Generates the mapping for each row type in BASE_ROW_TYPES and writes them out to a file for inclusion in the gem.
# File lib/fech/map_generator.rb, line 165
# Writes a Ruby source file at +file_path+ defining Fech::RENDERED_MAPS.
# For every row type in BASE_ROW_TYPES it emits one entry keyed by the source
# of that row type's matcher regexp, containing the maps returned by
# generate_row_map_from_file in reverse key order.
#
# source_dir - directory handed to generate_row_map_from_file.
# file_path  - path of the Ruby file to (over)write.
def self.dump_row_maps_to_ruby(source_dir, file_path)
  File.open(file_path, 'w') do |out|
    out.write("# Generated automatically by Fech::MapGenerator.\n\n")
    out.write("# RENDERED_MAPS contains an entry for each supported row type, which in turn:\n")
    out.write("# contain an entry for each distinct map between a row's labels and the\n")
    out.write("# indexes where their values can be found.\n")
    out.write("module Fech\n")
    out.write(" RENDERED_MAPS = {\n")

    BASE_ROW_TYPES.each do |row_type|
      out.write(" \"#{ROW_TYPE_MATCHERS[row_type].source}\" => {\n")

      version_maps = generate_row_map_from_file(source_dir, row_type)
      version_maps.sort_by(&:first).reverse.each do |version_key, fields|
        rendered = fields.map do |field|
          # Drop a leading numeric prefix; unset slots become a literal nil.
          label = field.to_s.gsub(/^\d+_?/, "")
          label == "" ? "nil" : ":#{label}"
        end
        out.write(" \'#{version_key}' => [#{rendered.join(', ')}],\n")
      end

      out.write(" },\n")
    end

    out.write(" }\n")
    out.write("end")
  end
end
generate_row_map_from_file(source_dir, row_type)
click to toggle source
For a given row type, parses its source file and returns a mapping object for it.
# File lib/fech/map_generator.rb, line 187
# For a given row type, parses its row map file and returns a mapping object
# for it.
#
# The file is split into lines on "\r" if it contains any, otherwise on "\n",
# and each line is split on commas. The line whose first cell is "canonical"
# (case-insensitive) names the versions; every later line with a non-empty
# first cell assigns that cell, as a Symbol, to the 1-based position found in
# each version's column. Blank or comma-only lines are skipped.
#
# source_dir - directory passed to row_map_file.
# row_type   - row type passed to row_map_file.
#
# Returns a Hash of version label => Array of Symbols (nil for unset slots).
def self.generate_row_map_from_file(source_dir, row_type)
  versions = []
  version_indexes = []
  data = {}

  # File.read closes the handle; open(...).read leaked it.
  text = File.read(row_map_file(source_dir, row_type))
  split_char = text.index(/\r/) ? /\r/ : /\n/
  rows = text.split(split_char).collect { |x| x.split(',') }

  rows.each do |row|
    row = row.collect { |x| x.gsub("\n", "") }

    # A blank or comma-only line splits to an empty array. This used to drop
    # into a leftover ruby-debug breakpoint and then fail on nil.downcase.
    next if row.first.nil?

    if row.first.downcase == "canonical"
      versions = row[1..-1].uniq.collect { |x| x unless (x.nil? || x.empty?) }.compact
      row.each_with_index { |x, ind| version_indexes << ind unless (x.nil? || x.empty?) }
      # Drop the index of the "canonical" cell itself.
      version_indexes.slice!(0, 1)
      versions.each { |x| data[x] = [] }
    elsif row.first.size > 0
      canonical = row.first
      versions.zip(version_indexes).each do |version, row_index|
        index = row[row_index]
        # Cells that are empty or non-numeric convert to 0 and are ignored.
        data[version][index.to_i - 1] = canonical.to_sym if index.to_i > 0
      end
    end
  end

  data
end
ignored_fields_file(source_dir)
click to toggle source
# File lib/fech/map_generator.rb, line 231
# Returns the path of headers/ignore.csv underneath +source_dir+.
def self.ignored_fields_file(source_dir)
  path_parts = [source_dir, 'headers', 'ignore.csv']
  File.join(*path_parts)
end
remove_ignored_fields(row, ignore)
click to toggle source
Remove both the row type from the beginning of the row, and any fields marked as “ignore” in sources/headers/ignore.csv
# File lib/fech/map_generator.rb, line 222
# Removes the row type from the beginning of the row, drops nil cells, and
# rejects any field contained in +ignore+.
#
# row    - Array whose first element is the row type.
# ignore - collection of field names to reject (anything responding to
#          include?).
#
# Returns a new Array; an empty +row+ yields an empty Array.
def self.remove_ignored_fields(row, ignore)
  # row[1..-1] is nil for an empty row, so coerce before compacting.
  fields = Array(row[1..-1]).compact # strip off the row type
  fields.reject { |field| ignore.include?(field) }
end
row_map_file(source_dir, row_type)
click to toggle source
# File lib/fech/map_generator.rb, line 227
# Returns the path "<source_dir>/<row_type>.csv".
#
# row_type is interpolated, so Symbols are accepted as well as Strings.
def self.row_map_file(source_dir, row_type)
  File.join(source_dir, "#{row_type}.csv")
end
version_summary_file(source_dir, version)
click to toggle source
# File lib/fech/map_generator.rb, line 235
# Returns the path "<source_dir>/headers/<version>.csv".
#
# version is interpolated, so non-String values (e.g. a Symbol or Float) are
# accepted as well as Strings.
def self.version_summary_file(source_dir, version)
  File.join(source_dir, 'headers', "#{version}.csv")
end
write_row_map_file(source_dir, row_type)
click to toggle source
# File lib/fech/map_generator.rb, line 239
# Returns the path "<source_dir>/rows/<row_type>.csv".
#
# row_type is interpolated, so Symbols are accepted as well as Strings.
def self.write_row_map_file(source_dir, row_type)
  File.join(source_dir, 'rows', "#{row_type}.csv")
end