class GnListResolver::Reader

Reads the supplied CSV file and creates a Ruby structure to compare with a Global Names Resolver source.

Attributes

col_sep[R]
original_fields[R]

Public Class Methods

new(csv_io, input_name, skip_original, alt_headers, stats) click to toggle source
# File lib/gn_list_resolver/reader.rb, line 9
# Builds a reader around an already opened CSV stream.
#
# csv_io        - IO-like object holding the CSV data; it is scanned once
#                 here by detect_col_sep.
# input_name    - label interpolated into the log line written by read.
# skip_original - flag passed to the collector and consulted by headers.
# alt_headers   - replacement header names consulted by process_headers.
# stats         - object whose `stats` hash receives ingestion counters.
def initialize(csv_io, input_name, skip_original, alt_headers, stats)
  @csv_io = csv_io
  @stats = stats
  @input_name = input_name
  @skip_original = skip_original
  @alt_headers = alt_headers
  @original_fields = nil
  # Separator detection reads @csv_io and writes into @stats, so both are
  # assigned before this point; the quote character depends on the separator.
  @col_sep = detect_col_sep
  @quote_char = quote_char(@col_sep)
end

Public Instance Methods

read() click to toggle source
# File lib/gn_list_resolver/reader.rb, line 20
# Starts ingestion: stamps the start time and the :ingestion status on the
# stats hash, logs the input name, then parses the CSV stream.
#
# An optional block is forwarded to parse_input. It is captured through an
# explicit &block parameter because a block-less `Proc.new` no longer picks
# up the caller's block on Ruby 3.0+ (it raises ArgumentError there).
#
# Returns whatever parse_input returns.
def read(&block)
  @stats.stats[:ingestion_start] = Time.now
  @stats.stats[:status] = :ingestion
  GnListResolver.log("Read input from #{@input_name}")
  # Passing a nil block is the same as passing no block at all.
  parse_input(&block)
end

Private Instance Methods

detect_col_sep() click to toggle source
# File lib/gn_list_resolver/reader.rb, line 29
# Guesses the column separator from the first line of @csv_io and stores the
# number of lines that follow it under :total_records.
#
# Candidates are ";", "," and tab. The candidate occurring most often in the
# first line wins; equal counts are resolved by weight (tab > ";" > ","), so
# a line containing none of them yields tab. The stream is rewound before
# returning.
#
# An empty stream has no first line; it is treated as an empty string so the
# method returns tab instead of raising NoMethodError on nil.
def detect_col_sep
  first_line = @csv_io.first.to_s
  # The first line has already been consumed, so only later lines are counted.
  @stats.stats[:total_records] = @csv_io.readlines.size
  @csv_io.rewind
  candidates = [[";", 1], [",", 0], ["\t", 2]].map do |sep, weight|
    [first_line.count(sep), weight, sep]
  end
  # Arrays compare element by element: occurrence count first, then weight.
  candidates.max.last
end
headers(row) click to toggle source
# File lib/gn_list_resolver/reader.rb, line 100
# Produces the header list that is remembered as the "original fields".
#
# A copy of +row+ is returned untouched unless @skip_original is set; in that
# case the copy is narrowed down by taxon_id_header.
def headers(row)
  copy = row.dup
  return copy unless @skip_original

  taxon_id_header(copy)
end
log_progress(count) click to toggle source
# File lib/gn_list_resolver/reader.rb, line 92
# Reports ingestion progress whenever +count+ is a multiple of 1,000
# (including 0).
#
# On a reporting call it logs the row number (count + 1), stores that number
# under :ingested_records, refreshes :ingestion_span from :ingestion_start
# and returns true. On every other call it does nothing and returns false.
def log_progress(count)
  if (count % 1_000).zero?
    row_number = count + 1
    GnListResolver.log("Ingesting csv row #{row_number}")
    @stats.stats[:ingested_records] = row_number
    @stats.stats[:ingestion_span] = Time.now - @stats.stats[:ingestion_start]
    true
  else
    false
  end
end
parse_input() { |stats| ... } click to toggle source
# File lib/gn_list_resolver/reader.rb, line 43
# Parses @csv_io with the detected separator and quote character, feeding
# each row to a fresh Collector, and returns the collector's data.
#
# When a block is given it is forwarded to process and is additionally
# called once with the stats hash after wrap_up has run.
#
# The block is captured with an explicit &block parameter: a block-less
# `Proc.new` raises ArgumentError on Ruby 3.0+.
def parse_input(&block)
  dc = Collector.new(@skip_original)
  csv = CSV.new(@csv_io, col_sep: @col_sep, quote_char: @quote_char)
  # A nil block is equivalent to calling process without a block.
  process(csv, dc, &block)
  wrap_up
  yield @stats.stats if block
  dc.data
end
process(csv, data_collector) { |stats| ... } click to toggle source
# File lib/gn_list_resolver/reader.rb, line 52
# Reads rows from +csv+ one at a time until read_line returns a falsy value,
# then closes @csv_io.
#
# Before each read, log_progress is consulted; when it reports progress and a
# block was given, the block is called with the stats hash.
#
# The stream is closed by a separate statement after the loop. Chaining it
# with `&&` onto the loop never ran it, because a loop left through a bare
# `break` evaluates to nil.
#
# Returns nil.
def process(csv, data_collector)
  counter = 0
  loop do
    # log_progress must run on every pass, with or without a block.
    yield @stats.stats if log_progress(counter) && block_given?
    break unless read_line(csv, data_collector)

    counter += 1
  end
  @csv_io.close
  nil
end
process_headers(row) click to toggle source
# File lib/gn_list_resolver/reader.rb, line 76
# Handles the header row: remembers the original fields and returns the
# header row to use from here on.
#
# When alternative headers were supplied, the result of produce_alt_headers
# is returned; otherwise +row+ is returned unchanged.
#
# A nil @alt_headers is treated like an empty list. `@alt_headers&.empty?`
# evaluates to nil for nil, which made `unless` take the alternative-header
# path and fail on `nil.size`.
def process_headers(row)
  @original_fields = headers(row)
  return row if @alt_headers.nil? || @alt_headers.empty?

  produce_alt_headers(row)
end
produce_alt_headers(row) click to toggle source
# File lib/gn_list_resolver/reader.rb, line 82
# Fits @alt_headers to the width of +row+ and returns the result, which is
# also stored back into @alt_headers.
#
# Extra alternative headers are dropped, missing ones are filled with "",
# and nil entries are replaced by "".
#
# A new array is built rather than appending to the existing one, so the
# array object originally assigned to @alt_headers is not modified. Sizing
# by row.size also handles an empty row: the range slice `[0..row.size - 1]`
# turned into `[0..-1]` there and kept every header.
def produce_alt_headers(row)
  @alt_headers = Array.new(row.size) do |idx|
    # Indexing past the end yields nil, which covers the padding case too.
    header = @alt_headers[idx]
    header.nil? ? "" : header
  end
end
quote_char(col_sep) click to toggle source
# File lib/gn_list_resolver/reader.rb, line 39
# Chooses the CSV quote character for a column separator: a NUL byte when the
# separator is a tab, a double quote for anything else.
# NOTE(review): the NUL byte presumably serves to switch quoting off for
# tab-separated input; confirm against the CSV parser's behaviour.
def quote_char(col_sep)
  return "\x00" if col_sep == "\t"

  '"'
end
read_line(csv, data_collector) click to toggle source
# File lib/gn_list_resolver/reader.rb, line 62
# Reads one row from +csv+ and hands it to +data_collector+.
#
# Returns false when the input is exhausted. The first row read while
# @original_fields is still nil is passed through process_headers first.
# Otherwise the return value is whatever the collector's process_row returns.
#
# A CSV::MalformedCSVError is recorded in the stats :errors list while that
# list holds fewer than 10 messages; the list itself is then returned. Once
# the list is full nothing is stored and the method returns nil.
# NOTE(review): the nil return after the tenth error is falsy, so a caller
# looping on this value stops there; confirm that this is intended.
def read_line(csv, data_collector)
  fields = csv.readline
  return false if fields.nil?

  fields = process_headers(fields) if @original_fields.nil?
  data_collector.process_row(fields)
rescue CSV::MalformedCSVError => err
  errors = @stats.stats[:errors]
  errors << err.message if errors.size < 10
end
taxon_id_header(hdrs) click to toggle source
# File lib/gn_list_resolver/reader.rb, line 105
# Finds the first header whose text ends in "taxonid" (case-insensitive,
# trailing whitespace allowed) and returns it wrapped in an array.
# nil headers are skipped. Returns an empty array when nothing matches.
def taxon_id_header(hdrs)
  taxon_header = hdrs.find { |name| name&.match?(/taxonid\s*$/i) }
  taxon_header ? [taxon_header] : []
end
wrap_up() click to toggle source
# File lib/gn_list_resolver/reader.rb, line 71
# Finalises the ingestion counters: marks every record as ingested by copying
# :total_records into :ingested_records, and stores the time elapsed since
# :ingestion_start under :ingestion_span (which is also the return value).
def wrap_up
  counters = @stats.stats
  counters[:ingested_records] = counters[:total_records]
  counters[:ingestion_span] = Time.now - counters[:ingestion_start]
end