class CsvFormatGuesser

Constants

COMMON_QUOTE_CHARS
DEFAULT_ENCODING
DEFAULT_QUOTE_CHAR
POTENTIAL_COL_SEP_REGEX
PREVIEW_BYTES
PREVIEW_LINES
VERSION

Attributes

col_sep[R]
encoding[R]
quote_char[R]

Public Class Methods

new(path) click to toggle source
# File lib/csv_format_guesser.rb, line 11
# Creates a guesser for the file at +path+ and runs every detection step
# right away.
#
# The steps run in a fixed order: the encoding is guessed first, then the
# column separator, then the quote character.
#
# @param path [String] location of the CSV file to inspect
def initialize(path)
  @path = path
  guess_encoding
  guess_col_sep
  guess_quote_char
end

Public Instance Methods

csv_opts() click to toggle source
# File lib/csv_format_guesser.rb, line 18
# Collects the guessed format settings into a single options hash.
#
# @return [Hash] the values of @encoding, @col_sep and @quote_char stored
#   under the keys :encoding, :col_sep and :quote_char (in that order)
def csv_opts
  opts = {}
  opts[:encoding] = @encoding
  opts[:col_sep] = @col_sep
  opts[:quote_char] = @quote_char
  opts
end

Protected Instance Methods

find_header() click to toggle source
# File lib/csv_format_guesser.rb, line 60
# Finds the first preview line that contains at least one character
# matching POTENTIAL_COL_SEP_REGEX.
#
# Uses +find+ rather than +each+ so that nil is returned when no line
# matches; +each+ would hand back the whole (truthy) array of lines, which
# defeats a caller's "header missing" check.
#
# @return [String, nil] the matching line, or nil if none of the preview
#   lines contains a potential separator
def find_header
  preview_lines.find { |line| line.scan(POTENTIAL_COL_SEP_REGEX).any? }
end
guess_col_sep() click to toggle source

We assume that the separator is non-alphanumeric and occurs the same number of times in each of the top lines.

# File lib/csv_format_guesser.rb, line 48
# Guesses the column separator and stores it in @col_sep.
#
# The header line (as returned by find_header) is scanned with
# POTENTIAL_COL_SEP_REGEX and the most frequent match becomes the separator.
# The winning [char, count] pair is also kept in @most_appearing.
#
# This is best-effort: any StandardError raised along the way, including
# the two raised below, is swallowed and @col_sep falls back to ',' unless
# it already holds a value.
def guess_col_sep
  header = find_header
  raise "Could not find header_row from file: #{@path}" unless header

  counts = Hash.new(0)
  header.scan(POTENTIAL_COL_SEP_REGEX).each { |char| counts[char] += 1 }

  # Order candidates by descending count and keep the top pair.
  ranked = counts.to_a.sort { |left, right| right[1] <=> left[1] }
  @most_appearing = ranked.first
  @col_sep = @most_appearing.first if @most_appearing
  raise "Could not guess column_separator from file: #{@path}" unless @col_sep
rescue StandardError
  @col_sep ||= ','
end
guess_encoding() click to toggle source
# File lib/csv_format_guesser.rb, line 28
# Guesses the file encoding and stores it in @encoding.
#
# The first PREVIEW_BYTES bytes of the file are handed to CharDet.detect;
# its 'encoding' entry is used when present, DEFAULT_ENCODING otherwise.
# The choice is then exercised with try_encoding_with_fallback!.
#
# Error handling is best-effort:
# * Encoding::UndefinedConversionError switches a guess of 'ISO-8859-7'
#   to 'ISO-8859-1'; any other guess is left untouched.
# * Any other StandardError leaves @encoding as is, or sets it to
#   DEFAULT_ENCODING when nothing has been assigned yet.
def guess_encoding
  sample = File.read(@path, PREVIEW_BYTES)
  detection = CharDet.detect(sample)
  @encoding = detection['encoding'] if detection
  @encoding ||= DEFAULT_ENCODING
  try_encoding_with_fallback!
rescue Encoding::UndefinedConversionError
  @encoding = 'ISO-8859-1' if @encoding == 'ISO-8859-7'
rescue StandardError
  @encoding ||= DEFAULT_ENCODING
end
guess_quote_char() click to toggle source
# File lib/csv_format_guesser.rb, line 67
# Guesses the quote character and stores it in @quote_char.
#
# Lines are streamed through readlines (no line limit) and each one is
# passed to search_quote_char. The method returns as soon as a line yields
# a quote character; if no line does, @quote_char is set to
# DEFAULT_QUOTE_CHAR.
def guess_quote_char
  readlines do |line|
    candidate = search_quote_char(line)
    @quote_char = candidate
    return if candidate
  end
  @quote_char = DEFAULT_QUOTE_CHAR
end
preview_lines() click to toggle source
# File lib/csv_format_guesser.rb, line 91
# Returns the first PREVIEW_LINES lines of the file as read by readlines.
# The result is memoized in @preview_lines, so the file is read only once
# for this purpose.
#
# @return [Array<String>] the cached preview lines
def preview_lines
  return @preview_lines if @preview_lines

  @preview_lines = readlines(PREVIEW_LINES)
end
readlines(max = nil) { |line| ... } click to toggle source
# File lib/csv_format_guesser.rb, line 95
# Reads lines from the file at @path, transcoding from @encoding to UTF-8.
#
# With a block, each line is yielded and nothing is collected; without a
# block, the lines are gathered into the returned array.
#
# @param max [Integer, nil] maximum number of lines to process; nil means
#   no limit
# @yield [line] each line, including its line terminator
# @return [Array<String>] the collected lines (empty when a block is given)
def readlines(max = nil, &block)
  collected = []
  File.open(@path, "r:#{@encoding}:utf-8") do |file|
    file.each_line.with_index(1) do |line, number|
      # The limit is checked after the line has been read.
      break if max && number > max

      if block
        yield line
      else
        collected << line
      end
    end
  end
  collected
end
search_quote_char(line) click to toggle source
# File lib/csv_format_guesser.rb, line 75
# Looks for a quote character from COMMON_QUOTE_CHARS that encloses fields
# in +line+.
#
# A candidate qualifies when it appears directly after a column separator
# ("opening") exactly as often as it appears directly before one
# ("closing"), and at least once. To make the first and last field look
# like inner fields, the line is wrapped in separators before counting.
#
# Every candidate found in the line is appended to @used_quote_chars,
# whether or not it qualifies.
#
# @param line [String] one line of the file, with or without its line
#   terminator
# @return [String, nil] the first qualifying quote character, or nil
def search_quote_char(line)
  @used_quote_chars ||= []
  # Strip the line terminator: otherwise a quote closing the last field is
  # followed by "\n" instead of the appended separator, the closing count
  # comes up one short, and fully quoted rows are never recognised.
  content = line.chomp
  enclosed = @col_sep + content + @col_sep

  COMMON_QUOTE_CHARS.each do |char|
    next unless content.include?(char)

    @used_quote_chars << char
    # A quote char should sit next to a field separator.
    openings = enclosed.scan(@col_sep + char).length
    closings = enclosed.scan(char + @col_sep).length
    return char if openings > 0 && openings == closings
  end
  nil
end
try_encoding_with_fallback!() click to toggle source
# File lib/csv_format_guesser.rb, line 39
# Reads the whole file while transcoding from @encoding to UTF-8, so that
# an unsuitable encoding guess surfaces as an exception.
#
# Opening the file with only an external encoding merely tags the bytes
# and never converts them, so no conversion error can be raised. An
# internal encoding of UTF-8 is requested here to force real transcoding.
#
# @return [String] the file content
# @raise [Encoding::UndefinedConversionError] when the file contains bytes
#   that have no mapping from @encoding to UTF-8
# @raise [Encoding::InvalidByteSequenceError] when the file contains byte
#   sequences that are invalid in @encoding
def try_encoding_with_fallback!
  File.open(@path, "r", encoding: "#{@encoding}:utf-8") do |f|
    f.read
  end
end