class CsvFormatGuesser
Constants
- COMMON_QUOTE_CHARS
- DEFAULT_ENCODING
- DEFAULT_QUOTE_CHAR
- POTENTIAL_COL_SEP_REGEX
- PREVIEW_BYTES
- PREVIEW_LINES
- VERSION
Attributes
col_sep[R]
encoding[R]
quote_char[R]
Public Class Methods
new(path)
click to toggle source
# File lib/csv_format_guesser.rb, line 11
# Stores the path of the CSV file and immediately runs the three guessing
# steps, so the detected format is available as soon as the object exists.
#
# path - location of the CSV file to inspect.
def initialize(path)
  @path = path
  # The steps run in this fixed order: encoding, then column separator,
  # then quote character.
  guess_encoding
  guess_col_sep
  guess_quote_char
end
Public Instance Methods
csv_opts()
click to toggle source
# File lib/csv_format_guesser.rb, line 18
# Bundles the guessed values into a Hash with the keys :encoding, :col_sep
# and :quote_char (in that order).
def csv_opts
  opts = {}
  opts[:encoding] = @encoding
  opts[:col_sep] = @col_sep
  opts[:quote_char] = @quote_char
  opts
end
Protected Instance Methods
find_header()
click to toggle source
# File lib/csv_format_guesser.rb, line 60
# Returns the first preview line that contains at least one character
# matching POTENTIAL_COL_SEP_REGEX, or nil when no preview line does.
#
# Enumerable#find is used instead of `each` with an early return: `each`
# hands back the whole collection when nothing matches, which is truthy and
# would make a "no header found" check by the caller impossible.
def find_header
  preview_lines.find { |line| line =~ POTENTIAL_COL_SEP_REGEX }
end
guess_col_sep()
click to toggle source
We assume that the separator is non-alphanumeric and occurs the same number of times in each of the top lines.
# File lib/csv_format_guesser.rb, line 48
# Guesses the column separator and stores it in @col_sep.
#
# The header line (as returned by find_header) is scanned with
# POTENTIAL_COL_SEP_REGEX and the candidate that occurs most often wins.
# When two candidates occur equally often, the one that appears first in the
# header is chosen, so the result is deterministic. The winning
# [char, count] pair is also kept in @most_appearing.
#
# This is a best-effort guess: if no header or no candidate is found, or if
# anything goes wrong while reading the file, @col_sep falls back to ','
# (unless it was already set).
#
# Returns the value of @col_sep.
def guess_col_sep
  header = find_header
  # Only a String line can be scanned; anything else means no usable header.
  if header.respond_to?(:scan)
    char_stats = Hash.new(0)
    header.scan(POTENTIAL_COL_SEP_REGEX).each { |char| char_stats[char] += 1 }
    # Highest count first; the position of first appearance breaks ties.
    ranked = char_stats.each_with_index.sort_by { |(_char, count), position| [-count, position] }
    @most_appearing = ranked.first && ranked.first.first
    @col_sep = @most_appearing.first if @most_appearing
  end
  @col_sep ||= ','
rescue StandardError
  @col_sep ||= ','
end
guess_encoding()
click to toggle source
# File lib/csv_format_guesser.rb, line 28
# Guesses the file encoding and stores it in @encoding.
#
# The first PREVIEW_BYTES bytes of the file are handed to CharDet.detect and
# the 'encoding' entry of its result is used; when that yields nothing,
# DEFAULT_ENCODING is used instead. The file is then read once with the
# chosen encoding via try_encoding_with_fallback!.
#
# On Encoding::UndefinedConversionError a guess of 'ISO-8859-7' is replaced
# by 'ISO-8859-1'. Any other StandardError leaves an already chosen encoding
# in place and otherwise falls back to DEFAULT_ENCODING.
def guess_encoding
  sample = File.read(@path, PREVIEW_BYTES)
  detection = CharDet.detect(sample)
  @encoding = detection['encoding'] if detection
  @encoding ||= DEFAULT_ENCODING
  try_encoding_with_fallback!
rescue Encoding::UndefinedConversionError
  @encoding = 'ISO-8859-1' if @encoding == 'ISO-8859-7'
rescue StandardError
  @encoding ||= DEFAULT_ENCODING
end
guess_quote_char()
click to toggle source
# File lib/csv_format_guesser.rb, line 67
# Guesses the quote character and stores it in @quote_char.
#
# Lines are read one at a time (with no line limit) and passed to
# search_quote_char; the first line that yields a quote character ends the
# search. If no line does, DEFAULT_QUOTE_CHAR is used.
def guess_quote_char
  readlines do |line|
    candidate = search_quote_char(line)
    @quote_char = candidate
    # Leaves the whole method as soon as a quote character is found.
    return if candidate
  end
  @quote_char = DEFAULT_QUOTE_CHAR
end
preview_lines()
click to toggle source
# File lib/csv_format_guesser.rb, line 91
# Returns the first PREVIEW_LINES lines of the file. The result is cached in
# @preview_lines, so the file is only read on the first call.
def preview_lines
  return @preview_lines if @preview_lines

  @preview_lines = readlines(PREVIEW_LINES)
end
readlines(max = nil) { |line| ... }
click to toggle source
# File lib/csv_format_guesser.rb, line 95
# Reads the file at @path line by line, opening it with @encoding as the
# external encoding and utf-8 as the internal one.
#
# max   - optional maximum number of lines to process (nil means no limit).
# block - when given, each line is yielded to it instead of being collected.
#
# Returns an Array of the lines read when no block is given, and an empty
# Array when a block is given.
def readlines(max = nil, &block)
  collected = []
  File.open(@path, "r:#{@encoding}:utf-8") do |io|
    # Line numbers start at 1 so they can be compared directly with max.
    io.each_line.with_index(1) do |line, number|
      break if max && number > max

      if block
        yield line
      else
        collected << line
      end
    end
  end
  collected
end
search_quote_char(line)
click to toggle source
# File lib/csv_format_guesser.rb, line 75
# Looks for a quote character in a single line.
#
# line - one line of the file; a trailing line terminator is ignored.
#
# Each entry of COMMON_QUOTE_CHARS that occurs in the line is recorded in
# @used_quote_chars. A candidate is accepted when it appears directly after a
# column separator at least once and exactly as often directly before one,
# i.e. when it plausibly encloses whole fields.
#
# Returns the accepted quote character, or nil when none qualifies.
def search_quote_char(line)
  @used_quote_chars ||= []
  # Strip the line terminator first: otherwise the closing quote of the last
  # field is followed by "\n" rather than the separator, and a fully quoted
  # line never balances.
  enclosed = @col_sep + line.chomp + @col_sep
  COMMON_QUOTE_CHARS.each do |char|
    next unless line.include?(char)

    @used_quote_chars << char
    # A real quote character should sit next to the field separator.
    openings = enclosed.scan(@col_sep + char).length
    closings = enclosed.scan(char + @col_sep).length
    return char if openings > 0 && openings == closings
  end
  nil
end
try_encoding_with_fallback!()
click to toggle source
# File lib/csv_format_guesser.rb, line 39
# Reads the whole file at @path using @encoding and returns its contents.
# Any error raised while opening or reading propagates to the caller.
#
# NOTE(review): this only reads the file; whether a plain read can surface
# an unsuitable encoding depends on the runtime's encoding settings - confirm.
def try_encoding_with_fallback!
  File.open(@path, "r", encoding: @encoding, &:read)
end