class CsvUtils
check/use class or module ???
note: for now CsvUtils
is a class!!! NOT a module - change - why? why not?
Constants
- MAJOR
- MINOR
- PATCH
- VERSION
Public Class Methods
csv_row( *values, sep: ',' )
click to toggle source
(simple) helper for “csv-encoding” values / row
todo: check for newline in value too? why? why not?
# File lib/csvutils/utils.rb, line 18 def self.csv_row( *values, sep: ',' ) values.map do |value| if value && (value.index( sep ) || value.index('"')) ## double quotes and enclose in double qoutes value = %Q{"#{value.gsub('"', '""')}"} else value end end end
cut( path, *columns, output: path, sep: ',' )
click to toggle source
# File lib/csvutils/cut.rb, line 8 def self.cut( path, *columns, output: path, sep: ',' ) inpath = path outpath = output # note: output defaults to inpath (overwrites datafile in-place!!!) puts "cvscut in: >#{inpath}< out: >#{outpath}<" ## ["Date", "HomeTeam", "AwayTeam", "FTHG", "FTAG", "HTHG", "HTAG"] puts "columns:" pp columns csv_options = { sep: sep } recs = CsvHash.read( inpath, csv_options ) ## for convenience - make sure parent folders/directories exist FileUtils.mkdir_p( File.dirname( outpath )) unless Dir.exists?( File.dirname( outpath )) ## note: ## todo/fix: add two trailing spaces for pretty printing - why? why not? File.open( outpath, 'w:utf-8' ) do |out| out << csv_row( *columns, sep: sep ).join( sep ) ## for row add headers/columns out << "\n" recs.each do |rec| values = columns.map { |col| rec[col] } ## find data for column out << csv_row( *values, sep: sep ).join( sep ) out << "\n" end end puts 'Done.' end
head( path, sep: ',', n: 4 )
click to toggle source
test or dry run to check if rows can get read/scanned
# File lib/csvutils/head.rb, line 7 def self.head( path, sep: ',', n: 4 ) i = 0 csv_options = { sep: sep } CsvHash.foreach( path, csv_options ) do |rec| i += 1 pp rec break if i >= n end puts " #{i} records" end
header( path, sep: ',', debug: false )
click to toggle source
# File lib/csvutils/header.rb, line 6 def self.header( path, sep: ',', debug: false ) ## use header or headers - or use both (with alias)? row = CsvReader.header( path, sep: sep ) pp row if debug ## e.g.: # "Country,League,Season,Date,Time,Home,Away,HG,AG,Res,PH,PD,PA,MaxH,MaxD,MaxA,AvgH,AvgD,AvgA\n" row end
pp_header( headers )
click to toggle source
# File lib/csvutils/utils.rb, line 6 def self.pp_header( headers ) ## check: rename to print_headers or prettyprint_header - why? why not? puts "#{headers.size} columns:" headers.each_with_index do |header,i| puts " #{i+1}: #{header}" end end
root()
click to toggle source
# File lib/csvutils/version.rb, line 20 def self.root File.expand_path( File.dirname(File.dirname(File.dirname(__FILE__))) ) end
split( path, *columns, sep: ',' ) { |column_values, chunk_with_headers| ... }
click to toggle source
# File lib/csvutils/split.rb, line 8 def self.split( path, *columns, sep: ',', &blk ) puts "cvssplit in: >#{path}<" ## ["Date", "HomeTeam", "AwayTeam", "FTHG", "FTAG", "HTHG", "HTAG"] puts "columns:" pp columns ## note: do NOT use headers ## for easy sorting use "plain" array of array for records csv_options = { sep: sep } data = CsvReader.read( path, csv_options ) ## todo/check: (auto-) strip (remove all leading and trailing spaces) ## from all values - why? why not? ## check if CSV.parse has an option for it? headers = data.shift ## remove top array item (that is, row with headers) header_mapping = {} headers.each_with_index { | header,i | header_mapping[header]=i } pp header_mapping ## map columns to array indices e.g. ['Season', 'Div'] => [1,2] column_indices = columns.map { |col| header_mapping[col] } pp column_indices ################################################### ## note: sort data by columns (before split) data = data.sort do |row1,row2| res = 0 column_indices.each do |col| res = row1[col] <=> row2[col] if res == 0 end res end chunk = [] data.each_with_index do |row,i| chunk << row next_row = data[i+1] changed = false if next_row.nil? ## end-of-file changed = true else column_indices.each do |col| if row[col] != next_row[col] changed = true break ## out of each column_indices loop end end end if changed puts "save new chunk:" column_values = column_indices.map {|col| row[col] } pp column_values # note: add header(s) row upfront (as first row) to chunk (with unshift) chunk_with_headers = chunk.unshift( headers ) if blk yield( column_values, chunk_with_headers ) else ## auto-save (write-to-file) by default - why? why not? split_write( path, column_values, chunk_with_headers, sep: sep ) end chunk = [] ## reset chunk for next batch of records end end puts 'Done.' end
split_write( inpath, values, chunk, sep: )
click to toggle source
# File lib/csvutils/split.rb, line 87 def self.split_write( inpath, values, chunk, sep: ) basename = File.basename( inpath, '.*' ) dirname = File.dirname( inpath ) ## check/change invalid filename chars ## e.g. change 1990/91 to 1990-91 extraname = values.map {|value| value.tr('/','-')}.join('~') outpath = "#{dirname}/#{basename}_#{extraname}.csv" puts "saving >#{basename}_#{extraname}.csv<..." File.open( outpath, 'w:utf-8' ) do |out| chunk.each do |row| out << csv_row( *row, sep: sep ).join( sep ) out << "\n" end end end
stat( path, *columns, sep: ',', debug: false )
click to toggle source
# File lib/csvutils/stat.rb, line 6 def self.stat( path, *columns, sep: ',', debug: false ) csv_options = { sep: sep } values = {} nulls = {} # check 1) nulls/nils (e.g. empty strings ""), # 2) not/appliation or available n/a NA or NaN or ... # 3) missing - e.g. ? i=0 CsvHash.foreach( path, csv_options ) do |rec| i += 1 pp rec if i == 1 && debug print '.' if i % 100 == 0 ## collect unique values for passed in columns columns.each do |col| value = rec[col] ## note: value might be nil!!!!! values[col] ||= Hash.new(0) values[col][ value ? value : '<nil>' ] +=1 end ## alway track nulls - why? why not rec.each do |col,value| ## if value.nil? ## todo/check - nil value possible (not always empty string - why? why not?) ## puts "[debug] nil value in row:" ## puts "#{col} = #{value.inspect} : #{value.class.name}" ## end if value.nil? nulls[col] ||= Hash.new(0) nulls[col]['nil'] +=1 elsif value.empty? nulls[col] ||= Hash.new(0) nulls[col]['empty'] +=1 elsif ['na', 'n/a', '-'].include?( value.downcase ) nulls[col] ||= Hash.new(0) nulls[col]['na'] +=1 elsif value == '?' ## check for (?) e.g. value.include?( '(?)') - why? why not? nulls[col] ||= Hash.new(0) nulls[col]['?'] +=1 else # do nothing; "regular" value end end end puts " #{i} rows" puts puts " nils/nulls :: empty strings :: na / n/a / undefined :: missing (?):" puts " #{nulls.inspect}" puts ## dump headers first (first row with names of columns) headers = header( path, sep: sep, debug: debug ) pp_header( headers ) ## pretty print header columns puts if values.any? ## pretty print (pp) / dump unique values for passed in columns values.each do |col,h| puts " column >#{col}< #{h.size} unique values:" ## sort by name/value for now (not frequency) - change - why? why not? sorted_values = h.to_a.sort {|l,r| l[0] <=> r[0] } sorted_values.each do |rec| puts " #{rec[1]} x #{rec[0]}" end end end end
test( path, sep: ',' )
click to toggle source
test or dry run to check if rows can get read/scanned
# File lib/csvutils/test.rb, line 7 def self.test( path, sep: ',' ) i = 0 csv_options = { sep: sep } CsvHash.foreach( path, csv_options ) do |rec| i += 1 print '.' if i % 100 == 0 end puts " #{i} rows" end
version()
click to toggle source
# File lib/csvutils/version.rb, line 12 def self.version VERSION end