class Quandl::Babelfish::DateMaid
responsible for number formatting
Public Class Methods
analyze_date_format(example)
click to toggle source
# File lib/quandl/babelfish/date_maid.rb, line 46 def analyze_date_format(example) return nil if example.nil? # Regular formats and Custom formats (where Date.parse and Date.strptime # fear to tread) if re = example.match(/^(\d{1,2})\D(\d{1,2})\D\d{4}/) # eg "07/03/2012" if re[1].to_i > 12 return '%d-%m-%Y', nil else return '%m-%d-%Y', nil end end if re = example.match(/^(\d{1,2})\D(\d{1,2})\D\d{2}/) # eg "07/03/12" if re[1].to_i > 12 return '%d-%m-%y', nil else return '%m-%d-%y', nil end end # order these guys from most specific to most general return "%Y", "annual" if example =~ /^\d{4}[\s]?-[\s]?\d{4}$/ return '%Y%m%d', 'daily' if example =~ /^\d{8}$/ && example[4..5].to_i < 13 && example[6..7].to_i < 32 # precisely 8 digits - yyyymmdd return 'epoch', 'daily' if example =~ /^\d{7}.*$/ # 7 or more digits - epoch return '%Y', 'annual' if example =~ /^\d{4}$/ # 4 digits return '%Y', 'annual' if example =~ /^\d{4}\.0$/ # 4 digits with a dot 0 for excel return ':year_quarter', 'quarterly' if example =~ /^\d{4}[Qq]\d$/ # 4 digits, Q, digit (here because the next pattern would override it) return '%YM%m', 'monthly' if example =~ /^\d{4}M\d{1,2}$/ # 2007M08 return '%GW%V', 'weekly' if example =~ /^\d{4}W\d{1,2}$/ # 2012W01 return '%Y-%m', 'monthly' if example =~ /^\d{4}\D\d{1,2}$/ # 4 digits, separator, 1-2 digits return '%m-%Y', 'monthly' if example =~ /^\d{1,2}\D\d{4}$/ # 1-2 digits, separator, 4 digits return '%Y%m', 'monthly' if example =~ /^\d{6}$/ # 6 digits return '%Y-%b', 'monthly' if example =~ /^\d{4}\D\w{3}$/ # 4 digits, separator, 3 letters return '%b-%Y', 'monthly' if example =~ /^\w{3}\D\d{4}$/ # 3 letters, separator, 4 digits return '%b-%y', 'monthly' if example =~ /^\w{3}\D\d{2}$/ # 3 letters, separator, 2 digits return '%Y%b', 'monthly' if example =~ /^\d{4}\w{3}$/ # 4 digits, 3 letters return '%b%Y', 'monthly' if example =~ /^\w{3}\d{4}$/ # 3 letters, 4 digits return '%Y-%b-%d', 'daily' if example =~ /^\d{4}\D\w{3}\D\d{1,2}$/ # 4 digits, separator, 3 letters, separator, 1-2 digits return '%Y-%m-%d', 'daily' if example =~ /^\d{4}\D\d{1,2}\D\d{1,2}$/ # 4 digits, separator, 1-2 digits, separator, 1-2 digits return '%d-%b-%Y', 'daily' if example =~ /^\d{1,2}\D\w{3}\D\d{4}$/ # 1-2 digits, separator, 3 letters, separator, 4 digits return '%Y%b%d', 'daily' if example =~ /^\d{4}\w{3}\d{1,2}$/ # 4 digits, 3 letters, 1-2 digits return '%d%b%Y', 'daily' if example =~ /^\d{1,2}\w{3}\d{4}$/ # 1-2 digits, 3 letters, 4 digits return '%d-%b-%y', 'daily' if example =~ /^\d{1,2}\D\w{3}\D\d{2}$/ # 1-2 digits, 3 letters, 2 digits return '%b-%d-%Y', 'daily' if example =~ /^\w{3}\D\d{1,2}\D{1,2}\d{4}$/ # 3 letters, separator, 1-2 digits, separator(s), 4 digits #our custom formats return ':year_quarter', 'quarterly' if example =~ /^\d{4}\D[Qq]\d$/ # 4 digits, separator, Q, digit return ':excel-1900', 'daily' if example =~ /^\d{5}$/ # 5 digits return ':excel-1900', 'daily' if example =~ /^\d{5}\.0$/ # 5 digits dot zero excel # No, try default date parse # raise PostProcessorException, "Unable to guess date format for #{example}" [nil, nil] end
disinfect(dates)
click to toggle source
# File lib/quandl/babelfish/date_maid.rb, line 100 def disinfect(dates) [*dates].collect do |date| date.to_s.encode!('UTF-8', 'UTF-8', :invalid => :replace) date.to_s.gsub!(/[^\x01-\x7f]/,'') date.to_s.strip.gsub(/\s\s+/, ' ') end end
init(user_settings)
click to toggle source
# File lib/quandl/babelfish/date_maid.rb, line 14 def init(user_settings) @settings=@defaults.merge(user_settings) end
sweep(all_dates)
click to toggle source
looks at all the dates and formats them to unambiguous ISO 8601 format (yyyy-mm-dd)
# File lib/quandl/babelfish/date_maid.rb, line 19 def sweep(all_dates) return nil if all_dates.nil? all_dates = disinfect all_dates if @settings[:format].nil? #find good example and extract all info from it and apply it to each of the dates in the set good_sample = find_good_date(all_dates) raise( Error::GuessDateFormat.new, "Unable to find date format for provided dates" ) if good_sample.nil? date_format, frequency = analyze_date_format(good_sample) else date_format = @settings[:format] end iso_dates=[] all_dates.each_with_index do |fuzzy_date, i| temp_date = convert(fuzzy_date, date_format) rescue raise( Error::InvalidDate.new( line: i+1, row: fuzzy_date, context: 'convert' ), "Invalid date '#{fuzzy_date}'" ) iso_dates << frequency_transform(temp_date, frequency) end iso_dates end
Private Class Methods
convert(fuzzy_date, date_format)
click to toggle source
converts date to specified format
# File lib/quandl/babelfish/date_maid.rb, line 111 def convert(fuzzy_date, date_format) if date_format.nil? # Assuming a US date format with 3 parameters (i.e. MM?DD?YYYY) tokens = fuzzy_date.split(/\D/) if tokens[0].length > 2 || fuzzy_date =~ /\w{2}/ # Its ISO return DateTime.parse(fuzzy_date.to_s).to_date else # Guessing US return Date.new(tokens[2].to_i, tokens[0].to_i, tokens[1].to_i) end else case date_format when ':year_quarter' return year_quarter_formatter(fuzzy_date) when ':excel-1900' return excel_1900_formatter(fuzzy_date) else #regular ruby formatter return regular_formatter(fuzzy_date, date_format) end end end
excel_1900_formatter(fuzzy_date)
click to toggle source
# File lib/quandl/babelfish/date_maid.rb, line 143 def excel_1900_formatter(fuzzy_date) # handle Lotus 123 bug has 1900 as a leap year Date.civil(1899, 12, 31) + fuzzy_date.to_i - 1 if fuzzy_date.to_i > 0 end
find_good_date(all_dates, strict=true)
click to toggle source
find good example of date to use as template for format if strict == true, no ambiguity is tolerated. If strict= false, we will accept abbiguity. (02/05/2009)
# File lib/quandl/babelfish/date_maid.rb, line 177 def find_good_date(all_dates, strict=true) good_sample=nil all_dates.each do |fuzzy_date| if usable_cell(fuzzy_date,strict) good_sample = fuzzy_date break end end if good_sample == nil and strict==true # We could not find a single unambiguous cell. Let's now be less strict and see if we can find something find_good_date(all_dates,false) else good_sample end end
frequency_transform(date, frequency)
click to toggle source
Bump date to the end of the respective periods
# File lib/quandl/babelfish/date_maid.rb, line 227 def frequency_transform(date, frequency) case frequency when 'annual' date = Date.new(date.year,12,31) when 'quarterly' month = 3*((date.month-1)/3 + 1) # equals 3,6,9 or 12 date = Date.new(date.year, month, 1).next_month-1 when 'monthly' date = Date.new(date.year, date.month,1).next_month-1 else # Do nothing for daily or weekly end date end
regular_formatter(fuzzy_date, date_format)
click to toggle source
# File lib/quandl/babelfish/date_maid.rb, line 148 def regular_formatter(fuzzy_date, date_format) # We have a date format - oh so pretty, but... date_string = fuzzy_date # normalize delimiters to hyphens so we do not have to make a format for each one. # delimiters can be letters when its all numbers and delimiters only when there are letters. Sigh. # only if no format where provided date_string = date_string.gsub(/[^\d\w]+/, '-') if @settings[:format].nil? #epoch date string if date_format == 'epoch' news = Time.at(date_string.to_i).utc.to_s.match(/\d\d\d\d-\d\d-\d\d/) formatted_date = DateTime.strptime(news.to_s, '%Y-%m-%d').to_date else if date_string.to_s =~ /^(\w{3})\D(\d{2})$/ century = $2.to_i < 25 ? '20' : '19' date_string = "#{$1} #{century}#{$2}" formatted_date = DateTime.strptime(date_string.to_s, '%b %Y').to_date else formatted_date = DateTime.strptime(date_string.to_s, date_format).to_date end end formatted_date+=4 if date_format == '%GW%V' #strptime makes dates on Mondays. We want Fridays. formatted_date end
usable_cell(cell,strict)
click to toggle source
if strict == true then we refuse to accept any ambiguity if strict == false, we’ll settle for a bit of ambiguity
# File lib/quandl/babelfish/date_maid.rb, line 195 def usable_cell(cell,strict) return false if cell.nil? || cell.to_s.empty? return false if cell.to_s.size > 20 # even annotated date can't be bigger than 20 return true if cell.to_s =~ /^\w{3}\D[456789]\d$/ # date is not usable as an example if it is ambiguous as to day and month # 03/04/2012, for example, is ambiguous. 03/17/2012 is NOT ambiguous if re = cell.to_s.match(/^(\d{1,2})\D(\d{1,2})\D\d{2,4}/) # e.g. 03/04/2012 if re[1].to_i <= 12 and re[2].to_i <= 12 return strict==true ? false : true else return true end end if re = cell.to_s.match(/^(\d{1,2})\D\w{3}\D(\d{2})/) # 07-jun-07 if re[1].to_i <= 12 and re[2].to_i <= 12 return false else return true end end return true if cell.to_s =~ /\d{4}/ # It has a 4 digit year somewhere return true if cell.to_s =~ /^\w{3}\D\d{2}/ # %b-%y(d)..also not ambiguous false # Thank you, come again end
year_quarter_formatter(fuzzy_date)
click to toggle source
# File lib/quandl/babelfish/date_maid.rb, line 136 def year_quarter_formatter(fuzzy_date) raw_date = fuzzy_date tokens = raw_date.gsub(/[qQ]/, '-').gsub(/[a-zA-Z]/, '').split(/[^0-9]/) tokens.delete_if {|x| x.nil? || x.empty?} # In case there are more than one delimiter because we replaced the Q Date.new(tokens[0].to_i, tokens[1].to_i * 3, 1) end