module Worldfootball
todo/check: move MODS
and SCORE_ERRORS
out-of-lib
and into config or such - why? why not?
Constants
- MAX_HEADERS
- MIN_HEADERS
- MODS
- OFFSETS
Brasilia - Distrito Federal, Brasil (GMT-3) -- summer time? Ciudad de México, CDMX, México (GMT-5) -- summer time? Londres, Reino Unido (GMT+1) Madrid -- ? Lisboa -- ? Moscow -- ?
todo/check - quick fix timezone offsets for leagues for now
- find something better - why? why not?
note: assume time is in GMT+1
- ROUND_TO_EN
- SCORE_ERRORS
fix/patch known score format errors in at/de cups
new convention for a fix require league, date, and team1 & team2 for now!!!! - do NOT use some "generic" fix / patch!!!!
old de/at patches/fixes:
'0-1 (0-0, 0-0, 0-0) n.V.' => '0-1 (0-0, 0-0) n.V.', # too long '2-1 (1-1, 1-1, 1-0) n.V.' => '2-1 (1-1, 1-1) n.V.', '4-2 (0-0, 0-0) i.E.' => '4-2 (0-0, 0-0, 0-0) i.E.', # too short
Public Class Methods
build( rows, season:, league:, stage: '' )
click to toggle source
build “standard” match records from “raw” table rows
# File lib/football-sources/worldfootball/build.rb, line 20
##
# Build "standard" match records from "raw" table rows.
#
# rows   - enumerable of rows; the keys read here are
#          :round, :date, :time, :team1, :team2 and :score
# season - passed through Season() on entry (only used for the log line here)
# league - league key as a String (e.g. 'at.1');
#          raises ArgumentError for anything else
# stage  - stage name; printed when non-empty and stored as the first column
#
# Returns an array of records in the layout
#   [stage, round, date, time, team1, ft, ht, team2, et, pen, comments]
# where date is formatted as YYYY-MM-DD.
#
# Note: prints progress to stdout and calls exit 1 if a round cannot be
#       recognized / mapped.
def self.build( rows, season:, league:, stage: '' )   ## rename to fixup or such - why? why not?
  season = Season( season )   ## cast (ensure) season class (NOT string, integer, etc.)

  ## note: do NOT pass in league struct! pass in key (string)
  raise ArgumentError, "league key as string expected"  unless league.is_a?(String)

  print " #{rows.size} rows - build #{league} #{season}"
  print " - #{stage}"   unless stage.empty?
  print "\n"

  ## note: use only first part from key for lookup
  ##   e.g. at.1  => at
  ##        eng.1 => eng
  ##   and so on
  mods         = MODS[ league.split('.')[0] ] || {}
  score_errors = SCORE_ERRORS[ league ] || {}

  recs = []
  ## note: num is the 1-based row number used for the [nnn] log prefix
  ##   (the counter used to be incremented first and then printed as i+1,
  ##    that is, the first row was labelled [002])
  rows.each.with_index( 1 ) do |row, num|
    if row[:round] =~ /Spieltag/
      puts
      print '[%03d] ' % num
      print row[:round]

      if (m = row[:round].match( /([0-9]+)\. Spieltag/ ))
        ## todo/check: always use a string even if number (as a string eg. '1' etc.)
        round = m[1]   ## note: keep as string (NOT number)
        print " => #{round}"
      else
        puts "!! ERROR: cannot find matchday number"
        exit 1
      end
      print "\n"
    elsif row[:round] =~ /[1-9]\.[ ]Runde| Achtelfinale| Viertelfinale| Halbfinale| Finale /x
      puts
      print '[%03d] ' % num
      print row[:round]

      ## do NOT translate rounds (to english) - keep in german / deutsch (de)
      if ['at.cup',
          'at.1',      ## at.1 - incl. europa league playoff
          'de.cup'].include?( league )
        round = row[:round]
      else
        round = ROUND_TO_EN[ row[:round] ]
        if round.nil?
          puts "!! ERROR: no mapping for round to english (en) found >#{row[:round]}<:"
          pp row
          exit 1
        end
        print " => #{round}"
      end
      print "\n"
    else
      puts "!! ERROR: unknown round >#{row[:round]}< for league >#{league}<:"
      pp row
      exit 1
    end

    date_str  = row[:date]
    time_str  = row[:time]
    team1_str = row[:team1]
    team2_str = row[:team2]
    score_str = row[:score]

    ## convert date from string e.g. 2019-10-25
    date = Date.strptime( date_str, '%Y-%m-%d' )

    ### check for score_error; first (step 1) lookup by date
    ##   an entry is read as [team1, team2, [expected_score, fixed_score]]
    score_error = score_errors[ date.strftime('%Y-%m-%d') ]
    if score_error
      if team1_str == score_error[0] &&
         team2_str == score_error[1]
        ## check if team names match too; if yes, apply fix/patch!!
        if score_str != score_error[2][0]
          puts "!! WARN - score fix changed? - expected #{score_error[2][0]}, got #{score_str} - fixing to #{score_error[2][1]}"
          pp row
        end
        puts "FIX - applying score error fix - from #{score_error[2][0]} to => #{score_error[2][1]}"
        score_str = score_error[2][1]
      end
    end

    print '[%03d] ' % num
    print "%-10s | " % date_str
    print "%-5s | " % time_str
    print "%-22s | " % team1_str
    print "%-22s | " % team2_str
    print score_str
    print "\n"

    ## check for 0:3 Wert. - change Wert. to awd. (awarded)
    score_str = score_str.sub( /Wert\./i, 'awd.' )

    ## clean team name (e.g. remove (old))
    ##   and asciify (e.g. ’ to ' )
    team1_str = norm_team( team1_str )
    team2_str = norm_team( team2_str )

    team1_str = mods[ team1_str ]   if mods[ team1_str ]
    team2_str = mods[ team2_str ]   if mods[ team2_str ]

    ht, ft, et, pen, comments = parse_score( score_str )

    recs << [stage,
             round,
             date.strftime( '%Y-%m-%d' ),
             time_str,
             team1_str,
             ft,
             ht,
             team2_str,
             et,     # extra: incl. extra time
             pen,    # extra: incl. penalties
             comments]
  end  # each row
  recs
end
convert( league:, season:, offset: nil )
click to toggle source
# File lib/football-sources/worldfootball/convert.rb, line 6
##
# Convert the cached schedule page(s) of a league season into a match .csv file.
#
# league - passed to find_league
# season - passed through Season() on entry
# offset - optional time offset in hours; when given every record
#          is run through fix_date
#
# Steps: read the page(s) from the cache, build the records (per stage if
# the league has more than one page), optionally shift date/time, sort by
# date, reformat the date (e.g. Sat Aug 7 1993), drop unused columns
# and write the result to
#   <config.convert.out_dir>/<season.path>/<league.key>.csv
def self.convert( league:, season:, offset: nil )   ## check: rename (optional) offset to time_offset or such?
  season = Season( season )   ## cast (ensure) season class (NOT string, integer, etc.)
  league = find_league( league )

  pages = league.pages( season: season )

  # note: assume stages if pages is an array (of hash table/records)
  #         (and NOT a single hash table/record)
  if pages.is_a?(Array)
    recs = []
    pages.each do |page_meta|
      slug       = page_meta[:slug]
      stage_name = page_meta[:stage]
      ## todo/fix: report error/check if stage.name is nil!!!

      print " parsing #{slug}..."

      # unless File.exist?( path )
      #   puts "!! WARN - missing stage >#{stage_name}< source - >#{path}<"
      #   next
      # end

      page = Page::Schedule.from_cache( slug )
      print " title=>#{page.title}<..."
      print "\n"

      rows = page.matches
      stage_recs = build( rows, season: season, league: league.key, stage: stage_name )

      pp stage_recs[0]   ## check first record
      recs += stage_recs
    end
  else
    page_meta = pages
    slug      = page_meta[:slug]

    print " parsing #{slug}..."

    page = Page::Schedule.from_cache( slug )
    print " title=>#{page.title}<..."
    print "\n"

    rows = page.matches
    recs = build( rows, season: season, league: league.key )

    pp recs[0]   ## check first record
  end

  recs = recs.map { |rec| fix_date( rec, offset ) }   if offset

  ## note: sort matches by date before saving/writing!!!!
  ## note: for now assume date in string in 1999-11-30 format (allows sort by "simple" a-z)
  ## note: assume date is third column!!! (stage/round/date/...)
  ## note: Array#sort is NOT guaranteed to be stable - add the original position
  ##         as a tie-breaker to keep same-date matches in source order
  recs = recs.sort_by.with_index { |rec, idx| [rec[2], idx] }

  ## reformat date / beautify e.g. Sat Aug 7 1993
  recs.each do |rec|
    rec[2] = Date.strptime( rec[2], '%Y-%m-%d' ).strftime( '%a %b %-d %Y' )
  end

  ## remove unused columns (e.g. stage, et, p, etc.)
  recs, headers = vacuum( recs )

  puts headers
  pp recs[0]   ## check first record

  out_path = "#{config.convert.out_dir}/#{season.path}/#{league.key}.csv"

  puts "write #{out_path}..."
  Cache::CsvMatchWriter.write( out_path, recs, headers: headers )
end
convert_reports( league:, season: )
click to toggle source
# File lib/football-sources/worldfootball/convert_reports.rb, line 4
##
# Collect the goals from all (cached) match reports of a league season
# and write them to
#   <config.convert.out_dir>/<season.path>/<league.key>~goals.csv
#
# league - passed to find_league
# season - passed through Season() on entry
#
# One record per goal: [match_id, score, minute, extra, player, notes]
# where match_id reads "<team1> - <team2> | <Mon D YYYY>".
# Matches without a report reference are skipped (with a warning).
def self.convert_reports( league:, season: )
  season = Season( season )   ## cast (ensure) season class (NOT string, integer, etc.)
  league = find_league( league )

  ## note: only the first part of the league key is used for the team name mods
  ##   e.g. at.1 => at, eng.1 => eng, and so on
  country_key = league.key.split('.')[0]
  mods        = MODS[ country_key ] || {}

  pages = league.pages( season: season )
  pages = [pages]   unless pages.is_a?(Array)   ## single (simple) page setup - wrap in array

  recs = []
  pages.each do |page_meta|    # note: use page_info for now (or page_rec or page_meta or such)
    page = Page::Schedule.from_cache( page_meta[:slug] )
    print " page title=>#{page.title}<..."
    print "\n"

    matches = page.matches

    puts "matches - #{matches.size} rows:"
    pp matches[0]
    puts "#{page.generated_in_days_ago} - #{page.generated}"

    matches.each_with_index do |match, i|
      report_ref = match[:report_ref]
      if report_ref.nil?
        puts "!! WARN: no match report ref found for match:"
        pp match
        next
      end

      puts "reading #{i+1}/#{matches.size} - #{report_ref}..."
      report = Page::Report.from_cache( report_ref )

      puts
      puts report.title
      puts report.generated

      goals = report.goals
      puts "goals - #{goals.size} records"
      ## pp goals

      next unless goals.size > 0   ## nothing to add

      ## add goals
      date = Date.strptime( match[:date], '%Y-%m-%d')

      ## clean team names (e.g. remove (old)) and asciify (e.g. ’ to ' );
      ##   then apply the (optional) name mods
      team1 = norm_team( match[:team1] )
      team2 = norm_team( match[:team2] )
      team1 = mods[ team1 ] || team1
      team2 = mods[ team2 ] || team2

      match_id = "#{team1} - #{team2} | #{date.strftime('%b %-d %Y')}"

      goal_recs = goals.map do |goal|
        extra = ''
        extra = '(pen)'  if goal[:penalty]   ## or use P or PEN - why? why not?
        extra = '(og)'   if goal[:owngoal]   ## or use OG or O.G.- why? why not? (note: wins over penalty)

        [match_id,
         goal[:score],
         "#{goal[:minute]}'",
         extra,
         goal[:player],
         goal[:notes]]
      end
      recs.concat( goal_recs )
    end  # each match
  end  # each page

  ## pp recs

  out_path = "#{config.convert.out_dir}/#{season.path}/#{league.key}~goals.csv"
  headers  = ['Match', 'Score', 'Minute', 'Extra', 'Player', 'Notes']

  puts "write #{out_path}..."
  Cache::CsvMatchWriter.write( out_path, recs, headers: headers )
end
fix_date( row, offset )
click to toggle source
helper to fix dates to use local timezone (and not utc/london time)
# File lib/football-sources/worldfootball/convert.rb, line 79
##
# Shift the date and time of a record by offset hours.
#
# row    - record with the date (YYYY-MM-DD) in column 2 and
#          the time (HH:MM) in column 3; changed in place
# offset - number of hours to add (negative to subtract)
#
# Returns the row itself (untouched if the time is nil or empty).
# Note: prints an error and calls exit 1 if the date is not in YYYY-MM-DD format.
def self.fix_date( row, offset )
  date_str = row[2]
  time_str = row[3]

  ## note: time (column) required for fix
  return row   if time_str.nil? || time_str.empty?

  unless date_str =~ /^\d{4}-\d{2}-\d{2}$/
    puts "!!! ERROR - wrong (unknown) date format >>#{date_str}<<; cannot continue; fix it; sorry"
    ## todo/fix: add to errors/warns list - why? why not?
    exit 1
  end

  date_fmt = '%Y-%m-%d'   # e.g. 2002-08-17

  ## NOTE - MUST be -7/24.0!!!! or such to work (fraction of a day)
  shifted = DateTime.strptime( "#{date_str} #{time_str}", "#{date_fmt} %H:%M" ) + (offset/24.0)

  ## overwrite "old" date & time
  row[2] = shifted.strftime( date_fmt )
  row[3] = shifted.strftime( '%H:%M' )
  row   ## return row for possible pipelining - why? why not?
end
norm_team( team )
click to toggle source
“global” helpers
# File lib/football-sources/worldfootball/mods.rb, line 10
##
# Clean a team name: drop the first "(old)" marker, strip leading and
# trailing whitespace and asciify the typographic apostrophe
# (e.g. Hawke’s Bay United FC => Hawke's Bay United FC).
#
# Returns a new string; the passed-in team is not changed.
def self.norm_team( team )
  team.sub( '(old)', '' )
      .strip
      .gsub( '’', "'" )
end
parse_score( score_str )
click to toggle source
# File lib/football-sources/worldfootball/build.rb, line 161
##
# Split a score string into its parts.
#
# Returns [ht, ft, et, pen, comments] - all strings; parts that are not
# available are empty.
#
# Supported formats:
#   ---                         no score yet
#   n.gesp. / ausg. / annull.   ft is (*), comments is cancelled
#   abgebr.                     ft is (*), comments is abandoned
#   verl.                       comments is postponed
#   5-4 (0-0, 1-1, 2-2) i.E.    pen (ht, ft, et)
#   2-1 (1-0, 1-1) n.V.         et (ht, ft)
#   2-1 (1-0)                   ft (ht)
#   0-3 awd.                    ft gets a (*) marker, the text goes to comments
#   3-0                         ft only
#
# Note: prints an error and calls exit 1 for any other format.
def self.parse_score( score_str )
  ht  = ''
  ft  = ''
  et  = ''
  pen = ''
  comments = String.new( '' )   ## check - rename to/use status or such - why? why not?

  case score_str
  when '---'   ## in the future (no score yet) - was -:-
    ## nothing to do - keep all parts empty
  when 'n.gesp.',   ## cancelled (british) / canceled (us)
       'ausg.',     ## todo/check: change to some other status ????
       'annull.'    ## todo/check: change to some other status (see ie 2012) ????
    ft       = '(*)'
    comments = 'cancelled'
  when 'abgebr.'   ## abandoned -- waiting for replay?
    ft       = '(*)'
    comments = 'abandoned'
  when 'verl.'   ## postponed
    comments = 'postponed'
  when /([0-9]+) [ ]*-[ ]* ([0-9]+) [ ]* \(([0-9]+) [ ]*-[ ]* ([0-9]+) [ ]*,[ ]* ([0-9]+) [ ]*-[ ]* ([0-9]+) [ ]*,[ ]* ([0-9]+) [ ]*-[ ]* ([0-9]+)\) [ ]* i\.E\. /x
    # 5-4 (0-0, 1-1, 2-2) i.E.
    pen, ht, ft, et = $~.captures.each_slice( 2 ).map { |left, right| "#{left}-#{right}" }
  when /([0-9]+) [ ]*-[ ]* ([0-9]+) [ ]* \(([0-9]+) [ ]*-[ ]* ([0-9]+) [ ]*,[ ]* ([0-9]+) [ ]*-[ ]* ([0-9]+) \) [ ]* n\.V\. /x
    # 2-1 (1-0, 1-1) n.V
    et, ht, ft = $~.captures.each_slice( 2 ).map { |left, right| "#{left}-#{right}" }
  when /([0-9]+) [ ]*-[ ]* ([0-9]+) [ ]* \(([0-9]+) [ ]*-[ ]* ([0-9]+) \) /x
    ft, ht = $~.captures.each_slice( 2 ).map { |left, right| "#{left}-#{right}" }
  when /([0-9]+) [ ]*-[ ]* ([0-9]+) [ ]* ([a-z.]+) /x
    ft       = "#{$1}-#{$2} (*)"
    comments = $3
  when /^([0-9]+)-([0-9]+)$/
    ft = "#{$1}-#{$2}"   ## e.g. see luxemburg and others
  else
    puts "!! ERROR - unsupported score format >#{score_str}< - sorry; maybe add a score error fix/patch"
    exit 1
  end

  [ht, ft, et, pen, comments]
end
vacuum( rows, headers: MAX_HEADERS, fixed_headers: MIN_HEADERS )
click to toggle source
# File lib/football-sources/worldfootball/vacuum.rb, line 24
##
# Strip unused (all empty) columns from rows.
#
# rows          - array of rows (arrays); a column value counts as empty
#                 if it is nil or responds true to empty?
# headers       - header names, one per column position (default: MAX_HEADERS)
# fixed_headers - headers to always keep, even if the column
#                 is empty in all rows (default: MIN_HEADERS)
#
# Returns [rows, headers] - the rows without the dropped columns (the
# passed-in array itself if no column was dropped) and the kept headers.
#
# Note: the headers keyword used to be ignored (MAX_HEADERS was always used);
#       now the passed-in headers are used for counting and naming.
def self.vacuum( rows, headers: MAX_HEADERS, fixed_headers: MIN_HEADERS )
  ## check for unused columns - count the non-empty values per column
  counter = Array.new( headers.size, 0 )
  rows.each do |row|
    row.each_with_index do |col, idx|
      counter[idx] += 1   unless col.nil? || col.empty?
    end
  end

  pp counter

  ## split the columns into used (or fixed) and empty
  kept_headers  = []
  empty_indices = []
  counter.each_with_index do |num, idx|
    header = headers[ idx ]
    if num > 0 || fixed_headers.include?( header )
      kept_headers << header
    else
      empty_indices << idx
    end
  end

  ## note: only copy the rows if there is something to strip/remove
  unless empty_indices.empty?
    rows = rows.map do |row|
      row.reject.with_index { |_col, idx| empty_indices.include?( idx ) }
    end
  end

  [rows, kept_headers]
end