module Worldfootball

todo/check: move MODS and SCORE_ERRORS out-of-lib

and into config or such - why? why not?

Constants

MAX_HEADERS
MIN_HEADERS
MODS
OFFSETS
Brasilia - Distrito Federal, Brasil  (GMT-3)  -- summer time?
Ciudad de México, CDMX, México       (GMT-5)  -- summer time?
Londres, Reino Unido (GMT+1)
 Madrid -- ?
 Lisboa -- ?
 Moscow -- ?

todo/check - quick fix timezone offsets for leagues for now

- find something better - why? why not?

note: assume time is in GMT+1

ROUND_TO_EN
SCORE_ERRORS

fix/patch known score format errors in at/de cups

new convention
for a fix require league, date, and team1 & team2 for now!!!!
 - do NOT use some "generic" fix / patch!!!!

old de/at patches/fixes:

'0-1 (0-0, 0-0, 0-0) n.V.' => '0-1 (0-0, 0-0) n.V.',       # too long
'2-1 (1-1, 1-1, 1-0) n.V.' => '2-1 (1-1, 1-1) n.V.',
'4-2 (0-0, 0-0) i.E.'      => '4-2 (0-0, 0-0, 0-0) i.E.',  # too short

Public Class Methods

build( rows, season:, league:, stage: '' ) click to toggle source

build “standard” match records from “raw” table rows

# File lib/football-sources/worldfootball/build.rb, line 20
## Build "standard" match records from "raw" schedule table rows.
##
## rows   - raw rows; each row is read via :round, :date, :time, :team1, :team2, :score
## season - passed through Season() to ensure a season object
## league - league key as a String (e.g. 'at.1'); anything else raises ArgumentError
## stage  - optional stage name; printed in the banner and stored as first column
##
## Returns an array of records in the column order
##   [stage, round, date (YYYY-MM-DD), time, team1, ft, ht, team2, et, pen, comments]
##
## Note: progress is printed to stdout; unknown rounds, missing round
##   translations and missing matchday numbers end the process via exit 1.
def self.build( rows, season:, league:, stage: '' )   ## rename to fixup or such - why? why not?
  season = Season( season )  ## cast (ensure) season class (NOT string, integer, etc.)

  ## note: do NOT pass in league struct! pass in key (string)
  unless league.is_a?(String)
    raise ArgumentError, "league key as string expected"
  end

  print "  #{rows.size} rows - build #{league} #{season}"
  print " - #{stage}" unless stage.empty?
  print "\n"

  ## team name mods are looked up by the first part of the key only
  ##    e.g. at.1  => at,  eng.1 => eng,  and so on
  country_key  = league.split('.')[0]
  mods         = MODS[ country_key ] || {}

  ## score error fixes/patches are looked up by the full league key
  score_errors = SCORE_ERRORS[ league ] || {}

  ## leagues that keep round names in german / deutsch (de)
  ##   note: at.1 - incl. europa league playoff
  keep_german = %w[at.cup at.1 de.cup].include?( league )

  recs   = []
  row_no = 0
  rows.each do |row|
    row_no += 1
    ## NOTE(review): the number shown is row_no+1 even though row_no is
    ##   already 1-based, so the first row prints as 002 - confirm intended
    shown_no = row_no + 1

    round_str = row[:round]
    case round_str
    when /Spieltag/
      puts
      print '[%03d] ' % shown_no
      print round_str

      m = round_str.match( /([0-9]+)\. Spieltag/ )
      unless m
        puts "!! ERROR: cannot find matchday number"
        exit 1
      end
      ## todo/check: always use a string even if number (as a string eg. '1' etc.)
      round = m[1]  ## note: keep as string (NOT number)
      print " => #{round}"
      print "\n"
    when /[1-9]\.[ ]Runde|
                          Achtelfinale|
                          Viertelfinale|
                          Halbfinale|
                          Finale
                          /x
      puts
      print '[%03d] ' % shown_no
      print round_str

      if keep_german
        ## do NOT translate rounds (to english)
        round = round_str
      else
        round = ROUND_TO_EN[ round_str ]
        if round.nil?
          puts "!! ERROR: no mapping for round to english (en) found >#{round_str}<:"
          pp row
          exit 1
        end
        print " => #{round}"
      end
      print "\n"
    else
      puts "!! ERROR: unknown round >#{round_str}< for league >#{league}<:"
      pp row
      exit 1
    end

    date_str  = row[:date]
    time_str  = row[:time]
    team1_str = row[:team1]
    team2_str = row[:team2]
    score_str = row[:score]

    ## convert date from string e.g. 2019-10-25
    date     = Date.strptime( date_str, '%Y-%m-%d' )
    date_key = date.strftime( '%Y-%m-%d' )

    ## score error fix/patch - step 1: lookup by date,
    ##                         step 2: both (raw) team names must match too
    score_error = score_errors[ date_key ]
    if score_error && team1_str == score_error[0] &&
                      team2_str == score_error[1]
      patch = score_error[2]    ## pair of [known bad score, fixed score]
      if score_str != patch[0]
        puts "!! WARN - score fix changed? - expected #{patch[0]}, got #{score_str} - fixing to #{patch[1]}"
        pp row
      end
      puts "FIX - applying score error fix - from #{patch[0]} to => #{patch[1]}"
      score_str = patch[1]
    end

    print '[%03d]    ' % shown_no
    print "%-10s | " % date_str
    print "%-5s | "  % time_str
    print "%-22s | " % team1_str
    print "%-22s | " % team2_str
    print score_str
    print "\n"

    ## check for 0:3 Wert.   - change Wert. to awd.  (awarded)
    score_str = score_str.sub( /Wert\./i, 'awd.' )

    ## clean team names (e.g. remove (old)) and asciify (e.g. ’ to ' )
    ##   then apply the (optional) per-country name mods
    team1_str = norm_team( team1_str )
    team2_str = norm_team( team2_str )
    team1_str = mods[ team1_str ] || team1_str
    team2_str = mods[ team2_str ] || team2_str

    ht, ft, et, pen, comments = parse_score( score_str )

    recs << [stage,
             round,
             date_key,
             time_str,
             team1_str,
             ft,
             ht,
             team2_str,
             et,              # extra: incl. extra time
             pen,             # extra: incl. penalties
             comments]
  end  # each row

  recs
end
convert( league:, season:, offset: nil ) click to toggle source
# File lib/football-sources/worldfootball/convert.rb, line 6
## Convert the cached schedule page(s) of a league season into one csv file.
##
## league - looked up via find_league
## season - passed through Season() to ensure a season object
## offset - optional time offset in hours; if given every record is passed
##            through fix_date( rec, offset )
##
## Steps: read page(s) from cache, build records, (optionally) shift
##   date/time, sort by date, reformat the date (e.g. Sat Aug 7 1993),
##   vacuum unused columns and write to
##   <out_dir>/<season.path>/<league.key>.csv
def self.convert( league:, season:, offset: nil )  ## check: rename (optional) offset to time_offset or such?
  season = Season( season )  ## cast (ensure) season class (NOT string, integer, etc.)
  league = find_league( league )

  pages = league.pages( season: season )

  # note: an array (of hash table/records) means a setup with stages;
  #         a single hash table/record means a single page (no stages)
  if pages.is_a?(Array)
    recs = []
    pages.each do |page_meta|
      slug       = page_meta[:slug]
      stage_name = page_meta[:stage]
      ## todo/fix: report error/check if stage.name is nil!!!

      print "  parsing #{slug}..."

      page = Page::Schedule.from_cache( slug )
      print "  title=>#{page.title}<..."
      print "\n"

      stage_recs = build( page.matches,
                          season: season,
                          league: league.key,
                          stage:  stage_name )

      pp stage_recs[0]   ## check first record
      recs += stage_recs
    end
  else
    slug = pages[:slug]

    print "  parsing #{slug}..."

    page = Page::Schedule.from_cache( slug )
    print "  title=>#{page.title}<..."
    print "\n"

    ## note: no stage passed in (build uses its default)
    recs = build( page.matches, season: season, league: league.key )

    pp recs[0]   ## check first record
  end

  if offset
    recs = recs.map { |rec| fix_date( rec, offset ) }
  end

  ## note: sort matches by date before saving/writing!!!!
  ##   - date is assumed to be the third column (stage/round/date/...)
  ##   - date is assumed to be a string in 1999-11-30 format
  ##       (allows sort by "simple" a-z)
  recs = recs.sort { |left, right| left[2] <=> right[2] }

  ## reformat date / beautify e.g. Sat Aug 7 1993
  recs.each do |rec|
    date   = Date.strptime( rec[2], '%Y-%m-%d' )
    rec[2] = date.strftime( '%a %b %-d %Y' )
  end

  ## remove unused columns (e.g. stage, et, p, etc.)
  recs, headers = vacuum( recs )

  puts headers
  pp recs[0]   ## check first record

  out_path = "#{config.convert.out_dir}/#{season.path}/#{league.key}.csv"

  puts "write #{out_path}..."
  Cache::CsvMatchWriter.write( out_path, recs, headers: headers )
end
convert_reports( league:, season: ) click to toggle source
# File lib/football-sources/worldfootball/convert_reports.rb, line 4
## Collect the goals from all cached match reports of a league season
## and write them to <out_dir>/<season.path>/<league.key>~goals.csv
##
## league - looked up via find_league
## season - passed through Season() to ensure a season object
##
## One record per goal: [match id, score, minute, extra, player, notes]
##   where extra is '(og)', '(pen)' or ''.
## Matches without a report ref are skipped (with a warning).
def self.convert_reports( league:, season: )
  season = Season( season )  ## cast (ensure) season class (NOT string, integer, etc.)
  league = find_league( league )

  ## note: use only first part from key for (team name) mods lookup
  ##    e.g. at.1  => at,  eng.1 => eng,  and so on
  mods = MODS[ league.key.split('.')[0] ] || {}

  ## if single (simple) page setup - wrap in array
  pages = league.pages( season: season )
  pages = [pages]   unless pages.is_a?(Array)

  recs = []

  pages.each do |page_meta|  # note: use page_info for now (or page_rec or page_meta or such)
    page = Page::Schedule.from_cache( page_meta[:slug] )
    print "  page title=>#{page.title}<..."
    print "\n"

    matches = page.matches

    puts "matches - #{matches.size} rows:"
    pp matches[0]

    puts "#{page.generated_in_days_ago}  - #{page.generated}"

    matches.each_with_index do |match, i|
      report_ref = match[:report_ref]
      if report_ref.nil?
        puts "!! WARN: no match report ref found for match:"
        pp match
        next
      end

      puts "reading #{i+1}/#{matches.size} - #{report_ref}..."
      report = Page::Report.from_cache( report_ref )

      puts
      puts report.title
      puts report.generated

      goals = report.goals
      puts "goals - #{goals.size} records"

      next   unless goals.size > 0    ## nothing to add

      date = Date.strptime( match[:date], '%Y-%m-%d')

      ## clean team names (e.g. remove (old)) and asciify (e.g. ’ to ' )
      ##   then apply the (optional) per-country name mods
      team1 = norm_team( match[:team1] )
      team2 = norm_team( match[:team2] )
      team1 = mods[ team1 ] || team1
      team2 = mods[ team2 ] || team2

      match_id = "#{team1} - #{team2} | #{date.strftime('%b %-d %Y')}"

      goals.each do |goal|
        extra = ''
        if goal[:owngoal]
          extra = '(og)'   ## or use OG or O.G.- why? why not?
        elsif goal[:penalty]
          extra = '(pen)'  ## or use P or PEN - why? why not?
        end

        recs << [match_id,
                 goal[:score],
                 "#{goal[:minute]}'",
                 extra,
                 goal[:player],
                 goal[:notes]]
      end
    end # each match
  end # each page

  out_path = "#{config.convert.out_dir}/#{season.path}/#{league.key}~goals.csv"
  headers  = ['Match', 'Score', 'Minute', 'Extra', 'Player', 'Notes']

  puts "write #{out_path}..."
  Cache::CsvMatchWriter.write( out_path, recs, headers: headers )
end
fix_date( row, offset ) click to toggle source

helper to fix dates to use local timezone (and not utc/london time)

# File lib/football-sources/worldfootball/convert.rb, line 79
## Shift the date (third column) and time (fourth column) of a record
## by offset hours.
##
## row    - record with date in row[2] (YYYY-MM-DD) and time in row[3] (HH:MM)
## offset - number of hours to add (may be negative)
##
## Returns the row; date and time are overwritten in place.
##   Rows with a nil or empty time are returned untouched.
##   A date not in YYYY-MM-DD format prints an error and ends the
##   process via exit 1.
def self.fix_date( row, offset )
  time_col = row[3]
  ## note: time (column) required for fix
  return row    if time_col.nil? || time_col.empty?

  date_col = row[2]
  unless date_col =~ /^\d{4}-\d{2}-\d{2}$/
    puts "!!! ERROR - wrong (unknown) date format >>#{date_col}<<; cannot continue; fix it; sorry"
    ## todo/fix: add to errors/warns list - why? why not?
    exit 1
  end
  date_fmt = '%Y-%m-%d'   # e.g. 2002-08-17

  stamp = DateTime.strptime( "#{date_col} #{time_col}", "#{date_fmt} %H:%M" )
  ## NOTE - divide by a float (24.0); offset is in hours, DateTime adds days
  shifted = stamp + (offset/24.0)

  ## overwrite "old" date and time
  row[2] = shifted.strftime( date_fmt )
  row[3] = shifted.strftime( '%H:%M' )
  row   ## return row for possible pipelining - why? why not?
end
norm_team( team ) click to toggle source

“global” helpers

# File lib/football-sources/worldfootball/mods.rb, line 10
## Clean up a team name:
##   - drop the (first) "(old)" marker and strip leading/trailing whitespace
##   - asciify the typographic apostrophe (’ => ')
##       e.g. Hawke’s Bay United FC => Hawke's Bay United FC
##
## Returns a new string.
def self.norm_team( team )
  cleaned = team.sub( '(old)', '' ).strip
  cleaned.gsub( '’', "'" )
end
parse_score( score_str ) click to toggle source
# File lib/football-sources/worldfootball/build.rb, line 161
def self.parse_score( score_str )
  comments = String.new( '' )     ## check - rename to/use status or such - why? why not?

  ## split score
  ft  = ''
  ht  = ''
  et  = ''
  pen = ''
  if score_str == '---'   ## in the future (no score yet) - was -:-
    ft = ''
    ht = ''
  elsif score_str == 'n.gesp.' ||   ## cancelled (british) / canceled (us)
        score_str == 'ausg.'   ||   ## todo/check: change to some other status ????
        score_str == 'annull.'      ## todo/check: change to some other status (see ie 2012) ????
    ft = '(*)'
    ht = ''
    comments = 'cancelled'
  elsif score_str == 'abgebr.'  ## abandoned  -- waiting for replay?
    ft = '(*)'
    ht = ''
    comments = 'abandoned'
  elsif score_str == 'verl.'   ## postponed
    ft = ''
    ht = ''
    comments = 'postponed'
  # 5-4 (0-0, 1-1, 2-2) i.E.
  elsif score_str =~ /([0-9]+) [ ]*-[ ]* ([0-9]+)
                          [ ]*
                      \(([0-9]+) [ ]*-[ ]* ([0-9]+)
                          [ ]*,[ ]*
                        ([0-9]+) [ ]*-[ ]* ([0-9]+)
                          [ ]*,[ ]*
                       ([0-9]+) [ ]*-[ ]* ([0-9]+)\)
                          [ ]*
                       i\.E\.
                     /x
    pen = "#{$1}-#{$2}"
    ht  = "#{$3}-#{$4}"
    ft  = "#{$5}-#{$6}"
    et  = "#{$7}-#{$8}"
  # 2-1 (1-0, 1-1) n.V
  elsif score_str =~ /([0-9]+) [ ]*-[ ]* ([0-9]+)
                      [ ]*
                    \(([0-9]+) [ ]*-[ ]* ([0-9]+)
                       [ ]*,[ ]*
                      ([0-9]+) [ ]*-[ ]* ([0-9]+)
                      \)
                       [ ]*
                       n\.V\.
                     /x
    et  = "#{$1}-#{$2}"
    ht  = "#{$3}-#{$4}"
    ft  = "#{$5}-#{$6}"
  elsif score_str =~ /([0-9]+)
                          [ ]*-[ ]*
                      ([0-9]+)
                          [ ]*
                      \(([0-9]+)
                          [ ]*-[ ]*
                        ([0-9]+)
                      \)
                     /x
    ft = "#{$1}-#{$2}"
    ht = "#{$3}-#{$4}"
  elsif  score_str =~ /([0-9]+)
                         [ ]*-[ ]*
                       ([0-9]+)
                         [ ]*
                        ([a-z.]+)
                       /x
    ft = "#{$1}-#{$2} (*)"
    ht = ''
    comments = $3
  elsif score_str =~ /^([0-9]+)-([0-9]+)$/
     ft = "#{$1}-#{$2}"     ## e.g. see luxemburg and others
     ht = ''
  else
     puts "!! ERROR - unsupported score format >#{score_str}< - sorry; maybe add a score error fix/patch"
     exit 1
  end

  [ht, ft, et, pen, comments]
end
vacuum( rows, headers: MAX_HEADERS, fixed_headers: MIN_HEADERS ) click to toggle source
# File lib/football-sources/worldfootball/vacuum.rb, line 24
## Strip unused (all-empty) columns from the rows.
##
## rows          - array of rows (arrays); a column value counts as used
##                   unless it is nil or empty
## headers       - header names, one per column (by position)
## fixed_headers - header names that are always kept (even if the
##                   column is empty in every row)
##
## Returns [rows, headers] - the rows without the empty columns and the
##   matching list of kept headers. If no column is empty the rows are
##   returned as passed in.
##
## Note: the per-column usage counts are printed (pp) for debugging.
##
## Fix: the headers passed in are now honored (before the method always
##   used MAX_HEADERS and ignored the headers keyword argument).
def self.vacuum( rows, headers: MAX_HEADERS, fixed_headers: MIN_HEADERS )
  ## count the non-empty values per column
  counter = Array.new( headers.size, 0 )
  rows.each do |row|
    row.each_with_index do |col, idx|
      counter[idx] += 1  unless col.nil? || col.empty?
    end
  end

  pp counter

  ## split into columns to keep and (empty) columns to drop
  kept_headers  = []
  empty_indices = []
  counter.each_with_index do |num, idx|
    header = headers[ idx ]
    if num > 0 || fixed_headers.include?( header )
      kept_headers << header
    else
      empty_indices << idx
    end
  end

  unless empty_indices.empty?
    rows = rows.map do |row|
      row.reject.with_index { |_col, idx| empty_indices.include?( idx ) }
    end
  end

  [rows, kept_headers]
end