class Worldfootball::Page

Constants

GENERATED_RE

<!-- [generated 2020-06-30 22:30:19] --> <!-- [generated 2020-06-30 22:30:19] -->

Public Class Methods

from_file( path ) click to toggle source
# File lib/webget-football/worldfootball/page.rb, line 5
# Builds a page from an HTML file stored on disk.
#
# path - location of the file; its content is read as UTF-8 text.
#
# Returns whatever +new+ returns for the file's content.
def self.from_file(path)
  content = File.read(path, encoding: 'utf-8')
  new(content)
end
new( html ) click to toggle source
# File lib/webget-football/worldfootball/page.rb, line 10
# Wraps the raw HTML of a single page.
#
# html - the page markup; kept unchanged in @html.
def initialize(html)
  @html = html
end

Public Instance Methods

assert( cond, msg ) click to toggle source
# File lib/webget-football/worldfootball/page.rb, line 96
# Aborts the program when a parse-time expectation does not hold.
#
# cond - the checked condition; any truthy value passes.
# msg  - text appended to the failure line.
#
# Returns nil when cond is truthy. Otherwise prints the failure line via
# puts and calls exit with status 1.
def assert(cond, msg)
  return if cond

  puts "!!! assert failed (in parse page) - #{msg}"
  exit 1
end
doc() click to toggle source
# File lib/webget-football/worldfootball/page.rb, line 14
# Parsed form of the page markup, built on first use and reused afterwards.
#
# Note: the markup is parsed as a whole document and NOT as a fragment -
# with a fragment there is no access to the page head (meta elements and such).
def doc
  @doc ||= Nokogiri::HTML(@html)
end
generated() click to toggle source
# File lib/webget-football/worldfootball/page.rb, line 64
# Timestamp taken from the page's "generated" marker.
#
# The marker is located with GENERATED_RE, whose +date+ and +time+ named
# captures are parsed as '%Y-%m-%d %H:%M:%S'.
#
# The outcome is cached with a defined? check rather than ||= so that a
# missing marker (nil) is remembered too - the page is scanned and the
# warning is printed only once per page, not on every call.
#
# Returns a DateTime, or nil when the page carries no marker.
def generated
  return @generated if defined?(@generated)

  m = GENERATED_RE.match(@html)
  @generated =
    if m
      DateTime.strptime("#{m[:date]} #{m[:time]}", '%Y-%m-%d %H:%M:%S')
    else
      puts "!! WARN - no generated timestamp found in page"
      nil
    end
end
generated_in_days_ago() click to toggle source

convenience helper / formatter

# File lib/webget-football/worldfootball/page.rb, line 77
# Convenience formatter: age of the page in whole days.
#
# Returns the difference between today's Julian day number and that of
# +generated+ followed by "d" (e.g. "3d"), or '?' when +generated+ is nil.
def generated_in_days_ago
  stamp = generated
  return '?' unless stamp

  "#{Date.today.jd - stamp.jd}d"
end
keywords() click to toggle source
# File lib/webget-football/worldfootball/page.rb, line 25
# Content of the page's keywords meta element, e.g.
#
#   <meta name="keywords"
#    content="Bundesliga, 2010/2011, Spielplan, KSV Superfund, SC Magna Wiener Neustadt, SV Ried, FC Wacker Innsbruck, Austria Wien, Sturm Graz, SV Mattersburg, LASK Linz, Rapid Wien, RB Salzburg" />
#
# The first matching element is cached in @keywords.
#
# Returns the value of the element's content attribute, or nil when the
# page has no keywords meta element (previously this raised NoMethodError
# on nil).
def keywords
  @keywords ||= doc.css('meta[name="keywords"]').first
  @keywords[:content] if @keywords
end
squish( str ) click to toggle source

helper methods

# File lib/webget-football/worldfootball/page.rb, line 89
# Normalizes the whitespace of a piece of text.
#
# str - the text to clean up; it is not modified, a new string is returned.
#
# No-break spaces (U+00A0) become regular spaces, every run of
# spaces/tabs/newlines is folded into a single space, and leading/trailing
# whitespace is removed.
#
# Returns the cleaned-up string.
def squish(str)
  str = str.tr("\u{00A0}", ' ')     # Unicode Character 'NO-BREAK SPACE' (U+00A0)
  str = str.gsub(/[ \t\n]+/, ' ')   ## fold whitespace to one max.
  # strip LAST: String#strip does not remove U+00A0, so stripping before the
  # conversion left a stray space where the text started/ended with one.
  str.strip
end
title() click to toggle source
# File lib/webget-football/worldfootball/page.rb, line 19
# Text of the page's title element, e.g.
#
#   <title>Bundesliga 2010/2011 &raquo; Spielplan</title>
#
# The first matching element is cached in @title.
#
# Returns the element's text content, or nil when the page has no title
# element (previously this raised NoMethodError on nil).
def title
  @title ||= doc.css('title').first
  @title.text if @title
end
url() click to toggle source

<meta property="og:url"

content="//www.weltfussball.de/alle_spiele/aut-bundesliga-2010-2011/" />
# File lib/webget-football/worldfootball/page.rb, line 41
# Address of the page as announced by its Open Graph meta element, e.g.
#
#   <meta property="og:url"
#     content="//www.weltfussball.de/alle_spiele/aut-bundesliga-2010-2011/" />
#
# The first matching element is cached in @url.
#
# Returns the value of the element's content attribute, or nil when the
# page has no og:url meta element (previously this raised NoMethodError
# on nil).
def url
  @url ||= doc.css('meta[property="og:url"]').first
  @url[:content] if @url
end