class Worldfootball::Page
Constants
- GENERATED_RE
<!-- [generated 2020-06-30 22:30:19] --> <!-- [generated 2020-06-30 22:30:19] -->
Public Class Methods
from_file( path )
click to toggle source
# File lib/webget-football/worldfootball/page.rb, line 5
# Build a page from an HTML file on disk.
#
# path - location of the file; it is opened read-only and decoded as UTF-8.
#
# Returns whatever new( html ) returns for the file's content.
def self.from_file(path)
  content = File.open(path, 'r:utf-8', &:read)
  new(content)
end
new( html )
click to toggle source
# File lib/webget-football/worldfootball/page.rb, line 10
# Wrap a raw HTML string; it is kept unchanged in @html.
#
# html - the page markup to wrap.
def initialize(html)
  @html = html
end
Public Instance Methods
assert( cond, msg )
click to toggle source
# File lib/webget-football/worldfootball/page.rb, line 96
# Minimal assertion helper.
#
# cond - condition expected to be truthy.
# msg  - text appended to the failure message.
#
# Returns nil when cond is truthy. Otherwise prints the failure message
# to stdout and terminates the process with exit status 1.
def assert(cond, msg)
  return if cond

  puts "!!! assert failed (in parse page) - #{msg}"
  exit 1
end
doc()
click to toggle source
# File lib/webget-football/worldfootball/page.rb, line 14
# Parsed form of the wrapped HTML, built on first use and cached in @doc.
#
# Returns the result of Nokogiri::HTML( @html ).
def doc
  ## note: parse a full document and NOT a fragment -
  ##   with a fragment there is no access to the page head (meta elements and such)
  return @doc if @doc

  @doc = Nokogiri::HTML(@html)
end
generated()
click to toggle source
# File lib/webget-football/worldfootball/page.rb, line 64
# Timestamp taken from the page's "generated" marker.
#
# Matches GENERATED_RE against the raw HTML and parses its named captures
# date and time with the format '%Y-%m-%d %H:%M:%S'.
#
# Returns a DateTime, or nil (after printing a warning to stdout) when the
# marker is not found. The result - including nil - is cached in @generated.
def generated
  ## note: use defined? and NOT ||= so that a nil result gets cached too;
  ##   with ||= every call on a page without marker rescans the html
  ##   and prints the warning again
  return @generated if defined?(@generated)

  m = GENERATED_RE.match(@html)
  @generated = if m
                 DateTime.strptime("#{m[:date]} #{m[:time]}", '%Y-%m-%d %H:%M:%S')
               else
                 puts "!! WARN - no generated timestamp found in page"
                 nil
               end
end
generated_in_days_ago()
click to toggle source
convenience helper / formatter
# File lib/webget-football/worldfootball/page.rb, line 77
# Age of the page, formatted for display.
#
# Returns the difference between today's julian day number and that of the
# generated timestamp, followed by "d" (e.g. "3d"), or '?' when generated
# returns nothing.
def generated_in_days_ago
  stamp = generated
  return '?' unless stamp

  days = Date.today.jd - stamp.jd
  "#{days}d"
end
keywords()
click to toggle source
# File lib/webget-football/worldfootball/page.rb, line 25
# Content of the page's keywords meta element, e.g.
#   <meta name="keywords"
#         content="Bundesliga, 2010/2011, Spielplan, KSV Superfund, SC Magna Wiener Neustadt, SV Ried, FC Wacker Innsbruck, Austria Wien, Sturm Graz, SV Mattersburg, LASK Linz, Rapid Wien, RB Salzburg" />
#
# Returns the value of the content attribute, or nil when the page has no
# keywords meta element (instead of failing with NoMethodError on nil).
def keywords
  ## note: @keywords caches the meta element (NOT the content string)
  ##   alternative lookup via xpath: doc.xpath( '//meta[@name="keywords"]' ).first
  @keywords ||= doc.at_css('meta[name="keywords"]')
  @keywords && @keywords[:content]   ## get content attribute
end
squish( str )
click to toggle source
helper methods
# File lib/webget-football/worldfootball/page.rb, line 89
# Normalize whitespace in a piece of text.
#
# str - the text to clean up; it is not modified.
#
# Every run of spaces, tabs, line breaks (\n and \r) and no-break spaces
# (U+00A0) is folded into one single space, then leading and trailing
# whitespace is removed.
#
# Returns a new string.
def squish(str)
  ## note: fold first and strip last - String#strip does NOT remove U+00A0,
  ##   so stripping before the no-break space replacement left a
  ##   leading/trailing space behind
  str.gsub(/[\u00A0 \t\r\n]+/, ' ').strip
end
title()
click to toggle source
# File lib/webget-football/worldfootball/page.rb, line 19
# Text of the page's title element, e.g.
#   <title>Bundesliga 2010/2011 » Spielplan</title>
#
# Returns the element's text content, or nil when the page has no title
# element (instead of failing with NoMethodError on nil).
def title
  ## note: @title caches the title element (NOT its text)
  @title ||= doc.at_css('title')
  @title && @title.text   ## get element's text content
end
url()
click to toggle source
<meta property="og:url"
content="//www.weltfussball.de/alle_spiele/aut-bundesliga-2010-2011/" />
# File lib/webget-football/worldfootball/page.rb, line 41
# Page address as given by the og:url meta property.
#
# Returns the value of the meta element's content attribute, or nil when the
# page has no og:url meta element (instead of failing with NoMethodError on nil).
def url
  ## note: @url caches the meta element (NOT the content string)
  @url ||= doc.at_css('meta[property="og:url"]')
  @url && @url[:content]
end