class Rsssf::RsssfPageFetcher
Public Class Methods
new()
click to toggle source
# File lib/rsssf/fetch.rb, line 10
#
# Builds a page fetcher. The only state is @worker, a fresh
# Fetcher::Worker instance that later calls use to read pages.
def initialize
  @worker = Fetcher::Worker.new
end
Public Instance Methods
fetch( src_url )
click to toggle source
# File lib/rsssf/fetch.rb, line 14
#
# Reads the page at +src_url+ through @worker, converts it to UTF-8,
# replaces a handful of known html entities with their characters and
# returns the result of html_to_txt prefixed with a one-line
# "<!-- source: ... -->" header.
#
# src_url - location of the page; handed to @worker.read as is and
#           echoed in the header comment.
#
# Returns a String: the header line followed by the converted text.
#
# NOTE(review): html_to_txt is not defined in this block - presumably
# provided by the enclosing class or a mixin; confirm.
def fetch( src_url )
  ## note: the page bytes are assumed to be ISO-8859-15 (an updated
  ##   version of ISO-8859-1) - rsssf does NOT use utf-8!!!
  html = @worker.read( src_url )

  ### todo/fix: first check if html is all 7-bit ascii,
  ##   that is, includes only bytes from 0 to 127

  ## normalize newlines
  ##   remove \r (carriage return) used by Windows; just use \n (new line)
  html = html.delete( "\r" )

  ## note:
  ##   assume (default) ISO-8859-15 (an updated version of ISO-8859-1) for now
  ##
  ##  other possible alternatives - try:
  ##  - Windows CP 1252  or
  ##  - ISO-8859-2  (for eastern european languages)
  ##
  ## note: german umlauts use the same code (int)
  ##   in ISO-8859-1/15 and 2 and Windows CP 1252 (other chars ARE different!!!)
  html = html.force_encoding( Encoding::ISO_8859_15 )
  html = html.encode( Encoding::UTF_8 )    # try conversion to utf-8

  ## known html entities and the character each one stands for
  entities = {
    '&auml;'   => 'ä',
    '&ouml;'   => 'ö',
    '&uuml;'   => 'ü',
    '&Auml;'   => 'Ä',
    '&Ouml;'   => 'Ö',
    '&Uuml;'   => 'Ü',
    '&szlig;'  => 'ß',
    '&oulm;'   => 'ö',    ## support typo in entity (&ouml;)
    '&slig;'   => 'ß',    ## support typo in entity (&szlig;)
    '&Eacute;' => 'É',
    '&oslash;' => 'ø'
  }

  ## single pass: every match is looked up in the table above
  html = html.gsub( Regexp.union( entities.keys ), entities )

  ## report whatever still looks like an entity; the text itself is unchanged
  html = html.gsub( /&[^;]+;/ ) do |match|
    puts "*** found unencoded html entity #{match}"
    match   ## pass through as is (1:1)
  end
  ## todo/fix: add more entities

  txt    = html_to_txt( html )
  header = "<!-- source: #{src_url} -->\n"

  header + txt   ## return txt w/ header
end