class Pluto::FeedFetcherCondGetWithCache

Public Class Methods

new() click to toggle source
# File lib/pluto/feedfetcher/cond_get_with_cache.rb, line 10
# Sets up the fetcher with its own Fetcher::Worker instance; the worker is
# kept in @worker and reused for every later request made by this object.
def initialize
  @worker = Fetcher::Worker.new
end

Public Instance Methods

debug?() click to toggle source
# File lib/pluto/feedfetcher/cond_get_with_cache.rb, line 14
# Reports whether debugging is switched on; simply forwards to the
# debug? flag of the global Pluto configuration.
def debug?
  Pluto.config.debug?
end
fetch( feed_rec ) click to toggle source
# File lib/pluto/feedfetcher/cond_get_with_cache.rb, line 17
# Fetches the feed behind +feed_rec+ with a conditional HTTP GET and
# stores the outcome on the record (smart http update; updates db records).
#
# feed_rec - feed record; this method reads feed_url, key, http_etag,
#            http_last_modified and md5 from it and calls update! on it.
#
# Returns the feed body as a UTF-8 string when new content was fetched and
# saved. Returns nil when there is nothing to parse:
#   - the request failed with one of the rescued network errors
#   - the server answered 304 (not modified)
#   - the server answered with any status other than 200
#   - the md5 digest of the body equals the digest stored on the record
#   - saving the record raised a StandardError
#
# Errors and encoding warnings are logged and also recorded via
# Activity.create!. Exceptions that are not StandardErrors (e.g. Interrupt,
# SystemExit) are deliberately NOT swallowed.
def fetch( feed_rec )
  feed_url = feed_rec.feed_url
  feed_key = feed_rec.key

  ### todo/fix: normalize/unify feed_url
  ##  - same in fetcher - use shared utility method or similar

  @worker.use_cache = true
  begin
    # seed the worker cache with the validators stored on the record
    # so that the request can be sent as a conditional GET
    @worker.cache[ feed_url ] = {
      'etag'          => feed_rec.http_etag,
      'last-modified' => feed_rec.http_last_modified
    }

    response = @worker.get( feed_url )

    ## todo/fix: add a retry for open timeout - why? why not?
    ##  When you run into Net::OpenTimeout, you should handle it
    ## by retrying the request a few times,
    ## or giving up and showing a helpful error to the user.
    ## --  <https://www.exceptionalcreatures.com/bestiary/Net/OpenTimeout.html>

  rescue OpenSSL::SSL::SSLError,
         Net::OpenTimeout,
         Net::ReadTimeout,
         SocketError,
         SystemCallError => e

    ## catch socket error for unknown domain names (e.g. pragdave.blogs.pragprog.com)
    ###  will result in SocketError -- getaddrinfo: Name or service not known
    msg = "*** error: fetching feed '#{feed_key}' - [#{e.class.name}] #{e}"
    logger.error msg
    Activity.create!( text: msg )

    ### todo/fix: update feed rec in db
    return nil
  ensure
    # always switch caching off again - also when an exception that is not
    # rescued above propagates; otherwise the shared worker would be left
    # with use_cache = true
    # fix/todo: restore old use_cache setting instead of false
    @worker.use_cache = false
  end

  status_line = "HTTP status #{response.code} #{response.message}"

  if response.code == '304'  # not modified (conditional GET - e.g. using etag/last-modified)
    logger.info "OK - fetching feed '#{feed_key}' - #{status_line}"
    logger.info "no change; request returns not modified (304); skipping parsing feed"
    return nil   # no updates available; nothing to do
  end

  feed_fetched = Time.now

  if response.code != '200'   # note Net::HTTP response.code is a string in ruby
    msg = "*** error: fetching feed '#{feed_key}' - #{status_line}"
    logger.error msg

    # record the failed attempt; clear validators, body and digest
    feed_rec.update!(
      http_code:          response.code.to_i,
      http_server:        response.header[ 'server' ],
      http_etag:          nil,
      http_last_modified: nil,
      body:               nil,
      md5:                nil,
      fetched:            feed_fetched
    )

    ## add log error activity -- in future add to error log - better - why? why not?
    Activity.create!( text: msg )

    return nil  #  sorry; no feed for parsing available
  end

  logger.info "OK - fetching feed '#{feed_key}' - #{status_line}"

  feed_xml = response.body
  ###
  # Note: Net::HTTP will NOT set encoding UTF-8 etc.
  # will mostly be ASCII
  # - try to change encoding to UTF-8 ourselves
  logger.debug "feed_xml.encoding.name (before): #{feed_xml.encoding.name}"

  #####
  # NB: ASCII-8BIT == BINARY == Encoding Unknown; Raw Bytes Here

  # try converting ASCII-8BIT to UTF-8 based on domain-specific guesses
  begin
    # Try it as UTF-8 directly
    #   Note: make a copy/dup - force_encoding changes the string in place
    #         and the untouched original is still needed for the fallbacks
    feed_xml_cleaned = feed_xml.dup.force_encoding( Encoding::UTF_8 )
    unless feed_xml_cleaned.valid_encoding?
      msg = "*** warn: feed '#{feed_key}' charset encoding not valid utf8 - trying latin1"
      logger.warn msg
      Activity.create!( text: msg )

      # Some of it might be old Windows code page
      # -- (Windows Code Page CP1252 is ISO_8859_1 is Latin-1 - check ??)
      # tell ruby the bytes are latin1, then transcode to utf-8
      feed_xml_cleaned = feed_xml.dup.force_encoding( Encoding::ISO_8859_1 ).encode( Encoding::UTF_8 )
    end
    feed_xml = feed_xml_cleaned
  rescue EncodingError => e
    msg = "*** warn: feed '#{feed_key}' charset encoding to utf8 failed; throwing out invalid bits - #{e}"
    logger.warn msg
    Activity.create!( text: msg )

    # Force it to UTF-8, throwing out invalid bits
    ## todo: check options - add ?? or something to mark invalid chars ???
    feed_xml.encode!( Encoding::UTF_8, :invalid => :replace, :undef => :replace )
  end

  ## NB:
  # for now "hardcoded" to utf8 - what else can we do?
  # - note: force_encoding will NOT change the chars only change the assumed encoding w/o translation

  logger.debug "feed_xml.encoding.name (after): #{feed_xml.encoding.name}"

  ## compare md5 digest of the (re-encoded) body with the digest stored on the record
  last_feed_md5 = feed_rec.md5
  feed_md5      = Digest::MD5.hexdigest( feed_xml )

  if last_feed_md5 && last_feed_md5 == feed_md5
    # not all servers handle conditional gets, so while not much can be
    # done about the bandwidth, if the response body is identical
    # the downstream processing (parsing, caching, ...) can be avoided.
    #  - thanks to planet mars -fido.rb for the idea, cheers.

    logger.info "no change; md5 digests match; skipping parsing feed"
    return nil   # no updates available; nothing to do
  end

  feed_attribs = {
    http_code:          response.code.to_i,
    http_server:        response.header[ 'server' ],
    http_etag:          response.header[ 'etag' ],
    http_last_modified: response.header[ 'last-modified' ], ## note: last_modified header gets stored as plain text (not datetime)
    body:               feed_xml,
    md5:                feed_md5,
    fetched:            feed_fetched
  }

  # log value and class of every response header that gets stored
  %w[server etag last-modified].each do |name|
    value = response.header[ name ]
    logger.debug "http header - #{name}: #{value} - #{value.class.name}"
  end

  ### note: might crash w/ encoding errors when saving in postgres
  ##  e.g. PG::CharacterNotInRepertoire: ERROR: ...
  ##  catch error, log it and stop for now
  #
  #  in the future check for different charset than utf-8 ?? possible?? how to deal with non-utf8 charsets??

  begin
    feed_rec.update!( feed_attribs )
  rescue StandardError => e
    # log db error and give up on this feed; only StandardError is rescued so
    # that signals / exit requests (Interrupt, SystemExit, ...) still propagate
    msg = "*** error: updating feed database record '#{feed_key}' - #{e}"
    logger.error msg
    Activity.create!( text: msg )
    return nil  #  sorry; corrupt feed; parsing not possible; fix char encoding - make it an option in config??
  end

  logger.debug "feed_xml:"
  logger.debug feed_xml[ 0..300 ] # range is inclusive - logs up to the first 301 chars

  logger.info "Before parsing feed >#{feed_key}<..."

  ### todo/fix:
  ###  return feed_xml !!! - move FeedUtils::Parser.parse to update or something !!!
  ## fix/todo: check for feed.nil?   -> error parsing!!!
  #    or throw exception
  ## feed = FeedUtils::Parser.parse( feed_xml )
  feed_xml
end