class Fetcher::Worker
Public Class Methods
new( old_logger_do_not_use=nil )
click to toggle source
todo/fix:
remove the logger argument from the constructor
use logutils instead
# File lib/fetcher/worker.rb, line 34
#
# Sets up a worker with an empty conditional-GET cache; caching starts
# switched off (@use_cache = false).
#
# old_logger_do_not_use - deprecated and otherwise ignored; passing anything
#                         but nil only triggers a deprecation warning.
#
# todo/fix: remove the logger argument; use logutils instead
def initialize( old_logger_do_not_use=nil )
  unless old_logger_do_not_use.nil?
    # deprecation notices go to stderr (Kernel#warn) so they do not mix into regular program output
    warn "*** deprecated API call [Fetcher.initialize] - do NOT pass in logger; no longer required/needed; logger arg will get removed"
  end

  ### cache for conditional get (e.g. etags and last-modified headers/checks)
  @cache     = {}
  @use_cache = false
end
Public Instance Methods
cache()
click to toggle source
# File lib/fetcher/worker.rb, line 47
#
# Returns the hash stored in @cache (the store used for conditional GETs).
def cache
  @cache
end
clear_cache()
click to toggle source
note: store entries as cache[ uri ] = hash, where the hash holds the headers plus the body plus the status code (410, etc.);
look up an entry with cache[ uri ]
# File lib/fetcher/worker.rb, line 46
#
# Drops all cache entries by pointing @cache at a brand-new empty hash;
# returns that new hash.
def clear_cache
  @cache = {}
end
copy( src, dest, opts={} )
click to toggle source
# File lib/fetcher/worker.rb, line 101
#
# Fetches src via get_response and writes the response body to the file dest.
#
# src  - url handed to get_response
# dest - path of the file to write
# opts - :mode => file open mode; when present it overrides the mode
#        derived from the response content type
#
# Raises HttpError when the response code is not '200'; nothing is written then.
def copy( src, dest, opts={} )
  ### fix: return true - success or
  #                  false - error!!!
  ## todo: add file protocol - why? why not??

  logger.debug "fetch - copy src: #{src} to dest: #{dest}"

  response = get_response( src )

  # NOTE: on error (NOK) raise exception; do NOT copy file; sorry
  raise HttpError.new( response.code, response.message )  unless response.code == '200'

  ### check:
  ##  why not always use wb???
  ##   how is it different for text files?
  ##   will convert newlines (from windows to unix) ???

  # content types mentioning image or zip get written in binary mode
  ##  use application/zip or something - why? why not??
  binary = response.content_type =~ /image/ || response.content_type =~ /zip/
  logger.debug ' switching to binary'  if binary

  # mode flags passed in take precedence over the detected mode
  mode = opts[:mode] || (binary ? 'wb' : 'w')

  File.open( dest, mode ) { |file| file.write( response.body ) }
end
get( src )
click to toggle source
# File lib/fetcher/worker.rb, line 52
#
# Logs the request and hands src to get_response.
#
# Returns whatever get_response returns - the HTTP response
# (code, message, body, etc.); no status check happens here.
def get( src )
  logger.debug "fetch - get(_response) src: #{src}"

  get_response( src )
end
get_response( src )
click to toggle source
todo: add file protocol
# File lib/fetcher/worker.rb, line 140
#
# Issues an HTTP(S) GET for src and returns the response object, following
# redirects (301, 302, 303, 307, 308) along the way.
#
# - honors the HTTP_PROXY / http_proxy env variable
# - when use_cache? is on and the cache holds an entry for the uri, sends
#   If-None-Match / If-Modified-Since headers (conditional GET)
# - at most 6 requests get issued; one more redirect raises
#   ArgumentError ('HTTP redirect too deep')
#
# Returns the response for 200, 304 and for any non-redirect status
# (callers must check response.code themselves). A redirect without a
# Location header cannot be followed and gets returned as is.
#
# todo: add file protocol
def get_response( src )
  uri        = URI.parse( src )
  http_proxy = net_http_proxy_class

  redirect_limit = 6
  response       = nil

  loop do
    raise ArgumentError, 'HTTP redirect too deep' if redirect_limit == 0
    redirect_limit -= 1

    http = http_proxy.new( uri.host, uri.port )

    logger.debug "GET #{uri.request_uri} uri=#{uri}, redirect_limit=#{redirect_limit}"

    request = Net::HTTP::Get.new( uri.request_uri, conditional_get_headers( uri ) )
    if uri.instance_of? URI::HTTPS
      http.use_ssl = true
      # NOTE(review): VERIFY_NONE switches off certificate verification and leaves
      #  https fetches open to man-in-the-middle attacks; kept unchanged here because
      #  callers may rely on fetching from hosts with broken certificates
      #  - consider VERIFY_PEER
      http.verify_mode = OpenSSL::SSL::VERIFY_NONE
    end

    response = http.request( request )

    case response.code
    when '200'
      logger.debug "#{response.code} #{response.message}"
      logger.debug " content_type: #{response.content_type}, content_length: #{response.content_length}"
      break       # will return response
    when '304'    # -- Not Modified - for conditional GETs (using etag,last-modified)
      logger.debug "#{response.code} #{response.message}"
      break       # will return response
    when '301', '302', '303', '307', '308'
      # 301 = moved permanently
      # 302 = found
      # 303 = see other
      # 307 = temporary redirect
      # 308 = permanent redirect
      location = response.header['location']
      logger.debug "#{response.code} #{response.message} location=#{location}"

      if location.nil? || location.empty?
        # nothing to follow; hand the redirect response back to the caller
        logger.debug "redirect without location header; cannot follow"
        break
      end

      newuri = URI.parse( location )
      if newuri.relative?
        logger.debug "url relative; try to make it absolute"
        newuri = uri + location
      end
      uri = newuri
    else
      puts "*** error - fetch HTTP - #{response.code} #{response.message}"
      break       # will return response
    end
  end

  response
end

# Returns the Net::HTTP class to build connections from - a proxy-enabled
# class when the HTTP_PROXY (or lower-case http_proxy) env variable is set.
def net_http_proxy_class
  proxy_url = ENV['HTTP_PROXY'] || ENV['http_proxy']   # try lower-case variant too (for *nix systems)

  unless proxy_url
    logger.debug "using direct net http access; no proxy configured"
    return Net::HTTP::Proxy( nil, nil, nil, nil )
  end

  proxy = URI.parse( proxy_url )
  logger.debug "using net http proxy: proxy.host=#{proxy.host}, proxy.port=#{proxy.port}"
  if proxy.user && proxy.password
    logger.debug " using credentials: proxy.user=#{proxy.user}, proxy.password=****"
  else
    logger.debug " using no credentials"
  end

  Net::HTTP::Proxy( proxy.host, proxy.port, proxy.user, proxy.password )
end
private :net_http_proxy_class

# Builds the request headers for uri; adds the conditional-GET headers
# when caching is on and the cache has an entry for the uri.
def conditional_get_headers( uri )
  headers = { 'User-Agent' => "fetcher gem v#{VERSION}" }
  return headers unless use_cache?

  ## check for existing cache entry in cache store (lookup by uri)
  ## todo/fix: normalize uri!!!! - how?
  ##  - remove query_string ?? fragment ?? why? why not??
  ## note: using uri.to_s should return full uri e.g. http://example.com/page.html
  cache_entry = cache[ uri.to_s ]
  return headers unless cache_entry

  logger.info "found cache entry for >#{uri.to_s}<"
  if cache_entry['etag']
    logger.info "adding header If-None-Match (etag) >#{cache_entry['etag']}< for conditional GET"
    headers['If-None-Match'] = cache_entry['etag']
  end
  if cache_entry['last-modified']
    logger.info "adding header If-Modified-Since (last-modified) >#{cache_entry['last-modified']}< for conditional GET"
    headers['If-Modified-Since'] = cache_entry['last-modified']
  end

  headers
end
private :conditional_get_headers
read( src )
click to toggle source
# File lib/fetcher/worker.rb, line 60
#
# Fetches src and returns the response body as an (ascii/binary) string.
#
# The string returned is a copy (dup) of the response body.
#
# Raises HttpError when the response code is not '200'.
def read( src )
  logger.debug "fetch - copy src: #{src} into string"

  response = get_response( src )

  # check: better return nil (or empty string) on error - why? why not??
  raise HttpError.new( response.code, response.message )  unless response.code == '200'

  response.body.dup   # return string copy - why? why not?? (use to_s?)
end
read_blob!( src )
click to toggle source
# File lib/fetcher/worker.rb, line 74
#
# Fetches src and returns the body as a binary string.
#
# note: for now simply delegates to read
def read_blob!( src )
  read( src )
end
read_utf8!( src )
click to toggle source
# File lib/fetcher/worker.rb, line 79
#
# Fetches src and returns the response body as a string tagged as UTF-8.
#
# The bytes are not converted - only the encoding gets set (forced).
# The string returned is a copy; the body held by the response object
# keeps its own encoding. A nil body yields an empty string.
#
# Raises HttpError when the response code is not '200'.
def read_utf8!( src )
  logger.debug "fetch - copy src: #{src} into utf8 string"

  response = get_response( src )

  # on error throw exception - why? why not??
  if response.code != '200'
    raise HttpError.new( response.code, response.message )
  end

  ###
  # Note: Net::HTTP will NOT set encoding UTF-8 etc.
  #   will be set to ASCII-8BIT == BINARY == Encoding Unknown; Raw Bytes Here
  #  thus, set/force encoding to utf-8
  #
  # dup first - force_encoding works in place and would otherwise re-tag
  #  (or, if frozen, fail on) the string owned by the response
  response.body.to_s.dup.force_encoding( Encoding::UTF_8 )
end
use_cache=(true_or_false)
click to toggle source
# File lib/fetcher/worker.rb, line 48
#
# Switches the conditional-GET cache on or off by storing the
# flag in @use_cache.
def use_cache=( true_or_false )
  @use_cache = true_or_false
end
use_cache?()
click to toggle source
# File lib/fetcher/worker.rb, line 49
#
# Returns the value of @use_cache (the cache on/off flag).
def use_cache?
  @use_cache
end