class Polipus::HTTP
Constants
- REDIRECT_LIMIT
Maximum number of redirects to follow on each
get_response
- RESCUABLE_ERRORS
Public Class Methods
new(opts = {})
click to toggle source
# File lib/polipus/http.rb, line 28 def initialize(opts = {}) @connections = {} @connections_hits = {} @opts = opts end
Public Instance Methods
fetch_page(url, referer = nil, depth = nil, user_data = nil)
click to toggle source
fetch_pages(url, referer = nil, depth = nil, user_data = nil)
click to toggle source
Create new Pages from the response of an HTTP
request to url, including redirects
# File lib/polipus/http.rb, line 46 def fetch_pages(url, referer = nil, depth = nil, user_data = nil) url = URI(url) pages = [] get(url, referer) do |response, code, location, redirect_to, response_time| handle_compression response page = Page.new(location, body: response.body, code: code, headers: response.to_hash, referer: referer, depth: depth, redirect_to: redirect_to, response_time: response_time, fetched_at: Time.now.to_i) page.user_data = user_data unless user_data.nil? pages << page end pages rescue *RESCUABLE_ERRORS => e if verbose? puts e.inspect puts e.backtrace end page = Page.new(url, error: e, referer: referer, depth: depth) page.user_data = user_data unless user_data.nil? [page] end
open_timeout()
click to toggle source
HTTP
open timeout in seconds
# File lib/polipus/http.rb, line 139 def open_timeout @opts[:open_timeout] end
proxy_host()
click to toggle source
The proxy address string
# File lib/polipus/http.rb, line 95 def proxy_host @opts[:proxy_host].respond_to?(:call) ? @opts[:proxy_host].call(self) : @opts[:proxy_host] end
proxy_host_port()
click to toggle source
Shorthand to get proxy info with a single call It returns an array of [‘addr’, port, ‘user’, ‘pass’]
# File lib/polipus/http.rb, line 125 def proxy_host_port @opts[:proxy_host_port].respond_to?(:call) ? @opts[:proxy_host_port].call(self) : @opts[:proxy_host_port] end
proxy_pass()
click to toggle source
The proxy password
# File lib/polipus/http.rb, line 116 def proxy_pass # return proxy_host_port[3] unless @opts[:proxy_host_port].nil? @opts[:proxy_pass].respond_to?(:call) ? @opts[:proxy_pass].call(self) : @opts[:proxy_pass] end
proxy_port()
click to toggle source
The proxy port
# File lib/polipus/http.rb, line 102 def proxy_port @opts[:proxy_port].respond_to?(:call) ? @opts[:proxy_port].call(self) : @opts[:proxy_port] end
proxy_user()
click to toggle source
The proxy username
# File lib/polipus/http.rb, line 109 def proxy_user @opts[:proxy_user].respond_to?(:call) ? @opts[:proxy_user].call(self) : @opts[:proxy_user] end
read_timeout()
click to toggle source
HTTP
read timeout in seconds
# File lib/polipus/http.rb, line 132 def read_timeout @opts[:read_timeout] end
redirect_limit()
click to toggle source
The maximum number of redirects to follow
# File lib/polipus/http.rb, line 76 def redirect_limit @opts[:redirect_limit] || REDIRECT_LIMIT end
user_agent()
click to toggle source
The user-agent string which will be sent with each request, or nil if no such option is set
# File lib/polipus/http.rb, line 84 def user_agent if @opts[:user_agent].respond_to?(:sample) @opts[:user_agent].sample else @opts[:user_agent] end end
Private Instance Methods
allowed?(to_url, from_url)
click to toggle source
Allowed to connect to the requested url?
# File lib/polipus/http.rb, line 276 def allowed?(to_url, from_url) to_url.host.nil? || (to_url.host == from_url.host) end
connection(url)
click to toggle source
# File lib/polipus/http.rb, line 225 def connection(url) @connections[url.host] ||= {} @connections_hits[url.host] ||= {} if @connections[url.host][url.port] if @opts[:connection_max_hits] && @connections_hits[url.host][url.port] >= @opts[:connection_max_hits] @opts[:logger].debug { "Connection #{url.host}:#{url.port} is staled, refreshing" } if @opts[:logger] return refresh_connection url end @connections_hits[url.host][url.port] += 1 return @connections[url.host][url.port] end refresh_connection url end
get(url, referer = nil) { |response, code, loc, redirect_to, response_time| ... }
click to toggle source
Retrieve HTTP
responses for url, including redirects. Yields the response object, response code, and URI location for each response.
# File lib/polipus/http.rb, line 161 def get(url, referer = nil) limit = redirect_limit loc = url loop do # if redirected to a relative url, merge it with the host of the original # request url loc = url.merge(loc) if loc.relative? response, response_time = get_response(loc, referer) code = Integer(response.code) redirect_to = begin response.is_a?(Net::HTTPRedirection) ? URI(response['location']).normalize : nil rescue URI::InvalidURIError => e @opts[:logger].debug { "Request #{url} got #{e}" } if @opts[:logger] nil end yield response, code, loc, redirect_to, response_time limit -= 1 break unless (loc = redirect_to) && allowed?(redirect_to, url) && limit > 0 end end
get_response(url, referer = nil)
click to toggle source
Get an HTTPResponse for url, sending the appropriate User-Agent string
# File lib/polipus/http.rb, line 187 def get_response(url, referer = nil) full_path = url.query.nil? ? url.path : "#{url.path}?#{url.query}" opts = {} opts['User-Agent'] = user_agent if user_agent opts['Referer'] = referer.to_s if referer opts['Cookie'] = ::HTTP::Cookie.cookie_value(cookie_jar.cookies(url)) if accept_cookies? opts['Accept-Encoding'] = 'gzip,deflate' retries = 0 begin start = Time.now # format request req = Net::HTTP::Get.new(full_path, opts) # HTTP Basic authentication req.basic_auth url.user, url.password if url.user if @opts[:http_user] req.basic_auth @opts[:http_user], @opts[:http_password] end # urls auth schema has higher priority req.basic_auth url.user, url.password if url.user response = connection(url).request(req) finish = Time.now response_time = ((finish - start) * 1000).round cookie_jar.parse(response['Set-Cookie'], url) if accept_cookies? && response['Set-Cookie'] return response, response_time rescue *RESCUABLE_ERRORS => e puts e.inspect if verbose? refresh_connection(url) retries += 1 if retries < 3 retry else raise e end end end
handle_compression(response)
click to toggle source
# File lib/polipus/http.rb, line 280 def handle_compression(response) case response['content-encoding'] when 'gzip', 'x-gzip' body_io = StringIO.new(response.body) response.body.replace Zlib::GzipReader.new(body_io).read when 'deflate' response.body.replace Zlib::Inflate.inflate(response.body) end end
refresh_connection(url)
click to toggle source
# File lib/polipus/http.rb, line 241 def refresh_connection(url) if @opts[:logger] && proxy_host && proxy_port @opts[:logger].debug { "Request #{url} using proxy: #{proxy_host}:#{proxy_port}" } end # Block has higher priority unless @opts[:proxy_host_port].nil? p_host, p_port, p_user, p_pass = proxy_host_port else p_host = proxy_host p_port = proxy_port p_user = proxy_user p_pass = proxy_pass end http = Net::HTTP.new(url.host, url.port, p_host, p_port, p_user, p_pass) http.read_timeout = read_timeout if read_timeout http.open_timeout = open_timeout if open_timeout if url.scheme == 'https' http.use_ssl = true http.verify_mode = OpenSSL::SSL::VERIFY_NONE end @connections_hits[url.host][url.port] = 1 @connections[url.host][url.port] = http.start end
verbose?()
click to toggle source
# File lib/polipus/http.rb, line 269 def verbose? @opts[:verbose] end