class Sunbro::HTTP
Constants
- REDIRECT_LIMIT
Maximum number of redirects to follow on each
get_response
Public Class Methods
new(opts = {})
click to toggle source
# File lib/sunbro/http.rb, line 15
#
# Build a new HTTP client.
#
# Sets the process-wide RestClient proxy from Sunbro::Settings.proxy_url,
# but only when no proxy has been configured on RestClient yet.
#
# @param opts [Hash] client options; this class reads :proxy_host,
#   :proxy_port, :read_timeout, :redirect_limit, :agent and :verbose from it
def initialize(opts = {})
  RestClient.proxy = Sunbro::Settings.proxy_url unless RestClient.proxy
  @connections = {}
  @opts = opts
end
Public Instance Methods
close()
click to toggle source
# File lib/sunbro/http.rb, line 21
#
# No-op, deprecated with the move to RestClient.
#
# @deprecated Does nothing.
# @return [true] always
def close
  true
end
convert_to_uri(url)
click to toggle source
Convert the link to a valid URI if possible
# File lib/sunbro/http.rb, line 78
#
# Convert +url+ to a URI, percent-escaping it first if it does not parse
# as-is.
#
# URI.escape was removed in Ruby 3.0, so the fallback goes through
# URI::RFC2396_Parser#escape, which implements the same escaping rules.
#
# @param url [String, URI] the link to convert
# @return [URI] the parsed URI
# @raise [URI::InvalidURIError] if the link is still invalid after escaping
def convert_to_uri(url)
  URI(url)
rescue URI::InvalidURIError
  URI(URI::RFC2396_Parser.new.escape(url))
end
fetch_page(url, opts={})
click to toggle source
Fetch a single Page
from the response of an HTTP
request to url. Just gets the final destination page.
# File lib/sunbro/http.rb, line 30
#
# Fetch only the final destination page for +url+.
#
# When fetch_pages returns exactly one page, that page's url is reset to the
# originally requested url. Otherwise the last page is returned with
# redirect_from set to the originally requested url.
#
# @param url [String, URI] the address to request
# @param opts [Hash] passed through unchanged to fetch_pages
# @return [Page] the last page returned by fetch_pages
def fetch_page(url, opts={})
  requested = url.dup
  pages = fetch_pages(url, opts)
  final = pages.last

  if pages.count == 1
    final.url = requested
  else
    final.redirect_from = requested
  end

  final
end
fetch_pages(url, opts={})
click to toggle source
Create new Pages from the response of an HTTP
request to url, including redirects
# File lib/sunbro/http.rb, line 48
#
# Request +url+ and build one Page per response, including each redirect hop.
#
# Errors raised while fetching are reported as a single Page carrying the
# error rather than being raised. Only StandardError is handled this way, so
# that Interrupt, SystemExit and similar are not swallowed.
#
# @param url [String, URI] the address to request; strings go through
#   convert_to_uri
# @param opts [Hash] :referer and :depth are passed on to each Page;
#   :force_format overrides default_page_format
# @return [Array<Page>] one Page per response, or a single error Page
def fetch_pages(url, opts={})
  referer, depth = opts[:referer], opts[:depth]
  force_format = opts[:force_format] || default_page_format

  begin
    url = convert_to_uri(url) unless url.is_a?(URI)
    pages = []
    get(url) do |response, code, location, redirect_to, response_time|
      pages << Page.new(location, :body => response.body.dup,
                                  :code => code,
                                  :headers => response.headers.stringify_keys,
                                  :referer => referer,
                                  :depth => depth,
                                  :redirect_to => redirect_to,
                                  :response_time => response_time,
                                  :force_format => force_format)
    end
    pages
  rescue StandardError => e
    if verbose?
      puts e.inspect
      puts e.backtrace
    end
    # url is whatever it was when the error occurred (String or URI).
    [Page.new(url, :error => e)]
  end
end
proxy_host()
click to toggle source
The proxy address string
# File lib/sunbro/http.rb, line 109
#
# Proxy address taken from the client options.
#
# @return [Object, nil] the value stored under :proxy_host in the options
def proxy_host
  @opts[:proxy_host]
end
proxy_port()
click to toggle source
The proxy port
# File lib/sunbro/http.rb, line 116
#
# Proxy port taken from the client options.
#
# @return [Object, nil] the value stored under :proxy_port in the options
def proxy_port
  @opts[:proxy_port]
end
read_timeout()
click to toggle source
HTTP
read timeout in seconds
# File lib/sunbro/http.rb, line 123
#
# HTTP read timeout, in seconds, taken from the client options.
#
# @return [Object, nil] the value stored under :read_timeout in the options
def read_timeout
  @opts[:read_timeout]
end
redirect_limit()
click to toggle source
The maximum number of redirects to follow
# File lib/sunbro/http.rb, line 87
#
# Maximum number of redirects to follow.
#
# @return [Object] the :redirect_limit option when it is truthy, otherwise
#   REDIRECT_LIMIT
def redirect_limit
  configured = @opts[:redirect_limit]
  configured || REDIRECT_LIMIT
end
user_agent()
click to toggle source
The user-agent string which will be sent with each request: the :agent option if it is set, otherwise the value of Settings.user_agent
# File lib/sunbro/http.rb, line 95
#
# User-agent string to send with each request.
#
# @return [Object] the :agent option when it is truthy, otherwise
#   Settings.user_agent
def user_agent
  custom = @opts[:agent]
  custom || Settings.user_agent
end
Private Instance Methods
allowed?(to_url, from_url)
click to toggle source
Allowed to connect to the requested url?
# File lib/sunbro/http.rb, line 213
#
# Decide whether a request to +to_url+ may be followed from +from_url+.
#
# A URL without a host (a relative link) is always allowed. Otherwise the two
# hosts must match, ignoring letter case and a leading "www." label. Only a
# "www." at the very start of the host is stripped, so "awww.example.com" is
# not confused with "aexample.com".
#
# Best effort: if the comparison itself fails (for example a nil host on
# +from_url+), the request is allowed.
#
# @param to_url [URI] the candidate destination
# @param from_url [URI] the URL the request originated from
# @return [Boolean]
def allowed?(to_url, from_url)
  return true if to_url.host.nil?

  bare_host = ->(host) { host.downcase.sub(/\Awww\./, '') }
  bare_host.call(to_url.host) == bare_host.call(from_url.host)
rescue StandardError
  true
end
connection(url)
click to toggle source
# File lib/sunbro/http.rb, line 187
#
# Return the cached connection for the host and port of +url+, creating one
# through refresh_connection when none is cached.
#
# @param url [URI] the URL whose host and port select the connection
# @return [Object] the cached or newly created connection
def connection(url)
  @connections[url.host] ||= {}
  @connections[url.host][url.port] || refresh_connection(url)
end
default_page_format()
click to toggle source
# File lib/sunbro/http.rb, line 221
#
# Page format to force on fetched pages, based on Settings.page_format.
#
# Only :xml and :html are forced; any other setting (such as :any) means no
# format is forced.
#
# @return [Symbol, nil] :xml or :html, or nil when no format is forced
def default_page_format
  format = Settings.page_format
  format if [:xml, :html].include?(format)
end
get(url) { |response, code, loc, redirect_to, response_time| ... }
click to toggle source
Retrieve HTTP
responses for url, including redirects. Yields the response object, response code, URI location, redirect target (nil when the response is not a redirect), and response time for each response.
# File lib/sunbro/http.rb, line 134
#
# Request +url+ and keep following redirects, yielding once per response.
#
# The loop stops when a response has no redirect target, when allowed?
# rejects the target, or when redirect_limit responses have been processed.
#
# @param url [URI] the URL to start from
# @yieldparam response [Object] the response returned by get_response
# @yieldparam code [Integer] the response code
# @yieldparam loc [URI] the location that was requested
# @yieldparam redirect_to [URI, nil] normalized redirect target, if any
# @yieldparam response_time [Object] the timing returned by get_response
# @return [nil]
def get(url)
  remaining = redirect_limit
  loc = url
  begin
    # A relative redirect target is resolved against the original request url.
    loc = url.merge(loc) if loc.relative?

    response, response_time = get_response(loc)
    code = Integer(response.code)

    if 300.upto(307).include?(response['code'])
      redirect_to = URI(response['location']).normalize
    else
      redirect_to = nil
    end

    yield response, code, loc, redirect_to, response_time
    remaining -= 1
  end while (loc = redirect_to) && allowed?(redirect_to, url) && remaining > 0
end
get_response(url, referer = nil)
click to toggle source
Get a RestResponse and the response time in milliseconds for url, sending the appropriate User-Agent string
# File lib/sunbro/http.rb, line 153
#
# Perform a single GET for +url+ and capture the result in a RestResponse.
#
# On Timeout::Error, Net::HTTPBadResponse or EOFError the connection is
# refreshed and the request retried, up to three retries. When the last
# attempt also fails the error is re-raised, so callers see the real failure
# rather than a nil result.
#
# @param url [URI] the URL to request
# @param referer [Object, nil] accepted but not used by this method
# @return [Array] the RestResponse and the elapsed time in milliseconds
# @raise [Timeout::Error, Net::HTTPBadResponse, EOFError] when every attempt
#   fails
def get_response(url, referer = nil)
  full_path = url.query.nil? ? url.path : "#{url.path}?#{url.query}"

  opts = {}
  opts[:headers] = { user_agent: user_agent } if user_agent

  retries = 0
  begin
    start = Time.now
    response = RestResponse.new

    # Passing a block causes RestClient to skip following the redirect
    # automatically.
    connection(url)[full_path].get(opts) do |res, _request, _result|
      response.body = res.body
      response.headers = res.headers
      response.code = res.code
      response.location = res.headers[:location]
    end

    finish = Time.now
    response_time = ((finish - start) * 1000).round
    response.clean!
    return response, response_time
  rescue Timeout::Error, Net::HTTPBadResponse, EOFError => e
    puts e.inspect if verbose?
    refresh_connection(url)
    retries += 1
    retry unless retries > 3
    # Retries exhausted: surface the original error.
    raise
  end
end
refresh_connection(url)
click to toggle source
# File lib/sunbro/http.rb, line 197
#
# Create a fresh RestClient::Resource for the scheme, host and port of +url+
# and store it in the connection cache, replacing any existing entry.
#
# The port is part of the resource URL because connections are cached per
# port; without it, requests to a non-default port would go to the scheme's
# default port.
#
# @param url [URI] the URL whose scheme, host and port identify the connection
# @return [RestClient::Resource] the newly created resource
def refresh_connection(url)
  # Create the per-host bucket so this method also works when called before
  # the host has been seen.
  @connections[url.host] ||= {}
  @connections[url.host][url.port] = RestClient::Resource.new(
    "#{url.scheme}://#{url.host}:#{url.port}",
    timeout: read_timeout || 5,
    # NOTE(review): certificate verification is disabled for every request;
    # confirm this is intentional.
    verify_ssl: OpenSSL::SSL::VERIFY_NONE
  )
end
verbose?()
click to toggle source
# File lib/sunbro/http.rb, line 206
#
# Whether diagnostic output was requested in the client options.
#
# @return [Object, nil] the value stored under :verbose in the options
def verbose?
  @opts[:verbose]
end