class URLCanonicalize::Request
Make an HTTP request
Constants
- NETWORK_EXCEPTIONS
Attributes
http[R]
http_method[R]
Public Class Methods
new(http, http_method = :head)
click to toggle source
# File lib/url_canonicalize/request.rb, line 31
#
# Builds a request wrapper.
#
# @param http the object exposed through the +http+ reader; other methods
#   in this class call +http.uri+ on it
# @param http_method [Symbol] HTTP verb to start with (defaults to +:head+)
def initialize(http, http_method = :head)
  @http, @http_method = http, http_method
end
Public Instance Methods
fetch()
click to toggle source
# File lib/url_canonicalize/request.rb, line 6
#
# Public entry point: returns whatever +handle_response+ produces for the
# current request.
def fetch
  handle_response
end
location()
click to toggle source
# File lib/url_canonicalize/request.rb, line 10
#
# The response's +location+ header passed through +relative_to_absolute+.
# A truthy result is cached in +@location+.
def location
  @location ||= begin
    header_value = response['location']
    relative_to_absolute(header_value)
  end
end
with_uri(uri)
click to toggle source
# File lib/url_canonicalize/request.rb, line 14
#
# Points this request at a new URI and drops every value cached for the
# previous one, so the object can be reused.
#
# @param uri the new target URI
# @return [self] to allow chaining
def with_uri(uri)
  @uri = uri
  @url = nil
  @host = nil
  @request = nil
  @response = nil
  @location = nil
  @html = nil
  # The canonical-URL caches are derived from the response and its HTML, so
  # they must be cleared too; otherwise the previous page's canonical URL
  # would be reported for the new URI.
  @canonical_url = nil
  @canonical_url_element = nil
  self
end
Private Instance Methods
base_request()
click to toggle source
# File lib/url_canonicalize/request.rb, line 134
#
# Builds the bare Net::HTTP request object for the current +http_method+
# and +uri+. +check_http_method+ runs first, so it may change the verb.
#
# @raise [URLCanonicalize::Exception::Request] if the verb is neither
#   +:head+ nor +:get+
def base_request
  check_http_method

  request_class = { head: Net::HTTP::Head, get: Net::HTTP::Get }[http_method]
  unless request_class
    raise URLCanonicalize::Exception::Request, "Unknown method: #{http_method}"
  end

  request_class.new uri
end
canonical_url()
click to toggle source
# File lib/url_canonicalize/request.rb, line 104
#
# The canonical URL for the current response. Returns the cached
# +@canonical_url+ when it is set; otherwise passes +canonical_url_raw+
# through +relative_to_absolute+ and caches the result.
def canonical_url
  @canonical_url ||= begin
    raw = canonical_url_raw
    relative_to_absolute(raw)
  end
end
canonical_url_element()
click to toggle source
# File lib/url_canonicalize/request.rb, line 112
#
# First node matching <link rel="canonical"> under <head> in the parsed
# document, or nil when nothing matches. A found node is cached.
def canonical_url_element
  @canonical_url_element ||= begin
    matches = html.xpath('//head/link[@rel="canonical"]')
    matches.first
  end
end
canonical_url_raw()
click to toggle source
# File lib/url_canonicalize/request.rb, line 108
#
# The unresolved +href+ of the canonical <link> element.
#
# This method deliberately does not touch +@canonical_url+: that variable
# caches the resolved URL and belongs to +canonical_url+. Writing the raw
# href into it from here would make this method return the resolved cache
# instead of the href whenever the cache is already populated. No separate
# cache is needed because +canonical_url_element+ is already memoized.
#
# @return [String, nil] the href, or nil when there is no canonical element
def canonical_url_raw
  element = canonical_url_element
  element['href'] if element.is_a?(Nokogiri::XML::Element)
end
check_http_method()
click to toggle source
Some sites treat HEAD requests as suspicious activity and block the requester after a few attempts. For these sites we'll use GET requests only.
# File lib/url_canonicalize/request.rb, line 167
#
# Forces the verb to +:get+ for hosts on the get-only list (linkedin.com,
# crunchbase.com and their subdomains). Any other host, including nil,
# leaves +@http_method+ untouched.
#
# The pattern escapes the dot and anchors on a domain-label boundary, so
# look-alike hosts such as "linkedinxcom.example.org" or "notlinkedin.com"
# are not matched. Matching is case-insensitive.
def check_http_method
  get_only_host = /(?:\A|\.)(?:linkedin|crunchbase)\.com\z/i
  @http_method = :get if get_only_host =~ host
end
enhanced_response()
click to toggle source
# File lib/url_canonicalize/request.rb, line 90
#
# Wraps the current response in a result object. Without a canonical URL
# this is a plain Success for +url+. With one, it is a CanonicalFound that
# carries a Success built for the canonical URL.
# When ENV['DEBUG'] is set the canonical URL is also printed.
def enhanced_response
  canonical = canonical_url
  return URLCanonicalize::Response::Success.new(url, response, html) unless canonical

  puts " * canonical_url:\t#{canonical}" if ENV['DEBUG']

  success = URLCanonicalize::Response::Success.new(canonical, response, html)
  URLCanonicalize::Response::CanonicalFound.new(canonical, success)
end
handle_failure(klass = response.class, message = response.message)
click to toggle source
# File lib/url_canonicalize/request.rb, line 86
#
# Builds a Failure result.
#
# @param klass class describing the failure (defaults to the response's class)
# @param message text describing the failure (defaults to the response's message)
def handle_failure(klass = response.class, message = response.message)
  failure_class = URLCanonicalize::Response::Failure
  failure_class.new(klass, message)
end
handle_redirection()
click to toggle source
# File lib/url_canonicalize/request.rb, line 73
#
# Decides what a redirection response means.
#
# * Temporary redirects (Found / MovedTemporarily / TemporaryRedirect) are
#   handed to +handle_success+.
# * Every other redirection is treated as permanent: a Redirect result when
#   +location+ resolves, otherwise a Failure carrying the raw header.
def handle_redirection
  temporary_classes = [Net::HTTPFound, Net::HTTPMovedTemporarily, Net::HTTPTemporaryRedirect]
  return handle_success if temporary_classes.any? { |klass| response.is_a?(klass) }

  return URLCanonicalize::Response::Redirect.new(location) if location

  URLCanonicalize::Response::Failure.new(::URI::InvalidURIError, response['location'])
end
handle_response()
click to toggle source
# File lib/url_canonicalize/request.rb, line 49
#
# Logs the response, then dispatches on its class: success, redirection,
# or anything else (failure). Exceptions listed in NETWORK_EXCEPTIONS are
# converted into a failure result instead of propagating.
def handle_response
  log_response

  reply = response
  if reply.is_a?(Net::HTTPSuccess)
    handle_success
  elsif reply.is_a?(Net::HTTPRedirection)
    handle_redirection
  else
    handle_failure
  end
rescue *NETWORK_EXCEPTIONS => e
  handle_failure(e.class, e.message)
end
handle_success()
click to toggle source
# File lib/url_canonicalize/request.rb, line 64
#
# Handles a successful (or temporarily redirected) response.
#
# A +Link+ header containing <url>; rel="canonical" sets +@canonical_url+.
# If a canonical URL is then known, or the request was already a GET, the
# enhanced response is returned. Otherwise the verb is switched to GET and
# the request is fetched again.
def handle_success
  link_header = response['link'].to_s

  # [^>]+ rather than .+ so that, in a header listing several links, the
  # capture stops at the closing bracket of the canonical link itself
  # instead of spanning from the first "<" in the header.
  canonical_link = link_header.match(/<(?<url>[^>]+)>\s*;\s*rel="canonical"/i)
  @canonical_url = canonical_link[:url] if canonical_link

  return enhanced_response if canonical_url || http_method == :get

  self.http_method = :get
  fetch
end
headers()
click to toggle source
# File lib/url_canonicalize/request.rb, line 147
#
# Header name/value pairs copied onto every outgoing request by
# +request_for_method+. Built once and cached in +@headers+.
def headers
  return @headers if @headers

  user_agent = 'Mozilla/5.0 (Windows NT 10.0; WOW64) ' \
               'AppleWebKit/537.36 (KHTML, like Gecko) ' \
               'Chrome/51.0.2704.103 Safari/537.36'

  @headers = {
    'Accept-Language' => 'en-US,en;q=0.8',
    'User-Agent' => user_agent
  }
end
host()
click to toggle source
# File lib/url_canonicalize/request.rb, line 124
#
# Host component of the current +uri+, cached in +@host+ once truthy.
def host
  return @host if @host

  @host = uri.host
end
html()
click to toggle source
# File lib/url_canonicalize/request.rb, line 100
#
# The response body parsed with Nokogiri::HTML, cached in +@html+.
def html
  @html ||= begin
    body = response.body
    Nokogiri::HTML(body)
  end
end
http_method=(value)
click to toggle source
# File lib/url_canonicalize/request.rb, line 156
#
# Changes the HTTP verb and clears the cached request, response, location
# and parsed HTML so they are rebuilt on next access.
def http_method=(value)
  @http_method = value
  @request = @response = @location = @html = nil
end
log_response()
click to toggle source
# File lib/url_canonicalize/request.rb, line 184
#
# Debug logging controlled by ENV['DEBUG']:
#
# * unset           - prints nothing
# * any value       - prints one status line (verb, URL, code, message)
# * "headers" (any case) - additionally prints every response header
def log_response
  debug = ENV['DEBUG']
  return unless debug

  puts "#{http_method.upcase} #{url} #{response.code} #{response.message}"

  # String#casecmp returns 0 on a match, and 0 is truthy in Ruby, so the
  # result must be tested with zero? rather than used as a boolean.
  return unless debug.casecmp('headers').zero?

  response.each { |k, v| puts " #{k}:\t#{v}" }
end
relative_to_absolute(partial_url)
click to toggle source
# File lib/url_canonicalize/request.rb, line 171
#
# Resolves +partial_url+ against the current request URI.
#
# A URL that has both a scheme and a host is returned unchanged. Anything
# else is joined onto +uri+ (or +url+). That includes scheme-relative URLs
# such as "//host/path", which have a host but are not absolute.
#
# @param partial_url [String, nil]
# @return [String, nil] the absolute URL; nil when +partial_url+ is nil or
#   when parsing/joining raises a URI error
def relative_to_absolute(partial_url)
  return unless partial_url

  partial_uri = ::URI.parse(partial_url)
  return partial_url if partial_uri.absolute? && partial_uri.host

  ::URI.join((uri || url), partial_url).to_s
rescue ::URI::Error
  # URI::Error is the parent of InvalidURIError (unparseable input) and
  # BadURIError (raised by join when the base is itself relative).
  nil
end
request()
click to toggle source
# File lib/url_canonicalize/request.rb, line 45
#
# The outgoing request object, built by +request_for_method+ on first use
# and cached in +@request+.
def request
  return @request if @request

  @request = request_for_method
end
request_for_method()
click to toggle source
# File lib/url_canonicalize/request.rb, line 128
#
# Takes the object returned by +base_request+, copies every entry of
# +headers+ onto it via []=, and returns it.
def request_for_method
  base_request.tap do |outgoing|
    headers.each { |name, value| outgoing[name] = value }
  end
end
response()
click to toggle source
# File lib/url_canonicalize/request.rb, line 36
#
# The HTTP response, obtained from +do_http_request+ (not shown in this
# listing) on first use and cached in +@response+.
def response
  return @response if @response

  @response = do_http_request
end
uri()
click to toggle source
# File lib/url_canonicalize/request.rb, line 116
#
# The target URI: the one set through +with_uri+ if any, otherwise
# +http.uri+, which is then cached in +@uri+.
def uri
  return @uri if @uri

  @uri = http.uri
end
url()
click to toggle source
# File lib/url_canonicalize/request.rb, line 120
#
# String form of +uri+, cached in +@url+.
def url
  return @url if @url

  @url = uri.to_s
end