class URLCanonicalize::Request

Make an HTTP request

Constants

NETWORK_EXCEPTIONS

Attributes

http[R]
http_method[R]

Public Class Methods

new(http, http_method = :head) click to toggle source
# File lib/url_canonicalize/request.rb, line 31
# Stores the collaborators a request needs.
#
# @param http the object this request later asks for its URI (see #uri)
# @param http_method [Symbol] verb to use for the request; defaults to :head
def initialize(http, http_method = :head)
  @http, @http_method = http, http_method
end

Public Instance Methods

fetch() click to toggle source
# File lib/url_canonicalize/request.rb, line 6
# Public entry point: performs the request (lazily, through #response) and
# returns whatever #handle_response builds from the outcome.
def fetch
  handle_response()
end
location() click to toggle source
# File lib/url_canonicalize/request.rb, line 10
# Absolute form of the response's `location` header, memoized once it has
# resolved to a truthy value.
def location
  return @location if @location

  raw_location = response['location']
  @location = relative_to_absolute(raw_location)
end
with_uri(uri) click to toggle source
# File lib/url_canonicalize/request.rb, line 14
# Re-targets this request at a new URI so the same object can be reused.
#
# Every memoized value derived from the previous URI is cleared, including the
# canonical URL caches; otherwise a reused request would keep reporting the
# canonical URL found on the previous page.
#
# @param uri the new URI to request
# @return [self] to allow chaining, e.g. `with_uri(uri).fetch`
def with_uri(uri)
  @uri = uri

  @url = nil
  @host = nil
  @request = nil
  @response = nil
  @location = nil
  @html = nil

  # Canonical data belongs to the previous response as well.
  @canonical_url = nil
  @canonical_url_element = nil

  self
end

Private Instance Methods

base_request() click to toggle source
# File lib/url_canonicalize/request.rb, line 134
# Builds the bare Net::HTTP request object for the current verb.
#
# #check_http_method runs first because it may switch the verb to :get.
#
# @return [Net::HTTP::Head, Net::HTTP::Get]
# @raise [URLCanonicalize::Exception::Request] when the verb is neither :head nor :get
def base_request
  check_http_method

  request_class = { head: Net::HTTP::Head, get: Net::HTTP::Get }[http_method]
  unless request_class
    raise URLCanonicalize::Exception::Request, "Unknown method: #{http_method}"
  end

  request_class.new uri
end
canonical_url() click to toggle source
# File lib/url_canonicalize/request.rb, line 104
# Canonical URL for the current response in absolute form, or nil when none
# is known. A truthy result is memoized in @canonical_url.
def canonical_url
  return @canonical_url if @canonical_url

  @canonical_url = relative_to_absolute(canonical_url_raw)
end
canonical_url_element() click to toggle source
# File lib/url_canonicalize/request.rb, line 112
# First <link rel="canonical"> element under <head> of the parsed document,
# or nil when the document has none. A found element is memoized.
def canonical_url_element
  return @canonical_url_element if @canonical_url_element

  candidates = html.xpath('//head/link[@rel="canonical"]')
  @canonical_url_element = candidates.first
end
canonical_url_raw() click to toggle source
# File lib/url_canonicalize/request.rb, line 108
# The `href` of the canonical <link> element exactly as written in the page
# (it may be relative), or nil when the page has no such element.
#
# Deliberately does not touch @canonical_url: that ivar belongs to
# #canonical_url, which stores the resolved (absolute) value. No memo is
# needed here because the element lookup itself is already cached.
def canonical_url_raw
  element = canonical_url_element
  element['href'] if element.is_a?(Nokogiri::XML::Element)
end
check_http_method() click to toggle source

Some sites treat HEAD requests as suspicious activity and block the requester after a few attempts. For these sites, we use GET requests only.

# File lib/url_canonicalize/request.rb, line 167
# Some sites treat HEAD requests as suspicious activity and block the
# requester after a few attempts. For these sites we switch to GET.
#
# The dot is escaped so that only a literal ".com" matches (the unescaped
# form also matched hosts such as "linkedinxcom.example.org"), and the match
# ignores case because host names are case-insensitive.
def check_http_method
  @http_method = :get if /(?:linkedin|crunchbase)\.com/i =~ host
end
enhanced_response() click to toggle source
# File lib/url_canonicalize/request.rb, line 90
# Wraps a successful response.
#
# Without a canonical URL the result is a plain Success for the requested
# URL. With one, the Success is built for the canonical URL and wrapped in a
# CanonicalFound. The canonical URL is echoed to stdout when ENV['DEBUG'] is set.
def enhanced_response
  return URLCanonicalize::Response::Success.new(url, response, html) unless canonical_url

  puts "  * canonical_url:\t#{canonical_url}" if ENV['DEBUG']
  canonical_success = URLCanonicalize::Response::Success.new(canonical_url, response, html)
  URLCanonicalize::Response::CanonicalFound.new(canonical_url, canonical_success)
end
handle_failure(klass = response.class, message = response.message) click to toggle source
# File lib/url_canonicalize/request.rb, line 86
# Builds a Failure result.
#
# @param klass class describing the failure; defaults to the response's class
# @param message text describing the failure; defaults to the response's message
def handle_failure(klass = response.class, message = response.message)
  failure_type = URLCanonicalize::Response::Failure
  failure_type.new(klass, message)
end
handle_redirection() click to toggle source
# File lib/url_canonicalize/request.rb, line 73
# Decides what to do with a redirection response.
#
# The temporary redirect classes listed below are handled like a success for
# the current URL. Any other redirection yields a Redirect to the resolved
# `location`, or a Failure carrying the raw header when it cannot be resolved.
def handle_redirection
  temporary_kinds = [Net::HTTPFound, Net::HTTPMovedTemporarily, Net::HTTPTemporaryRedirect]
  return handle_success if temporary_kinds.any? { |kind| response.is_a?(kind) }

  # Permanent redirection
  target = location
  return URLCanonicalize::Response::Redirect.new(target) if target

  URLCanonicalize::Response::Failure.new(::URI::InvalidURIError, response['location'])
end
handle_response() click to toggle source
# File lib/url_canonicalize/request.rb, line 49
# Logs the response and dispatches on its class: success, redirection, or
# anything else (failure). Errors listed in NETWORK_EXCEPTIONS are turned into
# a failure result built from the exception's class and message.
def handle_response
  log_response

  outcome = response
  if outcome.is_a?(Net::HTTPSuccess)
    handle_success
  elsif outcome.is_a?(Net::HTTPRedirection)
    handle_redirection
  else
    handle_failure
  end
rescue *NETWORK_EXCEPTIONS => error
  handle_failure(error.class, error.message)
end
handle_success() click to toggle source
# File lib/url_canonicalize/request.rb, line 64
# Handles a successful response.
#
# A canonical URL announced in the `Link` response header takes precedence.
# If a canonical URL is known, or the page was already fetched with GET, the
# enhanced response is returned; otherwise the request is repeated with GET
# so the body is available to look for a canonical <link> element.
def handle_success
  # [^>]+ keeps the capture inside a single <...> target; a greedy .+ would
  # swallow earlier entries when the header lists several links.
  link_match = /<(?<url>[^>]+)>\s*;\s*rel="canonical"/i.match(response['link'].to_s)

  # Link targets may be relative, so resolve them like any other canonical URL.
  @canonical_url = relative_to_absolute(link_match[:url]) if link_match

  return enhanced_response if canonical_url || http_method == :get

  self.http_method = :get
  fetch
end
headers() click to toggle source
# File lib/url_canonicalize/request.rb, line 147
# Request headers sent with every request (built once, then reused): an
# English Accept-Language and a desktop Chrome User-Agent string.
def headers
  return @headers if @headers

  user_agent = 'Mozilla/5.0 (Windows NT 10.0; WOW64) ' \
               'AppleWebKit/537.36 (KHTML, like Gecko) ' \
               'Chrome/51.0.2704.103 Safari/537.36'

  @headers = {
    'Accept-Language' => 'en-US,en;q=0.8',
    'User-Agent' => user_agent
  }
end
host() click to toggle source
# File lib/url_canonicalize/request.rb, line 124
# Host component of the current URI, memoized once truthy.
def host
  return @host if @host

  @host = uri.host
end
html() click to toggle source
# File lib/url_canonicalize/request.rb, line 100
# Response body parsed by Nokogiri::HTML, memoized once truthy.
def html
  return @html if @html

  @html = Nokogiri::HTML(response.body)
end
http_method=(value) click to toggle source
# File lib/url_canonicalize/request.rb, line 156
# Changes the verb and discards the cached request, response, location and
# parsed document, all of which were produced with the previous verb.
def http_method=(value)
  @http_method = value
  @request = @response = @location = @html = nil
end
log_response() click to toggle source
# File lib/url_canonicalize/request.rb, line 184
# Debug logging, active only when ENV['DEBUG'] is set.
#
# Always prints one summary line (verb, URL, status code and message). The
# response headers are printed as well only when DEBUG equals "headers"
# (case-insensitive).
def log_response
  debug = ENV['DEBUG']
  return unless debug

  puts "#{http_method.upcase} #{url} #{response.code} #{response.message}"

  # casecmp returns -1/0/1, all of which are truthy, so compare with zero.
  return unless debug.casecmp('headers').zero?

  response.each { |k, v| puts "  #{k}:\t#{v}" }
end
relative_to_absolute(partial_url) click to toggle source
# File lib/url_canonicalize/request.rb, line 171
# Resolves a possibly relative URL against the current request URI.
#
# @param partial_url [String, nil] absolute, relative or scheme-relative URL
# @return [String, nil] the absolute URL; nil when the input is nil or cannot
#   be parsed or joined
def relative_to_absolute(partial_url)
  return unless partial_url

  partial_uri = ::URI.parse(partial_url)

  # Only a URL with both scheme and host is already absolute. A
  # scheme-relative URL ("//host/path") has a host but still needs the scheme
  # of the current URI, so it goes through the join below.
  return partial_url if partial_uri.scheme && partial_uri.host

  ::URI.join((uri || url), partial_url).to_s
rescue ::URI::InvalidURIError, ::URI::BadURIError
  nil
end
request() click to toggle source
# File lib/url_canonicalize/request.rb, line 45
# The prepared request object for the current verb, memoized once built.
def request
  return @request if @request

  @request = request_for_method
end
request_for_method() click to toggle source
# File lib/url_canonicalize/request.rb, line 128
# Builds the base request and copies every entry of #headers onto it.
#
# @return the request object produced by #base_request, with headers applied
def request_for_method
  base_request.tap do |prepared|
    headers.each { |name, value| prepared[name] = value }
  end
end
response() click to toggle source
# File lib/url_canonicalize/request.rb, line 36
# The HTTP response for the current request. The request is performed on
# first access (via #do_http_request) and the result reused afterwards.
def response
  return @response if @response

  @response = do_http_request
end
uri() click to toggle source
# File lib/url_canonicalize/request.rb, line 116
# URI being requested: the one set through #with_uri if any, otherwise the
# one supplied by the http object.
def uri
  return @uri if @uri

  @uri = http.uri
end
url() click to toggle source
# File lib/url_canonicalize/request.rb, line 120
# String form of the current URI, memoized once truthy.
def url
  return @url if @url

  @url = uri.to_s
end