module GoGetter

Constants

USER_AGENTS

Some user agents for use with websites that change their behavior according to your browser. Set one by adding it to http_headers: "User-Agent" => USER_AGENTS. Use www.useragentstring.com/pages/useragentstring.php to find more user agent strings.

Public Class Methods

get(uri, http_headers = {}, params = {}) click to toggle source
# File lib/go_getter/go_getter.rb, line 11
# Performs an HTTP(S) GET request and returns the response.
#
# uri          - a URI object, or any object whose +to_s+ is a URL string
#                (non-URI values are run through GoGetter.parse_url).
# http_headers - Hash of header name => value pairs added to the request.
# params       - Hash of options, all optional:
#                :auth_user, :auth_pass   - HTTP basic auth (used only when
#                                           both are given)
#                :proxy_host, :proxy_port - proxy to connect through (used
#                                           only when both are given)
#                :proxy_user, :proxy_pass - proxy credentials
#                :read_timeout            - read timeout, default 600
#                :max_redirects           - redirects to follow, default 1
#                :verify_mode             - OpenSSL verify mode for https,
#                                           default OpenSSL::SSL::VERIFY_NONE
#
# The caller's +params+ hash and +uri+ object are left unmodified.
#
# Returns the response object. For non-redirect responses +final_uri+ is set
# to the requested URI (the +final_uri=+ writer is defined outside this
# method); redirect responses are handed to GoGetter.handle_redirection.
def GoGetter.get(uri, http_headers = {}, params = {})
  uri = parse_url(uri.to_s) unless uri.is_a? URI

  # Work on a copy: redirect handling stores bookkeeping (:max_redirects,
  # :uris_seen) in this hash, which must not leak into a hash the caller
  # may reuse for another request.
  params = params.dup

  # Build the request path without touching uri.path itself; appending to
  # that string in place would corrupt the URI (duplicated query in to_s).
  path = uri.path.to_s.empty? ? "/" : uri.path
  path = "#{path}?#{uri.query}" if uri.query
  request = Net::HTTP::Get.new(path)
  http_headers.each { |key, value| request.add_field key, value }

  # basic authentication
  if params[:auth_user] && params[:auth_pass]
    request.basic_auth(params[:auth_user], params[:auth_pass])
  end

  # proxy
  klass =
    if params[:proxy_host] && params[:proxy_port]
      Net::HTTP::Proxy(params[:proxy_host], params[:proxy_port],
                       params[:proxy_user], params[:proxy_pass])
    else
      Net::HTTP
    end

  # SSL
  # SECURITY: certificate verification is disabled by default (kept for
  # backward compatibility). Pass verify_mode: OpenSSL::SSL::VERIFY_PEER
  # in params to enable it.
  opt = {}
  if uri.scheme == "https"
    opt = {
      use_ssl: true,
      verify_mode: params.fetch(:verify_mode, OpenSSL::SSL::VERIFY_NONE)
    }
  end

  response = klass.start(uri.host, uri.port, opt) do |http|
    http.read_timeout = params.fetch(:read_timeout, 600)
    http.request(request)
  end

  if response.is_a?(Net::HTTPRedirection) # Redirect
    # allow for a single redirection by default
    params[:max_redirects] = 1 unless params.key?(:max_redirects)
    response = handle_redirection(uri, response, http_headers, params)
  else
    response.final_uri = uri
  end

  response
end
handle_redirection(from_uri, response, http_headers, params) click to toggle source
# File lib/go_getter/go_getter.rb, line 57
# Follows an HTTP redirect, if the remaining redirect budget allows it.
#
# from_uri     - URI object of the request that produced +response+.
# response     - the redirect response; only its 'Location' header is read.
# http_headers - request headers, passed through to GoGetter.get.
# params       - options Hash, passed through to GoGetter.get. It is updated
#                in place: :max_redirects is decremented for each redirect
#                followed, and :uris_seen (a Set) collects the URIs that
#                have already been requested.
#
# Returns whatever GoGetter.get returns for the new location, or +response+
# unchanged when there is no Location header, the budget (:max_redirects,
# default 0) is used up, or the target has already been visited.
def GoGetter.handle_redirection(from_uri, response, http_headers, params)
  location = response['Location']
  return response unless location && params.fetch(:max_redirects, 0) > 0

  seen = (params[:uris_seen] ||= Set.new)
  seen << from_uri

  # Resolve the Location against the requesting URI. This covers absolute
  # URLs, host-less paths ("/a"), relative paths ("a") and scheme-less
  # references ("//host/a"), filling in scheme/host/port from from_uri.
  new_uri = from_uri.merge(location)

  # avoid infinite redirect loops
  return response if seen.member?(new_uri)

  # :max_redirects alone is the budget; it is decremented once per hop, so
  # exactly that many redirects are followed.
  params[:max_redirects] -= 1
  # request the new location just as we did the old one.
  GoGetter.get(new_uri, http_headers, params)
end
parse_url(url) click to toggle source

Given a URL that may not be properly formatted (for example, one missing its http:// scheme), parse it into a URI.

# File lib/go_getter/go_getter.rb, line 46
# Parses a possibly sloppy URL string into a URI.
#
# url - the URL; surrounding whitespace is ignored, and "http://" is
#       prepended when the string does not already start with http:// or
#       https:// (case-insensitive).
#
# Returns the parsed URI. An empty path is normalized to "/", so the result
# can be used directly as an HTTP request target.
# Raises whatever URI.parse raises for strings it cannot parse.
def GoGetter.parse_url(url)
  url = url.to_s.strip
  # \A anchors at the very start of the string (^ would match any line).
  url = "http://#{url}" unless url =~ %r{\Ahttps?://}i
  uri = URI.parse(url)
  # Also applies when a query is present ("example.com?x=1"): the request
  # path must never be empty.
  uri.path = "/" if uri.path.to_s.empty?
  uri
end