module Wombat::Processing::Parser

Constants

HTTP_METHODS

Attributes

context[RW]
mechanize[RW]
page[RW]
response_code[RW]

Public Class Methods

new()
# File lib/wombat/processing/parser.rb, line 29
# Creates the Mechanize agent stored in @mechanize and applies the optional
# Wombat-level settings (proxy arguments, user agent, user agent alias) when
# they are set.
def initialize
  # Post-connect hook: a response whose content type is nil or empty is
  # given 'text/html'. Background:
  # http://stackoverflow.com/questions/6918277/ruby-mechanize-web-scraper-library-returns-file-instead-of-page
  default_to_html = lambda do |_, _, response, _|
    declared_type = response.content_type
    response.content_type = 'text/html' if declared_type.nil? || declared_type.empty?
  end
  @mechanize = Mechanize.new { |agent| agent.post_connect_hooks << default_to_html }

  proxy_args = Wombat.proxy_args
  @mechanize.set_proxy(*proxy_args) if proxy_args

  user_agent = Wombat.user_agent
  @mechanize.user_agent = user_agent if user_agent

  user_agent_alias = Wombat.user_agent_alias
  @mechanize.user_agent_alias = user_agent_alias if user_agent_alias
end

Public Instance Methods

parse(metadata, url=nil)
# File lib/wombat/processing/parser.rb, line 43
# Builds the parsing context for +metadata+ (stored in @context) and hands it,
# together with the Mechanize agent, to the locator chosen for +metadata+.
#
# metadata - property description passed to parser_for and to the locator
#            factory.
# url      - optional URL forwarded to parser_for (nil by default).
#
# Returns whatever the locator's #locate returns.
def parse(metadata, url=nil)
  @context = parser_for(metadata, url)

  locator = Wombat::Property::Locators::Factory.locator_for(metadata)
  locator.locate(@context, @mechanize)
end

Private Instance Methods

decode_body()
# File lib/wombat/processing/parser.rb, line 81
# Returns the body of @page, gunzipping it first when it is gzip data.
#
# Gzip is detected by comparing the first two raw bytes with the gzip magic
# number (0x1F 0x8B). Bytes are compared instead of calling
# String#start_with? with a binary prefix, because that call raises
# Encoding::CompatibilityError when the body holds non-ASCII characters in
# another encoding (for example UTF-8 XML).
def decode_body
  body = @page.body
  return body unless body.getbyte(0) == 0x1F && body.getbyte(1) == 0x8B

  # The block form closes the reader once the payload has been read.
  Zlib::GzipReader.wrap(StringIO.new(body), &:read)
end
method_from(_method)
# File lib/wombat/processing/parser.rb, line 90
# Normalises an HTTP method name to an entry of HTTP_METHODS.
#
# _method - method name responding to #downcase (e.g. "POST" or :post), or
#           nil.
#
# Returns the downcased symbol when it is listed in HTTP_METHODS, and :get
# when _method is nil or not listed.
def method_from(_method)
  return :get if _method.nil?

  requested = _method.downcase.to_sym
  HTTP_METHODS.include?(requested) ? requested : :get
end
parser_for(metadata, url)
# File lib/wombat/processing/parser.rb, line 50
# Obtains the page described by +metadata+ and returns a parser object for it.
#
# metadata - hash-like object read for :base_url, :path, :http_method, :data,
#            :page and :document_format.
# url      - explicit URL; when nil it is built from :base_url and :path.
#
# A page supplied in metadata[:page] is used as is; otherwise the page is
# requested through Mechanize (:html documents) or RestClient (anything
# else). @page is set to the page in use and @response_code to its integer
# code when the page responds to #code.
#
# Returns the parser with its headers (and, for HTML, mechanize_page) set.
# Any StandardError raised while fetching or parsing is re-raised after
# @response_code has been taken from the error's http_code or response_code,
# when it offers one.
def parser_for(metadata, url)
  url ||= "#{metadata[:base_url]}#{metadata[:path]}"
  http_verb = method_from(metadata[:http_method])
  request_args = [url, metadata[:data]].compact

  begin
    @page = metadata[:page]

    document =
      if metadata[:document_format] == :html
        @page ||= @mechanize.public_send(http_verb, *request_args)
        # @page is a Mechanize::Page here; its parser is a Nokogiri::HTML::Document.
        @page.parser.tap do |html|
          html.mechanize_page = @page
          html.headers = @page.header
        end
      else
        @page ||= RestClient.public_send(http_verb, *request_args)
        Nokogiri::XML(decode_body).tap { |xml| xml.headers = @page.headers }
      end

    @response_code = @page.code.to_i if @page.respond_to?(:code)
    document
  rescue => error
    # http_code takes precedence over response_code when an error has both.
    status_reader = %i[http_code response_code].find { |reader| error.respond_to?(reader) }
    @response_code = error.public_send(status_reader).to_i if status_reader
    raise
  end
end