class Spidey::AbstractSpider

Constants

DEFAULT_REQUEST_INTERVAL: the default number of seconds to wait between requests (3)

Attributes

errors[RW]: hashes describing failures during crawling, each with url:, handler:, and error: keys
handlers[RW]: map of each queued URL to its [handler, default_data] pair
request_interval[RW]: number of seconds to sleep between requests
results[RW]: data recorded via record
urls[RW]: queue of URLs awaiting handling

Public Class Methods

handle(url, handler, default_data = {})

Declares a starting URL and the instance method that will handle its page. default_data, if given, is passed to the handler along with the fetched page.
# File lib/spidey/abstract_spider.rb, line 10
def self.handle(url, handler, default_data = {})
  start_urls << url
  handlers[url] = [handler, default_data]
end
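
For example, a subclass might register its starting point like this (the class name, URL, and handler name are illustrative):

class MySpider < Spidey::AbstractSpider
  handle "http://www.example.com", :process_home
end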
new(attrs = {})

Accepts:

request_interval: number of seconds to wait between requests (default: 3)
# File lib/spidey/abstract_spider.rb, line 17
def initialize(attrs = {})
  @urls = []
  @handlers = {}
  @results = []
  self.class.start_urls.each { |url| handle url, *self.class.handlers[url] }
  @request_interval = attrs[:request_interval] || DEFAULT_REQUEST_INTERVAL
end
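
A minimal instantiation sketch, assuming the MySpider subclass above:

spider = MySpider.new(request_interval: 1)  # wait 1 second between requests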

Private Class Methods

handlers()

Map of registered start URLs to their [handler, default_data] pairs, kept at the class level.
# File lib/spidey/abstract_spider.rb, line 94
def self.handlers
  @handlers ||= {}
end
start_urls()

URLs registered via the handle class method.
# File lib/spidey/abstract_spider.rb, line 90
def self.start_urls
  @start_urls ||= []
end

Public Instance Methods

crawl(options = {})

Iterates through URLs queued for handling, including any that are added in the course of crawling. Accepts:

max_urls: maximum number of URLs to crawl before returning (optional)
# File lib/spidey/abstract_spider.rb, line 27
def crawl(options = {})
  @errors = []
  i = 0
  each_url do |url, handler, default_data|
    break if options[:max_urls] && i >= options[:max_urls]
    begin
      page = agent.get(url)
      Spidey.logger.info "Handling #{url.inspect}"
      send handler, page, default_data
    rescue => ex
      add_error url: url, handler: handler, error: ex
    end
    sleep request_interval if request_interval > 0
    i += 1
  end
end
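
Putting it together, a crawl run might look like this (names are illustrative):

spider = MySpider.new(request_interval: 0)
spider.crawl(max_urls: 10)
spider.results.each { |data| puts data.inspect }
spider.errors.each  { |e| puts "#{e[:url]} failed: #{e[:error].message}" }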

Protected Instance Methods

add_error(attrs)

Override this for custom error-handling.

# File lib/spidey/abstract_spider.rb, line 69
def add_error(attrs)
  @errors << attrs
  Spidey.logger.error "Error on #{attrs[:url]}. #{attrs[:error].class}: #{attrs[:error].message}"
end
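
A subclass could also fail fast instead of just logging; this sketch keeps the default bookkeeping by calling super (the environment variable is illustrative):

def add_error(attrs)
  super
  raise attrs[:error] if ENV['SPIDEY_RAISE']  # re-raise in strict mode
end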
clean(str)

Replaces all Unicode whitespace with ASCII spaces, collapses runs of spaces, and strips leading and trailing whitespace. Returns nil if given nil.

# File lib/spidey/abstract_spider.rb, line 79
def clean(str)
  return nil unless str
  str.gsub(/\p{Space}/, ' ').strip.squeeze(' ')
end
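
Called from within a handler, for example:

clean("  Widget\u00A0\u00A0Pro \n")  # => "Widget Pro" (non-breaking spaces become one ASCII space)
clean(nil)                           # => nil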
each_url() { |url, handler, default_data| ... }

Override this for custom storage or prioritization of crawled URLs. Iterates through URL queue, yielding the URL, handler, and default data.

# File lib/spidey/abstract_spider.rb, line 56
def each_url(&block)
  urls.each do |url|
    yield url, handlers[url].first, handlers[url].last
  end
end
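
As a sketch of custom prioritization, a subclass could drain the queue explicitly, preferring certain pages while still picking up URLs added mid-crawl (the '/detail/' pattern is illustrative):

def each_url(&block)
  until urls.empty?
    url = urls.find { |u| u.include?('/detail/') } || urls.first
    urls.delete(url)  # consume the queue rather than iterating in place
    yield url, handlers[url].first, handlers[url].last
  end
end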
handle(url, handler, default_data = {})

Override this for custom queueing of crawled URLs. The default implementation queues each URL only the first time it is seen.

# File lib/spidey/abstract_spider.rb, line 47
def handle(url, handler, default_data = {})
  unless @handlers[url]
    @urls << url
    @handlers[url] = [handler, default_data]
  end
end
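
A subclass might also filter what gets queued; a sketch that skips session-destroying links (the pattern is illustrative):

def handle(url, handler, default_data = {})
  return if url =~ /logout/
  super
end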
record(data)

Override this for custom result storage.

# File lib/spidey/abstract_spider.rb, line 63
def record(data)
  results << data
  Spidey.logger.info "Recording #{data.inspect}"
end
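
A sketch that also appends each result to a CSV file (the filename is illustrative; requires the standard csv library):

require 'csv'

def record(data)
  super  # keep the in-memory results array and logging
  CSV.open('results.csv', 'a') { |csv| csv << data.values }
end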
resolve_url(href, page)

Resolves an href (possibly relative) against the page on which it appears, returning an absolute URL string.
# File lib/spidey/abstract_spider.rb, line 74
def resolve_url(href, page)
  agent.agent.resolve(href, page).to_s
end
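
Typically used when queueing links found on a page, e.g. in a handler (the handler names and pattern are illustrative):

def process_index(page, default_data = {})
  page.links_with(href: /item/).each do |link|
    handle resolve_url(link.href, page), :process_item, default_data
  end
end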

Private Instance Methods

agent()

The Mechanize agent used to fetch pages, instantiated on first use.
# File lib/spidey/abstract_spider.rb, line 86
def agent
  @agent ||= Mechanize.new
end
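
Subclasses can override this to configure Mechanize, e.g. to set a user agent (a sketch; 'Mac Safari' is one of Mechanize's built-in aliases):

def agent
  @agent ||= Mechanize.new do |a|
    a.user_agent_alias = 'Mac Safari'
  end
end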