class Spidey::AbstractSpider
Constants
- DEFAULT_REQUEST_INTERVAL
Attributes
errors[RW]
handlers[RW]
request_interval[RW]
results[RW]
urls[RW]
Public Class Methods
handle(url, handler, default_data = {})
click to toggle source
# File lib/spidey/abstract_spider.rb, line 10
# Registers a starting URL together with the handler (a method name) that
# should process the fetched page, plus optional default data passed through
# to that handler. Intended to be called at class-definition time.
def self.handle(url, handler, default_data = {})
  start_urls << url
  handlers[url] = [handler, default_data]
end
new(attrs = {})
click to toggle source
Accepts:
request_interval: number of seconds to wait between requests (default: 3)
# File lib/spidey/abstract_spider.rb, line 17
# Accepts:
#   request_interval: number of seconds to wait between requests
#                     (falls back to DEFAULT_REQUEST_INTERVAL)
def initialize(attrs = {})
  @urls = []
  @handlers = {}
  @results = []
  # Seed the instance-level queue from the URLs registered at class level.
  self.class.start_urls.each do |url|
    handle url, *self.class.handlers[url]
  end
  @request_interval = attrs[:request_interval] || DEFAULT_REQUEST_INTERVAL
end
Private Class Methods
handlers()
click to toggle source
# File lib/spidey/abstract_spider.rb, line 94
# Class-level registry mapping each start URL to its [handler, default_data]
# pair; lazily initialized on first access.
def self.handlers
  @handlers ||= {}
end
start_urls()
click to toggle source
# File lib/spidey/abstract_spider.rb, line 90
# Class-level list of URLs to seed each new spider instance with;
# lazily initialized on first access.
def self.start_urls
  @start_urls ||= []
end
Public Instance Methods
crawl(options = {})
click to toggle source
Iterates through URLs queued for handling, including any that are added in the course of crawling. Accepts:
max_urls: maximum number of URLs to crawl before returning (optional)
# File lib/spidey/abstract_spider.rb, line 27
# Iterates through URLs queued for handling, including any that are added in
# the course of crawling. Accepts:
#   max_urls: maximum number of URLs to crawl before returning (optional)
# A handler error is recorded via #add_error and the crawl continues with
# the next URL rather than aborting.
def crawl(options = {})
  @errors = []
  i = 0
  each_url do |url, handler, default_data|
    break if options[:max_urls] && i >= options[:max_urls]
    begin
      page = agent.get(url)
      Spidey.logger.info "Handling #{url.inspect}"
      send handler, page, default_data
    rescue => ex
      # Collect the failure and keep going; rescues StandardError only.
      add_error url: url, handler: handler, error: ex
    end
    # Throttle between requests; note failed URLs also count toward i
    # and also trigger the sleep.
    sleep request_interval if request_interval > 0
    i += 1
  end
end
Protected Instance Methods
add_error(attrs)
click to toggle source
Override this for custom error-handling.
# File lib/spidey/abstract_spider.rb, line 69
# Override this for custom error-handling.
# attrs is a Hash containing :url, :handler, and :error (the rescued
# exception), as passed by #crawl.
def add_error(attrs)
  @errors << attrs
  Spidey.logger.error "Error on #{attrs[:url]}. #{attrs[:error].class}: #{attrs[:error].message}"
end
clean(str)
click to toggle source
Strips leading and trailing whitespace (ASCII and Unicode), converts internal Unicode whitespace characters to ASCII spaces, and collapses repeated spaces into one.
# File lib/spidey/abstract_spider.rb, line 79
# Normalizes whitespace in str: converts every Unicode whitespace character
# to an ASCII space, strips leading/trailing whitespace, and collapses runs
# of spaces down to a single space. Returns nil when given nil (or any
# falsy value).
def clean(str)
  return nil unless str
  str.gsub(/\p{Space}/, ' ').strip.squeeze(' ')
end
each_url() { |url, first, last| ... }
click to toggle source
Override this for custom storage or prioritization of crawled URLs. Iterates through URL queue, yielding the URL, handler, and default data.
# File lib/spidey/abstract_spider.rb, line 56
# Override this for custom storage or prioritization of crawled URLs.
# Iterates through the URL queue, yielding each URL, its handler, and its
# default data (the pair stored by #handle).
def each_url(&block)
  urls.each do |url|
    yield url, handlers[url].first, handlers[url].last
  end
end
handle(url, handler, default_data = {})
click to toggle source
Override this for custom queueing of crawled URLs.
# File lib/spidey/abstract_spider.rb, line 47
# Override this for custom queueing of crawled URLs.
# Queues url with its handler and default data unless a handler is already
# registered for it, so each URL is crawled at most once per spider.
def handle(url, handler, default_data = {})
  unless @handlers[url]
    @urls << url
    @handlers[url] = [handler, default_data]
  end
end
record(data)
click to toggle source
Override this for custom result storage.
# File lib/spidey/abstract_spider.rb, line 63
# Override this for custom result storage.
# Appends data to the in-memory results array and logs it.
def record(data)
  results << data
  Spidey.logger.info "Recording #{data.inspect}"
end
resolve_url(href, page)
click to toggle source
# File lib/spidey/abstract_spider.rb, line 74
# Resolves href (possibly relative) against the page it was found on and
# returns the result as an absolute URL string. Delegates to the underlying
# Mechanize HTTP agent's resolver.
def resolve_url(href, page)
  agent.agent.resolve(href, page).to_s
end
Private Instance Methods
agent()
click to toggle source
# File lib/spidey/abstract_spider.rb, line 86
# Lazily builds and memoizes the Mechanize client used for all HTTP requests.
def agent
  @agent ||= Mechanize.new
end