module Spidey::Strategies::Mongo2
Attributes
error_collection[RW]
result_collection[RW]
url_collection[RW]
Public Class Methods
new(attrs = {})
click to toggle source
Calls superclass method
# File lib/spidey/strategies/mongo2.rb, line 5
#
# Builds the strategy, pulling the three collection handles out of +attrs+
# before handing whatever is left to the superclass initializer.
#
# @param attrs [Hash] options; the :url_collection, :result_collection and
#   :error_collection entries are removed from this hash (it is mutated in
#   place) and stored on the matching accessors. A missing entry leaves the
#   accessor set to nil.
def initialize(attrs = {})
  %i[url_collection result_collection error_collection].each do |name|
    # Assign through the attribute writer so the accessor is the single
    # point of storage.
    public_send("#{name}=", attrs.delete(name))
  end
  super attrs
end
Public Instance Methods
add_error(attrs)
click to toggle source
# File lib/spidey/strategies/mongo2.rb, line 45
#
# Stores an error document in +error_collection+ and logs the failure.
#
# @param attrs [Hash] context for the failure. The :error entry (the raised
#   exception) is removed from this hash in place; the remaining entries are
#   copied into the stored document. attrs[:url] is used in the log line.
def add_error(attrs)
  error = attrs.delete(:error)
  error_class = error.class

  # Persist the exception as plain class-name / message fields alongside
  # the remaining context, a timestamp and the spider's class name.
  doc = attrs.merge(
    created_at: Time.now,
    error: error_class.name,
    message: error.message,
    spider: self.class.name
  )
  error_collection.insert_one(doc)

  Spidey.logger.error "Error on #{attrs[:url]}. #{error_class}: #{error.message}"
end
crawl(options = {})
click to toggle source
Calls superclass method
# File lib/spidey/strategies/mongo2.rb, line 12
#
# Records when this crawl started, sets the optional time limit, then
# delegates to the superclass crawl.
#
# @param options [Hash] crawl options, passed through to the superclass.
#   When options[:crawl_for] is present it is added to the start time to
#   form the deadline checked by get_next_url.
# @return whatever the superclass crawl returns.
def crawl(options = {})
  @crawl_started_at = Time.now

  # Always reassign the deadline: without the reset, a timed crawl followed
  # by an untimed one on the same instance would keep the old (already
  # expired) deadline and stop immediately. Deriving it from the single
  # start timestamp keeps both values consistent.
  duration = options[:crawl_for]
  @until = duration ? @crawl_started_at + duration : nil

  super options
end
each_url() { |url, handler, default_data| ... }
click to toggle source
# File lib/spidey/strategies/mongo2.rb, line 37
#
# Yields queued URLs one at a time until get_next_url returns nothing or
# the next document was already crawled during this crawl.
#
# Each document is stamped with a fresh last_crawled_at before it is
# yielded.
#
# @yield [url, handler, default_data] the document's 'url' and 'handler'
#   fields, and its 'default_data' with keys symbolized
#   (symbolize_keys is not defined here — presumably ActiveSupport's
#   Hash#symbolize_keys; confirm).
def each_url(&_block)
  while (entry = get_next_url)
    crawled_at = entry['last_crawled_at']
    # crawled already in this batch
    break if crawled_at && crawled_at >= @crawl_started_at

    url_collection.update_one({ '_id' => entry['_id'] }, '$set' => { last_crawled_at: Time.now })
    yield entry['url'], entry['handler'], entry['default_data'].symbolize_keys
  end
end
handle(url, handler, default_data = {})
click to toggle source
# File lib/spidey/strategies/mongo2.rb, line 18
#
# Queues a URL by upserting its document in +url_collection+.
#
# The document is matched on this spider's class name plus the URL, so
# queueing the same URL again updates the existing entry instead of adding
# a second one.
#
# @param url the URL to queue (logged via inspect, truncated to 201 chars).
# @param handler stored in the document's 'handler' field.
# @param default_data [Hash] stored in the document's 'default_data' field.
def handle(url, handler, default_data = {})
  Spidey.logger.info "Queueing #{url.inspect[0..200]}..."

  selector = { 'spider' => self.class.name, 'url' => url }
  changes = { '$set' => { 'handler' => handler, 'default_data' => default_data } }
  url_collection.update_one(selector, changes, upsert: true)
end
record(data)
click to toggle source
# File lib/spidey/strategies/mongo2.rb, line 27
#
# Stores a result document in +result_collection+.
#
# The spider's class name is merged into a copy of +data+ under 'spider'.
# If the spider publicly responds to result_key and it returns a truthy
# key for the document, the document is upserted under that 'key';
# otherwise it is inserted as a new document.
#
# @param data [Hash] the result fields to store.
def record(data)
  doc = data.merge('spider' => self.class.name)
  Spidey.logger.info "Recording #{doc.inspect[0..500]}..."

  # result_key is optional; without it (or with a nil/false key) every
  # record is a plain insert.
  key = result_key(doc) if respond_to?(:result_key)
  return result_collection.insert_one(doc) unless key

  result_collection.update_one({ 'key' => key }, { '$set' => doc }, upsert: true)
end
Private Instance Methods
get_next_url()
click to toggle source
# File lib/spidey/strategies/mongo2.rb, line 54
#
# Fetches the next URL document for this spider, or nil once the crawl's
# time limit (@until, when set) has been reached.
#
# Documents are ordered by last_crawled_at ascending, then by _id
# ascending, and the first one is returned.
def get_next_url
  # exceeded time bound
  return nil if @until && Time.now >= @until

  ordering = [[:last_crawled_at, ::Mongo::ASCENDING], [:_id, ::Mongo::ASCENDING]]
  url_collection.find({ spider: self.class.name }, sort: ordering).first
end