module Spidey::Strategies::Moped

Attributes

error_collection[RW]
result_collection[RW]
url_collection[RW]

Public Class Methods

new(attrs = {}) click to toggle source
Calls superclass method
# File lib/spidey/strategies/moped.rb, line 5
# Builds the strategy: the three collection handles are removed from +attrs+
# and stored through their writers; whatever remains in +attrs+ is forwarded
# to the superclass initializer.
#
# @param attrs [Hash] may contain :url_collection, :result_collection and
#   :error_collection (each defaults to nil when absent)
def initialize(attrs = {})
  %i[url_collection result_collection error_collection].each do |name|
    send("#{name}=", attrs.delete(name))
  end
  super attrs
end

Public Instance Methods

add_error(attrs) click to toggle source
# File lib/spidey/strategies/moped.rb, line 45
# Persists a failure to +error_collection+ and writes it to the Spidey log.
#
# The :error entry is removed from +attrs+ (the hash is modified in place);
# the remaining entries are stored together with the timestamp, the error's
# class name and message, and this spider's class name.
#
# @param attrs [Hash] must contain :error (an exception-like object
#   responding to +message+); :url is used in the log line
def add_error(attrs)
  failure = attrs.delete(:error)
  failure_class = failure.class
  details = {
    created_at: Time.now,
    error: failure_class.name,
    message: failure.message,
    spider: self.class.name
  }
  error_collection.insert attrs.merge(details)
  Spidey.logger.error "Error on #{attrs[:url]}. #{failure_class}: #{failure.message}"
end
crawl(options = {}) click to toggle source
Calls superclass method
# File lib/spidey/strategies/moped.rb, line 12
# Starts a crawl batch, optionally bounded in time, then delegates to the
# superclass implementation.
#
# Records the batch start time in @crawl_started_at and the optional deadline
# in @until. The deadline is always reassigned, so a crawl started without
# :crawl_for is unbounded even if an earlier crawl on the same instance set a
# deadline.
#
# @param options [Hash] passed through unchanged to the superclass;
#   :crawl_for, when present, is added to the start time (seconds) to form
#   the deadline
# @return whatever the superclass +crawl+ returns
def crawl(options = {})
  @crawl_started_at = Time.now
  crawl_for = options[:crawl_for]
  # Derive the deadline from the same clock reading as the start time, and
  # clear any stale deadline left over from a previous bounded crawl.
  @until = crawl_for ? @crawl_started_at + crawl_for : nil
  super options
end
each_url() { |url, handler, default_data| ... } click to toggle source
# File lib/spidey/strategies/moped.rb, line 37
# Walks the queued URLs, yielding each one to the caller's block.
#
# Iteration ends when +get_next_url+ returns nothing, or when the next entry
# was already stamped at or after @crawl_started_at (already crawled in this
# batch). Each entry is stamped with the current time before it is yielded.
#
# @yield [url, handler, default_data] the entry's 'url', its 'handler', and
#   its 'default_data' with symbolized keys
def each_url(&_block)
  while (entry = get_next_url)
    stamped_at = entry['last_crawled_at']
    break if stamped_at && stamped_at >= @crawl_started_at # crawled already in this batch

    current = url_collection.find('_id' => entry['_id'])
    current.update('$set' => { last_crawled_at: Time.now })

    defaults = entry['default_data'].symbolize_keys
    yield entry['url'], entry['handler'], defaults
  end
end
handle(url, handler, default_data = {}) click to toggle source
# File lib/spidey/strategies/moped.rb, line 18
# Queues +url+ for this spider by upserting a document keyed on the spider's
# class name and the URL, setting its handler and default data.
#
# @param url the URL to queue (its +inspect+ output, truncated, is logged)
# @param handler stored under 'handler'
# @param default_data stored under 'default_data' (defaults to an empty Hash)
# @return the result of the collection's upsert call
def handle(url, handler, default_data = {})
  Spidey.logger.info "Queueing #{url.inspect[0..200]}..."
  queued = url_collection.find('spider' => self.class.name, 'url' => url)
  queued.upsert('$set' => { 'handler' => handler, 'default_data' => default_data })
end
record(data) click to toggle source
# File lib/spidey/strategies/moped.rb, line 27
# Stores a crawl result in +result_collection+, tagged with this spider's
# class name under 'spider'.
#
# When the spider publicly responds to +result_key+ and it returns a truthy
# key for the document, the document is upserted under that 'key'; otherwise
# it is inserted as a new document.
#
# @param data [Hash] the result fields (not modified; a merged copy is stored)
# @return the result of the collection's upsert or insert call
def record(data)
  doc = data.merge('spider' => self.class.name)
  Spidey.logger.info "Recording #{doc.inspect[0..500]}..."

  key = respond_to?(:result_key) ? result_key(doc) : nil
  return result_collection.insert(doc) unless key

  result_collection.find('key' => key).upsert('$set' => doc)
end

Private Instance Methods

get_next_url() click to toggle source
# File lib/spidey/strategies/moped.rb, line 54
# Fetches the next queued URL document for this spider, or nil once the
# crawl deadline (@until, when set) has been reached.
#
# Documents are ordered by ascending 'last_crawled_at', then ascending '_id',
# and the first one is returned.
def get_next_url
  deadline_reached = @until && Time.now >= @until # exceeded time bound
  return nil if deadline_reached

  queued = url_collection.find(spider: self.class.name)
  queued.sort('last_crawled_at' => 1, '_id' => 1).first
end