class CobwebCrawlHelper

CobwebCrawlHelper gives easy access to information about a crawl and provides the ability to stop (cancel) that crawl.

Constants

BATCH_SIZE
CANCELLED
FINISHED
STARTING

Attributes

id[RW]

Public Class Methods

new(data) click to toggle source
# File lib/cobweb_crawl_helper.rb, line 11
# Builds a helper for the crawl described by +data+.
#
# data - the crawl options; stored unchanged in @data and also passed to
#        Stats.new to build the statistics object kept in @stats.
def initialize(data)
  # NOTE: constructing Stats has been seen to take a long time on the production box.
  @stats = Stats.new(data)
  @data = data
end

Public Instance Methods

destroy() click to toggle source
# File lib/cobweb_crawl_helper.rb, line 18
# Cancels the crawl: marks it as cancelled, removes this crawl's queued jobs
# from the selected queue system, and enqueues the crawl-finished job.
#
# The local `options` is the very same Hash object as @data, so the default
# queue and worker names filled in here are also visible through @data
# (including in the end_crawl call below).
#
# Nothing beyond the status change happens unless options[:queue_system] is
# :resque or :sidekiq and the matching *_INSTALLED constant is truthy.
#
# Returns the result of the Sidekiq perform_async call when the Sidekiq
# finished-job branch runs, otherwise nil.
def destroy
  options = @data
  apply_default_queue_options(options)

  # set status as cancelled now so that we don't enqueue any further pages
  statistics.end_crawl(@data, true)
  wait_for_cancelled_status

  use_resque = options[:queue_system] == :resque && RESQUE_INSTALLED
  use_sidekiq = options[:queue_system] == :sidekiq && SIDEKIQ_INSTALLED

  remove_queued_resque_jobs(options) if use_resque
  remove_queued_sidekiq_jobs(options) if use_sidekiq

  if options[:crawl_finished_queue] && use_resque
    payload = crawl_finished_payload(options)
    # Resolve the configured name to a class, exactly as the Sidekiq branch
    # does, instead of handing Resque.enqueue a bare String.
    Resque.enqueue(resolve_job_class(options[:crawl_finished_queue]), payload)
  end

  if options[:crawl_finished_queue] && use_sidekiq
    payload = crawl_finished_payload(options)
    resolve_job_class(options[:crawl_finished_queue]).perform_async(payload)
  end
end

# Fills in queue/worker names the caller did not supply (mutates +options+).
# When both backends are installed, the defaults of the backend selected by
# options[:queue_system] are applied first so that a Sidekiq crawl does not
# end up with Resque job names (and vice versa).
def apply_default_queue_options(options)
  options[:queue_name] = "cobweb_crawl_job" unless options.has_key?(:queue_name)

  resque_defaults = {:processing_queue => "CobwebJob", :crawl_finished_queue => "CobwebFinishedJob"}
  sidekiq_defaults = {:processing_queue => "CrawlWorker", :crawl_finished_queue => "CrawlFinishedWorker"}

  candidates = []
  candidates << resque_defaults if RESQUE_INSTALLED
  candidates << sidekiq_defaults if SIDEKIQ_INSTALLED
  # Resque is tried first unless Sidekiq is the selected queue system.
  candidates.reverse! if options[:queue_system] == :sidekiq

  candidates.each do |defaults|
    defaults.each do |key, value|
      options[key] = value unless options.has_key?(key)
    end
  end
end

# Polls the crawl status once a second, for at most 200 polls, until it
# reads CANCELLED.
def wait_for_cancelled_status
  200.times do
    break if statistics.get_status == CANCELLED
    sleep 1
  end
end

# True when a queued job's argument list starts with a Hash whose "crawl_id"
# equals this crawl's id. Jobs with empty or non-Hash arguments never match
# (previously they raised while being inspected).
def queued_job_for_this_crawl?(job_args)
  first_arg = job_args.is_a?(Array) ? job_args.first : nil
  first_arg.is_a?(Hash) && first_arg["crawl_id"] == id
end

# Walks the Resque queue from its tail towards its head in BATCH_SIZE windows
# and dequeues every CrawlJob that belongs to this crawl.
def remove_queued_resque_jobs(options)
  position = Resque.size(options[:queue_name])
  until position == 0
    position = [position - BATCH_SIZE, 0].max
    job_items = Resque.peek(options[:queue_name], position, BATCH_SIZE)
    job_items.each do |item|
      next unless queued_job_for_this_crawl?(item["args"])
      # remove this job from the queue
      Resque.dequeue(CrawlJob, item["args"][0])
    end
  end
end

# Deletes this crawl's jobs from the "crawl_worker" Sidekiq queue and then
# from the queue declared by the processing worker's sidekiq options.
def remove_queued_sidekiq_jobs(options)
  Sidekiq::Queue.new("crawl_worker").each do |job|
    job.delete if queued_job_for_this_crawl?(job.args)
  end

  process_queue_name = Kernel.const_get(options[:processing_queue]).sidekiq_options_hash["queue"]
  Sidekiq::Queue.new(process_queue_name).each do |job|
    job.delete if queued_job_for_this_crawl?(job.args)
  end
end

# Builds the argument hash for the crawl-finished job: the current statistics
# merged with the crawl id, the crawled base url read from redis, and (when
# present) the redis options and source id.
def crawl_finished_payload(options)
  additional_stats = {:crawl_id => id, :crawled_base_url => @stats.redis.get("crawled_base_url")}
  additional_stats[:redis_options] = @data[:redis_options] unless @data[:redis_options] == {}
  additional_stats[:source_id] = options[:source_id] unless options[:source_id].nil?

  @stats.get_statistics.merge(additional_stats)
end

# Returns the job/worker class for a configured value: a class or module is
# passed through untouched, anything else is looked up as a constant name.
def resolve_job_class(name_or_class)
  return name_or_class if name_or_class.is_a?(Module)
  Kernel.const_get(name_or_class)
end

private :apply_default_queue_options, :wait_for_cancelled_status,
        :queued_job_for_this_crawl?, :remove_queued_resque_jobs,
        :remove_queued_sidekiq_jobs, :crawl_finished_payload,
        :resolve_job_class
statistics() click to toggle source
# File lib/cobweb_crawl_helper.rb, line 89
# Returns the object held in @stats, which #initialize builds with
# Stats.new(data).
def statistics
  @stats
end
status() click to toggle source
# File lib/cobweb_crawl_helper.rb, line 93
# Returns the crawl's current status, i.e. whatever get_status on the
# statistics object reports.
def status
  crawl_stats = statistics
  crawl_stats.get_status
end