class CobwebModule::Crawl
Public Class Methods
new(options={})
# File lib/crawl.rb, line 4
def initialize(options={})
  @options = HashUtil.deep_symbolize_keys(options)

  setup_defaults

  @redis = Redis::Namespace.new("cobweb-#{Cobweb.version}-#{@options[:crawl_id]}", :redis => RedisConnection.new(@options[:redis_options]))
  @stats = Stats.new(@options)
  @debug = @options[:debug]
  @first_to_finish = false
end
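For orientation, a minimal construction sketch (the option values are hypothetical; :crawl_id namespaces all of the crawl's Redis keys, and :redis_options is passed through to RedisConnection):

  crawl = CobwebModule::Crawl.new(
    :url           => "http://example.com/",   # hypothetical start URL
    :crawl_id      => "example-crawl",         # namespaces the Redis keys
    :crawl_limit   => 100,
    :redis_options => { :host => "localhost" }
  )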
Public Instance Methods
already_crawled?(link=@options[:url])
Returns true if the url requested is already in the crawled set

# File lib/crawl.rb, line 17
def already_crawled?(link=@options[:url])
  @redis.sismember "crawled", link
end
already_handled?(link)
Returns true if the link has already been crawled, queued, or is currently being retrieved

# File lib/crawl.rb, line 29
def already_handled?(link)
  already_crawled?(link) || already_queued?(link) || already_running?(link)
end
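A quick sketch of the three membership checks backing already_handled? (the url is hypothetical):

  crawl.already_crawled?("http://example.com/about")  # checks the "crawled" set
  crawl.already_queued?("http://example.com/about")   # checks the "queued" set
  crawl.already_running?("http://example.com/about")  # checks the "currently_running" set
  # already_handled? is simply the OR of the three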
already_queued?(link)
Returns true if the url requested is already in the queued set

# File lib/crawl.rb, line 21
def already_queued?(link)
  @redis.sismember "queued", link
end
already_running?(link)
Returns true if the url requested is currently being crawled

# File lib/crawl.rb, line 25
def already_running?(link)
  @redis.sismember "currently_running", link
end
cancelled?()
Returns true if the crawl's current status is "Cancelled"

# File lib/crawl.rb, line 33
def cancelled?
  @stats.get_statistics[:current_status] == "Cancelled"
end
content()
Returns the retrieved content wrapped in a CrawlObject; raises if retrieve has not yet populated the content

# File lib/crawl.rb, line 154
def content
  raise "Content is not available" if @content.nil?
  CobwebModule::CrawlObject.new(@content, @options)
end
crawled_base_url()
# File lib/crawl.rb, line 257
def crawled_base_url
  @redis.get("crawled_base_url")
end
debug_ap(value)
# File lib/crawl.rb, line 290
def debug_ap(value)
  ap(value) if @options[:debug]
end
debug_puts(value)
# File lib/crawl.rb, line 294
def debug_puts(value)
  puts(value) if @options[:debug]
end
finish()
# File lib/crawl.rb, line 226
def finish
  debug_puts ""
  debug_puts "========================================================================"
  debug_puts "finished crawl on #{@options[:url]}"
  print_counters
  debug_puts "========================================================================"
  debug_puts ""

  set_first_to_finish
  @stats.end_crawl(@options)
end
finished?()
Returns true if the crawl is complete: either nothing is queued and nothing is currently running, or the crawl limit has been reached

# File lib/crawl.rb, line 205
def finished?
  print_counters
  debug_puts @stats.get_status
  if @stats.get_status == CobwebCrawlHelper::FINISHED
    debug_puts "Already Finished!"
  end
  # if there's nothing left queued or the crawled limit has been reached and we're not still processing something
  if @options[:crawl_limit].nil? || @options[:crawl_limit] == 0
    if queue_counter == 0 && @redis.smembers("currently_running").empty?
      debug_puts "queue_counter is 0 and currently_running is empty so we're done"
      #finished
      return true
    end
  elsif (queue_counter == 0 || process_counter >= @options[:crawl_limit].to_i) && @redis.smembers("currently_running").empty?
    #finished
    debug_puts "queue_counter: #{queue_counter}, @redis.smembers(\"currently_running\").empty?: #{@redis.smembers("currently_running").empty?}, process_counter: #{process_counter}, @options[:crawl_limit].to_i: #{@options[:crawl_limit].to_i}"
    return true
  end
  false
end
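A worked illustration of the termination test, with hypothetical counter values:

  # with :crawl_limit => 50
  # process_counter: 50, queue_counter: 12, "currently_running" empty
  crawl.finished?  # => true, the process counter reached the limit
  # with no :crawl_limit set, it only returns true once queue_counter
  # is 0 and the "currently_running" set is empty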
finished_processing()
Removes the current url from the currently_running set

# File lib/crawl.rb, line 201
def finished_processing
  @redis.srem "currently_running", @options[:url]
end
first_to_finish?()
# File lib/crawl.rb, line 253
def first_to_finish?
  @first_to_finish
end
lock(key) { || ... }
Acquires a SETNX-based lock on key, yields the block, and releases the lock afterwards; the lock key expires after 30 seconds as a safety net

# File lib/crawl.rb, line 269
def lock(key, &block)
  debug_puts "REQUESTING LOCK [#{key}]"
  set_nx = @redis.setnx("#{key}_lock", "locked")
  debug_puts "LOCK:#{key}:#{set_nx}"
  while !set_nx
    debug_puts "===== WAITING FOR LOCK [#{key}] ====="
    sleep 0.5
    set_nx = @redis.setnx("#{key}_lock", "locked")
  end
  debug_puts "RECEIVED LOCK [#{key}]"
  @redis.expire("#{key}_lock", 30)
  begin
    result = yield
  ensure
    @redis.del("#{key}_lock")
    debug_puts "LOCK RELEASED [#{key}]"
  end
  result
end
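A usage sketch (the key name and block body are hypothetical). The block runs once the SETNX succeeds, and the lock key is deleted in an ensure, so it is released even if the block raises:

  crawl.lock("stats-update") do
    # at most one worker executes this block at a time;
    # the "stats-update_lock" key also expires after 30s as a safety net
    crawl.statistics
  end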
process() { || ... }
Increments the process counter under a lock (only for text/html pages when limiting by page), yields the block, then increments the enqueued-job count

# File lib/crawl.rb, line 185
def process(&block)
  lock("process-count") do
    if @options[:crawl_limit_by_page]
      if content.mime_type.match("text/html")
        increment_process_counter
      end
    else
      increment_process_counter
    end
    #@redis.sadd "queued", @options[:url]
  end

  yield if block_given?

  @redis.incr("crawl_job_enqueued_count")
end
process_links() { |link| ... }
Extracts the links from the retrieved content, queues any new internal links, and yields each newly queued link to the block

# File lib/crawl.rb, line 112
def process_links &block
  # set the base url if this is the first page
  set_base_url @redis

  @cobweb_links = CobwebLinks.new(@options)
  if within_queue_limits?
    document_links = ContentLinkParser.new(@options[:url], content.body, @options).all_links(:valid_schemes => [:http, :https])

    # get rid of duplicate links in the same page
    document_links.uniq!

    # select the link if it is internal
    internal_links = document_links.select{ |link| @cobweb_links.internal?(link) }

    # if the site has the same content for http and https then normalize to http
    if @options[:treat_https_as_http]
      internal_links.map!{ |link| link.gsub(/^https/, "http") }
    end

    # reject the link if we've crawled it or queued it
    internal_links.reject! { |link| already_handled?(link) }

    if status != CobwebCrawlHelper::CANCELLED && internal_links.present?
      lock("internal-links") do
        @redis.sadd "queued", internal_links
        @redis.incrby "queue-counter", internal_links.count
      end
      internal_links.map{ |link| yield(link) if block_given? }
    end

    if @options[:store_inbound_links]
      document_links.each do |link|
        uri = URI.parse(link).normalize
        @redis.sadd("inbound_links_#{Digest::MD5.hexdigest(uri.to_s)}", url)
      end
    end
  end
end
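Typical usage yields each newly queued internal link so the caller can schedule a crawl job for it (the enqueue call below is hypothetical):

  crawl.process_links do |link|
    # schedule a job for each link that survived the
    # duplicate/limit/internal filters above
    MyQueue.enqueue(link)  # hypothetical job scheduler
  end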
redis()
# File lib/crawl.rb, line 265
def redis
  @redis
end
retrieve()
Fetches the url for this job unless the crawl is cancelled, the url is already being retrieved, or it has already been crawled; returns true only if content was retrieved and is of a permitted type

# File lib/crawl.rb, line 64
def retrieve
  unless cancelled?
    unless already_running? @options[:url]
      unless already_crawled? @options[:url]
        update_queues
        if within_crawl_limits?
          @redis.sadd("currently_running", @options[:url])
          @stats.update_status("Retrieving #{@options[:url]}...")
          @content = Cobweb.new(@options).get(@options[:url], @options)
          @cookies = @content[:cookies]
          update_counters

          if @options[:url] == @redis.get("original_base_url")
            @redis.set("crawled_base_url", @content[:base_url])
          end

          if content.permitted_type?
            ## update statistics
            @stats.update_statistics(@content)
            return true
          end
        else
          puts "======================================="
          puts "OUTWITH CRAWL LIMITS"
          puts "======================================="
          decrement_queue_counter
        end
      else
        puts "======================================="
        puts "ALREADY CRAWLED"
        puts "======================================="
        decrement_queue_counter
      end
    else
      debug_puts "\n\nDETECTED DUPLICATE JOB for #{@options[:url]}\n"
      debug_ap @redis.smembers("currently_running")
      decrement_queue_counter
    end
  else
    puts "======================================="
    puts "CRAWL CANCELLED"
    puts "======================================="
  end
  false
end
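A hedged sketch of how a worker might string these methods together (the wiring is an assumption based on this page, not the gem's actual job code; MyQueue and MyStore are hypothetical):

  crawl = CobwebModule::Crawl.new(options)
  if crawl.retrieve
    crawl.process_links { |link| MyQueue.enqueue(link) }  # hypothetical scheduler
    crawl.process { MyStore.save(crawl.content) }         # hypothetical storage
  end
  crawl.finished_processing
  crawl.finish if crawl.finished?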
set_first_to_finish()
Uses WATCH/MULTI so that exactly one worker claims the first_to_finish flag

# File lib/crawl.rb, line 238
def set_first_to_finish
  @redis.watch("first_to_finish") do
    if !@redis.exists("first_to_finish")
      @redis.multi do
        debug_puts "set first to finish"
        @first_to_finish = true
        @redis.set("first_to_finish", 1)
      end
    else
      @redis.unwatch
    end
  end
end
statistics()
# File lib/crawl.rb, line 261
def statistics
  @stats.get_statistics
end
to_be_processed?()
# File lib/crawl.rb, line 181
def to_be_processed?
  !finished? && within_process_limits?
end
update_counters()
# File lib/crawl.rb, line 170
def update_counters
  if @options[:crawl_limit_by_page]
    if content.mime_type.match("text/html")
      increment_crawl_counter
    end
  else
    increment_crawl_counter
  end
  decrement_queue_counter
end
update_queues()
# File lib/crawl.rb, line 159
def update_queues
  lock("update_queues") do
    #@redis.incr "inprogress"
    # move the url from the queued list to the crawled list - for both the original url, and the content url (to handle redirects)
    @redis.srem "queued", @options[:url]
    @redis.sadd "crawled", @options[:url]

    # increment the counter if we are not limiting by page only || we are limiting count by page and it is a page
  end
end
within_crawl_limits?()
Returns true if the crawl count is within limits

# File lib/crawl.rb, line 38
def within_crawl_limits?
  @options[:crawl_limit].nil? || crawl_counter < @options[:crawl_limit].to_i
end
within_process_limits?()
Returns true if the processed count is within limits

# File lib/crawl.rb, line 43
def within_process_limits?
  @options[:crawl_limit].nil? || process_counter < @options[:crawl_limit].to_i
end
within_queue_limits?()
Returns true if queueing another link would still keep the crawl within its limit once the queue is drained

# File lib/crawl.rb, line 48
def within_queue_limits?
  # if we are limiting by page we can't limit the queue size as we don't know the mime type until retrieved
  if @options[:crawl_limit_by_page]
    return true
  # if a crawl limit is set, limit queue size to crawled + queued
  elsif @options[:crawl_limit].to_i > 0
    (queue_counter + crawl_counter) < @options[:crawl_limit].to_i
  # no crawl limit set so always within queue limit
  else
    true
  end
end
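To illustrate how the three limit checks interact, with hypothetical counter values:

  # :crawl_limit => 100, :crawl_limit_by_page => false
  # crawl_counter: 99, process_counter: 90, queue_counter: 5
  crawl.within_crawl_limits?    # => true   (99 < 100)
  crawl.within_process_limits?  # => true   (90 < 100)
  crawl.within_queue_limits?    # => false  (5 + 99 >= 100, so stop queueing)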
Private Instance Methods
counters()
# File lib/crawl.rb, line 340
def counters
  "crawl_counter: #{crawl_counter} queue_counter: #{queue_counter} process_counter: #{process_counter} crawl_limit: #{@options[:crawl_limit]} currently_running: #{@redis.smembers("currently_running").count}"
end
crawl_counter()
# File lib/crawl.rb, line 322
def crawl_counter
  @redis.get("crawl-counter").to_i
end
decrement_queue_counter()
Decrements the queue counter

# File lib/crawl.rb, line 318
def decrement_queue_counter
  @redis.decr "queue-counter"
end
increment_crawl_counter()
Increments the crawl counter

# File lib/crawl.rb, line 310
def increment_crawl_counter
  @redis.incr "crawl-counter"
end
increment_process_counter()
Increments the process counter

# File lib/crawl.rb, line 314
def increment_process_counter
  @redis.incr "process-counter"
end
increment_queue_counter()
Increments the queue counter

# File lib/crawl.rb, line 306
def increment_queue_counter
  @redis.incr "queue-counter"
end
print_counters()
# File lib/crawl.rb, line 336
def print_counters
  debug_puts counters
end
process_counter()
# File lib/crawl.rb, line 328
def process_counter
  @redis.get("process-counter").to_i
end
queue_counter()
# File lib/crawl.rb, line 325
def queue_counter
  @redis.get("queue-counter").to_i
end
set_base_url(redis)
Sets the base url in redis. If the first page is a redirect, it sets the base_url to the redirect destination

# File lib/crawl.rb, line 345
def set_base_url(redis)
  if redis.get("base_url").nil?
    unless !defined?(content.redirect_through) || content.redirect_through.empty? || !@options[:first_page_redirect_internal]
      uri = Addressable::URI.parse(content.redirect_through.last)
      redis.sadd("internal_urls", [uri.scheme, "://", uri.host, "/*"].join)
    end
    redis.set("base_url", content.url)
  end
end
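For example, suppose the first request to http://example.com/ was redirected (a hypothetical chain) and :first_page_redirect_internal is set:

  # content.redirect_through => ["https://www.example.com/home"]
  # the redirect host is whitelisted and the base url updated:
  #   redis.sadd("internal_urls", "https://www.example.com/*")
  #   redis.set("base_url", content.url)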
setup_defaults()
# File lib/crawl.rb, line 299
def setup_defaults
  @options[:redis_options] = {} unless @options.has_key? :redis_options
  @options[:crawl_limit_by_page] = false unless @options.has_key? :crawl_limit_by_page
  @options[:valid_mime_types] = ["*/*"] unless @options.has_key? :valid_mime_types
end
status()
# File lib/crawl.rb, line 332
def status
  @stats.get_status
end