class SiteMapper::Crawler
Crawls a given site.
Constants
- OPTIONS
Default options
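The constant's value is not reproduced on this page, but the crawler reads three keys from it (:user_agent, :sleep_length and :max_requests, see the source below), so a plausible shape is the following sketch. The values are assumptions, not the verbatim constant:

  # Hypothetical defaults, inferred from the keys the crawler reads;
  # the real values live in lib/site_mapper/crawler.rb.
  OPTIONS = {
    user_agent:   'SiteMapper',      # assumed default User-Agent string
    sleep_length: 0.5,               # assumed pause between requests, in seconds
    max_requests: Float::INFINITY    # assumed: crawl until the queue is empty
  }.freeze

Each of these can be overridden per instance through the options hash accepted by ::new.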
Public Class Methods
collect_urls(*args) { |url| ... }
See documentation for the instance variant of this method.
@return [Array] with links.
@see #collect_urls
# File lib/site_mapper/crawler.rb, line 34
def self.collect_urls(*args)
  new(*args).collect_urls { |url| yield(url) }
end
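Because the implementation unconditionally yields each URL, this class-level variant expects a block; calling it without one raises LocalJumpError as soon as the first URL is popped. A minimal usage sketch (domain and option values are illustrative):

  urls = SiteMapper::Crawler.collect_urls('example.com', max_requests: 10) do |url|
    puts "Found: #{url}"
  end
  puts "Collected #{urls.length} URLs"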
new(url, options = {})
@param [String] url base URL for the crawler
@param [Hash] options options hash
@example Create crawler with custom User-Agent
  Crawler.new('example.com', user_agent: 'MyUserAgent')
@example Create crawler and sleep 1 second between each request
  Crawler.new('example.com', sleep_length: 1)
@example Create crawler and perform max 3 requests
  Crawler.new('example.com', max_requests: 3)
# File lib/site_mapper/crawler.rb, line 21
def initialize(url, options = {})
  @base_url    = Request.resolve_url(url)
  @options     = OPTIONS.dup.merge(options)
  @user_agent  = @options.fetch(:user_agent)
  @crawl_url   = CrawlUrl.new(@base_url)
  @fetch_queue = CrawlQueue.new
  @processed   = Set.new
  @robots      = nil
end
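Two details of the constructor are worth noting: OPTIONS.dup.merge(options) lets caller-supplied keys override the defaults while leaving the shared constant untouched, and @options.fetch(:user_agent) raises KeyError if no :user_agent ends up in the hash, so OPTIONS presumably carries a default one. The merge semantics are plain core Ruby (values below are illustrative):

  # Hash#merge prefers the argument's values and returns a new hash,
  # so the shared OPTIONS constant is never mutated.
  defaults = { sleep_length: 0.5, max_requests: 100 }
  merged   = defaults.dup.merge(max_requests: 3)
  merged   #=> {:sleep_length=>0.5, :max_requests=>3}
  defaults #=> {:sleep_length=>0.5, :max_requests=>100}  (unchanged)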
Public Instance Methods
collect_urls() { |url| ... }
Collects all links on the given domain.
@return [Array] with links.
@example URLs for example.com
  crawler = Crawler.new('example.com')
  crawler.collect_urls
@example URLs for example.com with block (executes in its own thread)
  crawler = Crawler.new('example.com')
  crawler.collect_urls do |new_url|
    puts "New URL found: #{new_url}"
  end
# File lib/site_mapper/crawler.rb, line 48
def collect_urls
  @fetch_queue << @crawl_url.resolved_base_url
  # Breadth-first: pop a URL, hand it to the caller's block, then queue
  # any eligible links found on that page, until the queue empties or
  # the request budget is spent.
  until @fetch_queue.empty? || @processed.length >= @options[:max_requests]
    url = @fetch_queue.pop
    yield(url)
    page_urls_for(url)
  end
  result = @processed + @fetch_queue
  Logger.log "Crawling finished:"
  Logger.log "Processed links: #{@processed.length}"
  Logger.log "Found links: #{result.length}"
  result.to_a
rescue Interrupt, IRB::Abort
  Logger.err_log 'Crawl interrupted.'
  @fetch_queue.to_a
end
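Stripped of the gem's collaborators, collect_urls is a standard breadth-first traversal: a FIFO queue of URLs still to visit plus a set of already-processed URLs. A self-contained sketch of the same pattern, with a canned fetch_links standing in for page_urls_for (both the helper and its data are invented for illustration):

  require 'set'

  # Toy link graph standing in for real pages.
  LINKS = { 'a' => ['b', 'c'], 'b' => ['a', 'c'], 'c' => [] }.freeze

  def fetch_links(url)
    LINKS.fetch(url, [])
  end

  def bfs_crawl(start, max_requests: 100)
    queue = [start]           # plays the role of @fetch_queue
    seen  = Set.new           # plays the role of @processed
    until queue.empty? || seen.length >= max_requests
      url = queue.shift
      seen << url
      fetch_links(url).each do |link|
        # CrawlQueue and eligible_for_queue? handle deduplication in the
        # gem; here a plain inclusion check does the job.
        queue << link unless seen.include?(link) || queue.include?(link)
      end
    end
    seen.to_a + queue         # processed plus still-queued, as above
  end

  bfs_crawl('a') #=> ["a", "b", "c"]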
Private Instance Methods
eligible_for_queue?(url)
# File lib/site_mapper/crawler.rb, line 78
def eligible_for_queue?(url)
  robots.allowed?(url) && !@processed.include?(url)
end
page_urls_for(current_url)
# File lib/site_mapper/crawler.rb, line 67
def page_urls_for(current_url)
  Logger.log "Queue length: #{@fetch_queue.length}, Parsing: #{current_url}"
  link_elements = Request.document(current_url, user_agent: @options[:user_agent]).css('a')
  wait
  @processed << current_url
  link_elements.each do |page_link|
    # Resolve each href to an absolute URL; nil means the link is not crawlable.
    url = @crawl_url.absolute_url_from(page_link.attr('href'), current_url)
    @fetch_queue << url if url && eligible_for_queue?(url)
  end
end
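CrawlUrl#absolute_url_from is the gem's own resolver and appears to double as a filter, since it can return nil (which the guard above checks). A rough stand-alone equivalent of just the extraction-and-resolution step, using only Nokogiri and URI and skipping that filtering (HTML and base URL invented):

  require 'nokogiri'
  require 'uri'

  html = <<~HTML
    <a href="/about">About</a>
    <a href="https://other.example/x">External</a>
  HTML

  base = 'http://example.com/'
  Nokogiri::HTML(html).css('a').map do |a|
    # Resolve each href against the page it appeared on,
    # as absolute_url_from does with current_url.
    URI.join(base, a.attr('href')).to_s
  end
  #=> ["http://example.com/about", "https://other.example/x"]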
robots()
# File lib/site_mapper/crawler.rb, line 82
def robots
  return @robots unless @robots.nil?
  robots_url  = URI.join(@base_url, '/robots.txt').to_s
  robots_body = Request.response_body(robots_url, user_agent: @options[:user_agent])
  @robots = Robots.new(robots_body, URI.parse(@base_url).host, @options[:user_agent])
  @robots
end
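@robots is memoized, so robots.txt is fetched and parsed at most once per crawl. The only call the crawler makes on the parsed object is allowed? (see eligible_for_queue? above); a usage sketch against that same interface, with an invented robots.txt body and host:

  body = <<~ROBOTS
    User-agent: *
    Disallow: /private
  ROBOTS

  robots = Robots.new(body, 'example.com', 'MyUserAgent')
  robots.allowed?('http://example.com/')        # presumably true
  robots.allowed?('http://example.com/private') # presumably false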
wait()
# File lib/site_mapper/crawler.rb, line 90
def wait
  sleep @options[:sleep_length]
end
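Ruby's sleep accepts fractional seconds, so sub-second throttling works as well, for example:

  # Roughly four requests per second (value illustrative).
  Crawler.new('example.com', sleep_length: 0.25)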