class SiteMapper::Crawler

Crawls a given site.

Constants

OPTIONS

Default options
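The constant's value is not shown here. A plausible shape, inferred only from the option keys the crawler reads below (user_agent, sleep_length, max_requests), so treat it as an assumption rather than the gem's actual defaults:

# Illustrative sketch; the gem's actual defaults may differ.
OPTIONS = {
  user_agent:   'SiteMapper',
  sleep_length: 0.5,
  max_requests: Float::INFINITY
}.freeze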

Public Class Methods

collect_urls(*args) { |url| ... }

See documentation for the instance variant of this method.

Returns an Array of links.

See also: #collect_urls (instance variant).

# File lib/site_mapper/crawler.rb, line 34
def self.collect_urls(*args)
  new(*args).collect_urls { |url| yield(url) }
end
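A minimal usage sketch for the class-level variant; the domain and option value are illustrative:

require 'site_mapper'

SiteMapper::Crawler.collect_urls('example.com', max_requests: 10) do |url|
  puts "Found: #{url}"
end

A block is required, since each discovered URL is forwarded to it via yield.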
new(url, options = {})

Parameters:

url (String): base URL for the crawler.
options (Hash): options hash.

Example: create a crawler with a custom User-Agent

Crawler.new('example.com', user_agent: 'MyUserAgent')

Example: create a crawler that sleeps 1 second between each request

Crawler.new('example.com', sleep_length: 1)

Example: create a crawler that performs at most 3 requests

Crawler.new('example.com', max_requests: 3)

# File lib/site_mapper/crawler.rb, line 21
def initialize(url, options = {})
  @base_url    = Request.resolve_url(url)
  @options     = OPTIONS.dup.merge(options)
  @user_agent  = @options.fetch(:user_agent)
  @crawl_url   = CrawlUrl.new(@base_url)
  @fetch_queue = CrawlQueue.new
  @processed   = Set.new
  @robots      = nil
end

Public Instance Methods

collect_urls() { |url| ... }

Collects all links found on the given domain.

Returns an Array of links.

Example: URLs for example.com

crawler = Crawler.new('example.com')
crawler.collect_urls

Example: URLs for example.com with a block (the block executes in its own thread)

crawler = Crawler.new('example.com')
crawler.collect_urls do |new_url|
  puts "New URL found: #{new_url}"
end

# File lib/site_mapper/crawler.rb, line 48
def collect_urls
  @fetch_queue << @crawl_url.resolved_base_url
  # Crawl until the queue is exhausted or the request budget is spent
  until @fetch_queue.empty? || @processed.length >= @options[:max_requests]
    url = @fetch_queue.pop
    yield(url)
    page_urls_for(url)
  end
  # Processed URLs plus any queued URLs that were found but never fetched
  result = @processed + @fetch_queue
  Logger.log "Crawling finished:"
  Logger.log "Processed links: #{@processed.length}"
  Logger.log "Found links:     #{result.length}"
  result.to_a
rescue Interrupt, IRB::Abort
  Logger.err_log 'Crawl interrupted.'
  @fetch_queue.to_a
end
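CrawlQueue is defined elsewhere in the gem. From its use above (<<, pop, empty?, length, and participation in Set#+) it behaves like a de-duplicating FIFO queue; a minimal sketch of such a structure, purely illustrative and not the gem's actual implementation:

require 'set'

class CrawlQueue
  include Enumerable

  def initialize
    @queue = []
    @seen  = Set.new
  end

  # Enqueue a URL unless it has already been queued once
  def <<(url)
    @queue.push(url) if @seen.add?(url)
  end

  def pop
    @queue.shift
  end

  def each(&block)
    @queue.each(&block)
  end

  def empty?
    @queue.empty?
  end

  def length
    @queue.length
  end

  def to_a
    @queue.dup
  end
end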

Private Instance Methods

eligible_for_queue?(url)
# File lib/site_mapper/crawler.rb, line 78
def eligible_for_queue?(url)
  robots.allowed?(url) && !@processed.include?(url)
end

page_urls_for(current_url)
# File lib/site_mapper/crawler.rb, line 67
def page_urls_for(current_url)
  Logger.log "Queue length: #{@fetch_queue.length}, Parsing: #{current_url}"
  link_elements = Request.document(current_url, user_agent: @options[:user_agent]).css('a')
  wait # throttle between requests
  @processed << current_url
  link_elements.each do |page_link|
    # Resolve the (possibly relative) href against the current page
    url = @crawl_url.absolute_url_from(page_link.attr('href'), current_url)
    @fetch_queue << url if url && eligible_for_queue?(url)
  end
end
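absolute_url_from is provided by CrawlUrl and documented elsewhere; judging from the call above, it resolves an href (which may be relative) against the current page and returns nil for links that should not be followed. Ruby's standard URI.join illustrates the core resolution step (URLs illustrative):

require 'uri'

URI.join('http://example.com/blog/', 'post-1').to_s # => "http://example.com/blog/post-1"
URI.join('http://example.com/blog/', '/about').to_s # => "http://example.com/about"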

robots()
# File lib/site_mapper/crawler.rb, line 82
def robots
  # Memoized: robots.txt is fetched and parsed at most once per crawl
  return @robots unless @robots.nil?
  robots_url  = URI.join(@base_url, '/robots.txt').to_s
  robots_body = Request.response_body(robots_url, user_agent: @options[:user_agent])
  @robots     = Robots.new(robots_body, URI.parse(@base_url).host, @options[:user_agent])
  @robots
end
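For context, assuming the Robots parser follows standard robots.txt prefix matching, a body such as the following (illustrative content) would make allowed? return false for URLs under the disallowed path and true otherwise:

User-agent: *
Disallow: /admin

robots.allowed?('http://example.com/admin/users') # => false
robots.allowed?('http://example.com/blog')        # => true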

wait()
# File lib/site_mapper/crawler.rb, line 90
def wait
  sleep @options[:sleep_length]
end