class Creepycrawler::Site

object that handles the discovery of a site through crawling

Constants

DEFAULT_OPTIONS

Attributes

crawl_queue[R]

queue of discovered pages that remain to be crawled

domain[R]

the site domain

options[R]

hash of additional options to be passed in

page_crawl_count[R]

number of pages crawled

root_node[R]

holds the root node information

url[R]

URL the crawl began with

visited_queue[R]

queue used to store visited pages

Public Class Methods

new(url, options = {})
# File lib/creepy-crawler/site.rb, line 40
def initialize(url, options = {})
  # fetch the starting url (open-uri with :allow_redirections) and parse the resolved location
  response = open(url, :allow_redirections => :all)
  url_parsed = Addressable::URI.parse(response.base_uri)
  @domain = url_parsed.host
  @url = url_parsed.to_s
  @page_crawl_count = 0
  @options = options
  # add the initial url to our crawl queue
  @crawl_queue = [@url] 
  @broken_links = [] 
  @visited_queue = []
  @graph = Creepycrawler::Graph.new
end
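
A minimal usage sketch (the URL is illustrative, and the require name is assumed from the file path lib/creepy-crawler/site.rb). Note that constructing a Site performs an HTTP request so redirects can be resolved before crawling begins:

require 'creepy-crawler'

# build a site object for the starting url (fetches the page to resolve redirects)
site = Creepycrawler::Site.new("http://www.example.com")

site.domain       # host of the resolved starting url, e.g. "www.example.com"
site.crawl_queue  # holds the resolved starting url, ready to be crawled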

Public Instance Methods

crawl()
# File lib/creepy-crawler/site.rb, line 54
def crawl
  # merge default and passed in options into one hash
  @options = DEFAULT_OPTIONS.merge(@options)

  # begin crawl loop
  loop do
    # break when we have crawled all queued pages, or reached :max_page_crawl
    break if @crawl_queue.empty? or (!options[:max_page_crawl].nil? and @page_crawl_count >= @options[:max_page_crawl])
    
    begin
      # pull next page from crawl_queue and setup page
      page = Page.new(@crawl_queue.shift)
      
      # add url to visited queue to keep track of where we have been
      @visited_queue.push(page.url.to_s)
      
      # respect robots.txt
      if @options[:obey_robots] and page.robots_disallowed? 
        puts "Not crawling #{page.url} per Robots.txt request" if options[:verbose]
        next
      end

      puts "Crawling and indexing: #{page.url}" if @options[:verbose]
      
      # retrieve page
      page.fetch
    
      current_page_node = @graph.add_page(page.url) if @options[:graph_to_neo4j] 
      # TODO: fix this. On the first run current_page_node is a hash; on subsequent runs it is an array of hashes
      @root_node = current_page_node if @page_crawl_count == 0 and @options[:graph_to_neo4j]
      
      # Loop through all links on the current page
      page.links.each do |link|

        # add to crawl queue - only push local links that are not already queued and have not been visited
        @crawl_queue.push(link) if local? link and !@crawl_queue.include? link and !@visited_queue.include? link.to_s

        # add link page to graph
        current_link_node = @graph.add_page(link) if @options[:graph_to_neo4j]

        # create a links_to relationship from the current page node to link node
        @graph.create_relationship("links_to", current_page_node, current_link_node) if @options[:graph_to_neo4j]
      end
    rescue => e
      puts "Exception thrown: #{e.message} - Skipping Page" if @options[:verbose]
      @broken_links.push(page.url)
      next
    end
    @page_crawl_count += 1
  end # end of loop

  return self
end
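
A hedged example of running a crawl with the options referenced in the source above (:verbose, :obey_robots, :max_page_crawl, :graph_to_neo4j); the defaults come from DEFAULT_OPTIONS, which is not shown here, and the URL is illustrative:

site = Creepycrawler::Site.new("http://www.example.com",
                               :verbose => true,
                               :obey_robots => true,
                               :max_page_crawl => 50)

site.crawl

puts "Crawled #{site.page_crawl_count} pages"
puts "Visited: #{site.visited_queue.inspect}"

Because crawl returns self, the call can also be chained, e.g. Creepycrawler::Site.new(url).crawl.page_crawl_count.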
local?(link)

returns true if the link's host matches the site's domain

# File lib/creepy-crawler/site.rb, line 109
def local?(link)
  uri = Addressable::URI.parse(link)
  return true if uri.host == @domain
  return false
end
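
For illustration, assuming the site above resolved to the domain "www.example.com", local? separates internal links from external ones; relative links parse with a nil host and are therefore treated as non-local by this comparison:

site.local?("http://www.example.com/about")  # => true
site.local?("http://other.example.org/page") # => false
site.local?("/about")                        # => false (no host to compare)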