class Creepycrawler::Page
Represents a webpage and provides the methods to extract the details we need for our crawler.
Attributes
body [R]
the page's HTML, parsed into a Nokogiri document
url [RW]
the page's URL, normalized as an Addressable::URI
Public Class Methods
new(url)
click to toggle source
# File lib/creepy-crawler/page.rb, line 10
# Build a Page for the given URL string.
def initialize(url)
  # Normalize up front so later comparisons and joins behave consistently.
  @url = Addressable::URI.parse(url).normalize
  # robots.txt checker, identifying this crawler by its user-agent string.
  @robotstxt = WebRobots.new("CreepyCrawler")
end
Public Instance Methods
body=(body)
click to toggle source
# File lib/creepy-crawler/page.rb, line 15
# Assign the page HTML, storing it as a parsed Nokogiri document.
def body=(body)
  @body = Nokogiri::HTML(body)
end
fetch()
click to toggle source
Retrieves the page over HTTP and parses it.
# File lib/creepy-crawler/page.rb, line 21
# Retrieve the page over HTTP and parse it with Nokogiri.
#
# Uses URI.open rather than Kernel#open: Kernel#open spawns a subprocess
# when handed a string that begins with "|", which is a command-injection
# risk for attacker-influenced URLs. open-uri is already in use here
# (:allow_redirections comes from the open_uri_redirections extension).
def fetch
  @body = Nokogiri::HTML(URI.open(@url.to_s, :allow_redirections => :all))
end
links()
click to toggle source
Returns all valid links on the page as absolute URL strings.
# File lib/creepy-crawler/page.rb, line 26
# Return all crawlable links on the page as absolute URL strings.
def links
  # Lazily fetch the page the first time links are requested.
  fetch if @body.nil?
  # Collect every anchor's href (anchors without an href yield "").
  hrefs = @body.css('a').map { |anchor| anchor.attribute('href').to_s }
  crawlable = hrefs.uniq.sort.reject { |href| skip_link?(href) }
  # Memoize the absolute form so @links holds the same URLs this method
  # returns (previously @links kept the relative form while the absolute
  # form was returned, leaving inconsistent state).
  @links = crawlable.map { |href| relative_to_absolute_link(href) }
end

# True when href should not be crawled: empty, an in-page anchor, a
# mailto/ftp/javascript scheme, or a link Addressable cannot parse.
def skip_link?(href)
  return true if href.empty?
  return true if href.start_with?('#', 'mailto:', 'ftp:', 'javascript:')
  Addressable::URI.parse(href)
  false
rescue StandardError
  true
end
relative_to_absolute_link(link)
click to toggle source
# File lib/creepy-crawler/page.rb, line 52
# Convert a possibly-relative link into an absolute URL string, resolved
# against this page's URL.
#
# Resolution uses Addressable::URI#join, which performs full RFC 3986
# reference resolution. The previous @url + uri.path rebuild silently
# dropped any query string or fragment on relative links (e.g.
# "page?n=2" became ".../page").
def relative_to_absolute_link(link)
  uri = Addressable::URI.parse(link).normalize
  if uri.host.nil?
    # Relative reference: resolve against the page URL, keeping the
    # query string and fragment intact.
    @url.join(uri).to_s
  else
    # Already absolute — leave as is.
    uri.to_s
  end
end
robots_disallowed?()
click to toggle source
# File lib/creepy-crawler/page.rb, line 64
# Check this page's URL against the site's robots.txt rules.
def robots_disallowed?
  @robotstxt.disallowed?(@url)
end