class Creepycrawler::Page

Represents a webpage and provides the methods for extracting the details our crawler needs.

Attributes

body[R]

The page HTML, parsed into a Nokogiri document.

url[RW]

The page URL (parsed and normalized with Addressable when the page is constructed).

Public Class Methods

new(url) click to toggle source
# File lib/creepy-crawler/page.rb, line 10
# Builds a page for the given address.
#
# The address is parsed with Addressable and stored in normalized form, and a
# WebRobots client identifying itself as "CreepyCrawler" is created for later
# robots.txt checks.
#
# @param url the address of the page; passed to Addressable::URI.parse
def initialize(url)
  parsed_address = Addressable::URI.parse(url)
  @url = parsed_address.normalize
  @robotstxt = WebRobots.new("CreepyCrawler")
end

Public Instance Methods

body=(body) click to toggle source
# File lib/creepy-crawler/page.rb, line 15
# Replaces the page body.
#
# The supplied markup is handed to Nokogiri::HTML and the resulting parsed
# document, not the raw input, is what gets stored in @body.
#
# @param body the markup to parse
def body=(body)
  document = Nokogiri::HTML(body)
  @body = document
end
fetch() click to toggle source

retrieve page

# File lib/creepy-crawler/page.rb, line 21
# Retrieves the page at @url and stores the parsed result in @body.
#
# The block form of +open+ is used so that the stream it yields is closed as
# soon as Nokogiri has finished reading it, instead of being left open until
# it is garbage collected.
#
# @return the document produced by Nokogiri::HTML (also stored in @body)
def fetch
  @body = open(@url, :allow_redirections => :all) do |response|
    Nokogiri::HTML(response)
  end
end
robots_disallowed?() click to toggle source
# File lib/creepy-crawler/page.rb, line 64
# Asks the robots.txt client whether this page's URL is off limits.
#
# @return whatever @robotstxt.disallowed? reports for @url
def robots_disallowed?
  @robotstxt.disallowed?(@url)
end