class WebPageParser::BaseParser

Attributes

retrieve_session[RW]
url[R]

Public Class Methods

new(options = { }) click to toggle source

Takes a hash of options. The :url option passes the page URL, the :page option passes the raw HTML page content for parsing, and the :guid option passes a precomputed guid for the page.

# File lib/web-page-parser/base_parser.rb, line 19
# Builds a parser from an options hash.
#
# options - Hash looked up with the keys :url, :page and :guid. Each value is
#           stored in the instance variable of the same name.
def initialize(options = {})
  @url, @page, @guid = options.values_at(:url, :page, :guid)
end

Public Instance Methods

content() click to toggle source
# File lib/web-page-parser/base_parser.rb, line 43
# Returns @content when it is set to a truthy value; otherwise returns a new
# empty array.
def content
  return @content if @content

  []
end
date() click to toggle source
# File lib/web-page-parser/base_parser.rb, line 47
# Placeholder with no logic of its own: always returns nil.
# NOTE(review): presumably overridden by site-specific parsers - confirm.
def date
  nil
end
guid() click to toggle source
# File lib/web-page-parser/base_parser.rb, line 53
# Returns the guid for this page.
#
# An already-set (truthy) @guid is returned untouched. Otherwise, when a url is
# available, the result of guid_from_url is stored in @guid first. With no url,
# @guid is returned as it stands.
def guid
  @guid = guid_from_url if !@guid && url
  @guid
end
guid_from_url() click to toggle source
# File lib/web-page-parser/base_parser.rb, line 50
# Placeholder with no logic of its own: always returns nil.
# NOTE(review): presumably overridden by parsers that can derive a guid from
# the page url - confirm.
def guid_from_url
  nil
end
hash() click to toggle source

Return a hash (an MD5 hex digest of the title and the joined content) representing the textual content of this web page

# File lib/web-page-parser/base_parser.rb, line 60
# Returns the MD5 hex digest (a String) of the title followed by all content
# entries joined with no separator.
#
# The title and the joined content are fed to the digest as two separate
# updates rather than being concatenated into one string first.
def hash
  Digest::MD5.new
             .update(title.to_s)
             .update(content.join(''))
             .hexdigest
end
page() click to toggle source

Return the page contents, retrieving them from the server if necessary.

# File lib/web-page-parser/base_parser.rb, line 26
# Returns the page contents.
#
# A truthy @page is returned as is. Otherwise the result of retrieve_page is
# stored in @page and returned.
def page
  return @page if @page

  @page = retrieve_page
end
retrieve_page(rurl = nil) click to toggle source

Request the page from the server and return the raw contents.

# File lib/web-page-parser/base_parser.rb, line 31
# Requests a page through the session stored on this object's class.
#
# rurl - address to fetch; falls back to url when nil or false.
#
# Returns nil without touching the session when neither rurl nor url is set.
# Otherwise returns whatever the session's get call returns.
def retrieve_page(rurl = nil)
  target = rurl || url
  return nil unless target

  # filter_url is optional: it is only applied when this object responds to it.
  target = filter_url(target) if respond_to?(:filter_url)

  # The session lives on the class and is created on first use, then reused.
  session = (self.class.retrieve_session ||= WebPageParser::HTTP::Session.new)
  session.get(target)
end
title() click to toggle source
# File lib/web-page-parser/base_parser.rb, line 39
# Returns the current value of @title. Nothing in this method assigns it, so
# the result is nil unless @title has been set elsewhere.
def title
  @title
end