class Sunbro::Page
Attributes
The raw HTTP
response body of the page
Integer response code of the page
Depth of this page from the root of the crawl. This is not necessarily the shortest path; use PageStore#shortest_paths! to find that value.
Exception object, if one was raised during HTTP#fetch_page
Headers of the HTTP
response
URL of the page this one redirected to, if any
URL of the page that brought us to this page
Response time of the request for this page in milliseconds
The URL of the page
Boolean indicating whether or not this page has been visited in PageStore#shortest_paths!
Public Class Methods
# File lib/sunbro/page.rb, line 210 def self.from_hash(hash) page = self.new(URI(hash['url'])) {'@headers' => JSON.load(hash['headers']), '@body' => hash['body'], '@code' => hash['code'].to_i, '@error' => hash['error'], '@visited' => hash['visited'], '@referer' => hash['referer'], '@redirect_to' => (hash['redirect_to'].present?) ? URI(hash['redirect_to']) : nil, '@redirect_from' => (hash['redirect_from'].present?) ? URI(hash['redirect_from']) : nil, '@response_time' => hash['response_time'].to_i, '@fetched' => hash['fetched'] }.each do |var, value| page.instance_variable_set(var, value) end page end
Create a new page
# File lib/sunbro/page.rb, line 32 def initialize(url, params = {}) @url = url @code = params[:code] @headers = params[:headers] || {} @headers['content-type'] ||= [''] @aliases = Array(params[:aka]).compact @referer = params[:referer] @depth = params[:depth] || 0 @redirect_to = to_absolute(params[:redirect_to]) @response_time = params[:response_time] @error = params[:error] @fetched = !params[:code].nil? @force_format = params[:force_format] @body = params[:body] @redirect_from = params[:redirect_from] end
Public Instance Methods
Base URI from the HTML doc head element www.w3.org/TR/html4/struct/links.html#edef-BASE
# File lib/sunbro/page.rb, line 149 def base @base = if doc href = doc.search('//head/base/@href') URI(href.to_s) unless href.nil? rescue nil end unless @base return nil if @base && @base.to_s().empty? @base end
The content-type returned by the HTTP
request for this page
# File lib/sunbro/page.rb, line 100 def content_type return headers['content-type'].first if headers['content-type'].first.present? headers['content_type'] end
Delete the Nokogiri document and response body to conserve memory
# File lib/sunbro/page.rb, line 78 def discard_doc! @doc = @body = nil end
Nokogiri document for the HTML body
# File lib/sunbro/page.rb, line 53 def doc @doc ||= begin if image? nil elsif should_parse_as?(:xml) Nokogiri::XML(@body, @url.to_s) elsif should_parse_as?(:html) Nokogiri::HTML(@body, @url.to_s) elsif @body Nokogiri.parse(@body, @url.to_s) end end end
Was the page successfully fetched? true
if the page was fetched with no error, false
otherwise.
# File lib/sunbro/page.rb, line 86 def fetched? @fetched end
Returns true
if the page is a HTML document, returns false
otherwise.
# File lib/sunbro/page.rb, line 117 def html? !!(content_type =~ %r{^(text/html|application/xhtml+xml)\b}) end
Returns true
if the page is an image, returns false
otherwise.
# File lib/sunbro/page.rb, line 109 def image? !!(content_type =~ %r{^(image/)\b}) end
Returns true
if uri is in the same domain as the page, returns false
otherwise
# File lib/sunbro/page.rb, line 182 def in_domain?(uri) uri.host == @url.host end
# File lib/sunbro/page.rb, line 186 def marshal_dump [@url, @headers, @body, @links, @code, @visited, @depth, @referer, @redirect_to, @response_time, @fetched] end
# File lib/sunbro/page.rb, line 190 def marshal_load(ary) @url, @headers, @body, @links, @code, @visited, @depth, @referer, @redirect_to, @response_time, @fetched = ary end
Returns true
if the page was not found (returned 404 code), returns false
otherwise.
# File lib/sunbro/page.rb, line 141 def not_found? 404 == @code end
# File lib/sunbro/page.rb, line 71 def present? !error && code && body.present? && doc end
Returns true
if the page is a HTTP
redirect, returns false
otherwise.
# File lib/sunbro/page.rb, line 133 def redirect? (300..307).include?(@code) end
Converts relative URL link into an absolute URL based on the location of the page
# File lib/sunbro/page.rb, line 164 def to_absolute(link) return nil if link.nil? # remove anchor link = URI.encode(URI.decode(link.to_s.gsub(/#[a-zA-Z0-9_-]*$/,''))) relative = URI(link) absolute = base ? base.merge(relative) : @url.merge(relative) absolute.path = '/' if absolute.path.empty? return absolute end
# File lib/sunbro/page.rb, line 194 def to_hash { 'url' => @url.to_s, 'headers' => headers.to_json, 'body' => @body, 'code' => @code, 'error' => (@error ? @error.to_s : nil), 'visited' => @visited, 'referer' => (@referer ? @referer.to_s : nil), 'redirect_to' => (@redirect_to ? @redirect_to.to_s : nil), 'redirect_from' => (@redirect_from ? @redirect_from.to_s : nil), 'response_time' => @response_time, 'fetched' => @fetched }.reject { |k, v| v.nil? } end
# File lib/sunbro/page.rb, line 67 def valid? (url != "about:blank") && !not_found? && present? end
Returns true
if the page is a XML document, returns false
otherwise.
# File lib/sunbro/page.rb, line 125 def xml? !!(content_type =~ %r{^(text/xml|application/xml)\b}) end
Private Instance Methods
# File lib/sunbro/page.rb, line 230 def cleanup_encoding(source) return source unless source && (html? || xml? || @force_format) text = source.dup text.encode!('UTF-16', 'UTF-8', {:invalid => :replace, :undef => :replace, :replace => '?'}) text.encode('UTF-8', 'UTF-16') end
# File lib/sunbro/page.rb, line 237 def should_parse_as?(format) return false unless @body return @force_format == format if @force_format send("#{format}?") end