class Polipus::Page

Attributes

aliases[RW]
body[R]

The raw HTTP response body of the page

code[RW]

Integer response code of the page

depth[RW]

Depth of this page from the root of the crawl.

domain_aliases[RW]
error[R]

Exception object, if one was raised during HTTP#fetch_page

fetched_at[RW]
headers[R]

Headers of the HTTP response

redirect_to[R]

URL of the page this one redirected to, if any

referer[RW]

URL of the page that brought us to this page

response_time[RW]

Response time of the request for this page in milliseconds

storable[RW]

Whether the current page should be stored. Default: true

url[R]

The URL of the page

user_data[RW]

OpenStruct that holds user-defined data

Public Class Methods

from_hash(hash) click to toggle source
# File lib/polipus/page.rb, line 245
# Rebuilds a page from a String-keyed hash (the same keys #to_hash emits).
#
# hash - Hash with String keys such as 'url', 'headers', 'body', 'links'.
#
# Returns the page created via +new+, with its instance variables
# overwritten from the hash.
#
# NOTE(review): 'headers' is fed to Marshal.load, which can instantiate
# arbitrary objects; only call this with data from a trusted store.
def self.from_hash(hash)
  page = new(URI(hash['url']))

  stored_headers  = hash['headers']
  stored_links    = hash['links']
  stored_redirect = hash['redirect_to']
  stored_user     = hash['user_data']

  headers = if stored_headers && !stored_headers.empty?
              Marshal.load(stored_headers)
            else
              { 'content-type' => [''] }
            end

  links = stored_links ? stored_links.map { |link| URI(link) } : []

  redirect_to = if stored_redirect && !stored_redirect.empty?
                  URI(stored_redirect)
                end

  user_data = stored_user ? OpenStruct.new(stored_user) : nil

  # Pairs are applied in order; numeric fields are coerced with to_i.
  [
    ['@headers',       headers],
    ['@body',          hash['body']],
    ['@links',         links],
    ['@code',          hash['code'].to_i],
    ['@depth',         hash['depth'].to_i],
    ['@referer',       hash['referer']],
    ['@redirect_to',   redirect_to],
    ['@response_time', hash['response_time'].to_i],
    ['@fetched',       hash['fetched']],
    ['@user_data',     user_data],
    ['@fetched_at',    hash['fetched_at']],
    ['@error',         hash['error']]
  ].each do |ivar, value|
    page.instance_variable_set(ivar, value)
  end

  page
end
from_json(json) click to toggle source
# File lib/polipus/page.rb, line 266
# Parses a JSON document and hands the resulting hash to from_hash.
#
# json - String containing the JSON representation of a page.
#
# Returns whatever from_hash returns for the parsed hash.
def self.from_json(json)
  from_hash(JSON.parse(json))
end
new(url, params = {}) click to toggle source

Create a new page

# File lib/polipus/page.rb, line 44
# Create a new page.
#
# url    - String or URI of the page; converted with Kernel#URI.
# params - Hash of optional attributes, read with Symbol keys: :code,
#          :headers, :aka, :referer, :depth, :redirect_to, :response_time,
#          :body, :error, :domain_aliases, :fetched_at.
#
# The params hash itself is only read, never written, so a frozen or shared
# options hash can be passed safely. The :headers hash, when given, is kept
# by reference and receives a default 'content-type' entry if it has none.
def initialize(url, params = {})
  @url = URI(url)
  @code = params[:code]
  @headers = params[:headers] || {}
  # content_type calls .first on this entry, so it must always be present.
  @headers['content-type'] ||= ['']
  @aliases = Array(params[:aka]).compact
  @referer = params[:referer]
  @depth = params[:depth] || 0
  # NOTE(review): this runs before @body is assigned below, so to_absolute
  # is evaluated while the body is still unset - confirm that is intended.
  @redirect_to = to_absolute(params[:redirect_to])
  @response_time = params[:response_time]
  @body = params[:body]
  @error = params[:error]
  # A page counts as fetched as soon as a response code is known.
  @fetched = !params[:code].nil?
  @user_data = OpenStruct.new
  # Plain || instead of ||= so the caller's params hash is not modified.
  @domain_aliases = params[:domain_aliases] || []
  @storable = true
  @fetched_at = params[:fetched_at]
end

Public Instance Methods

base() click to toggle source

Base URI from the HTML doc head element www.w3.org/TR/html4/struct/links.html#edef-BASE

# File lib/polipus/page.rb, line 157
# Base URI declared by the document's <head><base href="..."> element.
#
# The lookup only happens while @base is unset and #doc returns a document;
# the result is then cached in @base. Any StandardError raised while
# turning the href into a URI is swallowed and treated as "no base".
#
# Returns a URI, or nil when there is no document, no usable href, or the
# href is blank.
def base
  @base ||= begin
    if doc
      href = doc.search('//head/base/@href')
      begin
        URI(href.to_s) unless href.nil?
      rescue StandardError
        nil
      end
    end
  end

  # A blank href still yields a URI object; report it as "no base".
  @base.to_s.empty? ? nil : @base
end
content_type() click to toggle source

The content-type returned by the HTTP request for this page

# File lib/polipus/page.rb, line 117
# The content-type returned by the HTTP request for this page.
#
# Returns the first value stored under the 'content-type' header key.
def content_type
  values = headers['content-type']
  values.first
end
discard_doc!() click to toggle source

Delete the Nokogiri document and response body to conserve memory

# File lib/polipus/page.rb, line 101
# Drops the parsed document and the raw response body to conserve memory.
#
# Returns nil.
def discard_doc!
  # Calling links first forces link extraction while the document exists.
  links
  @body = nil
  @doc = nil
end
doc() click to toggle source

Nokogiri document for the HTML body

# File lib/polipus/page.rb, line 83
# Nokogiri document for the HTML body.
#
# The parsed document is cached in @doc. As a side effect @body is replaced
# by a UTF-8 copy (an unset body becomes ''); the body is re-encoded from
# 'binary', and bytes that cannot be converted are dropped.
#
# Returns the cached or newly parsed document, or nil when html? is falsy.
def doc
  return @doc if @doc

  @body ||= ''
  @body = @body.encode('utf-8', 'binary', invalid: :replace,
                                          undef: :replace, replace: '')
  return nil unless @body && html?

  @doc = Nokogiri::HTML(@body.toutf8, nil, 'utf-8')
end
expired?(ttl) click to toggle source
# File lib/polipus/page.rb, line 240
# Tells whether the page was fetched more than +ttl+ seconds ago.
#
# ttl - Numeric age limit, subtracted from the current epoch time.
#
# Returns false when fetched_at is nil; otherwise true only if fetched_at
# lies strictly before (now - ttl).
def expired?(ttl)
  fetched_stamp = fetched_at
  return false if fetched_stamp.nil?

  cutoff = Time.now.to_i - ttl
  cutoff > fetched_stamp
end
fetched?() click to toggle source

Was the page successfully fetched? true if the page was fetched with no error, false otherwise.

# File lib/polipus/page.rb, line 110
# Reports the fetched flag of this page.
#
# Returns the current value of @fetched.
def fetched?
  @fetched
end
html?() click to toggle source

Returns true if the page is a HTML document, returns false otherwise.

# File lib/polipus/page.rb, line 125
# Tells whether the page's content type marks it as an HTML document.
#
# Recognised media types are text/html and application/xhtml+xml, with any
# parameters (e.g. "; charset=utf-8") allowed after the type.
#
# Returns a truthy match index when the content type is HTML, nil otherwise.
def html?
  # The '+' must be escaped: unescaped it is a quantifier on the preceding
  # 'l', so the literal "application/xhtml+xml" would never match.
  content_type =~ %r{^(text/html|application/xhtml\+xml)\b}
end
in_domain?(uri) click to toggle source

Returns true if uri is in the same domain as the page, returns false otherwise

# File lib/polipus/page.rb, line 201
# Tells whether +uri+ belongs to this page's domain.
#
# uri - object responding to #host (e.g. a URI).
#
# Returns true when the host equals the page URL's host or is listed in
# the domain aliases, false otherwise.
def in_domain?(uri)
  @domain_aliases ||= []
  candidate = uri.host
  return true if candidate == @url.host

  @domain_aliases.include?(candidate)
end
not_found?() click to toggle source

Returns true if the page was not found (returned 404 code), returns false otherwise.

# File lib/polipus/page.rb, line 149
# Tells whether the response code of this page is 404.
#
# Returns true for a 404 code, false otherwise.
def not_found?
  @code == 404
end
redirect?() click to toggle source

Returns true if the page is a HTTP redirect, returns false otherwise.

# File lib/polipus/page.rb, line 133
# Tells whether the response code denotes an HTTP redirect.
#
# Returns true when @code lies in 300...400 (400 excluded), false otherwise.
def redirect?
  redirect_codes = 300...400
  redirect_codes.cover?(@code)
end
storable?() click to toggle source

Returns true if the page is marked as storable, false otherwise. Default is true.

# File lib/polipus/page.rb, line 236
# Reports whether this page is marked as storable.
#
# Returns the current value of @storable.
def storable?
  @storable
end
success?() click to toggle source

Returns true if the page is a HTTP success, returns false otherwise.

# File lib/polipus/page.rb, line 141
# Tells whether the response code denotes an HTTP success.
#
# Returns true when @code lies in 200..206 (inclusive), false otherwise.
def success?
  success_codes = 200..206
  success_codes.cover?(@code)
end
to_absolute(link) click to toggle source

Converts relative URL link into an absolute URL based on the location of the page

# File lib/polipus/page.rb, line 171
# Converts a (possibly relative) link into an absolute URI. The link is
# resolved against #base when that returns a URI, otherwise against the
# page URL.
#
# link - String or object responding to #to_s; nil is allowed.
#
# Returns a URI, or nil when link is nil or cannot be parsed as a URI.
def to_absolute(link)
  return nil if link.nil?

  # Re-encode from raw bytes and drop whatever cannot be converted, so the
  # URI routines below are handed a valid UTF-8 string.
  valid_link = link.to_s.encode('utf-8', 'binary', invalid: :replace,
                                                   undef: :replace, replace: '')

  # URI.encode / URI.decode were removed in Ruby 3.0. The RFC 2396 parser
  # provides the same escape/unescape behaviour on old and new Rubies; the
  # dedicated constant only exists in newer versions of the uri library.
  parser = defined?(URI::RFC2396_PARSER) ? URI::RFC2396_PARSER : URI::DEFAULT_PARSER

  # remove anchor, then normalise the percent-encoding
  link =
    begin
      parser.escape(parser.unescape(valid_link.gsub(/#[a-zA-Z0-9_-]*$/, '')))
    rescue URI::Error
      return nil
    end

  relative = begin
               URI(link)
             rescue URI::Error
               return nil
             end
  absolute = base ? base.merge(relative) : @url.merge(relative)

  # Give host-only URLs an explicit root path.
  absolute.path = '/' if absolute.path.empty?

  absolute
end
to_hash() click to toggle source
# File lib/polipus/page.rb, line 206
# Serialises the page into a String-keyed hash.
#
# URIs are stored as strings, the headers as a Marshal dump, and the user
# data as the OpenStruct's marshal_dump (or {} when no user data is set).
#
# Returns the Hash.
def to_hash
  data = {}
  data['url']           = @url.to_s
  data['headers']       = Marshal.dump(@headers)
  data['body']          = @body
  data['links']         = links.map(&:to_s)
  data['code']          = @code
  data['depth']         = @depth
  data['referer']       = @referer.to_s
  data['redirect_to']   = @redirect_to.to_s
  data['response_time'] = @response_time
  data['fetched']       = @fetched
  data['user_data']     = if @user_data.nil?
                            {}
                          else
                            @user_data.marshal_dump
                          end
  data['fetched_at']    = @fetched_at
  data['error']         = @error.to_s
  data
end
to_json() click to toggle source
# File lib/polipus/page.rb, line 224
# Serialises the page to JSON, leaving out blank fields.
#
# Entries of #to_hash whose value is nil or empty are dropped, and the
# 'headers' entry is dropped as well when the content type is blank.
#
# args - optional generator state/options. JSON generators call
#        to_json(state) on nested objects, so the arguments are accepted
#        and forwarded to Hash#to_json.
#
# Returns the JSON String.
def to_json(*args)
  # reject builds a new hash, so nothing is deleted while iterating.
  th = to_hash.reject { |_key, value| value.nil? || (value.respond_to?(:empty?) && value.empty?) }
  th.delete('headers') if content_type.empty?
  th.to_json(*args)
end