class Digger::Page
Attributes

body
  The raw HTTP response body of the page

code
  Integer response code of the page

depth
  Depth of this page from the root of the crawl

error
  Exception object, if one was raised during HTTP#fetch_page

headers
  Headers of the HTTP response

redirect_to
  URL of the page this one redirected to, if any

referer
  URL of the page that brought us to this page

response_time
  Response time of the request for this page in milliseconds

storable
  Whether the current page should be stored. Default: true

user_data
  OpenStruct that holds user-defined data
Public Class Methods
# File lib/digger/page.rb, line 255
def self.from_hash(hash)
  page = new(URI(hash['url']))
  {
    '@headers'       => hash['headers'] && !hash['headers'].empty? ? Marshal.load(hash['headers']) : { 'content-type' => [''] },
    '@body'          => hash['body'],
    '@links'         => hash['links'] ? hash['links'].map { |link| URI(link) } : [],
    '@code'          => hash['code'].to_i,
    '@depth'         => hash['depth'].to_i,
    '@referer'       => hash['referer'],
    '@redirect_to'   => (hash['redirect_to'] && !hash['redirect_to'].empty?) ? URI(hash['redirect_to']) : nil,
    '@response_time' => hash['response_time'].to_i,
    '@fetched'       => hash['fetched'],
    '@user_data'     => hash['user_data'] ? OpenStruct.new(hash['user_data']) : nil,
    '@fetched_at'    => hash['fetched_at'],
    '@error'         => hash['error']
  }.each do |var, value|
    page.instance_variable_set(var, value)
  end
  page
end
# File lib/digger/page.rb, line 276
def self.from_json(json)
  hash = JSON.parse json
  from_hash hash
end
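Together with to_hash/to_json below, these constructors let a page round-trip through a cache or a queue. A minimal sketch, assuming a fetched page (the URL, headers, and body are made up):

require 'digger'

page = Digger::Page.new('http://example.com/',
                        code: 200,
                        headers: { 'content-type' => ['text/html'] },
                        body: '<html><body>hi</body></html>')

copy = Digger::Page.from_hash(page.to_hash)
copy.code  # => 200
copy.body  # => "<html><body>hi</body></html>"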
Create a new page
# File lib/digger/page.rb, line 41
def initialize(url, params = {})
  @url = URI(url)
  @code = params[:code]
  @headers = params[:headers] || {}
  @headers['content-type'] ||= ['']
  @aliases = Array(params[:aka]).compact
  @referer = params[:referer]
  @depth = params[:depth] || 0
  @redirect_to = to_absolute(params[:redirect_to])
  @response_time = params[:response_time]
  @body = params[:body]
  @error = params[:error]
  @fetched = !params[:code].nil?
  @user_data = OpenStruct.new
  @domain_aliases = params[:domain_aliases] ||= []
  @storable = true
  @fetched_at = params[:fetched_at]
end
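A Page is normally constructed by the crawler after a fetch, but it can be built directly. A small sketch with made-up values; passing :code marks the page as fetched:

page = Digger::Page.new('http://example.com/a',
                        code: 200,
                        headers: { 'content-type' => ['text/html'] },
                        body: '<a href="/b">next</a>',
                        depth: 1,
                        referer: 'http://example.com/')

page.fetched?  # => true (a :code was supplied)
page.depth     # => 1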
Public Instance Methods
Base URI from the BASE element in the HTML doc's head (www.w3.org/TR/html4/struct/links.html#edef-BASE)
# File lib/digger/page.rb, line 175
def base
  @base = if doc
            href = doc.search('//head/base/@href')
            URI(href.to_s) unless href.nil? rescue nil
          end unless @base
  return nil if @base && @base.to_s.empty?
  @base
end
The content-type returned by the HTTP request for this page
# File lib/digger/page.rb, line 135
def content_type
  headers['content-type'].first
end
Delete the Nokogiri document and response body to conserve memory
# File lib/digger/page.rb, line 119
def discard_doc!
  links # force parsing of page links before we trash the document
  @doc = @body = nil
end
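In a long-running crawl this keeps memory bounded: take what you need from the page, then drop the parsed document and body. A sketch; process_page and enqueue are hypothetical helpers:

def process_page(page)   # hypothetical callback invoked for each fetched page
  urls = page.links      # links are parsed and cached before the doc is trashed
  page.discard_doc!      # @doc and @body can now be garbage-collected
  urls.each { |u| enqueue(u) }
end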
Discard links; a subsequent call to page.links will return an empty array
# File lib/digger/page.rb, line 112
def discard_links!
  @links = []
end
Nokogiri document for the HTML body
# File lib/digger/page.rb, line 86
def doc
  # return @doc if @doc
  # @body ||= ''
  # @body = @body.encode('utf-8', 'binary', :invalid => :replace,
  #                      :undef => :replace, :replace => '')
  # @doc = Nokogiri::HTML(@body.toutf8, nil, 'utf-8') if @body && html?
  @doc ||= begin
    Nokogiri::HTML(body) if !body.nil? && html?
  rescue
    nil
  end
end
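The return value is an ordinary Nokogiri::HTML document, so any CSS or XPath query applies. A sketch with a made-up page:

page = Digger::Page.new('http://example.com/',
                        code: 200,
                        headers: { 'content-type' => ['text/html'] },
                        body: '<html><head><title>Hi</title></head></html>')

page.doc.at('title').text  # => "Hi"
page.title                 # => "Hi"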
# File lib/digger/page.rb, line 249
def expired?(ttl)
  return false if fetched_at.nil?
  (Time.now.to_i - ttl) > fetched_at
end
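fetched_at is an integer epoch timestamp, so expired?(ttl) asks whether the fetch is more than ttl seconds old. A sketch:

page = Digger::Page.new('http://example.com/',
                        fetched_at: Time.now.to_i - 7200)  # fetched two hours ago

page.expired?(3600)    # => true  (more than an hour old)
page.expired?(86_400)  # => false (less than a day old)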
Was the page successfully fetched? true if the page was fetched with no error, false otherwise.
# File lib/digger/page.rb, line 128
def fetched?
  @fetched
end
Returns true if the page is an HTML document, returns false otherwise.
# File lib/digger/page.rb, line 143
def html?
  content_type =~ %r{^(text/html|application/xhtml\+xml)\b}
end
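The check is purely header-based; non-HTML bodies are never handed to Nokogiri. A sketch with a made-up JSON response:

page = Digger::Page.new('http://example.com/data',
                        code: 200,
                        headers: { 'content-type' => ['application/json'] },
                        body: '{"ok":true}')

page.content_type  # => "application/json"
page.html?         # => nil (falsy), so page.doc is nil too
page.json['ok']    # => true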
Returns true if uri is in the same domain as the page, returns false otherwise.
# File lib/digger/page.rb, line 210
def in_domain?(uri)
  @domain_aliases ||= []
  uri.host == @url.host || @domain_aliases.include?(uri.host)
end
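The :domain_aliases option to new lets several hosts count as one site, e.g. a bare domain and its www form. A sketch:

page = Digger::Page.new('http://example.com/',
                        domain_aliases: ['www.example.com'])

page.in_domain?(URI('http://example.com/about'))   # => true
page.in_domain?(URI('http://www.example.com/'))    # => true  (alias)
page.in_domain?(URI('http://other.example.org/'))  # => false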
# File lib/digger/page.rb, line 97
def json
  @json ||= JSON.parse body
end
# File lib/digger/page.rb, line 101
def jsonp
  @jsonp ||= JSON.parse body.match(/^[^(]+?\((.+)\)[^)]*$/)[1]
end
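jsonp strips the callback wrapper before parsing the payload. A sketch with a made-up JSONP response:

page = Digger::Page.new('http://example.com/api?callback=cb',
                        code: 200,
                        body: 'cb({"name":"digger"});')

page.jsonp['name']  # => "digger"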
Array of distinct A tag HREFs from the page
# File lib/digger/page.rb, line 67
def links
  if @links.nil?
    @links = Set.new
    return [] unless doc

    doc.search('//a[@href]').each do |a|
      u = a['href']
      next if u.nil? || u.empty?
      abs = to_absolute(u) rescue next
      @links << abs if abs && in_domain?(abs)
    end
  end
  @links.to_a
end
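links is computed lazily from the document and cached; hrefs are made absolute and off-domain URLs are dropped. A sketch:

page = Digger::Page.new('http://example.com/',
                        code: 200,
                        headers: { 'content-type' => ['text/html'] },
                        body: '<a href="/a">a</a> <a href="http://elsewhere.org/">x</a>')

page.links.map(&:to_s)  # => ["http://example.com/a"] (the off-domain link is dropped)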
Returns true if the page was not found (returned a 404 code), returns false otherwise.
# File lib/digger/page.rb, line 167
def not_found?
  @code == 404
end
Returns true if the page is an HTTP redirect, returns false otherwise.
# File lib/digger/page.rb, line 151
def redirect?
  (300...400).include?(@code)
end
Returns true if the page is marked as storable, false otherwise. Default is true.
# File lib/digger/page.rb, line 245
def storable?
  @storable
end
Returns true if the page is an HTTP success, returns false otherwise.
# File lib/digger/page.rb, line 159
def success?
  (200..206).include?(@code)
end
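The status predicates make per-page dispatch in a crawl loop straightforward. A sketch; route, requeue, handle, and drop are hypothetical helpers:

def route(page)                # hypothetical dispatcher
  if page.redirect?
    requeue(page.redirect_to)  # follow a 3xx to its target
  elsif page.success?
    handle(page)               # 2xx: process normally
  elsif page.not_found?
    drop(page)                 # 404: give up on this URL
  end
end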
# File lib/digger/page.rb, line 60
def title
  doc&.title
end
Converts a relative URL into an absolute URL based on the location of the page
# File lib/digger/page.rb, line 190
def to_absolute(link)
  return nil if link.nil?

  link = link.to_s.encode('utf-8', 'binary',
                          invalid: :replace, undef: :replace, replace: '')
             .gsub(/#[\w]*$/, '')
  relative = begin
    URI(link)
  rescue URI::Error
    return nil
  end
  absolute = base ? base.merge(relative) : @url.merge(relative)
  absolute.path = '/' if absolute.path.empty?
  absolute
end
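Resolution honours the page's BASE element when one exists, and trailing fragments are stripped. A sketch:

page = Digger::Page.new('http://example.com/blog/post')

page.to_absolute('../about')          # => URI for http://example.com/about
page.to_absolute('/faq#top')          # => URI for http://example.com/faq (fragment stripped)
page.to_absolute('http://other.org')  # => URI for http://other.org/ (empty path becomes "/")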
# File lib/digger/page.rb, line 215
def to_hash
  {
    'url' => @url.to_s,
    'headers' => Marshal.dump(@headers),
    'body' => @body,
    'links' => links.map(&:to_s),
    'code' => @code,
    'depth' => @depth,
    'referer' => @referer.to_s,
    'redirect_to' => @redirect_to.to_s,
    'response_time' => @response_time,
    'fetched' => @fetched,
    'user_data' => @user_data.nil? ? {} : @user_data.marshal_dump,
    'fetched_at' => @fetched_at,
    'error' => @error.to_s
  }
end
# File lib/digger/page.rb, line 233
def to_json
  th = to_hash.dup
  th.each { |k, v| th.delete(k) if v.nil? || (v.respond_to?(:empty?) && v.empty?) }
  th.delete('headers') if content_type.empty?
  th.to_json
end
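Because nil and empty values are pruned, an unfetched page serializes very compactly; a sketch of what remains:

require 'json'

page = Digger::Page.new('http://example.com/')

JSON.parse(page.to_json)
# => {"url"=>"http://example.com/", "depth"=>0, "fetched"=>false}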