class Tspider::Page

Attributes

doc[R]
headers[R]
html[R]
location[R]
response[R]
response_time[R]
status[R]
url[R]

Public Class Methods

new(url,attrs) click to toggle source
# File lib/tspider/page.rb, line 10
# Fetches +url+ through Client.get and stores the response details on the
# instance.
#
# url   - address to request; also parsed with URI() into @uri.
# attrs - options Hash. :user_agent overrides ::Tspider::UA::DEFAULT, and
#         :headers (optional) holds extra request headers. The hash is
#         forwarded to Client.get with the merged :headers entry.
#
# Sets @response, @response_time (seconds between the two Time.now
# samples), @status (Integer), @html, @doc, @location and @headers.
def initialize(url, attrs)
  @url = url
  @uri = URI(@url)
  @user_agent = attrs[:user_agent] || ::Tspider::UA::DEFAULT
  @webrobots = WebRobots.new(@user_agent)
  @debug = false

  # Layer User-Agent on top of any caller-supplied headers rather than
  # replacing the whole :headers entry, so custom headers are still sent.
  request_headers = (attrs[:headers] || {}).merge("User-Agent" => @user_agent)

  time_start = Time.now
  r = Client.get(@url, attrs.merge(:headers => request_headers))
  time_end = Time.now
  @response = r
  @response_time = time_end - time_start
  @status = r.response.code.to_i

  # encode! works in place on the body string returned by the client.
  # NOTE(review): whether a UTF-8 -> UTF-8 transcode actually replaces
  # invalid bytes depends on the Ruby version - confirm on the target Ruby.
  @html = r.body.encode!('UTF-8', 'UTF-8', :invalid => :replace)
  @doc = Nokogiri::HTML(@html)
  @location = r.headers['location']
  @headers = r.headers.to_hash
end

Public Instance Methods

canonical() click to toggle source
# File lib/tspider/page.rb, line 91
# Queries the document for the canonical <link> element through safe_search,
# asking for the 'href' entry of match 0.
# NOTE(review): the return value for a page without a canonical link depends
# on safe_search, which is defined elsewhere - confirm.
def canonical
  selector = 'link[@rel="canonical"]'
  path = [0, 'href']
  safe_search(selector, path)
end
meta(name) click to toggle source
# File lib/tspider/page.rb, line 64
# Looks up an arbitrary <meta name="..."> element through safe_search,
# asking for the 'content' entry of match 0.
#
# name - value of the meta element's name attribute (e.g. "description",
#        "og:title").
#
# The attribute value is quoted in the selector, matching the sibling
# meta_* methods, so names that are not bare identifiers (containing ':'
# or '.') still form a valid selector.
def meta(name)
  safe_search("meta[@name=\"#{name}\"]", [0, 'content'])
end
meta_description() click to toggle source
# File lib/tspider/page.rb, line 52
# Queries the document for <meta name="description"> through safe_search,
# asking for the 'content' entry of match 0.
def meta_description
  selector = 'meta[@name="description"]'
  path = [0, 'content']
  safe_search(selector, path)
end
meta_keywords() click to toggle source
# File lib/tspider/page.rb, line 56
# Queries the document for <meta name="keywords"> through safe_search,
# asking for the 'content' entry of match 0.
def meta_keywords
  selector = 'meta[@name="keywords"]'
  path = [0, 'content']
  safe_search(selector, path)
end
meta_robots() click to toggle source
# File lib/tspider/page.rb, line 60
# Queries the document for <meta name="robots"> through safe_search,
# asking for the 'content' entry of match 0.
def meta_robots
  selector = 'meta[@name="robots"]'
  path = [0, 'content']
  safe_search(selector, path)
end
opf() click to toggle source
# File lib/tspider/page.rb, line 30
# Builds a Hash summary of the fetched page.
#
# Request-level values (:url, :status, :location, :response_time, :headers)
# come straight from instance variables; the remaining keys hold the result
# of calling the accessor method of the same name.
#
# Returns a Hash with 14 symbol keys.
def opf
  {
    :url => @url,
    :status => @status,
    # Instance variable names are case-sensitive: the redirect target is
    # stored in @location, so that is the one to read here.
    :location => @location,
    :response_time => @response_time,
    :canonical => canonical,
    :title => title,
    :meta_keywords => meta_keywords,
    :meta_description => meta_description,
    :meta_robots => meta_robots,
    :h1 => h1,
    :h2 => h2,
    :h3 => h3,
    :links => links,
    :headers => @headers
  }
end
response_header(key) click to toggle source
# File lib/tspider/page.rb, line 68
# Returns the values stored under +key+ in @headers joined with '|'.
#
# key - header name exactly as it appears as a key in @headers.
#       NOTE(review): lookup is an exact Hash match; whether keys are
#       lower-cased depends on how the client builds the hash - confirm.
#
# Returns a String, or nil when the header is not present (previously this
# raised NoMethodError on nil).
def response_header(key)
  values = @headers[key]
  return if values.nil?

  # Array() lets a single scalar value be treated like a one-element list.
  Array(values).join('|')
end
title() click to toggle source
# File lib/tspider/page.rb, line 48
# Queries the document for the <title> element through safe_search, asking
# for match 0.
# NOTE(review): the meaning of the third argument (true) is defined by
# safe_search, which is not visible here - confirm.
def title
  selector = 'title'
  path = [0]
  safe_search(selector, path, true)
end