class Tspider::Page
Attributes
doc[R]
headers[R]
html[R]
location[R]
response[R]
response_time[R]
status[R]
url[R]
Public Class Methods
new(url,attrs)
click to toggle source
# File lib/tspider/page.rb, line 10
# Requests +url+ through Client.get and stores the response, how long the
# request took, and the parsed document on the new instance.
#
# url   - address to request; also parsed with URI() into @uri.
# attrs - options hash. attrs[:user_agent] overrides ::Tspider::UA::DEFAULT.
#         The hash is forwarded to Client.get with its :headers entry
#         extended by a "User-Agent" header. Headers supplied by the caller
#         are preserved (previously they were discarded); only the
#         "User-Agent" entry is forced.
def initialize(url, attrs)
  @url = url
  @uri = URI(@url)
  @user_agent = attrs[:user_agent] || ::Tspider::UA::DEFAULT
  @webrobots = WebRobots.new(@user_agent)
  @debug = false

  # Merge into the caller's headers instead of replacing them wholesale.
  request_headers = (attrs[:headers] || {}).merge("User-Agent" => @user_agent)

  time_start = Time.now
  r = Client.get(@url, attrs.merge(:headers => request_headers))
  time_end = Time.now

  @response = r
  @response_time = time_end - time_start # elapsed seconds around Client.get
  @status = r.response.code.to_i
  # Re-encodes the body in place as UTF-8 with :invalid => :replace.
  @html = r.body.encode!('UTF-8', 'UTF-8', :invalid => :replace)
  @doc = Nokogiri::HTML(@html)
  @location = r.headers['location']
  @headers = r.headers.to_hash
end
Public Instance Methods
canonical()
click to toggle source
# File lib/tspider/page.rb, line 91
# Returns the 'href' of the first element matching link[@rel="canonical"],
# as resolved by safe_search (nil when safe_search finds nothing).
def canonical
  selector = 'link[@rel="canonical"]'
  path = [0, 'href']
  safe_search(selector, path)
end
links()
click to toggle source
# File lib/tspider/page.rb, line 72
# Describes every <a> element found by safe_search('a').
#
# Returns an Array of Hashes with the keys:
#   :href     - the raw 'href' attribute (may be nil)
#   :text     - the element's content
#   :rel      - the raw 'rel' attribute (may be nil)
#   :disallow - result of @webrobots.disallowed? for links whose host equals
#               the page's host; nil for links on another host and for
#               hrefs that cannot be parsed as a URI.
def links
  # URI.escape was removed in Ruby 3.0; the RFC 2396 parser provides the
  # same escaping. Built once per call rather than once per link.
  escaper = URI::RFC2396_Parser.new

  safe_search('a').map do |a|
    href = a['href']
    disallow =
      begin
        target = @uri.merge(escaper.escape(href.to_s))
        target.host == @uri.host ? @webrobots.disallowed?(target.to_s) : nil
      rescue URI::InvalidURIError
        # One malformed href must not abort the whole listing.
        nil
      end

    {href: href, text: a.content, rel: a['rel'], disallow: disallow}
  end
end
meta(name)
click to toggle source
# File lib/tspider/page.rb, line 64
# Returns the 'content' of the first <meta> element whose name attribute
# equals +name+, as resolved by safe_search (nil when nothing matches).
#
# name - the meta name to look up; interpolated into the selector.
#
# The attribute value is quoted, matching the selectors used by
# meta_description, meta_keywords and meta_robots, so names that are not
# plain identifiers (e.g. "og:title") form a valid selector.
def meta(name)
  safe_search(%(meta[@name="#{name}"]), [0, 'content'])
end
meta_description()
click to toggle source
# File lib/tspider/page.rb, line 52
# Returns the 'content' of the first element matching
# meta[@name="description"], as resolved by safe_search.
def meta_description
  selector = 'meta[@name="description"]'
  path = [0, 'content']
  safe_search(selector, path)
end
meta_keywords()
click to toggle source
# File lib/tspider/page.rb, line 56
# Returns the 'content' of the first element matching
# meta[@name="keywords"], as resolved by safe_search.
def meta_keywords
  selector = 'meta[@name="keywords"]'
  path = [0, 'content']
  safe_search(selector, path)
end
meta_robots()
click to toggle source
# File lib/tspider/page.rb, line 60
# Returns the 'content' of the first element matching
# meta[@name="robots"], as resolved by safe_search.
def meta_robots
  selector = 'meta[@name="robots"]'
  path = [0, 'content']
  safe_search(selector, path)
end
opf()
click to toggle source
# File lib/tspider/page.rb, line 30
# Builds a Hash summarising the page: the stored request data (@url,
# @status, @location, @response_time, @headers) together with the values
# returned by canonical, title, the meta_* readers, h1/h2/h3 and links.
#
# NOTE: h1, h2 and h3 are called here but are not defined in the visible
# part of this file.
def opf
  {
    :url => @url,
    :status => @status,
    # Was @Location (capital L), an ivar that is never assigned, so this
    # entry was always nil.
    :location => @location,
    :response_time => @response_time,
    :canonical => canonical,
    :title => title,
    :meta_keywords => meta_keywords,
    :meta_description => meta_description,
    :meta_robots => meta_robots,
    :h1 => h1,
    :h2 => h2,
    :h3 => h3,
    :links => links,
    :headers => @headers
  }
end
response_header(key)
click to toggle source
# File lib/tspider/page.rb, line 68
# Returns the values stored in @headers under +key+, joined with '|'.
#
# key - the header name exactly as it is stored in @headers.
#
# Returns nil when @headers has no entry for +key+ (previously this raised
# NoMethodError on nil). A single non-array value is returned as-is.
def response_header(key)
  values = @headers[key]
  return nil if values.nil?

  Array(values).join('|')
end
safe_search(search_value, select_path=[], return_content=false)
click to toggle source
# File lib/tspider/page.rb, line 105
# Runs @doc.search(search_value) and then walks +select_path+ through the
# result with successive [] lookups.
#
# search_value   - selector passed to @doc.search.
# select_path    - keys applied one after another via [] (default: none,
#                  which returns the raw search result).
# return_content - when true, returns value.content instead of the value.
#
# Returns nil as soon as an intermediate value cannot be indexed (e.g. it
# is nil because nothing matched) or when the final value is nil.
#
# The path walk uses an explicit respond_to? guard rather than rescuing
# NoMethodError, so a genuine NoMethodError raised inside a [] call is no
# longer swallowed. The search result is only read, never mutated, so it
# is not duplicated first.
def safe_search(search_value, select_path=[], return_content=false)
  value = @doc.search(search_value)

  select_path.each do |key|
    return nil unless value.respond_to?(:[])

    value = value[key]
  end

  return nil if value.nil?

  return_content ? value.content : value
end
title()
click to toggle source
# File lib/tspider/page.rb, line 48
# Returns the content of the first element matching 'title', as resolved
# by safe_search with return_content enabled (nil when nothing matches).
def title
  first_match = [0]
  safe_search('title', first_match, true)
end