class Polipus::Robotex
Original code taken from github.com/chriskite/robotex/blob/master/lib/robotex.rb
Constants
- DEFAULT_TIMEOUT
- VERSION
Attributes
- timeout [W]
- user_agent [R]
Public Class Methods
get_robots_txt(uri, user_agent)
  # File lib/polipus/robotex.rb, line 95
  def self.get_robots_txt(uri, user_agent)
    Timeout.timeout(Robotex.timeout) do
      URI.join(uri.to_s, '/robots.txt').open('User-Agent' => user_agent) rescue nil
    end
  rescue Timeout::Error
    STDERR.puts 'robots.txt request timed out'
  end
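A minimal usage sketch (the host and User-Agent string are hypothetical). The method returns the IO object yielded by URI#open, or nil when the request fails or times out:

  require 'polipus/robotex'

  # Hypothetical site and crawler name; returns an IO-like object or nil.
  io = Polipus::Robotex.get_robots_txt(URI('http://example.com/'), 'MyCrawler/1.0')
  puts io.read if io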
new(user_agent = nil)
  # File lib/polipus/robotex.rb, line 111
  def initialize(user_agent = nil)
    user_agent = "Robotex/#{VERSION} (http://www.github.com/chriskite/robotex)" if user_agent.nil?
    @user_agent = user_agent
    @last_accessed = Time.at(1)
    @parsed = {}
  end
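For example (the crawler name is arbitrary); when no argument is given, the default Robotex User-Agent shown above is used:

  robotex = Polipus::Robotex.new('MyCrawler/1.0')
  robotex.user_agent # => "MyCrawler/1.0"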
timeout()
  # File lib/polipus/robotex.rb, line 107
  def self.timeout
    @timeout || DEFAULT_TIMEOUT
  end
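A sketch of overriding the default, assuming the timeout [W] attribute listed above is a class-level writer paired with this class-method reader:

  Polipus::Robotex.timeout     # => DEFAULT_TIMEOUT until a value is assigned
  Polipus::Robotex.timeout = 5
  Polipus::Robotex.timeout     # => 5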
Public Instance Methods
allowed?(uri)
Download the server’s robots.txt, and return true if we are allowed to access the URI, false otherwise
  # File lib/polipus/robotex.rb, line 126
  def allowed?(uri)
    parse_host(uri).allowed?(uri, @user_agent)
  end
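A usage sketch (the URL is hypothetical); the first call for a given host downloads and caches that host's robots.txt via parse_host:

  robotex = Polipus::Robotex.new('MyCrawler/1.0')
  if robotex.allowed?('http://example.com/some/page')
    # safe to fetch the page
  end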
delay(uri)
Return the value of the Crawl-Delay directive, or nil if none
  # File lib/polipus/robotex.rb, line 132
  def delay(uri)
    parse_host(uri).delay(@user_agent)
  end
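Reusing the robotex instance from the example above (hypothetical host; the exact return value depends on how ParsedRobots parses the directive):

  robotex.delay('http://example.com/') # => 10 if robots.txt sets "Crawl-Delay: 10", else nil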
delay!(uri)
Sleep for the amount of time necessary to obey the Crawl-Delay specified by the server
  # File lib/polipus/robotex.rb, line 139
  def delay!(uri)
    delay = delay(uri)
    sleep delay - (Time.now - @last_accessed) if delay
    @last_accessed = Time.now
  end
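A sketch of a polite crawl loop built from these methods (URLs hypothetical). Note that delay! sleeps only for whatever portion of the Crawl-Delay has not already elapsed since the last access, then records the access time:

  urls = ['http://example.com/a', 'http://example.com/b'] # hypothetical
  urls.each do |url|
    next unless robotex.allowed?(url)
    robotex.delay!(url) # blocks until the Crawl-Delay is satisfied
    # ... fetch url here ...
  end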
parse_host(uri)
  # File lib/polipus/robotex.rb, line 118
  def parse_host(uri)
    uri = URI.parse(uri.to_s) unless uri.is_a?(URI)
    @parsed[uri.host] ||= ParsedRobots.new(uri, @user_agent)
  end
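Because the result is memoized per host in @parsed, each Robotex instance downloads and parses a given host's robots.txt at most once; a sketch:

  robotex.allowed?('http://example.com/a') # downloads and parses robots.txt
  robotex.allowed?('http://example.com/b') # answered from the per-host cache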