class Polipus::Robotex

Original code taken from github.com/chriskite/robotex/blob/master/lib/robotex.rb

Constants

DEFAULT_TIMEOUT
VERSION

Attributes

timeout[W]
user_agent[R]

Public Class Methods

get_robots_txt(uri, user_agent)
# File lib/polipus/robotex.rb, line 95
def self.get_robots_txt(uri, user_agent)
  Timeout.timeout(Robotex.timeout) do
    URI.join(uri.to_s, '/robots.txt').open('User-Agent' => user_agent) rescue nil
  end
rescue Timeout::Error
  STDERR.puts 'robots.txt request timed out'
end
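
This class method fetches /robots.txt for the given URI's host via open-uri, returning an IO-like object, or nil when the request fails or times out. A minimal usage sketch; the URI and user-agent string are illustrative:

io = Polipus::Robotex.get_robots_txt(URI('http://example.com/'), 'MyCrawler/1.0')
body = io ? io.read : nil   # io is nil when the fetch failed or the request timed out
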
new(user_agent = nil)
# File lib/polipus/robotex.rb, line 111
def initialize(user_agent = nil)
  user_agent = "Robotex/#{VERSION} (http://www.github.com/chriskite/robotex)" if user_agent.nil?
  @user_agent = user_agent
  @last_accessed = Time.at(1)
  @parsed = {}
end
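
The constructor takes an optional user-agent string, falling back to the Robotex/VERSION default when none is given. A sketch with an illustrative user agent:

robotex = Polipus::Robotex.new('MyCrawler/1.0 (+http://example.com/bot)')
robotex.user_agent # => "MyCrawler/1.0 (+http://example.com/bot)"

# With no argument, the default is used:
Polipus::Robotex.new.user_agent # => "Robotex/... (http://www.github.com/chriskite/robotex)"
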
timeout()
# File lib/polipus/robotex.rb, line 107
def self.timeout
  @timeout || DEFAULT_TIMEOUT
end
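
This reader falls back to DEFAULT_TIMEOUT when no timeout has been set; together with the timeout[W] class-level writer it can be tuned before crawling. For example:

Polipus::Robotex.timeout = 5   # seconds; DEFAULT_TIMEOUT applies when left unset
Polipus::Robotex.timeout       # => 5
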

Public Instance Methods

allowed?(uri)

Download the server’s robots.txt, and return true if we are allowed to access the URL, false otherwise

# File lib/polipus/robotex.rb, line 126
def allowed?(uri)
  parse_host(uri).allowed?(uri, @user_agent)
end
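
A usage sketch; the URL is illustrative:

robotex = Polipus::Robotex.new('MyCrawler/1.0')
robotex.allowed?('http://example.com/private/page.html') # => true or false, per the host's robots.txt
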
delay(uri)

Return the value of the Crawl-Delay directive, or nil if none is specified

# File lib/polipus/robotex.rb, line 132
def delay(uri)
  parse_host(uri).delay(@user_agent)
end
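
A sketch showing how to honour the delay manually (the URL is illustrative):

robotex = Polipus::Robotex.new('MyCrawler/1.0')
delay = robotex.delay('http://example.com/')
sleep(delay) if delay   # delay is nil when the server sets no Crawl-Delay
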
delay!(uri)

Sleep for the amount of time necessary to obey the Crawl-Delay specified by the server

# File lib/polipus/robotex.rb, line 139
def delay!(uri)
  delay = delay(uri)
  sleep delay - (Time.now - @last_accessed) if delay
  @last_accessed = Time.now
end
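
Note that the sleep covers only the portion of the Crawl-Delay remaining since the last access, which delay! tracks internally. A crawl-loop sketch; the URLs are illustrative and fetching itself is outside this class:

robotex = Polipus::Robotex.new('MyCrawler/1.0')
%w[http://example.com/a http://example.com/b].each do |url|
  next unless robotex.allowed?(url)
  robotex.delay!(url)   # waits out any remaining Crawl-Delay since the last access
  # ... fetch the page here ...
end
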
parse_host(uri)
# File lib/polipus/robotex.rb, line 118
def parse_host(uri)
  uri = URI.parse(uri.to_s) unless uri.is_a?(URI)
  @parsed[uri.host] ||= ParsedRobots.new(uri, @user_agent)
end
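
parse_host caches one ParsedRobots instance per host, so robots.txt is downloaded at most once per host. A sketch (the URLs are illustrative):

robotex = Polipus::Robotex.new('MyCrawler/1.0')
a = robotex.parse_host('http://example.com/one')
b = robotex.parse_host('http://example.com/two')
a.equal?(b) # => true; both URLs share the cached ParsedRobots for example.com
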