class Polipus::Robotex::ParsedRobots
Public Class Methods
new(uri, user_agent)
click to toggle source
# File lib/polipus/robotex.rb, line 16
#
# Fetches the robots.txt for +uri+ via Robotex.get_robots_txt and parses it
# into three tables keyed by user-agent pattern (a Regexp):
#
#   @allows    - agent pattern => array of allowed path patterns
#   @disallows - agent pattern => array of disallowed path patterns
#   @delays    - agent pattern => integer crawl delay
#
# uri        - location handed to Robotex.get_robots_txt.
# user_agent - user agent handed to Robotex.get_robots_txt.
#
# When the fetch yields nothing, or the response is not a "200 OK"
# text/plain document, an allow-everything robots.txt is parsed instead.
def initialize(uri, user_agent)
  io = Robotex.get_robots_txt(uri, user_agent)

  if !io || io.content_type != 'text/plain' || io.status != %w(200 OK)
    io = StringIO.new("User-agent: *\nAllow: /\n")
  end

  @disallows = {}
  @allows = {}
  @delays = {}

  # Rules that appear before any User-agent line apply to every agent.
  agent = /.*/
  io.each do |line|
    # Everything from '#' to the end of the line is a comment; removing it
    # also turns comment-only lines into blank ones.
    directive = line.sub(/#.*/, '')

    # Split on the first colon only so values keep their own colons.
    key, value = directive.split(':', 2)
    # Blank lines and lines without a "key: value" shape carry no rule.
    next unless key && value

    # Directives may be indented or have spaces before the colon.
    key = key.strip.downcase
    value = value.strip

    case key
    when 'user-agent'
      agent = to_regex(value)
    when 'allow'
      (@allows[agent] ||= []) << to_regex(value) unless value.empty?
    when 'disallow'
      # An empty Disallow means "nothing is disallowed", so no rule is stored.
      (@disallows[agent] ||= []) << to_regex(value) unless value.empty?
    when 'crawl-delay'
      @delays[agent] = value.to_i
    end
  end

  @parsed = true
end
Public Instance Methods
allowed?(uri, user_agent)
click to toggle source
# File lib/polipus/robotex.rb, line 52
#
# Decides whether +user_agent+ may fetch +uri+ according to the parsed rules.
#
# uri        - a URI, or anything whose #to_s can be parsed by URI.parse.
# user_agent - the user agent string matched against each agent pattern.
#
# Rules of every agent pattern that matches +user_agent+ are considered. The
# most specific matching rule decides; when an Allow and a Disallow rule are
# equally specific, Allow wins. A path that no Disallow rule matches is
# allowed. Specificity is approximated by the length of the compiled
# pattern's source.
#
# Returns true when nothing has been parsed yet (@parsed is unset).
def allowed?(uri, user_agent)
  return true unless @parsed

  uri = URI.parse(uri.to_s) unless uri.is_a?(URI)
  path = uri.request_uri

  # Length of the most specific rule in +rules_by_agent+ that matches both
  # the user agent and the path, or -1 when no rule matches at all.
  best_match = lambda do |rules_by_agent|
    lengths = rules_by_agent.flat_map do |agent, rules|
      next [] unless user_agent =~ agent

      rules.select { |rule| path =~ rule }.map { |rule| rule.source.length }
    end
    lengths.max || -1
  end

  # With no matching Disallow this is always true; otherwise an Allow must be
  # at least as specific as the strongest Disallow to override it.
  best_match.call(@allows) >= best_match.call(@disallows)
end
delay(user_agent)
click to toggle source
# File lib/polipus/robotex.rb, line 79
#
# Looks up the crawl delay recorded for +user_agent+.
#
# user_agent - the user agent string tested against each agent pattern.
#
# Returns the value stored in @delays for the first agent pattern (in
# insertion order) that matches +user_agent+, or nil when none matches.
def delay(user_agent)
  _pattern, value = @delays.find { |pattern, _| pattern =~ user_agent }
  value
end
Protected Instance Methods
to_regex(pattern)
click to toggle source
# File lib/polipus/robotex.rb, line 88
#
# Compiles a robots.txt value (path pattern or user-agent name) to a Regexp.
#
# pattern - the raw String taken from a robots.txt directive.
#
# The result is anchored at the start of the string, so it behaves as a
# prefix match. Two characters are special, everything else is literal:
#
#   *  matches any run of characters
#   $  as the last character, anchors the pattern at the end of the string
#
# Returns a Regexp.
def to_regex(pattern)
  # A trailing '$' is the robots.txt end-of-URL marker, not a literal dollar.
  end_anchored = pattern.end_with?('$')
  body = end_anchored ? pattern[0...-1] : pattern

  # Escape everything, then turn the (now escaped) '*' back into a wildcard.
  source = Regexp.escape(body).gsub(Regexp.escape('*'), '.*')

  # \A / \z anchor to the whole string; ^ / $ would anchor per line.
  Regexp.compile("\\A#{source}#{'\\z' if end_anchored}")
end