class Robotx

Constants

TIMEOUT

Public Class Methods

new(uri, user_agent='*')
# File lib/robotx.rb, line 11
# Sets up a robots.txt checker for the site addressed by +uri+.
#
# uri        - String URL of the site; it must carry both a scheme and a host.
# user_agent - String naming the crawler whose rules are consulted
#              (default '*'). It is stored lower-cased.
#
# Raises URI::InvalidURIError when +uri+ has no scheme or no host.
def initialize(uri, user_agent='*')
  # URI.encode no longer exists as of Ruby 3.0. The default parser's #escape
  # is the routine it used to delegate to, so the escaping is unchanged.
  @uri = URI.parse(URI::DEFAULT_PARSER.escape(uri))
  raise URI::InvalidURIError, 'scheme or host missing' unless @uri.scheme && @uri.host

  @user_agent  = user_agent.downcase
  @robots_data = parse_robots_txt
end

Public Instance Methods

allowed()
# File lib/robotx.rb, line 19
# Lists the paths the current user agent is allowed to crawl.
#
# Returns ['/'] when the agent has no Disallow entries at all. Otherwise
# returns the agent's stored 'allow' list, or ['/'] when it has none.
# Only the group stored under @user_agent is consulted; there is no fallback
# to another group.
def allowed
  return ['/'] if disallowed.empty?

  agent_rules = @robots_data.fetch(@user_agent, {})
  agent_rules.fetch('allow', ['/'])
end
allowed?(data)
# File lib/robotx.rb, line 27
# Checks crawl permission for one URI or for a collection of URIs.
#
# data - a single URI, or an Array/Set of URIs.
#
# Returns the result of check_permission for a single URI. For an Array or
# Set, returns a Hash mapping each element to its check_permission result.
def allowed?(data)
  case data
  when Array, Set
    data.each_with_object({}) do |uri, verdicts|
      verdicts[uri] = check_permission(uri)
    end
  else
    check_permission(data)
  end
end
crawl_delay()
# File lib/robotx.rb, line 43
# Gives the crawl delay stored for the current user agent.
#
# Returns the agent's 'crawl-delay' value, or 0 when the agent has none or
# the stored value is negative.
def crawl_delay
  agent_rules = @robots_data.fetch(@user_agent, {})
  delay = agent_rules.fetch('crawl-delay', 0)
  delay > 0 ? delay : 0
end
disallowed()
# File lib/robotx.rb, line 23
# Lists the paths the current user agent is forbidden to crawl.
#
# Returns the agent's stored 'disallow' list, or [] when the agent has no
# group or no Disallow entries.
def disallowed
  agent_rules = @robots_data.fetch(@user_agent, {})
  agent_rules.fetch('disallow', [])
end
sitemap()
# File lib/robotx.rb, line 39
# Lists the sitemap entries collected from robots.txt.
#
# Returns the Array stored under the 'sitemap' key, or [] when there is none.
def sitemap
  @robots_data.fetch('sitemap') { [] }
end
user_agents()
# File lib/robotx.rb, line 47
# Lists the user agents that have a rule group in the parsed robots.txt.
#
# The parser stores each agent's rules as a Hash. The 'sitemap' entry and
# any directive it does not recognise are stored as Arrays under their own
# top-level keys. Filtering on Hash values keeps those directive names out
# of the result.
#
# Returns an Array of user-agent Strings.
def user_agents
  @robots_data.select { |name, rules| name != 'sitemap' && rules.is_a?(Hash) }.keys
end

Private Instance Methods

allow_regex()
# File lib/robotx.rb, line 118
# Builds, and caches in @allow_regex, a Regexp matching every Allow path of
# the current user agent.
#
# Each path is passed through regex_value and anchored with "^"; the
# alternatives are joined with "|".
#
# Returns the Regexp, or nil when the agent has no Allow entries.
def allow_regex
  allow_paths = @robots_data.fetch(@user_agent, {}).fetch('allow', [])
  return @allow_regex if @allow_regex

  if allow_paths.empty?
    @allow_regex = nil
  else
    alternatives = allow_paths.map { |path| "^#{regex_value(path)}" }
    @allow_regex = Regexp.compile(alternatives.join("|"))
  end
end
check_permission(uri)
# File lib/robotx.rb, line 123
# Decides whether +uri+ may be crawled by the current user agent.
#
# Only the path component of +uri+ is examined, after strip_slashes has been
# applied to it. The path is permitted when it does not match disallow_regex,
# or when it matches allow_regex.
#
# uri - String holding an absolute URL or a bare path.
#
# Returns true or false.
def check_permission(uri)
  # URI.encode no longer exists as of Ruby 3.0. The default parser's #escape
  # is the routine it used to delegate to, so the escaping is unchanged.
  uri = URI.parse(URI::DEFAULT_PARSER.escape(uri))

  # No parsed rules means nothing can be forbidden. The previous guard could
  # never fire with data present and raised NoMethodError on nil.
  return true if @robots_data.nil? || @robots_data.empty?

  uri_path  = strip_slashes(uri.path)
  forbidden = disallow_regex
  return true if forbidden.nil? || uri_path !~ forbidden

  # A matching Allow rule overrides the Disallow match.
  permitted = allow_regex
  permitted ? !!(uri_path =~ permitted) : false
end
disallow_regex()
# File lib/robotx.rb, line 113
# Builds, and caches in @disallow_regex, a Regexp matching every Disallow
# path of the current user agent.
#
# Each path is passed through regex_value and anchored with "^"; the
# alternatives are joined with "|".
#
# Returns the Regexp, or nil when the agent has no Disallow entries.
def disallow_regex
  disallow_paths = @robots_data.fetch(@user_agent, {}).fetch('disallow', [])
  return @disallow_regex if @disallow_regex

  if disallow_paths.empty?
    @disallow_regex = nil
  else
    alternatives = disallow_paths.map { |path| "^#{regex_value(path)}" }
    @disallow_regex = Regexp.compile(alternatives.join("|"))
  end
end
load_robots_txt()
# File lib/robotx.rb, line 53
# Fetches the site's robots.txt, sending the current user agent as the
# User-Agent header and giving up after Robotx::TIMEOUT.
#
# Returns the opened response when it is a 200 with content type text/plain.
# In every other case (any raised error, timeout, other status or content
# type) returns a StringIO holding an allow-everything robots.txt.
def load_robots_txt
  # The leading slash pins the lookup to the site root. A relative
  # 'robots.txt' would be resolved against the path of @uri instead.
  robots_uri = URI.join(@uri, '/robots.txt')

  Timeout.timeout(Robotx::TIMEOUT) do
    robots_txt_io = robots_uri.open('User-Agent' => @user_agent)
    plain_text = robots_txt_io.content_type.downcase == 'text/plain'
    # Only the status code is compared; the reason phrase may vary by server.
    return robots_txt_io if plain_text && robots_txt_io.status.first == '200'

    # Unusable response: release it before falling back.
    robots_txt_io.close
  end

  StringIO.new("User-agent: *\nAllow: /\n")
rescue StandardError
  # Best effort: an unreachable or broken robots.txt is treated as permissive.
  StringIO.new("User-agent: *\nAllow: /\n")
end
parse_robots_txt()
# File lib/robotx.rb, line 64
# Parses the robots.txt returned by load_robots_txt into a Hash.
#
# Layout of the result:
#   lower-cased user agent => { 'allow' => [paths], 'disallow' => [paths],
#                               'crawl-delay' => Integer }
#   'sitemap'              => [urls with trailing slashes removed]
#   any other directive    => [values], keyed by the name as written
#
# Rule lines that appear before any User-agent line are filed under '*'.
# Consecutive User-agent lines form one group and share the rules that
# follow them. Comments (whole-line or trailing) and lines without a colon
# are ignored.
#
# Returns the Hash, or {} if anything raises while reading or parsing.
def parse_robots_txt
  agents      = ['*']  # agents the upcoming rule lines apply to
  group_ended = true   # true once a rule line has closed the User-agent run
  robots_txt  = load_robots_txt

  {}.tap do |hash|
    robots_txt.each do |line|
      # Split on the first colon only, so values such as URLs keep theirs.
      key, value = line.sub(/#.*/, '').split(':', 2).map(&:strip)
      next if value.nil? || key.empty?

      case key.downcase
      when 'user-agent'
        name = value.downcase
        agents = [] if group_ended
        group_ended = false
        agents << name unless agents.include?(name)
        hash[name] ||= {}
      when 'allow'
        group_ended = true
        agents.each { |agent| ((hash[agent] ||= {})['allow'] ||= []) << strip_slashes(value) }
      when 'disallow'
        group_ended = true
        # Disallow: '' equals Allow: '/'
        rule, path = value.empty? ? ['allow', '/'] : ['disallow', strip_slashes(value)]
        agents.each { |agent| ((hash[agent] ||= {})[rule] ||= []) << path }
      when 'crawl-delay'
        group_ended = true
        agents.each { |agent| (hash[agent] ||= {})['crawl-delay'] = value.to_i }
      when 'sitemap'
        (hash['sitemap'] ||= []) << value.sub(/\/*$/, '')
      else
        (hash[key] ||= []) << strip_slashes(value)
      end
    end
  end
rescue StandardError
  # Best effort: an unreadable robots.txt yields no rules at all.
  {}
ensure
  robots_txt.close if robots_txt
end
regex_value(value='')
# File lib/robotx.rb, line 109
# Converts a robots.txt path into Regexp source text.
#
# The path first goes through strip_slashes. Every regex metacharacter is
# then escaped so it matches literally, except "*", which becomes ".*".
# Escaping everything keeps characters such as "." from matching arbitrary
# input and "(" or "[" from producing an invalid pattern.
#
# value - String path from an Allow/Disallow rule (default '').
#
# Returns a String suitable for Regexp.compile.
def regex_value(value='')
  Regexp.escape(strip_slashes(value)).gsub('\*', '.*')
end
strip_slashes(value='')
# File lib/robotx.rb, line 105
# Normalises the end of +value+ to a single "/".
#
# Despite the name, nothing is removed outright: the run of zero or more
# slashes at the first end-of-line position is replaced with one "/".
#
# value - String to normalise (default '').
#
# Returns a new String; +value+ itself is not modified.
def strip_slashes(value = '')
  trailing_slashes = /\/*$/
  value.sub(trailing_slashes, '/')
end