class SiteMapper::Robots::ParsedRobots

Parses a robots.txt body and answers queries about crawl permissions, crawl delays, and sitemaps.
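
A minimal usage sketch for illustration; fetching robots.txt with Ruby's standard Net::HTTP and the require path are assumptions here, and SiteMapper may wire this up differently:

require 'net/http'
require 'site_mapper' # assumed require path for the gem

# Fetch a robots.txt body (Net::HTTP used purely for illustration).
body   = Net::HTTP.get(URI('http://www.google.com/robots.txt'))
robots = SiteMapper::Robots::ParsedRobots.new(body, 'SiteMapper')
robots.allowed?(URI('http://www.google.com/search'), 'SiteMapper')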

Public Class Methods

new(body, user_agent)

Initializes ParsedRobots and parses the given robots.txt body.
@param [String] body the robots.txt body
@param [String] user_agent the crawler's user agent

# File lib/site_mapper/robots.rb, line 10
def initialize(body, user_agent)
  @other     = {} # directives with unknown meaning
  @disallows = {} # user-agent regex => array of disallowed path regexes
  @allows    = {} # user-agent regex => array of allowed path regexes
  @delays    = {} # user-agent regex => crawl delay in seconds
  @sitemaps  = [] # sitemap URLs
  parse(body)
end

Public Instance Methods

allowed?(uri, user_agent)

@param [URI] uri to be checked
@param [String] user_agent to be checked
@return [Boolean] true if uri is allowed to be crawled
@example Check if www.google.com/googlesites is allowed to be crawled
  uri = URI.parse('http://www.google.com/googlesites')
  robots.allowed?(uri, 'SiteMapper')
  # => false (as of 2014-10-22)
# File lib/site_mapper/robots.rb, line 59
def allowed?(uri, user_agent)
  return true unless @parsed
  allowed    = true
  path       = uri.request_uri
  user_agent = user_agent.downcase # non-mutating; avoids changing the caller's string

  # A matching Disallow rule forbids the path...
  @disallows.each do |key, value|
    if user_agent =~ key
      value.each do |rule|
        allowed = false if path =~ rule
      end
    end
  end

  # ...unless a matching Allow rule re-permits it.
  @allows.each do |key, value|
    next if allowed
    if user_agent =~ key
      value.each do |rule|
        allowed = true if path =~ rule
      end
    end
  end
  allowed
end
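
As the method body shows, a matching Allow rule re-permits a path that a Disallow rule matched. A sketch with a hypothetical robots.txt body:

body = "User-agent: *\nDisallow: /private/\nAllow: /private/public-page\n"
robots = SiteMapper::Robots::ParsedRobots.new(body, 'SiteMapper')
robots.allowed?(URI('http://example.com/private/secret'), 'SiteMapper')
# => false
robots.allowed?(URI('http://example.com/private/public-page'), 'SiteMapper')
# => true
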
crawl_delay(user_agent)

@param [String] user_agent user agent to look up
@return [Integer, nil] crawl delay for user_agent, or nil if none is defined

# File lib/site_mapper/robots.rb, line 92
def crawl_delay(user_agent)
  agent = user_agent.dup
  # String agents are compiled to the same regex form used as hash keys.
  agent = to_regex(agent.downcase) if user_agent.is_a?(String)
  @delays[agent]
end
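
Because @delays is keyed by the regex compiled from each User-agent line, the lookup finds a delay only when the downcased user agent string equals a User-agent value from the file. A sketch with a hypothetical body:

body = "User-agent: SiteMapper\nCrawl-delay: 10\n"
robots = SiteMapper::Robots::ParsedRobots.new(body, 'SiteMapper')
robots.crawl_delay('SiteMapper') # => 10
robots.crawl_delay('OtherBot')   # => nil
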
other_values()

Returns key/value pairs with unknown meaning.
@return [Hash] key/value pairs from robots.txt

# File lib/site_mapper/robots.rb, line 100
def other_values
  @other
end
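
Directives that parse does not recognize end up here, keyed by their downcased directive name. A sketch with a hypothetical Host directive:

body = "User-agent: *\nHost: example.com\n"
robots = SiteMapper::Robots::ParsedRobots.new(body, 'SiteMapper')
robots.other_values # => { 'host' => ['example.com'] }
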
parse(body)

Parses a robots.txt body.
@param [String] body the robots.txt body

# File lib/site_mapper/robots.rb, line 21
def parse(body)
  agent = /.*/ # applies until the first User-agent line
  body  = body || "User-agent: *\nAllow: /\n" # nil body => allow everything
  body  = body.downcase
  body.each_line do |line|
    next if line =~ /^\s*(#.*|$)/ # skip comments and blank lines
    arr   = line.split(':')
    key   = arr.shift
    value = arr.join(':').strip # rejoin values that contain ':' (e.g. URLs)
    case key
    when 'user-agent'
      agent = to_regex(value)
    when 'allow'
      @allows[agent] ||= []
      @allows[agent] << to_regex(value)
    when 'disallow'
      @disallows[agent] ||= []
      @disallows[agent] << to_regex(value)
    when 'crawl-delay'
      @delays[agent] = value.to_i
    when 'sitemap'
      @sitemaps << value
    else
      @other[key] ||= []
      @other[key] << value
    end
  end
  @parsed = true
end
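
Note the nil-body fallback: when no robots.txt body is given, parse substitutes a permissive default that allows everything. A sketch:

robots = SiteMapper::Robots::ParsedRobots.new(nil, 'SiteMapper')
robots.allowed?(URI('http://example.com/anything'), 'SiteMapper')
# => true
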
sitemaps()

@return [Array] sitemaps defined in robots.txt

# File lib/site_mapper/robots.rb, line 105
def sitemaps
  @sitemaps
end
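
Since parse downcases the entire body before splitting, sitemap URLs are returned lowercased. A sketch:

body = "Sitemap: http://example.com/sitemap.xml\n"
robots = SiteMapper::Robots::ParsedRobots.new(body, 'SiteMapper')
robots.sitemaps # => ['http://example.com/sitemap.xml']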

Protected Instance Methods

to_regex(pattern)

@param [String] pattern to compile to a Regexp
@return [Regexp] regex compiled from pattern

# File lib/site_mapper/robots.rb, line 113
def to_regex(pattern)
  # An empty pattern must not match any path or user agent.
  return /should-not-match-anything-123456789/ if pattern.strip.empty?
  pattern = Regexp.escape(pattern)
  pattern.gsub!(Regexp.escape('*'), '.*') # restore '*' wildcards as '.*'
  Regexp.compile("^#{pattern}")           # anchor to the start of the string
end
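
For illustration, how a few patterns compile; to_regex is protected, so send is used here purely to demonstrate (robots is a ParsedRobots instance):

robots.send(:to_regex, '/search')    # => /^\/search/
robots.send(:to_regex, '/private/*') # => /^\/private\/.*/
robots.send(:to_regex, '')           # => /should-not-match-anything-123456789/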