class SiteMapper::Robots::ParsedRobots
Parses robots.txt
Public Class Methods
new(body, user_agent)
Initializes ParsedRobots
# File lib/site_mapper/robots.rb, line 10
def initialize(body, user_agent)
  @other     = {}
  @disallows = {}
  @allows    = {}
  @delays    = {}
  @sitemaps  = []
  parse(body)
end
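A minimal usage sketch, assuming the site_mapper gem is available and the robots.txt body has already been fetched (the body below is made up for illustration). Note that all parsing happens in the constructor; the user_agent argument is accepted here, but rule selection is driven by the user_agent passed to the individual query methods.

  require 'site_mapper'

  body = <<~ROBOTS
    User-agent: *
    Disallow: /private
    Crawl-delay: 2
    Sitemap: http://example.com/sitemap.xml
  ROBOTS

  robots = SiteMapper::Robots::ParsedRobots.new(body, 'SiteMapper')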
Public Instance Methods
allowed?(uri, user_agent)
@param [URI] uri to be checked
@param [String] user_agent to be checked
@return [Boolean] true if uri is allowed to be crawled
@example Check if www.google.com/googlesites is allowed to be crawled
  uri = URI.parse('http://www.google.com/googlesites')
  robots.allowed?(uri, 'SiteMapper') # => false (as of 2014-10-22)
# File lib/site_mapper/robots.rb, line 59
def allowed?(uri, user_agent)
  return true unless @parsed
  allowed = true
  path = uri.request_uri
  # compare case-insensitively without mutating the caller's string
  user_agent = user_agent.downcase
  @disallows.each do |key, value|
    if user_agent =~ key
      value.each do |rule|
        allowed = false if path =~ rule
      end
    end
  end
  @allows.each do |key, value|
    unless allowed
      if user_agent =~ key
        value.each do |rule|
          allowed = true if path =~ rule
        end
      end
    end
  end
  allowed
end
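Continuing the sketch above: /private is disallowed for every agent, and any path not matched by a Disallow rule stays allowed (Allow rules are only consulted to override an earlier Disallow match):

  require 'uri'

  robots.allowed?(URI.parse('http://example.com/private/page'), 'SiteMapper') # => false
  robots.allowed?(URI.parse('http://example.com/public'), 'SiteMapper')       # => true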
crawl_delay(user_agent)
@param [String] user_agent
@return [Integer] crawl delay for user_agent, or nil if no delay was declared for it
# File lib/site_mapper/robots.rb, line 92
def crawl_delay(user_agent)
  agent = user_agent.dup
  agent = to_regex(agent.downcase) if user_agent.is_a?(String)
  @delays[agent]
end
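One detail worth noting: the delay is stored under the regex compiled from the exact user-agent token that appeared in robots.txt, so the lookup only succeeds when called with that same token. Continuing the sketch above (which declared Crawl-delay: 2 under User-agent: *):

  robots.crawl_delay('*')          # => 2
  robots.crawl_delay('SiteMapper') # => nil, no literal 'sitemapper' group in the file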
other_values()
Return key/value pairs with unknown meaning.
@return [Hash] key/value pairs from robots.txt
# File lib/site_mapper/robots.rb, line 100
def other_values
  @other
end
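A hypothetical illustration: directives such as Host, which the parser's case statement does not recognize, end up here keyed by their downcased name:

  robots = SiteMapper::Robots::ParsedRobots.new(
    "User-agent: *\nHost: example.com\n", 'SiteMapper'
  )
  robots.other_values # => {"host"=>["example.com"]}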
parse(body)
Parse robots.txt body.
@param [String] body the robots.txt body (plain text, not HTML)
# File lib/site_mapper/robots.rb, line 21
def parse(body)
  agent = /.*/
  body  = body || "User-agent: *\nAllow: /\n"
  body  = body.downcase
  body.each_line do |line|
    next if line =~ /^\s*(#.*|$)/
    arr   = line.split(':')
    key   = arr.shift
    value = arr.join(':').strip
    case key
    when 'user-agent'
      agent = to_regex(value)
    when 'allow'
      @allows[agent] ||= []
      @allows[agent] << to_regex(value)
    when 'disallow'
      @disallows[agent] ||= []
      @disallows[agent] << to_regex(value)
    when 'crawl-delay'
      @delays[agent] = value.to_i
    when 'sitemap'
      @sitemaps << value
    else
      @other[key] ||= []
      @other[key] << value
    end
  end
  @parsed = true
end
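As the first lines of the method show, a nil body falls back to "User-agent: *\nAllow: /\n", so a site without a robots.txt permits everything. A quick sketch:

  robots = SiteMapper::Robots::ParsedRobots.new(nil, 'SiteMapper')
  robots.allowed?(URI.parse('http://example.com/anything'), 'SiteMapper') # => true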
sitemaps()
@return [Array] sitemap URLs defined in robots.txt
# File lib/site_mapper/robots.rb, line 105
def sitemaps
  @sitemaps
end
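Continuing the sketch above, the single Sitemap directive surfaces here (note that the URL is downcased along with the rest of the body during parsing):

  robots.sitemaps # => ["http://example.com/sitemap.xml"]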
Protected Instance Methods
to_regex(pattern)
@param [String] pattern to compile to a Regexp
@return [Regexp] regex compiled from pattern
# File lib/site_mapper/robots.rb, line 113
def to_regex(pattern)
  return /should-not-match-anything-123456789/ if pattern.strip.empty?
  pattern = Regexp.escape(pattern)
  pattern.gsub!(Regexp.escape('*'), '.*')
  Regexp.compile("^#{pattern}")
end
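The method is protected, but its translation rules are easy to see with send (for illustration only): '*' becomes '.*', everything else is escaped literally, the result is anchored at the start of the path, and an empty pattern (e.g. a bare "Disallow:" line) compiles to a sentinel that matches nothing:

  robots.send(:to_regex, '/tmp/*') # => /^\/tmp\/.*/
  robots.send(:to_regex, '')       # => /should-not-match-anything-123456789/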