class Robotx
Constants
- TIMEOUT — time limit in seconds passed to Timeout::timeout when fetching robots.txt
Public Class Methods
new(uri, user_agent='*')
click to toggle source
# File lib/robotx.rb, line 11
# Builds a Robotx instance for the given site URI and fetches/parses its
# robots.txt (via #parse_robots_txt).
#
# uri        - String URL of the site; must carry a scheme and a host.
# user_agent - agent name the rules are looked up for (default '*'); downcased.
#
# Raises URI::InvalidURIError when the URI has no scheme or host.
def initialize(uri, user_agent='*')
  # URI.encode was removed in Ruby 3.0; URI::DEFAULT_PARSER.escape is the
  # drop-in replacement for percent-escaping the raw string before parsing.
  @uri = URI.parse(URI::DEFAULT_PARSER.escape(uri))
  raise URI::InvalidURIError, 'scheme or host missing' unless @uri.scheme && @uri.host
  @user_agent  = user_agent.downcase
  @robots_data = parse_robots_txt
end
Public Instance Methods
allowed()
click to toggle source
# File lib/robotx.rb, line 19
# List of allowed paths for the active user agent.
# When nothing is disallowed, everything is allowed — expressed as ['/'].
# Otherwise the agent's explicit 'allow' rules are returned (['/'] fallback).
def allowed
  return ['/'] if disallowed.empty?
  @robots_data.fetch(@user_agent, {}).fetch('allow', ['/'])
end
allowed?(data)
click to toggle source
# File lib/robotx.rb, line 27
# Checks whether crawling is permitted.
#
# data - a single URI String, or an Array/Set of them.
#
# Returns a boolean for a single value, or a Hash mapping each given URI to
# its boolean permission when a collection is passed.
def allowed?(data)
  if data.is_a?(Array) || data.is_a?(Set)
    permissions = {}
    data.each { |uri| permissions[uri] = check_permission(uri) }
    return permissions
  end
  check_permission(data)
end
crawl_delay()
click to toggle source
# File lib/robotx.rb, line 43
# Crawl-delay declared for the active user agent, floored at zero.
# Returns 0 when no (or a negative) delay was declared.
def crawl_delay
  delay = @robots_data.fetch(@user_agent, {}).fetch('crawl-delay', 0)
  delay.negative? ? 0 : delay
end
disallowed()
click to toggle source
# File lib/robotx.rb, line 23
# Disallow rules recorded for the active user agent ([] when none).
def disallowed
  agent_rules = @robots_data.fetch(@user_agent, {})
  agent_rules.fetch('disallow', [])
end
sitemap()
click to toggle source
# File lib/robotx.rb, line 39
# Sitemap URLs collected from robots.txt; empty list when none were present.
def sitemap
  @robots_data.fetch('sitemap') { [] }
end
user_agents()
click to toggle source
# File lib/robotx.rb, line 47
# Every user agent mentioned in robots.txt. The 'sitemap' key lives in the
# same hash but is not an agent, so it is filtered out.
def user_agents
  @robots_data.keys.reject { |name| name == 'sitemap' }
end
Private Instance Methods
allow_regex()
click to toggle source
# File lib/robotx.rb, line 118
# Memoized alternation regex built from every 'allow' rule of the active
# user agent; nil when that agent has no allow rules.
def allow_regex
  rules = @robots_data.fetch(@user_agent, {}).fetch('allow', [])
  @allow_regex ||= (rules.empty? ? nil : Regexp.compile(rules.map { |rule| "^#{regex_value(rule)}" }.join('|')))
end
check_permission(uri)
click to toggle source
# File lib/robotx.rb, line 123 def check_permission(uri) uri = URI.parse(URI.encode(uri)) return true unless (@robots_data or @robots_data.any?) or (uri.scheme and uri.host) uri_path = strip_slashes(uri.path) return (!!!(uri_path =~ disallow_regex) or !!(uri_path =~ allow_regex)) end
disallow_regex()
click to toggle source
# File lib/robotx.rb, line 113
# Memoized alternation regex built from every 'disallow' rule of the active
# user agent; nil when that agent has no disallow rules.
def disallow_regex
  rules = @robots_data.fetch(@user_agent, {}).fetch('disallow', [])
  @disallow_regex ||= (rules.empty? ? nil : Regexp.compile(rules.map { |rule| "^#{regex_value(rule)}" }.join('|')))
end
load_robots_txt()
click to toggle source
# File lib/robotx.rb, line 53 def load_robots_txt Timeout::timeout(Robotx::TIMEOUT) do if robots_txt_io = URI.join(@uri, 'robots.txt').open('User-Agent' => @user_agent) and robots_txt_io.content_type.downcase == 'text/plain' and robots_txt_io.status == ['200', 'OK'] return robots_txt_io end raise OpenURI::HTTPError end rescue return StringIO.new("User-agent: *\nAllow: /\n") end
parse_robots_txt()
click to toggle source
# File lib/robotx.rb, line 64
# Parses the fetched robots.txt line by line into a Hash of the shape:
#   { 'agent' => { 'allow' => [...], 'disallow' => [...], 'crawl-delay' => n },
#     'sitemap' => [...] }
# Rules are attributed to the most recently seen User-agent line ('*' until
# one appears). Returns {} if anything goes wrong while reading or parsing.
def parse_robots_txt
  agent = '*'
  {}.tap do |hash|
    load_robots_txt.each do |line|
      next if line =~ /^\s*(#.*|$)/ # skip comments and blank lines
      # Split on the FIRST colon only, so values that contain colons
      # themselves (e.g. "Sitemap: http://example.com/map.xml") stay intact.
      # The previous split(/:/) + join dropped every colon in the value,
      # mangling sitemap URLs into "http//...".
      key, value = line.split(':', 2).map(&:strip)
      value ||= ''
      case key.downcase
      when 'user-agent'
        agent = value.downcase
        hash[agent] ||= {}
      when 'allow'
        hash[agent]['allow'] ||= []
        hash[agent]['allow'] << strip_slashes(value)
      when 'disallow'
        # An empty "Disallow:" is equivalent to "Allow: /".
        if value.empty?
          hash[agent]['allow'] ||= []
          hash[agent]['allow'] << '/'
        else
          hash[agent]['disallow'] ||= []
          hash[agent]['disallow'] << strip_slashes(value)
        end
      when 'crawl-delay'
        hash[agent]['crawl-delay'] = value.to_i
      when 'sitemap'
        # Sitemaps are stored site-wide (not per agent), trailing slashes removed.
        hash['sitemap'] ||= []
        hash['sitemap'] << strip_slashes(value).sub(/\/*$/, '')
      else
        # Unknown directives are collected verbatim under their original key.
        hash[key] ||= []
        hash[key] << strip_slashes(value)
      end
    end
  end
rescue
  # Any IO/parse failure yields "no rules".
  {}
end
regex_value(value='')
click to toggle source
# File lib/robotx.rb, line 109
# Converts one robots.txt path rule into a regex fragment: the rule text is
# escaped literally and the robots.txt '*' wildcard becomes '.*'.
#
# Escaping everything via Regexp.escape fixes a crash in the previous
# version, which only handled '*' and '?' — a rule containing other regex
# metacharacters (e.g. "Disallow: /foo(bar") made Regexp.compile raise
# RegexpError in allow_regex/disallow_regex.
def regex_value(value='')
  Regexp.escape(strip_slashes(value)).gsub('\*', '.*')
end
strip_slashes(value='')
click to toggle source
# File lib/robotx.rb, line 105
# Normalizes a path/value to end in exactly one '/': any run of trailing
# slashes (including none at all) collapses to a single trailing slash.
def strip_slashes(path='')
  path.sub(/\/*$/, '/')
end