class Benchmark::HTTP::Spider

Public Class Methods

new(depth: nil, ignore: nil) click to toggle source
# File lib/benchmark/http/spider.rb, line 37
# Build a spider with optional crawl limits.
#
# @param depth [Integer, nil] default recursion budget handed to `fetch`;
#   `nil` leaves the depth check in `fetch` disabled.
# @param ignore [#match?, nil] pattern tested against each URL path in
#   `fetch`; matching URLs are skipped. `nil` skips nothing.
def initialize(depth: nil, ignore: nil)
        @depth, @ignore = depth, ignore
end

Public Instance Methods

call(urls, &block) click to toggle source
# File lib/benchmark/http/spider.rb, line 125
     def call(urls, &block)
        # Crawl every URL in `urls`, one after another, and return the
        # Statistics instance that was shared across all of them.
        #
        # Each URL is parsed into an endpoint (10 second timeout) and served
        # by its own client limited to 4 connections. The optional block is
        # forwarded untouched to `fetch`.
        #
        # NOTE(review): `.wait` is called on the value returned by `fetch`, so
        # `fetch` is presumably wrapped to return a waitable task - confirm.
        Statistics.new.tap do |statistics|
                urls.each do |url|
                        target = Async::HTTP::Endpoint.parse(url, timeout: 10)
                        client_options = {protocol: target.protocol, connection_limit: 4}

                        Async::HTTP::Client.open(target, **client_options) do |client|
                                fetch(statistics, client, target.url, &block).wait
                        end
                end
        end
end
fetch(statistics, client, url, depth = @depth, fetched = Set.new) { |method, url, response| ... } click to toggle source
# File lib/benchmark/http/spider.rb, line 74
      def fetch(statistics, client, url, depth = @depth, fetched = Set.new, &block)
        # Visit `url` and, when it serves HTML, every link extracted from it.
        #
        # statistics - object whose `measure` wraps each GET request.
        # client     - HTTP client responding to `head` and `get`.
        # url        - URI to visit; its `path`, `host` and `request_uri` are used.
        # depth      - remaining recursion budget; `nil` disables the check.
        # fetched    - set of URLs already visited, shared by the recursion.
        #
        # Yields ("HEAD", url, response) after the HEAD request and
        # ("GET", url, response) after the GET request, when a block is given.
        #
        # Timeouts and any other StandardError are logged, not raised.
        #
        # NOTE(review): the recursive calls are sent `.wait`, so this method is
        # presumably wrapped to return a waitable task - confirm against the
        # full file.
        if depth&.zero?
                Async.logger.warn(self) {"Exceeded depth while trying to visit #{url}!"}
                return
        elsif fetched.include?(url)
                return
        elsif @ignore&.match?(url.path)
                return
        end

        # Record the visit before any request so cyclic links terminate.
        fetched << url

        request_uri = url.request_uri

        # HEAD first: redirects and non-HTML resources never need a GET.
        response = client.head(request_uri).tap(&:read)

        yield("HEAD", url, response) if block_given?

        if response.redirection?
                target = response.headers['location']

                # Without a location there is nothing to resolve; `url + nil`
                # would raise and only surface as an opaque logged error.
                unless target
                        Async.logger.warn(self) {"Redirect from #{url} is missing a location header!"}
                        return
                end

                location = url + target

                # Only redirects that stay on the same host are followed.
                if location.host == url.host
                        Async.logger.debug(self) {"Following redirect to #{location}..."}
                        fetch(statistics, client, location, depth&.-(1), fetched, &block).wait
                else
                        Async.logger.debug(self) {"Ignoring redirect to #{location}."}
                end

                return
        end

        content_type = response.headers['content-type']
        return unless content_type&.start_with?('text/html')

        response = statistics.measure do
                client.get(request_uri)
        end

        yield("GET", url, response) if block_given?

        # Start a fetch for every link, then wait for all of them.
        extract_links(url, response) do |href|
                fetch(statistics, client, href, depth&.-(1), fetched, &block)
        end.each(&:wait)
rescue Async::TimeoutError
        Async.logger.error(self) {"Timeout while fetching #{url}"}
rescue StandardError => error
        Async.logger.error(self) {error}
end