class HTMLProofer::UrlValidator::External
Attributes
before_request[W]
external_urls[R]
Public Class Methods
new(runner, external_urls)
click to toggle source
Calls superclass method
HTMLProofer::UrlValidator::new
# File lib/html_proofer/url_validator/external.rb, line 16
# Sets up the external-link validator.
#
# runner        - the proofer runner whose options drive the checks
# external_urls - mapping of external URL string => metadata entries
def initialize(runner, external_urls)
  super(runner)

  @external_urls = external_urls
  # Hooks invoked with each Typhoeus::Request before it is queued.
  @before_request = []
  # Remembers query-string signatures already seen per domain/path.
  @paths_with_queries = {}
  # Hydra runs the queued requests concurrently; configured via options.
  @hydra = Typhoeus::Hydra.new(@runner.options[:hydra])
end
Public Instance Methods
add_failure(metadata, description, status = nil)
click to toggle source
# File lib/html_proofer/url_validator/external.rb, line 193
# Records one Failure per metadata entry (filename + line), or a single
# anonymous Failure when no metadata is available.
def add_failure(metadata, description, status = nil)
  if blank?(metadata)
    # Possible if we're checking a bare array of links with no file context.
    @failed_checks << Failure.new("", "Links > External", description, status: status)
  else
    metadata.each do |entry|
      @failed_checks << Failure.new(
        entry[:filename],
        "Links > External",
        description,
        line: entry[:line],
        status: status,
      )
    end
  end
end
check_hash_in_2xx_response(href, url, response, filenames)
click to toggle source
Even though the response was a success, we may have been asked to check if the hash on the URL exists on the page
# File lib/html_proofer/url_validator/external.rb, line 118 def check_hash_in_2xx_response(href, url, response, filenames) return false if @runner.options[:only_4xx] return false unless @runner.options[:check_external_hash] return false unless url.hash? hash = url.hash headers = response.options.fetch(:headers, {}) content_type = headers.find { |k, _| k.casecmp("content-type").zero? } # attempt to verify PDF hash ref; see #787 for more details # FIXME: this is re-reading the PDF response if content_type && content_type[1].include?("pdf") io = URI.parse(url.to_s).open reader = PDF::Reader.new(io) pages = reader.pages if hash =~ /\Apage=(\d+)\z/ page = Regexp.last_match[1].to_i unless pages[page - 1] msg = "External link #{href} failed: #{url.without_hash} exists, but the hash '#{hash}' does not" add_failure(filenames, msg, response.code) @cache.add_external(href, filenames, response.code, msg, false) end return true end end body_doc = create_nokogiri(response.body) unencoded_hash = Addressable::URI.unescape(hash) xpath = [%(//*[@name="#{hash}"]|/*[@name="#{unencoded_hash}"]|//*[@id="#{hash}"]|//*[@id="#{unencoded_hash}"])] # user-content is a special addition by GitHub. if url.host =~ /github\.com/i xpath << [%(//*[@name="user-content-#{hash}"]|//*[@id="user-content-#{hash}"])] # when linking to a file on GitHub, like #L12-L34, only the first "L" portion # will be identified as a linkable portion xpath << [%(//td[@id="#{Regexp.last_match[1]}"])] if hash =~ /\A(L\d)+/ end return unless body_doc.xpath(xpath.join("|")).empty? msg = "External link #{href} failed: #{url.without_hash} exists, but the hash '#{hash}' does not" add_failure(filenames, msg, response.code) @cache.add_external(href, filenames, response.code, msg, false) true end
handle_connection_failure(href, metadata, response_code, status_message)
click to toggle source
# File lib/html_proofer/url_validator/external.rb, line 175
# Records a libcurl-level failure (no HTTP response at all): caches it with
# code 0, then reports it unless only_4xx suppresses non-4xx failures.
def handle_connection_failure(href, metadata, response_code, status_message)
  explanation = <<~MSG
    External link #{href} failed with something very wrong.
    It's possible libcurl couldn't connect to the server, or perhaps the request timed out.
    Sometimes, making too many requests at once also breaks things.
  MSG

  messages = [explanation]
  messages << "Either way, the return message from the server is: #{status_message}" unless blank?(status_message)
  msg = messages.join("\n").chomp

  @cache.add_external(href, metadata, 0, msg, false)

  return if @runner.options[:only_4xx]

  add_failure(metadata, msg, response_code)
end
handle_timeout(href, filenames, response_code)
click to toggle source
# File lib/html_proofer/url_validator/external.rb, line 167
# Records a request timeout: caches it (stored with code 0), then reports it
# unless only_4xx suppresses non-4xx failures.
def handle_timeout(href, filenames, response_code)
  message = "External link #{href} failed: got a time out (response code #{response_code})"
  @cache.add_external(href, filenames, 0, message, false)

  add_failure(filenames, message, response_code) unless @runner.options[:only_4xx]
end
queue_request(method, url, filenames)
click to toggle source
# File lib/html_proofer/url_validator/external.rb, line 72
# Builds a Typhoeus request for the URL and queues it on the hydra, wiring
# the completion handler that classifies the response.
def queue_request(method, url, filenames)
  request_options = @runner.options[:typhoeus].merge(method: method)
  request = Typhoeus::Request.new(url.url, request_options)

  # Let registered before_request hooks inspect/mutate the request first.
  @before_request.each { |hook| hook.call(request) }

  request.on_complete { |response| response_handler(response, url, filenames) }
  @hydra.queue(request)
end
response_handler(response, url, filenames)
click to toggle source
# File lib/html_proofer/url_validator/external.rb, line 82 def response_handler(response, url, filenames) method = response.request.options[:method] href = response.request.base_url.to_s response_code = response.code response.body.delete!("\x00") @logger.log(:debug, "Received a #{response_code} for #{href}") return if @runner.options[:ignore_status_codes].include?(response_code) if response_code.between?(200, 299) @cache.add_external(href, filenames, response_code, "OK", true) unless check_hash_in_2xx_response( href, url, response, filenames, ) elsif response.timed_out? handle_timeout(href, filenames, response_code) elsif response_code.zero? handle_connection_failure(href, filenames, response_code, response.status_message) elsif method == :head # some servers don't support HEAD queue_request(:get, url, filenames) else return if @runner.options[:only_4xx] && !response_code.between?(400, 499) # Received a non-successful http response. status_message = blank?(response.status_message) ? "" : ": #{response.status_message}" msg = "External link #{href} failed#{status_message}" add_failure(filenames, msg, response_code) @cache.add_external(href, filenames, response_code, msg, false) end end
run_external_link_checker(external_urls)
click to toggle source
Proofer runs faster if we pull out all the external URLs and run the checks at the end. Otherwise, we'd be halting the consuming process for every file during `process_files`.
In addition, sorting the list lets libcurl keep connections to the same hosts alive.
Finally, we'll first make a HEAD request, rather than GETing all the contents. If the HEAD fails, we'll fall back to GET, as some servers are not configured for HEAD. If we've decided to check for hashes, we must do a GET, since HEAD is not available as an option in that case.
# File lib/html_proofer/url_validator/external.rb, line 46
# Queues one request per unique external URL (invalid URLs fail immediately,
# repeated query signatures are skipped), then runs them all concurrently.
def run_external_link_checker(external_urls)
  # Route log from Typhoeus/Ethon to our own logger.
  Ethon.logger = @logger

  external_urls.each_pair do |raw_url, metadata|
    url = Attribute::Url.new(@runner, raw_url, base_url: nil)

    unless url.valid?
      add_failure(metadata, "#{url} is an invalid URL", 0)
      next
    end

    # Skip URLs whose domain/path + query signature we've already queued.
    next unless new_url_query_values?(url)

    # Hash checks need the response body, so force a GET; otherwise HEAD is cheaper.
    request_method = @runner.options[:check_external_hash] && url.hash? ? :get : :head
    queue_request(request_method, url, metadata)
  end

  @hydra.run
end
validate()
click to toggle source
# File lib/html_proofer/url_validator/external.rb, line 26
# Entry point: checks all external URLs (from the cache when enabled, else
# the ones collected at construction) and returns the accumulated failures.
def validate
  urls_to_check =
    if @cache.external_enabled?
      @runner.load_external_cache
    else
      @external_urls
    end

  @logger.log(:info, "Checking #{pluralize(urls_to_check.count, "external link", "external links")}")

  run_external_link_checker(urls_to_check)
  @failed_checks
end
Private Instance Methods
new_url_query_values?(url)
click to toggle source
remember queries we’ve seen, ignore future ones
# File lib/html_proofer/url_validator/external.rb, line 204
# Remembers query-string signatures we've seen per domain/path; returns true
# when this URL's combination is new (and records it), false on a repeat.
def new_url_query_values?(url)
  query_values = url.query_values
  return true if query_values.nil?

  signature = query_values.keys.join("-")
  seen = @paths_with_queries[url.domain_path]

  if seen.nil?
    @paths_with_queries[url.domain_path] = [signature]
    true
  elsif seen.include?(signature)
    false
  else
    seen << signature
    true
  end
end