class HTMLProofer::UrlValidator::External

Attributes

before_request[W]
external_urls[R]

Public Class Methods

new(runner, external_urls) click to toggle source
Calls superclass method HTMLProofer::UrlValidator::new
# File lib/html_proofer/url_validator/external.rb, line 16
# Sets up a validator for a collection of external URLs.
#
# runner        - handed to the superclass initializer; @runner.options[:hydra]
#                 is read afterwards (presumably @runner is assigned by the
#                 superclass; confirm there).
# external_urls - kept as-is in @external_urls.
def initialize(runner, external_urls)
  super(runner)

  # Callbacks invoked with each request before it is queued (see queue_request).
  @before_request = []
  # Query-key signatures already seen, keyed by domain path.
  @paths_with_queries = {}

  @external_urls = external_urls
  @hydra = Typhoeus::Hydra.new(@runner.options[:hydra])
end

Public Instance Methods

add_failure(metadata, description, status = nil) click to toggle source
# File lib/html_proofer/url_validator/external.rb, line 193
# Records one "Links > External" failure per metadata entry.
#
# metadata    - enumerable of hashes with :filename and :line keys, or a blank
#               value (possible when checking a bare array of links).
# description - failure message.
# status      - optional status code stored on each Failure.
#
# With blank metadata a single Failure with an empty filename is recorded.
def add_failure(metadata, description, status = nil)
  check_name = "Links > External"

  # possible if we're checking an array of links
  return @failed_checks << Failure.new("", check_name, description, status: status) if blank?(metadata)

  metadata.each do |entry|
    failure = Failure.new(entry[:filename], check_name, description, line: entry[:line], status: status)
    @failed_checks << failure
  end
end
check_hash_in_2xx_response(href, url, response, filenames) click to toggle source

Even though the response was a success, we may have been asked to check whether the hash on the URL exists on the page.

# File lib/html_proofer/url_validator/external.rb, line 118
# Even though the response was a success, we may have been asked to check
# whether the hash (fragment) on the URL exists on the page.
#
# href      - URL string used in failure messages and as the cache key.
# url       - URL object; hash?, hash, host, without_hash and to_s are called on it.
# response  - response object; options[:headers], body and code are read.
# filenames - metadata forwarded to add_failure and @cache.add_external.
#
# Returns true when a "hash does not exist" failure was recorded (the caller
# must then not cache the URL as OK), and false in every other case: the check
# is disabled, the URL has no hash, or the hash was found.
def check_hash_in_2xx_response(href, url, response, filenames)
  return false if @runner.options[:only_4xx]
  return false unless @runner.options[:check_external_hash]
  return false unless url.hash?

  hash = url.hash
  headers = response.options.fetch(:headers, {})
  content_type = headers.find { |k, _| k.casecmp("content-type").zero? }

  # attempt to verify PDF hash ref; see #787 for more details
  # Only "page=N" hashes are understood for PDFs, so the document is fetched
  # only when the hash has that shape; anything else falls through to the
  # markup lookup below.
  pdf_page = hash[/\Apage=(\d+)\z/, 1] if content_type && content_type[1].include?("pdf")

  hash_found =
    if pdf_page
      page = pdf_page.to_i
      # FIXME: this is re-reading the PDF response
      # The block form of open closes the downloaded IO once we are done.
      URI.parse(url.to_s).open do |io|
        # page numbers are 1-based; guard against "page=0", which would
        # otherwise index pages[-1] (the last page).
        page.positive? && !PDF::Reader.new(io).pages[page - 1].nil?
      end
    else
      unencoded_hash = Addressable::URI.unescape(hash)
      # NOTE(review): the hash is interpolated verbatim, so a hash containing
      # a double quote yields an invalid XPath expression.
      xpath = [
        %(//*[@name="#{hash}"]),
        %(//*[@name="#{unencoded_hash}"]),
        %(//*[@id="#{hash}"]),
        %(//*[@id="#{unencoded_hash}"]),
      ]
      # user-content is a special addition by GitHub.
      if url.host =~ /github\.com/i
        xpath << %(//*[@name="user-content-#{hash}"])
        xpath << %(//*[@id="user-content-#{hash}"])
        # when linking to a file on GitHub, like #L12-L34, only the first "L" portion
        # will be identified as a linkable portion
        line_anchor = hash[/\A(L\d+)/, 1]
        xpath << %(//td[@id="#{line_anchor}"]) if line_anchor
      end

      !create_nokogiri(response.body).xpath(xpath.join("|")).empty?
    end

  return false if hash_found

  msg = "External link #{href} failed: #{url.without_hash} exists, but the hash '#{hash}' does not"
  add_failure(filenames, msg, response.code)
  @cache.add_external(href, filenames, response.code, msg, false)
  true
end
handle_connection_failure(href, metadata, response_code, status_message) click to toggle source
# File lib/html_proofer/url_validator/external.rb, line 175
      # Reports a request that failed without a usable HTTP response.
      #
      # The failure is always written to the cache with status 0; it is added
      # to the failed checks only when the :only_4xx option is not set.
      #
      # href           - URL string that was requested.
      # metadata       - forwarded to @cache.add_external and add_failure.
      # response_code  - code passed on to add_failure.
      # status_message - optional detail appended to the message when not blank.
      def handle_connection_failure(href, metadata, response_code, status_message)
        report = <<~MSG
          External link #{href} failed with something very wrong.
          It's possible libcurl couldn't connect to the server, or perhaps the request timed out.
          Sometimes, making too many requests at once also breaks things.
        MSG

        # The heredoc ends in a newline, so this leaves a blank line before the detail.
        report = "#{report}\nEither way, the return message from the server is: #{status_message}" unless blank?(status_message)
        report = report.chomp

        @cache.add_external(href, metadata, 0, report, false)

        add_failure(metadata, report, response_code) unless @runner.options[:only_4xx]
      end
handle_timeout(href, filenames, response_code) click to toggle source
# File lib/html_proofer/url_validator/external.rb, line 167
# Reports a request that timed out.
#
# The timeout is always written to the cache with status 0; it is added to the
# failed checks only when the :only_4xx option is not set.
def handle_timeout(href, filenames, response_code)
  timeout_msg = "External link #{href} failed: got a time out (response code #{response_code})"
  @cache.add_external(href, filenames, 0, timeout_msg, false)

  add_failure(filenames, timeout_msg, response_code) unless @runner.options[:only_4xx]
end
queue_request(method, url, filenames) click to toggle source
# File lib/html_proofer/url_validator/external.rb, line 72
# Builds a Typhoeus request for url and queues it on the hydra.
#
# method    - HTTP method, merged into the configured :typhoeus options.
# url       - object whose #url is the request target; it is also passed on
#             to response_handler when the request completes.
# filenames - passed on to response_handler.
def queue_request(method, url, filenames)
  request_options = @runner.options[:typhoeus].merge(method: method)

  request = Typhoeus::Request.new(url.url, request_options).tap do |req|
    # Give every registered callback a chance to adjust the request first.
    @before_request.each { |hook| hook.call(req) }
    req.on_complete do |response|
      response_handler(response, url, filenames)
    end
  end

  @hydra.queue(request)
end
response_handler(response, url, filenames) click to toggle source
# File lib/html_proofer/url_validator/external.rb, line 82
# Dispatches a completed response to the matching outcome handler.
#
# response  - completed response; its request, code, body, timed_out? and
#             status_message are read. NUL bytes are removed from the body
#             in place.
# url       - URL object forwarded to the hash check and to any retry.
# filenames - metadata forwarded to the cache and failure handlers.
def response_handler(response, url, filenames)
  request = response.request
  http_method = request.options[:method]
  href = request.base_url.to_s
  code = response.code
  response.body.delete!("\x00")

  @logger.log(:debug, "Received a #{code} for #{href}")

  return if @runner.options[:ignore_status_codes].include?(code)

  if code.between?(200, 299)
    # A truthy result means the hash check already recorded a failure.
    hash_failure = check_hash_in_2xx_response(href, url, response, filenames)
    @cache.add_external(href, filenames, code, "OK", true) unless hash_failure
  elsif response.timed_out?
    handle_timeout(href, filenames, code)
  elsif code.zero?
    handle_connection_failure(href, filenames, code, response.status_message)
  elsif http_method == :head
    # some servers don't support HEAD
    queue_request(:get, url, filenames)
  elsif !@runner.options[:only_4xx] || code.between?(400, 499)
    # Received a non-successful http response.
    detail = blank?(response.status_message) ? "" : ": #{response.status_message}"
    failure_msg = "External link #{href} failed#{detail}"
    add_failure(filenames, failure_msg, code)
    @cache.add_external(href, filenames, code, failure_msg, false)
  end
end
validate() click to toggle source
# File lib/html_proofer/url_validator/external.rb, line 26
# Checks the external links and returns the collected failures.
#
# The URLs come from the runner's external cache when the cache is enabled for
# external links, otherwise from the URLs given at construction.
def validate
  urls_to_check =
    if @cache.external_enabled?
      @runner.load_external_cache
    else
      @external_urls
    end

  summary = pluralize(urls_to_check.count, "external link", "external links")
  @logger.log(:info, "Checking #{summary}")

  run_external_link_checker(urls_to_check)

  @failed_checks
end

Private Instance Methods

new_url_query_values?(url) click to toggle source

Remembers the query keys already seen for each path, and reports later URLs that repeat them as not new.

# File lib/html_proofer/url_validator/external.rb, line 204
        # Tells whether url carries a combination of query keys not yet seen
        # for its domain path, recording the combination when it is new.
        #
        # Returns true for URLs without query values and for the first URL
        # with a given set of keys on a path; false for later repeats.
        def new_url_query_values?(url)
  query_values = url.query_values
  return true if query_values.nil?

  # Only the keys identify a query; the values are ignored.
  signature = query_values.keys.join("-")
  seen = (@paths_with_queries[url.domain_path] ||= [])
  return false if seen.include?(signature)

  seen << signature
  true
end