class HTMLProofer::UrlValidator

Attributes

external_urls[R]

Public Class Methods

new(logger, external_urls, options) click to toggle source
# File lib/html-proofer/url_validator.rb, line 13
# Sets up the validator's state.
#
# logger        - stored as @logger and also handed to the cache.
# external_urls - stored as @external_urls (other methods in this class
#                 iterate it as a Hash of URL => filenames).
# options       - Hash of settings; its :hydra entry configures the
#                 Typhoeus::Hydra instance and its :cache entry the Cache.
def initialize(logger, external_urls, options)
  @options = options
  @logger = logger
  @external_urls = external_urls
  # Issues collected while checking; this is what #run hands back.
  @failed_tests = []
  @hydra = Typhoeus::Hydra.new(options[:hydra])
  @cache = Cache.new(logger, options[:cache])
end

Public Instance Methods

add_external_issue(filenames, desc, status = nil) click to toggle source
# File lib/html-proofer/url_validator.rb, line 224
# Records a failure in @failed_tests.
#
# filenames - list of files the URL was found in, or nil when there is no
#             associated file (possible if we're checking an array of links).
# desc      - description of the problem.
# status    - optional status value passed through to the Issue.
#
# One Issue is appended per filename; with nil filenames a single Issue with
# an empty path is appended instead.
def add_external_issue(filenames, desc, status = nil)
  return @failed_tests << Issue.new('', desc, status: status) if filenames.nil?

  filenames.each do |filename|
    @failed_tests << Issue.new(filename, desc, status: status)
  end
end
check_hash_in_2xx_response(href, effective_url, response, filenames) click to toggle source

Even though the response was a success, we may also have been asked to verify that the hash (the URL fragment) actually exists on the returned page.

# File lib/html-proofer/url_validator.rb, line 180
# Checks that the fragment ("hash") of a successfully fetched URL exists in
# the returned document.
#
# href          - the URL that was requested (may carry a fragment).
# effective_url - the URL reported for the response; used in the message only.
# response      - response object; its #body is parsed and its #code recorded.
# filenames     - files the link appears in (or nil), passed on to the issue.
#
# Returns true when the fragment is missing (an issue is recorded and the
# failure is cached), false when no check was needed or the fragment exists.
def check_hash_in_2xx_response(href, effective_url, response, filenames)
  return false if @options[:only_4xx]
  return false unless @options[:check_external_hash]
  return false unless (hash = hash?(href))

  body_doc = create_nokogiri(response.body)

  unencoded_hash = Addressable::URI.unescape(hash)
  # Look for the fragment, escaped or unescaped, as a name or id on ANY
  # element. Every alternative must start with `//`; a single `/` would only
  # ever inspect the document's root element.
  xpath = [
    %(//*[@name="#{hash}"]),
    %(//*[@name="#{unencoded_hash}"]),
    %(//*[@id="#{hash}"]),
    %(//*[@id="#{unencoded_hash}"])
  ]
  # user-content is a special addition by GitHub.
  if URI.parse(href).host =~ /github\.com/i
    xpath << %(//*[@name="user-content-#{hash}"])
    xpath << %(//*[@id="user-content-#{hash}"])
    # when linking to a file on GitHub, like #L12-L34, only the first "L"
    # portion will be identified as a linkable portion, so capture the whole
    # first line number (L12), not just its first digit.
    line_anchor = hash[/\A(L\d+)/, 1]
    xpath << %(//td[@id="#{line_anchor}"]) if line_anchor
  end

  return false unless body_doc.xpath(xpath.join('|')).empty?

  msg = "External link #{href} failed: #{effective_url} exists, but the hash '#{hash}' does not"
  add_external_issue(filenames, msg, response.code)
  @cache.add(href, filenames, response.code, msg)
  true
end
clean_url(href) click to toggle source
# File lib/html-proofer/url_validator.rb, line 126
# Returns a URL that is safe to request.
#
# href - the URL string as found in the document.
#
# The URL is parsed first so that obviously malformed input (like strings in
# port numbers) raises from Addressable::URI.parse. If the whole string then
# consists only of characters that may appear unescaped in a URL, or of
# well-formed %XX escapes, it is returned untouched; otherwise the normalized
# form of the parsed URI is returned.
def clean_url(href)
  parsed = Addressable::URI.parse(href)
  # `#` and `$` are escaped so that Ruby does not treat `#$&` as an
  # interpolation of the last-match global, and \A/\z anchor the whole string
  # rather than a single line of it.
  return href if href =~ /\A(?:[!\#\$&-;=?-\[\]_a-z~]|%[0-9a-fA-F]{2})+\z/

  parsed.normalize
end
establish_queue(external_urls) click to toggle source
# File lib/html-proofer/url_validator.rb, line 108
# Queues one request per external URL.
#
# external_urls - Hash of URL => filenames.
#
# Each URL is first passed through clean_url; if that raises a URI error an
# "invalid URL" issue is recorded for its files and the URL is skipped.
# URLs with a hash are fetched with GET when :check_external_hash is set
# (the body is needed to look for the hash); everything else uses HEAD.
def establish_queue(external_urls)
  external_urls.each_pair do |raw_url, filenames|
    begin
      target = clean_url(raw_url)
    rescue URI::Error, Addressable::URI::InvalidURIError
      add_external_issue(filenames, "#{raw_url} is an invalid URL")
      next
    end

    verb = hash?(target) && @options[:check_external_hash] ? :get : :head
    queue_request(verb, target, filenames)
  end
end
extract_domain_path(uri) click to toggle source
# File lib/html-proofer/url_validator.rb, line 70
# Builds the "host + path" key for a parsed URI.
#
# uri - object responding to #host and #path.
#
# Returns the host immediately followed by the path. Interpolation is used so
# that a URI without a host (host is nil) yields just its path instead of
# raising NoMethodError.
def extract_domain_path(uri)
  "#{uri.host}#{uri.path}"
end
handle_failure(href, filenames, response_code, return_message) click to toggle source
# File lib/html-proofer/url_validator.rb, line 213
# Records a request that failed without a usable HTTP response.
#
# href           - URL used in the message and as the cache key.
# filenames      - files the link appears in (or nil).
# response_code  - code reported for the response; shown in the message and
#                  attached to the issue.
# return_message - message reported for the failure, appended to the text.
#
# The failure is always cached (with a status of 0); the issue itself is
# only reported when the :only_4xx option is off.
def handle_failure(href, filenames, response_code, return_message)
  explanation = "External link #{href} failed: response code #{response_code} means something's wrong.
         It's possible libcurl couldn't connect to the server or perhaps the request timed out.
         Sometimes, making too many requests at once also breaks things.
         Either way, the return message (if any) from the server is: #{return_message}"
  @cache.add(href, filenames, 0, explanation)
  add_external_issue(filenames, explanation, response_code) unless @options[:only_4xx]
end
handle_timeout(href, filenames, response_code) click to toggle source
# File lib/html-proofer/url_validator.rb, line 205
# Records a request that timed out.
#
# href          - URL used in the message and as the cache key.
# filenames     - files the link appears in (or nil).
# response_code - code reported for the response; shown in the message and
#                 attached to the issue.
#
# The timeout is always cached (with a status of 0); the issue itself is
# only reported when the :only_4xx option is off.
def handle_timeout(href, filenames, response_code)
  timeout_msg = "External link #{href} failed: got a time out (response code #{response_code})"
  @cache.add(href, filenames, 0, timeout_msg)
  add_external_issue(filenames, timeout_msg, response_code) unless @options[:only_4xx]
end
hash?(url) click to toggle source

Does the URL have a hash?

# File lib/html-proofer/url_validator.rb, line 234
# Does the URL have a hash?
#
# url - the URL to inspect.
#
# Returns the URL's fragment (a String, truthy) when it has one, nil when it
# has none, and false when the URL cannot be parsed by URI.
def hash?(url)
  begin
    parsed = URI.parse(url)
  rescue URI::InvalidURIError
    return false
  end

  parsed.fragment
end
load_cache() click to toggle source
# File lib/html-proofer/url_validator.rb, line 74
# Logs how many links the cache holds, then asks the cache which of
# @external_urls to work on.
#
# Returns whatever @cache.retrieve_urls returns for @external_urls.
def load_cache
  summary = pluralize(@cache.size, 'link', 'links')
  @logger.log :info, "Found #{summary} in the cache..."

  @cache.retrieve_urls(@external_urls)
end
new_url_query_values?(uri, paths_with_queries) click to toggle source

Remembers the query-parameter keys already seen for each domain and path, so that later URLs which only repeat them can be ignored.

# File lib/html-proofer/url_validator.rb, line 56
# Tells whether this URI's set of query keys is new for its domain + path,
# remembering it if so.
#
# uri                - parsed URI; must respond to #query_values (a Hash).
# paths_with_queries - Hash of "host + path" => list of key signatures seen
#                      so far; updated in place.
#
# The signature is the query keys joined with '-' (values are not part of
# it). Returns true the first time a signature is seen for a path, false on
# every repeat.
def new_url_query_values?(uri, paths_with_queries)
  signature = uri.query_values.keys.join('-')
  seen = (paths_with_queries[extract_domain_path(uri)] ||= [])
  return false if seen.include?(signature)

  seen << signature
  true
end
queue_request(method, href, filenames) click to toggle source
# File lib/html-proofer/url_validator.rb, line 136
# Queues a request on @hydra.
#
# method    - HTTP method symbol to use (callers in this class pass :head or :get).
# href      - URL to request.
# filenames - files the link appears in; handed to response_handler together
#             with the response once the request completes.
#
# The request options are the configured :typhoeus options plus the method;
# the configured Hash itself is left unmodified.
def queue_request(method, href, filenames)
  request = Typhoeus::Request.new(href, @options[:typhoeus].merge(method: method))
  request.on_complete do |response|
    response_handler(response, filenames)
  end
  @hydra.queue(request)
end
remove_query_values() click to toggle source
# File lib/html-proofer/url_validator.rb, line 36
# Builds a copy of @external_urls without URLs whose query keys merely
# repeat ones already seen for the same domain and path.
#
# Returns nil when @external_urls is nil; otherwise a new Hash (the original
# is not modified). URLs without a query are always kept. URLs that cannot
# be parsed are logged as errors and kept as well.
def remove_query_values
  return nil if @external_urls.nil?

  seen_queries = {}
  @external_urls.dup.delete_if do |url, _filenames|
    parsed = begin
               Addressable::URI.parse(url)
             rescue URI::Error, Addressable::URI::InvalidURIError
               @logger.log :error, "#{url} is an invalid URL"
               nil
             end
    next false if parsed.nil? || parsed.query.nil?

    # Drop the URL only when its query signature has been seen before.
    !new_url_query_values?(parsed, seen_queries)
  end
end
response_handler(response, filenames) click to toggle source
# File lib/html-proofer/url_validator.rb, line 143
# Completion callback for a queued request: decides whether the response is
# a success, a failure worth reporting, or needs a retry with GET.
#
# response  - the completed response (its request, code, body and
#             return_message are read here).
# filenames - files the link appears in, or nil.
#
# Outcomes, in order of precedence:
# * status listed in :http_status_ignore - nothing happens;
# * 2xx - cached as a success unless the hash check records a failure;
# * timed out - handle_timeout;
# * status 0 - handle_failure;
# * any other status on a HEAD request - retried as GET;
# * otherwise - reported and cached as a failure (with :only_4xx, only
#   4xx statuses are).
def response_handler(response, filenames)
  effective_url = response.options[:effective_url]
  href = response.request.base_url.to_s
  method = response.request.options[:method]
  response_code = response.code
  # Strip NUL bytes from the body in place before it is inspected.
  response.body.delete!("\x00")

  debug_msg = if filenames.nil?
                "Received a #{response_code} for #{href}"
              else
                "Received a #{response_code} for #{href}  in #{filenames.join(' ')}"
              end

  @logger.log :debug, debug_msg

  return if @options[:http_status_ignore].include?(response_code)

  if response_code.between?(200, 299)
    @cache.add(href, filenames, response_code) unless check_hash_in_2xx_response(href, effective_url, response, filenames)
  elsif response.timed_out?
    handle_timeout(href, filenames, response_code)
  elsif response_code.zero?
    # Report and cache under the requested URL, like every other branch, so
    # the entry matches the link as it appears in the checked files.
    handle_failure(href, filenames, response_code, response.return_message)
  elsif method == :head
    # The HEAD request was not answered with a success; try again with GET
    # before declaring the link broken.
    queue_request(:get, href, filenames)
  else
    return if @options[:only_4xx] && !response_code.between?(400, 499)

    # Received a non-successful http response.
    msg = "External link #{href} failed: #{response_code} #{response.return_message}"
    add_external_issue(filenames, msg, response_code)
    @cache.add(href, filenames, response_code, msg)
  end
end
run() click to toggle source
# File lib/html-proofer/url_validator.rb, line 22
# Checks the external URLs and returns the collected failures.
#
# @external_urls is first replaced by the result of remove_query_values.
# When the cache is in use, only the URLs returned by load_cache are checked
# and the cache is written afterwards; otherwise every remaining URL is
# checked.
#
# Returns @failed_tests.
def run
  @external_urls = remove_query_values

  unless @cache.use_cache?
    external_link_checker(@external_urls)
    return @failed_tests
  end

  external_link_checker(load_cache)
  @cache.write

  @failed_tests
end