class ValidateWebsite::Crawl

Class for HTTP website validation by crawling a live site.

Attributes

crawler [R]

The Spidr agent driving the crawl (read-only).

Public Class Methods

new(options = {}, validation_type = :crawl)
Calls superclass method ValidateWebsite::Core::new
# File lib/validate_website/crawl.rb, line 11
def initialize(options = {}, validation_type = :crawl)
  super
  start_message(@site)
end
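
A minimal construction sketch. The :site key is assumed here to mirror the validate-website CLI's --site option, and the URL is illustrative; neither is taken from this file:

require 'validate_website/crawl'

# Assumed option key :site, as accepted by the gem's CLI.
validator = ValidateWebsite::Crawl.new(site: 'http://example.com/')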

Public Instance Methods

crawl(options = {})

@param [Hash] options

:color [Boolean] colorize output (true, false)
:exclude [String] a pattern compiled with Regexp.new; matching links are not crawled
:markup [Boolean] check markup validity
:not_found [Boolean] check for not-found pages (404)
# File lib/validate_website/crawl.rb, line 26
def crawl(options = {})
  @options = @options.merge(options)
  @options[:ignore_links] = @options[:exclude] if @options[:exclude]

  @crawler = spidr_crawler(@site, @options)
  print_status_line(@crawler.history.size,
                    @crawler.failures.size,
                    @not_founds_count,
                    @errors_count)
end
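
A hedged usage sketch combining these options; the site URL and exclude pattern are illustrative:

validator = ValidateWebsite::Crawl.new(site: 'http://example.com/')
# :exclude is compiled with Regexp.new and mapped to Spidr's :ignore_links.
validator.crawl(markup: true, not_found: true, exclude: 'logout|redirect')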
history_count()

Number of pages the crawler has visited (the size of Spidr's history).
# File lib/validate_website/crawl.rb, line 16
def history_count
  crawler.history.size
end

Private Instance Methods

extract_imgs_from_page(page)

Extract image URLs from a page.

@param page [Spidr::Page] the page to scan for img tags
@return [Set] the set of absolute image URLs found on the page

# File lib/validate_website/crawl.rb, line 44
def extract_imgs_from_page(page)
  return Set[] if page.is_redirect?

  page.doc.search('//img[@src]').reduce(Set[]) do |result, elem|
    u = elem.attributes['src'].content
    result << page.to_absolute(URI.parse(WEBrick::HTTPUtils.escape(u)))
  end
end
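
The same XPath-plus-Set pattern can be exercised standalone with plain Nokogiri; the markup and base URL below are illustrative, not from the gem:

require 'set'
require 'nokogiri'
require 'uri'

html = '<img src="/a.png"><img src="/a.png"><p>no image here</p>'
doc  = Nokogiri::HTML(html)
urls = doc.search('//img[@src]').reduce(Set[]) do |result, elem|
  # Duplicate sources collapse because the accumulator is a Set.
  result << URI.join('http://example.com/', elem.attributes['src'].content)
end
# urls => #<Set: {#<URI::HTTP http://example.com/a.png>}>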
on_every_css_page(crawler)

Register the CSS page callback: optionally check syntax, then enqueue URLs referenced from stylesheets.
# File lib/validate_website/crawl.rb, line 63
def on_every_css_page(crawler)
  crawler.every_css_page do |page|
    check_css_syntax(page) if options[:css_syntax]
    ValidateWebsite::Utils.extract_urls_from_css(page).each do |u|
      crawler.enqueue(u)
    end
  end
end
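
ValidateWebsite::Utils.extract_urls_from_css is not shown on this page. As a rough illustration only, a url(...) reference scan could look like the simplified regex below; this is not the gem's implementation:

CSS_URL = /url\(\s*['"]?([^'")]+)['"]?\s*\)/.freeze

def extract_css_urls(css_body)
  # Returns the raw url(...) targets from a CSS string.
  css_body.scan(CSS_URL).flatten
end

extract_css_urls("body { background: url('/bg.png'); }")
# => ["/bg.png"]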
on_every_failed_url(crawler)

Register the failed-URL callback used for not-found (404) reporting.
# File lib/validate_website/crawl.rb, line 91
def on_every_failed_url(crawler)
  crawler.every_failed_url do |url|
    not_found_error(url)
  end
end
on_every_html_page(crawler)

Register the HTML page callback: enqueue image URLs and validate markup where applicable.
# File lib/validate_website/crawl.rb, line 76
def on_every_html_page(crawler)
  crawler.every_html_page do |page|
    extract_imgs_from_page(page).each do |i|
      crawler.enqueue(i)
    end

    if validate?(page)
      keys = %i[ignore html5_validator]
      # Hash#slice does not exist on Ruby <= 2.4, hence the transpose trick
      slice = Hash[[keys, options.values_at(*keys)].transpose]
      validate(page.doc, page.body, page.url, slice)
    end
  end
end
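
On Ruby 2.5 and later the transpose workaround is equivalent to the built-in Hash#slice, except that slice omits absent keys while the transpose form keeps them with nil values:

# Ruby >= 2.5 equivalent:
slice = options.slice(:ignore, :html5_validator)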
spidr_crawler(site, options)

Build the Spidr agent for site and wire up the crawl callbacks.
# File lib/validate_website/crawl.rb, line 53
def spidr_crawler(site, options)
  @host = URI(site).host
  Spidr.site(site, options) do |crawler|
    crawler.cookies[@host] = default_cookies if options[:cookies]
    on_every_css_page(crawler)
    on_every_html_page(crawler)
    on_every_failed_url(crawler) if options[:not_found]
  end
end
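
A minimal standalone sketch of the same Spidr wiring; example.com is illustrative, and only callbacks Spidr itself provides are used:

require 'spidr'

Spidr.site('http://example.com/') do |agent|
  agent.every_html_page  { |page| puts "HTML: #{page.url}" }
  agent.every_failed_url { |url|  warn "failed: #{url}" }
end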
validate?(page)

Whether the page should be markup-validated: true when the :markup option is set and the page is HTML and not a redirect.
# File lib/validate_website/crawl.rb, line 72
def validate?(page)
  options[:markup] && page.html? && !page.is_redirect?
end