class ValidateWebsite::Crawl
Class for HTTP website validation: crawls a site with Spidr and validates each page.
Attributes
crawler [R]

The Spidr agent built by spidr_crawler (read-only).
Public Class Methods
new(options = {}, validation_type = :crawl)
Calls superclass method ValidateWebsite::Core::new
  # File lib/validate_website/crawl.rb, line 11
  def initialize(options = {}, validation_type = :crawl)
    super
    start_message(@site)
  end
Public Instance Methods
crawl(options = {})
@param [Hash] options
  :color     [Boolean] color output (true, false)
  :exclude   [String] a String used by Regexp.new
  :markup    [Boolean] check the markup validity
  :not_found [Boolean] check for not found pages (404)
  # File lib/validate_website/crawl.rb, line 26
  def crawl(options = {})
    @options = @options.merge(options)
    @options[:ignore_links] = @options[:exclude] if @options[:exclude]
    @crawler = spidr_crawler(@site, @options)
    print_status_line(@crawler.history.size, @crawler.failures.size,
                      @not_founds_count, @errors_count)
  end
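A minimal usage sketch: the option names follow the list above, and it assumes (not shown in this section) that ValidateWebsite::Core::new reads the start URL from a :site key, as the gem's CLI does.

  require 'validate_website/crawl'

  # :site handling is assumed to live in ValidateWebsite::Core::new.
  validator = ValidateWebsite::Crawl.new(site: 'http://localhost:4000/')
  validator.crawl(markup: true,            # check the markup validity
                  not_found: true,         # report 404s via not_found_error
                  exclude: 'logout|admin') # passed to Regexp.new as :ignore_links
  puts validator.history_count             # number of pages visited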
history_count()
  # File lib/validate_website/crawl.rb, line 16
  def history_count
    crawler.history.size
  end
Private Instance Methods
extract_imgs_from_page(page)
Extract image URLs from a page.

@param [Spidr::Page] page a Spidr::Page object
@return [Set] the set of absolute image URLs
  # File lib/validate_website/crawl.rb, line 44
  def extract_imgs_from_page(page)
    return Set[] if page.is_redirect?

    page.doc.search('//img[@src]').reduce(Set[]) do |result, elem|
      u = elem.attributes['src'].content
      result << page.to_absolute(URI.parse(WEBrick::HTTPUtils.escape(u)))
    end
  end
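As an aside, the reduce-into-a-Set shape above deduplicates repeated src values and skips src-less img tags; a standalone sketch of the same idiom over a bare Nokogiri document (assuming the nokogiri gem):

  require 'nokogiri'
  require 'set'

  doc  = Nokogiri::HTML('<img src="/a.png"><img src="/a.png"><img>')
  # Same XPath as above: only <img> elements that carry a src attribute.
  srcs = doc.search('//img[@src]').reduce(Set[]) { |set, el| set << el['src'] }
  srcs.to_a # => ["/a.png"] -- duplicates collapse into one entry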
on_every_css_page(crawler)
  # File lib/validate_website/crawl.rb, line 63
  def on_every_css_page(crawler)
    crawler.every_css_page do |page|
      check_css_syntax(page) if options[:css_syntax]
      ValidateWebsite::Utils.extract_urls_from_css(page).each do |u|
        crawler.enqueue(u)
      end
    end
  end
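ValidateWebsite::Utils.extract_urls_from_css is the gem's own helper and is not reproduced here; purely as an illustration of the kind of scan it performs, a hypothetical url(...) extraction over a CSS body might look like:

  # Hypothetical sketch, not the gem's implementation.
  css  = 'body { background: url("/bg.png") } @font-face { src: url(/f.woff2) }'
  urls = css.scan(/url\(\s*['"]?([^'")]+)['"]?\s*\)/).flatten
  urls # => ["/bg.png", "/f.woff2"] -- candidates to enqueue on the crawler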
on_every_failed_url(crawler)
  # File lib/validate_website/crawl.rb, line 91
  def on_every_failed_url(crawler)
    crawler.every_failed_url do |url|
      not_found_error(url)
    end
  end
on_every_html_page(crawler)
  # File lib/validate_website/crawl.rb, line 76
  def on_every_html_page(crawler)
    crawler.every_html_page do |page|
      extract_imgs_from_page(page).each do |i|
        crawler.enqueue(i)
      end

      if validate?(page)
        keys = %i[ignore html5_validator]
        # Hash#slice does not exist on Ruby <= 2.4
        slice = Hash[[keys, options.values_at(*keys)].transpose]
        validate(page.doc, page.body, page.url, slice)
      end
    end
  end
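The Hash[[keys, values].transpose] line is a hand-rolled Hash#slice for Rubies older than 2.5; the idiom in isolation, with made-up option values:

  keys = %i[ignore html5_validator]
  opts = { ignore: /fonts/, html5_validator: 'https://validator.nu', markup: true }

  # values_at keeps the key order, transpose pairs each key with its value,
  # and Hash[] rebuilds a hash -- equivalent to opts.slice(*keys) on Ruby >= 2.5.
  Hash[[keys, opts.values_at(*keys)].transpose]
  # => { ignore: /fonts/, html5_validator: "https://validator.nu" }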
spidr_crawler(site, options)
  # File lib/validate_website/crawl.rb, line 53
  def spidr_crawler(site, options)
    @host = URI(site).host
    Spidr.site(site, options) do |crawler|
      crawler.cookies[@host] = default_cookies if options[:cookies]
      on_every_css_page(crawler)
      on_every_html_page(crawler)
      on_every_failed_url(crawler) if options[:not_found]
    end
  end
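For context on the block above: Spidr.site yields the agent before the crawl starts, which is when the every_* callbacks get registered. A standalone sketch (assuming the spidr gem), independent of this class:

  require 'spidr'

  Spidr.site('http://localhost:4000/') do |agent|
    agent.every_html_page  { |page| puts "html: #{page.url}" }
    agent.every_css_page   { |page| puts "css:  #{page.url}" }
    agent.every_failed_url { |url|  warn "failed: #{url}" }
  end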
validate?(page)
  # File lib/validate_website/crawl.rb, line 72
  def validate?(page)
    options[:markup] && page.html? && !page.is_redirect?
  end