class WebChecker
Constants
- IgnoreErrors
- LinkElementsXPath
- Schemas
- SchemasDir
- VERSION
Public Class Methods
new(site_uri:, site_dir:)
click to toggle source
# File lib/web-checker.rb, line 28 def initialize(site_uri:, site_dir:) @site_uri = Addressable::URI.parse(site_uri) @site_dir = Path.new(site_dir) @schemas = {} @visited = {} end
Public Instance Methods
check()
click to toggle source
# File lib/web-checker.rb, line 35 def check # get/parse robots # get/parse sitemap check_uri(@site_uri) end
check_css(uri, css)
click to toggle source
# File lib/web-checker.rb, line 137 def check_css(uri, css) css.gsub(/\burl\(\s*["'](.*?)["']\s*\)/) do check_uri(uri + $1) end end
check_html(uri, html)
click to toggle source
# File lib/web-checker.rb, line 75 def check_html(uri, html) check_html_tidy(uri, html) check_html_nokogiri(uri, html) end
check_html_nokogiri(uri, html)
click to toggle source
# File lib/web-checker.rb, line 104 def check_html_nokogiri(uri, html) doc_class = (html =~ /<!DOCTYPE html>/i) ? Nokogiri::HTML5 : Nokogiri::HTML doc = doc_class.parse(html) { |config| config.strict } unless doc.errors.empty? show_errors(doc.errors) raise Error, "HTML parsing failed (via Nokogiri)" end doc.xpath(LinkElementsXPath).each { |e| check_uri(uri + e.value) } end
check_html_tidy(uri, html)
click to toggle source
# File lib/web-checker.rb, line 80 def check_html_tidy(uri, html) tmp_file = Path.tmpfile tmp_file.write(html) errors = %x{tidy -utf8 -quiet -errors #{tmp_file} 2>&1}.split("\n") errors = errors.map { |str| # line 82 column 1 - Warning: <table> lacks "summary" attribute str =~ /^line (\d+) column (\d+) - (.*?): (.*)$/ or raise "Can't parse error: #{str.inspect}" { msg: str, line: $1.to_i, column: $2.to_i, type: $3.downcase.to_sym, error: $4.strip, } }.reject { |e| IgnoreErrors.include?(e[:error]) } unless errors.empty? warn "#{uri} has invalid HTML" show_errors(errors) raise Error, "HTML parsing failed (via Tidy)" end end
check_uri(uri)
click to toggle source
# File lib/web-checker.rb, line 41 def check_uri(uri) uri = Addressable::URI.parse(uri) uri.normalize! return unless local?(uri) && !seen?(uri) # ;;warn "CHECKING: #{uri}" response = HTTP.get(uri) # ;;pp(response: response) @visited[uri] = true case response.code when 200...300 body = response.body.to_s # ;;pp(body: body) case (type = response.headers['Content-Type']) when 'text/html' check_html(uri, body) when 'text/css' check_css(uri, body) when 'application/xml', 'text/xml' check_xml(uri, body) when 'image/jpeg', 'image/png', 'image/gif', 'application/javascript' # ignore else ;;warn "skipping unknown resource type: #{uri} (#{type})" end when 300...400 redirect_uri = Addressable::URI.parse(response.headers['Location']) check_uri(uri + redirect_uri) when 404 raise Error, "URI not found: #{uri}" else raise Error, "Bad status: #{response.inspect}" end end
check_xml(uri, xml)
click to toggle source
# File lib/web-checker.rb, line 114 def check_xml(uri, xml) xml_doc = Nokogiri::XML::Document.parse(xml) { |config| config.strict } unless xml_doc.errors.empty? show_errors(xml_doc.errors) raise Error, "XML parsing failed" end root_name = xml_doc.root.name schema_file = Schemas[root_name] or raise Error, "Unknown schema: #{root_name.inspect}" schema = (@schemas[schema_file] ||= Nokogiri::XML::Schema(schema_file.open)) validation_errors = schema.validate(xml_doc) unless validation_errors.empty? show_errors(validation_errors) raise Error, "XML validation failed" end xml_doc.xpath(LinkElementsXPath).each { |e| check_uri(uri + e.value) } end
local?(uri)
click to toggle source
# File lib/web-checker.rb, line 143 def local?(uri) (!uri.scheme && !uri.host) || (uri.scheme == @site_uri.scheme && uri.host == @site_uri.host && uri.port == @site_uri.port) end
report()
click to toggle source
# File lib/web-checker.rb, line 152 def report unless @files.empty? puts "\t" + "unreferenced files:" @files.sort.each do |path| puts "\t\t" + path.to_s end end end
seen?(uri)
click to toggle source
# File lib/web-checker.rb, line 148 def seen?(uri) @visited[uri] end
show_errors(errors)
click to toggle source
# File lib/web-checker.rb, line 131 def show_errors(errors) errors.each do |error| warn "#{error} [line #{error[:line]}, column #{error[:column]}]" end end