module RDF::RDFa::Reader::Nokogiri

Nokogiri implementation of an XML parser.

@see nokogiri.org/

Public Class Methods

library() click to toggle source

Returns the name of the underlying XML library.

@return [Symbol]

# File lib/rdf/rdfa/reader/nokogiri.rb, line 12
# Names the XML library that backs this reader implementation.
#
# @return [Symbol] always +:nokogiri+
def self.library
  :nokogiri # identifier for the Nokogiri backend
end

Public Instance Methods

detect_host_language_version(input, **options) click to toggle source

Determine the host language and/or version from options and the input document

# File lib/rdf/rdfa/reader/nokogiri.rb, line 191
# Determine the host language and/or RDFa version from options and the input document.
#
# Explicit +:host_language+ / +:version+ options take precedence. Whatever is
# missing is sniffed from the DOCTYPE, the root element (its name and +version+
# attribute), the content type, and +<meta>+ declarations found near the start
# of the document. Results are stored in +@host_language+ and +@version+.
#
# @param [Nokogiri::XML::Document, Nokogiri::HTML::Document, #read, #to_s] input
#   an already parsed document, an IO-like object, or anything convertible to a String
# @param [Hash{Symbol => Object}] options
# @option options [#to_sym] :host_language explicit host language
# @option options [#to_sym] :version explicit RDFa version
# @return [Symbol, nil] the host language; +nil+ when both values were given explicitly
def detect_host_language_version(input, **options)
  @host_language = options[:host_language] ? options[:host_language].to_sym : nil
  @version = options[:version] ? options[:version].to_sym : nil
  return if @host_language && @version

  # Sniff version based on input
  case input
  when ::Nokogiri::XML::Document, ::Nokogiri::HTML::Document
    doc_root = input.root # nil for an empty document
    # Serialize the DTD node (nil becomes ""): the regexp checks below only
    # ever match against a String.
    doc_type_string = input.children.detect { |c| c.is_a?(::Nokogiri::XML::DTD) }.to_s
    version_attr = doc_root ? doc_root.attribute("version").to_s : ""
    root_element = doc_root ? doc_root.name.to_s.downcase : ""
    content_type = if root_element == "html"
      # An HTML-parsed tree is text/html; any other parsed tree with an
      # <html> root is treated as XHTML.
      input.is_a?(::Nokogiri::HTML::Document) ? "text/html" : "application/xhtml+xml"
    end
  else
    content_type = input.content_type if input.respond_to?(:content_type)

    # Determine from head of document
    head = if input.respond_to?(:read)
      input.rewind
      chunk = input.read(1000)
      input.rewind
      chunk.to_s
    else
      input.to_s[0..1000]
    end

    doc_type_string = head[%r{<!DOCTYPE[^>]*>}m].to_s
    # First tag that is neither a declaration (<!...>) nor a processing instruction (<?...?>)
    root_tag = head[%r{<[^!\?>]*>}m].to_s
    root_element = root_tag[%r{^<(\S+)[ >]}, 1].to_s
    version_attr = root_tag[/version\s*=\s*"([^"]+)"/m, 1].to_s
    head_element = head[%r{<head.*</head>}mi]
    head_doc = ::Nokogiri::HTML.parse(head_element.to_s)

    # May determine content-type and/or charset from meta.
    # Easiest way is to parse head into a document and iterate over CSS matches.
    # NOTE(review): +options+ is this method's own keyword hash, so the
    # :encoding written below does not appear to reach the caller - confirm
    # whether it is meant to be propagated.
    head_doc.css("meta").each do |meta|
      if meta.attr("http-equiv").to_s.downcase == 'content-type'
        media_type, parameter = meta.attr("content").to_s.downcase.split(";")
        media_type = media_type.to_s.strip
        # Keep the previously known content type when the meta value is blank
        content_type = media_type unless media_type.empty?
        charset = parameter.to_s[/charset=([^\s]*)$/i, 1]
        options[:encoding] = charset.downcase if charset
      elsif meta.attr("charset")
        options[:encoding] = meta.attr("charset").to_s.downcase
      end
    end
  end

  # Determine version from DOCTYPE and/or the root element's version attribute;
  # RDFa 1.1 is the default when nothing says otherwise.
  @version ||= if doc_type_string.match?(/RDFa 1\.0/) || version_attr.match?(/RDFa 1\.0/)
    :"rdfa1.0"
  else
    :"rdfa1.1"
  end

  @host_language ||= :xhtml1 if @version == :"rdfa1.0" && doc_type_string.match?(/html/i)

  @host_language ||= case content_type
  when "application/xml"  then :xml
  when "image/svg+xml"    then :svg
  when "text/html"
    case doc_type_string
    when /html 4/i        then :html4
    when /xhtml/i         then :xhtml1
    else                       :html5
    end
  when "application/xhtml+xml"
    case doc_type_string
    when /html 4/i        then :html4
    when /xhtml/i         then :xhtml1
    else                       :xhtml5
    end
  else
    # Unknown content type: fall back to the root element name
    root_element.match?(/svg/i) ? :svg : :html5
  end
end
doc_base(base) click to toggle source

Find value of document base

@param [String] base Existing base from URI or :base_uri

@return [String]

# File lib/rdf/rdfa/reader/nokogiri.rb, line 299
# Find value of document base.
#
# For the (X)HTML host languages the +href+ of +html>head>base+, with any
# fragment removed, is joined onto +base+. For every other host language the
# root element's +xml:base+ attribute is joined instead. When the document
# declares neither, +base+ is returned untouched.
#
# @param [#join] base Existing base from URI or :base_uri
#   (presumably an RDF::URI - only +#join+ is used here)
# @return [#join] the possibly updated base
def doc_base(base)
  # find if the document has a base element
  case @host_language
  when :xhtml1, :xhtml5, :html4, :html5
    base_el = @doc.at_css("html>head>base")
    # A <base> element may lack an href (e.g. it only sets a target) or have
    # an empty one; split/first then yields nil and there is nothing to join.
    href = base_el && base_el.attribute("href").to_s.split("#").first
    base = base.join(href) if href
  else
    doc_root = root
    xml_base = doc_root && (
      doc_root.attribute_with_ns("base", "http://www.w3.org/XML/1998/namespace") ||
      doc_root.attribute('xml:base')
    )
    base = base.join(xml_base) if xml_base
  end

  base
end
doc_errors() click to toggle source

Document errors

# File lib/rdf/rdfa/reader/nokogiri.rb, line 284
# Document errors, minus the ones considered harmless for the host language.
#
# For +:html5+ the doctype-position, missing-doctype and stray +?+ messages
# are dropped; for every other host language the "Tag ... invalid" and
# "Missing attribute name" messages are dropped.
#
# @return [Array] the remaining entries of +@doc.errors+
def doc_errors
  ignorable =
    if @host_language == :html5
      %r{(The doctype must be the first token in the document)|(Expected a doctype token)|(Unexpected '\?' where start tag name is expected)}
    else
      /(?:Tag \w+ invalid)|(?:Missing attribute name)/
    end

  @doc.errors.reject { |error| error.to_s =~ ignorable }
end
initialize_xml(input, **options) click to toggle source

Initializes the underlying XML library.

@param [Hash{Symbol => Object}] options

@return [void]

# File lib/rdf/rdfa/reader/nokogiri.rb, line 161
# Initializes the underlying XML library and parses +input+ into +@doc+.
#
# An already parsed Nokogiri document is used as-is. Anything else is parsed
# with the parser matching +@host_language+: the HTML parser for +:html4+,
# the HTML5 parser for +:html5+ (falling back to the HTML parser when HTML5
# support is unavailable), and the XML parser otherwise.
#
# @param [Nokogiri::XML::Document, Nokogiri::HTML::Document, #read, String] input
# @param [Hash{Symbol => Object}] options
# @option options [#to_s] :encoding
#   character encoding; defaults to +input.charset+ when available, else 'utf-8'
# @return [Object] the document now stored in +@doc+
def initialize_xml(input, **options)
  # Deliberately lazy: only load Nokogiri when this implementation is used
  require 'nokogiri' unless defined?(::Nokogiri)
  @doc = case input
  when ::Nokogiri::HTML::Document, ::Nokogiri::XML::Document
    input
  else
    # Try to detect charset from input, otherwise default to utf-8
    encoding = options[:encoding]
    encoding ||= input.charset if input.respond_to?(:charset)
    encoding = (encoding || 'utf-8').to_s

    case @host_language
    when :html4
      ::Nokogiri::HTML.parse(input, base_uri.to_s, encoding)
    when :html5
      html = input
      begin
        # Work on our own String: force_encoding mutates its receiver, which
        # would retag the caller's string and raise on a frozen one.
        html = input.respond_to?(:read) ? input.read : input.dup
        ::Nokogiri::HTML5(html.force_encoding(encoding), max_parse_errors: 1000)
      rescue LoadError, NoMethodError
        # HTML5 parser not available; fall back to the HTML parser
        ::Nokogiri::HTML.parse(html, base_uri.to_s, encoding)
      end
    else
      ::Nokogiri::XML.parse(input, base_uri.to_s, encoding)
    end
  end
end
root() click to toggle source

Return proxy for document root

# File lib/rdf/rdfa/reader/nokogiri.rb, line 278
# Return proxy for document root.
#
# The proxy is created on first use and cached in +@root+.
#
# @return [NodeProxy, nil] +nil+ when there is no document or it has no root
def root
  return unless @doc && @doc.root

  @root ||= NodeProxy.new(@doc.root)
end