module RDF::Microdata::Reader::Nokogiri

Nokogiri implementation of an HTML parser.

@see nokogiri.org/

Public Class Methods

library() click to toggle source

Returns the name of the underlying XML library.

@return [Symbol]

# File lib/rdf/microdata/reader/nokogiri.rb, line 12
# Identifies the XML/HTML library backing this reader implementation.
#
# @return [Symbol] always +:nokogiri+
def self.library
  :nokogiri
end

Public Instance Methods

doc_base(base) click to toggle source

Find the value of the document base, preferring the `href` of a `<base>` element in the document head over the supplied base.

@param [String] base Existing base from URI or :base_uri

@return [String]

# File lib/rdf/microdata/reader/nokogiri.rb, line 224
# Determine the document base, letting a `<base href>` element in the
# document head override the supplied base.
#
# Any fragment identifier on the href is dropped. If the `<base>` element
# has no `href` attribute, or the href is empty or fragment-only, the
# supplied base is returned unchanged rather than being replaced by
# nil or an empty string.
#
# @param [String] base Existing base from URI or :base_uri
# @return [String] the base to use for the document
def doc_base(base)
  # find if the document has a base element
  base_el = @doc.at_css("html>head>base")
  return base unless base_el

  # Strip any fragment; an absent/empty href yields nil or "" here.
  href = base_el.attribute("href").to_s.split("#").first
  href.nil? || href.empty? ? base : href
end
doc_errors() click to toggle source

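Returns the document's parse errors, excluding messages about a missing or misplaced doctype and about an unexpected `?` where a start tag name is expected.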

# File lib/rdf/microdata/reader/nokogiri.rb, line 213
# Parse errors recorded on the document, with known-noise messages
# (doctype complaints and the unexpected-'?' start tag message) removed.
#
# @return [Array] the errors whose text does not match the ignore pattern
def doc_errors
  ignorable = %r{(The doctype must be the first token in the document)|(Expected a doctype token)|(Unexpected '\?' where start tag name is expected)}
  @doc.errors.reject { |err| ignorable.match?(err.to_s) }
end
find_element_by_id(id) click to toggle source

Look up an element in the document by id

# File lib/rdf/microdata/reader/nokogiri.rb, line 241
# Look up an element in the document by its id attribute.
#
# The id is passed to XPath as a variable binding instead of being
# interpolated into a CSS selector, so ids containing characters that are
# special in selectors (".", ":", "[", whitespace, a leading digit, ...)
# are matched literally rather than raising a selector syntax error or
# matching a different element.
#
# @param [#to_s] id value of the id attribute to search for
# @return [NodeProxy, nil] proxy for the first matching element, or nil
#   when the id is blank or no element carries it
def find_element_by_id(id)
  id = id.to_s
  return nil if id.empty?

  node = @doc.at_xpath('//*[@id=$id]', nil, { id: id })
  node && NodeProxy.new(node)
end
getItems() click to toggle source

Based on Microdata element.getItems

@see www.w3.org/TR/2011/WD-microdata-20110525/#top-level-microdata-items

# File lib/rdf/microdata/reader/nokogiri.rb, line 235
# Collect the top-level microdata items: elements carrying +itemscope+
# that do not also carry +itemprop+, each wrapped in a NodeProxy.
#
# @return [Array<NodeProxy>]
def getItems
  scoped = @doc.css('[itemscope]')
  top_level = scoped.reject { |node| node.has_attribute?('itemprop') }
  top_level.map { |node| NodeProxy.new(node) }
end
initialize_html(input, **options) click to toggle source

Initializes the underlying XML library.

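@param [Nokogiri::XML::Document, #read, String] input An already-parsed document, a readable stream, or the HTML source

@param [Hash{Symbol => Object}] options

@return [void]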

# File lib/rdf/microdata/reader/nokogiri.rb, line 181
# Initializes the underlying XML library and stores the parsed document
# in @doc.
#
# An input that is already a Nokogiri::XML::Document is used as-is.
# Otherwise the input is read (when it responds to #read) and parsed with
# Nokogiri::HTML5, falling back to Nokogiri::HTML.parse when the HTML5
# entry point raises LoadError or NoMethodError.
#
# The encoding is taken from options[:encoding], then from input.charset
# (when available), and finally defaults to 'utf-8'. It is applied to a
# copy of the source so that the caller's string is neither mutated nor
# required to be unfrozen.
#
# @param [Nokogiri::XML::Document, #read, String] input
# @param [Hash{Symbol => Object}] options
# @option options [#to_s] :encoding character encoding of the input
# @return [void]
def initialize_html(input, **options)
  require 'nokogiri' unless defined?(::Nokogiri)
  @doc = case input
  when ::Nokogiri::XML::Document
    input
  else
    # Explicit option wins; otherwise try to detect charset from input,
    # and finally default to utf-8.
    encoding = options[:encoding]
    encoding ||= input.charset if input.respond_to?(:charset)
    encoding = (encoding || 'utf-8').to_s

    begin
      input = input.read if input.respond_to?(:read)
      # dup: force_encoding mutates its receiver and raises FrozenError on
      # a frozen string, which the rescue below would not catch.
      ::Nokogiri::HTML5(input.dup.force_encoding(encoding), max_parse_errors: 1000)
    rescue LoadError, NoMethodError
      # HTML5 parser unavailable; use the HTML4 parser instead.
      # NOTE(review): base_uri is not defined in this block; presumably
      # supplied by the including reader.
      ::Nokogiri::HTML.parse(input, base_uri.to_s, encoding)
    end
  end
end
root() click to toggle source

Return proxy for document root

# File lib/rdf/microdata/reader/nokogiri.rb, line 207
# Proxy wrapping the document's root element.
#
# The proxy is created once and cached in @root; nil is returned whenever
# there is no document or the document has no root element.
#
# @return [NodeProxy, nil]
def root
  return nil unless @doc && @doc.root

  @root ||= NodeProxy.new(@doc.root)
end