class MicroMicro::Document
Constants
- HTML_IMAGE_CANDIDATE_STRINGS_ATTRIBUTES_MAP
A map of HTML `srcset` attributes and their associated element names
@see html.spec.whatwg.org/#srcset-attributes
@see html.spec.whatwg.org/#attributes-3
- HTML_URL_ATTRIBUTES_MAP
A map of HTML URL attributes and their associated element names
Attributes
Public Class Methods
Ignore this node?
@param node [Nokogiri::XML::Element]
@return [Boolean]
# File lib/micro_micro/document.rb, line 83
# Report whether the given node is one of the elements the parser skips.
#
# @param node [Nokogiri::XML::Element]
# @return [Boolean] true when the node's name appears in ignored_node_names
def self.ignore_node?(node)
  element_name = node.name

  ignored_node_names.include?(element_name)
end
A list of HTML element names the parser should ignore.
@return [Array<String>]
# File lib/micro_micro/document.rb, line 90
# Names of the HTML elements the parser should ignore.
#
# A new Array is built on every call.
#
# @return [Array<String>]
def self.ignored_node_names
  ['script', 'style', 'template']
end
Parse a string of HTML for microformats2-encoded data.
MicroMicro::Document.new('<a href="/" class="h-card" rel="me">Jason Garber</a>', 'https://sixtwothree.org')
Or, pull the source HTML of a page on the Web:
url = 'https://tantek.com' markup = Net::HTTP.get(URI.parse(url)) doc = MicroMicro::Document.new(markup, url)
@param markup [String] The HTML to parse for microformats2-encoded data.
@param base_url [String] The URL associated with markup. Used for relative URL resolution.
# File lib/micro_micro/document.rb, line 40
# Store the markup and its base URL, then resolve the document's relative
# URLs via resolve_relative_urls.
#
# @param markup [String] The HTML to parse for microformats2-encoded data.
# @param base_url [String] The URL associated with markup.
def initialize(markup, base_url)
  @markup, @base_url = markup, base_url

  resolve_relative_urls
end
@see microformats.org/wiki/microformats2-parsing#parse_an_element_for_properties
@see microformats.org/wiki/microformats2-parsing#parsing_for_implied_properties
@param context [Nokogiri::HTML::Document, Nokogiri::XML::NodeSet, Nokogiri::XML::Element]
@yield [context]
@return [String]
# File lib/micro_micro/document.rb, line 100
# Extract the stripped text content of +context+.
#
# Elements matching ignored_node_names are unlinked from +context+ first, so
# the object passed in is modified. When a block is given it receives
# +context+ after that removal and before the text is read.
#
# @param context [Nokogiri::HTML::Document, Nokogiri::XML::NodeSet, Nokogiri::XML::Element]
# @yield [context]
# @return [String]
def self.text_content_from(context)
  ignored_nodes = context.css(*ignored_node_names)
  ignored_nodes.unlink

  yield context if block_given?

  context.text.strip
end
Public Instance Methods
@return [String]
# File lib/micro_micro/document.rb, line 48
# Build a human-readable summary of this object: class name, object id in
# hexadecimal, and the inspected items and relationships.
#
# The inspected collections are passed to +format+ as arguments rather than
# interpolated into the template, so a literal "%" in the data (for example
# "%20" in a URL) is not misread as a format directive.
#
# @return [String]
def inspect
  format(
    '#<%s:%#0x items: %s, relationships: %s>',
    self.class.name,
    object_id,
    items.inspect,
    relationships.inspect
  )
end
A collection of items parsed from the provided markup.
@return [MicroMicro::Collections::ItemsCollection]
# File lib/micro_micro/document.rb, line 55
# The collection of items parsed from the document; built on first use and
# reused afterwards.
#
# @return [MicroMicro::Collections::ItemsCollection]
def items
  return @items if @items

  parsed_items = Item.items_from(document)
  @items = Collections::ItemsCollection.new(parsed_items)
end
A collection of relationships parsed from the provided markup.
@return [MicroMicro::Collections::RelationshipsCollection]
# File lib/micro_micro/document.rb, line 62
# The collection of relationships parsed from the document; built on first
# use and reused afterwards.
#
# @return [MicroMicro::Collections::RelationshipsCollection]
def relationships
  return @relationships if @relationships

  parsed_relationships = Relationship.relationships_from(document)
  @relationships = Collections::RelationshipsCollection.new(parsed_relationships)
end
Return the parsed document as a Hash.
@see microformats.org/wiki/microformats2-parsing#parse_a_document_for_microformats
@return [Hash{Symbol => Array, Hash}]
# File lib/micro_micro/document.rb, line 71
# Assemble the parsed document as a Hash with the keys :items, :rels and
# :'rel-urls'.
#
# @return [Hash{Symbol => Array, Hash}]
def to_h
  parsed = {}

  parsed[:items] = items.to_a
  parsed[:rels] = relationships.group_by_rel
  parsed[:'rel-urls'] = relationships.group_by_url

  parsed
end
Private Instance Methods
@return [Nokogiri::XML::Element, nil]
# File lib/micro_micro/document.rb, line 113
# Find the first <base> element carrying an href attribute in the markup.
#
# The lookup result is memoized even when it is nil: a plain `||=` would
# re-parse the markup on every call for documents without a <base href>.
#
# @return [Nokogiri::XML::Element, nil]
def base_element
  return @base_element if defined?(@base_element)

  @base_element = Nokogiri::HTML(markup).at('//base[@href]')
end
@return [Nokogiri::HTML::Document]
# File lib/micro_micro/document.rb, line 118
# The markup parsed by Nokogiri with the resolved base URL passed as the
# second argument; parsed on first use and reused afterwards.
#
# @return [Nokogiri::HTML::Document]
def document
  return @document if @document

  @document = Nokogiri::HTML(markup, resolved_base_url)
end
# File lib/micro_micro/document.rb, line 122
# Rewrite URL-bearing attributes in the parsed document as normalized URLs
# joined against resolved_base_url.
#
# Two passes are made:
#
# 1. For each attribute/element-names pair in HTML_URL_ATTRIBUTES_MAP, the
#    stripped attribute value is joined with the base URL and normalized.
# 2. For each pair in HTML_IMAGE_CANDIDATE_STRINGS_ATTRIBUTES_MAP, the value
#    is split on commas into candidates; each candidate's URL is resolved
#    and its descriptor (if any) is kept as-is.
#
# Blank candidates (e.g. from " , a.jpg 1x" or a whitespace-only attribute)
# are skipped; previously they produced a nil match and raised NoMethodError.
#
# NOTE(review): splitting on "," also splits URLs that themselves contain
# commas; that existing behavior is left unchanged here.
#
# @return [self]
def resolve_relative_urls
  HTML_URL_ATTRIBUTES_MAP.each do |attribute, names|
    document.xpath(*names.map { |name| "//#{name}[@#{attribute}]" }).each do |node|
      node[attribute] = Addressable::URI.join(resolved_base_url, node[attribute].strip).normalize.to_s
    end
  end

  HTML_IMAGE_CANDIDATE_STRINGS_ATTRIBUTES_MAP.each do |attribute, names|
    document.xpath(*names.map { |name| "//#{name}[@#{attribute}]" }).each do |node|
      # Drop blank entries so every remaining candidate is guaranteed to match.
      candidates = node[attribute].split(',').map(&:strip).reject(&:empty?)

      resolved = candidates.map do |candidate|
        parts = candidate.match(/^(?<url>.+?)(?<descriptor>\s+.+)?$/)

        "#{Addressable::URI.join(resolved_base_url, parts[:url]).normalize}#{parts[:descriptor]}"
      end

      node[attribute] = resolved.join(', ')
    end
  end

  self
end
@return [String]
# File lib/micro_micro/document.rb, line 141
# The URL used to resolve relative URLs in the document.
#
# When base_element is present, its stripped href is joined with base_url
# and normalized; otherwise base_url is returned unchanged. The result is
# cached after the first call.
#
# @return [String]
def resolved_base_url
  return @resolved_base_url if @resolved_base_url

  element = base_element

  @resolved_base_url =
    if element
      Addressable::URI.join(base_url, element['href'].strip).normalize.to_s
    else
      base_url
    end
end