class MicroMicro::Document

Constants

HTML_IMAGE_CANDIDATE_STRINGS_ATTRIBUTES_MAP

A map of HTML `srcset` attributes and their associated element names

@see https://html.spec.whatwg.org/#srcset-attributes
@see https://html.spec.whatwg.org/#attributes-3

HTML_URL_ATTRIBUTES_MAP

A map of HTML URL attributes and their associated element names

@see html.spec.whatwg.org/#attributes-3

Attributes

base_url[R]
markup[R]

Public Class Methods

ignore_node?(node) click to toggle source

Whether the parser should ignore this node, i.e. whether its element name is one of the ignored node names.

@param node [Nokogiri::XML::Element]
@return [Boolean]

# File lib/micro_micro/document.rb, line 83
# Report whether the parser should skip the given node.
#
# A node is skipped when its element name is equal to one of the names
# returned by +ignored_node_names+.
#
# @param node [#name] the node to test; only its +name+ is read
# @return [Boolean]
def self.ignore_node?(node)
  candidate_name = node.name

  ignored_node_names.any? { |ignored_name| ignored_name == candidate_name }
end
ignored_node_names() click to toggle source

A list of HTML element names the parser should ignore.

@return [Array<String>]

# File lib/micro_micro/document.rb, line 90
# Names of the HTML elements the parser should ignore.
#
# A new Array is built on every call, so callers may modify the result
# without affecting later calls.
#
# @return [Array<String>]
def self.ignored_node_names
  ['script', 'style', 'template']
end
new(markup, base_url) click to toggle source

Parse a string of HTML for microformats2-encoded data.

MicroMicro::Document.new('<a href="/" class="h-card" rel="me">Jason Garber</a>', 'https://sixtwothree.org')

Or, pull the source HTML of a page on the Web:

url = 'https://tantek.com'
markup = Net::HTTP.get(URI.parse(url))

doc = MicroMicro::Document.new(markup, url)

@param markup [String] The HTML to parse for microformats2-encoded data.
@param base_url [String] The URL associated with markup. Used for relative URL resolution.

# File lib/micro_micro/document.rb, line 40
# Store the raw markup and its base URL, then immediately resolve the
# document's relative URLs via +resolve_relative_urls+.
#
# @param markup [String] the HTML to parse for microformats2-encoded data
# @param base_url [String] the URL associated with +markup+
def initialize(markup, base_url)
  @base_url = base_url
  @markup = markup

  # Runs at construction time, so URL resolution errors surface here.
  resolve_relative_urls
end
text_content_from(context) { |context| ... } click to toggle source

@see https://microformats.org/wiki/microformats2-parsing#parse_an_element_for_properties
@see https://microformats.org/wiki/microformats2-parsing#parsing_for_implied_properties

@param context [Nokogiri::HTML::Document, Nokogiri::XML::NodeSet, Nokogiri::XML::Element]
@yield [context]
@return [String]

# File lib/micro_micro/document.rb, line 100
# Extract the whitespace-trimmed text content of +context+.
#
# Elements matching +ignored_node_names+ are selected and unlinked first.
# NOTE(review): +unlink+ presumably detaches those nodes from +context+
# itself, so the caller's object is modified - confirm against Nokogiri.
#
# If a block is given it receives +context+ after the ignored elements were
# unlinked and before the text is read; its return value is not used.
#
# @param context [Nokogiri::HTML::Document, Nokogiri::XML::NodeSet, Nokogiri::XML::Element]
# @yield [context]
# @return [String]
def self.text_content_from(context)
  ignored_nodes = context.css(*ignored_node_names)
  ignored_nodes.unlink

  yield context if block_given?

  raw_text = context.text
  raw_text.strip
end

Public Instance Methods

inspect() click to toggle source

@return [String]

# File lib/micro_micro/document.rb, line 48
# Build a human-readable summary of this document: class name, object id in
# hexadecimal, and the +inspect+ output of +items+ and +relationships+.
#
# The inspected collections are passed to +format+ as arguments rather than
# interpolated into the template. Interpolating them meant any "%" in their
# output (for example a percent-encoded URL such as "a%20b") was treated as
# a format directive and raised +ArgumentError+.
#
# @return [String]
def inspect
  format(
    '#<%s:%#0x items: %s, relationships: %s>',
    self.class.name,
    object_id,
    items.inspect,
    relationships.inspect
  )
end
items() click to toggle source

A collection of items parsed from the provided markup.

@return [MicroMicro::Collections::ItemsCollection]

# File lib/micro_micro/document.rb, line 55
# The collection of items parsed from the document.
#
# Built on first use from +Item.items_from(document)+ and cached in +@items+.
#
# @return [MicroMicro::Collections::ItemsCollection]
def items
  @items ||= begin
    parsed_items = Item.items_from(document)

    Collections::ItemsCollection.new(parsed_items)
  end
end
relationships() click to toggle source

A collection of relationships parsed from the provided markup.

@return [MicroMicro::Collections::RelationshipsCollection]

# File lib/micro_micro/document.rb, line 62
# The collection of relationships parsed from the document.
#
# Built on first use from +Relationship.relationships_from(document)+ and
# cached in +@relationships+.
#
# @return [MicroMicro::Collections::RelationshipsCollection]
def relationships
  @relationships ||= begin
    parsed_relationships = Relationship.relationships_from(document)

    Collections::RelationshipsCollection.new(parsed_relationships)
  end
end
to_h() click to toggle source

Return the parsed document as a Hash.

@see microformats.org/wiki/microformats2-parsing#parse_a_document_for_microformats

@return [Hash{Symbol => Array, Hash}]

# File lib/micro_micro/document.rb, line 71
# Return the parsed document as a Hash with three keys: +:items+, +:rels+ and
# +:'rel-urls'+ (in that order).
#
# @see https://microformats.org/wiki/microformats2-parsing#parse_a_document_for_microformats
#
# @return [Hash{Symbol => Array, Hash}]
def to_h
  item_list = items.to_a
  parsed_relationships = relationships

  {
    items: item_list,
    rels: parsed_relationships.group_by_rel,
    'rel-urls': parsed_relationships.group_by_url
  }
end

Private Instance Methods

base_element() click to toggle source

@return [Nokogiri::XML::Element, nil]

# File lib/micro_micro/document.rb, line 113
# The first +base+ element carrying an +href+ attribute in the markup, if any.
#
# The markup is parsed separately from +document+ here. The result is
# memoized with +defined?+ rather than +||=+ so that a +nil+ result (no
# matching element) is cached too, instead of re-parsing the whole markup on
# every call.
#
# @return [Nokogiri::XML::Element, nil]
def base_element
  return @base_element if defined?(@base_element)

  @base_element = Nokogiri::HTML(markup).at('//base[@href]')
end
document() click to toggle source

@return [Nokogiri::HTML::Document]

# File lib/micro_micro/document.rb, line 118
# The parsed form of +markup+, created on first use and cached in +@document+.
#
# +resolved_base_url+ is passed to +Nokogiri::HTML+ as its second argument.
#
# @return [Nokogiri::HTML::Document]
def document
  return @document if @document

  @document = Nokogiri::HTML(markup, resolved_base_url)
end
resolve_relative_urls() click to toggle source
# File lib/micro_micro/document.rb, line 122
# Rewrite URL-bearing attributes in +document+ as absolute, normalized URLs
# resolved against +resolved_base_url+.
#
# Two passes are made:
#
# 1. Attributes listed in +HTML_URL_ATTRIBUTES_MAP+ hold a single URL, which
#    is stripped of surrounding whitespace and resolved.
# 2. Attributes listed in +HTML_IMAGE_CANDIDATE_STRINGS_ATTRIBUTES_MAP+ hold
#    a comma-separated list of image candidate strings; each candidate's URL
#    is resolved and its descriptor kept.
#
# @return [self]
def resolve_relative_urls
  HTML_URL_ATTRIBUTES_MAP.each do |attribute, names|
    document.xpath(*names.map { |name| "//#{name}[@#{attribute}]" }).each do |node|
      node[attribute] = Addressable::URI.join(resolved_base_url, node[attribute].strip).normalize.to_s
    end
  end

  HTML_IMAGE_CANDIDATE_STRINGS_ATTRIBUTES_MAP.each do |attribute, names|
    document.xpath(*names.map { |name| "//#{name}[@#{attribute}]" }).each do |node|
      node[attribute] = resolve_image_candidate_strings(node[attribute])
    end
  end

  self
end

# Resolve the URL of every candidate in a comma-separated list of image
# candidate strings.
#
# Candidates that do not match the pattern - in practice blank ones, as left
# by a trailing or doubled comma - are dropped. Previously they produced a
# +nil+ match and raised +NoMethodError+.
#
# @param value [String] raw attribute value, e.g. "a.jpg 1x, b.jpg 2x"
# @return [String] candidates with resolved URLs, joined by ", "
def resolve_image_candidate_strings(value)
  candidates = value.split(',').map do |candidate|
    candidate.strip.match(/^(?<url>.+?)(?<descriptor>\s+.+)?$/)
  end

  resolved = candidates.compact.map do |candidate|
    "#{Addressable::URI.join(resolved_base_url, candidate[:url]).normalize}#{candidate[:descriptor]}"
  end

  resolved.join(', ')
end
resolved_base_url() click to toggle source

@return [String]

# File lib/micro_micro/document.rb, line 141
# The URL against which relative URLs in the markup are resolved.
#
# When +base_element+ is present, its stripped +href+ is joined onto
# +base_url+ and normalized; otherwise +base_url+ is used unchanged. The
# result is cached in +@resolved_base_url+.
#
# @return [String]
def resolved_base_url
  return @resolved_base_url if @resolved_base_url

  element = base_element

  @resolved_base_url =
    if element
      Addressable::URI.join(base_url, element['href'].strip).normalize.to_s
    else
      base_url
    end
end