class RDF::Normalize::RDFC10

Public Class Methods

new(enumerable, **options) click to toggle source

Create an enumerable with grounded nodes

@param [RDF::Enumerable] enumerable
@option options [Integer] :max_calls (40)

Maximum number of calls allowed for recursive blank node labeling,
as a multiple of the total number of blank nodes in the dataset.

@option options [:MD5, :SHA1, :SHA2, :SHA256, :SHA384, :SHA512] :hash_algorithm (:SHA256)

See [Digest Algorithms](https://github.com/ruby/digest#digest-algorithms)

@return [RDF::Enumerable]
@raise [RuntimeError] if the maximum number of levels of recursion is exceeded.

# File lib/rdf/normalize/rdfc10.rb, line 24
# Create an enumerable with grounded nodes.
#
# @param enumerable [RDF::Enumerable] the dataset to canonicalize
# @param options [Hash] canonicalization options; :hash_algorithm
#   defaults to :SHA256 when not supplied.
# @raise [UnknownHashAlgorithm] when :hash_algorithm is not one of
#   the supported digest algorithm symbols.
def initialize(enumerable, **options)
  @dataset = enumerable
  @options = options
  # Default to SHA-256 when the caller did not choose an algorithm.
  @options[:hash_algorithm] ||= :SHA256
  supported = %i{MD5 SHA1 SHA2 SHA256 SHA384 SHA512}
  return if supported.include?(@options[:hash_algorithm])
  raise UnknownHashAlgorithm, "UnknownHashAlgorithm: #{@options[:hash_algorithm].inspect}. Use one of MD5, SHA1, SHA2, SHA256, SHA384, or SHA512"
end

Public Instance Methods

each(&block) click to toggle source

Yields each normalized statement

# File lib/rdf/normalize/rdfc10.rb, line 33
# Yields each normalized statement.
#
# Builds a fresh NormalizationState from the instance options, logs
# entry into the canonicalization function, and delegates the actual
# work to #normalize_statements.
def each(&block)
  state = NormalizationState.new(**@options)
  log_debug("ca:")
  log_debug("  log point", "Entering the canonicalization function (4.5.3).")
  log_depth(depth: 2) do
    normalize_statements(state, &block)
  end
end
to_hash() click to toggle source

Returns a map from input blank node identifiers to canonical blank node identifiers.

@return [Hash{String => String}]

# File lib/rdf/normalize/rdfc10.rb, line 43
# Returns a map from input blank node identifiers to canonical blank
# node identifiers.
#
# Runs the full canonicalization (without yielding statements) and
# then reads the resulting issuer's identifier map.
#
# @return [Hash{String => String}]
def to_hash
  state = NormalizationState.new(**@options)
  log_debug("ca:")
  log_debug("  log point", "Entering the canonicalization function (4.5.3).")
  log_depth(depth: 2) do
    normalize_statements(state)
  end
  state.canonical_issuer.to_hash
end

Protected Instance Methods

normalize_statements(ns, &block) click to toggle source
# File lib/rdf/normalize/rdfc10.rb, line 53
# Core of the RDFC-1.0 canonicalization algorithm: computes canonical
# labels for every blank node in +dataset+, mutating +ns+ as it goes.
#
# @param ns [NormalizationState] accumulates the bnode-to-statement map,
#   first-degree hashes, and the canonical identifier issuer.
# @yield [statement] when a block is given, yields each statement of the
#   dataset with blank nodes replaced by their canonical labels
#   (Step 6 below).
# @return [RDF::Enumerable] the original dataset.
def normalize_statements(ns, &block)
  # Step 2: Map BNodes to the statements they are used by
  dataset.each_statement do |statement|
    # Consider all four quad positions; the graph name may be nil,
    # hence the #compact before selecting blank nodes.
    statement.to_quad.compact.select(&:node?).each do |node|
      ns.add_statement(node, statement)
    end
  end
  log_debug("ca.2:")
  log_debug("  log point", "Extract quads for each bnode (4.5.3 (2)).")
  log_debug("  Bnode to quads:")
  # Serializing the full bnode-to-quads map is expensive; only do it
  # when the logger exists and is at its most verbose level (0).
  if logger && logger.level == 0
    ns.bnode_to_statements.each do |bn, statements|
      log_debug("    #{bn.id}:")
      statements.each do |s|
        log_debug {"      - #{s.to_nquads.strip}"}
      end
    end
  end

  # Reset hash buckets before recomputing first-degree hashes.
  ns.hash_to_bnodes = {}

  # Step 3: Calculate hashes for first degree nodes
  log_debug("ca.3:")
  log_debug("  log point", "Calculated first degree hashes (4.5.3 (3)).")
  log_debug("  with:")
  ns.bnode_to_statements.each_key do |node|
    log_debug("    - identifier") {node.id}
    log_debug("      h1dq:")
    hash = log_depth(depth: 8) {ns.hash_first_degree_quads(node)}
    ns.add_bnode_hash(node, hash)
  end

  # Step 4: Create canonical replacements for hashes mapping to a single node
  log_debug("ca.4:")
  log_debug("  log point", "Create canonical replacements for hashes mapping to a single node (4.5.3 (4)).")
  log_debug("  with:") unless ns.hash_to_bnodes.empty?
  # Hashes are visited in sorted order so issued labels are deterministic.
  ns.hash_to_bnodes.keys.sort.each do |hash|
    identifier_list = ns.hash_to_bnodes[hash]
    # Shared hashes (more than one node) are resolved in Step 5.
    next if identifier_list.length > 1
    node = identifier_list.first
    id = ns.canonical_issuer.issue_identifier(node)
    log_debug("    - identifier") {node.id}
    log_debug("      hash", hash)
    log_debug("      canonical label", id)
    # Remove resolved hashes so Step 5 only sees the ambiguous ones.
    ns.hash_to_bnodes.delete(hash)
  end

  # Step 5: Iterate over hashes having more than one node
  log_debug("ca.5:") unless ns.hash_to_bnodes.empty?
  log_debug("  log point", "Calculate hashes for identifiers with shared hashes (4.5.3 (5)).")
  log_debug("  with:") unless ns.hash_to_bnodes.empty?

  # Initialize the number of calls allowed to hash_n_degree_quads
  # as a multiple of the total number of blank nodes in the dataset.
  # This bounds the recursion for pathological ("poison") graphs.
  ns.max_calls = ns.bnode_to_statements.keys.length * @options.fetch(:max_calls, 40)

  ns.hash_to_bnodes.keys.sort.each do |hash|
    identifier_list = ns.hash_to_bnodes[hash]

    log_debug("    - hash", hash) 
    log_debug("      identifier list") {identifier_list.map(&:id).to_json(indent: ' ')}
    hash_path_list = []

    # Create a hash_path_list for all bnodes using a temporary identifier used to create canonical replacements
    log_debug("      ca.5.2:")
    log_debug("        log point", "Calculate hashes for identifiers with shared hashes (4.5.3 (5.2)).")
    log_debug("        with:") unless identifier_list.empty?
    identifier_list.each do |identifier|
      # Skip nodes that already received a canonical label in Step 4.
      next if ns.canonical_issuer.issued.include?(identifier)
      temporary_issuer = IdentifierIssuer.new("b")
      temporary_issuer.issue_identifier(identifier)
      log_debug("          - identifier") {identifier.id}
      hash_path_list << log_depth(depth: 12) {ns.hash_n_degree_quads(identifier, temporary_issuer)}
    end

    # Create canonical replacements for nodes
    log_debug("      ca.5.3:") unless hash_path_list.empty?
    log_debug("        log point", "Canonical identifiers for temporary identifiers (4.5.3 (5.3)).")
    log_debug("        issuer:") unless hash_path_list.empty?
    # Each entry is a [hash, issuer] pair; sorting by hash fixes the
    # order in which temporary identifiers become canonical.
    hash_path_list.sort_by(&:first).each do |result, issuer|
      issuer.issued.each do |node|
        id = ns.canonical_issuer.issue_identifier(node)
        log_debug("          - blank node") {node.id}
        log_debug("            canonical identifier", id)
      end
    end
  end

  # Step 6: Yield statements using BNodes from canonical replacements
  if block_given?
    dataset.each_statement do |statement|
      if statement.has_blank_nodes?
        quad = statement.to_quad.compact.map do |term|
          # Interned nodes with canonical labels replace the originals.
          term.node? ? RDF::Node.intern(ns.canonical_issuer.identifier(term)) : term
        end
        block.call RDF::Statement.from(quad)
      else
        # Ground statements pass through unchanged.
        block.call statement
      end
    end
  end

  log_debug("ca.6:")
  # NOTE(review): the "4.4.3" in this log message differs from the
  # "4.5.3" used by every other log point above — confirm which section
  # numbering the spec actually uses before changing it.
  log_debug("  log point", "Issued identifiers map (4.4.3 (6)).")
  log_debug("  issued identifiers map: #{ns.canonical_issuer.inspect}")
  dataset
end