class RDF::Normalize::RDFC10
Public Class Methods
new(enumerable, **options)
click to toggle source
Create an enumerable with grounded nodes
@param [RDF::Enumerable] enumerable
@option options [Integer] :max_calls (40)
Maximum number of calls allowed for recursive blank node labeling, as a multiple of the total number of blank nodes in the dataset.
@option options [:MD5, :SHA1, :SHA2, :SHA256, :SHA384, :SHA512] :hash_algorithm (:SHA256)
See [Digest Algorithms](https://github.com/ruby/digest#digest-algorithms)
@return [RDF::Enumerable]
@raise [RuntimeError] if the maximum number of levels of recursion is exceeded.
# File lib/rdf/normalize/rdfc10.rb, line 24
# Create an enumerable with grounded nodes.
#
# @param [RDF::Enumerable] enumerable the dataset to be canonicalized
# @option options [Integer] :max_calls (40)
#   Maximum number of calls allowed for recursive blank node labeling,
#   as a multiple of the total number of blank nodes in the dataset.
# @option options [:MD5, :SHA1, :SHA2, :SHA256, :SHA384, :SHA512] :hash_algorithm (:SHA256)
#   See [Digest Algorithms](https://github.com/ruby/digest#digest-algorithms)
# @raise [UnknownHashAlgorithm] if :hash_algorithm is not one of the supported symbols
def initialize(enumerable, **options)
  @dataset = enumerable
  @options = options
  # Default the digest before validating, so the default itself is checked too.
  @options[:hash_algorithm] ||= :SHA256
  algorithm = @options[:hash_algorithm]
  return if %i{MD5 SHA1 SHA2 SHA256 SHA384 SHA512}.include?(algorithm)
  raise UnknownHashAlgorithm, "UnknownHashAlgorithm: #{algorithm.inspect}. Use one of MD5, SHA1, SHA2, SHA256, SHA384, or SHA512"
end
Public Instance Methods
each(&block)
click to toggle source
Yields each normalized statement
# File lib/rdf/normalize/rdfc10.rb, line 33
# Yields each normalized statement.
#
# Builds a fresh NormalizationState, logs entry into the canonicalization
# function, and delegates the actual work to #normalize_statements,
# forwarding the caller's block so each canonicalized statement is yielded.
def each(&block)
  state = NormalizationState.new(**@options)
  log_debug("ca:")
  log_debug(" log point", "Entering the canonicalization function (4.5.3).")
  log_depth(depth: 2) do
    normalize_statements(state, &block)
  end
end
to_hash()
click to toggle source
Returns a map from input blank node identifiers to canonical blank node identifiers.
@return [Hash{String => String}]
# File lib/rdf/normalize/rdfc10.rb, line 43
# Returns a map from input blank node identifiers to canonical blank node identifiers.
#
# Runs the full canonicalization (discarding the statement stream) and then
# exposes the canonical issuer's accumulated identifier mapping.
#
# @return [Hash{String => String}]
def to_hash
  state = NormalizationState.new(**@options)
  log_debug("ca:")
  log_debug(" log point", "Entering the canonicalization function (4.5.3).")
  log_depth(depth: 2) do
    normalize_statements(state)
  end
  state.canonical_issuer.to_hash
end
Protected Instance Methods
normalize_statements(ns, &block)
click to toggle source
# File lib/rdf/normalize/rdfc10.rb, line 53
# Core of the RDFC-1.0 canonicalization algorithm (spec section 4.5.3).
# Mutates +ns+ (the NormalizationState) — fills its bnode/statement maps and
# issues canonical labels via its canonical issuer — and, when a block is
# given, yields each statement with blank nodes replaced by canonical labels.
#
# @param [NormalizationState] ns mutable canonicalization state
# @param [Proc] block optional receiver for relabeled statements
# @return [RDF::Enumerable] the original dataset
def normalize_statements(ns, &block)
  # Step 2: Map BNodes to the statements they are used by
  dataset.each_statement do |statement|
    statement.to_quad.compact.select(&:node?).each do |node|
      ns.add_statement(node, statement)
    end
  end
  log_debug("ca.2:")
  log_debug(" log point", "Extract quads for each bnode (4.5.3 (2)).")
  log_debug(" Bnode to quads:")
  # Dump the full bnode-to-quads map only at the most verbose logger level.
  if logger && logger.level == 0
    ns.bnode_to_statements.each do |bn, statements|
      log_debug(" #{bn.id}:")
      statements.each do |s|
        log_debug {" - #{s.to_nquads.strip}"}
      end
    end
  end

  ns.hash_to_bnodes = {}

  # Step 3: Calculate hashes for first degree nodes
  log_debug("ca.3:")
  log_debug(" log point", "Calculated first degree hashes (4.5.3 (3)).")
  log_debug(" with:")
  ns.bnode_to_statements.each_key do |node|
    log_debug(" - identifier") {node.id}
    log_debug(" h1dq:")
    hash = log_depth(depth: 8) {ns.hash_first_degree_quads(node)}
    ns.add_bnode_hash(node, hash)
  end

  # Step 4: Create canonical replacements for hashes mapping to a single node
  log_debug("ca.4:")
  log_debug(" log point", "Create canonical replacements for hashes mapping to a single node (4.5.3 (4)).")
  log_debug(" with:") unless ns.hash_to_bnodes.empty?
  # Sorted key iteration keeps issued labels deterministic across runs.
  ns.hash_to_bnodes.keys.sort.each do |hash|
    identifier_list = ns.hash_to_bnodes[hash]
    # Shared hashes are handled in step 5.
    next if identifier_list.length > 1
    node = identifier_list.first
    id = ns.canonical_issuer.issue_identifier(node)
    log_debug(" - identifier") {node.id}
    log_debug(" hash", hash)
    log_debug(" canonical label", id)
    # Uniquely-hashed node is settled; remove so step 5 sees only shared hashes.
    ns.hash_to_bnodes.delete(hash)
  end

  # Step 5: Iterate over hashes having more than one node
  log_debug("ca.5:") unless ns.hash_to_bnodes.empty?
  log_debug(" log point", "Calculate hashes for identifiers with shared hashes (4.5.3 (5)).")
  log_debug(" with:") unless ns.hash_to_bnodes.empty?

  # Initialize the number of calls allowed to hash_n_degree_quads
  # as a multiple of the total number of blank nodes in the dataset.
  ns.max_calls = ns.bnode_to_statements.keys.length * @options.fetch(:max_calls, 40)

  ns.hash_to_bnodes.keys.sort.each do |hash|
    identifier_list = ns.hash_to_bnodes[hash]
    log_debug(" - hash", hash)
    log_debug(" identifier list") {identifier_list.map(&:id).to_json(indent: ' ')}
    hash_path_list = []

    # Create a hash_path_list for all bnodes using a temporary identifier used to create canonical replacements
    log_debug(" ca.5.2:")
    log_debug(" log point", "Calculate hashes for identifiers with shared hashes (4.5.3 (5.2)).")
    log_debug(" with:") unless identifier_list.empty?
    identifier_list.each do |identifier|
      # Skip nodes already canonically labeled (in step 4 or earlier in this loop).
      next if ns.canonical_issuer.issued.include?(identifier)
      temporary_issuer = IdentifierIssuer.new("b")
      temporary_issuer.issue_identifier(identifier)
      log_debug(" - identifier") {identifier.id}
      hash_path_list << log_depth(depth: 12) {ns.hash_n_degree_quads(identifier, temporary_issuer)}
    end

    # Create canonical replacements for nodes
    log_debug(" ca.5.3:") unless hash_path_list.empty?
    log_debug(" log point", "Canonical identifiers for temporary identifiers (4.5.3 (5.3)).")
    log_debug(" issuer:") unless hash_path_list.empty?
    # Sorting by the n-degree hash keeps canonical label assignment deterministic.
    hash_path_list.sort_by(&:first).each do |result, issuer|
      issuer.issued.each do |node|
        id = ns.canonical_issuer.issue_identifier(node)
        log_debug(" - blank node") {node.id}
        log_debug(" canonical identifier", id)
      end
    end
  end

  # Step 6: Yield statements using BNodes from canonical replacements
  if block_given?
    dataset.each_statement do |statement|
      if statement.has_blank_nodes?
        # Rebuild the quad, swapping each blank node for its canonical replacement.
        quad = statement.to_quad.compact.map do |term|
          term.node? ? RDF::Node.intern(ns.canonical_issuer.identifier(term)) : term
        end
        block.call RDF::Statement.from(quad)
      else
        block.call statement
      end
    end
  end
  log_debug("ca.6:")
  # NOTE(review): "4.4.3 (6)" below differs from the "4.5.3" used in every other
  # log point — possibly a typo in the spec reference; this is runtime log text,
  # so confirm against the RDFC-1.0 spec before changing it.
  log_debug(" log point", "Issued identifiers map (4.4.3 (6)).")
  log_debug(" issued identifiers map: #{ns.canonical_issuer.inspect}")
  dataset
end