class Traject::ExperimentalNokogiriStreamingReader

An EXPERIMENTAL, HALF-FINISHED implementation of a streaming/pull reader using Nokogiri. It is not ready for use, is not a stable API, and could go away.

This was my first try at a NokogiriReader implementation. It didn't work out, at least not without a lot more work. I think we'd need to redo it to build the Nokogiri::XML::Nodes by hand as the source is traversed, instead of relying on outer_xml – because outer_xml returns a string, every record ends up being parsed twice, with the expected 50% performance hit. Peccadilloes in Nokogiri's JRuby namespace handling don't help.

All in all, it's possible something could be gotten here with a lot more work; it's also possible that Nokogiri's antipathy to namespaces would keep getting in the way.

Attributes

clipboard[R]
input_stream[R]
path_tracker[R]
settings[R]

Public Class Methods

new(input_stream, settings) click to toggle source
# File lib/traject/experimental_nokogiri_streaming_reader.rb, line 17
# Builds a reader over +input_stream+, configured by +settings+.
#
# @param input_stream the XML source later handed to Nokogiri by #each
# @param settings settings hash; wrapped in a Traject::Indexer::Settings
# @raise [ArgumentError] if "nokogiri.namespaces" is not a Hash, or if
#   "nokogiri.each_record_xpath" is not one of the very simple xpaths
#   accepted by #validate_limited_xpath
def initialize(input_stream, settings)
  @settings = Traject::Indexer::Settings.new settings
  @input_stream = input_stream
  # Shared with the PathTracker below; Concurrent::Map on JRuby,
  # Concurrent::Hash elsewhere.
  @clipboard = Traject::Util.is_jruby? ? Concurrent::Map.new : Concurrent::Hash.new

  # Validate the configuration BEFORE anything consumes it, so a bad
  # setting is reported as a clear ArgumentError rather than surfacing
  # from inside PathTracker.
  default_namespaces # trigger validation
  validate_limited_xpath(each_record_xpath, key_name: "each_record_xpath")

  # Without an each_record_xpath there is nothing to track: #each parses
  # the whole document in one go.
  return unless each_record_xpath

  @path_tracker = PathTracker.new(each_record_xpath,
                                  clipboard: clipboard,
                                  namespaces: default_namespaces,
                                  extra_xpath_hooks: extra_xpath_hooks)
end

Public Instance Methods

default_namespaces() click to toggle source
# File lib/traject/experimental_nokogiri_streaming_reader.rb, line 77
# Namespace prefix => URI mapping taken from the "nokogiri.namespaces"
# setting, defaulting to an empty Hash. The value is memoized once it has
# passed validation.
#
# @return [Hash]
# @raise [ArgumentError] if the configured value is not a Hash
def default_namespaces
  return @default_namespaces if @default_namespaces

  configured = settings["nokogiri.namespaces"] || {}
  unless configured.kind_of?(Hash)
    raise ArgumentError, "nokogiri.namespaces must be a hash, not: #{configured.inspect}"
  end

  @default_namespaces = configured
end
each() { |parse| ... } click to toggle source
# File lib/traject/experimental_nokogiri_streaming_reader.rb, line 85
# Yields parsed XML to the block.
#
# With no each_record_xpath configured, the whole input is parsed with
# Nokogiri::XML.parse and yielded exactly once. Otherwise the input is
# walked with a Nokogiri::XML::Reader; every element is pushed onto the
# path tracker, and whenever the tracker reports a match its
# current_node_doc is yielded.
#
# @yield [doc] the whole parsed document, or path_tracker.current_node_doc
#   for each matching element
# @return [Enumerator] when called without a block
def each
  # Without a block the `yield`s below would raise LocalJumpError; hand
  # back an Enumerator instead, as Ruby's own #each methods do.
  return enum_for(:each) unless block_given?

  unless each_record_xpath
    # forget streaming, just read it and return it once, done.
    yield Nokogiri::XML.parse(input_stream)
    return
  end

  reader = Nokogiri::XML::Reader(input_stream)

  reader.each do |reader_node|
    case reader_node.node_type
    when Nokogiri::XML::Reader::TYPE_ELEMENT
      path_tracker.push(reader_node)

      yield path_tracker.current_node_doc if path_tracker.match?
      path_tracker.run_extra_xpath_hooks

      # Self-closing elements are popped right away; presumably the reader
      # emits no separate end-element event for them -- confirm against
      # the Nokogiri::XML::Reader docs.
      path_tracker.pop if reader_node.self_closing?
    when Nokogiri::XML::Reader::TYPE_END_ELEMENT
      path_tracker.pop
    end
  end
end
each_record_xpath() click to toggle source
# File lib/traject/experimental_nokogiri_streaming_reader.rb, line 34
# The "nokogiri.each_record_xpath" setting, memoized once a truthy value
# has been read.
#
# @return the configured xpath, or nil when the setting is absent
def each_record_xpath
  return @each_record_xpath if @each_record_xpath

  @each_record_xpath = settings["nokogiri.each_record_xpath"]
end
extra_xpath_hooks() click to toggle source
# File lib/traject/experimental_nokogiri_streaming_reader.rb, line 38
# The "nokogiri_reader.extra_xpath_hooks" setting (an empty Hash when
# unset). Every key is passed through #validate_limited_xpath before the
# collection is memoized and returned.
#
# @return the configured hooks collection, keyed by limited xpath
def extra_xpath_hooks
  return @extra_xpath_hooks if @extra_xpath_hooks

  hooks = settings["nokogiri_reader.extra_xpath_hooks"] || {}
  hooks.each_pair do |limited_xpath, _callable|
    validate_limited_xpath(limited_xpath, key_name: "nokogiri_reader.extra_xpath_hooks")
  end

  @extra_xpath_hooks = hooks
end

Protected Instance Methods

validate_limited_xpath(each_record_xpath, key_name:) click to toggle source
# File lib/traject/experimental_nokogiri_streaming_reader.rb, line 48
          def validate_limited_xpath(each_record_xpath, key_name:)
  # Checks that an xpath setting uses only the very simple subset this
  # reader understands: '/'-separated element names, each optionally
  # carrying a namespace prefix registered in default_namespaces.
  #
  # each_record_xpath - the xpath string to check; nil is accepted and
  #                     skipped
  # key_name          - name of the setting being validated, used to
  #                     label error messages
  #
  # Returns the xpath unchanged (nil when given nil).
  # Raises ArgumentError for axes ('::'), brackets, or an unregistered
  # namespace prefix.
  return unless each_record_xpath

  each_record_xpath.split('/').each do |component|
    # We don't support brackets or any xpath beyond the MOST simple.
    # Catch a few we can catch. This has to look at the whole component
    # BEFORE it is split on ':' -- splitting first swallows the '::' of an
    # axis and hides brackets that sit in the prefix part.
    if component =~ /::|[\[\]]/
      raise ArgumentError, "#{key_name}: Only very simple xpaths supported. '//some/path' or '/some/path'. Not: #{each_record_xpath.inspect}"
    end

    prefix, local_name = component.split(':')
    # A single part means there was no namespace prefix to look up.
    next unless local_name

    if default_namespaces[prefix].nil?
      # Label the error with key_name so it points at the setting that was
      # actually being validated.
      raise ArgumentError, "#{key_name}: Can't find namespace prefix '#{prefix}' in '#{each_record_xpath}'. To use a namespace in #{key_name}, it has to be registered with nokogiri.namespaces: #{default_namespaces.inspect}"
    end
  end

  each_record_xpath
end