class Traject::ExperimentalNokogiriStreamingReader
An EXPERIMENTAL HALF-FINISHED implementation of a streaming/pull reader using Nokogiri. Not ready for use, not stable API, could go away.
This was my first try at a NokogiriReader
implementation; it didn't work out, at least not without a lot more work. I think we'd need to redo it to build the Nokogiri::XML::Nodes by hand as the source is traversed, instead of relying on outer_xml – because outer_xml returns a string, the content ends up being parsed twice, with the expected 50% performance hit. Peccadilloes in Nokogiri's JRuby namespace handling don't help.
All in all, it's possible something could be gotten here with a lot more work, but it's also possible Nokogiri's antipathy to namespaces could keep getting in the way.
Attributes
clipboard[R]
input_stream[R]
path_tracker[R]
settings[R]
Public Class Methods
new(input_stream, settings)
click to toggle source
# File lib/traject/experimental_nokogiri_streaming_reader.rb, line 17
# Builds a reader over +input_stream+, configured by +settings+.
#
# @param input_stream [Object] source handed to Nokogiri when #each runs
# @param settings [Object] raw settings, wrapped in Traject::Indexer::Settings
# @raise [ArgumentError] if the configured namespaces or each_record_xpath
#   fail validation
def initialize(input_stream, settings)
  @settings     = Traject::Indexer::Settings.new(settings)
  @input_stream = input_stream
  @clipboard    = if Traject::Util.is_jruby?
                    Concurrent::Map.new
                  else
                    Concurrent::Hash.new
                  end

  # A path tracker is only created when a per-record xpath is configured.
  if each_record_xpath
    @path_tracker = PathTracker.new(
      each_record_xpath,
      clipboard: clipboard,
      namespaces: default_namespaces,
      extra_xpath_hooks: extra_xpath_hooks
    )
  end

  # Called for its side effect only: it raises if nokogiri.namespaces is
  # not a hash.
  default_namespaces
  validate_limited_xpath(each_record_xpath, key_name: "each_record_xpath")
end
Public Instance Methods
default_namespaces()
click to toggle source
# File lib/traject/experimental_nokogiri_streaming_reader.rb, line 77
# Namespace prefix => URI mapping taken from the "nokogiri.namespaces"
# setting, or an empty hash when the setting is absent. Memoized.
#
# @return [Hash]
# @raise [ArgumentError] if the setting is present but is not a Hash
def default_namespaces
  return @default_namespaces if @default_namespaces

  namespaces = settings["nokogiri.namespaces"] || {}
  unless namespaces.is_a?(Hash)
    raise ArgumentError, "nokogiri.namespaces must be a hash, not: #{namespaces.inspect}"
  end

  @default_namespaces = namespaces
end
each() { |parse| ... }
click to toggle source
# File lib/traject/experimental_nokogiri_streaming_reader.rb, line 85
# Yields parsed XML to the caller.
#
# Without an each_record_xpath the whole input is parsed and yielded once.
# Otherwise the input is walked with Nokogiri::XML::Reader, and
# path_tracker.current_node_doc is yielded for every element the path
# tracker reports as a match.
#
# @yield [doc] the whole parsed document, or the path tracker's document
#   for a matching element
def each
  if each_record_xpath
    Nokogiri::XML::Reader(input_stream).each do |node|
      if node.node_type == Nokogiri::XML::Reader::TYPE_ELEMENT
        path_tracker.push(node)
        yield path_tracker.current_node_doc if path_tracker.match?
        path_tracker.run_extra_xpath_hooks

        # A self-closing element is popped here, right after it was pushed.
        path_tracker.pop if node.self_closing?
      end

      path_tracker.pop if node.node_type == Nokogiri::XML::Reader::TYPE_END_ELEMENT
    end
  else
    # Forget streaming: read everything, hand it over once, done.
    yield Nokogiri::XML.parse(input_stream)
    nil
  end
end
each_record_xpath()
click to toggle source
# File lib/traject/experimental_nokogiri_streaming_reader.rb, line 34
# The "nokogiri.each_record_xpath" setting, memoized once it is truthy.
#
# @return [Object, nil] the configured value, or nil when not set
def each_record_xpath
  return @each_record_xpath if @each_record_xpath

  @each_record_xpath = settings["nokogiri.each_record_xpath"]
end
extra_xpath_hooks()
click to toggle source
# File lib/traject/experimental_nokogiri_streaming_reader.rb, line 38
# The "nokogiri_reader.extra_xpath_hooks" setting (an empty hash when
# absent), returned after every key has passed validate_limited_xpath.
# Memoized only once validation has succeeded.
#
# @return [Hash] limited xpath => callable
# @raise [ArgumentError] if any key is not an acceptable limited xpath
def extra_xpath_hooks
  return @extra_xpath_hooks if @extra_xpath_hooks

  hooks = settings["nokogiri_reader.extra_xpath_hooks"] || {}
  hooks.each_pair do |xpath, _callable|
    validate_limited_xpath(xpath, key_name: "nokogiri_reader.extra_xpath_hooks")
  end

  @extra_xpath_hooks = hooks
end
Protected Instance Methods
validate_limited_xpath(each_record_xpath, key_name:)
click to toggle source
# File lib/traject/experimental_nokogiri_streaming_reader.rb, line 48
# Checks that an xpath stays within the very limited subset accepted here:
# '/'-separated element names, each optionally carrying a namespace prefix
# that is registered in default_namespaces.
#
# @param each_record_xpath [String, nil] xpath to check; a nil/false value
#   skips validation entirely
# @param key_name [String] name of the setting the xpath came from, used to
#   label error messages
# @return [String, nil] the xpath unchanged, or nil when none was given
# @raise [ArgumentError] if a step contains an axis ('::') or brackets, or
#   uses a namespace prefix missing from default_namespaces
def validate_limited_xpath(each_record_xpath, key_name:)
  return unless each_record_xpath

  each_record_xpath.split('/').each do |component|
    # We don't support brackets or any xpath beyond the MOST simple.
    # Test the whole step rather than the part after ':': splitting on ':'
    # consumes the colons of an axis such as 'child::foo', so a check on
    # the local name alone can never see '::'.
    if component =~ /::|[\[\]]/
      raise ArgumentError, "#{key_name}: Only very simple xpaths supported. '//some/path' or '/some/path'. Not: #{each_record_xpath.inspect}"
    end

    prefix, local_name = component.split(':')
    # A step without ':' (including the empty steps produced by a leading
    # '/' or '//') has no namespace prefix to look up.
    next unless local_name

    if default_namespaces[prefix].nil?
      # Label the error with the setting actually being validated; this
      # method is also used for keys other than each_record_xpath.
      raise ArgumentError, "#{key_name}: Can't find namespace prefix '#{prefix}' in '#{each_record_xpath}'. To use a namespace in #{key_name}, it has to be registered with nokogiri.namespaces: #{default_namespaces.inspect}"
    end
  end

  each_record_xpath
end