class DocParser::Parser

The main parser class. This is the class you'll use to create your parser. The real work happens in the Document class. @see Document

Attributes

encoding[R]

@!visibility private

files[R]

@!visibility private

num_processes[R]

@!visibility private

outputs[R]

@!visibility private

Public Class Methods

new(files: [], quiet: false, encoding: 'utf-8', parallel: true, output: nil, range: nil, num_processes: Parallel.processor_count + 1) click to toggle source

Creates a new Parser instance

@param files [Array] An array containing URLs or paths to files
@param quiet [Boolean] Be quiet
@param encoding [String] The encoding to use for opening the files
@param parallel [Boolean] Use parallel processing
@param output [Output, Array] The output(s), defaults to a Screenoutput
@param range [Range] Range of files to process (nil means process all)
@param num_processes [Integer] Number of parallel processes

# File lib/docparser/parser.rb, line 38
# Sets up the parser state: the file list (optionally narrowed by +range+),
# the encoding, the process count, a STDERR logger and the outputs.
def initialize(files: [], quiet: false, encoding: 'utf-8', parallel: true,
               output: nil, range: nil,
               num_processes: Parallel.processor_count + 1)
  # A single process is used whenever parallel processing is switched off.
  @num_processes = 1
  @num_processes = num_processes if parallel

  # Only slice the file list when a range was actually given.
  @files = if range
             files[range]
           else
             files
           end
  @encoding = encoding

  # Quiet mode only lets errors through; otherwise informational messages
  # are logged as well.
  @logger = Logger.new(STDERR).tap do |log|
    log.level = quiet ? Logger::ERROR : Logger::INFO
  end

  initialize_outputs(output)

  @logger.info "DocParser v#{VERSION} loaded"
end

Public Instance Methods

parse!(&block) click to toggle source

Parses the `files`

Accepts a block which is executed for each document in the Document context where you can access the content using Nokogiri.

@see Document

# File lib/docparser/parser.rb, line 59
# Processes every entry of @files, then writes the collected rows to the
# outputs, logging progress and the total elapsed time.
#
# The block is forwarded unchanged to the processing helper:
# +parallel_process+ when more than one process is configured,
# +serial_process+ otherwise.
def parse!(&block)
  @logger.info "Parsing #{@files.length} files (encoding: #{@encoding})."
  # Use the monotonic clock for the duration: unlike Time.now it is not
  # affected by wall-clock adjustments, so the elapsed time cannot come out
  # negative or inflated.
  started_at = Process.clock_gettime(Process::CLOCK_MONOTONIC)

  if @num_processes > 1
    parallel_process(&block)
  else
    serial_process(&block)
  end

  @logger.info 'Processing finished'

  write_to_outputs

  elapsed = Process.clock_gettime(Process::CLOCK_MONOTONIC) - started_at
  @logger.info format('Done processing in %.2fs.', elapsed)
end

Private Instance Methods

initialize_outputs(output) click to toggle source
# File lib/docparser/parser.rb, line 78
# Normalises +output+ into the @outputs array and creates one empty Set per
# output in @resultsets.
#
# Accepts a single Output, an Array consisting solely of Outputs, or a falsy
# value (no outputs). Anything else raises ArgumentError.
def initialize_outputs(output)
  @outputs = []

  case output
  when Output
    @outputs.push(output)
  when Array
    all_valid = output.all? { |candidate| candidate.is_a?(Output) }
    raise ArgumentError, 'Invalid outputs specified' unless all_valid

    @outputs = output
  when nil, false
    # No output requested: keep the empty list.
  else
    raise ArgumentError, 'Invalid outputs specified'
  end

  # One independent result set per output, index-aligned with @outputs.
  @resultsets = @outputs.map { Set.new }
end
parallel_process(&block) click to toggle source
# File lib/docparser/parser.rb, line 91
# Runs +parse_doc+ for every file through Parallel.map and merges each
# returned list of sets into @resultsets by position.
#
# When RUBY_ENGINE is 'ruby' the work is requested with :in_processes,
# otherwise with :in_threads; either way @num_processes workers are asked for.
def parallel_process(&block)
  @logger.info "Starting #{@num_processes} processes"

  mode = if RUBY_ENGINE == 'ruby'
           :in_processes
         else
           :in_threads
         end

  per_file_results = Parallel.map(@files, mode => @num_processes) do |file|
    # :nocov: #
    parse_doc(file, &block)
    # :nocov: #
  end

  per_file_results.each do |sets|
    next unless @outputs

    sets.each_with_index { |rows, position| @resultsets[position].merge(rows) }
  end
end
parse_doc(file, &block) click to toggle source
# File lib/docparser/parser.rb, line 115
# Builds a Document for +file+ (with this parser's encoding and the parser
# itself) and returns the result of running its parse! with the given block.
def parse_doc(file, &block)
  Document
    .new(filename: file, encoding: @encoding, parser: self)
    .parse!(&block)
end
serial_process(&block) click to toggle source
# File lib/docparser/parser.rb, line 107
# Runs +parse_doc+ for each file in turn, in the current process, and merges
# every returned set into the @resultsets entry at the same position.
def serial_process(&block)
  @files.each do |file|
    sets = parse_doc(file, &block)
    sets.each_with_index do |rows, position|
      next unless @outputs

      @resultsets[position].merge(rows)
    end
  end
end
write_to_outputs() click to toggle source
# File lib/docparser/parser.rb, line 120
# Feeds every collected row to its output via add_row, then closes the
# output. Outputs and result sets are paired by index.
def write_to_outputs
  @logger.info 'Writing data..'

  @outputs.each_with_index do |sink, position|
    rows = @resultsets[position]
    rows.each { |row| sink.add_row(row) }

    # Drop the reference to the rows once they have been handed over.
    @resultsets[position] = nil
    sink.close
  end
end