module EBNF::LL1::Parser

A Generic LL1 parser using a lexer and branch tables defined using the SWAP tool chain (modified).

# Creating terminal definitions and parser rules to parse generated grammars

The parser is initialized to callbacks invoked on entry and exit
to each `terminal` and `production`. A trivial parser loop can be described as follows:

    require 'ebnf/ll1/parser'
    require 'meta'

    class Parser
      include Meta
      include EBNF::LL1::Parser

      terminal(:SYMBOL, /([a-z]|[A-Z]|[0-9]|_)+/) do |prod, token, input|
        # Add data based on scanned token to input
        input[:symbol] = token.value
      end

      start_production(:rule) do |input, current, callback|
        # Process on start of production
        # Set state for entry into recursed rules through current

        # Callback to parser loop with callback
      end

      production(:rule) do |input, current, callback|
        # Process on end of production
        # return results in input, retrieve results from recursed rules in current

        # Callback to parser loop with callback
      end

      def initialize(input)
        parse(input, start_symbol,
          branch: BRANCH,
          first: FIRST,
          follow: FOLLOW,
          cleanup: CLEANUP
        ) do |context, *data|
          # Process calls from callback from productions

        rescue ArgumentError, RDF::LL1::Parser::Error => e
          progress("Parsing completed with errors:\n\t#{e.message}")
          raise RDF::ReaderError, e.message if validate?
        end

Attributes

lineno[R]

@return [Integer] line number of current token

Public Class Methods

included(base) click to toggle source
# File lib/ebnf/ll1/parser.rb, line 56
def self.included(base)
  base.extend(ClassMethods)
end

Public Instance Methods

add_prod_data(sym, *values) click to toggle source

Add values to production data, values aranged as an array

# File lib/ebnf/ll1/parser.rb, line 491
def add_prod_data(sym, *values)
  return if values.compact.empty?

  prod_data[sym] ||= []
  prod_data[sym] += values
  debug("add_prod_data(#{sym})") {"#{prod_data[sym].inspect} += #{values.inspect}"}
end
add_prod_datum(sym, values) click to toggle source

Add a single value to prod_data, allows for values to be an array

# File lib/ebnf/ll1/parser.rb, line 475
def add_prod_datum(sym, values)
  case values
  when Array
    prod_data[sym] ||= []
    debug("add_prod_datum(#{sym})") {"#{prod_data[sym].inspect} += #{values.inspect}"}
    prod_data[sym] += values
  when nil
    return
  else
    prod_data[sym] ||= []
    debug("add_prod_datum(#{sym})") {"#{prod_data[sym].inspect} << #{values.inspect}"}
    prod_data[sym] << values
  end
end
depth() click to toggle source
# File lib/ebnf/ll1/parser.rb, line 469
def depth; (@productions || []).length; end
parse(input = nil, start = nil, **options, &block) click to toggle source

Initializes a new parser instance.

Attempts to recover from errors.

@example

require 'rdf/ll1/parser'

class MyParser
  include EBNF::LL1::Parser

  branch      MyParser::BRANCH

  ##
  # Defines a production called during before parsing a non-terminal
  # with data from previous production along with data defined for the
  # current production
  #
  start_production :object do |input, current, callback|
    # Note production as triples for blankNodePropertyList
    # to set :subject instead of :resource
    current[:triples] = true
  end

  ##
  # Defines a production called during after parsing a non-terminal
  # with data from previous production along with data defined for the
  # current production
  #
  # callback to processor block
  production :object do |input, current, callback|
    object = current[:resource]
    callback.call :statement, RDF::Statement.new(input[:subject], input[:predicate], object)
  end

  ##
  # Defines the pattern for a terminal node
  terminal :BLANK_NODE_LABEL, %r(_:(#{PN_LOCAL})) do |production, token, input|
    input[:BLANK_NODE_LABEL] = RDF::Node.new(token)
  end

  ##
  # Iterates the given block for each RDF statement in the input.
  #
  # @yield  [statement]
  # @yieldparam [RDF::Statement] statement
  # @return [void]
  def each_statement(&block)
    @callback = block

    parse(input, START.to_sym) do |context, *data|
      case context
      when :statement
        yield *data
      end
    end
  end

end

@param [String, to_s] input @param [Symbol, to_s] start

The starting production for the parser. It may be a URI from the grammar, or a symbol representing the local_name portion of the grammar URI.

@param [Hash{Symbol => Object}] options @option options [Hash{Symbol,String => Hash{Symbol,String => Array<Symbol,String>}}] :branch LL1 branch table. @option options [HHash{Symbol,String => Array<Symbol,String>}] :first ({})

Lists valid terminals that can precede each production (for error recovery).

@option options [Hash{Symbol,String => Array<Symbol,String>}] :follow ({})

Lists valid terminals that can follow each production (for error recovery).

@option options :high_water passed to lexer @option options [Logger] :logger for errors/progress/debug. @option options :low_water passed to lexer @option options [Boolean] :reset_on_start

Reset the parser state if the start token set with `prod` is found in a production. This reduces the production stack depth growth, which is appropriate for some grammars.

@option options [Boolean] :validate (false)

whether to validate the parsed statements and values. If not validating, the parser will attempt to recover from errors.

@yield [context, *data]

Yields for to return data to parser

@yieldparam [:statement, :trace] context

Context for block

@yieldparam [Symbol] *data

Data specific to the call

@return [EBNF::LL1::Parser] @raise [Exception] Raises exceptions for parsing errors

or errors raised during processing callbacks. Internal
errors are raised using {Error}.

@see cs.adelaide.edu.au/~charles/lt/Lectures/07-ErrorRecovery.pdf

# File lib/ebnf/ll1/parser.rb, line 266
def parse(input = nil, start = nil, **options, &block)
  @options = options.dup
  @branch  = options[:branch]
  @first   = options[:first] ||= {}
  @follow  = options[:follow] ||= {}
  @cleanup = options[:cleanup] ||= {}
  @lexer   = input.is_a?(Lexer) ? input : Lexer.new(input, self.class.patterns, **@options)
  @productions = []
  @parse_callback = block
  @recovering = false
  @error_log = []
  terminals = self.class.patterns.map(&:type)  # Get defined terminals to help with branching

  # Unrecoverable errors
  raise Error, "Branch table not defined" unless @branch && @branch.length > 0
  raise Error, "Starting production not defined" unless start

  @prod_data = [{}]
  start = start.split('#').last.to_sym unless start.is_a?(Symbol)
  todo_stack = [{prod: start, terms: nil}]

  while !todo_stack.empty?
    begin
      @recovering = false
      pushed = false
      if todo_stack.last[:terms].nil?
        todo_stack.last[:terms] = []
        cur_prod = todo_stack.last[:prod]

        # If cur_prod is the starting production, we can reset the stack
        # to the beginning to avoid excessive growth in the production
        # stack
        if options[:reset_on_start] && cur_prod == start
          todo_stack = [{prod: start, terms: []}]
          @productions = []
          @prod_data = [{}]
        end

        # Fetch the current token
        token = get_token(:recover)

        # At this point, token is either nil, in the first set of the production,
        # or in the follow set of this production or any previous production
        debug("parse(production)") do
          "token #{token ? token.representation.inspect : 'nil'}, " +
          "prod #{cur_prod.inspect}, " +
          "depth #{depth}"
        end

        # Got an opened production
        onStart(cur_prod)

        if token.nil?
          if  !(first_include?(cur_prod, :_eps) && follow_include?(cur_prod, :_eof))
            # End of file, and production does not contain eps, or it does, but follow does not contain eof
            error("parse(production)", "Unexpected end of input", lineno: lineno, production: cur_prod, raise: true)
          else
            debug("parse(production)") {"End of input prod #{cur_prod.inspect}"}
          end
        elsif prod_branch = @branch[cur_prod]
          sequence = prod_branch.fetch(token.representation) do
            error("parse(production)", "Expected one of #{@first[cur_prod].inspect}", token: token, production: cur_prod, raise: true)
          end
          debug("parse(production)") do
            "token #{token.representation.inspect} " +
            "prod #{cur_prod.inspect}, " +
            "prod_branch #{prod_branch.keys.inspect}, " +
            "sequence #{sequence.inspect}"
          end
          todo_stack.last[:terms] += sequence
        else
          error("parse(production)", "Unexpected", token: token, production: cur_prod, raise: true)
        end
      end

      debug("parse(terms)") {"todo #{todo_stack.last.inspect}, depth #{depth}"}
      while !todo_stack.last[:terms].to_a.empty?
        # Get the next term in this sequence
        term = todo_stack.last[:terms].shift
        debug("parse(token)") {"accept #{term.inspect}"}

        if token = accept(term)
          debug("parse(token)") {"token #{token.inspect}, term #{term.inspect}"}
          onTerminal(term, token)
        elsif terminals.include?(term) 
          # If term is a terminal, then it is an error if token does not
          # match it
          error("parse(token)", "Expected #{term.inspect}", token: get_token, production: cur_prod, raise: true)
        else
          token = get_token

          # If token is not in firsts of term, but eps is, skip to next
          # term
          if first_include?(term, :_eps) && !first_include?(term, token)
            debug("parse(token)") {"skip optional term #{term.inspect} on #{token.inspect}"}
            break
          else
            # Push term onto stack
            todo_stack << {prod: term, terms: nil}
            debug("parse(push)") {"term #{term.inspect}, depth #{depth}"}
            pushed = true
            break
          end
        end
      end
    rescue Lexer::Error, Error => e
      # Lexer encountered an illegal token or the parser encountered
      # a terminal which is inappropriate for the current production.
      # Perform error recovery to find a reasonable terminal based
      # on the follow sets of the relevant productions. This includes
      # remaining terms from the current production and the stacked
      # productions
      @lineno = e.lineno
      if e.is_a?(Lexer::Error)
        # Skip to the next valid terminal
        @lexer.recover
        error("parse(#{e.class})", "With input '#{e.input}': #{e.message}",
              production: @productions.last, token: e.token)
      else
        # Otherwise, the terminal is fine, just not for this production.
        @lexer.shift
        error("parse(#{e.class})", "#{e.message}",
              production: @productions.last, token: e.token)
      end

      # Get the list of follows for this sequence, this production and the stacked productions.
      debug("recovery", "stack follows:")
      todo_stack.reverse.each do |todo|
        debug("recovery") {"  #{todo[:prod]}: #{@follow[todo[:prod]].inspect}"}
      end

      # Find all follows to the top of the stack
      follows = todo_stack.inject([]) do |follow, todo|
        prod = todo[:prod]
        follow += @follow[prod] || []
      end.uniq
      debug("recovery") {"follows: #{follows.inspect}"}

      # Skip tokens until one is found in follows
      while (token = get_token(:recover)) && follows.none? {|t| token === t}
        skipped = @lexer.shift
        progress("recovery") {"skip #{skipped.inspect}"}
      end
      debug("recovery") {"found #{token.inspect} in follows"}

      # Pop stack elements until token is in follows
      while !todo_stack.empty? &&
            !follow_include?(todo_stack.last[:prod], token || :_eof)
        debug("recovery(pop)") {"todo #{todo_stack.last.inspect}, depth #{depth}"}
        todo_stack.pop
        onFinish
      end

      # Token is now in the first of the top production
      unless todo_stack.empty?
        todo_stack.pop
        onFinish
      end

      if todo_stack.empty?
        # Recovered to end of last production
        warn("recover", "recovered to end of productions")
      else
        warn("recover", "recovered to #{todo_stack.last[:prod].inspect} with #{token.inspect}")
      end

      @recovering = false
    ensure
      # After completing the last production in a sequence, pop down until we find a production
      #
      # If in recovery mode, continue popping until we find a term with a follow list
      while !pushed &&
            !todo_stack.empty? &&
            todo_stack.last.fetch(:terms, []).empty?
        debug("parse(pop)") {"todo #{todo_stack.last.inspect}, depth #{depth}"}
        todo_stack.pop
        onFinish
      end
    end
  end

  error("parse(eof)", "Finished processing before end of file", token: @lexer.first) if @lexer.first

  # Continue popping contexts off of the stack
  while !todo_stack.empty?
    debug("parse(eof)") {"stack #{todo_stack.last.inspect}, depth #{depth}"}
    # There can't be anything left to do, or if there is, it must be optional
    last_terms = todo_stack.last[:terms]
    if last_terms.length > 0 && last_terms.none? {|t|first_include?(t, :_eps)}
      error("parse(eof)",
        "End of input before end of production: stack #{todo_stack.last.inspect}, depth #{depth}"
      )
    end
    todo_stack.pop
    onFinish
  end

  # When all is said and done, raise the error log
  unless @error_log.empty?
    raise Error, @error_log.join("\n")
  end
end
prod_data() click to toggle source

Current ProdData element

# File lib/ebnf/ll1/parser.rb, line 472
def prod_data; @prod_data.last; end

Protected Instance Methods

debug(*args) { || ... } click to toggle source

Debug logging.

The call is ignored, unless ‘@options` is set.

@overload debug(node, message, **options)

@param [Array<String>] args Relevant location associated with message
@param [Hash] options
@option options [Integer] :depth
  Recursion depth for indenting output
@yieldreturn [String] additional string appended to `message`.
# File lib/ebnf/ll1/parser.rb, line 579
def debug(*args, &block)
  return unless @options[:logger]
  options = args.last.is_a?(Hash) ? args.pop : {}
  lineno = @lineno || (options[:token].lineno if options[:token].respond_to?(:lineno))
  level = options.fetch(:level, 0)
  depth = options[:depth] || self.depth

  if self.respond_to?(:log_debug)
    level = [:debug, :info, :warn, :error, :fatal][level]
    log_debug(*args, **options.merge(level: level, lineno: lineno, depth: depth), &block)
  elsif @options[:logger].respond_to?(:add)
    args << yield if block_given?
    @options[:logger].add(level, "[#{lineno}]" + (" " * depth) + args.join(" "))
  elsif @options[:logger].respond_to?(:<<)
    args << yield if block_given?
    @options[:logger] << "[#{lineno}]" + (" " * depth) + args.join(" ")
  end
end
error(node, message, **options) click to toggle source

Error information, used as level ‘3` logger messages. Messages may be logged and are saved for reporting at end of parsing.

@param [String] node Relevant location associated with message @param [String] message Error string @param [Hash{Symbol => Object}] options @option options [URI, to_s] :production @option options [Token] :token @see debug

# File lib/ebnf/ll1/parser.rb, line 511
def error(node, message, **options)
  lineno = @lineno || (options[:token].lineno if options[:token].respond_to?(:lineno))
  m = "ERROR "
  m += "[line: #{lineno}] " if lineno
  m += message
  m += " (found #{options[:token].inspect})" if options[:token]
  m += ", production = #{options[:production].inspect}" if options[:production]
  @error_log << m unless @recovering
  @recovering = true
  debug(node, m, level: options.fetch(:level, 3), **options)
  if options[:raise] || @options[:validate]
    raise Error.new(m, lineno: lineno, token: options[:token], production: options[:production])
  end
end
progress(node, *args, &block) click to toggle source

Progress logged when parsing. Passed as level ‘1` logger messages.

The call is ignored, unless ‘@options` is set.

@overload progress(node, message, **options, &block)

@param [String] node Relevant location associated with message
@param [String] message ("")
@param [Hash] options
@option options [Integer] :depth
    Recursion depth for indenting output

@see debug

# File lib/ebnf/ll1/parser.rb, line 559
def progress(node, *args, &block)
  return unless @options[:logger]
  lineno = @lineno || (options[:token].lineno if options[:token].respond_to?(:lineno))
  args << {} unless args.last.is_a?(Hash)
  args.last[:level] ||= 1
  args.last[:lineno] ||= lineno
  debug(node, *args, &block)
end
warn(node, message, **options) click to toggle source

Warning information, used as level ‘2` logger messages. Messages may be logged and are saved for reporting at end of parsing.

@param [String] node Relevant location associated with message @param [String] message Error string @param [Hash] options @option options [URI, to_s] :production @option options [Token] :token @see debug

# File lib/ebnf/ll1/parser.rb, line 536
def warn(node, message, **options)
  lineno = @lineno || (options[:token].lineno if options[:token].respond_to?(:lineno))
  m = "WARNING "
  m += "[line: #{lineno}] " if lineno
  m += message
  m += " (found #{options[:token].inspect})" if options[:token]
  m += ", production = #{options[:production].inspect}" if options[:production]
  @error_log << m unless @recovering
  debug(node, m, level: 2, lineno: lineno, **options)
end

Private Instance Methods

accept(type_or_value) click to toggle source

Accept the first token in the input stream if it matches ‘type_or_value`. Raise Error, otherwise.

@param [Symbol, String] type_or_value @return [Token] @raise [Error, Lexer::Error]

# File lib/ebnf/ll1/parser.rb, line 750
def accept(type_or_value)
  if (token = get_token) && token === type_or_value
    debug("accept") {"#{token.inspect} === #{type_or_value.inspect}"}
    @lexer.shift
  end
end
first_include?(production, token) click to toggle source

Does first include the specified token

@param [Symbol] production @param [Symbol, Lexer::Token] token

A terminal, or symbol or string

@return [Boolean]

# File lib/ebnf/ll1/parser.rb, line 698
def first_include?(production, token)
  if token.is_a?(Lexer::Token)
    @first.fetch(production, []).any? {|t| token === t}
  else
    @first.fetch(production, []).include?(token)
  end
end
follow_include?(production, token) click to toggle source

Does follow include the specified terminal

@param [Symbol] production @param [Symbol, Lexer::Token] token

A terminal, or symbol or string

@return [Boolean]

# File lib/ebnf/ll1/parser.rb, line 713
def follow_include?(production, token)
  if token.is_a?(Lexer::Token)
    @follow.fetch(production, []).any? {|t| token === t}
  else
    @follow.fetch(production, []).include?(token)
  end
end
get_token(recover = nil) click to toggle source

Return the next token, raising an error if the token is invalid

@param [:recover] recover

Recover from errors and go until next valid token or end of file

@return [Token] @raise [Lexer::Error]

# File lib/ebnf/ll1/parser.rb, line 728
def get_token(recover = nil)
  token = @lexer.first
  #progress("token") {token.inspect}
  @lineno = token.lineno if token
  token
rescue Lexer::Error => e
  if recover
    # Recover from lexer error so that we can not bail out too early
    @lexer.recover
    error("get_token", "With input '#{e.input}': #{e.message}}", token: e.token, lineno: e.lineno)
    retry
  end
  raise
end
onFinish() click to toggle source

Finish of production

# File lib/ebnf/ll1/parser.rb, line 631
def onFinish
  #puts "prod_data(f): " + @prod_data.inspect
  prod = @productions.last
  handler = self.class.production_handlers[prod]
  if handler && !@recovering
    # Pop production data element from stack, potentially allowing handler to use it
    data = @prod_data.pop
    begin
      self.class.eval_with_binding(self) {
        handler.call(@prod_data.last, data, @parse_callback)
      }
    rescue ArgumentError, Error => e
      error("finish", "#{e.class}: #{e.message}", production: prod)
      @recovering = false
    end
    progress("#{prod}(:finish):#{@prod_data.length}") {@prod_data.last}
  elsif [:merge, :star].include?(@cleanup[prod])
    data = @prod_data.pop
    input = @prod_data.last

    # Append every element in data to last prod_data
    data.each do |k, v|
      input[k] = case input[k]
      when nil then v.is_a?(Hash) ? v : Array(v)
      when Hash then input[k].merge!(v)
      else Array(input[k]) + Array(v)
      end
    end
    debug("#{prod}(:finish):#{@prod_data.length} cleanup:#{@cleanup[prod]}") {@prod_data.last}
  else
    progress("#{prod}(:finish):#{@prod_data.length}") { "recovering" if @recovering }
  end
  @productions.pop
end
onStart(prod) click to toggle source

Start for production

# File lib/ebnf/ll1/parser.rb, line 600
def onStart(prod)
  handler = self.class.start_handlers[prod]
  @productions << prod
  if handler
    # Create a new production data element, potentially allowing handler
    # to customize before pushing on the @prod_data stack
    debug("#{prod}(:start):#{@prod_data.length}") {@prod_data.last}
    data = {}
    begin
      self.class.eval_with_binding(self) {
        handler.call(@prod_data.last, data, @parse_callback)
      }
    rescue ArgumentError, Error => e
      error("start", "#{e.class}: #{e.message}", production: prod)
      @recovering = false
    end
    @prod_data << data
  elsif [:merge, :star].include?(@cleanup[prod])
    # Save current data to merge later
    @prod_data << {}
    debug("#{prod}(:start}:#{@prod_data.length}:cleanup:#{@cleanup[prod]}") { get_token.inspect + (@recovering ? ' recovering' : '')}
  else
    # Make sure we push as many was we pop, even if there is no
    # explicit start handler
    @prod_data << {} if self.class.production_handlers[prod]
    debug("#{prod}(:start:#{@prod_data.length})") { get_token.inspect + (@recovering ? ' recovering' : '')}
  end
  #puts "prod_data(s): " + @prod_data.inspect
end
onTerminal(prod, token) click to toggle source

A terminal

# File lib/ebnf/ll1/parser.rb, line 667
def onTerminal(prod, token)
  unless @productions.empty?
    parentProd = @productions.last
    handler = self.class.terminal_handlers[prod]
    # Allows catch-all for simple string terminals
    handler ||= self.class.terminal_handlers[nil] if prod.is_a?(String)
    if handler
      begin
        self.class.eval_with_binding(self) {
          handler.call(parentProd, token, @prod_data.last, @parse_callback)
        }
      rescue ArgumentError, Error => e
        error("terminal", "#{e.class}: #{e.message}", token: token, production: prod)
        @recovering = false
      end
      progress("#{prod}(:terminal)", "", depth: (depth + 1)) {"#{token}: #{@prod_data.last}"}
    else
      progress("#{prod}(:terminal)", "", depth: (depth + 1)) {token.to_s}
    end
  else
    error("#{parentProd}(:terminal)", "Terminal has no parent production", token: token, production: prod)
  end
end