class EBNF::LL1::Lexer

A lexical analyzer

@example Tokenizing a Turtle string

terminals = [
  [:BLANK_NODE_LABEL, %r(_:(#{PN_LOCAL}))],
  ...
]
ttl = "@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> ."
lexer = EBNF::LL1::Lexer.tokenize(ttl, terminals)
lexer.each_token do |token|
  puts token.inspect
end

@example Tokenizing and returning a token stream

lexer = EBNF::LL1::Lexer.tokenize(...)
while :some-condition
  token = lexer.first # Get the current token
  token = lexer.shift # Get the current token and shift to the next
end

@example Handling error conditions

begin
  EBNF::LL1::Lexer.tokenize(query)
rescue EBNF::LL1::Lexer::Error => error
  warn error.inspect
end

@see en.wikipedia.org/wiki/Lexical_analysis

Attributes

input[RW]

The current input string being processed.

@return [String]

options[R]

Any additional options for the lexer.

@return [Hash]

scanner[R]

@return [StringScanner]

whitespace[R]

@return [Regexp] defines whitespace, including comments, otherwise whitespace must be explicit in terminals

Public Class Methods

new(input = nil, terminals = nil, **options) click to toggle source

Initializes a new lexer instance.

@param [String, #to_s] input @param [Array<Array<Symbol, Regexp>, Terminal>] terminals

Array of symbol, regexp pairs used to match terminals.
If the symbol is nil, it defines a Regexp to match string terminals.

@param [Hash{Symbol => Object}] options @option options [Regexp] :whitespace

Whitespace between tokens, including comments

@option options :high_water passed to scanner @option options :low_water passed to scanner

# File lib/ebnf/ll1/lexer.rb, line 94
# Initializes a new lexer instance.
#
# @param [String, #to_s] input
# @param [Array<Array<Symbol, Regexp>, Terminal>] terminals
#   Array of symbol, regexp pairs used to match terminals.
#   If the symbol is nil, it defines a Regexp to match string terminals.
# @param [Hash{Symbol => Object}] options
# @option options [Regexp] :whitespace
#   Whitespace between tokens, including comments
# @option options :high_water passed to scanner
# @option options :low_water passed to scanner
# @raise [Error] if no terminal patterns are supplied
def initialize(input = nil, terminals = nil, **options)
  @options        = options.dup
  @whitespace     = @options[:whitespace]

  # Normalize each terminal specification into a Terminal instance.
  # Guard against a nil `terminals` argument so the documented Error below
  # is raised rather than a NoMethodError from calling #map on nil.
  @terminals      = (terminals || []).map do |term|
    if term.is_a?(Array) && term.length == 3
      # Last element is an options hash for the Terminal
      Terminal.new(term[0], term[1], **term[2])
    elsif term.is_a?(Array)
      Terminal.new(*term)
    else
      # Already a Terminal (or Terminal-like) instance; use as-is
      term
    end
  end

  raise Error, "Terminal patterns not defined" unless @terminals && @terminals.length > 0

  @scanner = Scanner.new(input, **options)
end
tokenize(input, terminals, **options, &block) click to toggle source

Tokenizes the given `input` string or stream.

@param [String, #to_s] input @param [Array<Array<Symbol, Regexp>>] terminals

Array of symbol, regexp pairs used to match terminals.
If the symbol is nil, it defines a Regexp to match string terminals.

@param [Hash{Symbol => Object}] options @yield [lexer] @yieldparam [Lexer] lexer @return [Lexer] @raise [Lexer::Error] on invalid input

# File lib/ebnf/ll1/lexer.rb, line 77
# Tokenizes the given `input` string or stream, returning a new Lexer.
#
# When a block is given, the lexer is yielded to it and the block's
# result is returned; otherwise the lexer itself is returned.
#
# @param [String, #to_s] input
# @param [Array<Array<Symbol, Regexp>>] terminals
# @param [Hash{Symbol => Object}] options
# @yield  [lexer]
# @yieldparam [Lexer] lexer
# @return [Lexer]
# @raise [Lexer::Error] on invalid input
def self.tokenize(input, terminals, **options, &block)
  lexer = self.new(input, terminals, **options)
  return lexer unless block_given?
  block.call(lexer)
end
unescape_codepoints(string) click to toggle source

Returns a copy of the given `input` string with all `\uXXXX` and `\UXXXXXXXX` Unicode codepoint escape sequences replaced with their unescaped UTF-8 character counterparts.

@param [String] string @return [String] @see www.w3.org/TR/rdf-sparql-query/#codepointEscape

# File lib/ebnf/ll1/lexer.rb, line 49
##
# Returns a copy of the given `input` string with all `\uXXXX` and
# `\UXXXXXXXX` Unicode codepoint escape sequences replaced with their
# unescaped UTF-8 character counterparts.
#
# Thin delegation to {EBNF::Unescape.unescape_codepoints}.
#
# @param [String] string
# @return [String]
# @see https://www.w3.org/TR/rdf-sparql-query/#codepointEscape
def self.unescape_codepoints(string)
  ::EBNF::Unescape.unescape_codepoints(string)
end
unescape_string(input) click to toggle source

Returns a copy of the given `input` string with all string escape sequences (e.g. `\n` and `\t`) replaced with their unescaped UTF-8 character counterparts.

@param [String] input @return [String] @see www.w3.org/TR/rdf-sparql-query/#grammarEscapes

# File lib/ebnf/ll1/lexer.rb, line 61
##
# Returns a copy of the given `input` string with all string escape
# sequences (e.g. `\n` and `\t`) replaced with their unescaped UTF-8
# character counterparts.
#
# Thin delegation to {EBNF::Unescape.unescape_string}.
#
# @param [String] input
# @return [String]
# @see https://www.w3.org/TR/rdf-sparql-query/#grammarEscapes
def self.unescape_string(input)
  ::EBNF::Unescape.unescape_string(input)
end

Public Instance Methods

each(&block)
Alias for: each_token
each_token() { |token| ... } click to toggle source

Enumerates each token in the input string.

@yield [token] @yieldparam [Token] token @return [Enumerator]

# File lib/ebnf/ll1/lexer.rb, line 146
# Enumerates each token in the input string.
#
# With a block, yields tokens one at a time until the stream is
# exhausted; in all cases an Enumerator over the tokens is returned.
#
# @yield [token]
# @yieldparam [Token] token
# @return [Enumerator]
def each_token(&block)
  if block_given?
    loop do
      token = shift
      break unless token
      yield token
    end
  end
  enum_for(:each_token)
end
Also aliased as: each
first(*types) click to toggle source

Returns first token in input stream

@param [Array] types Optional set of types for restricting terminals examined @return [Token]

# File lib/ebnf/ll1/lexer.rb, line 161
# Returns (and memoizes in @first) the first token in the input stream
# without consuming it; returns nil at end of input.
#
# @param [Array] types Optional set of types for restricting terminals examined
# @return [Token, nil]
# @raise [Error] when no terminal matches at the current input position
def first(*types)
  return nil unless scanner

  @first ||= begin
    # `{}` is a no-op loop body: iterate purely for the side effect of
    # skip_whitespace, until EOS or no further whitespace is consumed.
    {} while !scanner.eos? && skip_whitespace
    return nil if scanner.eos?

    token = match_token(*types)

    if token.nil?
      # Extract the offending lexeme (up to the next whitespace) for the error
      lexme = (scanner.rest.split(@whitespace || /\s/).first rescue nil) || scanner.rest
      raise Error.new("Invalid token #{lexme[0..100].inspect}",
        input: scanner.rest[0..100], token: lexme, lineno: lineno)
    end

    token
  end
rescue ArgumentError, Encoding::CompatibilityError => e
  # Wrap scanner-level encoding/argument failures in a lexer Error.
  # NOTE(review): `lexme` is only assigned inside the begin block above, so it
  # may still be nil here if the failure occurred before a lexeme was extracted.
  raise Error.new(e.message,
    input: (scanner.rest[0..100] rescue '??'), token: lexme, lineno: lineno)
rescue Error
  raise
rescue
  # Unexpected exception class: report it, then propagate unchanged.
  STDERR.puts "Expected ArgumentError, got #{$!.class}"
  raise
end
lineno() click to toggle source

The current line number (one-based).

@return [Integer]

# File lib/ebnf/ll1/lexer.rb, line 220
##
# The current line number (one-based), as reported by the underlying scanner.
#
# @return [Integer]
def lineno
  scanner.lineno
end
recover(*types) click to toggle source

Skip input until a token is matched

@param [Array] types Optional set of types for restricting terminals examined @return [Token]

# File lib/ebnf/ll1/lexer.rb, line 203
# Skip input until a token is matched.
#
# Repeatedly discards input up to the next whitespace run until a
# terminal matches, then rewinds the match so #first can re-scan it.
#
# @param [Array] types Optional set of types for restricting terminals examined
# @return [Token, nil]
def recover(*types)
  until scanner.eos? || tok = match_token(*types)
    if scanner.skip_until(@whitespace || /\s+/m).nil? # Skip past current "token"
      # No whitespace at the end, must be an end of string
      scanner.terminate
    else
      skip_whitespace
    end
  end
  scanner.unscan if tok # put the matched token back so #first re-scans it
  first
end
shift() click to toggle source

Returns first token and shifts to next

@return [Token]

# File lib/ebnf/ll1/lexer.rb, line 192
# Returns the first token and shifts to the next, clearing the
# memoized lookahead so the next #first call re-scans.
#
# @return [Token, nil]
def shift
  first.tap { @first = nil }
end
valid?() click to toggle source

Returns `true` if the input string is lexically valid.

To be considered valid, the input string must contain more than zero terminals, and must not contain any invalid terminals.

@return [Boolean]

# File lib/ebnf/ll1/lexer.rb, line 132
# Returns `true` if the input string is lexically valid.
#
# To be considered valid, the input string must contain more than zero
# terminals, and must not contain any invalid terminals.
#
# @return [Boolean]
def valid?
  !count.zero?
rescue Error
  false
end

Protected Instance Methods

match_token(*types) click to toggle source

Return the matched token.

If the token was matched with a case-insensitive regexp, track this with the resulting {Token}, so that comparisons with that token are also case insensitive

@param [Array] types Optional set of types for restricting terminals examined @return [Token]

# File lib/ebnf/ll1/lexer.rb, line 248
# Return the matched token, or nil when no terminal matches at the
# current scanner position.
#
# If the token was matched with a case-insensitive regexp, track this
# with the resulting {Token}, so that comparisons with that token are
# also case insensitive.
#
# @param [Array] types Optional set of types for restricting terminals examined
# @return [Token, nil]
def match_token(*types)
  @terminals.each do |term|
    # Restrict to the requested terminal types, when any were given
    next unless types.empty? || types.include?(term.type)
    #STDERR.puts "match[#{term.type}] #{scanner.rest[0..100].inspect} against #{term.regexp.inspect}" #if term.type == :STRING_LITERAL_SINGLE_QUOTE
    # If the buffered input only partially matches this terminal, pull more
    # data into the scanner's buffer before attempting the full regexp
    # (only on scanners that support buffered refilling).
    if term.partial_regexp && scanner.match?(term.partial_regexp) && !scanner.match?(term.regexp) && scanner.respond_to?(:ensure_buffer_full)
      scanner.ensure_buffer_full
    end

    if matched = scanner.scan(term.regexp)
      #STDERR.puts "  matched #{term.type.inspect}: #{matched.inspect}"
      # Build a Token from the canonicalized match, annotated with lineno
      tok = token(term.type, term.canonicalize(matched))
      return tok
    end
  end
  nil
end
skip_whitespace() click to toggle source

Skip whitespace, as defined through input options or defaults

# File lib/ebnf/ll1/lexer.rb, line 230
# Skip whitespace, as defined through input options or defaults.
#
# Consumes consecutive runs of the configured whitespace/comment
# pattern; a no-op when no whitespace pattern was configured.
def skip_whitespace
  # skip all white space, but keep track of the current line number
  return unless @whitespace
  until scanner.eos?
    break unless scanner.scan(@whitespace)
  end
end
token(type, value, **options) click to toggle source

Constructs a new token object annotated with the current line number.

The parser relies on the type being a symbolized URI and the value being a string, if there is no type. If there is a type, then the value takes on the native representation appropriate for that type.

@param [Symbol] type @param [String] value

Scanner instance with access to matched groups

@param [Hash{Symbol => Object}] options @return [Token]

# File lib/ebnf/ll1/lexer.rb, line 337
# Constructs a new token object annotated with the current line number.
#
# The parser relies on the type being a symbolized URI and the value
# being a string, if there is no type. If there is a type, then the
# value takes on the native representation appropriate for that type.
#
# @param [Symbol] type
# @param [String] value
# @param [Hash{Symbol => Object}] options
# @return [Token]
def token(type, value, **options)
  # Caller-supplied options may override the recorded line number.
  attrs = {lineno: lineno}.merge(options)
  Token.new(type, value, **attrs)
end