class SrlRuby::Tokenizer

A tokenizer for the Simple Regex Language (SRL). Responsibility: break the input SRL text into a sequence of token objects. The tokenizer recognizes: keywords (e.g. as, capture, letter); integer literals, including single digits; string literals (quote-delimited); single-character literals; delimiters: parentheses '(' and ')'; and separators: the comma (optional).

Attributes

line_start[R]

@return [Integer] offset of start of current line within input

lineno[R]

@return [Integer] current line number

scanner[R]

@return [StringScanner]

Public Class Methods

new(source)

Constructor. Initialize a tokenizer for SRL. @param source [String] SRL text to tokenize.

# File lib/srl_ruby/tokenizer.rb, line 99
# Constructor. Set up a tokenizer over the given SRL source text.
# Line tracking starts at line 1, offset 0.
# @param source [String] SRL text to tokenize.
def initialize(source)
  @lineno = 1
  @line_start = 0
  @scanner = StringScanner.new(source)
end

Public Instance Methods

tokens()
# File lib/srl_ruby/tokenizer.rb, line 105
# Scan the entire input and return the tokens found.
# nil results from the scan (e.g. at end of input) are discarded.
# @return [Array] the sequence of recognized tokens.
def tokens
  sequence = []
  loop do
    break if @scanner.eos?

    tok = _next_token
    sequence.push(tok) if tok
  end
  sequence
end

Private Instance Methods

_next_token()
# File lib/srl_ruby/tokenizer.rb, line 117
# Scan the input for the next recognizable token.
# @return [Rley::Lexical::Token, NilClass] the next token, or nil when
#   the end of input is reached.
# @raise [ScanError] when the current input matches no known token.
def _next_token
  skip_whitespaces
  curr_ch = scanner.peek(1)
  return nil if curr_ch.nil? || curr_ch.empty? # End of input reached

  token = nil

  if '(),'.include? curr_ch
    # Delimiters, separators => single character token
    token = build_token(@@lexeme2name[curr_ch], scanner.getch)
  elsif (lexeme = scanner.scan(/[0-9]{2,}((?=\s|,|\))|$)/))
    token = build_token('INTEGER', lexeme) # An integer has 2..* digits
  elsif (lexeme = scanner.scan(/[0-9]((?=\s|,|\))|$)/))
    token = build_token('DIGIT_LIT', lexeme) # A lone digit
  elsif (lexeme = scanner.scan(/"(?:\\"|[^"])*"/)) # Double quotes literal?
    unquoted = lexeme.gsub(/(^")|("$)/, '')
    token = build_token('STRING_LIT', unquoted)
  elsif (lexeme = scanner.scan(/'(?:\\'|[^'])*'/)) # Single quotes literal?
    unquoted = lexeme.gsub(/(^')|('$)/, '')
    token = build_token('STRING_LIT', unquoted)
  elsif (lexeme = scanner.scan(/[a-zA-Z]((?=\s|,|\))|$)/))
    token = build_token('LETTER_LIT', lexeme) # A lone letter
  elsif (lexeme = scanner.scan(/[a-zA-Z_][a-zA-Z0-9_]+/))
    # A keyword (case-insensitive) or, failing that, an identifier
    keyw = @@keywords[lexeme.upcase]
    tok_type = keyw || 'IDENTIFIER'
    token = build_token(tok_type, lexeme)
  elsif (lexeme = scanner.scan(/[^,"\s]{2,}/))
    token = build_token('CHAR_CLASS', lexeme)
  else # Unknown token: report it with up to 20 characters of trailing context
    erroneous = scanner.scan(/./) || ''
    sequel = scanner.scan(/.{1,20}/)
    erroneous += sequel unless sequel.nil?
    raise ScanError, "Unknown token #{erroneous} on line #{lineno}"
  end

  return token
end
build_token(aSymbolName, aLexeme)
# File lib/srl_ruby/tokenizer.rb, line 155
# Wrap a matched lexeme into a Rley token, stamped with its position
# (line number and 1-based column) in the input.
# @param aSymbolName [String] name of the terminal symbol.
# @param aLexeme [String] the text matched for this token.
# @return [Rley::Lexical::Token]
def build_token(aSymbolName, aLexeme)
  column = scanner.pos - aLexeme.size - @line_start + 1
  position = Rley::Lexical::Position.new(@lineno, column)
  Rley::Lexical::Token.new(aLexeme, aSymbolName, position)
rescue StandardError => e
  # Diagnostic aid: report which token construction failed, then re-raise.
  puts "Failing with '#{aSymbolName}' and '#{aLexeme}'"
  raise e
end
skip_whitespaces()
# File lib/srl_ruby/tokenizer.rb, line 168
# Advance the scanner past whitespace: spaces, tabs, form feeds and line
# breaks (CRLF, CR or LF). Each line break bumps the line counter and
# records the offset where the new line begins (used for column numbers).
def skip_whitespaces
  loop do
    blanks_found = scanner.skip(/[ \t\f]+/)
    newline_found = scanner.skip(/(?:\r\n)|\r|\n/)
    if newline_found
      @lineno += 1
      @line_start = scanner.pos
    end
    break unless blanks_found || newline_found
  end
end
tab_size()
# File lib/srl_ruby/tokenizer.rb, line 195
# @return [Integer] the assumed display width (in columns) of a tab character.
def tab_size
  2
end