class SrlRuby::Tokenizer
A tokenizer for the Simple Regex Language (SRL).
Responsibility: break the input SRL text into a sequence of token objects. The tokenizer recognizes: keywords (as, capture, letter); integer literals, including single digits; string literals (quote-delimited); single-character literals; the delimiters '(' and ')'; and the optional comma separator.
Attributes
line_start[R]
@return [Integer] offset of start of current line within input
lineno[R]
@return [Integer] current line number
scanner[R]
@return [StringScanner]
Public Class Methods
new(source)
click to toggle source
Constructor. Initialize a tokenizer for SRL. @param source [String] SRL text to tokenize.
# File lib/srl_ruby/tokenizer.rb, line 99 def initialize(source) @scanner = StringScanner.new(source) @lineno = 1 @line_start = 0 end
Public Instance Methods
tokens()
click to toggle source
# File lib/srl_ruby/tokenizer.rb, line 105 def tokens tok_sequence = [] until @scanner.eos? token = _next_token tok_sequence << token unless token.nil? end return tok_sequence end
Private Instance Methods
_next_token()
click to toggle source
# File lib/srl_ruby/tokenizer.rb, line 117 def _next_token skip_whitespaces curr_ch = scanner.peek(1) return nil if curr_ch.nil? || curr_ch.empty? token = nil if '(),'.include? curr_ch # Delimiters, separators => single character token token = build_token(@@lexeme2name[curr_ch], scanner.getch) elsif (lexeme = scanner.scan(/[0-9]{2,}((?=\s|,|\))|$)/)) token = build_token('INTEGER', lexeme) # An integer has 2..* digits elsif (lexeme = scanner.scan(/[0-9]((?=\s|,|\))|$)/)) token = build_token('DIGIT_LIT', lexeme) elsif (lexeme = scanner.scan(/"(?:\\"|[^"])*"/)) # Double quotes literal? unquoted = lexeme.gsub(/(^")|("$)/, '') token = build_token('STRING_LIT', unquoted) elsif (lexeme = scanner.scan(/'(?:\\'|[^'])*'/)) # Single quotes literal? unquoted = lexeme.gsub(/(^')|('$)/, '') token = build_token('STRING_LIT', unquoted) elsif (lexeme = scanner.scan(/[a-zA-Z]((?=\s|,|\))|$)/)) token = build_token('LETTER_LIT', lexeme) elsif (lexeme = scanner.scan(/[a-zA-Z_][a-zA-Z0-9_]+/)) keyw = @@keywords[lexeme.upcase] tok_type = keyw || 'IDENTIFIER' token = build_token(tok_type, lexeme) elsif (lexeme = scanner.scan(/[^,"\s]{2,}/)) token = build_token('CHAR_CLASS', lexeme) else # Unknown token erroneous = curr_ch.nil? ? '' : scanner.scan(/./) sequel = scanner.scan(/.{1,20}/) erroneous += sequel unless sequel.nil? raise ScanError, "Unknown token #{erroneous} on line #{lineno}" end return token end
build_token(aSymbolName, aLexeme)
click to toggle source
# File lib/srl_ruby/tokenizer.rb, line 155 def build_token(aSymbolName, aLexeme) begin col = scanner.pos - aLexeme.size - @line_start + 1 pos = Rley::Lexical::Position.new(@lineno, col) token = Rley::Lexical::Token.new(aLexeme, aSymbolName, pos) rescue StandardError => e puts "Failing with '#{aSymbolName}' and '#{aLexeme}'" raise e end return token end
skip_whitespaces()
click to toggle source
# File lib/srl_ruby/tokenizer.rb, line 168 def skip_whitespaces pre_pos = scanner.pos loop do ws_found = false found = scanner.skip(/[ \t\f]+/) ws_found = true if found found = scanner.skip(/(?:\r\n)|\r|\n/) if found ws_found = true @lineno += 1 @line_start = scanner.pos end break unless ws_found end curr_pos = scanner.pos return if curr_pos == pre_pos # skipped = scanner.string.slice(Range.new(pre_pos, curr_pos)) # triplet = skipped.rpartition(/\n|\r/) # @column = 1 unless triplet[1].empty? # Correction for the tabs # tab_count = triplet[2].chars.count { |ch| ch =~ /\t/ } # @column += triplet[2].size + tab_count * (tab_size - 1) - 1 end
tab_size()
click to toggle source
# File lib/srl_ruby/tokenizer.rb, line 195 def tab_size 2 end