class Regexp::Lexer
A very thin wrapper around the scanner that breaks quantified literal runs, collects emitted tokens into an array, calculates their nesting depth, normalizes tokens for the parser, and checks whether they are implemented by the given syntax flavor.
Constants
- CLOSING_TOKENS
- CONDITION_TOKENS
- OPENING_TOKENS
Attributes
Public Class Methods
Source
# File lib/regexp_parser/lexer.rb, line 16
#
# Class-level convenience wrapper: creates a fresh lexer instance and
# forwards every argument (including the optional block) to the instance
# method #lex, returning whatever that call returns.
def self.lex(input, syntax = "ruby/#{RUBY_VERSION}", options: nil, &block)
  lexer = new
  lexer.lex(input, syntax, options: options, &block)
end
Also aliased as: scan
Public Instance Methods
Source
# File lib/regexp_parser/lexer.rb, line 20
#
# Scans +input+ and turns the scanner output into a list of Regexp::Token
# objects.
#
# input::   passed through unchanged to Regexp::Scanner.scan.
# syntax::  passed to Regexp::Syntax.for; the resulting object is used to
#           normalize and check every type/token pair the scanner emits.
# options:: passed through unchanged to Regexp::Scanner.scan.
# block::   optional; when given, it is called once per collected token
#           after scanning has finished.
#
# Returns the collected tokens, or, when a block is given, the array of the
# block's return values.
def lex(input, syntax = "ruby/#{RUBY_VERSION}", options: nil, &block)
  syntax = Regexp::Syntax.for(syntax)

  # Reset all per-run state so the same instance can be reused.
  self.tokens = []
  self.nesting = 0
  self.set_nesting = 0
  self.conditional_nesting = 0
  self.shift = 0

  last = nil
  Regexp::Scanner.scan(input, options: options) do |type, token, text, ts, te|
    type, token = *syntax.normalize(type, token)
    syntax.check! type, token

    # Closing tokens lower the depth before the token itself is created.
    ascend(type, token)

    if type == :quantifier && last
      count_before = tokens.size
      break_literal(last)        if last.type == :literal
      break_codepoint_list(last) if last.token == :codepoint_list

      if tokens.size > count_before
        # +last+ was popped and replaced by two new tokens at the end of
        # +tokens+. Link the replacements into the chain and continue from
        # the second one, so nothing keeps pointing at the discarded token.
        head, tail = tokens.last(2)
        before = tokens[-3] # nil when the split token was the very first one
        if before
          before.next = head
          head.previous = before
        end
        head.next = tail
        tail.previous = head
        last = tail
      end
    end

    current = Regexp::Token.new(type, token, text, ts + shift, te + shift,
                                nesting, set_nesting, conditional_nesting)

    if type == :conditional && CONDITION_TOKENS.include?(token)
      current = merge_condition(current)
      # merge_condition popped the token +last+ referred to and folded its
      # text into +current+; link against what is now the final token.
      last = tokens.last
    end

    if last
      last.next = current
      current.previous = last
    end

    tokens << current
    last = current

    # Opening tokens raise the depth only after the token has been emitted.
    descend(type, token)
  end

  if block_given?
    tokens.map { |t| block.call(t) }
  else
    tokens
  end
end
Private Instance Methods
Source
# File lib/regexp_parser/lexer.rb, line 71
#
# Lowers the matching depth counter by one when +token+ closes a construct
# of the given +type+:
# * :group / :assertion - +nesting+, when +token+ is in CLOSING_TOKENS
# * :set                - +set_nesting+, when +token+ is :close
# * :conditional        - +conditional_nesting+, when +token+ is :close
# Any other type leaves all counters untouched.
def ascend(type, token)
  if type == :group || type == :assertion
    if CLOSING_TOKENS.include?(token)
      self.nesting = nesting - 1
    end
  elsif type == :set
    if token == :close
      self.set_nesting = set_nesting - 1
    end
  elsif type == :conditional
    if token == :close
      self.conditional_nesting = conditional_nesting - 1
    end
  end
end
Source
# File lib/regexp_parser/lexer.rb, line 108
#
# Splits a codepoint-list token at its last space, so that the final entry
# becomes a token of its own. Does nothing (returns nil) when the text
# contains no space-separated lead part.
#
# On a split, the last entry of +tokens+ is removed and two
# :escape/:codepoint_list tokens are appended in its place: the lead part
# closed with '}' and the final part prefixed with '\u{'. +shift+ grows by 3
# to account for the extra characters.
def break_codepoint_list(token)
  head, _separator, final = token.text.rpartition(' ')
  return if head.empty?

  head_end    = token.te - final.length
  final_start = token.ts + head.length + 1
  final_end   = token.te + 3

  head_token = Regexp::Token.new(:escape, :codepoint_list, head + '}',
                                 token.ts, head_end,
                                 nesting, set_nesting, conditional_nesting)
  final_token = Regexp::Token.new(:escape, :codepoint_list, '\u{' + final,
                                  final_start, final_end,
                                  nesting, set_nesting, conditional_nesting)

  tokens.pop
  tokens.push(head_token, final_token)

  # one space less, but extra \, u, {, and }
  self.shift = shift + 3
end
Source
# File lib/regexp_parser/lexer.rb, line 95
#
# Splits a literal token into "everything but the last character" and "the
# last character". Does nothing (returns nil) when the text has no lead part
# before its final character.
#
# On a split, the last entry of +tokens+ is removed and the two new
# :literal tokens are appended in its place.
def break_literal(token)
  head, final_char, _rest = token.text.partition(/.\z/mu)
  return if head.empty?

  head_token = Regexp::Token.new(:literal, :literal, head,
                                 token.ts, token.te - final_char.length,
                                 nesting, set_nesting, conditional_nesting)
  final_token = Regexp::Token.new(:literal, :literal, final_char,
                                  token.ts + head.length, token.te,
                                  nesting, set_nesting, conditional_nesting)

  tokens.pop
  tokens.push(head_token, final_token)
end
Called by scan to break a literal run that is longer than one character into two separate tokens when it is followed by a quantifier.
Source
# File lib/regexp_parser/lexer.rb, line 82
#
# Raises the matching depth counter by one when +token+ opens a construct of
# the given +type+:
# * :group / :assertion - +nesting+, when +token+ is in OPENING_TOKENS
# * :set                - +set_nesting+, when +token+ is :open
# * :conditional        - +conditional_nesting+, when +token+ is :open
# Any other type leaves all counters untouched.
def descend(type, token)
  if type == :group || type == :assertion
    if OPENING_TOKENS.include?(token)
      self.nesting = nesting + 1
    end
  elsif type == :set
    if token == :open
      self.set_nesting = set_nesting + 1
    end
  elsif type == :conditional
    if token == :open
      self.conditional_nesting = conditional_nesting + 1
    end
  end
end
Source
# File lib/regexp_parser/lexer.rb, line 123
#
# Removes the most recently collected token from +tokens+ and returns a new
# :conditional/:condition token that joins its text with the text of
# +current+. The result starts where the removed token started and ends
# where +current+ ends. The new token is returned, not appended.
def merge_condition(current)
  preceding = tokens.pop
  combined_text = preceding.text + current.text

  Regexp::Token.new(:conditional, :condition, combined_text,
                    preceding.ts, current.te,
                    nesting, set_nesting, conditional_nesting)
end