class ElasticAPM::Sql::Tokenizer

@api private

Constants

ALPHA
DIGIT
SPACE

Attributes

input[R]
scanner[R]
token[R]

Public Class Methods

new(input) click to toggle source
# File lib/elastic_apm/sql/tokenizer.rb, line 33
# Prepares a tokenizer for +input+.
#
# Keeps the raw input, wraps it in a StringScanner and sets the byte offset
# of the current token's start to 0.
#
# @param input [String] the text to tokenize
def initialize(input)
  @byte_start = 0
  @input = input
  @scanner = StringScanner.new(@input)
end

Public Instance Methods

scan() click to toggle source
# File lib/elastic_apm/sql/tokenizer.rb, line 46
# Advances to the next token.
#
# Skips anything matching SPACE, remembers the byte offset where the token
# begins, and stores the result of #next_token in @token.
#
# @return [Boolean] false when there is no character left to read (the
#   previous @token is left untouched), true otherwise
def scan
  scanner.skip(SPACE)
  @byte_start = scanner.pos

  if (first = next_char)
    @token = next_token(first)
    true
  else
    false
  end
end
text() click to toggle source
# File lib/elastic_apm/sql/tokenizer.rb, line 42
# Returns the raw bytes of the current token.
#
# The slice covers the input from @byte_start up to (not including)
# @byte_end; both are byte offsets, not character offsets.
#
# @return [String, nil] whatever String#byteslice yields for that span
def text
  span = @byte_end - @byte_start
  @input.byteslice(@byte_start, span)
end

Private Instance Methods

next_char() click to toggle source

rubocop:enable Metrics/CyclomaticComplexity

# File lib/elastic_apm/sql/tokenizer.rb, line 82
# Consumes one character from the scanner.
#
# After reading, @byte_end is updated to the scanner's position, so it
# always points just past the last consumed character.
#
# @return [String, nil] the character read, or nil at the end of the input
def next_char
  @scanner.getch.tap { @byte_end = @scanner.pos }
end
next_token(char) click to toggle source

rubocop:disable Metrics/CyclomaticComplexity

# File lib/elastic_apm/sql/tokenizer.rb, line 62
# Picks the token for a token's first character, delegating to the matching
# scan_* helper when more input has to be consumed.
#
# Literal characters are checked before the ALPHA and DIGIT patterns.
#
# @param char [String] the first character of the token (already consumed)
# @return the token constant, or whatever the delegated helper returns
def next_token(char)
  case char
  when '.' then PERIOD
  when '(' then LPAREN
  when ')' then RPAREN
  when '_'
    scan_keyword_or_identifier(possible_keyword: false)
  when '$'
    scan_dollar_sign
  when '`', '"'
    # These identifiers are closed by the same character that opened them.
    scan_quoted_indentifier(char)
  when '['
    scan_quoted_indentifier(']')
  when '/'
    scan_bracketed_or_cql_comment
  when '-'
    scan_simple_comment
  when "'"
    scan_string_literal
  when ALPHA
    scan_keyword_or_identifier(possible_keyword: true)
  when DIGIT
    scan_numeric_literal
  else
    OTHER
  end
end
peek_char(length = 1) click to toggle source

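StringScanner#peek returns raw bytes, so a one-byte peek can be an incomplete UTF-8 multi-byte character. This method widens the peek, up to 4 bytes, until the bytes form a validly encoded character.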

# File lib/elastic_apm/sql/tokenizer.rb, line 90
# Looks at the next character without consuming it.
#
# StringScanner#peek works on bytes, so the peek is widened one byte at a
# time until the bytes form a validly encoded string.
#
# The maximum byte count of a UTF-8 character is 4:
# > In UTF-8, characters from the U+0000..U+10FFFF range (the UTF-16
#   accessible range) are encoded using sequences of 1 to 4 octets.
# https://tools.ietf.org/html/rfc3629
#
# @param length [Integer] number of bytes to start peeking with
# @return [String, nil] the upcoming character, or nil when nothing is left
#   or no valid character is found within 4 bytes
def peek_char(length = 1)
  length.upto(4) do |size|
    bytes = @scanner.peek(size)

    return nil if bytes.empty?
    return bytes if bytes.valid_encoding?
  end

  nil
end
scan_bracketed_comment() click to toggle source

rubocop:disable Metrics/CyclomaticComplexity

# File lib/elastic_apm/sql/tokenizer.rb, line 197
# Consumes a bracketed comment (/* ... */), including nested ones.
#
# Scanning starts with a nesting depth of 1; every "/*" read increases it
# and every "*/" decreases it. The comment is complete once the depth is
# back to zero.
#
# @return COMMENT when the comment is closed, OTHER when the input ends
#   first (previously the method fell through and returned nil, leaving the
#   tokenizer without a token)
def scan_bracketed_comment
  nesting = 1

  while (char = next_char)
    case char
    when '/'
      next unless peek_char == '*'

      next_char
      nesting += 1
    when '*'
      next unless peek_char == '/'

      next_char
      nesting -= 1
      return COMMENT if nesting.zero?
    end
  end

  # Input ended before every opened comment was closed.
  OTHER
end
scan_bracketed_or_cql_comment() click to toggle source
# File lib/elastic_apm/sql/tokenizer.rb, line 188
# Decides what a token starting with '/' is by looking at the character
# that follows it.
#
# @return the result of #scan_bracketed_comment when '*' follows, of
#   #scan_cql_comment when '/' follows, and OTHER in every other case
def scan_bracketed_or_cql_comment
  upcoming = peek_char

  return scan_bracketed_comment if upcoming == '*'
  return scan_cql_comment if upcoming == '/'

  OTHER
end
scan_cql_comment() click to toggle source

rubocop:enable Metrics/CyclomaticComplexity

# File lib/elastic_apm/sql/tokenizer.rb, line 216
# Consumes a "//" line comment.
#
# Nothing is consumed unless the upcoming character is '/'. Otherwise every
# character up to and including the next newline (or the end of the input)
# is read.
#
# @return OTHER when the next character is not '/', COMMENT otherwise
def scan_cql_comment
  return OTHER unless peek_char == '/'

  loop do
    consumed = next_char
    break if !consumed || consumed == "\n"
  end

  COMMENT
end
scan_dollar_sign() click to toggle source

rubocop:disable Metrics/CyclomaticComplexity

# File lib/elastic_apm/sql/tokenizer.rb, line 134
# Scans a token that starts with '$'.
#
# Digits directly after the '$' are consumed as a run. When '$', '_', an
# ALPHA or a SPACE character follows, the input is read as a possible
# dollar-quoted string: once the opening tag ($foo$) is complete, the same
# tag is searched for in the remaining input and the token is extended up to
# and including it.
#
# All offsets are byte offsets. The remaining input is therefore taken with
# String#byteslice, and the character index returned by String#index is
# converted to a byte count before being added to @byte_end and the scanner
# position. Mixing the two units mis-placed the end of the string whenever
# the input contained multi-byte characters.
#
# @return STRING for a complete dollar-quoted string, OTHER otherwise
def scan_dollar_sign
  while (peek = peek_char)
    case peek
    when DIGIT
      next_char while DIGIT.match?(peek_char)
    when '$', '_', ALPHA, SPACE
      # PostgreSQL supports dollar-quoted string literal syntax,
      # like $foo$...$foo$. The tag (foo in this case) is optional,
      # and if present follows identifier rules.
      while (char = next_char)
        case char
        when '$'
          # This marks the end of the initial $foo$.
          snap = text
          rest = input.byteslice(scanner.pos, input.bytesize - scanner.pos)
          char_index = rest.index(snap)
          next unless char_index

          # Bytes before the closing tag plus the closing tag itself.
          delta = rest[0, char_index].bytesize + snap.bytesize
          @byte_end += delta
          scanner.pos += delta
          return STRING
        when SPACE
          # Unknown token starting with $, consume chars until space.
          @byte_end -= char.bytesize
          return OTHER
        end
      end
    else break
    end
  end

  OTHER
end
scan_keyword_or_identifier(possible_keyword:) click to toggle source

rubocop:disable Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity

# File lib/elastic_apm/sql/tokenizer.rb, line 106
# Consumes the rest of an identifier and decides whether it is a keyword.
#
# Characters are consumed while they are '_', '$', a DIGIT or an ALPHA
# character. Seeing '_', '$' or a digit rules the word out as a keyword.
# A remaining candidate is compared case-insensitively with the KEYWORDS
# registered for its length.
#
# @param possible_keyword [Boolean] whether the word may still be a keyword
# @return the matching keyword token, or IDENT
def scan_keyword_or_identifier(possible_keyword:)
  while (upcoming = peek_char)
    case upcoming
    when '_', '$', DIGIT then possible_keyword = false
    when ALPHA then nil
    else break
    end

    next_char
  end

  return IDENT unless possible_keyword

  word = text
  size = word.length
  return IDENT unless size.between?(KEYWORD_MIN_LENGTH, KEYWORD_MAX_LENGTH)

  upper = word.upcase
  KEYWORDS[size].find { |candidate| candidate.to_s == upper } || IDENT
end
scan_numeric_literal() click to toggle source

rubocop:disable Metrics/CyclomaticComplexity

# File lib/elastic_apm/sql/tokenizer.rb, line 256
# Consumes the rest of a numeric literal.
#
# Digits are always accepted. At most one '.' and at most one exponent
# marker ('e' or 'E', optionally followed by '+' or '-') are accepted; a
# second occurrence of either ends the literal.
#
# The exponent flag was previously never set, so the guard against a second
# exponent could not trigger and input such as "1e5e6" was consumed as one
# number.
#
# @return NUMBER
def scan_numeric_literal
  period = false
  exponent = false

  while (peek = peek_char)
    case peek
    when DIGIT then next_char
    when '.'
      return NUMBER if period

      next_char
      period = true
    when 'e', 'E'
      return NUMBER if exponent

      next_char
      exponent = true
      # An exponent may carry a sign, e.g. 1e-3.
      next_char if /[+-]/.match?(peek_char)
    else break
    end
  end

  NUMBER
end
scan_quoted_indentifier(delimiter) click to toggle source

rubocop:enable Metrics/CyclomaticComplexity

# File lib/elastic_apm/sql/tokenizer.rb, line 170
# Consumes a quoted identifier up to its closing +delimiter+.
#
# For double-quoted identifiers a doubled delimiter ("") is skipped rather
# than treated as the end. On success the token span is shrunk by one
# delimiter on each side, so #text yields the identifier without its quotes.
#
# @param delimiter [String] the character that closes the identifier
# @return IDENT when the closing delimiter is found, OTHER when the input
#   ends first (this case used to raise NoMethodError because the last
#   character read was nil)
def scan_quoted_indentifier(delimiter)
  while (char = next_char)
    next unless char == delimiter
    next next_char if delimiter == '"' && peek_char == delimiter

    break
  end

  # Input ended before the closing delimiter was found.
  return OTHER unless char

  # Remove quotes from identifier
  @byte_start += char.bytesize
  @byte_end -= char.bytesize

  IDENT
end
scan_simple_comment() click to toggle source
# File lib/elastic_apm/sql/tokenizer.rb, line 226
# Consumes a "--" line comment.
#
# Nothing is consumed unless the upcoming character is '-'. Otherwise every
# character up to and including the next newline (or the end of the input)
# is read.
#
# @return OTHER when the next character is not '-', COMMENT otherwise
def scan_simple_comment
  return OTHER if peek_char != '-'

  consumed = next_char
  consumed = next_char while consumed && consumed != "\n"

  COMMENT
end
scan_string_literal() click to toggle source
# File lib/elastic_apm/sql/tokenizer.rb, line 236
# Consumes a single-quoted string literal.
#
# A backslash skips the character after it, and a doubled quote ('') is
# read as part of the literal rather than as its end.
#
# @return STRING when the closing quote is found, OTHER when the input ends
#   first (previously the method fell through and returned nil, leaving the
#   tokenizer without a token)
def scan_string_literal
  delimiter = "'"

  while (char = next_char)
    if char == '\\'
      # Skip escaped character, e.g. 'what\'s up?'
      next_char
      next
    end

    next unless char == delimiter

    # A quote not followed by another quote closes the literal.
    return STRING unless peek_char == delimiter

    # Doubled quote: consume the second one and keep scanning.
    next_char
  end

  # Input ended before the closing quote was found.
  OTHER
end