class MODL::Parser::UnicodeEscapeReplacer

Replaces Unicode escape sequences (backslash-u and ~u forms) in MODL files with the characters they represent.

Constants

BACKSLASH
BACKSLASH_U
HEX
TILDE
TILDE_U

Public Class Methods

convert_unicode_sequences(str) click to toggle source
# File lib/modl/parser/unicode_escape_replacer.rb, line 35
# Replace every unescaped backslash-u / ~u escape sequence in `str` with the
# character whose code point it encodes.
#
# str - the String to scan (a nil input is returned unchanged as nil).
#
# Returns the String with the recognised escape sequences substituted.
def self.convert_unicode_sequences(str)
  search_from = 0
  text = str

  until text.nil?
    # Either escape style may appear; collect whichever positions were found.
    candidates = [text.index(BACKSLASH_U, search_from), text.index(TILDE_U, search_from)].compact

    # Nothing left to convert.
    break if candidates.empty?

    # Work on whichever escape sequence occurs first.
    escape_at = candidates.min

    # The hex digits begin two characters past the start of the escape marker.
    parsed = try_parse(text, escape_at + 2)

    # The next search resumes just past the start of this escape sequence.
    search_from = escape_at + 1

    # An escape sequence that is itself preceded by ~ or backslash is left alone.
    if escape_at > 0
      preceding = text[escape_at - 1]
      next if preceding == TILDE || preceding == BACKSLASH
    end

    # A code point of zero means no usable value was parsed.
    next unless parsed.code_point > 0

    # Swap the marker (2 characters) plus its digits for the decoded character.
    decoded = parsed.code_point.chr(Encoding::UTF_8)
    text = replace(text, decoded, escape_at, parsed.length + 2)
  end
  text
end

Private Class Methods

has_enough_digits?(s, idx, n) click to toggle source

Can we get `n` hex digits from the string at the `idx` location?

# File lib/modl/parser/unicode_escape_replacer.rb, line 98
# Report whether `n` consecutive hexadecimal digits can be read from `s`
# starting at position `idx`.
#
# s   - the String to inspect.
# idx - the Integer character index at which the digits should start.
# n   - the Integer number of hex digits required.
#
# Returns true only if all `n` characters exist and are hex digits.
def self.has_enough_digits?(s, idx, n)
  matched = 0

  # Index the string directly instead of building an array of every character
  # in `s` on each call; only the (at most `n`) characters of interest are read.
  while matched < n && (idx + matched) < s.length
    return false unless s[idx + matched] =~ /[0-9a-fA-F]/

    matched += 1
  end

  # Running off the end of the string leaves `matched` short of `n`.
  matched == n
end
replace(s, value, unicode_str_index, length) click to toggle source

Replace a unicode value in a String

# File lib/modl/parser/unicode_escape_replacer.rb, line 81
# Build a copy of `s` in which `length` characters starting at
# `unicode_str_index` are replaced by the string form of `value`.
#
# s                 - the source String.
# value             - the replacement; converted with `to_s`.
# unicode_str_index - the Integer index where the replaced span begins.
# length            - the Integer number of characters to drop.
#
# Returns the new String.
def self.replace(s, value, unicode_str_index, length)
  total = s.length
  # Clamp the end of the replaced span so it never runs past the string.
  tail_start = [total, unicode_str_index + length].min

  head = s[0, unicode_str_index]
  tail = s[tail_start, total]
  head + value.to_s + tail
end
try_parse(str, idx) click to toggle source

Attempt to parse a unicode character starting at `idx` in `str`

# File lib/modl/parser/unicode_escape_replacer.rb, line 115
# Try to read a Unicode code point written as hex digits at `idx` in `str`.
#
# The longest form is attempted first: 6 digits, then 5, then 4. The first
# width that both has enough hex digits and yields a value accepted by
# `valid_range?` wins.
#
# str - the String containing the escape sequence.
# idx - the Integer index of the first hex digit.
#
# Returns a TryParse holding the code point and the number of digits used,
# or a TryParse with code point 0 and length 4 when nothing could be parsed.
def self.try_parse(str, idx)
  [6, 5, 4].each do |width|
    next unless has_enough_digits?(str, idx, width)

    code = str.slice(idx, width).to_i(HEX)
    return TryParse.new(code, width) if valid_range?(code)
  end

  # No width produced a usable code point.
  TryParse.new(0, 4)
end
valid_range?(value) click to toggle source

Check whether the value is a valid Unicode code point: 0 to 0x10FFFF, excluding the surrogate range 0xD800 to 0xDFFF.

# File lib/modl/parser/unicode_escape_replacer.rb, line 91
# Report whether `value` falls inside one of the accepted code point
# intervals. The gap between 0xd7ff and 0xe000 is deliberately excluded.
#
# value - the Integer code point candidate.
#
# Returns true when `value` lies in any of the listed inclusive intervals.
def self.valid_range?(value)
  [
    [0x100000, 0x10ffff],
    [0x10000, 0xfffff],
    [0, 0xd7ff],
    [0xe000, 0xffff]
  ].any? { |low, high| value.between?(low, high) }
end