class UTF8Cleaner::URIString

Cleans invalid %-encodings from URI-encoded strings.

Constants

HEX_CHARS
HEX_CHARS_REGEX
INVALID_PERCENT_ENCODING_REGEX

Attributes

data[RW]

Public Class Methods

new(data) click to toggle source
# File lib/utf8-cleaner/uri_string.rb, line 10
# Builds a new instance around +data+.
#
# The value is stored through the +data+ attribute writer rather than by
# setting an instance variable directly.
def initialize(data)
  self.data = data
end

Public Instance Methods

cleaned() click to toggle source
# File lib/utf8-cleaner/uri_string.rb, line 14
# Returns the wrapped string with invalid %-encodings removed.
#
# When the string already passes +valid?+ it is handed back untouched;
# otherwise the result is rebuilt by joining the pieces returned by
# +encoded_char_array+.
def cleaned
  return data if valid?

  encoded_char_array.join
end
valid?() click to toggle source
# File lib/utf8-cleaner/uri_string.rb, line 22
# Reports whether the wrapped string passes +valid_uri_encoded_utf8+.
#
# Returns the result of that check, or +false+ when the check raises
# Encoding::CompatibilityError.
def valid?
  payload = data
  valid_uri_encoded_utf8(payload)
rescue Encoding::CompatibilityError
  # The check could not be performed on this input, so treat it as invalid.
  false
end

Private Instance Methods

encoded_char_array() click to toggle source

Returns an array of valid URI-encoded UTF-8 characters.

# File lib/utf8-cleaner/uri_string.rb, line 31
# Walks +data+ one character at a time and collects the pieces that survive
# cleaning: plain characters are kept as-is, and each run of %-encoded bytes
# that forms a valid UTF-8 character is kept as one string (e.g. "%E2%9C%93").
#
# Returns an Array of Strings.
def encoded_char_array
  kept = []
  pos = 0

  while pos < data.length
    current = data[pos]

    if current != '%'
      # An ordinary character: keep it and step past it.
      kept << current
      consumed = 1
    elsif !(data[pos + 1] =~ HEX_CHARS_REGEX)
      # The percent sign is not followed by a hex digit: drop the percent
      # sign and the character after it.
      consumed = 2
    elsif !(data[pos + 2] =~ HEX_CHARS_REGEX)
      # Only the first of the two digits is hex: drop the percent sign and
      # both characters after it.
      consumed = 3
    else
      # By default only this single encoded byte ("%XX") is consumed; that
      # is what happens when the character turns out to be invalid.
      consumed = 3

      # The first encoded byte tells us how many bytes the character has.
      first_byte = '0x' + data[pos + 1, 2].upcase
      width = utf8_char_length_in_bytes(first_byte)

      # Try to collect that many encoded bytes starting at this percent sign.
      candidate = next_n_bytes_from(pos, width)

      if candidate.length == width
        encoded = candidate.join

        if valid_uri_encoded_utf8(encoded)
          kept << encoded

          # Every byte of the character was handled, so step past all of
          # its "%XX" triplets at once.
          consumed = width * 3
        end
      end
    end

    pos += consumed
  end

  kept
end
next_n_bytes_from(index, num_bytes) click to toggle source

Grab the next num_bytes URI-encoded bytes from the raw character array. Returns an array like ['%E2', '%9C', '%93']

# File lib/utf8-cleaner/uri_string.rb, line 101
# Collects +num_bytes+ consecutive "%XX" triplets from +data+, starting at
# position +index+.
#
# index     - position in +data+ where the first percent sign is expected.
# num_bytes - how many encoded bytes to collect.
#
# Returns an Array such as ['%E2', '%9C', '%93'], or an empty Array when
# +data+ is too short to hold that many triplets or when a percent sign is
# missing at any of the expected positions.
def next_n_bytes_from(index, num_bytes)
  needed = 3 * num_bytes
  return [] if index + needed > data.length

  (0...num_bytes).each_with_object([]) do |offset, triplets|
    start = index + (3 * offset)

    # A missing percent sign invalidates the whole character.
    return [] unless data[start] == '%'

    triplets << ('%' + data[start + 1, 2])
  end
end
utf8_char_length_in_bytes(first_byte) click to toggle source

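Determines the length of a UTF-8 character from its first byte. If the first byte is below 0xC0, the character is treated as a single byte; if it is between 0xC0 and 0xDF, the UTF-8 character has 2 bytes; if it is between 0xE0 and 0xEF, the UTF-8 character has 3 bytes; and if it is between 0xF0 and 0xFF, the UTF-8 character has 4 bytes. first_byte is a string like “0x13”.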

# File lib/utf8-cleaner/uri_string.rb, line 121
# Determines how many bytes make up the UTF-8 character that starts with
# +first_byte+.
#
# first_byte - a hexadecimal String such as "0xE2" (parsed with String#hex).
#
# Returns 1 when the byte is below 0xC0, 2 for 0xC0..0xDF, 3 for 0xE0..0xEF
# and 4 for 0xF0 and above.
def utf8_char_length_in_bytes(first_byte)
  value = first_byte.hex

  # The upper bounds are inclusive: 0xDF still starts a two-byte character
  # and 0xEF still starts a three-byte character.
  if value < 0xC0
    1
  elsif value <= 0xDF
    2
  elsif value <= 0xEF
    3
  else
    4
  end
end
valid_uri_encoded_utf8(string) click to toggle source
# File lib/utf8-cleaner/uri_string.rb, line 89
# Checks whether +string+ is well-formed URI-encoded UTF-8.
#
# The string passes when its unescaped form, tagged as UTF-8, has a valid
# encoding AND the string itself does not match
# INVALID_PERCENT_ENCODING_REGEX.
#
# Returns true or false. An ArgumentError whose message mentions
# "invalid byte sequence" is reported as false; any other ArgumentError is
# re-raised.
def valid_uri_encoded_utf8(string)
  decoded = URI::DEFAULT_PARSER.unescape(string).force_encoding('UTF-8')
  return false unless decoded.valid_encoding?

  string !~ INVALID_PERCENT_ENCODING_REGEX
rescue ArgumentError => e
  raise e unless e.message =~ /invalid byte sequence/

  false
end