class WTF8Fixer

Public Class Methods

fix(string) click to toggle source

Repairs a string's encoding: returns a UTF-8 string even when the input mixes ISO-8859-1 bytes with UTF-8 byte sequences.

Example:

>> WTF8Fixer.fix('café ' + 'café'.encode('iso-8859-1').force_encoding('utf-8'))
=> 'café café'

Arguments:

string: (String)
# File lib/wtf8-fixer.rb, line 12
# Public entry point: re-encodes +string+ as UTF-8.
#
# string - the String to repair; only its bytes are read, and the copy
#          returned by String#bytes is what gets consumed downstream.
#
# Returns whatever fix_bytes! builds from those bytes.
def self.fix(string)
  raw_bytes = string.bytes
  fix_bytes!(raw_bytes)
end

Private Class Methods

fix_bytes!(input) click to toggle source
# File lib/wtf8-fixer.rb, line 18
# Drains +input+ from the front, emitting UTF-8 bytes for each piece.
#
# input - Array of byte values (Integers); it is consumed (emptied) by this call.
#
# For every leading byte, num_bits decides the route:
# * a value of 0 or 1, or a value larger than the number of bytes left,
#   sends the byte through iso_to_unicode!;
# * otherwise unicode_to_unicode! is tried first, and an empty result from it
#   falls back to iso_to_unicode!.
#
# Returns a String built from the collected bytes and tagged as UTF-8.
def self.fix_bytes!(input)
  output = []

  until input.empty?
    width = num_bits(input.first)

    chunk =
      if width > 1 && width <= input.length
        decoded = unicode_to_unicode!(input)
        decoded.empty? ? iso_to_unicode!(input) : decoded
      else
        iso_to_unicode!(input)
      end

    # A helper may hand back a single Integer or an Array of Integers;
    # Array() normalises both before appending.
    output.concat(Array(chunk))
  end

  output.pack('C*').force_encoding('utf-8')
end
iso_to_unicode!(input) click to toggle source
# File lib/wtf8-fixer.rb, line 36
# Removes the first byte from +input+ and expresses it as UTF-8 byte value(s),
# treating the byte value as the code point.
#
# input - Array of byte values (Integers); its first element is shifted off.
#
# Returns the byte itself (an Integer) when its high bit is clear, otherwise
# a two-element Array holding the UTF-8 lead byte and continuation byte.
def self.iso_to_unicode!(input)
  byte = input.shift & 0xff
  return byte if byte < 0x80

  lead = 0xc0 | ((byte >> 6) & 0x1f) # 110xxxxx: upper bits of the code point
  trail = 0x80 | (byte & 0x3f)       # 10xxxxxx: lower six bits
  [lead, trail]
end
num_bits(b) click to toggle source
# File lib/wtf8-fixer.rb, line 61
# Classifies byte +b+ by its high-order bit pattern.
#
# b - an Integer byte value.
#
# Returns:
#   0 for 0xxxxxxx
#   1 for 10xxxxxx
#   2 for 110xxxxx
#   3 for 1110xxxx
#   4 for 11110xxx
#   0 for any other pattern
def self.num_bits(b)
  return 0 if (b & 0x80) == 0
  return 1 if (b & 0xC0) == 0x80
  return 2 if (b & 0xE0) == 0xC0
  return 3 if (b & 0xF0) == 0xE0
  return 4 if (b & 0xF8) == 0xF0

  0
end
unicode_to_unicode!(input) click to toggle source
# File lib/wtf8-fixer.rb, line 48
# Consumes one UTF-8 sequence from the front of +input+, but only when that
# sequence is well-formed.
#
# input - Array of byte values (Integers). Bytes are shifted off only when
#         something other than an empty Array is returned.
#
# Returns:
# * the first byte itself (an Integer) when num_bits reports 0 for it;
# * an empty Array, leaving +input+ untouched, when the first byte is a
#   continuation byte (num_bits == 1) or the candidate sequence is not
#   well-formed UTF-8;
# * otherwise the Array of bytes that make up the sequence.
def self.unicode_to_unicode!(input)
  n = num_bits input[0]
  return input.shift if n == 0
  return [] if n == 1

  candidate = input[0, n]

  # Checking only for 10xxxxxx trailing bytes is not enough: overlong forms
  # (e.g. lead bytes 0xC0/0xC1), UTF-16 surrogates and code points above
  # U+10FFFF would be copied through and leave the result invalid UTF-8.
  # Ruby's own validator rejects all of those, as well as a sequence cut
  # short by the end of the input.
  return [] unless candidate.pack('C*').force_encoding('utf-8').valid_encoding?

  input.shift(n)
end