module Condenser::EncodingUtils

Constants

BOM

Internal: Mapping unicode encodings to byte order markers.

CHARSET_SIZE
CHARSET_START

Internal: @charset bytes

Public Instance Methods

detect(str) click to toggle source

Public: Basic string encoding detector.

Attempts to parse any Unicode BOM otherwise falls back to the environment's external encoding.

str - ASCII-8BIT encoded String

Returns encoded String.

# File lib/condenser/encoding_utils.rb, line 21
# Public: Tag a string with an encoding, preferring a Unicode BOM.
#
# str - String to inspect (expected to be ASCII-8BIT).
#
# Returns the String produced by detect_unicode_bom; when that result is
# still binary it is re-tagged in place with Encoding.default_external.
def detect(str)
  decoded = detect_unicode_bom(str)
  untagged = decoded.encoding == Encoding::BINARY

  # No BOM claimed the bytes, so assume the environment's external encoding.
  decoded.force_encoding(Encoding.default_external) if untagged

  decoded
end
detect_css(str) click to toggle source

Public: Detect and strip @charset from CSS style sheet.

str - String.

Returns an encoded String.

# File lib/condenser/encoding_utils.rb, line 77
# Public: Detect the encoding of a CSS style sheet and strip its @charset rule.
#
# str - String holding the style sheet.
#
# Returns an encoded String. When scan_css_charset reports a name, the
# result is a copy tagged with that encoding and with the leading @charset
# rule removed; a result that is still binary is tagged as UTF-8.
# Encoding.find is called with the scanned name and raises for names Ruby
# does not know.
def detect_css(str)
  css = detect_unicode_bom(str)
  charset = scan_css_charset(css)

  if charset
    declared = Encoding.find(charset)
    css = css.dup
    css.force_encoding(declared)

    # Measure the rule in characters of the declared encoding so the slice
    # below removes exactly the rule from the re-tagged string.
    rule_length = "@charset \"#{charset}\";".encode(declared).size
    css.slice!(0, rule_length)
  end

  # Fallback to UTF-8
  css.force_encoding(Encoding::UTF_8) if css.encoding == Encoding::BINARY

  css
end
detect_html(str) click to toggle source

Public: Detect charset from HTML document.

Attempts to parse any Unicode BOM; otherwise falls back to the environment's external encoding.

str - String.

Returns an encoded String.

# File lib/condenser/encoding_utils.rb, line 143
# Public: Detect the encoding of an HTML document.
#
# str - String holding the document.
#
# Returns the String produced by detect_unicode_bom, re-tagged in place with
# Encoding.default_external when it is still binary.
def detect_html(str)
  detect_unicode_bom(str).tap do |html|
    # Fallback to environment's external encoding
    html.force_encoding(Encoding.default_external) if html.encoding == Encoding::BINARY
  end
end
detect_unicode(str) click to toggle source

Public: Detect Unicode string.

Attempts to parse Unicode BOM and falls back to UTF-8.

str - ASCII-8BIT encoded String

Returns encoded String.

# File lib/condenser/encoding_utils.rb, line 39
# Public: Tag a string as Unicode, preferring a BOM and defaulting to UTF-8.
#
# str - String to inspect (expected to be ASCII-8BIT).
#
# Returns the String produced by detect_unicode_bom, re-tagged in place as
# UTF-8 when it is still binary.
def detect_unicode(str)
  result = detect_unicode_bom(str)
  return result unless result.encoding == Encoding::BINARY

  # Fallback to UTF-8; force_encoding returns its receiver.
  result.force_encoding(Encoding::UTF_8)
end
detect_unicode_bom(str) click to toggle source

Public: Detect and strip BOM from possible unicode string.

str - ASCII-8BIT encoded String

Returns a UTF-8/16/32 encoded String without the BOM, or the original String if no BOM was present.

# File lib/condenser/encoding_utils.rb, line 56
# Public: Detect and strip a byte order marker from a possibly-Unicode string.
#
# str - String to inspect (expected to be ASCII-8BIT).
#
# Returns a copy tagged with the encoding of the first BOM entry whose marker
# matches the leading bytes, with those bytes removed; returns the original
# String untouched when no entry matches.
def detect_unicode_bom(str)
  # Only the first four bytes are compared against the markers.
  head = str.byteslice(0, 4).bytes

  BOM.each do |encoding, marker|
    next unless head[0, marker.size] == marker

    stripped = str.dup
    # Slice in binary so exactly marker.size bytes (not characters) go away.
    stripped.force_encoding(Encoding::BINARY)
    stripped.slice!(0, marker.size)
    stripped.force_encoding(encoding)
    return stripped
  end

  str
end
scan_css_charset(str) click to toggle source

Internal: Scan binary CSS string for @charset encoding name.

str - ASCII-8BIT encoded String

Returns encoding String name or nil.

# File lib/condenser/encoding_utils.rb, line 106
# Internal: Scan a binary CSS string for the encoding name in a leading
# @charset rule.
#
# str - String to scan byte by byte (expected to be ASCII-8BIT).
#
# Returns the encoding name as a String, or nil when the first line does not
# start with CHARSET_START or contains no closing double quote.
def scan_css_charset(str)
  collected = []
  seen = 0

  str.each_byte do |b|
    # Halt on line breaks
    break if b == 0x0A || b == 0x0D

    # Skip zero bytes; every other byte value is counted.
    next if b.zero?

    if seen == CHARSET_SIZE
      # The prefix is complete: it must match before the name is collected.
      break unless collected == CHARSET_START

      collected = []
    elsif seen > CHARSET_SIZE && b == 0x22
      # Closing double quote ends the name.
      return collected.pack('C*')
    end

    collected << b
    seen += 1
  end

  nil
end