module Magic

Just to make things neater, split out the FFI part here

Constants

ASCII_CHARSET
CHUNK_SIZE
CONTEXT_SIZE
EXTENDED_ASCII_CHARSET

Currently, libmagic does not distinguish between the various extended ASCII charsets, with the exception of ISO-8859-1.

LAST_ASCII_CHAR
MAGIC_CHECK
MAGIC_COMPRESS
MAGIC_CONTINUE
MAGIC_DEBUG
MAGIC_DEVICES
MAGIC_ERROR
MAGIC_MIME
MAGIC_MIME_ENCODING
MAGIC_MIME_TYPE
MAGIC_NONE
MAGIC_NO_CHECK_APPTYPE
MAGIC_NO_CHECK_ASCII
MAGIC_NO_CHECK_COMPRESS
MAGIC_NO_CHECK_ELF
MAGIC_NO_CHECK_SOFT
MAGIC_NO_CHECK_TAR
MAGIC_NO_CHECK_TOKENS
MAGIC_NO_CHECK_TROFF
MAGIC_PRESERVE_ATIME
MAGIC_RAW
PROBLEMATIC_EXTENDED_ASCII_CHAR
REGEX
VERSION

Public Class Methods

file_charset(filename) click to toggle source
# File lib/libmagic.rb, line 13
# Determines the MIME type of +filename+ with file_mime_type and maps that
# MIME type to a charset with mime_type_to_charset.
#
# Returns whatever mime_type_to_charset returns for that MIME type.
def file_charset(filename)
  mime_type = file_mime_type(filename)
  mime_type_to_charset(mime_type)
end
file_charset!(filename) click to toggle source

Exhaustively checks the file's contents. The intent is that you can always trust the answer this method returns.

# File lib/libmagic.rb, line 18
# Thorough variant of file_charset.
#
# The quick answer from file_charset is returned as-is unless it equals
# ASCII_CHARSET. In that case the file is opened and handed to io_charset,
# and that result is returned instead.
def file_charset!(filename)
  fast_guess = file_charset(filename)
  return fast_guess unless fast_guess == ASCII_CHARSET

  # try harder
  File.open(filename) { |stream| io_charset(stream) }
end
io_charset(io) click to toggle source
# File lib/libmagic.rb, line 28
# Determines the charset of +io+, starting from the beginning of the stream.
#
# The stream is rewound, then passed to collect_special_characters. If that
# returns an empty result the answer is ASCII_CHARSET; otherwise the collected
# text is classified by string_charset.
#
# +io+ is rewound again before returning, even if an error is raised.
def io_charset(io)
  io.rewind
  collected = collect_special_characters(io)
  if collected.empty?
    ASCII_CHARSET
  else
    string_charset(collected)
  end
ensure
  io.rewind
end
string_charset(text) click to toggle source
# File lib/libmagic.rb, line 36
# Determines the charset of +text+.
#
# The quick answer is the charset derived from string_mime_type(text) via
# mime_type_to_charset. When that answer equals ASCII_CHARSET, the bytes of
# +text+ are also scanned: if any byte equals PROBLEMATIC_EXTENDED_ASCII_CHAR
# the result is EXTENDED_ASCII_CHARSET. In every other case the quick answer
# is returned.
def string_charset(text)
  detected = mime_type_to_charset(string_mime_type(text))
  return detected unless detected == ASCII_CHARSET

  has_problem_byte = text.each_byte.any? { |b| b == PROBLEMATIC_EXTENDED_ASCII_CHAR }
  has_problem_byte ? EXTENDED_ASCII_CHARSET : detected
end

Private Class Methods

collect_special_characters(io) click to toggle source
# File lib/libmagic.rb, line 47
# Reads +io+ from its current position to EOF, CHUNK_SIZE bytes per read, and
# collects every byte greater than LAST_ASCII_CHAR together with some
# surrounding context.
#
# Each collected run starts up to CONTEXT_SIZE bytes before a detected byte and
# ends once CONTEXT_SIZE bytes have followed the most recent detected byte (or
# at the end of the input). Runs are appended to one another with no separator.
#
# Returns the concatenated runs as a String; the String is empty when no byte
# above LAST_ASCII_CHAR was seen. The position of +io+ is not restored here.
def collect_special_characters(io)
  special_characters_with_context = ""
  buffer = ""
  # leading_index is the byte being examined; trailing_index marks the start of
  # the context window that would be emitted if a special byte is found.
  leading_index = trailing_index = buffer_bytesize = 0 # cache buffer.bytesize for performance
  last_detection = nil # buffer index of the most recent byte > LAST_ASCII_CHAR, if one is pending

  while leading_index < buffer_bytesize || !io.eof? do
    # Add to buffer if needed
    if leading_index == buffer_bytesize
      # Need to read more, but save whatever is after trailing_index in the current buffer
      buffer.slice!(0...trailing_index)
      # adjust all indices
      last_detection -= trailing_index if last_detection
      leading_index  -= trailing_index
      trailing_index -= trailing_index
      buffer << io.read(CHUNK_SIZE)
      buffer_bytesize = buffer.bytesize
    end

    byte = buffer.getbyte(leading_index)
    if byte > LAST_ASCII_CHAR
      last_detection = leading_index
    end

    if last_detection
      if last_detection < (leading_index - CONTEXT_SIZE)
        # It has been CONTEXT_SIZE bytes since the last non-ascii character, so we should write this chunk to the results
        special_characters_with_context << buffer.byteslice(trailing_index...leading_index)
        # The byte at leading_index is not part of the run just written, and the
        # next context window starts after it.
        trailing_index = leading_index + 1 # Just think of it as trailing_index = "leading_index's value at the end of the loop"
        last_detection = nil
      end
    else
      # Advance the trailing index if we haven't seen a non-ascii byte and the trailing index is CONTEXT_SIZE bytes behind
      if trailing_index == leading_index - CONTEXT_SIZE
        trailing_index += 1
      end
    end

    leading_index += 1
  end

  # Deal with leftovers, if present
  if last_detection
    # Use a range here: the two-argument form of byteslice takes (start, length),
    # not (start, end), and only produced the right bytes because it clamps at
    # the end of the buffer.
    special_characters_with_context << buffer.byteslice(trailing_index...leading_index)
  end

  return special_characters_with_context
end
file_mime_type(filename) click to toggle source
# File lib/libmagic.rb, line 158
# Obtains a cookie from load_cookie, calls magic_file with it for +filename+,
# and passes the outcome through process_result.
#
# Returns whatever process_result returns.
def file_mime_type(filename)
  handle = load_cookie
  raw_result = magic_file(handle, filename)
  process_result(handle, raw_result)
end
mime_type_to_charset(mime_type) click to toggle source
# File lib/libmagic.rb, line 96
# Extracts a charset from +mime_type+ using REGEX.
#
# Returns nil when REGEX does not match; otherwise returns the first capture
# group after passing it through standardize_charset.
def mime_type_to_charset(mime_type)
  captured = REGEX.match(mime_type)
  return nil unless captured

  standardize_charset(captured[1])
end
process_result(cookie, mime_type) click to toggle source
# File lib/libmagic.rb, line 172
# Returns +mime_type+ unchanged unless it is nil.
#
# When +mime_type+ is nil, raises whatever magic_error(cookie) returns.
def process_result(cookie, mime_type)
  raise magic_error(cookie) if mime_type.nil?

  mime_type
end
standardize_charset(nonstandard) click to toggle source

The `file` utility returns different charset names in different versions, so they are normalized here.

# File lib/libmagic.rb, line 105
# Normalizes a charset name: "unknown-8bit" becomes "unknown"; any other value
# is returned unchanged.
def standardize_charset(nonstandard)
  return "unknown" if "unknown-8bit" == nonstandard

  nonstandard
end
string_mime_type(string) click to toggle source
# File lib/libmagic.rb, line 153
# Obtains a cookie from load_cookie, calls magic_buffer on the contents of
# +string+, and passes the outcome through process_result.
#
# Returns whatever process_result returns.
def string_mime_type(string)
  cookie = load_cookie
  # The length handed to magic_buffer must be a byte count. String#size counts
  # characters, which is smaller than the byte length for multibyte text.
  buffer_result = magic_buffer(cookie, string, string.bytesize)
  process_result(cookie, buffer_result)
end