module Magic
Just to make things neater, split out the FFI part here
Constants
- ASCII_CHARSET
- CHUNK_SIZE
- CONTEXT_SIZE
- EXTENDED_ASCII_CHARSET
Currently, libmagic does not distinguish between the various extended ASCII charsets, apart from ISO-8859-1.
- LAST_ASCII_CHAR
- MAGIC_CHECK
- MAGIC_COMPRESS
- MAGIC_CONTINUE
- MAGIC_DEBUG
- MAGIC_DEVICES
- MAGIC_ERROR
- MAGIC_MIME
- MAGIC_MIME_ENCODING
- MAGIC_MIME_TYPE
- MAGIC_NONE
- MAGIC_NO_CHECK_APPTYPE
- MAGIC_NO_CHECK_ASCII
- MAGIC_NO_CHECK_COMPRESS
- MAGIC_NO_CHECK_ELF
- MAGIC_NO_CHECK_SOFT
- MAGIC_NO_CHECK_TAR
- MAGIC_NO_CHECK_TOKENS
- MAGIC_NO_CHECK_TROFF
- MAGIC_PRESERVE_ATIME
- MAGIC_RAW
- MAGIC_SYMLINK
- PROBLEMATIC_EXTENDED_ASCII_CHAR
- REGEX
- VERSION
Public Class Methods
file_charset(filename)
click to toggle source
# File lib/libmagic.rb, line 13
# Reports the charset of the file at +filename+.
#
# The MIME type string obtained from file_mime_type is handed to
# mime_type_to_charset, and whatever that returns is the result.
def file_charset(filename)
  mime_type = file_mime_type(filename)
  mime_type_to_charset(mime_type)
end
file_charset!(filename)
click to toggle source
Exhaustively checks file contents. The plan is you should always be able to trust the answer this method returns.
# File lib/libmagic.rb, line 18
# Determines the charset of the file at +filename+, scanning the whole file
# when the quick check reports plain ASCII.
#
# The answer from file_charset is returned unchanged unless it equals
# ASCII_CHARSET; in that case the file is opened and passed to io_charset
# for an exhaustive check of its contents.
def file_charset!(filename)
  quick_answer = file_charset(filename)
  return quick_answer unless quick_answer == ASCII_CHARSET

  # Open in binary mode: the scan inspects raw bytes, so newline conversion
  # (which text mode performs on Windows) must not alter what is read.
  File.open(filename, "rb") { |io| io_charset(io) } # try harder
end
io_charset(io)
click to toggle source
# File lib/libmagic.rb, line 28
# Determines the charset of the data readable from +io+.
#
# The stream is rewound before scanning and rewound again afterwards (even
# when an exception is raised). If collect_special_characters finds nothing,
# the answer is ASCII_CHARSET; otherwise the collected bytes are classified
# by string_charset.
def io_charset(io)
  io.rewind
  suspicious_bytes = collect_special_characters(io)
  if suspicious_bytes.empty?
    ASCII_CHARSET
  else
    string_charset(suspicious_bytes)
  end
ensure
  io.rewind
end
string_charset(text)
click to toggle source
# File lib/libmagic.rb, line 36
# Determines the charset of the in-memory string +text+.
#
# The charset derived from string_mime_type is returned as-is unless it is
# ASCII_CHARSET. In that case the bytes of +text+ are checked for
# PROBLEMATIC_EXTENDED_ASCII_CHAR, and EXTENDED_ASCII_CHARSET is returned if
# that byte is present.
def string_charset(text)
  detected = mime_type_to_charset(string_mime_type(text))
  return detected unless detected == ASCII_CHARSET

  has_problem_byte = text.each_byte.any? { |b| b == PROBLEMATIC_EXTENDED_ASCII_CHAR }
  has_problem_byte ? EXTENDED_ASCII_CHARSET : detected
end
Private Class Methods
collect_special_characters(io)
click to toggle source
# File lib/libmagic.rb, line 47
# Reads +io+ from its current position to EOF in CHUNK_SIZE pieces and
# returns a string made of every byte greater than LAST_ASCII_CHAR together
# with the bytes surrounding it.
#
# Two indices walk the buffer: leading_index is the byte being examined and
# trailing_index marks where the next emitted slice would start. While no
# non-ASCII byte is pending, trailing_index is kept at most CONTEXT_SIZE
# bytes behind leading_index. Once a non-ASCII byte has been seen, the slice
# is emitted as soon as more than CONTEXT_SIZE bytes have passed since the
# most recent one.
#
# Returns an empty string when no byte above LAST_ASCII_CHAR was found.
def collect_special_characters(io)
  # Unary plus keeps these accumulators mutable even if string literals are frozen.
  special_characters_with_context = +""
  buffer = +""
  leading_index = trailing_index = buffer_bytesize = 0 # cache buffer.bytesize for performance
  last_detection = nil # buffer index of the most recent non-ASCII byte not yet emitted

  while leading_index < buffer_bytesize || !io.eof?
    # Add to buffer if needed
    if leading_index == buffer_bytesize
      # Need to read more, but save whatever is after trailing_index in the current buffer
      buffer.slice!(0...trailing_index)
      # Adjust all indices to the shortened buffer
      last_detection -= trailing_index if last_detection
      leading_index -= trailing_index
      trailing_index = 0
      buffer << io.read(CHUNK_SIZE)
      buffer_bytesize = buffer.bytesize
    end

    byte = buffer.getbyte(leading_index)
    last_detection = leading_index if byte > LAST_ASCII_CHAR

    if last_detection
      if last_detection < (leading_index - CONTEXT_SIZE)
        # It has been CONTEXT_SIZE bytes since the last non-ASCII byte, so write this chunk to the results
        special_characters_with_context << buffer.byteslice(trailing_index...leading_index)
        trailing_index = leading_index + 1 # i.e. leading_index's value at the end of this iteration
        last_detection = nil
      end
    elsif trailing_index == leading_index - CONTEXT_SIZE
      # Advance the trailing index if we haven't seen a non-ASCII byte and it is CONTEXT_SIZE bytes behind
      trailing_index += 1
    end

    leading_index += 1
  end

  # Deal with leftovers, if present. A range is used because byteslice's
  # two-argument form takes (start, length), not (start, end).
  if last_detection
    special_characters_with_context << buffer.byteslice(trailing_index...leading_index)
  end

  special_characters_with_context
end
file_mime_type(filename)
click to toggle source
# File lib/libmagic.rb, line 158
# Asks magic_file for the description of the file at +filename+ using a
# cookie from load_cookie, and passes the raw answer through process_result
# (which raises when the answer is nil).
def file_mime_type(filename)
  cookie = load_cookie
  raw_result = magic_file(cookie, filename)
  process_result(cookie, raw_result)
end
mime_type_to_charset(mime_type)
click to toggle source
# File lib/libmagic.rb, line 96
# Extracts the charset from +mime_type+ using REGEX.
#
# Returns the first capture group, normalized by standardize_charset, or nil
# when REGEX does not match.
def mime_type_to_charset(mime_type)
  match = REGEX.match(mime_type)
  return nil unless match

  standardize_charset(match[1])
end
process_result(cookie, mime_type)
click to toggle source
# File lib/libmagic.rb, line 172
# Returns +mime_type+ untouched unless it is nil; a nil result is turned
# into an exception built from magic_error(cookie).
def process_result(cookie, mime_type)
  raise magic_error(cookie) if mime_type.nil?

  mime_type
end
standardize_charset(nonstandard)
click to toggle source
The file utility (libmagic) returns different charset names in different versions.
# File lib/libmagic.rb, line 105
# Maps the charset name "unknown-8bit" to "unknown"; every other value is
# returned unchanged.
def standardize_charset(nonstandard)
  return "unknown" if "unknown-8bit" == nonstandard

  nonstandard
end
string_mime_type(string)
click to toggle source
# File lib/libmagic.rb, line 153
# Asks magic_buffer for the description of the in-memory +string+ using a
# cookie from load_cookie, and passes the raw answer through process_result
# (which raises when the answer is nil).
def string_mime_type(string)
  cookie = load_cookie
  # The length handed to magic_buffer must be a byte count: String#size counts
  # characters, which under-reports (and so truncates) multibyte strings.
  process_result(cookie, magic_buffer(cookie, string, string.bytesize))
end