class Libis::Format::Tool::IdentificationTool
Attributes
bad_mimetypes[RW]
bad_puids[RW]
Public Class Methods
bad_mimetype(mimetype)
click to toggle source
# File lib/libis/format/tool/identification_tool.rb, line 20
# Class-level convenience wrapper: forwards +mimetype+ to the
# instance-level #bad_mimetype of the object returned by +instance+.
#
# @param mimetype [Object] value handed unchanged to the instance method
# @return whatever the instance-level #bad_mimetype returns
def self.bad_mimetype(mimetype)
  instance.bad_mimetype(mimetype)
end
new()
click to toggle source
# File lib/libis/format/tool/identification_tool.rb, line 164
# Set up the default lists of mimetype and PUID values that are treated
# as "bad" by this tool.
def initialize
  @bad_mimetypes = [
    nil,
    '',
    'None',
    'application/octet-stream'
  ]
  @bad_puids = [
    nil,
    'fmt/unknown'
  ]
end
run(file, recursive = false, options = {})
click to toggle source
# File lib/libis/format/tool/identification_tool.rb, line 24
# Entry point that dispatches on the kind of +file+ argument:
#
# * an Array is handed to run_list;
# * a String naming an existing, readable directory is handed to run_dir;
# * a String naming an existing, readable regular file is handed to the
#   instance-level #run.
#
# @param file [String, Array] path of a file or directory, or a list
# @param recursive [Boolean] only forwarded to run_dir
# @param options [Hash, nil] forwarded to the delegate; nil is treated as {}
# @return whatever the selected delegate returns
# @raise [ArgumentError] when +file+ matches none of the cases above
def self.run(file, recursive = false, options = {})
  options ||= {}
  return run_list(file, options) if file.is_a?(Array)

  # File.exist? is used because File.exists? was deprecated and has been
  # removed in Ruby 3.2, where it raises NoMethodError.
  if file.is_a?(String) && File.exist?(file) && File.readable?(file)
    return run_dir(file, recursive, options) if File.directory?(file)
    return instance.run(file, options) if File.file?(file)
  end

  raise ArgumentError, 'IdentificationTool: file argument should be a path to an existing file or directory or a list of those'
end
run_dir(file, recursive = true, options = {})
click to toggle source
# File lib/libis/format/tool/identification_tool.rb, line 39
# Class-level convenience wrapper around the instance-level #run_dir.
#
# @param file [Object] forwarded unchanged as the first argument
# @param recursive [Boolean] forwarded unchanged (defaults to true)
# @param options [Hash] forwarded unchanged
# @return whatever the instance-level #run_dir returns
def self.run_dir(file, recursive = true, options = {})
  instance.run_dir(file, recursive, options)
end
run_list(filelist , options = {})
click to toggle source
# File lib/libis/format/tool/identification_tool.rb, line 43
# Class-level convenience wrapper around the instance-level #run_list.
#
# @param filelist [Object] forwarded unchanged as the first argument
# @param options [Hash] forwarded unchanged
# @return whatever the instance-level #run_list returns
def self.run_list(filelist, options = {})
  instance.run_list(filelist, options)
end
Protected Instance Methods
annotate(result)
click to toggle source
Enhance the output with mimetype and score
# File lib/libis/format/tool/identification_tool.rb, line 95
# Enrich one identification result with a mimetype (when it can be derived
# from the PUID) and a confidence score. +result+ is modified in place and
# is also the return value.
#
# Scoring: the base score is 5, or 1 when the mimetype is in bad_mimetypes.
# The match type then shifts it: 'signature' +2, 'container' +4,
# 'extension' -2, 'magic' -3, anything else unchanged. A final 2 points are
# taken off for 'application/zip'.
#
# @param result [Hash] reads :mimetype, :puid and :matchtype; writes
#   :mimetype, :matchtype and :score
# @return [Hash] the same +result+ object
def annotate(result)
  # Derive the mimetype from the PUID when the reported mimetype is unusable
  # but the PUID itself is not.
  mimetype_unusable = bad_mimetypes.include?(result[:mimetype])
  if mimetype_unusable && !bad_puids.include?(result[:puid])
    result[:mimetype] = get_mimetype(result[:puid])
  end

  # Normalize the mimetype
  Libis::Format::Library.normalize(result)

  # A weak detection (still no usable mimetype) starts at 1, all others at 5.
  result[:score] = bad_mimetypes.include?(result[:mimetype]) ? 1 : 5

  # Freeze every String value currently held by the result.
  result.each_value { |value| value.freeze if value.is_a?(String) }

  # Score adjustment per (lower-cased) match type:
  # * signature and container matches are rewarded;
  # * an extension match is the weakest identification and is penalized;
  # * magic code (file tool) is to be trusted even less;
  # * any other match type leaves the score untouched.
  # An extra +1 for a file extension listed among the format's known
  # extensions was drafted for signature and container matches but is
  # not active.
  adjustments = {
    'signature' => 2,
    'container' => 4,
    'extension' => -2,
    'magic' => -3
  }
  match_kind = result[:matchtype].to_s.downcase
  result[:matchtype] = match_kind
  adjustment = adjustments[match_kind]
  result[:score] += adjustment if adjustment

  # A plain zip detection may hide one of the many zip-based formats (e.g.
  # epub, Office OpenXML, OpenDocument, jar, maff, svx), so it is trusted less.
  result[:score] -= 2 if result[:mimetype] == 'application/zip'

  result
end
bad_mimetype(mimetype)
click to toggle source
# File lib/libis/format/tool/identification_tool.rb, line 169
# Append +mimetype+ to the list of mimetypes considered bad.
#
# @param mimetype [Object] value to add to @bad_mimetypes
# @return [Array] the updated @bad_mimetypes list
def bad_mimetype(mimetype)
  @bad_mimetypes.push(mimetype)
end
create_list_file(filelist) { |path| ... }
click to toggle source
# File lib/libis/format/tool/identification_tool.rb, line 49
# Write every entry of +filelist+ on its own line to a temporary file,
# yield the path of that file to the block, and remove the file afterwards.
#
# @param filelist [#each] entries to write, one per line
# @yieldparam path [String] path of the closed temporary list file
# @return the value returned by the block
def create_list_file(filelist)
  # Created outside the begin/ensure: if creation itself fails there is
  # nothing to clean up, and the original error must propagate untouched.
  list_file = Tempfile.new(%w[file .list])
  begin
    filelist.each { |fname| list_file.write "#{fname}\n" }
    # Close before yielding so the content is flushed to disk.
    list_file.close
    yield(list_file.path)
  ensure
    # Also release the handle when writing failed before the close above.
    list_file.close unless list_file.closed?
    list_file.unlink
  end
end
find_files(dir, recurse = true)
click to toggle source
# File lib/libis/format/tool/identification_tool.rb, line 60
# List the regular files below +dir+ by running the external `find` command
# through ::Libis::Tools::Command.run.
#
# @param dir [String] directory to search; passed through
#   +escape_for_string+ (helper defined elsewhere) before use
# @param recurse [Boolean] when false, `-maxdepth 1` is added to the command
# @return the :out entry of the command result
def find_files(dir, recurse = true)
  # -L is passed first, then the start directory, then the filters.
  find_args = ['-L', dir.escape_for_string]
  find_args.push('-maxdepth', '1') unless recurse
  find_args.push('-type', 'f', '-print')

  result = ::Libis::Tools::Command.run('find', *find_args)

  # Errors reported by the command are only warned about, not raised.
  errors = result[:err]
  warn "Find command errors: #{errors.join("\n")}" unless errors.empty?

  result[:out]
end
get_mimetype(puid)
click to toggle source
# File lib/libis/format/tool/identification_tool.rb, line 154
# Look up the mimetype registered for +puid+ through
# ::Libis::Format::Library.get_field_by.
#
# @param puid [Object] PUID used as the lookup key
# @return the value returned by the lookup, or nil when the lookup raises
#   a StandardError
def get_mimetype(puid)
  ::Libis::Format::Library.get_field_by(:puid, puid, :mimetype)
rescue StandardError
  # Best effort: any lookup failure is reported as "no mimetype".
  nil
end
get_puid(mimetype)
click to toggle source
# File lib/libis/format/tool/identification_tool.rb, line 158
# Look up the PUID registered for +mimetype+ through
# ::Libis::Format::Library.get_field_by.
#
# @param mimetype [Object] mimetype used as the lookup key
# @return the value returned by the lookup, or nil when the lookup raises
#   a StandardError
def get_puid(mimetype)
  ::Libis::Format::Library.get_field_by(:mimetype, mimetype, :puid)
rescue StandardError
  # Best effort: any lookup failure is reported as "no PUID".
  nil
end
process_output(output)
click to toggle source
Reformat output to make it easier to post-process and decide on the preferred format
input format: [
{ filepath: <filename>, mimetype: <mimetype>, matchtype: <matchtype>, ... }
]
output format:
{ <filename> => [<result>, ...], ... }
<result> is the enhanced Hash output of the identification tool:
{ mimetype: <mimetype>, puid: <puid>, matchtype: <matchtype>, score: <score>, ...}
# File lib/libis/format/tool/identification_tool.rb, line 85
# Group raw identification results by absolute file path.
#
# Each entry has its :filepath key removed (the entry is mutated), is
# passed through #annotate, and is appended to the list kept under the
# frozen absolute path.
#
# @param output [Enumerable<Hash>] raw results, each with a :filepath key
# @return [Hash{String => Array}] absolute path => annotated results, in
#   the order they were encountered
def process_output(output)
  output.each_with_object({}) do |entry, grouped|
    path = File.absolute_path(entry.delete(:filepath)).freeze
    (grouped[path] ||= []) << annotate(entry)
  end
end