class Libis::Format::Tool::IdentificationTool

Attributes

bad_mimetypes[RW]
bad_puids[RW]

Public Class Methods

bad_mimetype(mimetype) click to toggle source
# File lib/libis/format/tool/identification_tool.rb, line 20
# Register an extra mimetype that the singleton tool instance should treat as
# "bad" (i.e. not a trustworthy identification).
#
# The instance-level #bad_mimetype is documented as a protected method, and a
# protected method cannot be called with an explicit receiver from class-method
# context (it raises NoMethodError). It is therefore dispatched with #send,
# which works whether the instance method is public or protected.
#
# @param mimetype [String, nil] the mimetype to register
# @return whatever the instance-level #bad_mimetype returns
def self.bad_mimetype(mimetype)
  instance.send(:bad_mimetype, mimetype)
end
new() click to toggle source
# File lib/libis/format/tool/identification_tool.rb, line 164
# Set up the default lists of mimetypes and PUIDs that are considered "bad".
# #annotate consults both lists when deciding whether to look a mimetype up
# from the PUID and whether to assign the weak-detection score.
def initialize
  @bad_mimetypes = [
    nil,                        # no mimetype at all
    '',                         # empty mimetype
    'None',                     # literal placeholder value
    'application/octet-stream'  # generic binary type
  ]
  @bad_puids = [
    nil,           # no PUID at all
    'fmt/unknown'  # literal placeholder value
  ]
end
run(file, recursive = false, options = {}) click to toggle source
# File lib/libis/format/tool/identification_tool.rb, line 24
# Entry point: dispatch on the kind of +file+ argument.
#
# * an Array is handed to run_list;
# * a String naming an existing, readable directory is handed to run_dir;
# * a String naming an existing, readable regular file is handed to the
#   singleton instance's #run.
#
# @param file [String, Array] path to a file or directory, or a list passed on to run_list
# @param recursive [Boolean] forwarded to run_dir when +file+ is a directory
# @param options [Hash, nil] forwarded to the delegate; nil is replaced by an empty Hash
# @return the result of the delegate that was selected
# @raise [ArgumentError] when +file+ is none of the supported kinds
def self.run(file, recursive = false, options = {})
  options ||= {}
  return run_list(file, options) if file.is_a?(Array)

  # File.exist? is used because the File.exists? alias was removed in Ruby 3.2.
  if file.is_a?(String) && File.exist?(file) && File.readable?(file)
    return run_dir(file, recursive, options) if File.directory?(file)
    return instance.run(file, options) if File.file?(file)
  end

  raise ArgumentError,
        'IdentificationTool: file argument should be a path to an existing file or directory or a list of those'
end
run_dir(file, recursive = true, options = {}) click to toggle source
# File lib/libis/format/tool/identification_tool.rb, line 39
# Class-level convenience wrapper: forwards to #run_dir on the singleton instance.
#
# @param file [String] passed through unchanged
# @param recursive [Boolean] passed through unchanged (defaults to true here)
# @param options [Hash] passed through unchanged
# @return whatever the instance-level #run_dir returns
def self.run_dir(file, recursive = true, options = {})
  instance.run_dir(file, recursive, options)
end
run_list(filelist , options = {}) click to toggle source
# File lib/libis/format/tool/identification_tool.rb, line 43
# Class-level convenience wrapper: forwards to #run_list on the singleton instance.
#
# @param filelist passed through unchanged
# @param options [Hash] passed through unchanged
# @return whatever the instance-level #run_list returns
def self.run_list(filelist, options = {})
  instance.run_list(filelist, options)
end

Protected Instance Methods

annotate(result) click to toggle source

Enhance the output with mimetype and score

# File lib/libis/format/tool/identification_tool.rb, line 95
# Enrich one identification result with a usable mimetype and a confidence score.
#
# Scoring rules:
# * base score is 5, or 1 when the mimetype is (still) one of bad_mimetypes;
# * match type 'signature' adds 2, 'container' adds 4, 'extension' subtracts 2,
#   'magic' subtracts 3, anything else leaves the score alone;
# * a mimetype of 'application/zip' subtracts a further 2.
#
# @param result [Hash] identification result; it is modified in place
# @return [Hash] the same +result+ object, with :mimetype, :matchtype and :score updated
def annotate(result)
  # A bad mimetype is replaced by the one registered for the PUID, provided the PUID itself is usable
  if bad_mimetypes.include?(result[:mimetype])
    result[:mimetype] = get_mimetype(result[:puid]) unless bad_puids.include?(result[:puid])
  end

  # Let the format library normalize the result
  Libis::Format::Library.normalize(result)

  # Weak detections (mimetype still bad) start at 1, all others at 5
  result[:score] = bad_mimetypes.include?(result[:mimetype]) ? 1 : 5

  # Freeze every String value currently in the result
  result.each_value { |value| value.freeze if value.is_a?(String) }

  # Score correction per (lower-cased) match type. Signature and container matches are rewarded;
  # extension matches are weak and magic code (file tool) is to be trusted even less.
  # (An extra bonus for a matching file extension was once sketched for the first two, but is not active.)
  result[:matchtype] = result[:matchtype].to_s.downcase
  matchtype_bonus = {
    'signature' => 2,
    'container' => 4,
    'extension' => -2,
    'magic' => -3
  }
  result[:score] += matchtype_bonus.fetch(result[:matchtype], 0)

  # A plain zip detection may hide one of the many zip-based formats (e.g. epub, Office OpenXML,
  # OpenDocument, jar, maff, svx), so it is trusted less.
  result[:score] -= 2 if result[:mimetype] == 'application/zip'

  result
end
bad_mimetype(mimetype) click to toggle source
# File lib/libis/format/tool/identification_tool.rb, line 169
# Append +mimetype+ to this tool's list of bad mimetypes.
#
# @param mimetype [String, nil] the mimetype to add
# @return [Array] the updated @bad_mimetypes list itself
def bad_mimetype(mimetype)
  @bad_mimetypes.push(mimetype)
end
create_list_file(filelist) { |path| ... } click to toggle source
# File lib/libis/format/tool/identification_tool.rb, line 49
# Write the entries of +filelist+ to a temporary file, one per line, and yield
# the path of that file to the block. The temporary file is removed afterwards,
# whether or not the block (or the writing) raised.
#
# The cleanup only starts once the Tempfile exists, so a failure to create it is
# reported as-is rather than masked by a NoMethodError on nil. The handle is also
# closed when writing fails part-way.
#
# @param filelist [#each] entries to write; each is interpolated into a line
# @yieldparam path [String] path of the closed temporary list file
# @return the value returned by the block
def create_list_file(filelist)
  list_file = Tempfile.new(%w[file .list])
  begin
    filelist.each { |fname| list_file.write "#{fname}\n" }
    # Close before yielding so the content is flushed for whoever reads the path
    list_file.close
    yield(list_file.path)
  ensure
    list_file.close unless list_file.closed?
    list_file.unlink
  end
end
find_files(dir, recurse = true) click to toggle source
# File lib/libis/format/tool/identification_tool.rb, line 60
# List the regular files below +dir+ by running the external `find` command
# (with -L, -type f and -print; -maxdepth 1 is added when +recurse+ is false).
#
# Anything the command reports on its error channel is emitted through #warn.
#
# @param dir [String] directory to search; passed through #escape_for_string first
# @param recurse [Boolean] when false the search is limited with `-maxdepth 1`
# @return the :out entry of the command result
def find_files(dir, recurse = true)
  find_args = ['-L', dir.escape_for_string]
  find_args.concat(%w[-maxdepth 1]) unless recurse
  find_args.concat(%w[-type f -print])

  result = ::Libis::Tools::Command.run('find', *find_args)

  errors = result[:err]
  warn "Find command errors: #{errors.join("\n")}" unless errors.empty?

  result[:out]
end
get_mimetype(puid) click to toggle source
# File lib/libis/format/tool/identification_tool.rb, line 154
# Look up the mimetype registered for a PUID in the format library.
# The lookup is best-effort: any StandardError raised by it yields nil.
#
# @param puid the PUID to look up
# @return the :mimetype field found by the library, or nil when the lookup raised
def get_mimetype(puid)
  ::Libis::Format::Library.get_field_by(:puid, puid, :mimetype)
rescue StandardError
  nil
end
get_puid(mimetype) click to toggle source
# File lib/libis/format/tool/identification_tool.rb, line 158
# Look up the PUID registered for a mimetype in the format library.
# The lookup is best-effort: any StandardError raised by it yields nil.
#
# @param mimetype the mimetype to look up
# @return the :puid field found by the library, or nil when the lookup raised
def get_puid(mimetype)
  ::Libis::Format::Library.get_field_by(:mimetype, mimetype, :puid)
rescue StandardError
  nil
end
process_output(output) click to toggle source

Reformat output to make it easier to post-process and decide on the preferred format

input format: [

{ filepath: <filename>, mimetype: <mimetype>, matchtype: <matchtype>, ... }

]

output format:

{ <filename> => [<result>, ...], ... }

<result> is the enhanced Hash output of the identification tool:

{ mimetype: <mimetype>, puid: <puid>, matchtype: <matchtype>, score: <score>, ...}
# File lib/libis/format/tool/identification_tool.rb, line 85
# Group raw identification results by absolute file path.
#
# Each entry has its :filepath key removed (the entry is modified in place), is
# passed through #annotate, and is appended to the list kept for that path.
#
# @param output [Enumerable<Hash>] raw results, each carrying a :filepath entry
# @return [Hash{String => Array}] frozen absolute path => annotated results, in input order
def process_output(output)
  output.each_with_object({}) do |entry, grouped|
    path = File.absolute_path(entry.delete(:filepath)).freeze
    (grouped[path] ||= []) << annotate(entry)
  end
end