class Libis::Ingester::FormatValidator

Protected Instance Methods

apply_formats(item, format_list) click to toggle source
# File lib/libis/ingester/tasks/format_validator.rb, line 102
# Stores format identification results in the properties of +item+.
#
# For a Libis::Ingester::FileItem the entry found under +item.fullpath+ in
# +format_list+ is copied into the 'mimetype', 'puid' and
# 'format_identification' properties. Any other item is iterated with +each+
# and every sub-item is handled recursively.
#
# @param item [Libis::Ingester::FileItem, #each] file item or container of items
# @param format_list [Hash, nil] format info hashes keyed by file path; a nil
#   list or a missing entry is treated as "nothing identified"
def apply_formats(item, format_list)

  if item.is_a? Libis::Ingester::FileItem
    # Guard against a missing list or a missing entry so that the documented
    # defaults below are applied instead of failing on nil.
    format = (format_list || {})[item.fullpath] || {}
    if format[:mimetype]
      debug "MIME type '#{format[:mimetype]}' detected.", item
    else
      warn "Could not determine MIME type. Using default 'application/octet-stream'.", item
    end
    item.properties['mimetype'] = format[:mimetype] || 'application/octet-stream'
    item.properties['puid'] = format[:puid] || 'fmt/unknown'
    item.properties['format_identification'] = format
  else
    item.each do |subitem|
      apply_formats(subitem, format_list)
    end
  end

end
collect_filepaths(item) click to toggle source
# File lib/libis/ingester/tasks/format_validator.rb, line 122
# Gathers the absolute file paths of +item+ and everything below it.
#
# @param item [Libis::Ingester::FileItem, #map] file item or container of items
# @return [String, Array<String>] the absolute path for a file item, otherwise
#   a flat list with the paths of all nested file items
def collect_filepaths(item)
  if item.is_a?(Libis::Ingester::FileItem)
    File.absolute_path(item.fullpath)
  else
    nested_paths = item.map { |child| collect_filepaths(child) }
    nested_paths.flatten.compact
  end
end
process(item) click to toggle source
# File lib/libis/ingester/tasks/format_validator.rb, line 39
# Validates the identified format of a single item.
#
# Two checks are performed:
# 1. If the item's 'puid' property denotes an encrypted or password protected
#    Microsoft Office document, the :encrypted_doc parameter decides what
#    happens: 'FAIL' raises, 'WARN' logs a warning, 'DUMMY' replaces the file
#    with a dummy document and logs a warning.
# 2. If the 'format_ext_mismatch' property is set, the :ext_mismatch parameter
#    decides what happens: 'FAIL' raises, 'WARN' logs a warning, 'FIX' rewrites
#    the 'original_path' property with the first known extension of the format
#    and saves the item; any other value only logs the warning.
#
# @param item work item providing +properties+, +filepath+ and +save!+
# @raise [Libis::WorkflowError] when a check is configured as 'FAIL', or when
#   'FIX' is requested but no extension is known for the format
# @raise [Libis::WorkflowAbort] when :encrypted_doc has an unsupported value
def process(item)

  msg = case item.properties['puid']
        when 'fmt/494'
          'Microsoft Office Encrypted Document'
        when 'fmt/754'
          'password protected Microsoft Word Document'
        when 'fmt/755'
          'password protected Microsoft Word Document Template'
        end

  if msg
    case parameter(:encrypted_doc)
    when 'FAIL'
      raise Libis::WorkflowError, "Found #{msg}: #{item.filepath}"
    when 'WARN'
      warn "Found #{msg}: #{item.filepath}"
    when 'DUMMY'
      # replace_with_dummy builds the "File <i>...</i> is a <b>...</b>" sentence
      # itself, so only the format description is passed; passing a full
      # sentence would duplicate the text in the generated document.
      replace_with_dummy(item, msg)
      warn "#{msg} '#{item.filepath}' was replaced with a dummy file"
    else
      raise Libis::WorkflowAbort, "Unknown value for encrypted_doc parameter encountered."
    end
  end

  if item.properties['format_ext_mismatch']
    # NOTE(review): this is the only property read with a symbol key; confirm
    # that the properties container accepts both symbol and string keys.
    format_type = item.properties[:format_type]
    extensions = format_type ? Libis::Format::TypeDatabase.type_extentions(format_type) : []
    message = 'Found document with wrong extension `%s`; format is %s%s, puid: %s, valid extensions: %s' %
        [
            File.extname(item.filepath),
            item.properties['format_name'],
            item.properties['format_version'].blank? ? '' : " (#{item.properties['format_version']})",
            item.properties['puid'],
            extensions.map {|x| ".#{x}"}.join(' ')
        ]
    case parameter(:ext_mismatch)
    when 'FAIL'
      raise Libis::WorkflowError, message
    when 'WARN'
      warn message
    when 'FIX'
      warn message
      if (ext = extensions.first)
        # Prefer a previously recorded original path; fall back to the file name.
        old_name = item.properties['original_path']
        old_name ||= File.basename(item.properties['filename'])
        new_name = File.join(File.dirname(old_name), "#{File.basename(old_name, '.*')}.#{ext}")
        item.properties['original_path'] = new_name
        item.save!
        warn "File will be renamed to '#{File.basename(new_name)}' in the repository."
      else
        message = 'Could not fix extenstion of file %s as no extension for the format (%s - %s - %s) is known in the type database' %
            [item.filepath, item.properties['puid'], item.properties['format_name'], item.properties['format_version']]
        raise Libis::WorkflowError, message
      end
    else
      # Unrecognised parameter values are treated like 'WARN'.
      warn message
    end
  end
end
replace_with_dummy(item, message) click to toggle source
# File lib/libis/ingester/tasks/format_validator.rb, line 129
# Replaces the file of +item+ with a generated .docx document that explains
# why the original file was rejected, then re-runs format identification on
# the generated file.
#
# The document is written to a directory named after the item's id inside the
# run's work dir and +item.filename+ is pointed at it.
#
# @param item work item providing +get_run+, +id+, +filename+, +filepath+ and
#   +fullpath+
# @param message [String] short description of what the rejected file is; it
#   is embedded in bold in the sentence "File <i>path</i> is a <b>...</b>"
def replace_with_dummy(item, message)
  work_dir = File.join(item.get_run.work_dir, item.id)
  # mkpath is a no-op for an existing directory, so no existence check is
  # needed (Dir.exists? is no longer available on Ruby 3.2+).
  FileUtils.mkpath(work_dir)
  file_path = File.join(work_dir, "#{File.basename(item.filename, '.*')}.docx")
  html = '<html><head></head><body><h1/><h1>%s</h1><h1/>%s</body></html>' % [
      'The preservation system rejected this file for the following reason:',
      "File <i>#{item.filepath}</i> is a <b>#{message}</b>"
  ]
  Htmltoword.config.custom_templates_path = File.join(Libis::Ingester::ROOT_DIR, 'config')
  Htmltoword::Document.create_and_save(html, file_path, 'Warning')
  item.filename = file_path
  # Identify the generated file so the item's format properties describe the
  # dummy document rather than the rejected original.
  result = Libis::Format::Identifier.get(item.fullpath) || {}
  process_messages(result, item)
  apply_formats(item, result[:formats])
end