class NauktisUtils::Duplicate

Class to find and handle duplicate files.

Attributes

handling_strategy[RW]

The strategy object invoked for each group of duplicate files; it must respond to handle(duplicates).

Public Class Methods

algorithm(name)
# File lib/nauktis_utils/duplicate.rb, line 6
def self.algorithm(name)
  key = name.to_sym
  @@algorithms ||= {}
  unless @@algorithms.has_key? key
    @@algorithms[:name] = proc { |file| File.basename(file).downcase }
    @@algorithms[:size] = proc { |file| File.size(file) }
    @@algorithms[:md5] = proc { |file| FileDigester.digest(file, :md5) }
    @@algorithms[:sha1] = proc { |file| FileDigester.digest(file, :sha1) }
    @@algorithms[:sha3] = proc { |file| FileDigester.digest(file, :sha3) }
  end
  @@algorithms.fetch key
end
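
algorithm returns a proc that maps a file path to a grouping key: the downcased basename for :name, the byte size for :size, or a FileDigester digest for :md5, :sha1 and :sha3. A minimal usage sketch (the file path below is hypothetical):

# Fetch the :md5 grouping proc and apply it to a file path (path is illustrative).
md5_key = NauktisUtils::Duplicate.algorithm(:md5)
key = md5_key.call('/tmp/example.txt')
# Files that produce the same key are candidate duplicates.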
new(handling_strategy)
# File lib/nauktis_utils/duplicate.rb, line 132
def initialize(handling_strategy)
  @handling_strategy = handling_strategy
end
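
The handling strategy is any object that responds to handle(duplicates), where duplicates is an array of paths considered identical. A sketch with a made-up LoggingStrategy:

# Hypothetical strategy: report every duplicate beyond the first file of each group.
class LoggingStrategy
  def handle(duplicates)
    duplicates.drop(1).each { |path| puts "duplicate: #{path}" }
  end
end

duplicate = NauktisUtils::Duplicate.new(LoggingStrategy.new)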

Public Instance Methods

clean(directories)
# File lib/nauktis_utils/duplicate.rb, line 136
def clean(directories)
  logger.info "Searching duplicates in #{directories}"
  directories.map! { |d| File.expand_path(d) }
  files = files_in(directories)
  logger.info "Number of files: #{files.size.to_s(:delimited)}"
  size_before = size_of(directories)
  logger.info "Total size: #{size_before.to_s(:human_size)}"

  @groupings = [self.class.algorithm(:size), self.class.algorithm(:md5), self.class.algorithm(:sha3)]
  multi_group_by(files, 0)

  size_after = size_of(directories)
  logger.info "Total size: #{size_after.to_s(:human_size)}"
  reduction_ratio = (100 * (size_before - size_after) / size_before.to_f).round(2)
  logger.info "Size reduced by #{reduction_ratio}% (#{size_after.to_s(:delimited)}/#{size_before.to_s(:delimited)})"
end
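
clean expands the given paths, groups their files by size, then MD5, then SHA-3, and hands every group that still contains more than one file to the handling strategy; the log output relies on ActiveSupport-style number formatting (to_s(:delimited), to_s(:human_size)). A usage sketch with hypothetical directories and the LoggingStrategy sketched above:

# Scan two directories (paths are illustrative) and let the strategy decide
# what to do with each group of identical files.
duplicate = NauktisUtils::Duplicate.new(LoggingStrategy.new)
duplicate.clean(['~/Pictures', '~/Backups/Pictures'])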

Private Instance Methods

files_in(directories)

Returns the list of regular files found recursively under the given directories; directories and symbolic links are skipped.

# File lib/nauktis_utils/duplicate.rb, line 176
def files_in(directories)
  files = []
  Find.find(*directories) do |path|
    unless File.directory?(path) or File.symlink?(path)
      files << File.expand_path(path)
    end
  end
  files.uniq
end
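
Find is Ruby's standard-library directory walker and accepts several roots at once, which is why the splat is used above. A standalone sketch with hypothetical paths:

require 'find'

# Collect regular files from two roots (paths are illustrative), skipping
# directories and symbolic links, exactly as files_in does.
paths = []
Find.find(File.expand_path('~/Pictures'), File.expand_path('~/Backups')) do |path|
  next if File.directory?(path) || File.symlink?(path)
  paths << path
end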
handle_duplicates(duplicates)
# File lib/nauktis_utils/duplicate.rb, line 165
def handle_duplicates(duplicates)
  # For extra safety we check a file doesn't appear twice.
  unless duplicates.uniq == duplicates
    s = "A file appears twice: #{duplicates}"
    logger.error s
    raise s
  end
  handling_strategy.handle(duplicates)
end
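
Because of the uniqueness check, a strategy can safely treat the array it receives as a set of distinct paths. A sketch of a destructive strategy (name and behaviour are illustrative, not part of the library):

# Hypothetical strategy: keep the first path of each group, delete the rest.
class DeleteStrategy
  def handle(duplicates)
    duplicates.drop(1).each { |path| File.delete(path) }
  end
end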
multi_group_by(files, index)
# File lib/nauktis_utils/duplicate.rb, line 155
def multi_group_by(files, index)
  if index >= @groupings.size
    handle_duplicates(files)
  else
    files.group_by(&@groupings[index]).values.each do |sub|
      multi_group_by(sub, index + 1) if sub.size > 1
    end
  end
end
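
Each stage partitions the current group with Enumerable#group_by, and only sub-groups with more than one member are passed to the next, more expensive key (size, then MD5, then SHA-3); once every key has been applied, the surviving group goes to handle_duplicates. One stage in isolation (paths and sizes are made up):

# A single grouping stage: partition by file size.
files = ['/tmp/a', '/tmp/b', '/tmp/c']
by_size = files.group_by { |f| File.size(f) }
# e.g. { 1024 => ['/tmp/a', '/tmp/b'], 2048 => ['/tmp/c'] }
# Only ['/tmp/a', '/tmp/b'] (size > 1) would be refined by the digest stages.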
size_of(directories)

Returns the total size, in bytes, of all files under the given directories.

# File lib/nauktis_utils/duplicate.rb, line 187
def size_of(directories)
  size = 0
  files_in(directories).each do |f|
    size += File.size(f)
  end
  size
end
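
The same total can be written with Enumerable#sum (Ruby 2.4+); a sketch assuming it runs inside the class, alongside files_in:

# Equivalent computation of the total size in bytes.
total_bytes = files_in(directories).sum { |f| File.size(f) }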