class NauktisUtils::Duplicate
Class to find and handle duplicate files.
Attributes
Public Class Methods
algorithm(name)
click to toggle source
# File lib/nauktis_utils/duplicate.rb, line 6
# Returns the callable used to compute a grouping key for a file.
#
# Known algorithms:
# [:name] lower-cased base name of the file
# [:size] size reported by File.size
# [:md5, :sha1, :sha3] result of FileDigester.digest with that algorithm
#
# @param name [String, Symbol] algorithm name (converted with +to_sym+)
# @return [Proc] a proc taking a file path and returning its grouping key
# @raise [KeyError] when +name+ is not one of the known algorithms
def self.algorithm(name)
  # Memoized in a class-level instance variable rather than a @@class
  # variable, so the table is built exactly once and is not shared
  # implicitly across the whole inheritance tree.
  @algorithms ||= {
    name: proc { |file| File.basename(file).downcase },
    size: proc { |file| File.size(file) },
    md5: proc { |file| FileDigester.digest(file, :md5) },
    sha1: proc { |file| FileDigester.digest(file, :sha1) },
    sha3: proc { |file| FileDigester.digest(file, :sha3) }
  }
  @algorithms.fetch(name.to_sym)
end
new(handling_strategy)
click to toggle source
# File lib/nauktis_utils/duplicate.rb, line 132
# Builds a duplicate finder around the given strategy.
#
# @param handling_strategy [#handle] object that later receives each group
#   of duplicate files through its +handle+ method
def initialize(handling_strategy)
  @handling_strategy = handling_strategy
end
Public Instance Methods
clean(directories)
click to toggle source
# File lib/nauktis_utils/duplicate.rb, line 136
# Searches the given directories for duplicate files and hands every group
# of duplicates to the handling strategy.
#
# Files are grouped successively by size, MD5 and SHA3; only files that
# agree on all three keys are treated as duplicates. The total size of the
# directories is logged before and after, along with the reduction ratio.
#
# @param directories [Array<String>] directories to scan; the array itself
#   is not modified
def clean(directories)
  logger.info "Searching duplicates in #{directories}"
  # Build a new array instead of map! so the caller's argument is untouched.
  directories = directories.map { |d| File.expand_path(d) }
  files = files_in(directories)
  # NOTE(review): to_s(:delimited) / to_s(:human_size) rely on an extension
  # of Integer#to_s (presumably ActiveSupport) - confirm it is loaded.
  logger.info "Number of files: #{files.size.to_s(:delimited)}"
  size_before = size_of(directories)
  logger.info "Total size: #{size_before.to_s(:human_size)}"
  @groupings = %i[size md5 sha3].map { |name| self.class.algorithm(name) }
  multi_group_by(files, 0)
  size_after = size_of(directories)
  logger.info "Total size: #{size_after.to_s(:human_size)}"
  # An empty tree would otherwise compute 0 / 0.0 (NaN) as the ratio.
  reduction_ratio =
    if size_before.zero?
      0.0
    else
      (100 * (size_before - size_after) / size_before.to_f).round(2)
    end
  logger.info "Size reduced by #{reduction_ratio}% (#{size_after.to_s(:delimited)}/#{size_before.to_s(:delimited)})"
end
Private Instance Methods
files_in(directories)
click to toggle source
Returns the list of files in the directories provided
# File lib/nauktis_utils/duplicate.rb, line 176
# Returns the list of regular files found under the directories provided.
#
# Directories, symbolic links and special files (FIFOs, sockets, device
# nodes) are skipped. Each path is expanded with File.expand_path and the
# result contains no repeated path, even when the given directories overlap.
#
# @param directories [Array<String>] directories to walk
# @return [Array<String>] expanded paths of the regular files found
def files_in(directories)
  files = []
  Find.find(*directories) do |path|
    # File.file? follows symlinks, so links are excluded explicitly.
    # Requiring a regular file keeps FIFOs and device nodes away from the
    # digest algorithms, where reading them could block or never finish.
    next if File.symlink?(path) || !File.file?(path)

    files << File.expand_path(path)
  end
  files.uniq
end
handle_duplicates(duplicates)
click to toggle source
# File lib/nauktis_utils/duplicate.rb, line 165
# Passes a group of duplicate files on to the handling strategy.
#
# @param duplicates [Array<String>] paths considered duplicates of each other
# @return the value returned by the strategy's +handle+ method
# @raise [RuntimeError] when the same path is listed more than once
def handle_duplicates(duplicates)
  # Safety net: only delegate when every path in the group is distinct.
  return handling_strategy.handle(duplicates) if duplicates.uniq == duplicates

  message = "A file appears twice: #{duplicates}"
  logger.error message
  raise message
end
multi_group_by(files, index)
click to toggle source
# File lib/nauktis_utils/duplicate.rb, line 155
# Recursively partitions +files+ using the grouping at position +index+ of
# @groupings, then the next one, and so on. A group that still holds more
# than one file after the last grouping is handed to handle_duplicates.
#
# @param files [Array<String>] candidate files that matched on every
#   grouping before +index+
# @param index [Integer] position in @groupings of the grouping to apply
def multi_group_by(files, index)
  # Every grouping has been applied: what is left are duplicates.
  return handle_duplicates(files) if index >= @groupings.size

  buckets = files.group_by(&@groupings[index]).values
  buckets.each do |bucket|
    # A bucket with a single file cannot contain duplicates.
    next unless bucket.size > 1

    multi_group_by(bucket, index + 1)
  end
end
size_of(directories)
click to toggle source
Returns the total size of the directories provided
# File lib/nauktis_utils/duplicate.rb, line 187
# Returns the total size of the directories provided, computed as the sum
# of File.size over every path returned by files_in.
#
# @param directories [Array<String>] directories to measure
# @return [Integer] combined size of the files found
def size_of(directories)
  files_in(directories).sum { |path| File.size(path) }
end