class RDup::Scanner

Attributes

dirs[RW]
files[RW]
full_hash_map[R]
full_hashes[R]
header_hash_map[R]
header_hashes[R]
opts[RW]
size_map[R]
stats[R]

Public Class Methods

new(opts) click to toggle source
# File lib/rdup.rb, line 32
# Set up a scanner.
#
# opts is merged over a copy of Defaults and stored in @opts.
# opts[:arguments] lists the paths to scan: regular files are collected in
# @files, directories in @dirs, and anything else is reported on STDERR and
# skipped. All lookup tables start out empty.
def initialize(opts)
  @opts = Defaults.dup
  @files = []
  @dirs = []
  @stats = {}
  @header_hashes = {}
  @full_hashes = {}
  @size_map = {}
  @header_hash_map = {}
  @full_hash_map = {}

  @opts.update(opts)

  # Array() lets a missing :arguments entry behave like an empty list.
  Array(opts[:arguments]).each do |path|
    if File.file?(path)
      @files << path
    elsif File.directory?(path)
      @dirs << path
    else
      # The message must interpolate the block variable `path`; referencing
      # any other name here raises NameError instead of printing the warning.
      STDERR.puts "Warning: skip `#{path}' because it's neither a file nor a directory"
    end
  end
end

Public Instance Methods

run() click to toggle source
# File lib/rdup.rb, line 56
# Drive the whole scan: collect candidate files, then narrow them down by
# size, by header hash and finally by full hash, printing a summary after
# each stage and stopping as soon as a stage leaves no files. Every remaining
# group is listed; when @opts[:deletion] is set the user is asked which
# members to keep and the others are removed (unless @opts[:dry_run]).
def run
  find_all_files
  candidates = @stats.size
  puts "Found #{candidates} files to be compared for duplication."
  return if candidates == 0

  build_size_map
  reduce_groups(@size_map)
  set_count = @size_map.size
  member_count = count_files(@size_map)
  puts "Found #{set_count} sets of files with identical sizes. (#{member_count} files in total)"
  return if member_count == 0

  build_header_hash_map
  reduce_groups(@header_hash_map)
  set_count = @header_hash_map.size
  member_count = count_files(@header_hash_map)
  puts "Found #{set_count} sets of files with identical header hashes. (#{member_count} files in total)"
  return if member_count == 0

  build_full_hash_map
  reduce_groups(@full_hash_map)
  set_count = @full_hash_map.size
  member_count = count_files(@full_hash_map)
  puts "Found #{set_count} sets of files with identical hashes. (#{member_count} files in total)"
  return if member_count == 0

  @full_hash_map.each_with_index do |(digest, group), position|
    # All members of a group share one size, so the first one is representative.
    bytes = @stats[group.first].size
    puts "\n[#{position + 1}/#{set_count}] SHA1: #{digest}, Size: #{csf(bytes)} bytes"

    group.each.with_index(1) do |path, number|
      info = @stats[path]
      if @opts[:show_mtime]
        puts "  #{number}) #{info.mtime}  #{path}"
      else
        puts "  #{number}) #{path}"
      end
    end

    next unless @opts[:deletion]

    # The answer holds the 1-based positions of the files to keep.
    keepers = which_to_preserve(group)
    group.each.with_index(1) do |path, number|
      if keepers.include?(number)
        puts "  [+] #{path}"
        next
      end

      puts "  [-] #{path}"
      remove_file(path) unless @opts[:dry_run]
    end
  end
end

Private Instance Methods

build_full_hash_map() click to toggle source

@full_hash_map: full_hash => [file1, file2, …]

# File lib/rdup.rb, line 185
# Fill @full_hash_map (full SHA1 hex digest => [path, ...]) from every path
# still listed in @header_hash_map.
#
# A file no larger than @opts[:header_size] reuses the full_hash already
# stored on its @stats entry; a larger file is hashed in full and the result
# is stored back on its @stats entry.
def build_full_hash_map
  limit = @opts[:header_size]

  @header_hash_map.each_value do |paths|
    paths.each do |path|
      entry = @stats[path]
      digest =
        if entry.size > limit
          entry.full_hash = Digest::SHA1.new.file(path).hexdigest
        else
          entry.full_hash
        end

      (@full_hash_map[digest] ||= []) << path
    end
  end
end
build_header_hash_map() click to toggle source

@header_hash_map: header_hash => [file1, file2, …]

# File lib/rdup.rb, line 164
# Fill @header_hash_map (SHA1 of the leading bytes => [path, ...]) from every
# path listed in @size_map.
#
# Only the first @opts[:header_size] bytes of each file are read. The digest
# is stored as header_hash on the file's @stats entry, and also as full_hash
# when the file is no larger than the header, since the header then covers
# the whole content.
def build_header_hash_map
  limit = @opts[:header_size]

  @size_map.each_pair do |size, paths|
    paths.each do |path|
      # IO#read(n) returns nil at end of file, i.e. for an empty file.
      head = File.open(path, 'rb') { |io| io.read(limit) } || ''
      digest = Digest::SHA1.new.hexdigest(head)

      entry = @stats[path]
      entry.header_hash = digest
      entry.full_hash = digest if size <= limit

      (@header_hash_map[digest] ||= []) << path
    end
  end
end
build_size_map() click to toggle source

Group the files by size. @size_map: file_size => [file1, file2, …]

# File lib/rdup.rb, line 152
# Bucket every path recorded in @stats under its file size in @size_map
# (file size => [path, ...]).
def build_size_map
  @stats.each_pair do |path, stat|
    (@size_map[stat.size] ||= []) << path
  end
end
count_files(map) click to toggle source
# File lib/rdup.rb, line 209
# Total number of paths held across all groups of +map+
# (a hash of key => array of paths).
def count_files(map)
  map.each_value.inject(0) { |total, paths| total + paths.flatten.size }
end
csf(number) click to toggle source

Comma-separated format

# File lib/rdup.rb, line 214
# Comma-separated format: render +number+ with a comma between every group of
# three digits, e.g. 1234567 => "1,234,567".
#
# Only the part before the first "." is grouped, so a fractional part is
# passed through untouched (1234.5678 => "1,234.5678").
def csf(number)
  whole, point, fraction = number.to_s.partition('.')
  # Work on the reversed string so groups are counted from the right; the
  # lookahead prevents a comma from being placed in front of the first digit.
  grouped = whole.reverse.gsub(/(\d{3})(?=\d)/, '\\1,').reverse
  "#{grouped}#{point}#{fraction}"
end
find_all_files() click to toggle source
# File lib/rdup.rb, line 120
# Collect every file that takes part in the comparison.
#
# Explicitly listed files (@files) smaller than @opts[:min_size] are dropped
# from @files; the rest get a FileStat entry in @stats. Each directory in
# @dirs is then walked recursively with the '**/*' glob, and every regular
# file of at least @opts[:min_size] bytes is appended to @files and recorded
# in @stats under its path joined onto the directory argument.
def find_all_files
  min_size = @opts[:min_size]

  # Filter with select! rather than deleting inside #each: removing an element
  # while iterating shifts the remaining ones and silently skips the next file.
  @files.select! do |path|
    stat = File.stat(path)
    next false if stat.size < min_size

    @stats[path] = FileStat.new(stat.size, stat.mtime)
    true
  end

  @dirs.each do |dir|
    begin
      # The block form restores the previous working directory on exit, even
      # when the walk raises.
      Dir.chdir(dir) do
        Dir['**/*'].each do |relative|
          begin
            stat = File.stat(relative)
          rescue SystemCallError => e
            # One unreadable entry (e.g. a dangling symlink) must not abort
            # the rest of the directory.
            STDERR.puts "Error: #{e}"
            next
          end
          next unless stat.file? && stat.size >= min_size

          path = File.join(dir, relative)
          @files << path
          @stats[path] = FileStat.new(stat.size, stat.mtime)
        end
      end
    rescue => e
      STDERR.puts "Error: #{e}"
    end
  end
end
reduce_groups(map) click to toggle source
# File lib/rdup.rb, line 205
# Drop, in place, every group of +map+ that holds exactly one path: a file
# alone in its group has nothing to be a duplicate of. Returns +map+.
def reduce_groups(map)
  map.keep_if { |_key, paths| paths.size != 1 }
end
remove_file(path) click to toggle source
# File lib/rdup.rb, line 244
# Delete the file at +path+. A failure is reported on STDERR and otherwise
# ignored, so one undeletable file does not stop the run.
def remove_file(path)
  File.delete(path)
rescue => failure
  STDERR.puts "Error: #{failure}"
end
which_to_preserve(group) click to toggle source

Ask the user which files to preserve. Returns an array of 1-based positions within the group.

# File lib/rdup.rb, line 220
# Prompt on STDIN until the user names the members of +group+ to keep.
#
# Accepted answers are "a"/"all" (any case) or a list of 1-based positions
# separated by commas and/or whitespace. An empty line just repeats the
# prompt; anything else prints a complaint on STDERR and asks again.
# Returns the chosen positions as an array of integers.
def which_to_preserve(group)
  loop do
    choices = (1..group.size).to_a
    print "Which to preserve (#{choices.join(',')} or all): "
    answer = STDIN.readline.strip

    next if answer.empty?
    return choices if %w[a all].include?(answer.downcase)

    unless answer =~ /^[\d\s,]+$/
      STDERR.puts 'Illegal answer. Only numbers/commas/spaces allowed.'
      next
    end

    # A leading separator yields an empty first field; discard it.
    picked = answer.split(/[,\s]+/).reject(&:empty?).map(&:to_i)
    if picked.empty?
      STDERR.puts 'Illegal answer. Please input some numbers.'
    elsif picked.min < 1 || picked.max > group.size
      STDERR.puts "Illegal number. Allowed range: [1, #{group.size}]"
    else
      return picked
    end
  end
end