class CSVUtils::CSVSort

Utility class for sorting the rows of a CSV file.

Attributes

csv_file[R]
csv_options[R]
has_headers[R]
headers[R]
new_csv_file[R]

Public Class Methods

new(csv_file, new_csv_file, has_headers = true, csv_options = {}) click to toggle source
# File lib/csv_utils/csv_sort.rb, line 11
# Stores the sort configuration and sets up empty bookkeeping lists.
#
# @param csv_file [String] path of the CSV file to read
# @param new_csv_file [String] path the sorted CSV is written to; also used
#   as the prefix for intermediate file names
# @param has_headers [Boolean] whether the first row of csv_file is a header row
# @param csv_options [Hash] options forwarded to every CSV.open call
def initialize(csv_file, new_csv_file, has_headers = true, csv_options = {})
  @csv_file, @new_csv_file = csv_file, new_csv_file
  @has_headers, @csv_options = has_headers, csv_options

  # Each list gets its own array instance.
  @csv_part_files, @files_to_delete = [], []
end

Public Instance Methods

sort(batch_size = 100_000, &block) click to toggle source
# File lib/csv_utils/csv_sort.rb, line 20
# Sorts the rows of the source CSV and writes the result to new_csv_file.
#
# The work is done in two phases: the input is first split into sorted part
# files of at most batch_size rows, then the part files are merged.
#
# @param batch_size [Integer] maximum number of rows sorted in memory at once
# @yield [row1, row2] comparator for two rows; expected to return a
#   negative, zero or positive Integer (as <=> does)
# @return whatever merge_csv_part_files returns
def sort(batch_size = 100_000, &block)
  # Start every run with a clean slate. After a previous run the list still
  # holds the name of the final part file, which has already been moved to
  # new_csv_file, so merging it again would fail on a missing file.
  @csv_part_files = []

  create_sorted_csv_part_files(batch_size, &block)
  merge_csv_part_files(&block)
end

Private Instance Methods

create_sorted_csv_part_files(batch_size, &block) click to toggle source
# File lib/csv_utils/csv_sort.rb, line 68
# Splits the source CSV into individually sorted part files.
#
# Rows are read in batches of at most batch_size. Each batch is sorted in
# memory with the given comparator and written to
# "<new_csv_file>.part.<index>"; the path is appended to @csv_part_files.
# When has_headers is set, the first source row is stored in @headers and
# repeated at the top of every part file.
#
# @param batch_size [Integer] maximum number of rows per part file
# @param block comparator handed to Array#sort!
# @return [nil]
def create_sorted_csv_part_files(batch_size, &block)
  # Block form so the source handle is closed even if the comparator or a
  # write raises part-way through.
  CSV.open(csv_file, 'rb', **csv_options) do |src|
    @headers = src.shift if has_headers

    batch = []
    flush_batch = lambda do
      batch.sort!(&block)
      @csv_part_files << "#{new_csv_file}.part.#{@csv_part_files.size}"
      CSV.open(@csv_part_files.last, 'wb', **csv_options) do |csv|
        csv << @headers if @headers
        batch.each { |row| csv << row }
      end
      batch = []
    end

    while (row = src.shift)
      batch << row
      flush_batch.call if batch.size >= batch_size
    end

    # Write out the final, partially filled batch.
    flush_batch.call unless batch.empty?
  end

  nil
end
merge_csv_part_files(&block) click to toggle source
# File lib/csv_utils/csv_sort.rb, line 94
# Collapses the files listed in @csv_part_files into new_csv_file.
#
# While more than one part remains, the two oldest entries are taken from
# the front of the list, merged into "<new_csv_file>.merge.<n>" (which is
# queued at the back) and then deleted. The single surviving file is moved
# to new_csv_file. If there were no part files at all, the original
# @csv_file is copied to new_csv_file instead.
#
# @param block comparator forwarded to merge_sort_csv_files
def merge_csv_part_files(&block)
  merge_round = 0

  until @csv_part_files.size <= 1
    merge_round += 1

    left_part, right_part = @csv_part_files.shift(2)
    merged_part = "#{new_csv_file}.merge.#{merge_round}"
    @csv_part_files.push(merged_part)

    merge_sort_csv_files(left_part, right_part, merged_part, &block)

    # The inputs are no longer needed once their merged file is written.
    [left_part, right_part].each { |part| File.unlink(part) }
  end

  if @csv_part_files.empty?
    FileUtils.cp(@csv_file, new_csv_file)
  else
    FileUtils.mv(@csv_part_files.last, new_csv_file)
  end
end
merge_sort_csv_files(src_csv_file1, src_csv_file2, dest_csv_file) { |row1, row2| ... } click to toggle source
# File lib/csv_utils/csv_sort.rb, line 27
# Merges two CSV files into dest_csv_file, one row at a time.
#
# At each step the current row of each source is compared with the given
# block and the smaller one is written; on a tie the row from the first
# file goes first. The output is only fully ordered if both inputs are
# already sorted by the same comparator (assumed, not checked here).
#
# When @headers is set, it is written as the first row of the destination
# and the first row of each source is skipped.
#
# @param src_csv_file1 [String] path of the first input file
# @param src_csv_file2 [String] path of the second input file
# @param dest_csv_file [String] path of the merged output file
# @yield [row1, row2] comparator; a result <= 0 means row1 is written first
# @return [nil]
def merge_sort_csv_files(src_csv_file1, src_csv_file2, dest_csv_file)
  # Block form of CSV.open closes every handle even when the comparator or
  # an I/O call raises.
  CSV.open(src_csv_file1, 'rb', **csv_options) do |src1|
    CSV.open(src_csv_file2, 'rb', **csv_options) do |src2|
      CSV.open(dest_csv_file, 'wb', **csv_options) do |dest|
        if @headers
          dest << @headers
          src1.shift
          src2.shift
        end

        row1 = src1.shift
        row2 = src2.shift

        while row1 || row2
          # Take from the first source when the second is exhausted, or when
          # both have a row and the first one does not sort after the second.
          if row2.nil? || (row1 && yield(row1, row2) <= 0)
            dest << row1
            row1 = src1.shift
          else
            dest << row2
            row2 = src2.shift
          end
        end
      end
    end
  end

  nil
end