class ExternalSortModule::Sorter

Performs an external sort of a binary file. Used by the GeoTree module to shuffle buffered point sets into a random order prior to adding to the tree, in order to create a balanced tree.

Constants

MAX_CHUNKS_

Public Class Methods

new(path, element_size, comparator=nil, max_chunk_size = MAX_CHUNK_SIZE_, max_chunks = MAX_CHUNKS_) click to toggle source

Constructor @param path of file to sort @param element_size size, in bytes, of each element @param comparator to compare elements; if nil, compares the bytes as substrings

# File lib/geotree/externalsort.rb, line 193
def initialize(path, element_size, comparator=nil, max_chunk_size = MAX_CHUNK_SIZE_, max_chunks = MAX_CHUNKS_)
  raise ArgumentError,"no such file" if !File.file?(path)

  @comparator = comparator || Proc.new do |x,y|
    bx,ox = x
    by,oy = y
    bx[ox,@element_size] <=> by[oy,@element_size]
  end

  @path = path

  @work_file = nil

  @file_len = File.size(path)
  if @file_len == 0 || @file_len % element_size != 0
    raise ArgumentError,"File length #{@file_len} is not a positive multiple of element size #{element_size}"
  end
  @element_size = element_size
  @max_chunks = max_chunks
  @max_chunk_size = max_chunk_size - max_chunk_size % element_size
  raise ArgumentError if @max_chunk_size <= 0
end

Public Instance Methods

sort() click to toggle source
# File lib/geotree/externalsort.rb, line 216
def sort
  @file = File.open(@path,"r+b")

  # Break file into chunks, sorting them in place
  build_initial_segments
  sort_chunks_in_place

  require 'tempfile'
  
  @work_file = Tempfile.new('_externalsort_')
  @work_file.binmode

  while @segments.size > 1
    @segments = merge_segments(@segments)
  end

  @work_file.unlink
end

Private Instance Methods

build_initial_segments() click to toggle source

Partition the file into segments, each the size of a chunk

# File lib/geotree/externalsort.rb, line 328
def build_initial_segments
  db = false

  !db || pr("build_initial_segments, @file_len=#@file_len\n")
  raise IllegalStateException if @file_len == 0

  @segments = []
  off = 0
  while off < @file_len
    seg_len = [@file_len - off, @max_chunk_size].min
    @segments << [off, seg_len]
    off += seg_len
  end
end
merge_segments(segs) click to toggle source

Merge segments into one; if too many to handle at once, process recursively

# File lib/geotree/externalsort.rb, line 238
def merge_segments(segs)

  return segs if segs.size <= 1

  if segs.size > MAX_CHUNKS_
    k = segs.size/2
    s1 = segs[0 .. k]
    s2 = segs[k+1 .. -1]
    ret = merge_segments(s1)
    ret.concat(merge_segments(s2))
    return ret
  end

  # Build a chunk for reading each segment; also, determine
  # bounds of the set of segments.

  # Sort the chunks by their next elements.

  segset_start = nil
  segset_end = nil

  chunks = []
  segs.each do |sg|
    off,len = sg

    ch = ChunkReader.new(@file, off, len, @element_size, @max_chunk_size)
    chunks << ch
    if !segset_start
      segset_start = off
      segset_end = off+len
    else
      segset_start = [segset_start,off].min
      segset_end = [segset_end,off+len].max
    end
  end
  segset_size = segset_end - segset_start

  # Sort the chunks into order by their peek items, so the lowest item is at the end of the array
  chunks.sort! do |a,b|
    ex = a.peek
    ey = b.peek
    @comparator.call(ey,ex)
  end

  # Build a chunk for writing merged result to work file
  wch = ChunkWriter.new(@work_file,0,segset_size, @element_size, @max_chunk_size)

  while !chunks.empty?
    ch = chunks.pop
    buff,off =  ch.peek
    wch.write(buff,off)
    ch.read

    next if ch.done

    # Examine this chunk's next item to reinsert the chunk back into the sorted array.
    # Perform a binary search:
    i0 = 0
    i1 = chunks.size
    while i0 < i1
      i = (i0+i1)/2
      ci =  chunks[i]
      if @comparator.call(ci.peek, ch.peek) > 0
        i0 = i+1
      else
        i1 = i
      end
    end
    chunks.insert(i1, ch)
  end

  # Read from work file and write to segment set's position in original file

  rch = ChunkReader.new(@work_file,0,segset_size, @element_size, @max_chunk_size)
  wch = ChunkWriter.new(@file,segset_start,segset_size, @element_size, @max_chunk_size)

  while !rch.done
    buff,off = rch.peek
    wch.write(buff,off)
    rch.read
  end

  # We must flush the file we're writing to, now that the
  # operation is complete
  @file.flush

  [[segset_start,segset_size]]
end
sort_chunks_in_place() click to toggle source
# File lib/geotree/externalsort.rb, line 343
def sort_chunks_in_place
  @segments.each do |offset,length|
    ch = ChunkRandomAccess.new(@file, offset, length, @element_size)

    a =  (0 ... ch.num_elements).to_a

    a.sort! do |x,y|
      ex = ch.element(x)
      ey = ch.element(y)
      @comparator.call(ex,ey)
    end

    # Construct another buffer, in the sorted order
    b = zero_bytes(@element_size * a.size)
    j = 0
    a.each do |i|
      buff,off = ch.element(i)
      b[j, @element_size] = buff[off,@element_size]
      j += @element_size
    end
    ch.replace_buffer_with(b)
    ch.write
  end
end