class ExternalSortModule::Sorter
Performs an external sort of a binary file. Used by the GeoTree module to shuffle buffered point sets into a random order prior to adding to the tree, in order to create a balanced tree.
Constants
- MAX_CHUNKS_
Public Class Methods
new(path, element_size, comparator=nil, max_chunk_size = MAX_CHUNK_SIZE_, max_chunks = MAX_CHUNKS_)
click to toggle source
Constructor @param path of file to sort @param element_size size, in bytes, of each element @param comparator to compare elements; if nil, compares the bytes as substrings
# File lib/geotree/externalsort.rb, line 193 def initialize(path, element_size, comparator=nil, max_chunk_size = MAX_CHUNK_SIZE_, max_chunks = MAX_CHUNKS_) raise ArgumentError,"no such file" if !File.file?(path) @comparator = comparator || Proc.new do |x,y| bx,ox = x by,oy = y bx[ox,@element_size] <=> by[oy,@element_size] end @path = path @work_file = nil @file_len = File.size(path) if @file_len == 0 || @file_len % element_size != 0 raise ArgumentError,"File length #{@file_len} is not a positive multiple of element size #{element_size}" end @element_size = element_size @max_chunks = max_chunks @max_chunk_size = max_chunk_size - max_chunk_size % element_size raise ArgumentError if @max_chunk_size <= 0 end
Public Instance Methods
sort()
click to toggle source
# File lib/geotree/externalsort.rb, line 216 def sort @file = File.open(@path,"r+b") # Break file into chunks, sorting them in place build_initial_segments sort_chunks_in_place require 'tempfile' @work_file = Tempfile.new('_externalsort_') @work_file.binmode while @segments.size > 1 @segments = merge_segments(@segments) end @work_file.unlink end
Private Instance Methods
build_initial_segments()
click to toggle source
Partition the file into segments, each the size of a chunk
# File lib/geotree/externalsort.rb, line 328 def build_initial_segments db = false !db || pr("build_initial_segments, @file_len=#@file_len\n") raise IllegalStateException if @file_len == 0 @segments = [] off = 0 while off < @file_len seg_len = [@file_len - off, @max_chunk_size].min @segments << [off, seg_len] off += seg_len end end
merge_segments(segs)
click to toggle source
Merge segments into one; if too many to handle at once, process recursively
# File lib/geotree/externalsort.rb, line 238 def merge_segments(segs) return segs if segs.size <= 1 if segs.size > MAX_CHUNKS_ k = segs.size/2 s1 = segs[0 .. k] s2 = segs[k+1 .. -1] ret = merge_segments(s1) ret.concat(merge_segments(s2)) return ret end # Build a chunk for reading each segment; also, determine # bounds of the set of segments. # Sort the chunks by their next elements. segset_start = nil segset_end = nil chunks = [] segs.each do |sg| off,len = sg ch = ChunkReader.new(@file, off, len, @element_size, @max_chunk_size) chunks << ch if !segset_start segset_start = off segset_end = off+len else segset_start = [segset_start,off].min segset_end = [segset_end,off+len].max end end segset_size = segset_end - segset_start # Sort the chunks into order by their peek items, so the lowest item is at the end of the array chunks.sort! do |a,b| ex = a.peek ey = b.peek @comparator.call(ey,ex) end # Build a chunk for writing merged result to work file wch = ChunkWriter.new(@work_file,0,segset_size, @element_size, @max_chunk_size) while !chunks.empty? ch = chunks.pop buff,off = ch.peek wch.write(buff,off) ch.read next if ch.done # Examine this chunk's next item to reinsert the chunk back into the sorted array. # Perform a binary search: i0 = 0 i1 = chunks.size while i0 < i1 i = (i0+i1)/2 ci = chunks[i] if @comparator.call(ci.peek, ch.peek) > 0 i0 = i+1 else i1 = i end end chunks.insert(i1, ch) end # Read from work file and write to segment set's position in original file rch = ChunkReader.new(@work_file,0,segset_size, @element_size, @max_chunk_size) wch = ChunkWriter.new(@file,segset_start,segset_size, @element_size, @max_chunk_size) while !rch.done buff,off = rch.peek wch.write(buff,off) rch.read end # We must flush the file we're writing to, now that the # operation is complete @file.flush [[segset_start,segset_size]] end
sort_chunks_in_place()
click to toggle source
# File lib/geotree/externalsort.rb, line 343 def sort_chunks_in_place @segments.each do |offset,length| ch = ChunkRandomAccess.new(@file, offset, length, @element_size) a = (0 ... ch.num_elements).to_a a.sort! do |x,y| ex = ch.element(x) ey = ch.element(y) @comparator.call(ex,ey) end # Construct another buffer, in the sorted order b = zero_bytes(@element_size * a.size) j = 0 a.each do |i| buff,off = ch.element(i) b[j, @element_size] = buff[off,@element_size] j += @element_size end ch.replace_buffer_with(b) ch.write end end