class EM::Voldemort::BinaryJson
Codec for Voldemort’s custom binary serialization format. The Voldemort
codebase itself refers to this format as “json”, even though it has virtually nothing in common with JSON. It’s actually more like Avro, but with less sophisticated schema evolution, and less compact. We’re only using it because the Hadoop job for building read-only stores requires it. The format is roughly documented at github.com/voldemort/voldemort/wiki/Binary-JSON-Serialization
This code is adapted from Alejandro Crosa’s voldemort-rb gem (MIT License). github.com/acrosa/voldemort-rb
Constants
- BYTE_MAX_VAL
- BYTE_MIN_VAL
- DOUBLE_MIN_VAL
- FLOAT_MIN_VAL
- INT_MAX_VAL
- INT_MIN_VAL
- LONG_MAX_VAL
- LONG_MIN_VAL
- SHORT_MAX_VAL
- SHORT_MIN_VAL
- STRING_MAX_LEN
Attributes
has_version_tag[R]
schema_versions[R]
Public Class Methods
new(schema_by_version, has_version_tag=true)
click to toggle source
# File lib/em-voldemort/binary_json.rb, line 27
# Builds a codec from a mapping of schema version to schema definition.
#
# @param schema_by_version [Hash] version (converted with #to_i) => schema
#   text, which is handed to #parse_schema
# @param has_version_tag [Boolean] stored unchanged; consulted by #encode and
#   #decode to decide whether a version byte precedes the payload
def initialize(schema_by_version, has_version_tag=true)
  @has_version_tag = has_version_tag
  parsed = {}
  schema_by_version.each do |version, schema|
    parsed[version.to_i] = parse_schema(schema)
  end
  @schema_versions = parsed
end
Public Instance Methods
decode(bytes)
click to toggle source
Parses a binary JSON string into Ruby objects
# File lib/em-voldemort/binary_json.rb, line 45
# Parses a binary JSON string into Ruby objects.
#
# The input is copied before being tagged as binary, so the caller's string
# keeps its encoding and frozen strings are accepted.
#
# @param bytes [String] serialized value; when has_version_tag is set the
#   first byte selects the schema version, otherwise version 0 is used
# @return whatever #read produces for the selected schema
# @raise [ClientError] if the version byte is missing or no schema is
#   registered for the version
def decode(bytes)
  input = StringIO.new(bytes.dup.force_encoding(Encoding::BINARY))
  version = 0
  if has_version_tag
    tag = input.read(1)
    # read(1) returns nil at end of input; report it instead of failing on nil.ord
    raise ClientError, 'cannot decode empty input: schema version byte is missing' if tag.nil?
    version = tag.ord
  end
  schema = schema_versions[version]
  raise ClientError, "no registered schema for version #{version}" unless schema
  read(input, schema)
end
encode(object)
click to toggle source
Serializes a Ruby object to binary JSON
# File lib/em-voldemort/binary_json.rb, line 35
# Serializes a Ruby object to binary JSON using the highest registered
# schema version.
#
# @param object the value to serialize; must match the newest schema
# @return [String] binary-encoded buffer, prefixed with the version byte when
#   has_version_tag is set
def encode(object)
  bytes = ''.force_encoding(Encoding::BINARY)
  newest_version = schema_versions.keys.max
  schema = schema_versions[newest_version]
  bytes << newest_version.chr if has_version_tag
  write(object, bytes, schema)
  bytes
end
Private Instance Methods
parse_schema(schema)
click to toggle source
# File lib/em-voldemort/binary_json.rb, line 56
# Turns schema text into the structure used by #read and #write.
#
# Surrounding whitespace is ignored, so a schema that starts with a newline or
# indentation is still recognised as a map or list.
#
# @param schema [String] schema text; single quotes are accepted in place of
#   double quotes
# @return [Hash, Array, String] parsed JSON for map/list schemas, or the bare
#   type name for a primitive
def parse_schema(schema)
  # tolerate use of single quotes in place of double quotes in the schema
  normalized = schema.strip.tr("'", '"')
  if normalized.start_with?('{', '[')
    # maps and lists are the only shapes JSON.parse() will work with
    JSON.parse(normalized)
  else
    # otherwise it's a primitive, so just strip the quotes
    normalized.delete('"')
  end
end
read(input, schema)
click to toggle source
parsing
# File lib/em-voldemort/binary_json.rb, line 231
# Reads one value described by +schema+ from +input+.
#
# @param input [IO-like] stream positioned at the start of the value
# @param schema [Hash, Array, String] map schema, list schema, or primitive
#   type name
# @return the decoded value
# @raise [ClientError] if the schema is not a recognised type
def read(input, schema)
  return read_map(input, schema) if schema.is_a?(Hash)
  return read_list(input, schema) if schema.is_a?(Array)

  # 'string' and 'bytes' share the same length-prefixed reader
  primitive_readers = {
    'string'  => :read_bytes,
    'int8'    => :read_int8,
    'int16'   => :read_int16,
    'int32'   => :read_int32,
    'int64'   => :read_int64,
    'float32' => :read_float32,
    'float64' => :read_float64,
    'date'    => :read_date,
    'bytes'   => :read_bytes,
    'boolean' => :read_boolean
  }
  reader = primitive_readers[schema]
  raise ClientError, "unrecognised binary json schema: #{schema.inspect}" unless reader
  send(reader, input)
end
read_boolean(input)
click to toggle source
# File lib/em-voldemort/binary_json.rb, line 275
# Reads a one-byte boolean.
#
# @param input [IO-like] stream positioned at the boolean byte
# @return [true, false, nil] nil for a negative byte, true for a positive
#   byte, false for zero
def read_boolean(input)
  flag, = input.read(1).unpack('c')
  if flag < 0
    nil
  else
    flag > 0
  end
end
read_bytes(input)
click to toggle source
# File lib/em-voldemort/binary_json.rb, line 325
# Reads a length-prefixed run of bytes.
#
# @param input [IO-like] stream positioned at the length prefix
# @return [String, nil] the payload, or nil when the decoded length is negative
def read_bytes(input)
  length = read_length(input)
  return nil if length < 0
  input.read(length)
end
read_date(input)
click to toggle source
# File lib/em-voldemort/binary_json.rb, line 320
# Reads a date stored as a 64-bit integer.
#
# @param input [IO-like] stream positioned at the timestamp
# @return [Time, nil] Time built from the value divided by 1000.0 (i.e. the
#   stored value is treated as milliseconds), or nil when #read_int64 yields nil
def read_date(input)
  millis = read_int64(input)
  Time.at(millis / 1000.0) if millis
end
read_float32(input)
click to toggle source
# File lib/em-voldemort/binary_json.rb, line 310
# Reads a big-endian single-precision float.
#
# @param input [IO-like] stream positioned at the 4-byte value
# @return [Float, nil] nil when the value equals FLOAT_MIN_VAL
def read_float32(input)
  number, = input.read(4).unpack('g')
  number == FLOAT_MIN_VAL ? nil : number
end
read_float64(input)
click to toggle source
# File lib/em-voldemort/binary_json.rb, line 315
# Reads a big-endian double-precision float.
#
# @param input [IO-like] stream positioned at the 8-byte value
# @return [Float, nil] nil when the value equals DOUBLE_MIN_VAL
def read_float64(input)
  number, = input.read(8).unpack('G')
  number == DOUBLE_MIN_VAL ? nil : number
end
read_int16(input)
click to toggle source
# File lib/em-voldemort/binary_json.rb, line 294
# Reads a big-endian 16-bit signed integer.
#
# @param input [IO-like] stream positioned at the 2-byte value
# @return [Integer, nil] nil when the value equals SHORT_MIN_VAL
def read_int16(input)
  unsigned, = input.read(2).unpack('n')
  signed = to_signed(unsigned, 16)
  signed == SHORT_MIN_VAL ? nil : signed
end
read_int32(input)
click to toggle source
# File lib/em-voldemort/binary_json.rb, line 299
# Reads a big-endian 32-bit signed integer.
#
# @param input [IO-like] stream positioned at the 4-byte value
# @return [Integer, nil] nil when the value equals INT_MIN_VAL
def read_int32(input)
  unsigned, = input.read(4).unpack('N')
  signed = to_signed(unsigned, 32)
  signed == INT_MIN_VAL ? nil : signed
end
read_int64(input)
click to toggle source
# File lib/em-voldemort/binary_json.rb, line 304
# Reads a big-endian 64-bit signed integer as two 32-bit halves.
#
# @param input [IO-like] stream positioned at the 8-byte value
# @return [Integer, nil] nil when the value equals LONG_MIN_VAL
def read_int64(input)
  upper, lower = input.read(8).unpack('NN')
  signed = to_signed((upper << 32) | lower, 64)
  signed == LONG_MIN_VAL ? nil : signed
end
read_int8(input)
click to toggle source
# File lib/em-voldemort/binary_json.rb, line 281
# Reads a signed 8-bit integer.
#
# @param input [IO-like] stream positioned at the byte
# @return [Integer, nil] nil when the value equals BYTE_MIN_VAL
def read_int8(input)
  number, = input.read(1).unpack('c')
  number == BYTE_MIN_VAL ? nil : number
end
read_length(input)
click to toggle source
# File lib/em-voldemort/binary_json.rb, line 256
# Reads a variable-width length prefix.
#
# @param input [IO-like] stream positioned at the prefix
# @return [Integer] -1 when the first 16-bit word is 0xFFFF; a 30-bit length
#   assembled from two words when the top bit of the first word is set;
#   otherwise the first word itself
def read_length(input)
  head, = input.read(2).unpack('n')
  return -1 if head == 0xFFFF
  return head if (head & 0x8000).zero?

  # long form: the low 14 bits of the first word are the high part of the length
  tail, = input.read(2).unpack('n')
  ((head & 0x3FFF) << 16) | tail
end
read_list(input, schema)
click to toggle source
# File lib/em-voldemort/binary_json.rb, line 267
# Reads a length-prefixed list whose items all follow the first schema entry.
#
# @param input [IO-like] stream positioned at the list's length prefix
# @param schema [Array] list schema; only its first element is used
# @return [Array, nil] the decoded items, or nil when the length is negative
def read_list(input, schema)
  count = read_length(input)
  return nil if count < 0
  Array.new(count) { read(input, schema.first) }
end
read_map(input, schema)
click to toggle source
# File lib/em-voldemort/binary_json.rb, line 249
# Reads a map: a one-byte marker followed by the field values in sorted key
# order.
#
# @param input [IO-like] stream positioned at the marker byte
# @param schema [Hash] field name => field schema
# @return [Hash, nil] symbol-keyed hash of decoded fields, or nil when the
#   marker byte is -1
def read_map(input, schema)
  marker = input.read(1).unpack('c')
  return nil if marker == [-1]

  fields = {}
  schema.sort.each do |key, value_type|
    fields[key.to_sym] = read(input, value_type)
  end
  fields
end
to_signed(value, bits)
click to toggle source
# File lib/em-voldemort/binary_json.rb, line 286
# Reinterprets an unsigned integer as a two's-complement value of the given
# width.
#
# @param value [Integer] unsigned value
# @param bits [Integer] width in bits
# @return [Integer] value minus 2**bits when value is at least 2**(bits - 1),
#   otherwise value unchanged
def to_signed(value, bits)
  sign_threshold = 2 ** (bits - 1)
  return value if value < sign_threshold
  value - 2 ** bits
end
write(object, bytes, schema)
click to toggle source
serialization
# File lib/em-voldemort/binary_json.rb, line 72
# Appends the serialized form of +object+ to +bytes+ according to +schema+.
#
# nil is passed through for map and list schemas so that #write_map and
# #write_list can emit their null markers, mirroring what #read_map and
# #read_list decode.
#
# @param object the value to serialize
# @param bytes [String] output buffer, appended to in place
# @param schema [Hash, Array, String] map schema, list schema, or primitive
#   type name
# @raise [ClientError] if the object does not match a map/list schema, or the
#   schema is not a recognised type
def write(object, bytes, schema)
  case schema
  when Hash
    unless object.nil? || object.is_a?(Hash)
      raise ClientError, "serialization error: #{object.inspect} does not match schema #{schema.inspect}"
    end
    write_map(object, bytes, schema)
  when Array
    unless object.nil? || object.is_a?(Array)
      raise ClientError, "serialization error: #{object.inspect} does not match schema #{schema.inspect}"
    end
    write_list(object, bytes, schema)
  when 'string'  then write_bytes(  object, bytes)
  when 'int8'    then write_int8(   object, bytes)
  when 'int16'   then write_int16(  object, bytes)
  when 'int32'   then write_int32(  object, bytes)
  when 'int64'   then write_int64(  object, bytes)
  when 'float32' then write_float32(object, bytes)
  when 'float64' then write_float64(object, bytes)
  when 'date'    then write_date(   object, bytes)
  when 'bytes'   then write_bytes(  object, bytes)
  when 'boolean' then write_boolean(object, bytes)
  else raise ClientError, "unrecognised binary json schema: #{schema.inspect}"
  end
end
write_boolean(object, bytes)
click to toggle source
# File lib/em-voldemort/binary_json.rb, line 100
# Appends a one-byte boolean.
#
# @param object [true, false, nil] nil is written as BYTE_MIN_VAL, any other
#   truthy value as 1, false as 0
# @param bytes [String] output buffer, appended to in place
def write_boolean(object, bytes)
  encoded =
    if object.nil?
      [BYTE_MIN_VAL].pack('c')
    elsif object
      1.chr
    else
      0.chr
    end
  bytes << encoded
end
write_bytes(object, bytes)
click to toggle source
# File lib/em-voldemort/binary_json.rb, line 188
# Appends a length-prefixed run of bytes.
#
# The length prefix counts bytes, not characters, and the payload is appended
# as a binary copy so that multibyte text can be added to a buffer that already
# contains high bytes. The caller's string is left untouched.
#
# @param object [String, nil] payload; nil is written as the 16-bit value -1
# @param bytes [String] output buffer, appended to in place
def write_bytes(object, bytes)
  return write_int16(-1, bytes) if object.nil?

  raw = object.dup.force_encoding(Encoding::BINARY)
  write_length(raw.bytesize, bytes)
  bytes << raw
end
write_date(object, bytes)
click to toggle source
# File lib/em-voldemort/binary_json.rb, line 170
# Appends a date as a 64-bit integer.
#
# @param object [#to_f, nil] time value; written as (to_f * 1000) truncated to
#   an integer, or as the nil form of #write_int64 when nil
# @param bytes [String] output buffer, appended to in place
def write_date(object, bytes)
  millis = object.nil? ? nil : (object.to_f * 1000).to_i
  write_int64(millis, bytes)
end
write_float32(object, bytes)
click to toggle source
# File lib/em-voldemort/binary_json.rb, line 154
# Appends a big-endian single-precision float.
#
# @param object [Numeric, nil] value to write; nil is written as FLOAT_MIN_VAL
# @param bytes [String] output buffer, appended to in place
# @raise [ClientError] if object equals FLOAT_MIN_VAL, which is reserved for nil
def write_float32(object, bytes)
  raise ClientError, "Can't use #{FLOAT_MIN_VAL} because it is used to represent nil" if object == FLOAT_MIN_VAL

  encoded = [object || FLOAT_MIN_VAL].pack('g')
  bytes << encoded
end
write_float64(object, bytes)
click to toggle source
# File lib/em-voldemort/binary_json.rb, line 162
# Appends a big-endian double-precision float.
#
# @param object [Numeric, nil] value to write; nil is written as DOUBLE_MIN_VAL
# @param bytes [String] output buffer, appended to in place
# @raise [ClientError] if object equals DOUBLE_MIN_VAL, which is reserved for nil
def write_float64(object, bytes)
  raise ClientError, "Can't use #{DOUBLE_MIN_VAL} because it is used to represent nil" if object == DOUBLE_MIN_VAL

  encoded = [object || DOUBLE_MIN_VAL].pack('G')
  bytes << encoded
end
write_int16(object, bytes)
click to toggle source
# File lib/em-voldemort/binary_json.rb, line 124
# Appends a big-endian 16-bit integer.
#
# @param object [Integer, nil] value to write; nil is written as SHORT_MIN_VAL
# @param bytes [String] output buffer, appended to in place
# @raise [ClientError] unless SHORT_MIN_VAL < object <= SHORT_MAX_VAL
def write_int16(object, bytes)
  return bytes << [SHORT_MIN_VAL].pack('n') if object.nil?

  # SHORT_MIN_VAL itself is excluded because it stands in for nil
  unless object > SHORT_MIN_VAL && object <= SHORT_MAX_VAL
    raise ClientError, "value out of int16 range: #{object}"
  end
  bytes << [object].pack('n')
end
write_int32(object, bytes)
click to toggle source
# File lib/em-voldemort/binary_json.rb, line 134
# Appends a big-endian 32-bit integer.
#
# @param object [Integer, nil] value to write; nil is written as INT_MIN_VAL
# @param bytes [String] output buffer, appended to in place
# @raise [ClientError] unless INT_MIN_VAL < object <= INT_MAX_VAL
def write_int32(object, bytes)
  return bytes << [INT_MIN_VAL].pack('N') if object.nil?

  # INT_MIN_VAL itself is excluded because it stands in for nil
  unless object > INT_MIN_VAL && object <= INT_MAX_VAL
    raise ClientError, "value out of int32 range: #{object}"
  end
  bytes << [object].pack('N')
end
write_int64(object, bytes)
click to toggle source
# File lib/em-voldemort/binary_json.rb, line 144
# Appends a big-endian 64-bit integer as two 32-bit halves.
#
# @param object [Integer, nil] value to write; nil is written as the pair
#   [INT_MIN_VAL, 0] (presumably the LONG_MIN_VAL bit pattern that #read_int64
#   maps back to nil; confirm against the constant definitions)
# @param bytes [String] output buffer, appended to in place
# @raise [ClientError] unless LONG_MIN_VAL < object <= LONG_MAX_VAL
def write_int64(object, bytes)
  return bytes << [INT_MIN_VAL, 0].pack('NN') if object.nil?

  unless object > LONG_MIN_VAL && object <= LONG_MAX_VAL
    raise ClientError, "value out of int64 range: #{object}"
  end
  upper = object / 2**32
  lower = object % 2**32
  bytes << [upper, lower].pack('NN')
end
write_int8(object, bytes)
click to toggle source
# File lib/em-voldemort/binary_json.rb, line 114
# Appends a signed 8-bit integer.
#
# @param object [Integer, nil] value to write; nil is written as BYTE_MIN_VAL
# @param bytes [String] output buffer, appended to in place
# @raise [ClientError] unless BYTE_MIN_VAL < object <= BYTE_MAX_VAL
def write_int8(object, bytes)
  return bytes << [BYTE_MIN_VAL].pack('c') if object.nil?

  # BYTE_MIN_VAL itself is excluded because it stands in for nil
  unless object > BYTE_MIN_VAL && object <= BYTE_MAX_VAL
    raise ClientError, "value out of int8 range: #{object}"
  end
  bytes << [object].pack('c')
end
write_length(length, bytes)
click to toggle source
# File lib/em-voldemort/binary_json.rb, line 178
# Appends a variable-width length prefix.
#
# @param length [Integer] length to encode; written as one 16-bit word when
#   below SHORT_MAX_VAL, otherwise as a 32-bit word with the top two bits set
# @param bytes [String] output buffer, appended to in place
# @raise [ClientError] if length is not below STRING_MAX_LEN
def write_length(length, bytes)
  return bytes << [length].pack('n') if length < SHORT_MAX_VAL
  raise ClientError, 'string is too long to be serialized' unless length < STRING_MAX_LEN

  # 0xC0000000 sets the two high bits that #read_length masks off again
  bytes << [length | 0xC0000000].pack('N')
end
write_list(object, bytes, schema)
click to toggle source
# File lib/em-voldemort/binary_json.rb, line 218
# Appends a length-prefixed list, serializing every item with the single
# schema entry.
#
# @param object [Array, nil] items to write; nil is written as the 16-bit
#   value -1
# @param bytes [String] output buffer, appended to in place
# @param schema [Array] list schema; must contain exactly one element
# @raise [ClientError] if the schema does not have exactly one element
def write_list(object, bytes, schema)
  unless schema.length == 1
    raise ClientError, "Schema error: a list must have one item, unlike #{schema.inspect}"
  end
  return write_int16(-1, bytes) if object.nil?

  write_length(object.length, bytes)
  item_schema = schema.first
  object.each { |item| write(item, bytes, item_schema) }
end
write_map(object, bytes, schema)
click to toggle source
# File lib/em-voldemort/binary_json.rb, line 197
# Appends a map: a one-byte marker followed by the field values in sorted key
# order.
#
# @param object [Hash, nil] fields keyed by string or symbol; nil is written
#   as the single marker byte -1
# @param bytes [String] output buffer, appended to in place
# @param schema [Hash] field name => field schema
# @raise [ClientError] if the field count differs from the schema or a schema
#   field is missing from the object
def write_map(object, bytes, schema)
  return bytes << [-1].pack('c') if object.nil?

  bytes << [1].pack('c')
  unless object.size == schema.size
    raise ClientError, "Fields of object #{object.inspect} do not match schema #{schema.inspect}"
  end

  schema.sort.each do |key, value_type|
    # string keys take precedence over symbol keys
    name = [key.to_s, key.to_sym].find { |candidate| object.has_key?(candidate) }
    if name.nil?
      raise ClientError, "Object #{object.inspect} does not have #{key} field required by the schema"
    end
    write(object[name], bytes, value_type)
  end
end
write_string(object, bytes)
click to toggle source
# File lib/em-voldemort/binary_json.rb, line 110
# Appends a string by delegating to #write_bytes, so strings and raw bytes
# share one wire representation.
#
# @param object [String, nil] text to write
# @param bytes [String] output buffer, appended to in place
def write_string(object, bytes)
  write_bytes(object, bytes)
end