class MachineLearningWorkbench::Compressor::VectorQuantization
Standard Vector Quantization
Attributes
Public Class Methods
# File lib/machine_learning_workbench/compressor/vector_quantization.rb, line 11 def initialize ncentrs:, dims:, vrange:, lrate:, simil_type: nil, encoding_type: nil, init_centr_vrange: nil, rseed: Random.new_seed @rng = Random.new rseed # TODO: RNG CURRENTLY NOT USED!! @dims = Array(dims) check_lrate lrate # hack: so that we can overload it in dlr_vq @lrate = lrate @simil_type = simil_type || raise("missing simil_type") @encoding_type = encoding_type || raise("missing encoding_type") @init_centr_vrange ||= vrange @vrange = case vrange when Array raise ArgumentError, "vrange size not 2: #{vrange}" unless vrange.size == 2 vrange.map &method(:Float) when Range [vrange.first, vrange.last].map &method(:Float) else raise ArgumentError, "vrange: unrecognized type: #{vrange.class}" end init_centrs nc: ncentrs @ntrains = [0]*ncentrs # per-centroid number of trainings @utility = NArray.zeros [code_size] # trace how 'useful' are centroids to encodings @ncodes = 0 end
Public Instance Methods
Verify lrate to be present and withing unit bounds As a separate method only so it can be overloaded in `DecayingLearningRateVQ`
# File lib/machine_learning_workbench/compressor/vector_quantization.rb, line 46 def check_lrate lrate raise ArgumentError, "Pass a `lrate` between 0 and 1" unless lrate&.between?(0,1) end
HACKKETY HACKKETY HACK (can't wait to refactor after the deadline)
# File lib/machine_learning_workbench/compressor/vector_quantization.rb, line 40 def code_size encoding_type == :sparse_coding_v1 ? 2*ncentrs : ncentrs end
Encode a vector tracks utility of centroids based on how much they contribute to encoding TODO: `encode = Encodings.const_get(type)` in initialize` NOTE: hashes of lambdas or modules cannot access ncodes and utility TODO: refactor anyway through `stats` object, this thing is getting out of hand
# File lib/machine_learning_workbench/compressor/vector_quantization.rb, line 86 def encode vec, type: encoding_type case type when :most_similar simils = similarities vec code = simils.max_index @ncodes += 1 @utility[code] += 1 code when :most_similar_ary simils = similarities vec code = simils.new_zeros code[simils.max_index] = 1 @ncodes += 1 @utility += code code when :ensemble simils = similarities vec code = simils tot = simils.sum tot = 1 if tot < 1e-5 # HACK: avoid division by zero contrib = code / tot @ncodes += 1 @utility += (contrib - utility) / ncodes # cumulative moving average code when :norm_ensemble simils = similarities vec tot = simils.sum # NOTE this actually makes a big discontinuity if the total is equal to zero. # Does that even ever happen? I guess only w/ reset img (zeros) as lone centroid. # Which after first gen is really useless and should just be dropped anyway... tot = 1 if tot < 1e-5 # HACK: avoid division by zero code = simils / tot @ncodes += 1 @utility += (code - utility) / ncodes # cumulative moving average code when :sparse_coding_v1 raise "requires centroids normalized to unit length!" @encoder = nil if @encoder&.shape&.first != centrs.shape.first # Danafar & Cuccu: compact form linear regression encoder @encoder ||= (centrs.dot centrs.transpose).invert.dot centrs raw_code = @encoder.dot(vec) # separate positive and negative features (NOTE: all features will be positive) # i.e. split[0...n] = max {0, raw[i]}; split[n...2*n] = max {0, -raw[i]} # TODO: cite Coates & Ng # TODO: optimize and remove redundant variables split_code = raw_code.concatenate(-raw_code) split_code[split_code<0] = 0 # normalize such that the code sums to 1 norm_code = split_code / split_code.sum # Danafar: drop to say 80% of info (à la pca) thold = 0.2 sparse_code = norm_code.dup sum = 0 # NOTE: the last element in the sort below has the highest contribution and # should NEVER be put to 0, even if it could contribute alone to 100% of the # total # NOTE: upon further study I disagree this represent information content unless # the centroids are unit vectors. So I'm commenting this implementation now, # together with the following, until I implement a switch to normalize the # centroids based on configuration. # BUG IN NARRAY SORT!! ruby-numo/numo-narray#97 # norm_code.sort_index[0...-1].each do |idx| norm_code.size.times.sort_by { |i| norm_code[i] }[0...-1].each do |idx| sparse_code[idx] = 0 sum += norm_code[idx] break if sum >= thold # we know the code's total is normalized to 1 and has no negatives end code = sparse_code / sparse_code.sum # re-normalize sum to 1 @ncodes += 1 @utility += (code - utility) / ncodes # cumulative moving average code when :sparse_coding_v2 # Cuccu & Danafar: incremental reconstruction encoding # turns out to be closely related to (Orthogonal) Matching Pursuit raise "requires centroids normalized to unit length!" # return centrs.dot vec # speed test for the rest of the system sparse_code = NArray.zeros code_size resid = vec # cap the number of non-zero elements in the code max_nonzero = [1,ncentrs/3].max max_nonzero.times do |i| # OPT: remove msc from centrs at each loop # the algorithm should work even without this opt because # we are working on the residuals each time simils = centrs.dot resid # BUG IN NARRAY SORT!! ruby-numo/numo-narray#97 # msc = simils.max_index simils = simils.to_a simils_abs = simils.map &:abs msc = simils_abs.index simils_abs.max # most similar centroid max_simil = simils[msc] # remember to distinguish here to use the pos/neg features trick sparse_code[msc] = max_simil reconstr = max_simil * centrs[msc, true] resid -= reconstr # puts "resid#{i} #{resid.abs.mean}" # if debug epsilon = 0.005 # print resid.abs.mean, ' ' # print sparse_code.to_a, ' ' break if resid.abs.mean <= epsilon end # should normalize sum to 1? code = sparse_code #/ sparse_code.sum # normalize sum to 1 @ncodes += 1 @utility += (code - utility) / ncodes # cumulative moving average code when :sparse_coding # Cuccu: Direct residual encoding # return centrs.dot vec # speed test for the rest of the system sparse_code = NArray.zeros code_size resid = vec # cap the number of non-zero elements in the code max_nonzero = [1,ncentrs/3].max max_nonzero.times do |i| # OPT: remove msc from centrs at each loop # the algorithm should work even without this opt because # we are working on the residuals each time diff = (centrs - resid).abs.sum(1) # BUG IN NARRAY SORT!! ruby-numo/numo-narray#97 # msc = diff.max_index diff = diff.to_a msc = diff.index diff.min # most similar centroid min_diff = diff[msc] # remember to distinguish here to use the pos/neg features trick sparse_code[msc] = 1 reconstr = centrs[msc, true] resid -= reconstr resid[(resid<0).where] = 0 # ignore artifacts introduced by the centroids in reconstruction # puts "resid#{i} #{resid.abs.mean}" # if debug epsilon = 0.005 # print resid.abs.mean, ' ' if $ngen == 2; exit if $ngen==3 # print sparse_code.to_a, ' ' if $ngen == 3; exit if $ngen==4 break if resid.abs.mean <= epsilon end code = sparse_code @ncodes += 1 @utility += (code - utility) / ncodes # cumulative moving average code else raise ArgumentError, "Unrecognized encode #{type}" end end
Initializes a list of centroids
# File lib/machine_learning_workbench/compressor/vector_quantization.rb, line 51 def init_centrs nc: ncentrs, base: nil, proport: nil @centrs = nc.times.map { new_centr base, proport }.to_na end
Returns index and similitude of most similar centroid to vector @return [Array<Integer, Float>] the index of the most similar centroid,
followed by the corresponding similarity
# File lib/machine_learning_workbench/compressor/vector_quantization.rb, line 290 def most_similar_centr vec simils = similarities vec max_idx = simils.max_index [max_idx, simils[max_idx]] end
# File lib/machine_learning_workbench/compressor/vector_quantization.rb, line 35 def ncentrs @centrs.shape.first end
Creates a new (random) centroid If a base is passed, this is meshed with the random centroid. This is done to facilitate distributing the training across centroids. TODO: USE RNG HERE!!
# File lib/machine_learning_workbench/compressor/vector_quantization.rb, line 59 def new_centr base=nil, proport=nil raise ArgumentError, "Either both or none" if base.nil? ^ proport.nil? # require 'pry'; binding.pry if base.nil? ^ proport.nil? ret = NArray.new(*dims).rand(*init_centr_vrange) ret = ret * (1-proport) + base * proport if base&&proport ret end
Per-pixel errors in reconstructing vector @return [NArray] residuals
# File lib/machine_learning_workbench/compressor/vector_quantization.rb, line 298 def reconstr_error vec, code: nil, type: encoding_type code ||= encode vec, type: type resid = vec - reconstruction(code, type: type) # we ignore the extra stuff coming from the centroids, # only care that everything in the obs is represented in centrs resid[resid<0] = 0 if encoding_type == :sparse_coding resid end
Reconstruct vector from its code (encoding)
# File lib/machine_learning_workbench/compressor/vector_quantization.rb, line 253 def reconstruction code, type: encoding_type case type when :most_similar centrs[code, true] when :most_similar_ary centrs[code.eq(1), true] when :ensemble # tot = code.reduce :+ # centrs.zip(code).map { |centr, contr| centr*contr/tot }.reduce :+ centrs.dot(code) / code.sum when :norm_ensemble centrs.dot code # centrs.zip(code).map { |centr, contr| centr*contr }.reduce :+ when :sparse_coding_v1 raise "requires normalized centroids!" reconstr_code = code[0...(code.size/2)] - code[(code.size/2)..-1] reconstr = centrs.transpose.dot reconstr_code when :sparse_coding_v2 raise "requires normalized centroids!" # BUG IN NARRAY DOT!! ruby-numo/numo-narray#99 # reconstr = code.dot centrs reconstr = code.expand_dims(0).dot centrs when :sparse_coding # the code is binary, so just sum over the corresponding centroids # note: sum, not mean, because of how it's used in reconstr_error reconstr = centrs[code.cast_to(Numo::Bit).where, true].sum(0) else raise ArgumentError, "unrecognized reconstruction type: #{type}" end end
Computes similarities between vector and all centroids
# File lib/machine_learning_workbench/compressor/vector_quantization.rb, line 73 def similarities vec, type: simil_type raise NotImplementedError if vec.shape.size > 1 raise "need to check since centrs is a NArray now" if type == :mse # simil_fn = SIMIL[type] || raise(ArgumentError, "Unrecognized simil #{type}") # centrs.map { |centr| simil_fn.call centr, vec } centrs.dot vec end
Train on vector list
# File lib/machine_learning_workbench/compressor/vector_quantization.rb, line 321 def train vec_lst, debug: false # Two ways here: # - Batch: canonical, centrs updated with each vec # - Parallel: could be parallel either on simils or on training (?) # Unsure on the correctness of either Parallel, let's stick with Batch vec_lst.each_with_index do |vec, i| trained_idx = train_one vec print '.' if debug @ntrains[trained_idx] += 1 if @ntrains end end
Train on one vector @return [Integer] index of trained centroid
# File lib/machine_learning_workbench/compressor/vector_quantization.rb, line 309 def train_one vec, eps: nil # NOTE: ignores epsilon if passed trg_idx, _simil = most_similar_centr(vec) # note: uhm that actually looks like a dot product... maybe faster? # `[c[i], vec].dot([1-lrate, lrate])` # norm_vec = vec / NLinalg.norm(vec) # centrs[trg_idx, true] = centrs[trg_idx, true] * (1-lrate) + norm_vec * lrate centrs[trg_idx, true] = centrs[trg_idx, true] * (1-lrate) + vec * lrate trg_idx end