class SigFil::StatisticalOutlierRemoval

Constants

SEARCHER

Attributes

dataset[RW]
mean_k[RW]
searcher[RW]
std_mul[RW]

Public Class Methods

new(dataset, mean_k = 2, std_mul = 0.0, searcher = :kdtree) click to toggle source

mean_k: Number of points to use for mean distance estimation. std_mul: Standard deviation multiplier for the threshold.

# File lib/sigfil/statistical_outlier_removal.rb, line 16
# Stores the filter configuration.
#
# dataset::  kept as-is in @dataset
# mean_k::   number of points to use for mean distance estimation
# std_mul::  standard deviation multiplier for the threshold
# searcher:: neighbour-search backend; must be listed in SEARCHER
#
# Raises ArgumentError when +searcher+ is not included in SEARCHER.
def initialize(dataset, mean_k = 2, std_mul = 0.0, searcher = :kdtree)
  @dataset = dataset
  @mean_k = mean_k
  @std_mul = std_mul

  # Validate after the plain assignments, then record the backend.
  raise ArgumentError, "Unknown searcher type: #{searcher}" unless SEARCHER.include?(searcher)

  @searcher = searcher
end

Public Instance Methods

apply_filter(scale_factors = nil) click to toggle source
# File lib/sigfil/statistical_outlier_removal.rb, line 27
# Removes statistical outliers from @dataset.
#
# For every row, the distances to its <tt>@mean_k + 1</tt> nearest neighbours
# are looked up, the first hit is dropped (presumably the query point itself)
# and the remaining distances are averaged. Rows whose average is at most
# <tt>mean + @std_mul * std</tt> of all averages are kept.
#
# scale_factors:: optional per-column multipliers applied to a copy of the
#                 dataset before the neighbour search; must have exactly
#                 one entry per dataset column.
#
# Returns an NMatrix built from the kept rows of the original (unscaled)
# @dataset.
#
# Raises ArgumentError when scale_factors.size differs from dataset.cols.
def apply_filter(scale_factors = nil)
  Flann.set_distance_type!(:l2) if @searcher == :flann

  dataset = @dataset
  if scale_factors
    if scale_factors.size != @dataset.cols
      raise ArgumentError, "scale_factors.size != dataset.cols"
    end

    # Scale a copy so @dataset itself is never modified.
    dataset = @dataset.clone
    scale_factors.each_with_index do |s, i|
      dataset[0..-1, i] *= s
    end
  end

  case @searcher
  when :flann
    # Index the (possibly scaled) working copy. Indexing @dataset here would
    # make the scale factors have no effect on the FLANN search.
    searcher = Flann::Index.new(dataset) do |params|
      params[:algorithm]    = :kdtree
      params[:trees]        = 4
      params[:centers_init] = :gonzales
    end
    searcher.build!
  when :kdtree
    dataset_h = dataset.to_a.each_with_index.map { |pt, i| [i, pt] }.to_h
    searcher  = Containers::KDTree.new(dataset_h)
  end

  distances = Array.new(dataset.rows, 0.0)
  dataset.each_row(:clone).with_index do |row, iii|
    case @searcher
    when :flann
      # NOTE(review): these distances are used as returned, while the kdtree
      # branch takes a square root -- confirm both backends end up in the
      # same unit.
      _, dis = searcher.nearest_neighbors(row, @mean_k + 1)
    when :kdtree
      # r[0] is the distance component of each hit (presumably squared --
      # confirm against Containers::KDTree). Math.sqrt is called explicitly
      # so this does not rely on Math being mixed into the class.
      dis = searcher.find_nearest(row.to_a, @mean_k + 1).map do |r|
        Math.sqrt(r[0])
      end
    end

    # Skip the first neighbour and average the rest.
    distances[iii] = NMatrix[*dis[1..-1]].mean[0]
  end

  nm_d   = NMatrix[*distances]
  d_mean = nm_d.mean[0]
  d_std  = nm_d.std[0]
  d_th   = d_mean + @std_mul * d_std

  # Rows are taken from @dataset so the result is in the original scale.
  kept = distances.each_index.select { |i| distances[i] <= d_th }
  filtered = kept.map { |i| @dataset.row(i).to_a }

  NMatrix[*filtered]
end