class Datasets::LIBSVM

Public Class Methods

new(name, note: nil, default_feature_value: 0) click to toggle source
Calls superclass method Datasets::Dataset::new
# File lib/datasets/libsvm.rb, line 34
# Prepares a LIBSVM dataset identified by +name+.
#
# @param name [String] dataset name, resolved with fetch_dataset_info
# @param note [String, nil] picks one of the dataset's files via
#   choose_file; nil selects the first file
# @param default_feature_value [Object] stored for #each, which uses it to
#   fill features that a row does not list
# @raise [ArgumentError] when the dataset name or the note is unavailable
def initialize(name, note: nil, default_feature_value: 0)
  super()

  # choose_file reads @libsvm_dataset_metadata, so the lookup comes first.
  @libsvm_dataset_metadata = fetch_dataset_info(name)
  @file = choose_file(note)
  @default_feature_value = default_feature_value

  # NOTE(review): @metadata is presumably created by the superclass
  # initializer invoked above -- confirm against Datasets::Dataset.
  normalized_name = normalize_name(name)
  @metadata.id = "libsvm-#{normalized_name}"
  @metadata.name = "LIBSVM dataset: #{name}"
  @metadata.url = "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/"
end

Public Instance Methods

each() { |record| ... } click to toggle source
# File lib/datasets/libsvm.rb, line 46
# Yields one Record per row of the selected data file.
#
# Each row is split on single spaces. The first field is the label (parsed
# by parse_label); every remaining non-nil field is an "index:value" pair
# whose index is 1-based. Features not listed in a row keep
# @default_feature_value.
#
# @yield [record] a Record built from the parsed label and feature list
# @return [Enumerator] when called without a block
def each
  return enum_for(:each) unless block_given?

  open_data do |input|
    feature_count = @libsvm_dataset_metadata.n_features
    CSV.new(input, col_sep: " ").each do |fields|
      label = parse_label(fields.shift)
      features = Array.new(feature_count, @default_feature_value)
      # Consecutive or trailing separators produce nil fields; skip them.
      fields.compact.each do |entry|
        position, raw_value = entry.split(":", 2)
        # Indices in the file are 1-based; the array is 0-based.
        features[Integer(position, 10) - 1] = parse_value(raw_value)
      end
      yield(Record.new(label, features))
    end
  end
end

Private Instance Methods

choose_file(note) click to toggle source
# File lib/datasets/libsvm.rb, line 82
# Selects which file of the dataset to read.
#
# @param note [String, nil] note of the wanted file; nil selects the first
#   file of the dataset
# @return [Object] the file entry whose +note+ equals the given note
# @raise [ArgumentError] when no file carries the requested note; the
#   message lists every non-nil note that is available
def choose_file(note)
  files = @libsvm_dataset_metadata.files
  return files.first if note.nil?

  available_notes = []
  # Plain iteration on purpose: using +find+ here would stop at the first
  # file whose block value is truthy (the result of <<), so a matching file
  # located after another noted file would never be reached.
  files.each do |file|
    return file if file.note == note
    available_notes << file.note if file.note
  end

  name = @libsvm_dataset_metadata.name
  message = "unavailable note: #{name}: #{note.inspect}: "
  message << "available notes: ["
  message << available_notes.collect(&:inspect).join(", ")
  message << "]"
  raise ArgumentError, message
end
fetch_dataset_info(name) click to toggle source
# File lib/datasets/libsvm.rb, line 66
# Looks up the entry for +name+ in a freshly built LIBSVMDatasetList.
#
# @param name [String] dataset name, compared with == against each entry
# @return [Object] the first list entry whose +name+ matches
# @raise [ArgumentError] when no entry matches; the message lists the names
#   of all entries that were examined
def fetch_dataset_info(name)
  seen_names = []
  LIBSVMDatasetList.new.each do |candidate|
    return candidate if candidate.name == name
    seen_names << candidate.name
  end

  listing = seen_names.map(&:inspect).join(", ")
  raise ArgumentError,
        "unavailable LIBSVM dataset: #{name.inspect}: " \
        "available datasets: [#{listing}]"
end
normalize_name(name) click to toggle source
# File lib/datasets/libsvm.rb, line 112
# Turns a dataset name into a lowercase, dash-separated identifier.
#
# Parentheses are dropped, then each run of spaces, underscores and
# semicolons becomes a single "-".
#
# @param name [String] dataset name
# @return [String] normalized name
def normalize_name(name)
  without_parens = name.delete("()")
  dashed = without_parens.gsub(/[ _;]+/, "-")
  dashed.downcase
end
open_data(&block) click to toggle source
# File lib/datasets/libsvm.rb, line 100
# Opens the selected data file and passes the input to the block.
#
# The file lives at cache_dir_path + @file.name and is downloaded from
# @file.url only when that path does not exist yet. Paths ending in ".bz2"
# are handed to extract_bz2; anything else is opened with File.open.
#
# @yield [input] whatever extract_bz2 or File.open passes to the block
# @return [Object] the result of extract_bz2 or File.open
def open_data(&block)
  data_path = cache_dir_path + @file.name
  download(data_path, @file.url) unless data_path.exist?

  return extract_bz2(data_path, &block) if data_path.extname == ".bz2"

  File.open(data_path, &block)
end
parse_label(label) click to toggle source
# File lib/datasets/libsvm.rb, line 116
# Parses the label field of a row.
#
# The field is split on "," and every part goes through parse_value.
#
# @param label [String] raw label field
# @return [Object, Array] the single parsed value when there is exactly one
#   part, otherwise the array of parsed values (empty for an empty field)
def parse_label(label)
  values = label.split(",").map { |raw| parse_value(raw) }
  values.size == 1 ? values.first : values
end
parse_value(value) click to toggle source
# File lib/datasets/libsvm.rb, line 127
# Converts a numeric token into an Integer or a Float.
#
# A token containing a decimal point or an exponent marker ("e"/"E") is
# parsed as a Float; any other token is parsed as a base-10 Integer.
# Exponent notation without a decimal point (e.g. "1e-05") is therefore a
# Float rather than an Integer() failure.
#
# @param value [String] numeric token
# @return [Integer, Float] parsed number
# @raise [ArgumentError] when the token is not a valid number
def parse_value(value)
  # "e"/"E" can never be part of a valid base-10 integer literal, so
  # routing those tokens to Float() only affects input that used to raise.
  if value.match?(/[.eE]/)
    Float(value)
  else
    Integer(value, 10)
  end
end