class DGaffML::Dataset
Attributes
dataset[RW]
Public Class Methods
new(client, dataset_response)
click to toggle source
# File lib/dgaff_ml/dataset.rb, line 4
#
# Builds a dataset wrapper around a client and a dataset response.
#
# client           - stored in @client; other methods call #predict and
#                    #export_model on it.
# dataset_response - stored in @dataset; must respond to #[] with the
#                    String keys "id" and "user_id".
def initialize(client, dataset_response)
  @client, @dataset = client, dataset_response
  # Cache the identifiers so later calls do not have to dig into the response.
  @dataset_id = dataset_response["id"]
  @user_id = dataset_response["user_id"]
end
Public Instance Methods
cast_val(value, directive)
click to toggle source
# File lib/dgaff_ml/dataset.rb, line 94
#
# Casts one raw observation value according to its column directive.
#
# value     - the raw cell value. The "Time" branch calls #length and #scan
#             on it, so it is expected to be a String there.
# directive - the column class name: "Integer", "Float", "Time", "Text",
#             "Phrase" or "Categorical".
#
# Returns:
#   "Integer"          -> value.to_i
#   "Float"            -> value.to_f
#   "Time"             -> Integer seconds since the epoch
#   "Text" / "Phrase"  -> Array of stemmed words from the cleaned string
#   "Categorical"      -> value unchanged
#   anything else      -> nil
def cast_val(value, directive)
  case directive
  when "Integer"
    value.to_i
  when "Float"
    value.to_f
  when "Time"
    # A string made only of digits, 10 or 13 characters long, is treated as
    # an epoch timestamp rather than a date to be parsed.
    epoch_like = [10, 13].include?(value.length) &&
                 value.scan(/\d/).count == value.length
    if epoch_like
      # Time.at raises TypeError when handed a String, so convert the digit
      # string to an Integer first.
      # NOTE(review): 13-digit values look like millisecond timestamps but
      # are not rescaled here, matching the original branch -- confirm
      # against whatever produced the pipeline's min/max for this column.
      Time.at(value.to_i).to_i
    else
      # Chronic.parse returns nil when it cannot parse; nil.to_i is 0.
      Chronic.parse(value).to_i
    end
  when "Text", "Phrase"
    clean_str(value).split(" ").collect { |word| Stemmer::stem_word(word) }
  when "Categorical"
    value
  end
end
clean_str(string)
click to toggle source
# File lib/dgaff_ml/dataset.rb, line 67
#
# Normalises free text before it is split into words.
#
# The substitutions below are applied strictly in order, each one to the
# result of the previous one; the final string is stripped and downcased.
#
# string - the text to clean (must respond to #gsub).
#
# Returns a new String.
#
# NOTE(review): the rules [" ", " "] and ["", ""] are no-ops as written;
# they may have lost a character (e.g. a double or non-breaking space)
# somewhere upstream -- confirm against the real source file.
# NOTE(review): the URL rule runs after the first rule has already turned
# ":" and "/" into spaces, so "://" can no longer occur by then.
def clean_str(string)
  substitutions = [
    [/[^A-Za-z0-9(),!?\'\`]/, " "],
    [" ", " "],
    ["\'s", " \'s"],
    ["", ""],
    ["\'ve", " \'ve"],
    ["n\'t", " n\'t"],
    ["\'re", " \'re"],
    ["\'d", " \'d"],
    ["\'ll", " \'ll"],
    [",", " , "],
    ["!", " ! "],
    ["\(", " \\( "],
    ["\)", " \\) "],
    [" \\\( \\\( \\\( ", " \(\(\( "],
    [" \\\) \\\) \\\) ", " \)\)\) "],
    ["\?", " \? "],
    [/\s{2,}/, " "],
    [Regexp.new("http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"), "<URL/>"],
    ["www", " "],
    ["com", " "],
    ["org", " "]
  ]

  rewritten = substitutions.inject(string) do |text, (pattern, replacement)|
    text.gsub(pattern, replacement)
  end

  rewritten.strip.downcase
end
convert(rows, dataset_keys, dataset_classes)
click to toggle source
# File lib/dgaff_ml/dataset.rb, line 34
#
# Turns already-cast observation rows into purely numeric feature rows.
#
# rows            - Array of rows; every row has one entry per column.
# dataset_keys    - per-column keys into dataset["conversion_pipeline"].
# dataset_classes - per-column class names, parallel to dataset_keys.
#
# Each input column expands into one or more feature columns:
#   "Phrase" / "Text" - one column per entry of the pipeline's
#                       "unique_terms", holding cell.count(term); a nil cell
#                       is counted as [""].
#   "Categorical"     - one column holding the position of cell.to_s in
#                       "unique_terms" (nil when it is not listed).
#   anything else     - the values with nil replaced by the pipeline
#                       "average", then (only if max - min > 0) the
#                       min-max scaled values, then (only if stdev > 0) the
#                       average-centred values divided by stdev, then the
#                       absolute values.
#
# Returns the feature columns transposed back into rows.
def convert(rows, dataset_keys, dataset_classes)
  feature_columns = rows.transpose.each_with_index.flat_map do |column, idx|
    pipeline = dataset["conversion_pipeline"][dataset_keys[idx]]

    case dataset_classes[idx]
    when "Phrase", "Text"
      pipeline["unique_terms"].map do |term|
        column.map { |tokens| (tokens.nil? ? [tokens.to_s] : tokens).count(term) }
      end
    when "Categorical"
      [column.map { |val| pipeline["unique_terms"].index(val.to_s) }]
    else
      filled = column.map { |val| val || pipeline["average"] }
      spread = pipeline["max"] - pipeline["min"]

      derived = [filled]
      derived << filled.map { |val| (val - pipeline["min"]).to_f / spread } if spread > 0
      if pipeline["stdev"] > 0
        derived << filled.map { |val| (val - pipeline["average"]).to_f / pipeline["stdev"] }
      end
      derived << filled.map { |val| val.abs }
      derived
    end
  end

  feature_columns.transpose
end
export_model()
click to toggle source
# File lib/dgaff_ml/dataset.rb, line 114 def export_model @client.export_model(@dataset_id) end
predict(obs)
click to toggle source
# File lib/dgaff_ml/dataset.rb, line 11
#
# Requests predictions for a batch of raw observations.
#
# obs - the raw observations; they are passed through translate_obs before
#       being sent to @client.predict together with @dataset_id.
#
# Returns the client's predictions. When the conversion pipeline has a
# "label" entry, each prediction is first looked up in that entry.
def predict(obs)
  raw_predictions = @client.predict(@dataset_id, translate_obs(obs))

  pipeline = dataset["conversion_pipeline"]
  return raw_predictions unless pipeline.key?("label")

  label_lookup = pipeline["label"]
  raw_predictions.map { |prediction| label_lookup[prediction] }
end
translate_obs(obs)
click to toggle source
# File lib/dgaff_ml/dataset.rb, line 20
#
# Casts raw observations and converts them into feature rows.
#
# obs - a collection of rows, each a list of raw cell values.
#
# The column keys are the conversion-pipeline keys other than "label" and
# "internal_headers", ordered by their integer value. Each key's class is
# read from dataset["col_classes"] at that integer index. Cells are cast
# with cast_val using the class at the cell's position in the row.
#
# Returns the result of convert for the cast rows.
def translate_obs(obs)
  pipeline_keys = dataset["conversion_pipeline"].keys - ["label", "internal_headers"]
  dataset_keys = pipeline_keys.sort_by(&:to_i)
  dataset_classes = dataset_keys.map { |key| dataset["col_classes"][key.to_i] }

  translated_rows = obs.map do |row|
    row.each_with_index.map { |cell, position| cast_val(cell, dataset_classes[position]) }
  end

  convert(translated_rows, dataset_keys, dataset_classes)
end