class MSAbundanceSim
Constants
- DEFAULTS
- VERSION
Public Class Methods
downshift(value, min_threshold, probability_threshold, random=rand())
click to toggle source
takes ‘value’ and subtracts rand(min_threshold..value) from it if ‘value’ is >= min_threshold AND a random number is less than the probability_threshold (i.e., a probability of 1 means downshifting will happen to all values).
# File lib/ms_abundance_sim.rb, line 83 def downshift(value, min_threshold, probability_threshold, random=rand()) if (value >= min_threshold) && (random < probability_threshold) value - rand(min_threshold..value) else value end end
get_fold_change(a_i, event_rate, max_abundance)
click to toggle source
# File lib/ms_abundance_sim.rb, line 70 def get_fold_change(a_i, event_rate, max_abundance) raw_fc = inverse_transform_sample(event_rate) inv_share = (1.0-(a_i[0].to_f / max_abundance)) norm_fc = raw_fc * inv_share trans = rand(0..inv_share) sign = [1,-1].sample return (norm_fc + trans) * sign end
inverse_transform_sample(event_rate)
click to toggle source
probability (y)
# File lib/ms_abundance_sim.rb, line 59 def inverse_transform_sample(event_rate) max = poisson(event_rate, event_rate) probability = rand * (max) list_of_num_occurrences = (0..event_rate*4).to_a.shuffle # attempt to capture the entire distribution without wasting tests list_of_num_occurrences.each do |num_occurences| p_num_occurences = poisson(event_rate, num_occurences) return num_occurences if probability < p_num_occurences end puts "Problem: Should have found a num_occurences in inverse_transform_sample" end
poisson(event_rate, num_occurences)
click to toggle source
event_rate (lambda) num_occurences (k)
# File lib/ms_abundance_sim.rb, line 54 def poisson(event_rate, num_occurences) return event_rate**num_occurences * Math.exp(-event_rate) / num_occurences.factorial.to_f end
process_files(filenames, opts)
click to toggle source
returns a Hash keyed by filenames and pointing to the output of process_file
(a list of case and control filenames, keyed by ‘case’ and ‘control’)
# File lib/ms_abundance_sim.rb, line 45 def process_files(filenames, opts) outputs = filenames.map do |filename| MSAbundanceSim.new.process_file(filename, opts) end filenames.zip(outputs).to_h end
sample_abundance(abundances, fold_change)
click to toggle source
# File lib/ms_abundance_sim.rb, line 91 def sample_abundance(abundances, fold_change) abundance = 0.0 if abundances.size > 1 # linear interpolation idx = [1..abundances.size-1].sample(1) next_idx = idx+1 < abundances.size ? idx+1 : abundances[0] # edge case: 2 abundances abundance = abundances[idx] + rand * (abundances[next_idx] - abundances[idx]) else # sample w/variance if fold_change >= 0 # positive fold change abundance = abundances[0].to_f * fold_change + abundances[0] else # negative fold change fold_change *= -1 abundance = abundances[0] / (2.0**fold_change) end end return abundance + [1,-1].sample * rand * 0.1 * abundance end
Public Instance Methods
get_protein_entries(filename)
click to toggle source
# File lib/ms_abundance_sim.rb, line 166 def get_protein_entries(filename) protein_entries = [] IO.foreach(filename) do |line| line.chomp! if line[0] == ">" protein_entries << ProteinEntry.new(*parse_entry_line(line)) else protein_entries.last.additional_lines << line end end protein_entries end
parse_entry_line(line)
click to toggle source
returns an array with the beginning part of the entry line (without the abundance (which begins with a ‘#’) and an array of abundances (all the numbers, returned as Floats, after the final ‘#’)
# File lib/ms_abundance_sim.rb, line 158 def parse_entry_line(line) octothorpe_index = line.rindex("#") entry_line_wo_abundance = line[0...octothorpe_index] abundance_str = line[(octothorpe_index+1)..-1] abundances = abundance_str.split(",").map(&:to_f).sort [entry_line_wo_abundance, abundances] end
process_file(filename, opts)
click to toggle source
# File lib/ms_abundance_sim.rb, line 109 def process_file(filename, opts) opts = DEFAULTS.merge(opts) basename = filename.chomp(File.extname(filename)) protein_entries = get_protein_entries(filename) max_abundance = protein_entries.max_by {|entry| entry.abundances.last }.abundances.last # generate which proteins will be differentially expressed num_proteins_to_diff_express = (protein_entries.size * opts[:pct_diff_express]/100.0).to_i diff_expressed_ids = (0...protein_entries.size).to_a.sample(num_proteins_to_diff_express).to_set output = Hash.new {|hash, key| hash[key] = [] } case_and_controls_needed = ([:case] * opts[:num_case]) + ([:control] * opts[:num_control]) case_and_controls_needed.each_with_index do |sample_type, sample_number| # for each sample outfilename = "#{basename}_#{sample_number}_#{sample_type}" output[sample_type] << outfilename File.open(outfilename, 'w') do |outfile| protein_entries.each_with_index do |protein_entry, idx| # put first line of fasta with simulated abundance variance = opts[ if sample_type == :case && diff_expressed_ids.include?(idx) :case_variance else :control_variance end ] fold_change = MSAbundanceSim.get_fold_change(protein_entry.abundances, variance, max_abundance) downshift_params = %w(min_threshold probability_threshold) .map {|key| 'downshift_' + key } .map(&:to_sym) .map {|key| opts[key] } downshifted_fold_change = MSAbundanceSim.downshift(fold_change, *downshift_params) sample_abundance = MSAbundanceSim.sample_abundance(protein_entry.abundances, downshifted_fold_change) outfile.puts [protein_entry.entry_line_wo_abundance, sample_abundance].join(opts[:output_abundance_separator]) outfile.puts protein_entry.additional_lines.join("\n") end end end output end