class MSAbundanceSim

Constants

DEFAULTS
VERSION

Public Class Methods

downshift(value, min_threshold, probability_threshold, random=rand()) click to toggle source

takes ‘value’ and subtracts rand(min_threshold..value) from it if ‘value’ is >= min_threshold AND a random number is less than the probability_threshold (i.e., a probability of 1 means downshifting will happen to all values).

# File lib/ms_abundance_sim.rb, line 83
def downshift(value, min_threshold, probability_threshold, random=rand())
  if (value >= min_threshold) && (random < probability_threshold)
    value - rand(min_threshold..value)
  else
    value
  end
end
get_fold_change(a_i, event_rate, max_abundance) click to toggle source
# File lib/ms_abundance_sim.rb, line 70
def get_fold_change(a_i, event_rate, max_abundance)
  raw_fc = inverse_transform_sample(event_rate)
  inv_share = (1.0-(a_i[0].to_f / max_abundance))
  norm_fc = raw_fc * inv_share
  trans = rand(0..inv_share)
  sign = [1,-1].sample
  return (norm_fc + trans) * sign
end
inverse_transform_sample(event_rate) click to toggle source

probability (y)

# File lib/ms_abundance_sim.rb, line 59
def inverse_transform_sample(event_rate)
  max = poisson(event_rate, event_rate)
  probability = rand * (max)
  list_of_num_occurrences = (0..event_rate*4).to_a.shuffle # attempt to capture the entire distribution without wasting tests
  list_of_num_occurrences.each do |num_occurences|
    p_num_occurences = poisson(event_rate, num_occurences)
    return num_occurences if probability < p_num_occurences
  end
  puts "Problem: Should have found a num_occurences in inverse_transform_sample"
end
poisson(event_rate, num_occurences) click to toggle source

event_rate (lambda) num_occurences (k)

# File lib/ms_abundance_sim.rb, line 54
def poisson(event_rate, num_occurences)
  return event_rate**num_occurences * Math.exp(-event_rate) / num_occurences.factorial.to_f
end
process_files(filenames, opts) click to toggle source

returns a Hash keyed by filenames and pointing to the output of process_file (a list of case and control filenames, keyed by ‘case’ and ‘control’)

# File lib/ms_abundance_sim.rb, line 45
def process_files(filenames, opts)
  outputs = filenames.map do |filename|
    MSAbundanceSim.new.process_file(filename, opts)
  end
  filenames.zip(outputs).to_h
end
sample_abundance(abundances, fold_change) click to toggle source
# File lib/ms_abundance_sim.rb, line 91
def sample_abundance(abundances, fold_change)
  abundance = 0.0
  if abundances.size > 1 # linear interpolation
    idx = [1..abundances.size-1].sample(1)
    next_idx = idx+1 < abundances.size ? idx+1 : abundances[0] # edge case: 2 abundances
    abundance = abundances[idx] + rand * (abundances[next_idx] - abundances[idx])
  else # sample w/variance
    if fold_change >= 0 # positive fold change
      abundance = abundances[0].to_f * fold_change + abundances[0]
    else # negative fold change
      fold_change *= -1
      abundance = abundances[0] / (2.0**fold_change)
    end
  end
  return abundance + [1,-1].sample * rand * 0.1 * abundance
end

Public Instance Methods

get_protein_entries(filename) click to toggle source
# File lib/ms_abundance_sim.rb, line 166
def get_protein_entries(filename)
  protein_entries = []
  IO.foreach(filename) do |line|
    line.chomp!
    if line[0] == ">"
      protein_entries << ProteinEntry.new(*parse_entry_line(line))
    else
      protein_entries.last.additional_lines << line
    end
  end
  protein_entries
end
parse_entry_line(line) click to toggle source

returns an array with the beginning part of the entry line (without the abundance (which begins with a ‘#’) and an array of abundances (all the numbers, returned as Floats, after the final ‘#’)

# File lib/ms_abundance_sim.rb, line 158
def parse_entry_line(line)
  octothorpe_index = line.rindex("#")
  entry_line_wo_abundance = line[0...octothorpe_index]
  abundance_str = line[(octothorpe_index+1)..-1]
  abundances = abundance_str.split(",").map(&:to_f).sort
  [entry_line_wo_abundance, abundances]
end
process_file(filename, opts) click to toggle source
# File lib/ms_abundance_sim.rb, line 109
def process_file(filename, opts)
  opts = DEFAULTS.merge(opts)

  basename = filename.chomp(File.extname(filename))

  protein_entries = get_protein_entries(filename)
  max_abundance = protein_entries.max_by {|entry| entry.abundances.last }.abundances.last

  # generate which proteins will be differentially expressed
  num_proteins_to_diff_express = (protein_entries.size * opts[:pct_diff_express]/100.0).to_i
  diff_expressed_ids = (0...protein_entries.size).to_a.sample(num_proteins_to_diff_express).to_set

  output = Hash.new {|hash, key| hash[key] = [] }
  case_and_controls_needed = ([:case] * opts[:num_case]) + ([:control] * opts[:num_control])
  case_and_controls_needed.each_with_index do |sample_type, sample_number| # for each sample
    outfilename = "#{basename}_#{sample_number}_#{sample_type}"
    output[sample_type] << outfilename
    File.open(outfilename, 'w') do |outfile|
      protein_entries.each_with_index do |protein_entry, idx|
        # put first line of fasta with simulated abundance
        variance = opts[
          if sample_type == :case && diff_expressed_ids.include?(idx)
            :case_variance
          else
            :control_variance
          end
        ]

        fold_change = MSAbundanceSim.get_fold_change(protein_entry.abundances, variance, max_abundance)

        downshift_params = %w(min_threshold probability_threshold)
          .map {|key| 'downshift_' + key }
          .map(&:to_sym)
          .map {|key| opts[key] }

        downshifted_fold_change = MSAbundanceSim.downshift(fold_change, *downshift_params)

        sample_abundance = MSAbundanceSim.sample_abundance(protein_entry.abundances, downshifted_fold_change)
        outfile.puts [protein_entry.entry_line_wo_abundance, sample_abundance].join(opts[:output_abundance_separator])
        outfile.puts protein_entry.additional_lines.join("\n")
      end
    end
  end
  output
end