class Ms::Ident::Peptide::Db

the object itself is a modified Hash. It is initialized with the database file and a protein array can be retrieved with the [] method given an amino acid sequence. All other methods are untested at this time and should be avoided!

Constants

DEFAULT_PEPTIDE_CENTRIC_DB
KEY_VALUE_DELIMITER
MAX_NUM_AA_EXPANSION
PROTEIN_DELIMITER
STANDARD_AA

the twenty standard amino acids

Public Class Methods

cmdline(argv) click to toggle source
# File lib/ms/ident/peptide/db.rb, line 24
def self.cmdline(argv)

  opt = {
    :remove_digestion_file => true,
    :enzyme => Ms::InSilico::Digester::TRYPSIN
  }
  opts = OptionParser.new do |op|
    op.banner = "usage: #{File.basename($0)} <file>.fasta ..."
    op.separator "output: "
    op.separator "    <file>.msd_clvg<missed_cleavages>.min_aaseq<min_length>.yml"
    op.separator "format:"
    op.separator "    PEPTIDE: ID1<tab>ID2<tab>ID3..."
    op.separator ""    
    op.separator "    Initiator Methionines - by default, will generate two peptides"
    op.separator "    for any peptide found at the N-termini starting with 'M'"
    op.separator "    (i.e., one with and one without the leading methionine)"
    op.separator ""
    op.on("--missed-cleavages <#{opt[:missed_cleavages]}>", Integer, "max num of missed cleavages") {|v| opt[:missed_cleavages] = v }
    op.on("--min-length <#{opt[:min_length]}>", Integer, "the minimum peptide aaseq length") {|v| opt[:min_length] = v }
    op.on("--no-cleaved-methionine", "does not cleave off initiator methionine") { opt[:cleave_initiator_methionine] = false }
    op.on("--no-expand-x", "don't enumerate aa 'X' possibilities") { opt[:expand_aa] = nil }
    op.on("-e", "--enzyme <name>", "enzyme for digestion") {|v| opt[:enzyme] = Ms::Insilico::Digester.const_get(v.upcase) }
    op.on("--list-enzymes", "lists approved enzymes and exits") do
      puts Ms::InSilico::Digester::ENZYMES.keys.join("\n")
      exit
    end
  end

  opts.parse!(argv)

  if argv.size == 0
    puts opts || exit
  end

  argv.map do |file|
    Ms::Ident::Peptide::Db.peptide_centric_db(file, opt)
  end
end
expand_peptides(peptide, expand_aa) click to toggle source

does combinatorial expansion of all letters requesting it. expand_aa is hash like: {‘X’=>STANDARD_AA} returns nil if there are more than MAX_NUM_AA_EXPANSION amino acids to be expanded returns an empty array if there is no expansion

# File lib/ms/ident/peptide/db.rb, line 151
def self.expand_peptides(peptide, expand_aa)
  letters_in_order = expand_aa.keys.sort
  index_and_key = []
  peptide.split('').each_with_index do |char,i| 
    if let_index = letters_in_order.index(char) 
      index_and_key << [i, letters_in_order[let_index]]
    end
  end
  if index_and_key.size > MAX_NUM_AA_EXPANSION
    return nil
  end
  to_expand = [peptide]
  index_and_key.each do |i,letter|
    new_peps = []
    while current_pep = to_expand.shift do
      new_peps << expand_aa[letter].map {|v| dp = current_pep.dup ; dp[i] = v ; dp }
    end
    to_expand = new_peps.flatten
  end
  to_expand
end
new(db_file) click to toggle source
# File lib/ms/ident/peptide/db.rb, line 173
def initialize(db_file)
  self.replace(YAML.load_file(db_file))
end
peptide_centric_db(fasta_file, opts={}) click to toggle source

writes a new file with the added ‘min_aaseq<Integer>’ creates a temporary digestion file that contains all peptides digesting with certain missed_cleavages (i.e., min_seq_length is not applied to this file but on the final peptide centric db) returns the full name of the written file.

# File lib/ms/ident/peptide/db.rb, line 68
def self.peptide_centric_db(fasta_file, opts={})
  opts = DEFAULT_PEPTIDE_CENTRIC_DB.merge(opts)

  (missed_cleavages, min_length, enzyme, id_regexp, remove_digestion_file, cleave_initiator_methionine, expand_aa) = opts.values_at(:missed_cleavages, :min_length, :enzyme, :id_regexp, :remove_digestion_file, :cleave_initiator_methionine, :expand_aa) 
  start_time = Time.now
  print "Digesting #{fasta_file} ..." if $VERBOSE

  if expand_aa
    letters_to_expand_re = Regexp.new("[" << Regexp.escape(expand_aa.keys.join) << "]")
  end

  base = fasta_file.chomp(File.extname(fasta_file))
  digestion_file = base + ".msd_clvg#{missed_cleavages}.peptides"
  File.open(digestion_file, "w") do |fh|
    Ms::Fasta.open(fasta_file) do |fasta|
      fasta.each do |prot|
        peptides = enzyme.digest(prot.sequence, missed_cleavages)
        if (cleave_initiator_methionine && (prot.sequence[0,1] == "M"))
          m_peps = []
          init_methionine_peps = []
          peptides.each do |pep|
            # if the peptide is at the beginning of the protein sequence
            if prot.sequence[0,pep.size] == pep
              m_peps << pep[1..-1]
            end
          end
          peptides.push(*m_peps)
        end
        if expand_aa
          peptides = peptides.map do |pep|
            if pep =~ letters_to_expand_re
              expand_peptides(pep, expand_aa)
            else
              pep
            end
          end.flatten
        end
        fh.puts( prot.header.split(/\s+/).first + "\t" + peptides.join(" ") )
      end
    end
  end
  puts "#{Time.now - start_time} sec" if $VERBOSE


  start_time = Time.now
  print "Organizing raw digestion #{digestion_file} ..." if $VERBOSE

  hash = Hash.new {|h,k| h[k] = [] }
  ::IO.foreach(digestion_file) do |line|
    (prot, *peps) = line.chomp!.split(/\s+/)
    # prot is something like this: "sp|P31946|1433B_HUMAN" in uniprot
    peps.each do |pep|
      if pep.size >= min_length
        hash[pep] << prot
      end
    end
  end
  puts "#{Time.now - start_time} sec" if $VERBOSE

  base = digestion_file.chomp(File.extname(digestion_file))
  final_outfile = base + ".min_aaseq#{min_length}" + ".yml"

  start_time = Time.now
  print "Writing results to #{} ..." if $VERBOSE

  File.open(final_outfile, 'w') do |out|
    hash.each do |k,v|
      out.puts( [k, v.join(PROTEIN_DELIMITER)].join(KEY_VALUE_DELIMITER) )
    end
  end
  puts "#{Time.now - start_time} sec" if $VERBOSE

  if remove_digestion_file
    File.unlink(digestion_file)
  end
  File.expand_path(final_outfile)
end

Public Instance Methods

[](key) click to toggle source

returns the protein id’s as an array

# File lib/ms/ident/peptide/db.rb, line 180
def [](key)
  old_bracket(key).chomp.split(PROTEIN_DELIMITER)
end
Also aliased as: old_bracket
old_bracket(key)
Alias for: []