class Bio::PepXML

Attributes

peptide_name_to_object[RW]
protein_name_to_object[RW]

Public Class Methods

log()
# File lib/pep_xml.rb, line 36
# Class-level access to the logger.
#
# Builds a fresh Bio::PepXML instance on every call and returns whatever
# that instance's #log method returns.
def self.log
  pepxml = Bio::PepXML.new
  pepxml.log
end
parse(io)
# File lib/pep_xml.rb, line 40
# Parses a pepXML document into linked protein and peptide (spectrum) objects.
#
# Only <search_hit> elements with hit_rank="1" are used. For each such hit the
# enclosing spectrum is recorded as a Peptide, the hit's own protein and every
# child element carrying a protein_descr attribute (the alternate proteins)
# are recorded as Proteins, and the two sides are linked to each other.
# Each protein/spectrum pair is linked at most once, even when several rank-1
# hits (e.g. leucine/isoleucine variants) point at the same protein.
#
# @param io passed directly to REXML::Document.new
# @return [Bio::PepXML] with protein_name_to_object and peptide_name_to_object
#   populated (hashes keyed by protein name and spectrum name respectively)
# @raise [RuntimeError] if a search hit has no hit_rank attribute, if no
#   protein name can be derived for a rank-1 hit, or if the enclosing
#   spectrum_query has no spectrum attribute
def self.parse(io)
  protein_name_to_object = {}
  peptide_name_to_object = {}

  xml = REXML::Document.new(io)

  # Derives [name, description] from an element's protein / protein_descr
  # attributes. Missing attributes are treated as empty strings so that the
  # caller can report a meaningful error instead of hitting nil.strip.
  parse_name_and_description = lambda do |element|
    name = element.attributes['protein'].to_s.strip
    description = element.attributes['protein_descr'].to_s.strip
    if name.empty?
      # No explicit name: fall back to the description as the name.
      name = description
    else
      description = "#{name} #{description}"
    end

    # Anything after a tab is not part of the name; the description is
    # flattened onto a single tab-free line.
    [name.gsub(/\t.*/, ''), description.gsub(/[\t\n]/, ' ')]
  end

  # Returns the Protein registered under +name+, creating it on first sight.
  find_or_create_protein = lambda do |name, description|
    protein_name_to_object[name] ||= Protein.new.tap do |protein|
      protein.identifier = name
      protein.descriptive_name = description
      protein.peptides = []
    end
  end

  # Links a protein and a spectrum in both directions, exactly once.
  link = lambda do |protein, peptide|
    next if peptide.parent_proteins.include?(protein)

    protein.peptides.push(peptide)
    peptide.parent_proteins.push(protein)
  end

  # TODO: some better sanity checking here would be ideal.
  num_hits_parsed = 0
  xml.elements.each('msms_pipeline_analysis/msms_run_summary/spectrum_query/search_result/search_hit') do |hit|
    hit_number = hit.attributes['hit_rank']
    raise "Parsing error on #{hit}" if hit_number.nil?
    next unless hit_number == '1'

    # Parse the primary hit
    name1, description1 = parse_name_and_description.call(hit)
    raise "No protein name found in this xml fragment: #{hit}" if name1.empty?

    # search_hit -> search_result -> spectrum_query
    spectrum_name = hit.parent.parent.attributes['spectrum']
    raise "Parsing error (couldn't find spectrum name) with spectra #{hit.inspect}" if spectrum_name.nil?

    spectrum_name = spectrum_name.strip

    # It is possible to have multiple peptides both hit the spectrum with
    # hit_rank="1". This happens when e.g. leucine and isoleucine are possible.
    spectrum = peptide_name_to_object[spectrum_name]
    if spectrum.nil?
      spectrum = Peptide.new
      spectrum.identifier = spectrum_name
      peptide_name_to_object[spectrum_name] = spectrum
    end
    spectrum.parent_proteins ||= []

    link.call(find_or_create_protein.call(name1, description1), spectrum)

    # Parse the alternate hits. Only look at children with protein_descr
    # attributes - these are the alternate proteins.
    hit.each_element_with_attribute('protein_descr') do |alternate|
      name, description = parse_name_and_description.call(alternate)
      link.call(find_or_create_protein.call(name, description), spectrum)
    end

    num_hits_parsed += 1
  end
  log.info "Parsed #{num_hits_parsed} search hits"

  pepxml = Bio::PepXML.new
  pepxml.protein_name_to_object = protein_name_to_object
  pepxml.peptide_name_to_object = peptide_name_to_object
  pepxml
end