class Mspire::Sequest::Srf

require 'mspire/sequest/pepxml'

Constants

Dta

total_num_possible_charge_states is not correct under version 3.5 (Bioworks 3.3.1). The unknown field is, well, unknown…

Out

# Struct describing one "out file" record of an SRF file. Members are listed
# in positional order; +hits+ holds the record's peptide hits.
Mspire::Sequest::Srf::Out = Struct.new(
  :first_scan,
  :last_scan,
  :charge,
  :num_hits,
  :computer,
  :date_time,
  :hits,
  :total_inten,
  :lowest_sp,
  :num_matched_peptides,
  :db_locus_count
)

Attributes

base_name[RW]

the base name of the file with no extension

base_name_noext[RW]

the base name of the file with no extension

base_name_noext=[RW]

the base name of the file with no extension

dta_files[RW]
filtered_by_precursor_mass_tolerance[RW]

a boolean to indicate if the results have been filtered by the sequest.params precursor mass tolerance

header[RW]
index[RW]

a parallel array to dta_files and out_files where each entry is:

first_scan, last_scan, charge
out_files[RW]
params[RW]
resident_dir[RW]

the directory the srf file was residing in when the filename was passed in. May not be available.

version[RW]

a String: 3.5, 3.3 or 3.2

Public Class Methods

get_sequest_params_and_finish_pos(filename) click to toggle source

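returns [params, finish_pos]: a Sequest::Params object and the IO position immediately after the params section, or [nil, nil] if no params section is found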

# File lib/mspire/sequest/srf.rb, line 53
# Finds the '[SEQUEST]' params section of an SRF file and parses it.
#
# Only the second half of the file is read and searched (the params section
# is expected to live there); the last occurrence of the marker is used.
#
# filename - path of the srf file to inspect
#
# Returns [params, finish_pos]: whatever Mspire::Sequest::Params#parse_io
# returns plus the handle position right after parsing, or [nil, nil] when
# the marker is not present in the second half of the file.
def self.get_sequest_params_and_finish_pos(filename)
  File.open(filename, 'rb') do |io|
    midpoint = io.stat.size / 2
    io.seek(midpoint)
    marker_offset = io.read.rindex('[SEQUEST]')
    next [nil, nil] if marker_offset.nil?

    # marker_offset is relative to the midpoint, so shift it back to an
    # absolute file position before handing the handle to the parser
    io.seek(marker_offset + midpoint)
    parsed = Mspire::Sequest::Params.new.parse_io(io)
    [parsed, io.pos]
  end
end
new(filename=nil, opts={}) click to toggle source

opts:

:filter_by_precursor_mass_tolerance => true | false (default true)
     this will filter by the sequest params prec tolerance as is
     typically done by the Bioworks software.

:read_pephits => true | false (default true)
     will attempt to read peptide hit information (equivalent to .out
     files), otherwise, just reads the dta information.
# File lib/mspire/sequest/srf.rb, line 92
# Sets up empty hit/dta/out collections and, when a filename is supplied,
# immediately loads it through from_file with the given opts.
def initialize(filename=nil, opts={})
  @peptide_hits, @dta_files, @out_files = [], [], []
  from_file(filename, opts) if filename
end

Public Instance Methods

dta_start_byte() click to toggle source
# File lib/mspire/sequest/srf.rb, line 75
# Byte offset at which the dta section is read for the current @version.
# Returns nil when @version is not one of '3.2', '3.3' or '3.5'.
def dta_start_byte
  case @version
  when '3.2' then 3260
  when '3.3', '3.5' then 3644
  end
end
filter_by_precursor_mass_tolerance!() click to toggle source
  1. updates the out_file’s list of hits based on passing peptide_hits (but not the original hit id; rank is implicit in array ordering)

  2. recalculates deltacn values completely if the number of hits changed (does not touch deltacn orig)

This can spoil proper protein -> peptide linkages. Mspire::Id::Search.merge! should be run after this method to ensure correct protein -> peptide linkages.

# File lib/mspire/sequest/srf.rb, line 110
# Removes, in place, every hit whose precursor mass error exceeds the
# tolerance given in params.
#
# params.peptide_mass_units selects how the tolerance is interpreted:
#   '0' - compared directly against the hit's absolute deltamass
#   '1' - tolerance is divided by 1000 first (milli units), then compared
#         against the absolute deltamass
#   anything else (including '2') - compared against the hit's absolute ppm
#
# For each out file that lost hits, deltacn values are recomputed through
# Out::Peptide.update_deltacns_from_xcorr and num_hits is refreshed. Out
# files whose hit list did not shrink are left untouched.
#
# Sets filtered_by_precursor_mass_tolerance to true. Returns self.
def filter_by_precursor_mass_tolerance!
  tolerance = params.peptide_mass_tolerance.to_f

  # Resolve the unit handling once rather than for every single hit.
  outside_tolerance =
    case params.peptide_mass_units
    when '0'
      lambda { |pep| pep.deltamass.abs > tolerance }
    when '1'
      amu_limit = tolerance / 1000
      lambda { |pep| pep.deltamass.abs > amu_limit }
    else
      lambda { |pep| pep.ppm.abs > tolerance }
    end

  self.filtered_by_precursor_mass_tolerance = true
  out_files.each do |out_file|
    hits = out_file.hits
    count_before = hits.size
    hits.reject! { |pep| outside_tolerance.call(pep) }
    next if hits.size == count_before

    # reject! already mutated the list; the writer call is kept in case the
    # out file's hits= does more than a plain assignment
    out_file.hits = hits
    Mspire::Sequest::Srf::Out::Peptide.update_deltacns_from_xcorr(hits)
    out_file.num_hits = hits.size
  end
  self
end
from_file(filename, opts) click to toggle source

returns self (or nil, without reading anything further, when the file contains no sequest params section). opts are the same as for ‘new’

# File lib/mspire/sequest/srf.rb, line 166
# Reads an SRF file into this object.
#
# filename - path to the srf file
# opts     - same options as for +new+:
#   :filter_by_precursor_mass_tolerance => true | false (default true)
#   :read_pephits => true | false (default true)
#
# Populates @resident_dir, @params, @header, @version, @base_name,
# @dta_files, @out_files and @index. For non-combined files read with
# :read_pephits, every peptide hit is additionally annotated with base name,
# scan range, charge, deltamass, ppm and a back-reference to this object,
# and the hits are then filtered by precursor mass tolerance when that
# option is enabled.
#
# Returns self, or nil (before the header is read) when the file has no
# sequest params section.
#
# Raises NotImplementedError for combined files when :read_pephits is false.
def from_file(filename, opts)
  @resident_dir = File.dirname(File.expand_path(filename))
  opts = { :filter_by_precursor_mass_tolerance => true, :read_pephits => true }.merge(opts)

  (@params, after_params_io_pos) = Mspire::Sequest::Srf.get_sequest_params_and_finish_pos(filename)
  return unless @params

  # When print_duplicate_references is 0 the srf file lists only one protein
  # per peptide, so downstream output will likewise contain a single protein
  # per peptide hit. (To capture all duplicate references the sequest
  # parameter 'print_duplicate_references' must be set to 100 or greater.)
  dup_refs_gt_0 = @params.print_duplicate_references.to_i != 0

  File.open(filename, 'rb') do |fh|
    @header = Mspire::Sequest::Srf::Header.from_io(fh)
    @version = @header.version

    unpack_35 = case @version
                when '3.2', '3.3'
                  false
                when '3.5'
                  true
                end

    if @header.combined
      @base_name = File.basename(filename, '.*')
      # I'm not sure why this is the case, but the reported number is too
      # big by one on the 2 files I've seen so far, so we will correct it here!
      @header.dta_gen.num_dta_files = @header.dta_gen.num_dta_files - 1
      if opts[:read_pephits] == false
        raise NotImplementedError, "on combined files must read everything right now!"
      end
      (@dta_files, @out_files) = read_dta_and_out_interleaved(fh, @header.num_dta_files, unpack_35, dup_refs_gt_0)
    else
      @base_name = @header.raw_filename.scan(/[\\\/]([^\\\/]+)\.RAW$/).first.first

      @dta_files = read_dta_files(fh, @header.num_dta_files, unpack_35)
      if opts[:read_pephits]
        # need the params file to know if the duplicate_references is set > 0
        raise NoSequestParamsError, "no sequest params info in srf file!\npass in path to sequest.params file" if @params.nil?
        @out_files = read_out_files(fh,@header.num_dta_files, unpack_35, dup_refs_gt_0)

        if fh.eof?
          # file appears to be an abortive run (no params in srf file);
          # still continuing...
          @params = nil
          @index = []
        end
      end
    end

    fh.pos = after_params_io_pos

    # This is very sensitive to the grab_params method in sequest params
    fh.read(12)  ## gap between last params entry and index

    @index = read_scan_index(fh,@header.num_dta_files)
  end

  ### UPDATE SOME THINGS:
  # give each hit a base_name, first_scan, last_scan
  if opts[:read_pephits] && !@header.combined
    @index.each_with_index do |ind,i|
      mass_measured = @dta_files[i][0]
      outfile = @out_files[i]
      outfile.first_scan = ind[0]
      outfile.last_scan = ind[1]
      outfile.charge = ind[2]

      pep_hits = outfile.hits
      @peptide_hits.push( *pep_hits )
      pep_hits.each do |pep_hit|
        pep_hit[15] = @base_name
        pep_hit[16] = ind[0]
        pep_hit[17] = ind[1]
        pep_hit[18] = ind[2]
        # add the deltamass
        pep_hit[12] = pep_hit[0] - mass_measured  # real - measured (deltamass)
        pep_hit[13] = 1.0e6 * pep_hit[12].abs / mass_measured ## ppm
        pep_hit[19] = self  ## link with the srf object
      end
    end

    # honor the documented :filter_by_precursor_mass_tolerance option
    # (previously the filter ran regardless of what the caller passed)
    filter_by_precursor_mass_tolerance! if params && opts[:filter_by_precursor_mass_tolerance]
  end

  self
end
protein_class() click to toggle source
# File lib/mspire/sequest/srf.rb, line 48
# Returns the class constant Mspire::Sequest::Srf::Out::Protein.
def protein_class
  Mspire::Sequest::Srf::Out::Protein
end
read_dta_and_out_interleaved(fh, num_files, unpack_35, dup_refs_gt_0) click to toggle source
# File lib/mspire/sequest/srf.rb, line 149
# Reads num_files dta/out record pairs that are stored alternately
# (dta, out, dta, out, ...) beginning at dta_start_byte.
#
# fh            - open IO positioned anywhere; it is repositioned here
# num_files     - number of dta/out pairs to read
# unpack_35     - passed through to Dta.from_io and Out.from_io
# dup_refs_gt_0 - passed through to Out.from_io
#
# Returns [dta_files, out_files], two arrays of equal length.
def read_dta_and_out_interleaved(fh, num_files, unpack_35, dup_refs_gt_0)
  fh.pos = dta_start_byte

  pairs = Array.new(num_files) do
    dta = Mspire::Sequest::Srf::Dta.from_io(fh, unpack_35)
    out = Mspire::Sequest::Srf::Out.from_io(fh, unpack_35, dup_refs_gt_0)
    [dta, out]
  end

  [pairs.map { |pair| pair[0] }, pairs.map { |pair| pair[1] }]
end
read_dta_files(fh, num_files, unpack_35) click to toggle source

returns an array of dta_files

# File lib/mspire/sequest/srf.rb, line 301
# Reads num_files consecutive dta records starting at dta_start_byte.
#
# fh        - open IO; it is repositioned to dta_start_byte
# num_files - number of dta records to read (the count is taken from this
#             argument, matching read_dta_and_out_interleaved, rather than
#             from the header)
# unpack_35 - passed through to Dta.from_io
#
# Returns an array of the objects produced by Dta.from_io. The IO is left
# positioned just past the last record read.
def read_dta_files(fh, num_files, unpack_35)
  fh.pos = dta_start_byte
  Array.new(num_files) { Mspire::Sequest::Srf::Dta.from_io(fh, unpack_35) }
end
read_out_files(fh,number_files, unpack_35, dup_refs_gt_0) click to toggle source

filehandle (fh) must be at the start of the outfiles. ‘read_dta_files’ will put the fh there.

# File lib/mspire/sequest/srf.rb, line 314
# Reads number_files consecutive out records from the current position of fh.
#
# The handle must already be at the start of the out records; no seek is
# performed here.
#
# fh            - open IO positioned at the first out record
# number_files  - number of out records to read (the count is taken from
#                 this argument rather than from the header)
# unpack_35     - passed through to Out.from_io
# dup_refs_gt_0 - passed through to Out.from_io
#
# Returns an array of the objects produced by Out.from_io.
def read_out_files(fh,number_files, unpack_35, dup_refs_gt_0)
  Array.new(number_files) do
    Mspire::Sequest::Srf::Out.from_io(fh, unpack_35, dup_refs_gt_0)
  end
end
read_scan_index(fh, num) click to toggle source

returns an index where each entry is [first_scan, last_scan, charge]

# File lib/mspire/sequest/srf.rb, line 280
# Reads the scan index: num fixed-size records from the current position.
#
# Each record occupies 24 bytes, of which only the leading three native
# unsigned ints are decoded ('III'); the rest of the record is skipped.
# NOTE(review): 'I' uses the host's native byte order and int size -
# confirm this matches the file format on every platform that matters.
#
# fh  - open IO positioned at the first index record
# num - number of records to read
#
# Returns an array of num entries, each [first_scan, last_scan, charge].
def read_scan_index(fh, num)
  ind_len = 24
  unpack_string = 'III'
  buffer = '0' * ind_len  # reusable 24 byte string to receive each record

  Array.new(num) do
    fh.read(ind_len, buffer)
    buffer.unpack(unpack_string)
  end
end