module Mspire::Mzml::Reader

Attributes

Public Instance Methods

get_default_data_processing_ids(io, index_list, lookback=300) click to toggle source

returns a hash keyed by :spectrum or :chromatogram that gives the id (aka ref) as a string.

# File lib/mspire/mzml/reader.rb, line 43
# Finds the defaultDataProcessingRef declared on each list element (e.g.
# <spectrumList ...>) by scanning the bytes that immediately precede the
# first indexed entry of that list.
#
# io         - IO-like object responding to bookmark, pos= and read
# index_list - responds to each_pair, yielding a list name and an index
#              whose first element is the byte offset of the first entry
# lookback   - maximum number of bytes before the first entry to scan
#
# Returns a hash keyed by list name giving the ref as a string (nil when no
# defaultDataProcessingRef is found inside the scanned window).  Lists with
# an empty index are left out of the hash.
def get_default_data_processing_ids(io, index_list, lookback=300)
  hash = {}
  index_list.each_pair do |name, index|
    # we cannot quickly retrieve a defaultDataProcessingRef unless there
    # is at least one spectrum/chromatogram to start with.  However, if
    # there is no spectrum/chromatogram, then the defaultDataProcessingRef
    # will not be needed either.
    next unless index.size > 0

    first_entry = index[0]
    # never seek before the start of the stream: a negative position raises
    # when the first entry sits closer than lookback bytes to the beginning
    window_start = [first_entry - lookback, 0].max
    io.bookmark do |bookmarked_io|
      bookmarked_io.pos = window_start
      window = bookmarked_io.read(first_entry - window_start)
      hash[name] = window[/<#{name}List.*defaultDataProcessingRef=['"](.*?)['"]/m, 1]
    end
  end
  hash
end
get_header_string(io) click to toggle source

Scraping off the header string first saves ~3 seconds when reading an 83M mzML file (even though we’re just handing an IO object to Nokogiri::XML::Document.parse and we are very careful not to parse too far).

# File lib/mspire/mzml/reader.rb, line 64
# Reads +io+ from its current position in fixed-size chunks and stops after
# the first chunk in which a "<spectrum" or "<chromatogram" tag start shows
# up (this also matches "<spectrumList" / "<chromatogramList").
#
# io - object responding to read(length); read must return nil at EOF
#
# Returns everything read so far as one string.  The string is cut at a
# chunk boundary, not at the matched tag; if no tag is found the whole
# remaining stream is returned.
def get_header_string(io)
  chunk_size = 2**12
  string = ''
  while chunk = io.read(chunk_size)
    # only re-scan the new chunk plus the 20 characters before it, which is
    # enough to catch a tag name split across a chunk boundary
    start_looking = [string.size - 20, 0].max
    string << chunk
    break if string[start_looking..-1] =~ /<(spectrum|chromatogram)/
  end
  string
end
read_header!(list_type_to_default_data_processing_id) click to toggle source

list_type_to_default_data_processing_id is a hash keyed by :spectrum or :chromatogram that gives the default data_processing_object for the SpectrumList and/or the ChromatogramList. This information cannot be obtained from the header string, so it must be obtained beforehand.

# File lib/mspire/mzml/reader.rb, line 81
# Parses the header portion of the document held in @io and fills in this
# object's id, cvs, referenceable_param_groups, file_description, samples,
# software_list, instrument_configurations, data_processing_list and run.
#
# list_type_to_default_data_processing_id - enumerable of
#   [type, data processing id] pairs; each id is looked up in the parsed
#   data processing list and stored in @link under
#   :"<type>_default_data_processing" before the run is built.
#
# Side effects: rewinds @io, resets @link, and reads from @io via
# get_header_string.
def read_header!(list_type_to_default_data_processing_id)
  @io.rewind

  string = get_header_string(@io)
  doc = Nokogiri::XML.parse(string, nil, @encoding, Mspire::Mzml::Parser::NOBLANKS)

  doc.remove_namespaces!
  mzml_n = doc.root
  # an indexed file wraps the real mzML element; step down to its first child
  if mzml_n.name == 'indexedmzML'
    mzml_n = mzml_n.child
  end

  @id = mzml_n[:id]

  # the first child of the mzML node is treated as the cv list
  # NOTE(review): relies on the element order of the input -- confirm
  cv_list_n = mzml_n.child
  self.cvs = cv_list_n.children.map do |cv_n|
    Mspire::Mzml::CV.from_xml(cv_n)
  end

  # get the file description node but deal with it after getting ref_hash
  file_description_n = cv_list_n.next

  xml_n = file_description_n.next

  # a hash of referenceable_param_groups indexed by id
  # (it is also handed to the from_xml calls below and accumulates the other
  # lookup tables: source files, samples, software, instrument
  # configurations, data processing, index list)
  @link = {}

  # the referenceableParamGroupList is only consumed when it is the node
  # right after the file description
  if xml_n.name == 'referenceableParamGroupList'
    self.referenceable_param_groups = xml_n.children.map do |rpg_n|
      Mspire::Mzml::ReferenceableParamGroup.from_xml(rpg_n) # <- no ref_hash (not made yet)
    end
    @link[:ref_hash] = self.referenceable_param_groups.index_by(&:id)
    xml_n = xml_n.next
  end

  # now we can set the file description because we have the ref_hash
  self.file_description = Mspire::Mzml::FileDescription.from_xml(file_description_n, @link)
  @link[:source_file_hash] = self.file_description.source_files.index_by(&:id)


  # walk the remaining sibling nodes in document order; each list registers
  # its id lookup table in @link so that later nodes can use it.  The loop
  # only ends when a 'run' node is reached.
  # NOTE(review): if no 'run' sibling is present, xml_n presumably becomes
  # nil and xml_n.name raises -- confirm
  loop do
    case xml_n.name
    when 'sampleList'
      self.samples = xml_n.children.map do |sample_n|
        Mspire::Mzml::Sample.from_xml(sample_n, @link)
      end
      @link[:sample_hash] = self.samples.index_by(&:id)
    when 'softwareList'  # required
      self.software_list = xml_n.children.map do |software_n|
        Mspire::Mzml::Software.from_xml(software_n, @link)
      end
      @link[:software_hash] = self.software_list.index_by(&:id)
    when 'instrumentConfigurationList'
      self.instrument_configurations = xml_n.children.map do |inst_config_n|
        Mspire::Mzml::InstrumentConfiguration.from_xml(inst_config_n, @link)
      end
      @link[:instrument_configuration_hash] = self.instrument_configurations.index_by(&:id)
    when 'dataProcessingList'
      self.data_processing_list = xml_n.children.map do |data_processing_n|
        Mspire::Mzml::DataProcessing.from_xml(data_processing_n, @link)
      end
      @link[:data_processing_hash] = self.data_processing_list.index_by(&:id)
    when 'run'
      @link[:index_list] = @index_list
      # resolve each default data processing id to its parsed object; this
      # reads @link[:data_processing_hash], so a dataProcessingList must
      # already have been seen
      list_type_to_default_data_processing_id.each do |type, process_id|
        @link["#{type}_default_data_processing".to_sym] = @link[:data_processing_hash][process_id]
      end
      self.run = Mspire::Mzml::Run.from_xml(@io, xml_n, @link)
      break
    end
    xml_n = xml_n.next
  end
end
set_from_xml_io!(xml_io) click to toggle source
# File lib/mspire/mzml/reader.rb, line 30
# Attaches this reader to +xml_io+ and loads the document metadata from it:
# stores the io in @io, the declared encoding in @encoding, the index list
# in @index_list, and then reads the header.
#
# xml_io - IO-like object; must respond to bookmark and readline
#          (presumably bookmark(true) yields the io positioned at its start
#          and restores the position afterwards -- confirm against the
#          bookmark implementation)
#
# Raises RuntimeError when the first line cannot be read (empty stream) or
# carries no encoding="..." attribute.
def set_from_xml_io!(xml_io)
  @io = xml_io
  declaration =
    begin
      @io.bookmark(true) { |io| io.readline.match(/encoding=["'](.*?)["']/) }
    rescue EOFError
      nil
    end
  # an empty stream and a first line without an encoding attribute are
  # reported the same way (the latter used to surface as a NoMethodError
  # on nil)
  unless declaration
    raise RuntimeError, "no encoding present in XML!  (Is this even an xml file?)"
  end
  @encoding = declaration[1]
  @index_list = Mspire::Mzml::IndexList.from_io(@io)
  read_header!( get_default_data_processing_ids(@io, @index_list) )
end