class Bio::HMMER::HMMER3::DefaultHMMSearchReport

This class is for parsing HMMSearch outputs from the default output

Constants

DELIMITER

Delimiter of each entry for Bio::FlatFile support.

Public Class Methods

new(data) click to toggle source
# File lib/bio/appl/hmmer/hmmer3/default_report.rb, line 30
def initialize(data)
  # The input data is divided into chunks, a hash called @report_chunks (previously called subdata)
  @report_chunks = get_subdata(data)
  
  @log = Bio::Log::LoggerPlus['bio-hmmer3report']
end

Public Instance Methods

hits() click to toggle source

Return an array of HMMER3::Hit objects from this report

# File lib/bio/appl/hmmer/hmmer3/default_report.rb, line 109
def hits
  return [] unless @report_chunks['alignment'].match(/^>>/)
  # For each hit sequence (hits)
  sequence_annotations = @report_chunks['alignment'].split(">>")
  # puts "Found #{sequence_annotations.length} different hits e.g. #{sequence_annotations[0]}\n\n and #{sequence_annotations[1]} and \n\n #{sequence_annotations[sequence_annotations.length-1]}"
  
  alignments = []
  sequence_annotations.each_with_index do |seq_annot, i|
    #Ignore the first split as it is rubbish leftover from the split above
    next if i==0
    
    # Now split on \n\n. Each of these splits should have 1 or more domains associated
    
    #First of this split will be "stanzas" like this
    #>> 637984252  Acid345_2236 D-isomer specific 2-hydroxyacid dehydrogenase, NAD-binding [Korebacter versatilis Ellin345]
    #   #    score  bias  c-Evalue  i-Evalue hmmfrom  hmm to    alifrom  ali to    envfrom  env to     acc
    # ---   ------ ----- --------- --------- ------- -------    ------- -------    ------- -------    ----
    #   1 !  120.2   0.5   7.6e-39   1.2e-35       1     133 []       3     313 ..       3     313 .. 0.98
    
    #And AFTER the first split comes "stanzas" like this
    #  Alignments for each domain:
    #  == domain 1    score: 120.2 bits;  conditional E-value: 7.6e-39
    #                 EEECST.-CCHHHHHCC..TEEEEEEC.GSSHHHHHC....-SEEEE-TTS-BSHHHHCC-TT--EEEES----TTB-HHHHHH---EEE--TTTTHHH CS
    #                 xxxxxxxxxxxxxxxxx..xxxxxxxx.xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx RF
    #  2-Hacid_dh   1 vlileplreeelellke..gvevevkd.ellteellekakdadalivrsntkvtaevleklpkLkviatagvGvDniDldaakerGIlVtnvpgystes 96
    #                 +++ e++ +++++l+k+    +v++ d   ++e+lle++k+adalivrs   v+a++le++ +L+vi++agvGvDni l+aa+++GI+V+n+pg+++ +
    #   637982115   3 IVVAEKIAKAAIDLFKQdpTWNVVTPDqVAQKEQLLEQLKGADALIVRSAVFVDAAMLEHADQLRVIGRAGVGVDNIELEAATRKGIAVMNTPGANAIA 101
    #                 6889999**********8544777777778888****************************************************************** PP
    stanzas = seq_annot.split("\n\n")
    
    hit = Hit.new
    hsps = []
    
    # Parse the first
    lines = stanzas[0].split("\n")
    sequence_name = lines[0].gsub(/^ /,'')
    hit.sequence_name = sequence_name
    
    @log.debug "Now parsing table for #{sequence_name}"
    return [] if lines[1].match(/^   \[No individual domains that/)

    lines[3..(lines.length-1)].each do |line|
      annotation = Hit::Hsp.new
      
      splits = line.split(/\s+/)
      i = 1
      annotation.number = splits[i].to_i; i+=2
      annotation.score = splits[i].to_f; i+=1
      annotation.bias = splits[i].to_f; i+=1
      annotation.c_evalue = splits[i].to_f; i+=1
      annotation.i_evalue = splits[i].to_f; i+=1
      annotation.hmmfrom = splits[i].to_i; i+=1
      annotation.hmm_to = splits[i].to_i; i+=2
      annotation.alifrom = splits[i].to_i; i+=1
      annotation.ali_to = splits[i].to_i; i+=2
      annotation.envfrom = splits[i].to_i; i+=1
      annotation.env_to = splits[i].to_i; i+=2
      annotation.acc = splits[i].to_f
      hsps.push annotation
    end
    
    # Parse the second stanza and beyond
    current_hsp = nil
    (1..(stanzas.length-1)).each_with_index do |aln, index|
      next if stanzas[aln] == "\n"
      stanza = stanzas[aln].split("\n")
      line_offset = 0
      line_offset += 1 if index==0 #to account for the "Alignments for each domain:" line
      # Is this a new HSP being described?
      if matches = stanza[line_offset].match(/^  == domain (\d+)/)
        domain_index = matches[1].to_i-1
        current_hsp = hsps[domain_index]
        line_offset += 1 # to account for the "== domain 1    score: 26.8 bits;  conditional E-value: 5.7e-10" line
      end
      
      # Detect CS and RF lines
      line_offset += 1 if stanza[line_offset].split(/\s+/)[2] == 'CS'
      line_offset += 1 if stanza[line_offset].split(/\s+/)[2] == 'RF'
      
      # Add the lines to the relevant places
      current_hsp.hmmseq ||= ''
      current_hsp.hmmseq += stanza[line_offset].split(/\s+/)[3]
      current_hsp.flatseq ||= ''
      current_hsp.flatseq += stanza[2+line_offset].split(/\s+/)[3]
    end
    
    hit.hsps = hsps
    alignments.push hit
    
    @log.debug "Parsed alignments for sequence #{hit.sequence_name}" if @log.debug?
  end
  
  return alignments
end
query() click to toggle source
# File lib/bio/appl/hmmer/hmmer3/default_report.rb, line 83
def query
  return @query unless @query.nil?
  parse_query
  return @query
end
query_accession() click to toggle source
# File lib/bio/appl/hmmer/hmmer3/default_report.rb, line 89
def query_accession
  return @query_accession unless @query_accession.nil?
  parse_query
  return @query_accession
end
query_description() click to toggle source
# File lib/bio/appl/hmmer/hmmer3/default_report.rb, line 95
def query_description
  return @query_description unless @query_description.nil?
  parse_query
  return @query_description
end

Private Instance Methods

get_subdata(data) click to toggle source

Return the report split up into chunks, so that those chunks can be further processed. Chunks are returned as a hash

# File lib/bio/appl/hmmer/hmmer3/default_report.rb, line 39
def get_subdata(data)
  subdata = {}
  header_prefix = '\Ahmmsearch :: search' ## # hmmsearch :: search profile(s) against a sequence database
  query_prefix  = '^Query:' ## Query:       2-Hacid_dh  [M=133]
  hit_prefix    = '^Scores for complete sequences' ## Scores for complete sequences (score includes all domains):
  aln_prefix    = '^Domain annotation for each sequence' ## Domain annotation for each sequence (and alignments):
  stat_prefix   = '^\nInternal pipeline statistics summary:' ## Internal pipeline statistics summary:
  
  # if header exists, get it. Header only occurs in the first report
  if data =~ /#{header_prefix}/
    subdata["header"] = data[/(\A.+?)(?=#{query_prefix})/m]
  end
  
  # split rest of report into sub-sections
  subdata["query"] =      data[/(#{query_prefix}.+?)(?=#{hit_prefix})/m]
  subdata["hit"] =        data[/(#{hit_prefix}.+?)(?=#{aln_prefix})/m]
  subdata["alignment"] =  data[/(#{aln_prefix}.+?)(?=#{stat_prefix})/m]
  subdata["statistics"] = data[/(#{stat_prefix}.+)\z/m]

  return subdata
end
parse_query() click to toggle source

Parse the Query:, Accession: and :Description parts of the

# File lib/bio/appl/hmmer/hmmer3/default_report.rb, line 63
def parse_query
  @report_chunks['query'].each_line do |line|
    splits = line.split(':')
    raise "Unexpected form of query header found in hmmsearch query chunk #{line.inspect}" unless splits.length>1
    key = splits[0]
    value = splits[1..(splits.length-1)].join(':').strip #in case there is colons in the value itself
    
    if key == 'Query'
      @query = value
    elsif key == 'Accession'
      @query_accession = value
    elsif key == 'Description'
      @query_description = value
    else
      raise "Unexpected form of query header found in hmmsearch query chunk #{line.inspect}"
    end
  end
end