class ParseFasta::SeqFile

Attributes

type[RW]

@!attribute type

@return [Symbol] the type of the SeqFile (:fasta or :fastq)

Public Class Methods

new(fname, args = {}) click to toggle source

@param fname [String] the name of the fastA or fastQ file to

parse

@param type [Symbol] whether the file is :fasta or :fastq @param check_fasta_seq [Bool] keyword arg for whether to check

for '>' in the sequence of fastA files.

@raise [ParseFasta::Error::FileNotFoundError] if the file is not

found

@raise [ParseFasta::Error::DataFormatError] if the file doesn't

start with a '>' or a '@'
# File lib/parse_fasta/seq_file.rb, line 19
def initialize fname, args = {}
  type = check_file fname

  @check_fasta_seq = args.fetch :check_fasta_seq, true
  @fname = fname
  @type = type
end
open(fname, args = {}) click to toggle source

An alias for SeqFile.new

@return [SeqFile] a SeqFile object

# File lib/parse_fasta/seq_file.rb, line 30
def self.open fname, args = {}
  self.new fname, args
end

Public Instance Methods

each_record(&b) click to toggle source

Analagous to IO#each_line, SeqFile#each_record is used to go through a fastA or fastQ file record by record. It will accept gzipped files as well.

If the input is a fastA file, then the record that is yielded will have the desc and qual instance variables be nil. If it is a fastQ record then those instance variables will not be nil.

@example Parsing a fastA file

ParseFasta::SeqFile.open("seqs.fa").each_record do |rec|
  puts [rec.header, rec.seq].join "\t"

  rec.desc.nil? #=> true
  rec.qual.nil? #=> true
end

@example Parsing a gzipped fastQ file

ParseFasta::SeqFile.open("seqs.fq.gz").each_record do |rec|
  puts [rec.header, rec.seq, rec.desc, rec.qual].join "\t"
end

@yieldparam record [ParseFasta::Record] A Record object with all

the info of the record

@raise [ParseFasta::Error::SequenceFormatError] if a fastA file

contains a record with a '>' character in the header, and the
SeqFile object was not initialized with check_fasta_seq: false
# File lib/parse_fasta/seq_file.rb, line 60
def each_record &b
  line_parser = "parse_#{@type}_lines"

  if gzipped? @fname
    each_record_gzipped line_parser, &b
  else
    each_record_non_gzipped line_parser, &b
  end
end

Private Instance Methods

check_file(fname) click to toggle source
# File lib/parse_fasta/seq_file.rb, line 236
def check_file fname
  first_char = get_first_char fname

  if first_char == ">"
    :fasta
  elsif first_char == "@"
    :fastq
  else
    raise ParseFasta::Error::DataFormatError,
          "The file does not look like fastA or fastQ " +
              "-- #{fname}"
  end
end
each_record_gzipped(line_parser, &b) click to toggle source
# File lib/parse_fasta/seq_file.rb, line 79
def each_record_gzipped line_parser, &b
  File.open(@fname, "rt") do |file|
    loop do
      begin
        gz_reader = Zlib::GzipReader.new file

        self.send line_parser, gz_reader, &b

        # check if there are any more blobs to read
        if (unused = gz_reader.unused)
          # rewind to the start of the last blob
          file.seek -unused.length, IO::SEEK_END
        else
          # there are no more blobs to read
          break
        end
      end
    end
  end
end
each_record_non_gzipped(line_parser, &b) click to toggle source
# File lib/parse_fasta/seq_file.rb, line 73
def each_record_non_gzipped line_parser, &b
  File.open(@fname, "rt") do |f|
    self.send line_parser, f, &b
  end
end
get_first_char(fname) click to toggle source

Get the first char of the file whether it is gzip'd or not. No need to rewind the stream afterwards.

# File lib/parse_fasta/seq_file.rb, line 214
def get_first_char fname
  if File.exists? fname
    begin
      f = Zlib::GzipReader.open fname
    rescue Zlib::GzipFile::Error
      f = File.open fname
    end


    begin
      first_char = f.each.peek[0]

      return first_char
    ensure
      f.close
    end
  else
    raise ParseFasta::Error::FileNotFoundError,
          "No such file or directory -- #{fname}"
  end
end
gzipped?(fname) click to toggle source
# File lib/parse_fasta/seq_file.rb, line 177
def gzipped? fname
  begin
    f = Zlib::GzipReader.open fname
    return true
  rescue Zlib::GzipFile::Error => e
    return false
  ensure
    f.close if f
  end
end
parse_fasta_line(line, header, sequence) { |record(header: strip, seq: sequence, check_fasta_seq: check_fasta_seq)| ... } click to toggle source
# File lib/parse_fasta/seq_file.rb, line 100
def parse_fasta_line line, header, sequence, &b
  line.chomp!
  len = line.length

  if header.empty? && line.start_with?(">")
    header = line[1, len] # drop the '>'
  elsif line.start_with? ">"
    yield Record.new(header: header.strip,
                     seq: sequence,
                     check_fasta_seq: @check_fasta_seq)

    header = line[1, len]
    sequence = ""
  else
    sequence << line
  end

  [header, sequence]
end
parse_fasta_lines(file_reader) { |record(header: strip, seq: sequence, check_fasta_seq: check_fasta_seq)| ... } click to toggle source
# File lib/parse_fasta/seq_file.rb, line 148
def parse_fasta_lines file_reader, &b
  header = ""
  sequence = ""

  line_reader = which_line_reader file_reader
  file_reader.send(*line_reader) do |line|
    header, sequence = parse_fasta_line line, header, sequence, &b
  end

  # yield the final seq
  yield Record.new(header: header.strip,
                   seq: sequence,
                   check_fasta_seq: @check_fasta_seq)
end
parse_fastq_line(line, header, seq, desc, qual, count) { |record(header: header, seq: seq, desc: desc, qual: qual)| ... } click to toggle source
# File lib/parse_fasta/seq_file.rb, line 120
def parse_fastq_line line, header, seq, desc, qual, count, &b
  line.chomp!

  case count
    when 0
      header = line[1..-1]
    when 1
      seq = line
    when 2
      desc = line[1..-1]
    when 3
      count = -1
      qual = line

      yield Record.new(header: header,
                       seq:    seq,
                       desc:   desc,
                       qual:   qual)
    else
      raise ParseFasta::Error::ParseFastaError,
            "Something went wrong in parse_fastq_line"
  end

  count += 1

  [header, seq, desc, qual, count]
end
parse_fastq_lines(file_reader, &b) click to toggle source
# File lib/parse_fasta/seq_file.rb, line 163
def parse_fastq_lines file_reader, &b
  count  = 0
  header = ""
  seq    = ""
  desc   = ""
  qual   = ""

  line_reader = which_line_reader file_reader
  file_reader.send(*line_reader) do |line|
    header, seq, desc, qual, count =
        parse_fastq_line line, header, seq, desc, qual, count, &b
  end
end
which_line_reader(file_reader) click to toggle source

The Zlib::GzipReader can't handle files where the line separator is r. This could all be avoided by using IO.popen(“gzip -cd #{fname}”, “rt”), but will gzip always be available?

# File lib/parse_fasta/seq_file.rb, line 191
def which_line_reader file_reader
  line_reader = [:each_line]
  # a valid fasta file must have at least two lines, the header
  # and the sequence
  begin
    enum = file_reader.each_line
    # if this was ruby v2.3, then we could just call .size on enum
    2.times do
      enum.next
    end
  rescue StopIteration
    # Zlib::GzipReader can handle \n and \r\n, but not \r, so if
    # we get here, the file has \r only for line endings
    line_reader = [:each, "\r"]
  ensure
    file_reader.rewind
  end

  line_reader
end