class FastaReader

The following is actually a module/trait implementation without state

Public Class Methods

emit(getbuf_func) { |id, descr, seq| ... } click to toggle source

getbuf_func is called repeatedly; each call returns the next buffer of FASTA text, or nil when the input is exhausted. Every time a record is parsed it is yielded.

# File lib/bigbio/db/fasta/fastareader.rb, line 141
# Stream-parse FASTA text and yield one record at a time.
#
# getbuf_func is any object responding to +call+. It is invoked repeatedly;
# every truthy return value is treated as a chunk of FASTA text holding one
# or more lines, and the first nil/false ends the input. Each chunk is split
# on "\n" on its own, so chunks must end on line boundaries: a line cut in
# two by a chunk boundary is parsed as two separate lines.
#
# Yields id, descr, seq for each record:
# * descr - the header line without the leading '>', stripped
# * id    - the first whitespace-delimited word of descr
# * seq   - the stripped sequence lines joined together
#
# A record followed by another header is yielded even when its sequence is
# empty; the record still pending at the end of the input is only yielded
# when its sequence is non-empty.
#
# Raises RuntimeError when a header line carries no identifier.
def FastaReader::emit getbuf_func
  # dup gives a buffer that can be appended to in place (avoids re-copying
  # the whole sequence for every line, as += would)
  seq = "".dup
  id = nil
  descr = nil
  while buf = getbuf_func.call
    buf.split(/\n/).each do | line |
      if line =~ /^>/
        # a new header closes the record collected so far
        yield id, descr, seq if descr
        descr = line[1..-1].strip
        matched = /^(\S+)/.match(descr)
        raise "Can not digest '#{line}': FASTA header without identifier" unless matched
        id = matched[0]
        seq = "".dup
      else
        seq << line.strip
      end
    end
  end
  yield id, descr, seq if descr and seq.size > 0
end
emit_fastarecord(getbuf_func) { |fasta_record| ... } click to toggle source
# File lib/bigbio/db/fasta/fastareader.rb, line 161
# Same as emit, but every parsed record is wrapped in a FastaRecord before
# it is handed to the caller's block.
def FastaReader::emit_fastarecord getbuf_func
  emit(getbuf_func) { |id, descr, seq| yield FastaRecord.new(id, descr, seq) }
end
new(fn, opts = {}) click to toggle source

Initialize the reader of FASTA file fn. Options can be :regex and :index (true/false).

# File lib/bigbio/db/fasta/fastareader.rb, line 12
# Open the FASTA file +fn+ and prepare the reader.
#
# opts[:regex] - pattern used by digest_tag to pull the id out of a header;
#                falls back to '^(\S+)' when not given (nil)
# opts[:index] - passed straight on to indexer_use
def initialize fn, opts = {}
  @f = File.open(fn)
  @fread_once = false   # becomes true once parse_each has completed a full pass
  pattern = opts[:regex]
  @regex = pattern.nil? ? '^(\S+)' : pattern
  indexer_use opts[:index]
end

Public Instance Methods

close() click to toggle source
# File lib/bigbio/db/fasta/fastareader.rb, line 117
# Close the file handle that was opened in the constructor.
def close
  @f.close
end
digest_tag(tag) click to toggle source
# File lib/bigbio/db/fasta/fastareader.rb, line 97
# Split a FASTA header line into its id and description.
#
# tag must start with '>'. The description is everything after the '>',
# stripped; the id is the first capture group of @regex matched against the
# description.
#
# Returns the pair id, descr.
# Raises RuntimeError when tag is not a header line or @regex does not match
# (in the latter case the description and the pattern are also printed).
def digest_tag tag
  if tag =~ /^>/
    descr = Regexp.last_match.post_match.strip
    if descr =~ /#{@regex}/
      return Regexp.last_match(1), descr
    end
    p descr  # do not remove these
    p @regex
  end
  # interpolate rather than concatenate: @regex may be a Regexp, and
  # String + Regexp would raise TypeError instead of this error
  raise "Can not digest '#{tag}' using '#{@regex}'"
end
each() { |fasta_record| ... } click to toggle source

returns a FastaRecord for every item (invokes parse_each)

# File lib/bigbio/db/fasta/fastareader.rb, line 55
# Iterate over the file via parse_each, handing each record to the caller's
# block as a FastaRecord.
def each
  parse_each do |id, descr, seq|
    record = FastaRecord.new(id, descr, seq)
    yield record
  end
end
first() click to toggle source
# File lib/bigbio/db/fasta/fastareader.rb, line 59
# Return the first record of the file as a FastaRecord. Parsing is abandoned
# as soon as parse_each has produced that first record.
def first
  parse_each do |id, descr, seq|
    break FastaRecord.new(id, descr, seq)
  end
end
get(id) click to toggle source

Return a record by its id, nil when not found

# File lib/bigbio/db/fasta/fastareader.rb, line 66
# Look up a record by id. Runs indexed? first (which builds the index when
# that is still pending), then asks the indexer for the file offset of +id+.
# Returns the record read at that offset, or nil when the id is unknown.
def get id
  indexed?
  fpos = indexer_get(id)
  return nil unless fpos
  get_rec(fpos)
end
get_by_index(idx) click to toggle source
# File lib/bigbio/db/fasta/fastareader.rb, line 88
# Fetch a record by its position in the index. Runs indexed? first, then
# takes element [1] of what indexer_get_by_index(idx) returns as the file
# offset (presumably an [id, fpos] pair - confirm against the indexer).
# Returns the record read at that offset, or nil when the offset is nil/false.
def get_by_index idx
  indexed?
  fpos = indexer_get_by_index(idx)[1]
  fpos ? get_rec(fpos) : nil
end
get_rec(fpos) click to toggle source
# File lib/bigbio/db/fasta/fastareader.rb, line 75
# Read the single record whose header line starts at byte offset +fpos+.
#
# Seeks the file to +fpos+, takes the line found there as the header and
# joins the stripped lines that follow, up to the next line starting with
# '>' or the end of the file. Returns a FastaRecord built from the id and
# description that digest_tag extracts from the header. The file position
# is left wherever reading stopped.
def get_rec fpos
  @f.seek fpos
  tag = @f.gets
  # dup gives a buffer that can be appended to in place
  seq = "".dup
  # gets returns nil at EOF, so a header that is the last line of the file
  # simply yields an empty sequence
  while (line = @f.gets)
    break if line =~ /^>/
    seq << line.strip
  end
  id, descr = digest_tag(tag)
  FastaRecord.new(id,descr,seq)
end
parse_each() { |id, descr, seq| ... } click to toggle source

Parse the FASTA file and yield id, descr, sequence. When the indexer is on it will index the records the first time. Note that, with indexing, when you don’t complete parsing there will be an error the second time. This is a trade-off; otherwise one would always have to index the file and read it twice.

# File lib/bigbio/db/fasta/fastareader.rb, line 25
# Parse the whole file from the start and yield id, descr, seq per record.
#
# The file is rewound and @count reset on every call. Each header line is
# split into id and description by digest_tag; the sequence is the stripped
# lines up to the next header or the end of the file. While @indexer is set
# and no complete pass has finished yet (@fread_once false), the file offset
# of every header is stored through indexer_set. @fread_once only becomes
# true when the loop runs to completion, so leaving the block early keeps
# the reader in "not yet indexed" state.
#
# Raises (from digest_tag) when the first line is not a header, which
# includes an empty file. A header on the last line of a file that holds
# earlier records is not yielded, because the loop stops at EOF.
def parse_each
  @f.seek 0    # force file rewind
  @rec_fpos = 0
  @rec_line = @f.gets
  fpos = 0
  @count = 0
  begin
    # digest id from record description
    id, descr = digest_tag(@rec_line)
    id_fpos = @rec_fpos
    # parse the sequence; dup gives a buffer that can be appended to in place
    seq = "".dup
    begin
      fpos = @f.tell   # offset of the line about to be read
      line = @f.gets
      # nil: EOF directly after the header (file is a single header line)
      break if line.nil? || line =~ /^>/
      seq << line.strip
    end while !@f.eof
    # new record: remember the header just read (and where it starts) for
    # the next round
    @count += 1
    @rec_fpos = fpos
    @rec_line = line
    indexer_set(id, id_fpos) if @indexer and not @fread_once
    yield id, descr, seq
  end while !@f.eof
  @fread_once = true
end
size() click to toggle source

Returns the size of the dataset - as read. After the final record the size represents the number of items in the FASTA file

# File lib/bigbio/db/fasta/fastareader.rb, line 113
# Number of records counted so far by the most recent parse_each run
# (@count is reset to 0 at the start of each run; nil before the first run).
def size
  @count
end

Private Instance Methods

indexed?() click to toggle source
# File lib/bigbio/db/fasta/fastareader.rb, line 123
# Make sure the index has been built: when an indexer is in use and the file
# has not been read completely yet, run one full parse_each pass with a
# no-op block. Always returns true.
def indexed?
  index_pending = @indexer && !@fread_once
  parse_each { |_id, _descr, _seq| nil } if index_pending
  true
end