class FastaReader
The class methods below behave like a stateless module/trait: they keep no instance state.
Public Class Methods
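getbuf_func is a callable that returns the next buffer of FASTA text each time it is called, and nil when the input is exhausted. Every time a record is parsed it is yielded as id, descr, seq.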
# File lib/bigbio/db/fasta/fastareader.rb, line 141
# Stateless FASTA parser that works on caller-supplied text buffers.
#
# getbuf_func is any object responding to +call+. It is invoked repeatedly
# and must return the next buffer of FASTA text, or nil/false once the input
# is exhausted. Every buffer is split on newlines and each piece is handled
# as one complete line.
#
# Yields id, descr, seq for each record: descr is the header line without
# the leading '>' (stripped), id is the first whitespace-delimited word of
# descr, and seq is the concatenation of the stripped sequence lines.
# A record is yielded when the next header is seen; the final record is
# yielded only when its sequence is non-empty.
#
# Raises RuntimeError when a header carries no id (nothing after the '>').
def self.emit(getbuf_func)
  seq = "".dup
  id = nil
  descr = nil
  while (buf = getbuf_func.call)
    buf.split(/\n/).each do |line|
      if line.start_with?('>')
        yield id, descr, seq if descr
        descr = line[1..-1].strip
        matched = /^(\S+)/.match(descr)
        # An empty header previously surfaced as NoMethodError on nil.
        raise "Can not digest '#{line}' using '^(\\S+)'" unless matched
        id = matched[0]
        # Fresh buffer: the string yielded above must not be mutated.
        seq = "".dup
      else
        # Append in place; += would copy the whole sequence for every line.
        seq << line.strip
      end
    end
  end
  yield id, descr, seq if descr && !seq.empty?
end
# File lib/bigbio/db/fasta/fastareader.rb, line 161
# Same as emit, but every parsed record is wrapped in a FastaRecord before
# it is handed to the caller's block.
#
# getbuf_func is passed through to emit unchanged.
def self.emit_fastarecord(getbuf_func)
  emit(getbuf_func) { |id, descr, seq| yield FastaRecord.new(id, descr, seq) }
end
Initialize the reader of FASTA file fn. Options can be :regex and :index (true/false)
# File lib/bigbio/db/fasta/fastareader.rb, line 12
# Opens the FASTA file fn for reading and prepares the reader state.
#
# opts[:regex] is the pattern used to pull the id out of a header line;
#              it defaults to '^(\S+)' when absent (nil).
# opts[:index] is handed to indexer_use (defined outside this view).
#
# The file handle stays open until close is called.
def initialize(fn, opts = {})
  @f = File.open(fn)
  pattern = opts[:regex]
  @regex = pattern.nil? ? '^(\S+)' : pattern
  @fread_once = false
  indexer_use(opts[:index])
end
Public Instance Methods
# File lib/bigbio/db/fasta/fastareader.rb, line 117
# Closes the underlying file handle opened by the constructor.
def close
  @f.close
end
# File lib/bigbio/db/fasta/fastareader.rb, line 97
# Splits a FASTA header line into its id and description.
#
# tag is the raw header line including the leading '>'. The description is
# everything after the '>' (stripped); the id is capture group 1 of @regex
# applied to that description.
#
# Returns the pair id, descr.
# Raises RuntimeError when tag is not a header or @regex does not match.
def digest_tag(tag)
  marker = /^>/.match(tag)
  if marker
    descr = marker.post_match.strip
    found = /#{@regex}/.match(descr)
    return found[1], descr if found

    p descr # do not remove these
    p @regex
  end
  # Interpolate rather than concatenate: @regex may be a Regexp, and
  # String#+ would then raise TypeError and hide the real problem.
  raise "Can not digest '#{tag}' using '#{@regex}'"
end
Returns a FastaRecord for every item (invokes parse_each).
# File lib/bigbio/db/fasta/fastareader.rb, line 55
# Iterates over the whole file via parse_each and yields one FastaRecord
# per record.
#
# Without a block an Enumerator over the same records is returned, so the
# reader can be chained (e.g. reader.each.map { ... }).
def each
  return enum_for(:each) unless block_given?

  parse_each { |id, descr, seq| yield FastaRecord.new(id, descr, seq) }
end
# File lib/bigbio/db/fasta/fastareader.rb, line 59
# Returns the first record of the file as a FastaRecord.
#
# Parsing starts through parse_each and is abandoned as soon as the first
# record has been produced.
def first
  parse_each do |id, descr, seq|
    return FastaRecord.new(id, descr, seq)
  end
end
Return a record by its id, or nil when not found.
# File lib/bigbio/db/fasta/fastareader.rb, line 66
# Looks a record up by id.
#
# indexed? is called first, which forces a full parse when an indexer is
# in use and the file has not been read yet. The file position obtained
# from indexer_get is then read with get_rec.
#
# Returns the record, or nil when indexer_get has no position for id.
def get(id)
  indexed?
  fpos = indexer_get(id)
  return nil unless fpos

  get_rec(fpos)
end
# File lib/bigbio/db/fasta/fastareader.rb, line 88
# Looks a record up by its position in the index.
#
# indexed? is called first, which forces a full parse when an indexer is
# in use and the file has not been read yet. Element 1 of the value
# returned by indexer_get_by_index(idx) is used as the file position.
# NOTE(review): indexer_get_by_index is defined outside this view;
# presumably it returns an [id, fpos] pair, or nil for an unknown index.
# Confirm against the indexer.
#
# Returns the record, or nil when no file position is available.
def get_by_index(idx)
  indexed?
  entry = indexer_get_by_index(idx)
  # Guard against a nil entry so an unknown index yields nil, not NoMethodError.
  fpos = entry && entry[1]
  fpos ? get_rec(fpos) : nil
end
# File lib/bigbio/db/fasta/fastareader.rb, line 75
# Reads the single record whose header line starts at file offset fpos.
#
# The header is read first; following lines are stripped and concatenated
# until the next header line or the end of the file. The header is then
# split into id and description with digest_tag.
#
# Returns a FastaRecord. The sequence is empty when the header has no
# sequence lines. Raises whatever digest_tag raises for a bad header.
def get_rec(fpos)
  @f.seek fpos
  tag = @f.gets
  seq = "".dup
  # gets returns nil at EOF, which also covers a header on the last line.
  while (line = @f.gets) && !line.start_with?('>')
    # Append in place; += would copy the whole sequence for every line.
    seq << line.strip
  end
  id, descr = digest_tag(tag)
  FastaRecord.new(id, descr, seq)
end
Parse the FASTA file and yield id, descr, sequence. When the indexer is on, the records are indexed the first time the file is parsed. Note that, with indexing, if you do not complete parsing there will be an error the second time. This is a trade-off; otherwise one would always have to index the file and read it twice.
# File lib/bigbio/db/fasta/fastareader.rb, line 25
# Parses the file from the start and yields id, descr, seq per record.
#
# Each header is split with digest_tag; the following lines are stripped
# and concatenated into seq until the next header or the end of the file.
# Every header produces exactly one record, including one with no sequence
# lines (seq is then empty), even when it is the last line of the file.
#
# @count is reset and incremented per record. While @indexer is set and the
# file has not been read completely before, every record's id and header
# offset are passed to indexer_set. @fread_once is only set once the whole
# file has been consumed, so leaving the block early keeps it unset.
#
# Raises whatever digest_tag raises when the first line is not a valid
# header; this includes an empty file.
def parse_each
  @f.seek 0 # force file rewind
  @rec_fpos = 0
  @rec_line = @f.gets
  @count = 0
  more = true
  while more
    # digest id from record description
    id, descr = digest_tag(@rec_line)
    id_fpos = @rec_fpos
    # parse the sequence; next_fpos tracks the offset of the line just read
    # so it ends up pointing at the next header
    seq = "".dup
    next_fpos = @f.tell
    while (line = @f.gets) && !line.start_with?('>')
      # Append in place; += would copy the whole sequence for every line.
      seq << line.strip
      next_fpos = @f.tell
    end
    # new record
    @count += 1
    @rec_fpos = next_fpos
    @rec_line = line
    indexer_set(id, id_fpos) if @indexer && !@fread_once
    yield id, descr, seq
    # line is nil only at EOF; a pending header must still be processed.
    more = !line.nil?
  end
  @fread_once = true
end
Returns the size of the dataset - as read. After the final record the size represents the number of items in the FASTA file
# File lib/bigbio/db/fasta/fastareader.rb, line 113
# Number of records counted by the most recent parse (the value of @count).
# It is nil before any parse has started.
def size
  @count
end
Private Instance Methods
# File lib/bigbio/db/fasta/fastareader.rb, line 123
# Makes sure the index has been built before it is queried.
#
# When an indexer is in use and the file has not been read completely yet,
# the whole file is parsed once (discarding the records) so that parse_each
# fills the index. Always returns true.
def indexed?
  # force indexer
  parse_each { |_id, _descr, _seq| nil } if @indexer && !@fread_once
  true
end