class Cdhit
Constants
- COMMENTS
- NAME
- SEQ_FASTA
Attributes
clusters[RW]
sequence_hash_fasta[RW]
Public Class Methods
new(fasta_file, clust_file)
click to toggle source
# File lib/full_lengther_next/cdhit.rb, line 35
# Indexes the FASTA input and then groups its sequences into clusters.
#
# fasta_file - forwarded to hash_fasta to build @sequence_hash_fasta.
# clust_file - forwarded to cd_hit_clusters to fill @clusters.
def initialize(fasta_file, clust_file)
  # Both instance variables must be in place before cd_hit_clusters runs:
  # it looks sequences up in the index and appends to @clusters.
  @sequence_hash_fasta = hash_fasta(fasta_file)
  @clusters = []
  cd_hit_clusters(clust_file)
end
Public Instance Methods
cd_hit_clusters(clust_file)
click to toggle source
# File lib/full_lengther_next/cdhit.rb, line 111
# Reads a CD-HIT cluster report and appends one array of Seq objects per
# cluster to @clusters.
#
# clust_file - forwarded to Bio::CdHitReport.new.
#
# Returns whatever Bio::CdHitReport#each_cluster returns.
def cd_hit_clusters(clust_file)
  require 'bio-cd-hit-report' # loaded lazily, only when a report is parsed

  report = Bio::CdHitReport.new(clust_file)
  report.each_cluster do |cluster|
    members = cluster.data.map do |member|
      name, master = parse_member(member)
      # hash_fasta keys the index by the first 19 characters of each name, so
      # the lookup key is truncated the same way. Names that are already that
      # short are unaffected; longer names used to miss the index entirely.
      record = @sequence_hash_fasta[name[0..18]]
      Seq.new(record[NAME], record[COMMENTS], record[SEQ_FASTA], master)
    end
    @clusters << members
  end
end
each_cluster() { |cluster| ... }
click to toggle source
# File lib/full_lengther_next/cdhit.rb, line 41
# Iterates over the clusters stored in @clusters, in order.
#
# Yields each cluster to the block and returns @clusters.
# When called without a block, returns an Enumerator over the clusters
# instead of raising LocalJumpError.
def each_cluster
  return enum_for(:each_cluster) unless block_given?

  @clusters.each { |cluster| yield cluster }
end
get_all_master()
click to toggle source
# File lib/full_lengther_next/cdhit.rb, line 92
# Collects the result of get_master for every cluster in @clusters.
#
# Returns an Array with one entry per cluster, in cluster order. An entry is
# nil when get_master finds no master in that cluster.
def get_all_master
  @clusters.map { |cluster| get_master(cluster) }
end
get_master(cluster)
click to toggle source
# File lib/full_lengther_next/cdhit.rb, line 87
# Finds the master sequence of a cluster.
#
# cluster - enumerable of objects responding to #master.
#
# Returns the first element whose #master is truthy, or nil when there is none.
def get_master(cluster)
  # find stops at the first match instead of building an intermediate array.
  cluster.find { |seq| seq.master }
end
get_sp(cluster)
click to toggle source
# File lib/full_lengther_next/cdhit.rb, line 100
# Finds the first sequence of a cluster whose #db equals 'sp'.
#
# cluster - enumerable of objects responding to #db.
#
# Returns that sequence, or nil when the cluster has no such member.
def get_sp(cluster)
  cluster.find { |seq| seq.db == 'sp' }
end
hash_fasta(file)
click to toggle source
# File lib/full_lengther_next/cdhit.rb, line 139
# Builds an index of the FASTA records read through FastaQualFile.
#
# file - forwarded to FastaQualFile.new.
#
# Returns a Hash mapping the first 19 characters of each sequence name to
# [name, comments, seq_fasta]. Records whose names share that prefix overwrite
# each other; the last one read wins.
def hash_fasta(file)
  sequence_hash_fasta = {}
  fqr = FastaQualFile.new(file)
  begin
    fqr.each do |name, seq_fasta, comments|
      # Cd-hit cuts a sequence's name to 20 characters (counting the leading
      # '>'), so 'name[0..18]' is used as the hash key.
      sequence_hash_fasta[name[0..18]] = [name, comments, seq_fasta]
    end
  ensure
    # Release the reader even when iteration raises.
    fqr.close
  end
  sequence_hash_fasta
end
master_fasta(file_name)
click to toggle source
# File lib/full_lengther_next/cdhit.rb, line 47
# Writes the master sequence of every cluster to a FASTA file.
#
# file_name - path of the file to create or overwrite.
#
# Each master is written as a '>name comments' header line followed by its
# sequence on the next line. Returns nil.
def master_fasta(file_name)
  # The block form closes the file even if a cluster raises part-way through.
  File.open(file_name, 'w') do |fasta|
    each_cluster do |cluster|
      master = get_master(cluster)
      fasta.print ">#{master.name} #{master.comments}\n#{master.seq_fasta}\n"
    end
  end
  nil
end
master_to_sp_seq()
click to toggle source
# File lib/full_lengther_next/cdhit.rb, line 56
# Promotes an 'sp' sequence to master in every cluster whose current master
# does not have #db == 'sp'.
#
# In such a cluster, if get_sp finds a sequence, every member's master flag is
# cleared and that sequence alone is flagged as master. Clusters without an
# 'sp' member are left untouched.
#
# Returns whatever each_cluster returns.
def master_to_sp_seq
  each_cluster do |cluster|
    next if get_master(cluster).db == 'sp'

    sp_seq = get_sp(cluster)
    next if sp_seq.nil?

    # Literal true/false replace the TRUE/FALSE constants, which are deprecated
    # and no longer defined on current Ruby versions.
    cluster.each { |seq| seq.master = false }
    sp_seq.master = true
  end
end
parse_member(member)
click to toggle source
# File lib/full_lengther_next/cdhit.rb, line 126
# Extracts the sequence name and master flag from one cluster member line.
#
# member - String holding a comma-separated member line. The text after the
#          first comma is expected to be the sequence name, optionally
#          followed by whitespace and a trailing marker.
#
# Every '...' and '>' is stripped before parsing. The argument itself is not
# modified, so frozen strings are accepted.
#
# Returns [name, master], where master is true only when the marker after the
# name is exactly '*'.
def parse_member(member)
  cleaned = member.gsub('...', '').delete('>')
  fields = cleaned.split(',')
  name, marker = fields[1].split(' ', 2)
  [name, marker == '*']
end
recover_different_lengths(percentage)
click to toggle source
# File lib/full_lengther_next/cdhit.rb, line 70
# Collects the non-master sequences whose length ratios against their
# cluster's master are both below a threshold.
#
# percentage - Numeric threshold. A sequence is kept when both
#              (sequence length / master length) * 100 and
#              (master length / sequence length) * 100 are below it.
#
# Members whose name equals the master's name are skipped.
#
# Returns an Array of the selected sequences, in cluster order.
def recover_different_lengths(percentage)
  seqs = []
  each_cluster do |cluster|
    master = get_master(cluster)
    cluster.each do |seq|
      next if seq.name == master.name

      seq_len = seq.seq_fasta.length
      master_len = master.seq_fasta.length
      # Float division: integer division truncated each ratio to a whole
      # number before scaling, so the values were only ever 0, 100, 200, ...
      # A zero length now yields Infinity/NaN, which never passes the
      # comparison, instead of raising ZeroDivisionError.
      seq_mas_len = seq_len.fdiv(master_len) * 100
      mas_seq_len = master_len.fdiv(seq_len) * 100
      seqs << seq if mas_seq_len < percentage && seq_mas_len < percentage
    end
  end
  seqs
end