class Cdhit

Constants

COMMENTS
NAME
SEQ_FASTA

Attributes

clusters[RW]
sequence_hash_fasta[RW]

Public Class Methods

new(fasta_file, clust_file) click to toggle source
# File lib/full_lengther_next/cdhit.rb, line 35
# Builds the cluster collection from a FASTA file and a CD-HIT cluster report.
#
# The FASTA index is stored in @sequence_hash_fasta before the report is read,
# because cd_hit_clusters looks every cluster member up in that hash.
#
# fasta_file:: handed unchanged to hash_fasta
# clust_file:: handed unchanged to cd_hit_clusters
def initialize(fasta_file, clust_file)
  @sequence_hash_fasta = hash_fasta(fasta_file)
  @clusters = []
  cd_hit_clusters(clust_file)
end

Public Instance Methods

cd_hit_clusters(clust_file) click to toggle source
# File lib/full_lengther_next/cdhit.rb, line 111
# Reads the CD-HIT report +clust_file+ and appends one array of Seq objects
# to @clusters for every cluster found in it.
#
# Every member entry is decoded with parse_member; the decoded name is the
# key used to fetch the record from @sequence_hash_fasta, and the record's
# NAME, COMMENTS and SEQ_FASTA slots plus the master flag are passed to
# Seq.new.
#
# A member whose name is missing from @sequence_hash_fasta makes the lookup
# return nil, so indexing it raises NoMethodError.
def cd_hit_clusters(clust_file)
  require 'bio-cd-hit-report' # loaded only when a report is actually parsed
  Bio::CdHitReport.new(clust_file).each_cluster do |cluster|
    members = []
    cluster.data.each do |member|
      key, is_master = parse_member(member)
      entry = @sequence_hash_fasta[key]
      members.push(Seq.new(entry[NAME], entry[COMMENTS], entry[SEQ_FASTA], is_master))
    end
    @clusters.push(members)
  end
end
each_cluster() { |cluster| ... } click to toggle source
# File lib/full_lengther_next/cdhit.rb, line 41
# Yields each cluster stored in @clusters, in insertion order.
#
# With a block: returns @clusters once iteration has finished.
# Without a block: returns an Enumerator over the same clusters, so callers
# can chain Enumerable methods (each_cluster.map, each_cluster.to_a, ...)
# instead of hitting a LocalJumpError on the first yield.
def each_cluster
  return to_enum(:each_cluster) unless block_given?

  @clusters.each { |cluster| yield cluster }
end
get_all_master() click to toggle source
# File lib/full_lengther_next/cdhit.rb, line 92
# Collects the master sequence of every cluster.
#
# Returns an array with one entry per cluster, in cluster order; the entry is
# whatever get_master returns for that cluster (nil when it finds no master).
def get_all_master
  [].tap do |masters|
    each_cluster { |cluster| masters.push(get_master(cluster)) }
  end
end
get_master(cluster) click to toggle source
# File lib/full_lengther_next/cdhit.rb, line 87
# Returns the first sequence in +cluster+ whose +master+ attribute is truthy,
# or nil when no member is flagged as master.
def get_master(cluster)
  cluster.find { |candidate| candidate.master }
end
get_sp(cluster) click to toggle source
# File lib/full_lengther_next/cdhit.rb, line 100
# Returns the first sequence in +cluster+ whose +db+ attribute equals 'sp',
# or nil when the cluster contains no such sequence.
def get_sp(cluster)
  cluster.find { |candidate| candidate.db == 'sp' }
end
hash_fasta(file) click to toggle source
# File lib/full_lengther_next/cdhit.rb, line 139
# Indexes the records of a FASTA file by (truncated) sequence name.
#
# file:: passed unchanged to FastaQualFile.new
#
# Returns a Hash whose keys are the first 19 characters of each sequence name
# and whose values are [name, comments, seq_fasta] arrays.
# NOTE(review): the slot order presumably matches the NAME, COMMENTS and
# SEQ_FASTA constants used by cd_hit_clusters - confirm against their values.
#
# Two names sharing the same 19-character prefix collide: the later record
# replaces the earlier one.
#
# The reader is closed in an +ensure+ clause so it is released even when
# iterating the file raises.
def hash_fasta(file)
  sequence_hash_fasta = {}
  fqr = FastaQualFile.new(file)
  begin
    fqr.each do |name, seq_fasta, comments|
      # Cd-hit cuts a sequence's name to 20 characters (counting the leading
      # '>'), so 'name[0..18]' is used as the hash key.
      sequence_hash_fasta[name[0..18]] = [name, comments, seq_fasta]
    end
  ensure
    fqr.close
  end
  sequence_hash_fasta
end
master_fasta(file_name) click to toggle source
# File lib/full_lengther_next/cdhit.rb, line 47
# Writes the master sequence of every cluster to +file_name+ in FASTA format.
#
# Each record is written as ">name comments\nsequence\n". The file is opened
# in 'w' mode, so an existing file is truncated.
#
# The block form of File.open guarantees the handle is flushed and closed even
# if a cluster has no master (get_master returns nil and the attribute access
# raises) or a field is nil (String#+ raises TypeError).
#
# Returns nil.
def master_fasta(file_name)
  File.open(file_name, 'w') do |fasta|
    each_cluster do |cluster|
      master = get_master(cluster)
      fasta.print '>' + master.name + ' ' + master.comments + "\n" + master.seq_fasta + "\n"
    end
  end
  nil
end
master_to_sp_seq() click to toggle source
# File lib/full_lengther_next/cdhit.rb, line 56
# Promotes a 'sp' sequence to master in every cluster whose current master
# comes from another database.
#
# For each cluster: if the master's +db+ is not 'sp' and get_sp finds a member
# whose +db+ is 'sp', every member's +master+ flag is cleared and the 'sp'
# member is flagged as master. Clusters whose master is already 'sp', or that
# contain no 'sp' member, are left untouched.
#
# Uses the +true+/+false+ literals: the TRUE/FALSE constants no longer exist
# in Ruby 3.2 and later.
#
# Returns whatever each_cluster returns.
def master_to_sp_seq
  each_cluster do |cluster|
    next if get_master(cluster).db == 'sp'

    sp_seq = get_sp(cluster)
    next if sp_seq.nil?

    cluster.each { |seq| seq.master = false }
    sp_seq.master = true
  end
end
parse_member(member) click to toggle source
# File lib/full_lengther_next/cdhit.rb, line 126
# Extracts the sequence name and master flag from one cluster member line.
#
# member:: a String with a comma between the leading "index/length" field and
#          the sequence descriptor, e.g. "0\t2799aa, >name... *"
#          (NOTE(review): format inferred from the parsing steps below -
#          confirm against what the report parser yields).
#
# Every '...' and '>' is removed, the text after the first comma is taken as
# the descriptor, and the descriptor is split at its first run of whitespace
# into the name and the remainder. The member is the master when that
# remainder is exactly '*'.
#
# The argument is not modified, so frozen strings are accepted. Only the first
# comma separates the fields, so a name containing commas is kept whole.
# Uses +true+/+false+ literals (the TRUE/FALSE constants no longer exist in
# Ruby 3.2 and later).
#
# Returns [name, master] where master is true or false.
def parse_member(member)
  cleaned = member.gsub('...', '').gsub('>', '')
  descriptor = cleaned.split(',', 2)[1]
  name, marker = descriptor.split(' ', 2)
  [name, marker == '*']
end
recover_different_lengths(percentage) click to toggle source
# File lib/full_lengther_next/cdhit.rb, line 70
# Collects the non-master sequences whose length ratio to their cluster's
# master stays below +percentage+ in both directions.
#
# For every member whose name differs from the master's name two values are
# computed:
#   seq_mas_len = member length as a percentage of the master length
#   mas_seq_len = master length as a percentage of the member length
# and the member is kept when both are below +percentage+.
#
# The ratios are computed in floating point. Integer division truncated them
# to multiples of 100 (0, 100, 200, ...), which made the threshold
# meaningless, and raised ZeroDivisionError for an empty sequence. With floats
# an empty sequence yields Infinity or NaN, which never compares below
# +percentage+, so such members are simply not selected.
#
# NOTE(review): one of the two ratios is always >= 100, so nothing is selected
# unless +percentage+ is greater than 100 - confirm this is the intended
# comparison.
#
# Returns an Array of the selected sequences, in cluster order.
def recover_different_lengths(percentage)
  seqs = []
  each_cluster do |cluster|
    master = get_master(cluster)
    cluster.each do |seq|
      next if seq.name == master.name

      seq_len = seq.seq_fasta.length
      master_len = master.seq_fasta.length
      seq_mas_len = seq_len * 100.0 / master_len
      mas_seq_len = master_len * 100.0 / seq_len
      seqs << seq if mas_seq_len < percentage && seq_mas_len < percentage
    end
  end
  seqs
end