class MyWorkerManagerFln

Attributes

seqs_annotation_prot[RW]

MANAGER INITIALIZATION

seqs_some_coding[RW]

MANAGER INITIALIZATION

seqs_unknown[RW]

MANAGER INITIALIZATION

Public Class Methods

asign_coding_attributes(seq, coding) click to toggle source
# File lib/full_lengther_next/my_worker_manager_fln.rb, line 437
# Marks +seq+ as CODING from its predicted ORFs and moves it from the
# 'unknown' counters of the global statistics to the 'coding' ones.
#
# seq    - sequence object; its type, status, t_code and hit are overwritten.
# coding - Hash of orf_id => ORF info for this sequence. The info arrays are
#          built by get_coding_info as [x_ratio, type, start, stop, strand],
#          and get_scores appends the score. The hash is filtered in place by
#          select_orf.
def self.asign_coding_attributes(seq, coding)
        seq.type = CODING
        length = seq.seq_fasta.length
        @@stats_hash['unknown'] -= 1
        @@stats_hash['unknown_>200'] -= 1 if length > 200
        @@stats_hash['unknown_>500'] -= 1 if length > 500
        @@stats_hash['coding_>200'] += 1 if length > 200
        @@stats_hash['coding_>500'] += 1 if length > 500
        @@stats_hash['coding'] += 1

        # From here on +coding+ is the info array of the single ORF that was chosen.
        coding = select_orf(coding)
        if coding[1] == 'complete'
                # Literal `true`: the uppercase TRUE constant is not defined in current Ruby versions.
                seq.status = true
                @@stats_hash['coding_sure'] += 1
        else
                @@stats_hash['coding_putative'] += 1
        end

        seq.t_code = coding.last

        # The frame is derived from the start coordinate on '+' and from the
        # stop coordinate on '-', and is reported negative on the minus strand.
        minus_strand = coding[4] == '-'
        reference = minus_strand ? coding[3] : coding[2]
        frame = (reference % 3) + 1
        frame = -frame if minus_strand
        seq.hit = [coding[2], coding[3], frame]
end
correct_by_selected(selected, coding_info) click to toggle source
# File lib/full_lengther_next/my_worker_manager_fln.rb, line 422
# Overwrites the type of every ORF that is not listed in the +selected+ file.
#
# selected    - path of a text file with one "seq_name|orf_id" entry per line.
# coding_info - Hash of seq_name => { orf_id => info }; modified in place.
#
# Returns the same +coding_info+ hash, where info[1] is '-' for each ORF whose
# id is absent from the file.
def self.correct_by_selected(selected, coding_info)
        # Hash used as a lookup table so the membership test does not scan a list per ORF.
        selected_ids = {}
        # File.foreach closes the file once the block finishes.
        File.foreach(selected) do |line|
                _seq_name, orf_id = line.chomp.split('|', 2)
                selected_ids[orf_id] = true
        end
        coding_info.each_value do |orfs|
                orfs.each do |orf, info|
                        info[1] = '-' unless selected_ids.key?(orf)
                end
        end
        coding_info
end
end_work_manager() click to toggle source

close files

# File lib/full_lengther_next/my_worker_manager_fln.rb, line 190
# Final step of the manager: optionally runs the TransDecoder ORF prediction,
# writes the summary and mapping reports and closes every opened output file.
def self.end_work_manager
        transdecoder_requested = @@options[:acess_db].include?('p')
        if transdecoder_requested && !@@complete_sure.empty? && !@@seqs_to_analyze.empty?
                orf_prediction_with_transdecoder
        end

        write_summary_stats(
                @@stats_hash,
                @@stats_taxonomy,
                @@stats_functional_annotation_by_seqs,
                @@stats_different_prot_id,
                @@stats_different_prot_id_complete_seqs,
                @@pre_fln_seq_lengths,
                @@seq_lengths,
                @@output_files['stats_txt'],
                @@output_files['stats_html']
        )
        write_mapping_report(@@fpkm, @@coverage_analysis, @@stats_functional_annotation_by_seqs)

        # Some entries are plain path strings instead of open handles; those are skipped.
        @@output_files.each_value do |target|
                target.close unless target.instance_of?(String)
        end
end
get_annotations() click to toggle source
# File lib/full_lengther_next/my_worker_manager_fln.rb, line 550
# Returns the three sequence lists collected for RepTrans as an array:
# [protein-annotated sequences, coding sequences, unknown sequences].
def self.get_annotations
        [@@seqs_annotation_prot, @@seqs_some_coding, @@seqs_unknown]
end
get_coding_info(file_name) click to toggle source
# File lib/full_lengther_next/my_worker_manager_fln.rb, line 499
# Reads the ORFs found in a peptide FASTA file.
#
# file_name - path of the FASTA file. Record names are "seq_name|orf_id" and
#             the comments are searched for "type:<type>" and
#             ":<start>-<stop>(<strand>)".
#
# Returns a Hash of seq_name => { orf_id => [x_ratio, type, start, stop, strand] },
# where x_ratio is the fraction of 'X' characters in the ORF sequence. When the
# comments lack a field, type/strand are nil and start/stop are 0.
# Any StandardError raised while reading is reported on stdout (together with
# its real cause) and the entries collected so far are returned.
def self.get_coding_info(file_name)
        coding_info = {}
        begin
                FastaQualFile.new(file_name, '').each do |name, seq, comments, _qual|
                        seq_name, orf_id = name.split('|')
                        # Explicit matches instead of $1/$2/$3, which could carry stale captures.
                        annotation = comments.to_s
                        type = annotation[/type:(\S+)/, 1]
                        coords = annotation.match(/:(\d+)-(\d+)\(([+-])\)/)
                        start, stop, strand = coords ? [coords[1].to_i, coords[2].to_i, coords[3]] : [0, 0, nil]
                        x_ratio = seq.count('X') / seq.length.to_f
                        (coding_info[seq_name] ||= {})[orf_id] = [x_ratio, type, start, stop, strand]
                end
        rescue StandardError => e
                puts "Warning!!!!!!!!!!: Transdecoder file is missing. Check if Transdecoder is installed"
                # Show what actually failed: the cause is not always a missing file.
                puts "Cause: #{e.class}: #{e.message}"
        end
        coding_info
end
get_max_score(orfs_hash) click to toggle source
# File lib/full_lengther_next/my_worker_manager_fln.rb, line 473
# Returns the largest value found in the last position of the info arrays of
# +orfs_hash+ (orf_id => info), or nil when the hash is empty.
def self.get_max_score(orfs_hash)
        orfs_hash.each_value.inject(nil) do |best, info|
                candidate = info.last
                best.nil? || candidate > best ? candidate : best
        end
end
get_min_Xratio(orfs_hash) click to toggle source
# File lib/full_lengther_next/my_worker_manager_fln.rb, line 486
# Returns the smallest value found in the first position of the info arrays of
# +orfs_hash+ (orf_id => info), or nil when the hash is empty.
def self.get_min_Xratio(orfs_hash)
        orfs_hash.each_value.inject(nil) do |lowest, info|
                candidate = info.first
                lowest.nil? || candidate < lowest ? candidate : lowest
        end
end
get_scores(file_name, coding_info) click to toggle source
# File lib/full_lengther_next/my_worker_manager_fln.rb, line 527
# Adds the score of each ORF to +coding_info+ and discards ORFs without a
# positive score.
#
# file_name   - path of a tab-separated file: "seq_name|orf_id", then the score
#               in the first of the remaining columns.
# coding_info - Hash of seq_name => { orf_id => info }; modified in place.
#
# For every ORF already present in +coding_info+: a score > 0 is appended to
# its info array; otherwise the ORF is deleted, and the sequence entry too when
# it has no ORFs left. Lines for unknown sequences/ORFs and blank lines are ignored.
# Returns +coding_info+.
def self.get_scores(file_name, coding_info)
        # File.foreach closes the file once the block finishes.
        File.foreach(file_name) do |line|
                fields = line.chomp.split("\t")
                name = fields.shift
                next if name.nil? # blank line: nothing to parse

                seq, orf_id = name.split('|')
                coding = coding_info[seq]
                next if coding.nil?

                orf = coding[orf_id]
                next if orf.nil?

                score = fields.first.to_f
                if score > 0
                        orf << score
                else
                        coding.delete(orf_id)
                        coding_info.delete(seq) if coding.empty?
                end
        end
        coding_info
end
get_seqs(seqs) click to toggle source
# File lib/full_lengther_next/my_worker_manager_fln.rb, line 414
# Builds a FASTA-formatted string (">name\nsequence\n" per entry) from a list
# of objects responding to seq_name and seq_fasta. Returns '' for an empty list.
def self.get_seqs(seqs)
        seqs.inject('') do |fasta, entry|
                fasta << ">#{entry.seq_name}\n#{entry.seq_fasta}\n"
        end
end
init_work_manager(options) click to toggle source

open files and prepare global data

# File lib/full_lengther_next/my_worker_manager_fln.rb, line 91
# Opens the output files and prepares the class-level state used by the manager.
#
# options - Hash of run options. Keys read here: :verbose, :fasta, :tax_group,
#           :acess_db, :chimera, :files2map and :remove_unmapped.
#
# Side effects: creates the 'fln_results' directory when it does not exist,
# opens (truncating) the report files inside it, sets $verbose, and reads the
# annotation indexes located under ENV['BLASTDB'] when :acess_db includes 's'
# and/or 't'.
def self.init_work_manager(options)
        @@stats_hash = initialize_stats_hash
        @@stats_taxonomy = {}
        @@stats_functional_annotation_by_seqs = {}
        @@stats_different_prot_id = []
        @@stats_different_prot_id_complete_seqs = []
        @@pre_fln_seq_lengths = []
        @@seq_lengths = []

        @@map_object = {}

        @@options = options

        $verbose = options[:verbose]

        input_file = options[:fasta]

        # File.exist? is used because File.exists? is not available in current Ruby versions.
        Dir.mkdir('fln_results') unless File.exist?('fln_results')

        # Index of each annotation type inside the field list stored per accession
        # (see load_functional_annotations / get_functional_annotations).
        @@func_annot_type = {
                :go_id => 5,
                :go_description => 6,
                :kegg_id => 7,
                :interpro_id => 8,
                :interpro_description => 9,
                :ec_id => 10,
                :pfam_id => 11,
                :pfam_desc => 12,
                :unipathway_id => 13
        }

        @@functional_annotations = {}
        @@functional_annotations.merge!(load_functional_annotations(File.join(ENV['BLASTDB'], 'sp_'+options[:tax_group],'sp_'+options[:tax_group]+'.index'))) if options[:acess_db].include?('s')
        @@functional_annotations.merge!(load_functional_annotations(File.join(ENV['BLASTDB'], 'tr_'+options[:tax_group],'tr_'+options[:tax_group]+'.index'))) if options[:acess_db].include?('t')

        @@fasta_file = FastaQualFile.new(input_file,'')
        file_head = "Query_id\tfasta_length\tSubject_id\tdb_name\tStatus\te_value\tp_ident\ts_length\tprotein_length\tWarning_msgs\tframe\tORF_start\tORF_end\ts_start\ts_end\tDescription\tgo_id\tgo_description\tkegg_id\tinterpro_id\tinterpro_description\tec_id\tpfam_id\tpfam_description\tunipathway_id"

        @@output_files = {}
        # Seq annotation files
        if !options[:chimera].nil?
                @@output_files[CHIMERA] = File.open("fln_results/chimeric_sequences.txt", 'w')
                @@output_files[CHIMERA].puts file_head
        elsif File.exist?("fln_results/chimeric_sequences.txt")
                # Remove the chimera report left by a previous run.
                File.delete("fln_results/chimeric_sequences.txt")
        end
        @@output_files[OTHER]        = File.open('fln_results/artifact_other.txt', 'w')
        @@output_files[MISASSEMBLED] = File.open('fln_results/misassembled.txt', 'w')
        @@output_files[UNKNOWN]      = File.open('fln_results/unknown.txt', 'w')
        @@output_files['db']         = File.open('fln_results/pt_seqs', 'w')
        @@output_files[CODING]       = File.open('fln_results/new_coding.txt', 'w')
        @@output_files[NCRNA]        = File.open('fln_results/nc_rnas.txt', 'w')

        # Complementary files
        @@output_files['align']      = File.open('fln_results/alignments.txt', 'w')
        @@output_files['prot']       = File.open('fln_results/proteins.fasta', 'w') # FASTA
        @@output_files['nts']        = File.open("fln_results/nt_seq.txt", 'w')
        @@output_files['seqs']       = File.open('fln_results/unigenes.fasta', 'w') # FASTA
        # Stored as a path (String), not as an open handle.
        @@output_files['stats_html'] = 'fln_results/summary_stats.html'
        @@output_files['stats_txt']  = File.open('fln_results/summary_stats.txt', 'w')

        @@output_files[CODING].puts file_head
        @@output_files['db'].puts file_head
        @@output_files[NCRNA].puts file_head

        if !options[:files2map].empty?
                @@output_files['fpkm']     = File.open('fln_results/fpkm_per_transcript.txt', 'w')
                @@output_files['coverage'] = File.open('fln_results/coverage_per_transcript.txt', 'w')
                @@output_files['fpkm'].puts %w[Transcript_id fpkm Read_counts].join("\t")
                @@output_files['coverage'].puts %w[seq_name mean_normalized_differences mean_max mean_coverage proportion_sequence_mapped].join("\t")
                if options[:remove_unmapped]
                        @@output_files[UNMAPPED] = File.open('fln_results/unmapped.txt', 'w')
                        @@output_files[UNMAPPED].puts "Query_id\tfasta_length"
                end
        end

        #RepTrans module
        @@seqs_annotation_prot = []
        @@seqs_some_coding     = []
        @@seqs_unknown         = []

        #Transdecoder module
        @@complete_sure = []
        @@seqs_to_analyze = []

        #Mapping_info
        @@fpkm = {}
        @@coverage_analysis = {}

end
load_functional_annotations(annotation_file) click to toggle source

CUSTOM FUNCTIONS

# File lib/full_lengther_next/my_worker_manager_fln.rb, line 309
# Loads a tab-separated annotation index.
#
# annotation_file - path of the index; the first column of each line is the
#                   accession and the remaining columns are its annotation fields.
#
# Returns a Hash of accession => Array with the remaining fields of the line.
def self.load_functional_annotations(annotation_file)
        functional_annotations = {}
        # File.foreach closes the file once the block finishes.
        File.foreach(annotation_file) do |line|
                fields = line.chomp.split("\t")
                acc = fields.shift
                functional_annotations[acc] = fields
        end
        functional_annotations
end
orf_prediction_with_transdecoder() click to toggle source
# File lib/full_lengther_next/my_worker_manager_fln.rb, line 390
# Runs TransDecoder over the sequences kept for analysis, using the reliable
# complete sequences as training set, and applies the predicted ORFs.
#
# Inside the temporary directory (@@options[:temp]) it writes
# 'training_set.fasta' and 'analyse_set.fasta', launches the TransDecoder
# command and parses its output files from the 'transdecoder' work directory.
# Afterwards every sequence in @@seqs_to_analyze gets its coding attributes
# (when an ORF was found), is offered to RepTrans and is written to the outputs.
def self.orf_prediction_with_transdecoder
        clusters_seqs_annot_prot = clustering_by_id(@@complete_sure)
        final_seqs = select_representative(clusters_seqs_annot_prot)
        coding_info = nil
        Dir.chdir(@@options[:temp]) do
                File.open('training_set.fasta', 'w') { |f| f.write(get_seqs(final_seqs)) }
                File.open('analyse_set.fasta', 'w') { |f| f.write(get_seqs(@@seqs_to_analyze)) }
                cmd = "TransDecoder -t analyse_set.fasta --workdir transdecoder --train training_set.fasta"
                # Dir.exist? is used because Dir.exists? is not available in current Ruby versions.
                cmd << ' --reuse' if Dir.exist?('transdecoder')
                system(cmd)
                coding_info = get_coding_info('transdecoder/longest_orfs.pep')
                coding_info = get_scores('transdecoder/longest_orfs.cds.scores', coding_info)
                coding_info = correct_by_selected('transdecoder/longest_orfs.cds.scores.selected', coding_info)
        end
        @@seqs_to_analyze.each do |seq|
                coding = coding_info[seq.seq_name]
                asign_coding_attributes(seq, coding) unless coding.nil?
                repTrans_keep_seq(seq)
                seq.write_info(@@output_files)
        end
end
repTrans_keep_seq(seq) click to toggle source
# File lib/full_lengther_next/my_worker_manager_fln.rb, line 364
# Class-level variant: when the :reptrans option is set, stores +seq+ in the
# list that matches its type (protein-annotated, coding or unknown).
# Sequences of any other type are not stored.
def self.repTrans_keep_seq(seq)
        return if @@options[:reptrans].nil?

        bucket = case seq.type
                when COMPLETE .. INTERNAL then @@seqs_annotation_prot
                when CODING then @@seqs_some_coding
                when UNKNOWN then @@seqs_unknown
        end
        bucket << seq unless bucket.nil?
end
select_orf(orfs_hash) click to toggle source
# File lib/full_lengther_next/my_worker_manager_fln.rb, line 461
# Picks one ORF info array out of +orfs_hash+ (orf_id => info).
#
# Only the ORFs with the minimum first value (X ratio) are considered; among
# them the 'complete' ones are preferred when there is any, and the one with
# the highest last value (score) wins. Returns nil for an empty hash.
# Note: +orfs_hash+ is filtered in place.
def self.select_orf(orfs_hash)
        lowest_x = get_min_Xratio(orfs_hash)
        orfs_hash.keep_if { |_id, info| info.first == lowest_x }

        candidates = orfs_hash.select { |_id, info| info[1] == 'complete' }
        candidates = orfs_hash if candidates.empty?

        top_score = get_max_score(candidates)
        candidates.keep_if { |_id, info| info.last == top_score }
        candidates.values.first
end

Public Instance Methods

error_received(worker_error, obj) click to toggle source
# File lib/full_lengther_next/my_worker_manager_fln.rb, line 285
# Reports on stdout that a worker failed while processing +obj+.
#
# worker_error - error object exposing the underlying exception through
#                original_exception.
# obj          - the work unit that failed. When its first element is a
#                Sequence, the name of that sequence identifies the unit;
#                otherwise obj.inspect is printed.
def error_received(worker_error, obj)
        sample = obj.respond_to?(:first) ? obj.first : obj
        # seq_name belongs to the sampled element, not to the collection that holds it.
        data = sample.is_a?(Sequence) ? sample.seq_name : obj.inspect
        cause = worker_error.original_exception
        # backtrace is nil for an exception that was never raised.
        trace = Array(cause.backtrace).join("\n")
        puts "WARNING!!!!!. CHUNK FAILED:Error while processing object #{data}\n" + cause.message + ":\n" + trace
end
get_functional_annotations(seq) click to toggle source
# File lib/full_lengther_next/my_worker_manager_fln.rb, line 320
# Attaches the functional annotations of the hit of +seq+ to the sequence and
# records, per GO description, which sequences carry it.
#
# The accession of the hit (without the "-<number>" splicing suffix) is looked
# up in @@functional_annotations; nothing is done when it is not found.
# Returns the annotations Hash (annotation type => value) or nil.
def get_functional_annotations(seq)
        all_info = @@functional_annotations[seq.hit.acc.gsub(/-\d+/,'')]  #gsub removes splicing code of uniprot accesion
        return if all_info.nil?

        annotations = {}
        @@func_annot_type.each do |type, position|
                annotations[type] = all_info[position]
        end
        # The field can be nil: String#split drops trailing empty columns when the
        # index is loaded, so short lines have no value at this position.
        annotations[:go_description].to_s.split(";").each do |annot|
                (@@stats_functional_annotation_by_seqs[annot] ||= []) << seq.seq_name
        end
        seq.functional_annotations = annotations
end
next_work() click to toggle source

This method is called every time a worker needs new data to work on; it is invoked as many times as the chunk size indicates. It returns the work data, or nil if no more data is available.

# File lib/full_lengther_next/my_worker_manager_fln.rb, line 205
# Supplies the next unit of work for a worker (manages the workers' input).
#
# While there are files to map: returns a mapping task if reference files are
# still pending, or :sleep while fewer mapping results than @@options[:n_refs]
# have been received. In any other case returns the next sequence task
# (nil when send_fln_data has nothing left).
def next_work
        unless @@options[:files2map].empty?
                return send_mapping_data unless @@options[:ref_files].empty?
                return :sleep if @@map_object.length < @@options[:n_refs]
        end
        send_fln_data
end
receive_fln_data(objs) click to toggle source
# File lib/full_lengther_next/my_worker_manager_fln.rb, line 265
# Processes the sequences returned by a worker: keeps them for TransDecoder
# and RepTrans, collects taxonomy, functional annotations and mapping data,
# writes them to the output files and updates the summary statistics.
def receive_fln_data(objs)
        objs.each do |seq|
                transdecoder_keep_seq(seq)
                repTrans_keep_seq(seq)

                if seq.type > UNKNOWN && seq.type < NCRNA
                        get_taxonomy(seq.hit.definition, @@stats_taxonomy)
                        get_functional_annotations(seq)
                end

                @@fpkm[seq.seq_name] = seq.fpkm unless seq.fpkm.empty?
                @@coverage_analysis[seq.seq_name] = seq.coverage_analysis unless seq.coverage_analysis.empty?

                # Don't write Unknown or coding sequences when use transdecoder
                databases = @@options[:acess_db]
                left_for_transdecoder = !databases.include?('c') && databases.include?('p') &&
                        (seq.type == UNKNOWN || seq.type == CODING)
                write_seq(seq) unless left_for_transdecoder
        end

        @@stats_hash, @@stats_different_prot_id, @@stats_different_prot_id_complete_seqs, @@seq_lengths =
                summary_stats(objs, @@stats_hash, @@stats_different_prot_id, @@stats_different_prot_id_complete_seqs, @@seq_lengths)
end
receive_mapping_data(objs) click to toggle source
# File lib/full_lengther_next/my_worker_manager_fln.rb, line 280
# Merges the mapping results returned by a worker into @@map_object.
def receive_mapping_data(objs)
        @@map_object.merge!(objs)
end
repTrans_keep_seq(seq) click to toggle source
# File lib/full_lengther_next/my_worker_manager_fln.rb, line 351
# Instance-level variant: when the :reptrans option is set, stores +seq+ in
# the list that matches its type (protein-annotated, coding or unknown).
# Sequences of any other type are not stored.
def repTrans_keep_seq(seq)
        return if @@options[:reptrans].nil?

        bucket = case seq.type
                when COMPLETE .. INTERNAL then @@seqs_annotation_prot
                when CODING then @@seqs_some_coding
                when UNKNOWN then @@seqs_unknown
        end
        bucket << seq unless bucket.nil?
end
send_fln_data() click to toggle source
# File lib/full_lengther_next/my_worker_manager_fln.rb, line 226
# Builds the next sequence task from the input FASTA file.
#
# Returns { fln: Sequence } or nil when the input has no more sequences.
# When mapping results are available, the last two values stored for the
# sequence become its fpkm and the remaining ones its coverage analysis.
def send_fln_data
        n, f, q = @@fasta_file.next_seq
        return nil if n.nil?

        seq = Sequence.new(n, f, q)
        begin
                unless @@map_object.empty?
                        cov_analysis = @@map_object[n]
                        if cov_analysis.nil?
                                seq.fpkm = []
                                seq.coverage_analysis = []
                        else
                                # pop(2) removes the two trailing values from the stored array.
                                seq.fpkm = cov_analysis.pop(2)
                                seq.coverage_analysis = cov_analysis
                        end
                end
        rescue StandardError => e
                # StandardError only: Interrupt, SystemExit and similar must not be swallowed.
                puts e.message, e.backtrace.join("\n")
        end

        @@pre_fln_seq_lengths << f.length
        sequence_stats(seq, @@stats_hash)
        { fln: seq }
end
send_mapping_data() click to toggle source
# File lib/full_lengther_next/my_worker_manager_fln.rb, line 222
# Takes the next pending reference file off @@options[:ref_files] and wraps it
# as a mapping task.
def send_mapping_data
        next_reference = @@options[:ref_files].shift
        { mapping: next_reference }
end
too_many_errors_received() click to toggle source
# File lib/full_lengther_next/my_worker_manager_fln.rb, line 296
# Logs that the run is being aborted because of the number of errors.
# NOTE(review): @@error_count and @@count are not assigned in the visible
# part of this class -- presumably they are set elsewhere; confirm.
def too_many_errors_received
        summary = "Too many errors: #{@@error_count} errors on #{@@count} executed sequences, exiting before finishing"
        $LOG.error(summary)
end
transdecoder_keep_seq(seq) click to toggle source
# File lib/full_lengther_next/my_worker_manager_fln.rb, line 377
# When the 'p' database option is active, keeps +seq+ for the TransDecoder
# step: COMPLETE sequences with a truthy status and enough identity go to the
# training list; CODING and UNKNOWN sequences go to the list to analyse.
def transdecoder_keep_seq(seq)
        return unless @@options[:acess_db].include?('p')

        case seq.type
        when COMPLETE
                reliable = seq.status && seq.hit.ident >= @@options[:training_ident]
                @@complete_sure << seq if reliable
        when CODING, UNKNOWN
                @@seqs_to_analyze << seq
        end
end
work_received(objs) click to toggle source

This method is executed each time a worker finishes processing an object.

# File lib/full_lengther_next/my_worker_manager_fln.rb, line 255
# Dispatches the result returned by a worker (manages the workers' output).
#
# objs - Hash whose first pair is task => data; :fln data goes to
#        receive_fln_data and :mapping data to receive_mapping_data.
#        Any other task is ignored.
def work_received(objs)
        task, data = objs.first
        case task
        when :fln then receive_fln_data(data)
        when :mapping then receive_mapping_data(data)
        end
end
worker_initial_config() click to toggle source

send initial config

# File lib/full_lengther_next/my_worker_manager_fln.rb, line 301
# Initial configuration sent to the workers: the options Hash received by
# init_work_manager.
def worker_initial_config
        @@options
end
write_seq(seq) click to toggle source

write results to files

# File lib/full_lengther_next/my_worker_manager_fln.rb, line 341
# Writes the results of +seq+ to the output files.
# A failure while writing is reported on stdout and does not stop the run.
def write_seq(seq)
        seq.write_info(@@output_files)
rescue StandardError => e
        # StandardError only: Interrupt, SystemExit and similar must not be swallowed.
        puts "Error printing #{seq.seq_name}"
        puts e.message, e.backtrace.join("\n")
end