module FlAnalysis

Public Instance Methods

analiza_orf_y_fl(seq, hit, options, db_name) click to toggle source
# File lib/full_lengther_next/fl_analysis.rb, line 8
def analiza_orf_y_fl(seq, hit, options, db_name)
        query_fasta = seq.seq_fasta.upcase.dup # Upcase for prevents complications with masked sequences, dup for discard changes
        if hit.count > 1 # if the sequence has more than one hit, the frames are checked and fixed to get a single hit
                        seq_unida = UneLosHit.new(hit, query_fasta)
                        full_prot =         seq_unida.full_prot    
                        query_fasta =       seq_unida.output_seq  # repaired fasta
                        final_hit =         seq_unida.final_hit            # single hit
                        $global_warnings += seq_unida.msgs          # warning messages
        else
                query_fasta = reverse_seq(query_fasta, hit.first) if hit.first.q_frame < 0 # si la secuencia esta al reves le damos la vuelta
                final_hit = hit.first # single hit
        end
        query_fasta = exonerate_fix_frame_shift(query_fasta, hit) if options[:exonerate]      

        full_prot = query_fasta[final_hit.q_frame-1, query_fasta.length+1].translate
        original_query_coordinates = [final_hit.q_beg, final_hit.q_end] ## VERBOSE
        seq.show_alignment(final_hit, query_fasta, show_nts) if  $verbose > 2 ## VERBOSE
        atg_status, tmp_prot = set_start_codon(final_hit, options[:distance], full_prot, query_fasta)
        end_status, final_prot = find_end(final_hit, options[:distance], tmp_prot, query_fasta)

        puts "\n------------------- POST EXTENSION---------------------" if $verbose > 1 ## VERBOSE
        seq.show_alignment(final_hit, query_fasta, show_nts, original_query_coordinates) if  $verbose > 1 ## VERBOSE
        puts "ATG: #{atg_status}  STOP: #{end_status}" if  $verbose > 2 ## VERBOSE

        # decide the sequence status (Complete, Putative Complete, Internal, N-terminus, Putative N-terminus, C-terminus)
        type, status = determine_status(atg_status, end_status)
        status = compare_seq_length_with_subject(final_prot, options[:distance], final_hit, type, status)
        if final_prot.length >= 25 && final_prot.length.to_f/final_hit.full_subject_length >= options[:subject_coverage] # Prot length min of 25 aa and subject coverage by generated prot of 25%
                save_annotations(seq, final_hit, type, status, final_prot, query_fasta, db_name)
        end
end
compare_seq_length_with_subject(final_prot, distance, final_hit, type, status) click to toggle source
# File lib/full_lengther_next/fl_analysis.rb, line 183
def compare_seq_length_with_subject(final_prot, distance, final_hit, type, status)
        if final_prot.length - 2 * distance > final_hit.s_len
                $global_warnings << ['SeqLonger', final_prot.length, final_hit.s_len]
        elsif final_prot.length + 2 * distance < final_hit.s_len
                $global_warnings << ['SeqShorter', final_prot.length, final_hit.s_len]
                if final_prot.length + 100 < final_hit.s_len || final_prot.length*2 < final_hit.s_len                                
                        if type == COMPLETE
                                status = FALSE
                                $global_warnings << 'VeryShorter'
                        end
                end
        end
        return status
end
determine_status(atg_status, end_status) click to toggle source
# File lib/full_lengther_next/fl_analysis.rb, line 162
def determine_status(atg_status, end_status)
        if atg_status != 'incomplete' && end_status != 'incomplete' # proteina completa
                type = COMPLETE
        elsif atg_status == 'incomplete' && end_status == 'incomplete' # region intermedia
                type = INTERNAL
        elsif atg_status != 'incomplete' && end_status == 'incomplete' # tenemos el principio de la proteina
                type = N_TERMINAL
        elsif atg_status == 'incomplete' && end_status != 'incomplete' # tenemos el final de la proteina
                type = C_TERMINAL
        end
        
        if atg_status == 'putative' || end_status == 'putative'
                status = FALSE # Putative
        else
                status = TRUE # Sure
        end

        return type, status
end
exonerate_fix_frame_shift(query_fasta, hit) click to toggle source
# File lib/full_lengther_next/fl_analysis.rb, line 239
def exonerate_fix_frame_shift(query_fasta, hit)
        frame_shifts = []
        added_nts = 0
        hit.each_with_index do |hsp, num|
                if hsp.class.to_s == 'ExoBlastHit' #Only this type of class of BlastHit has frameshift attributes
                        if !hsp.q_frameshift.empty? #There is frameshift
                                hsp.q_frameshift.each do |position, num_nts|
                                        local_add = 3 - num_nts
                                        fs_final_position = position + num_nts 
                                        $global_warnings << ['ExFrameS', fs_final_position]
                                        frame_shifts << [fs_final_position, local_add]
                                        added_nts += local_add
                                end
                        end
                end
                hsp.q_beg += added_nts if num > 0
                hsp.q_end += added_nts
        end
        add = 0
        frame_shifts.each do |position, num_nts|
                query_fasta = query_fasta.insert(position+add, 'n'*num_nts)
                add += num_nts
        end
        return query_fasta
end
find_end(final_hit, max_distance, tmp_prot, query_fasta) click to toggle source
# File lib/full_lengther_next/fl_analysis.rb, line 105
def find_end(final_hit, max_distance, tmp_prot, query_fasta)
        frame_shift = check_frame_shift(final_hit)
        beg_end_string =(final_hit.q_end-final_hit.q_beg)/3 - max_distance # Begin of terminal region (Coordinate) in tmp_prot
        atg_substring = tmp_prot[0..beg_end_string] # prot without terminal region
        end_substring = tmp_prot[beg_end_string + 1 ..tmp_prot.length-1] #Take 3' of unigen
        #puts "\e[32m\nfinal_hit.q_end-final_hit.q_beg: #{final_hit.q_end-final_hit.q_beg} /3  - max_distance: #{max_distance}\e[0m"
        #puts "\e[33mbeg_end_string: #{beg_end_string}\e[0m"
        #puts "\e[35mtmp_prot.length: #{tmp_prot.length}\e[0m"
        if beg_end_string < 0 || end_substring.nil? #Sequences whose homology is at end of it and dont't exits the 3' part of unigene
                atg_substring = tmp_prot
                end_substring = ''
                end_status = 'incomplete'
        else
                end_status = 'putative'
                putative_end = end_substring.index('*')
                end_substring = end_substring[0 .. putative_end] if putative_end
                
                s_end_resto = final_hit.s_len - (final_hit.s_end + 1) # en el subject, numero de aas que necesito cubrir
                q_end_resto = (query_fasta.length - final_hit.q_end)/3 # en el query, numero de aas que tengo
                sq_end_distance = q_end_resto - s_end_resto # La diferencia se hace a partir del final del hit para que el calculo no quede sesgado en caso de que la secuecia este truncada por 5'
                
                if (final_hit.align_len == final_hit.s_len && putative_end)||(sq_end_distance.abs  <= max_distance && putative_end && putative_end <= max_distance*2) #Stop in a Full-length. max_distance *2 is set by de margin of +-15aa at the end of aligment
                        end_status = 'complete'
                elsif sq_end_distance  < max_distance # si no tenemos suficiente secuencia para tener el stop (nos faltan 15 aas o mas)
                        end_status = 'incomplete'
                        if putative_end
                                $global_warnings << ['UnexpSTOP3pDist', sq_end_distance.abs]
                        else
                                $global_warnings << ['DistSubj', sq_end_distance.abs]
                        end
                else # tenemos suficiente secuencia
                        if putative_end # tenemos un stop
                                #beg_end_string indica en que punto del unigen se encuentra el area de busqueda del codon stop
                                stop_q_s = beg_end_string + putative_end - final_hit.s_len # Space between query's stop and subject's stop
                                if stop_q_s.abs <= max_distance #Stop codon is in search region
                                        end_status = 'complete'
                                elsif stop_q_s < 0
                                        $global_warnings << 'UnexpSTOP3p'
                                elsif stop_q_s > 0
                                        end_status = 'complete'
                                        $global_warnings << 'QueryTooLong'
                                end
                        else # no tenemos codon de parada pero tenemos suficiente secuencia
                                end_status = 'incomplete'
                                $global_warnings << 'ProtFusion'
                        end
                end
        end
        final_prot = atg_substring + end_substring
        end_status = 'complete' if final_prot.length == final_hit.s_len+1 && final_prot[final_prot.length-1] == '*'
        new_q_end = final_hit.q_beg-1 + final_prot.length * 3 + frame_shift
        modify_3p_align(new_q_end, final_hit, query_fasta, final_prot) if  $verbose > 1
        final_hit.q_end = new_q_end  
        return end_status, final_prot
end
find_start(subject_start, amine_seq, distance) click to toggle source
# File lib/full_lengther_next/fl_analysis.rb, line 68
def find_start(subject_start, amine_seq, distance)             
        atg_status = 'putative' # complete, incomplete or putative
        stop_codon = amine_seq.rindex('*')
        if !stop_codon.nil? # tenemos un codon de parada en el amine_seq 5 prima
                _5prime_UTR = amine_seq.length - 10 - subject_start # marcamos la distancia al s_beg desde el principio del amine_seq
                amine_seq = amine_seq[stop_codon + 1 .. amine_seq.length - 1]
                first_m = amine_seq.index('M')
                if stop_codon <= _5prime_UTR # Ver si stop está en zona 5 prima UTR
                        if first_m # tenemos M
                                amine_seq = amine_seq[first_m .. amine_seq.length - 1]                                     
                                atg_status = 'complete'
                        else # con STOP pero sin M
                                $global_warnings << 'noM1'
                        end
                else # esta despues, un cambio de fase impide analizar el principio
                        $global_warnings << 'UnexpSTOP5p'
                        amine_seq = amine_seq[first_m .. amine_seq.length - 1] if first_m # tenemos M
                end
        else # no hay stop codon
                first_m = amine_seq.index('M')
                if first_m # tenemos M
                        amine_seq = amine_seq[first_m .. amine_seq.length - 1]
                        m_distance = (subject_start - amine_seq.length).abs - 10
                        if m_distance.abs > distance*2 # con atg pero muy lejos del inicio que marca el subject
                                $global_warnings << 'NoStopMfar'
                                atg_status = 'incomplete'
                        else # Tenemos M
                                atg_status = 'complete'
                        end
                else # sin M
                        $global_warnings << 'noM2'
                end
        end
        return amine_seq, atg_status
end
mark_nt_seqs(final_hit, query_fasta) click to toggle source
# File lib/full_lengther_next/fl_analysis.rb, line 221
def mark_nt_seqs(final_hit, query_fasta)
        atg = query_fasta[final_hit.q_beg..final_hit.q_beg + 2]
        mark_atg = nil
        if atg == 'ATG'
                mark_atg = '_-_'     
        end
        stop = query_fasta[final_hit.q_end - 2..final_hit.q_end]
        mark_stop = nil
        if stop == 'TAG' || stop == 'TGA' || stop == 'TAA'
                mark_stop = '___'
        end
        seq5p = query_fasta[0..final_hit.q_beg-1]
        orf = query_fasta[final_hit.q_beg..final_hit.q_end]
        seq3p = query_fasta[final_hit.q_end..query_fasta.length]
        nt_seq = "#{seq5p}#{mark_atg}#{orf}#{mark_stop}#{seq3p}"
        return nt_seq
end
modify_3p_align(new_q_end, final_hit, query_fasta, final_prot) click to toggle source
# File lib/full_lengther_next/fl_analysis.rb, line 274
def modify_3p_align(new_q_end, final_hit, query_fasta, final_prot) ## For visual report
        if new_q_end > final_hit.q_end #There is an align extension
                extend_align = query_fasta[final_hit.q_end+1 .. new_q_end].translate
                final_hit.q_seq = final_hit.q_seq + extend_align
        elsif new_q_end < final_hit.q_end #The align is cutted
                upper_limit = final_prot.length - 1 + final_hit.q_seq.count('-')
                final_hit.q_seq = final_hit.q_seq[0 .. upper_limit]
        end
end
modify_5p_align(new_q_beg, final_hit, query_fasta) click to toggle source
# File lib/full_lengther_next/fl_analysis.rb, line 285
def modify_5p_align(new_q_beg, final_hit, query_fasta) ## For visual report
        if new_q_beg < final_hit.q_beg #There is an align extension
                extend_align = query_fasta[new_q_beg .. final_hit.q_beg-1].translate
                final_hit.q_seq = extend_align + final_hit.q_seq
        elsif new_q_beg > final_hit.q_beg #The align is cut
                seq_cut = (new_q_beg - final_hit.q_beg)/3
                gaps = final_hit.q_seq[0..seq_cut].count('-')
                seq_cut += gaps
                final_hit.q_seq = final_hit.q_seq[seq_cut .. final_hit.q_seq.length-1]
        end
end
save_annotations(seq, final_hit, type, status, final_prot, query_fasta, db_name) click to toggle source
# File lib/full_lengther_next/fl_analysis.rb, line 199
def save_annotations(seq, final_hit, type, status, final_prot, query_fasta, db_name)
        # if the sequence is Complete or it hasn't previous info will be saved
        if seq.type == UNKNOWN || (type == COMPLETE && seq.type != COMPLETE)
                seq.type = type
                seq.status = status
                seq.db_name = db_name
                seq.seq_fasta = query_fasta
                seq.seq_aa = final_prot
                seq.hit = final_hit
                seq.warnings($global_warnings)
                $global_warnings = [] # Clean all warnings for current sequence
                seq.seq_nt = mark_nt_seqs(final_hit, query_fasta)
                if type == COMPLETE
                        seq.ignore = TRUE
                end
        end
        if  $verbose > 2
                puts "\e[1mStruct annot: #{seq.prot_annot_calification}\e[0m"
        end
end
set_start_codon(final_hit, distance, full_prot, query_fasta) click to toggle source
# File lib/full_lengther_next/fl_analysis.rb, line 41
def set_start_codon(final_hit, distance, full_prot, query_fasta)
        q_index_start = contenidos_en_prot(final_hit.q_seq, full_prot) 
        atg_status = nil
        _5prima = q_index_start + distance

        if  final_hit.s_beg == 0 && final_hit.q_seq[0] == 'M' && final_hit.s_seq[0] == 'M' #there is M in query and subject at first position of alignment and subject's M is in first position
                atg_status = 'complete'
                tmp_prot = full_prot[q_index_start..full_prot.length]
        elsif _5prima >= final_hit.s_beg
                amine_seq = full_prot[0, _5prima] #Contiene parte amino de la proteina
                carboxile_seq = full_prot[_5prima, full_prot.length - _5prima] #Contiene parte carboxilo de la proteina hasta el fin de la secuencia
                length_before_cut = amine_seq.length
                amine_seq, atg_status = find_start(final_hit.s_beg, amine_seq, distance) # to look for the beginning of the protein
                tmp_prot = "#{amine_seq}#{carboxile_seq}" # merge seqs in prot
                new_q_beg = final_hit.q_frame-1 + (length_before_cut - amine_seq.length) * 3
                modify_5p_align(new_q_beg, final_hit, query_fasta)   if  $verbose > 1 ## VERBOSE, Modify query align
                final_hit.q_beg = new_q_beg # to get the value of the start_ORF index
        else
                $global_warnings << 'UnexpStopBegSeq' if full_prot[0, q_index_start].rindex('*')
                atg_status = 'incomplete'
                tmp_prot = full_prot
        end

        return atg_status, tmp_prot
end
show_nts() click to toggle source

VERBOSE METHODS

# File lib/full_lengther_next/fl_analysis.rb, line 267
def show_nts
        show = FALSE  
        show = TRUE if $verbose && $verbose > 3
        return show
end