class Rebuild
Public Class Methods
new(dataset,dataset_uni_hsp,path)
click to toggle source
# File lib/gene_assembler/rebuild.rb, line 7 def initialize(dataset,dataset_uni_hsp,path) #La clase ha de recibr objetos dataset @dataset=dataset @dataset_uni_hsp=dataset_uni_hsp @path=path @db_seqs=fasta_hash(path[:exonerate_db]) end
Public Instance Methods
add_uni_hsp(model,cluster)
click to toggle source
# File lib/gene_assembler/rebuild.rb, line 248 def add_uni_hsp(model,cluster)#Compara contigs uni-hsp con contig modelo para determinar pseudogenes contigs_uni_hsp='' is_contig=0 pseudogenes=[] @clusters_uni_hsp.each do |contigs| if contigs.first.first_hit.name==cluster.first.first_hit.name contigs_uni_hsp=contigs is_contig=1 break end end if is_contig==1 #Si se ha encontrado contigs uni-hsp se realiza la comparacion if model.class.to_s!='Array' model=[model] end model.each do |item| contigs_uni_hsp.each do |contig| start,exons=item.compare(contig) if exons>1 && !pseudogenes.include?(contig) pseudogenes << contig end end end end return pseudogenes end
array_contigs_to_contig(array_contigs)
click to toggle source
# File lib/gene_assembler/rebuild.rb, line 852 def array_contigs_to_contig(array_contigs) contig=Contig.new(array_contigs.first.name) array_contigs.each do |cn| contig.transfer_contig_hits(cn) end contig.length=array_contigs.last.length return contig end
build_gene_array(contigs,reference=nil)
click to toggle source
# File lib/gene_assembler/rebuild.rb, line 276 def build_gene_array(contigs,reference=nil) #GEnera un array que representa la posicion relativa de todos los contigs entre si a nivel de los exones y de intrones gene_array=[] gene_array_introns=[] last_contig='' if !reference.nil? last_contig=reference end contigs.each do |contig| array_contig=[] array_contig_introns=[] n_exon=contig.first_hit.hsp_count #Contamos cantidad de hsps en el contig #Determinar posiciones vacias if !gene_array.empty?||reference first_exon,ex=contig.compare(last_contig) #Comparamos el contig actual con el que se ha estudiado en la iteracion anterior if reference && first_exon==-1 # Abortar alineamiento cuando un contig no coincide con la referencia if $verbose puts "\n#{contig.name} alignment step OUT OF RANGE" end gene_array=[] gene_array_introns=[] break end if first_exon==-1 gene_array.last.count.times do #Posiciones vacias cuando NO hay overlapping array_contig << 0 # Marca ausencia de exon para esa posicion array_contig_introns << 0 # Marca ausencia de intron para esa posicion end else if reference # ASignamiento de la posicion del contig respecto a la referencia void_positions=first_exon else void_positions=first_exon+gene_array.last.count(0) end void_positions.times do #Posiciones vacias cuando HAY overlapping array_contig << 0 array_contig_introns << 0 end end end #Agregar exones e intrones del contig exones=contig.exones_s introns=contig.intrones_q array_contig << exones # Marca presencia de exon para esa posicion array_contig_introns << introns # Marca presencia de exon para esa posicion gene_array << array_contig.flatten! gene_array_introns << array_contig_introns.flatten! if reference.nil? last_contig=contig end end return gene_array, gene_array_introns end
contig_compact(contigs)
click to toggle source
# File lib/gene_assembler/rebuild.rb, line 430 def contig_compact(contigs) # Toma un conjunto de contigs, busca los q son correlativos, los fusiona, pasa por el exonerate y devuelve un array con los nuevos contig cn_def=[] cn_backup=contigs.dup #Determinar contigs a fusionar cn_to_merge=[] s_end=nil last_position_ref=nil position_overlap=nil last_contig=nil fusion=[] contigs.length.times do fusion << FALSE end #Marcaje de contigs correlativos no solapantes contigs.each_with_index do |contig,i| if i>0 diference=contig.first_hit.first_hsp.s_beg-s_end if diference==0 || diference==1 fusion[i]=TRUE end end s_end=contig.first_hit.last_hsp.s_end end if fusion.include?(TRUE) #Construccion array contigs a fusionar y guardado de los solapantes fusion_contigs=[] count=0 # Marca la posicion de las fusiones fusion.each_with_index do |cont,i| if cont if !fusion_contigs.include?(contigs[i-1]) fusion_contigs << contigs[i-1] end if !fusion_contigs.include?(contigs[i]) fusion_contigs << contigs[i] end else if !fusion_contigs.empty?#Marcar fusiones cn_to_merge << fusion_contigs fusion_contigs=[] cn_def << count count+=1 end if !fusion[i+1]||fusion[i+1].nil?#Guardar contigs que no participan en las fusiones cn_def << contigs[i] end end if i+1==fusion.length && !fusion_contigs.empty? #Control fin de bucle cn_to_merge << fusion_contigs cn_def << count count+=1 end end #Generar fasta de los contig fusionados contigs_merge=contigs_seq_merge(cn_to_merge) if !contigs_merge.empty? temp=File.open(File.join(@path[:local],contigs.first.first_hit.name+'.fasta'),'w') contigs_merge.each_with_index do |seq,i| temp.puts ">Fusion_#{i}\n#{seq}" end temp.close temp_db=File.open(File.join(@path[:local],contigs.first.first_hit.name+'.db'),'w') temp_db.puts ">#{contigs.first.first_hit.name}\n#{@db_seqs[contigs.first.first_hit.name]}" temp_db.close end #Exonerating cmd="exonerate -q #{File.join(@path[:local],contigs.first.first_hit.name+'.db')} -t #{File.join(@path[:local],contigs.first.first_hit.name+'.fasta')} -Q protein -T dna -m protein2genome --percent 1 --showalignment 0 --useaatla 1 --showvulgar > #{File.join(@path[:local],contigs.first.first_hit.name+'.ex')}" #LINUX command line system(cmd) #Parsing exonerate local = ParserExonerate.new('contig','nucleotide_match', File.join(@path[:local],"#{contigs.first.first_hit.name}.ex")) store_local_ex = local.dataset #store_local_ex.each_contig {|ite| puts ite.name+' '+ite.first_hit.name; ite.indices} store_local_ex.score_correction(30) #puts "#{store_local_ex.contig_count}\t#{contigs_merge.length}" if store_local_ex.contig_count==contigs_merge.length #Recuperar atributos en contigs y cargar array con contigs def store_local_ex.each_contig_with_index{|contig,i| contig.seq=contigs_merge[i] contig.length=contigs_merge[i].length contig.first_hit.s_length=contigs.first.first_hit.s_length cn_def.each_with_index do |contig_def,j| # Busqueda de la posicion de la fusion y asignacion en el array de contigs definitivos if contig_def==i cn_def[j]=contig end end } else cn_def=cn_backup end else cn_def=contigs end return cn_def end
contigs_seq_merge(contigs)
click to toggle source
# File lib/gene_assembler/rebuild.rb, line 533 def contigs_seq_merge(contigs) #Devuelve un array con las secuencias fusionadas a partir del array contigs donde se le proporciona los arrays a fusionar cn=[] seq='' contigs.each do |contigs_to_merge| contigs_to_merge.each do |contig| if seq.empty? seq=contig.seq else seq=seq+'n'*10+contig.seq end end cn << seq seq='' end return cn end
correct_model(model, length_model, gff_dataset, sequences_hash)
click to toggle source
# File lib/gene_assembler/rebuild.rb, line 954 def correct_model(model, length_model, gff_dataset, sequences_hash) correct_add_Ns=0 if model.class.to_s=='Array' model=array_contigs_to_contig(model) model.name=model.name.gsub('_0','') model.length=length_model end correct_add_Ns=gff_dataset.correct_left_side_contigs(model) model.modified_coordenates(correct_add_Ns) model.length+=correct_add_Ns gff_dataset.align_contigs(model) #Corregir secuencia para que alinee con las features generadas if correct_add_Ns>0 sequences_hash[model.name.gsub('_model','')]='n'*correct_add_Ns+sequences_hash[model.name.gsub('_model','')] end return model end
eval_model(local_model, length)
click to toggle source
# File lib/gene_assembler/rebuild.rb, line 171 def eval_model(local_model, length)#modifica model asi q se le ha de pasar una copia recover=0 overlap=0 if local_model.class.to_s=='Array' local_model=array_contigs_to_contig(local_model) local_model.length=length end recover=recover_test(local_model,FALSE) overlap=overlap_test(local_model,FALSE) return recover, overlap end
format_model(model)
click to toggle source
# File lib/gene_assembler/rebuild.rb, line 944 def format_model(model) if model.n_hits?>1 model.each_hit_with_index{|hit,i| hit.name=hit.name+"_gene_#{i}" } else model.first_hit.name=model.first_hit.name+'_gene' end end
gene_array_and_compact(rebuild,cluster,reference)
click to toggle source
# File lib/gene_assembler/rebuild.rb, line 221 def gene_array_and_compact(rebuild,cluster,reference) # Contruir array de exones (a partir del cluster) con los hsps de forma que los solapantes se alineen en las mismas columnas #--------------------------------------------------------------------------------------------------------------------------- if rebuild gene_array,gene_array_introns=build_gene_array(cluster,reference) #Con referencia if $verbose gene_exons=gene_stadistics(gene_array) gene_stadistics_report(gene_exons,'EXONS') gene_introns=gene_stadistics(gene_array_introns) gene_stadistics_report(gene_introns,'INTRONS') end end # Seleccion de contigs para modelado de gen #----------------------------------------------------- if $verbose #Info cluster before compact array contigs gene_array_report(gene_array,cluster,rebuild) end if rebuild gene_compact(gene_array,cluster) #Se descartan los contigs redundantes y quedan aquellos que cubren todo el gen para formar el modelo end if $verbose && rebuild #Info cluster after compact array contigs gene_array_report(gene_array,cluster,rebuild) end return cluster, gene_array end
gene_array_report(gene_array,contigs,act_array)
click to toggle source
# File lib/gene_assembler/rebuild.rb, line 356 def gene_array_report(gene_array,contigs,act_array) #Muestra el array de la funncion build_gene_array y una representacion de las secuencias if act_array puts "\nGENE ARRAY" gene_array.each_with_index do |fila,c| print "#{contigs[c].name.center(24)}\t " print "#{contigs[c].completed}\t" fila.each do |item| print "#{item}\t" end puts "\n" end end puts "\nMAP" contigs.each do |contig| print "#{contig.name.center(25)}" print contig.draw end end
gene_compact(gene_array, contigs)
click to toggle source
# File lib/gene_assembler/rebuild.rb, line 376 def gene_compact(gene_array, contigs) # Generacion modelo del gen quitando todas las secuencias redundantes posibles gene_array.each_with_index do |contig,c1| if !contig next end c1_len=contig.length n_exons=contig.count{|x| x>0} gene_array.each_with_index do |contig2,c2| if !contig2 ||c1==c2 #Saltamos contigs a nil o autocomparacion next end c2_len=contig2.length # IGUAL if c1_len==c2_len if contig2.count{|x| x>0}==n_exons if contigs[c1].first_hit.first_hsp.score>=contigs[c2].first_hit.first_hsp.score gene_array[c2]=nil contigs[c2]=nil else gene_array[c1]=nil contigs[c1]=nil break end elsif contig2.count{|x| x>0}>n_exons gene_array[c1]=nil contigs[c1]=nil break else gene_array[c2]=nil contigs[c2]=nil end # MAYOR QUE elsif c1_len>c2_len if contig.count(0)<=contig2.count(0) gene_array[c2]=nil contigs[c2]=nil end # MENOR QUE elsif c1_len<c2_len if contig.count(0)==contig2.count(0) gene_array[c1]=nil contigs[c1]=nil break end end end #end contig2 end #end contig gene_array.compact! contigs.compact! end
gene_error(e, gene_name, file_error, cluster, model)
click to toggle source
# File lib/gene_assembler/rebuild.rb, line 861 def gene_error(e, gene_name, file_error, cluster, model) #e is a ruby exception object puts gene_name+' ERROR' file_error.puts "\n"+gene_name+"\n.............................." file_error.puts e.message e.backtrace.each do |line| file_error.puts line end file_error.puts ',,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,' cluster.each do |contig| file_error.puts contig.name #puts contig.name #contig.indices end file_error.puts '----------------------------------------------------------------------------------------' end
gene_model_cut(contigs, reference=nil)
click to toggle source
# File lib/gene_assembler/rebuild.rb, line 550 def gene_model_cut(contigs, reference=nil) #Genera un modelo por corte y empalme de contigs, genera un gff y devuelve un array con objetos contig q_beg=[] q_end=[] s_beg=[] s_end=[] seq=[] last_contig=nil last_score=0 length_model=0 multiple_lengths=[] add_length=TRUE add_last=0 last_position_ref=nil last_position=nil lengthy=[] out_of_range=FALSE contigs.each do |contig| score = contig.first_hit.first_hsp.score/contig.length*contig.exon_acumulative n_exones = contig.first_hit.hsp_count # FIRST CONTIG #------------------------------------------------------- if last_contig.nil? q_end_seq=nil #SEQ contig.first_hit.each_hsp_with_index{|hsp,i| q_beg << hsp.q_beg q_end << hsp.q_end s_beg << hsp.s_beg s_end << hsp.s_end #SEQ................................. if i==0 seq << contig.seq[0..contig.first_hit.first_hsp.q_end-1] elsif i+1==n_exones seq << contig.seq[contig.first_hit.hsps[i-1].q_end..contig.length-1] else seq << contig.seq[q_end_seq..hsp.q_end-1] end q_end_seq=hsp.q_end # ................................... } length_model+=contig.length if !reference.nil? #Posicionamiento del primer contig en la referencia last_position_ref,ex=contig.compare(reference) if last_position==-1 || last_position_ref+contig.first_hit.hsp_count-1 > reference.first_hit.hsp_count #Abortar modelado en caso de qun contig no alinee con la referencia o la sobrepase puts contig.name+' OUT OF RANGE' out_of_range=TRUE break end end # OTHER CONTIG #-------------------------------------------------------- else position_overlap,ex=contig.compare(last_contig) #Correccion posicion del contig en base a una referencia if !reference.nil? last_position_ref,position_overlap=position_reference_guided(contig,last_contig,last_position_ref,reference) if last_position_ref==-1 || last_position_ref+contig.first_hit.hsp_count-1 > reference.first_hit.hsp_count out_of_range=TRUE puts contig.name+' OUT OF RANGE' break end end # NOT OVERLAP #.......................... if position_overlap==-1 || contig.first_hit.hsp_count==1 if contig.first_hit.first_hsp.s_beg-last_contig.first_hit.last_hsp.s_end>1 # Marcar discontinuidad en caso de que el contig no sea correlativo al anterior q_beg << 0 q_end << 0 s_beg << 0 s_end << 0 multiple_lengths << length_model length_model=contig.length last=length_model add_length=FALSE seq.last << 'n'*10 #SEQ Indicacion de GAP else last=length_model #Guardamos longitud anterior para poder desplazar las coordenadas del contig correctamente length_model+=contig.length end q_end_seq=nil #SEQ contig.first_hit.hsps.each_with_index do |hsp,i| add_no=last if !add_length add_no=0 end q_beg << hsp.q_beg+add_no # Se acumula a las coordenadas la longitud del modelo q_end << hsp.q_end+add_no s_beg << hsp.s_beg s_end << hsp.s_end #SEQ................................. if i==0 cn=contig.seq[0..contig.first_hit.first_hsp.q_end-1] cs="#{cn[0..1].swapcase!}#{cn[2..-1]}" seq << cs elsif i+1==n_exones seq << contig.seq[contig.first_hit.hsps[i-1].q_end..contig.length-1] else seq << contig.seq[q_end_seq..hsp.q_end-1] end q_end_seq=hsp.q_end # ................................... end # OVERLAP #.......................... else if last_position==-1 add_last=length_model-last_contig.length end overlap=last_contig.first_hit.hsp_count-position_overlap if last_contig.first_hit.hsp_count ==1 overlap=1 end #puts "#{overlap} = #{last_contig.first_hit.hsp_count} - #{position_overlap}" add=0 dif=0 if last_score>=score add=last_contig.first_hit.last_hsp.q_end-contig.first_hit.first_hsp.q_end dif=add if overlap>1 #eliminamos ultimo exon de 'last contig' para reemplazar por el segundo de 'contig' q es mas fiable por ser interno add=last_contig.first_hit.hsp_at(last_contig.first_hit.hsp_count-overlap).q_end-contig.first_hit.first_hsp.q_end #Como se dropea el ultimo exon se alinea por el penultimo #puts "hsp:#{contig.first_hit.hsp_count}\toverlap:#{overlap}" dif=contig.first_hit.hsp_at(overlap-1).q_end q_beg=q_beg.reverse.drop(1).reverse q_end=q_end.reverse.drop(1).reverse s_beg=s_beg.reverse.drop(1).reverse s_end=s_end.reverse.drop(1).reverse seq=seq.reverse.drop(1).reverse #SEQ end if overlap==1 overlap=2 end (contig.first_hit.hsp_count-(overlap-1)).times do |n| #Añadimos el resto de exones del contig al modelo q_beg << contig.first_hit.hsp_at(n+overlap-1).q_beg+add+add_last q_end << contig.first_hit.hsp_at(n+overlap-1).q_end+add+add_last s_beg << contig.first_hit.hsp_at(n+overlap-1).s_beg s_end << contig.first_hit.hsp_at(n+overlap-1).s_end #SEQ....................................... position_hsp=n+overlap-2 if position_hsp <0 position_hsp= 0 end position_next_hsp=n+overlap-1 if position_next_hsp < 0 position_next_hsp =0 end if n==0 cn=contig.seq[contig.first_hit.hsp_at(position_hsp).q_end..contig.first_hit.hsp_at(position_next_hsp).q_end-1] cs=cn[0..1].swapcase!+cn[2..-1] seq << cs elsif position_next_hsp==contig.first_hit.hsp_count-1 seq << contig.seq[contig.first_hit.hsp_at(position_hsp).q_end..contig.length-1] else seq << contig.seq[contig.first_hit.hsp_at(position_hsp).q_end..contig.first_hit.hsp_at(position_next_hsp).q_end-1] end #............................................ end else hsp_position=last_contig.first_hit.hsp_count-2 if hsp_position<0 #para los casos de los contigs q solo poseen un hsp hsp_position=0 end add=last_contig.first_hit.hsp_at(hsp_position).q_end dif=last_contig.length-add drop=1 correction=0 if overlap>1 drop=overlap-1 correction=1 add=last_contig.first_hit.hsp_at(position_overlap).q_end-contig.first_hit.first_hsp.q_end dif=length_model-(add+add_last) end # Eliminamos exones malos de 'last_contig' (mantenemos el primero del overlap) q_beg=q_beg.reverse.drop(drop).reverse q_end=q_end.reverse.drop(drop).reverse s_beg=s_beg.reverse.drop(drop).reverse s_end=s_end.reverse.drop(drop).reverse seq=seq.reverse.drop(drop).reverse #SEQ # Añadimos los exones de 'contig' (excepto el primero) (contig.first_hit.hsp_count-correction).times do |n| #Añadimos el resto de exones del contig al modelo q_beg << contig.first_hit.hsp_at(n+correction).q_beg+add+add_last q_end << contig.first_hit.hsp_at(n+correction).q_end+add+add_last s_beg << contig.first_hit.hsp_at(n+correction).s_beg s_end << contig.first_hit.hsp_at(n+correction).s_end #SEQ............................................................. if n+1==(contig.first_hit.hsp_count-correction) n_correction=n+correction-1 if n_correction < 0 n_correction=0 end seq << contig.seq[contig.first_hit.hsp_at(n_correction).q_end..contig.length-1] elsif n==0 if n+correction==0 cn=contig.seq[0..contig.first_hit.hsp_at(n+correction).q_end-1] # Si n+corr empieza en el primer exon del contig else cn=contig.seq[contig.first_hit.hsp_at(n+correction-1).q_end..contig.first_hit.hsp_at(n+correction).q_end-1] end cs=cn[0..1].swapcase!+cn[2..-1] seq << cs else seq << contig.seq[contig.first_hit.hsp_at(n+correction-1).q_end..contig.first_hit.hsp_at(n+correction).q_end-1] end #................................................................ end end length_model+=(contig.length-dif) add_length=TRUE add_last+=add end end last_position=position_overlap last_contig=contig last_score=score lengthy << length_model end if !multiple_lengths.empty? multiple_lengths << length_model length_model=multiple_lengths end model=nil if !out_of_range #Generar modelo si todos los contigs han alineado con la referencia model=void_contig(contigs.first.first_hit.name+'_model',length_model,contigs.first.first_hit.s_length,q_beg,q_end,s_beg,s_end,'contig','gene','exon') #Merge contigs under sequence reference model_length=nil if model.class.to_s=='Array' add=0 model.each_with_index do |contig,i| contig.modified_coordenates(add) add+=contig.length if i<model.length-1 add+=10 end end model_length=add+model.last.length end model_n_exones=seq.length final_seq=seq.join else #No generar modelo si al menos un contig no alinea contra la referencia model_length=nil final_seq=nil end return model, model_length, final_seq end
gene_stadistics(gene_array)
click to toggle source
# File lib/gene_assembler/rebuild.rb, line 329 def gene_stadistics(gene_array) #Calcula el nº exones diferentes que hay por cada posicion del gene_array exons=[] length=length2D(gene_array) length.times do |column| exon=[] gene_array.each_with_index.each do |item,row| if !exon.include?(gene_array[row][column]) && gene_array[row][column]!=0 exon << gene_array[row][column] end end exons << exon end exons_stadistic=[] exons.each do |ex| exons_stadistic << ex.compact.length end return exons_stadistic end
gene_stadistics_report(exons_stadistic,tag)
click to toggle source
# File lib/gene_assembler/rebuild.rb, line 348 def gene_stadistics_report(exons_stadistic,tag) #Muestra estadisticas de intrones o exones print "\n#{tag}\t" exons_stadistic.each do |item| print "#{item}\t" end print "\n" end
iterative_modeling_gene_w_reference(cluster,references_hash,options,sequences_hash)
click to toggle source
end main method
# File lib/gene_assembler/rebuild.rb, line 93 def iterative_modeling_gene_w_reference(cluster,references_hash,options,sequences_hash) # Model atributes model=nil length=0 seq=nil cluster_length=0 length_cluster=0 prot_reference=cluster.first.first_hit.name array_references=references_hash[prot_reference] # Model parameters recover=0 overlap=0 #Modelo de gen en ciego if $verbose puts "\n",'|||||||||| BLIND MODELING ||||||||||' end model, length, seq, length_cluster= modeling_gene(cluster.dup,nil,options) recover, overlap=eval_model(model.dup, length) if $verbose puts "\nRecover: #{recover} Overlap: #{overlap}" end guided=FALSE #Modelo de gen guiado if !array_references.nil? array_references.each do |ref| if $verbose puts "\n",'|||||||||| GUIDED MODELING ||||||||||' end guided_model, guided_length, guided_seq, guided_length_cluster = modeling_gene(cluster.dup,ref,options) if guided_model.nil? # Si algun modelo sale mal se ignora next end guided_recover, guided_overlap= eval_model(guided_model.dup, guided_length) if $verbose puts "\nRecover: #{guided_recover} Overlap: #{guided_overlap}" end #Arbol de decisiones if guided_overlap <= 15 #Si el overlap es menor del 15 % if guided_overlap >= overlap-overlap*0.05 && guided_overlap <= overlap+overlap*0.05 # A mismo overlap if guided_recover > recover guided=TRUE end else # A distinto overlap recover_dif=guided_recover-recover if recover_dif < 0 # Si el guided_model tiene menos recuperacion q el anterior if recover_dif.abs >= overlap-overlap*0.05 && recover_dif.abs <= overlap+overlap*0.05 #Si la reduccion de la recuperacion se debe a la desaparicion del overlap guided=TRUE end elsif recover_dif> guided_overlap+guided_overlap*0.05 # Comprobar que la diferencia de recover no se debe a u aumento del overlap en la misma magnitud guided=TRUE end end elsif guided_overlap < overlap # Quedarnos siempre con los overlap mas bajos aun en situacion de overlap alto guided=TRUE end if guided model=guided_model length=guided_length seq=guided_seq length_cluster=guided_length_cluster recover=guided_recover overlap=guided_overlap end end end sequences_hash[prot_reference]=seq return model, length, length_cluster end
modeling_gene(cluster, reference, rebuild)
click to toggle source
# File lib/gene_assembler/rebuild.rb, line 183 def modeling_gene(cluster, reference, rebuild) #Funcion que devuelve un objeto contig con el modelo de gen, los contigs q se han seleccionado y genera un gff del modelo model=nil model_length=nil seq=nil length_cluster=0 # Reduccion iterativa de los contig para seleccionar los que van a formar parte del modelo de gen, elimina fragmentos menores que se puedan tomar como nuevos exones #-------------------------------------------------------------------------------------------------- gene_array_length_before=nil continue=TRUE gene_array=[] while continue cluster,gene_array=gene_array_and_compact(rebuild,cluster,reference) gene_array_length_after=length2D(gene_array) if gene_array_length_after == gene_array_length_before continue=FALSE end gene_array_length_before=gene_array_length_after end length_cluster=cluster.length # Modelado del gen #---------------------------------------------------- if rebuild && !cluster.empty? && !gene_array.empty? if cluster.length >1 cluster_comp=contig_compact(cluster) #Fusiona contigs contiguos y devuelve el array correspondiente else cluster_comp=cluster end if !cluster_comp.nil? model, model_length, seq=gene_model_cut(cluster_comp, reference) else puts cluster.first.first_hit.name+"\tGENE MODEL ABORTED" end end return model, model_length, seq, length_cluster end
overlap_test(model,output=TRUE)
click to toggle source
# File lib/gene_assembler/rebuild.rb, line 877 def overlap_test(model,output=TRUE) perc_overlap=0 overlap=model.overlap total=0 if !overlap.empty? if output print 'WARNING: overlap/s ' end overlap.each do |length_overlap| if output print (length_overlap*-3).to_s+', ' end total+=length_overlap end perc_overlap=(total*-100.0/model.first_hit.s_length).round(2) if output puts 'nt. % Total overlap '+perc_overlap.to_s end end return perc_overlap end
position_reference_guided(contig,last_contig,last_position_ref,reference)
click to toggle source
# File lib/gene_assembler/rebuild.rb, line 840 def position_reference_guided(contig,last_contig,last_position_ref,reference)# Si no existe overlap devuelve -1 position_ref,ex=contig.compare(reference) if !last_position_ref.nil? if position_ref<=last_position_ref+(last_contig.first_hit.hsp_count-1) #Overlap position_overlap=(last_position_ref-position_ref).abs else #No overlap position_overlap=-1 end end return position_ref,position_overlap end
rebuild(options)
click to toggle source
MAIN METHOD
# File lib/gene_assembler/rebuild.rb, line 17 def rebuild(options) #Genera contigs modelo,gff y busca pseudogenes gff_dataset_model=Dataset.new(:mix) #Object for save info of GeneAssembler's output gff_dataset=Dataset.new(:mix) #Object for save info of GeneAssembler's output file_error=File.open(@path[:error],'w') file_web=web_header(options[:web],@path[:html]) sequences_hash={} gene_name=nil model=nil statistics={:genes => 0, :total_recovered => 0, :total_overlap => 0, :total_fragmentation => 0} puts "\nMODELING GENE",'*******************************************' @dataset.each_cluster{|cluster| begin if !cluster.nil? gene_name=cluster.first.first_hit.name end cluster_complete=cluster.dup model, length_model, length_cluster = iterative_modeling_gene_w_reference(cluster,@dataset.references_hash,options[:rebuild],sequences_hash) #Realiza la reconstruccion del gen (alineado,descarte y montaje del gen) # GeneAssembler output (gff for Gbrowse) #-------------------------------------------------------- if !model.nil? #Format Contigs children of model gff_dataset.clr_contigs gff_dataset.transfer_contigs(cluster_complete) gff_dataset.transfer_n_contigs_def_hit_type(@dataset_uni_hsp,cluster,'pseudogene',50) #Transferir pseudogenes al report # Convertir arrays a contig y ajustar alineamiento añadiendo Ns model=correct_model(model, length_model, gff_dataset, sequences_hash) # Comprobaciones en el modelo exones=model.exones_s.length # N exones puts 'Exones: '+ exones.to_s recovered=recover_test(model) overlap=overlap_test(model) fragmentation=((length_cluster-1.00)/exones).round(2) puts 'Fragmentation: ' + fragmentation.to_s # HTML index if !file_web.nil? gene_link=html_link(model.first_hit.name, @path[:gbrowse_link]+model.first_hit.name) html_row(file_web, [gene_link, cluster.first.first_hit.s_length, exones, recovered, overlap, fragmentation]) end #Format Model for Gbrowse gff_dataset_model.clr_contigs format_model(model) #Añade la particula _gene al modelo gff_dataset_model.transfer_contigs(model) #Write write_gbrowse_gff(gff_dataset_model, gff_dataset, @path[:gff], model.name) #General statistics statistics[:genes]+=1 statistics[:total_recovered]+=recovered statistics[:total_overlap]+=overlap statistics[:total_fragmentation]+=fragmentation end rescue Exception => e gene_error(e, gene_name, file_error, cluster_complete, model) end puts '* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *' } file_error.close web_body(file_web) puts "\nFINAL STATISTICS\n", 'Recovered genes: '+ statistics[:genes].to_s, 'Mean recover: ' + (statistics[:total_recovered]/statistics[:genes]).to_s, 'Mean overlap: ' + (statistics[:total_overlap]/statistics[:genes]).to_s, 'Mean fragmentation: ' + (statistics[:total_fragmentation]/statistics[:genes]).to_s write_model_fasta(sequences_hash,@path[:fasta]) end
recover_test(model,output=TRUE)
click to toggle source
# File lib/gene_assembler/rebuild.rb, line 899 def recover_test(model,output=TRUE) recovered=0 model.exones_s.each do |exon| recovered+=exon end recovered=(recovered*100.0/model.first_hit.s_length).round(2) if output puts "Recovered\t"+model.first_hit.name+"\t#{recovered}" end return recovered end
void_contig(contig_name,contig_length,s_length,q_beg,q_end,s_beg,s_end,contig_type,hit_type,hsp_type,single=FALSE)
click to toggle source
# File lib/gene_assembler/rebuild.rb, line 803 def void_contig(contig_name,contig_length,s_length,q_beg,q_end,s_beg,s_end,contig_type,hit_type,hsp_type,single=FALSE) #Genera un objeto contig con los datos proporcionados contigs=[] is_contig=1 contig=nil n=0 q_beg.each_with_index do |item,ind| if item>0 ||single if contig==nil if contig_length.class.to_s=='Array' length=contig_length[n] name="#{contig_name}_#{n}" else length=contig_length name=contig_name end contig=Contig.new(name) contig.length=length contig.type=contig_type hit_v=contig.add_hit(contig_name,s_length,1,:prot) hit_v.type=hit_type end hsp_v=contig.first_hit.add_hsp(q_beg[ind], q_end[ind], s_beg[ind], s_end[ind], 0, 0, 0, 0) hsp_v.type=hsp_type end if item==0 && contig!=nil && !single||q_beg.length-1==ind if single ||!q_beg.include?(0) contigs=contig else contigs << contig end n+=1 contig=nil end end return contigs end
web_body(file_web)
click to toggle source
# File lib/gene_assembler/rebuild.rb, line 921 def web_body(file_web) if !file_web.nil? html_table_footer(file_web) html_footer(file_web) file_web.close end end
web_header(web, path)
click to toggle source
# File lib/gene_assembler/rebuild.rb, line 911 def web_header(web, path) file_web=nil if web file_web=File.open(path,'w') html_header(file_web,'Gene index') html_table_header(file_web,1,['Gene model name', 'Protein length', 'Num exon', '% recovered protein', '% overlapping sequence', 'Fragmentation']) end return file_web end
write_gbrowse_gff(gff_dataset_model, gff_dataset, path, name)
click to toggle source
# File lib/gene_assembler/rebuild.rb, line 937 def write_gbrowse_gff(gff_dataset_model, gff_dataset, path, name) gff_model=ReportGff.new(gff_dataset_model,path,'s') gff_model.create('a') gff=ReportGff.new(gff_dataset,path,'s') gff.create('a',name) end
write_model_fasta(sequences_hash, path)
click to toggle source
# File lib/gene_assembler/rebuild.rb, line 929 def write_model_fasta(sequences_hash, path) model_file=File.open(path,'w') sequences_hash.each do |model| model_file.puts '>'+model[0]+"_model\n"+model[1] end model_file.close end