class Dataset
Attributes
clusters[RW]
contigs[RW]
references_hash[RW]
type[RW]
Public Class Methods
new(type)
click to toggle source
# File lib/gene_assembler/dataset.rb, line 5
# Creates an empty dataset with no contigs and no clusters.
# Original note: a blast object is loaded to generate the contig objects
# that populate this class.
#
# type - label stored in @type (original note: defined but not used).
def initialize(type)
  @type = type
  @contigs = Array.new
  @clusters = Array.new
  # Starts as an empty string; load_references later assigns a Hash here.
  @references_hash = ''
end
Public Instance Methods
add_contig(name)
click to toggle source
# File lib/gene_assembler/dataset.rb, line 12
# Creates a Contig called +name+, appends it to @contigs and returns it.
def add_contig(name)
  new_contig = Contig.new(name)
  @contigs.push(new_contig)
  new_contig
end
align_contigs(contig_base)
click to toggle source
# File lib/gene_assembler/dataset.rb, line 388
# Shifts the coordinates of the contigs in @contigs so that they line up,
# either with each other (contig_base is nil) or against a reference contig.
#
# contig_base - reference contig every contig is compared with, or nil to
#               compare each contig with the one that precedes it.
#
# Returns whatever each_contig_with_index returns.
def align_contigs(contig_base)
  use_reference = !contig_base.nil?
  # Without a reference the first contig has no predecessor, so index 0 is
  # not compared (limit 0); with a reference every contig is compared (limit -1).
  limit = use_reference ? -1 : 0
  last_contig = contig_base
  add = 0
  align = true
  each_contig_with_index do |contig, i|
    if i > limit
      # Offset of this contig with respect to the previous one in the gff.
      overlap_exon_with_last, = contig.compare(last_contig)
      if overlap_exon_with_last == -1
        if use_reference
          align = false
        else
          add += last_contig.length
        end
      else
        overlap_exon_current, = last_contig.compare(contig)
        add += coord_prot(last_contig.hsp_at(overlap_exon_with_last),
                          contig.hsp_at(overlap_exon_current))
        align = true if use_reference
      end
    end

    # Modify the contig when there is no reference or when it aligned
    # against the reference.
    if align || !use_reference
      contig.modified_coordenates(add)
      contig.length += add
    end

    if use_reference
      last_contig = contig_base
      add = 0 # reset the offset when a reference is in use
    else
      last_contig = contig
    end
  end
end
attrib_recover(dataset)
click to toggle source
# File lib/gene_assembler/dataset.rb, line 103
# Restores attributes that were lost along the way (original note: exonerate
# does not carry them, so they are recovered from the blast dataset).
#
# For every contig of this dataset, the first contig of +dataset+ with the
# same name supplies length, seq and, for each hit, the s_length of its
# first hit.
def attrib_recover(dataset)
  each_contig do |own|
    dataset.each_contig do |donor|
      next unless own.name == donor.name

      own.length = donor.length
      own.seq = donor.seq
      own.each_hit do |hit|
        hit.s_length = donor.first_hit.s_length
      end
      break # only the first contig with a matching name is used
    end
  end
end
clear_clusters()
click to toggle source
# File lib/gene_assembler/dataset.rb, line 344
# Replaces @clusters with a fresh empty array and returns that array.
def clear_clusters
  @clusters = Array.new
end
clr_contigs()
click to toggle source
# File lib/gene_assembler/dataset.rb, line 76
# Empties @contigs by replacing it with a fresh array, which is returned.
def clr_contigs
  @contigs = Array.new
end
cluster_count()
click to toggle source
# File lib/gene_assembler/dataset.rb, line 98
# Returns the number of clusters held in @clusters.
def cluster_count
  @clusters.size
end
clustering()
click to toggle source
# File lib/gene_assembler/dataset.rb, line 124
# Groups the contigs by the name of their first hit (the subject id) and
# appends one array per distinct name to @clusters. Clusters appear in the
# order their name is first seen, and contigs keep their order inside each
# cluster. Existing entries of @clusters are kept.
#
# Returns whatever each_contig returns.
def clustering
  by_subject = {} # Hash keeps insertion order, so first-seen order is preserved
  visited = each_contig do |contig|
    (by_subject[contig.first_hit.name] ||= []) << contig
  end
  @clusters.concat(by_subject.values)
  visited
end
clusters_empty?()
click to toggle source
# File lib/gene_assembler/dataset.rb, line 80
# Returns true when @clusters holds no cluster at all, false otherwise.
def clusters_empty?
  @clusters.empty?
end
contig_count()
click to toggle source
# File lib/gene_assembler/dataset.rb, line 93
# Returns the number of contigs held in @contigs.
def contig_count
  @contigs.size
end
correct_hsp_contigs(blast_coor_type)
click to toggle source
# File lib/gene_assembler/dataset.rb, line 118
# Calls correct_hsps(blast_coor_type) on every contig of the dataset.
def correct_hsp_contigs(blast_coor_type)
  each_contig do |contig|
    contig.correct_hsps(blast_coor_type)
  end
end
correct_left_side_contigs(contig_base)
click to toggle source
# File lib/gene_assembler/dataset.rb, line 357
# Computes the shift needed to correct negative indexes in the gff.
#
# Contigs are compared with each other, or with contig_base when one is
# given. For every overlapping pair the coord_prot difference is computed
# and the lowest value below zero is remembered.
#
# contig_base - reference contig, or nil to compare each contig with the
#               one that precedes it.
#
# Returns the lowest difference with its sign flipped (0 when none was negative).
def correct_left_side_contigs(contig_base)
  anchored = !contig_base.nil?
  # Without a reference the first contig has nothing to be compared with.
  first_skipped = anchored ? -1 : 0
  previous = contig_base
  lowest = 0

  each_contig_with_index do |contig, idx|
    if idx > first_skipped
      pos_in_previous, _count = contig.compare(previous)
      if pos_in_previous > -1
        pos_in_current, _count = previous.compare(contig)
        shift = coord_prot(previous.hsp_at(pos_in_previous), contig.hsp_at(pos_in_current))
        lowest = shift if shift < lowest
      end
    end
    previous = anchored ? contig_base : contig
  end

  -lowest
end
delete_cluster(cluster)
click to toggle source
# File lib/gene_assembler/dataset.rb, line 38
# Removes every entry of @clusters equal to +cluster+.
# Returns the value Array#delete returns (the removed item, or nil if absent).
def delete_cluster(cluster)
  @clusters.delete(cluster)
end
delete_cluster_at(ind)
click to toggle source
# File lib/gene_assembler/dataset.rb, line 42
# Removes the cluster stored at index +ind+ of @clusters.
# Returns the value Array#delete_at returns (the removed cluster, or nil
# when the index is out of range).
def delete_cluster_at(ind)
  @clusters.delete_at(ind)
end
each_cluster() { |cluster| ... }
click to toggle source
# File lib/gene_assembler/dataset.rb, line 59
# Yields each cluster of @clusters, in order, to the given block.
def each_cluster
  @clusters.each { |item| yield item }
end
each_cluster_with_index() { |cluster,i| ... }
click to toggle source
# File lib/gene_assembler/dataset.rb, line 65
# Yields each cluster of @clusters together with its index.
def each_cluster_with_index
  @clusters.each_with_index { |item, position| yield item, position }
end
each_contig() { |contig| ... }
click to toggle source
Iterator: yields each contig stored in @contigs.
# File lib/gene_assembler/dataset.rb, line 47
# Yields each contig of @contigs, in order, to the given block.
def each_contig
  @contigs.each { |item| yield item }
end
each_contig_with_index() { |contig,i| ... }
click to toggle source
# File lib/gene_assembler/dataset.rb, line 53
# Yields each contig of @contigs together with its index.
def each_contig_with_index
  @contigs.each_with_index { |item, position| yield item, position }
end
fasta(fasta_file)
click to toggle source
# File lib/gene_assembler/dataset.rb, line 208
# Writes the contigs to +fasta_file+ in FASTA layout: a ">name" header line
# followed by the contig sequence.
#
# fasta_file - path of the file to create or overwrite.
#
# Returns nil.
def fasta(fasta_file)
  # Block form closes the handle even when writing a contig raises.
  File.open(fasta_file, 'w') do |out|
    each_contig do |contig|
      out.print ">#{contig.name}\n"
      out.puts contig.seq
    end
  end
  nil
end
filtering()
click to toggle source
# File lib/gene_assembler/dataset.rb, line 164
# Runs the battery of filters over @contigs.
#
# Mixed, gapped and truncated contigs, and contigs for which
# hsp_minor_than?(15) holds (original note: in nt), are discarded.
# Single-hsp contigs are set aside. The survivors replace @contigs and are
# listed on stdout when $verbose is set.
#
# Returns the array of single-hsp contigs that were set aside.
def filtering
  kept = []
  single_hsp = []
  each_contig do |contig|
    next if contig.mixed?

    if contig.is_one_hsp?
      single_hsp << contig # single-hsp contigs are kept apart
      next
    end
    next if contig.is_gapped? || contig.is_truncated? || contig.hsp_minor_than?(15)

    kept << contig
    puts "#{contig.first_hit.name}\t#{contig.name}" if $verbose
  end
  @contigs = kept
  single_hsp
end
filtering_clust()
click to toggle source
# File lib/gene_assembler/dataset.rb, line 217
# Runs the battery of filters over @clusters and, when $verbose is set,
# prints what was kept and what was rejected.
#
# In each cluster, mixed, truncated, single-hsp and gapped contigs are
# rejected; single-hsp contigs are also collected for later processing.
# Every cluster is replaced by the list of contigs that passed, which may
# be empty.
#
# Returns the array of single-hsp contigs found in all clusters.
def filtering_clust
  gene_clusters = []
  uni_hsp = []
  tag = ->(contig) { "#{contig.first_hit.name}\t#{contig.name}" }

  each_cluster do |clust|
    puts "\n********************CLUSTER*************************\n" if $verbose
    putative_ex = []
    trash_ex = []

    clust.each do |contig|
      if contig.mixed? || contig.is_truncated?
        trash_ex << [tag.call(contig)]
      elsif contig.is_one_hsp?
        trash_ex << [tag.call(contig)]
        uni_hsp << contig # single-hsp contigs are saved for later processing
      elsif contig.is_gapped?
        trash_ex << [tag.call(contig)]
      else
        putative_ex << contig
      end
    end

    if $verbose
      putative_ex.each do |contig|
        # Every hsp carries the same score, so it really belongs to the whole alignment.
        puts "#{contig.first_hit.name}\t#{contig.name}\t\t\tsc:#{contig.first_hit.first_hsp.score}"
      end
      puts ',,,,,,,,,,,,,REJECTED,,,,,,,,,,,,,'
      trash_ex.each { |entry| puts entry }
      puts "\n= = = = = = = = = =MAP= = = = = = = = = = = =\n"
      putative_ex.each(&:draw)
    end

    gene_clusters << putative_ex
  end

  @clusters = gene_clusters
  uni_hsp
end
generate_file_5_prime(file, fasta)
click to toggle source
# File lib/gene_assembler/dataset.rb, line 510
# Writes a report of the contigs whose first hsp starts within the first 10
# positions of the subject (s_beg <= 10).
#
# For each such contig a "gene<TAB>contig<TAB>q_beg" line goes to +file+,
# and the gene name followed by the contig sequence from index 0 up to and
# including q_beg goes to +fasta+.
#
# file  - path of the tab-separated report to create or overwrite.
# fasta - path of the sequence file to create or overwrite.
#
# Returns nil.
def generate_file_5_prime(file, fasta)
  # Block form closes both handles even if a cluster raises part-way through.
  File.open(file, 'w') do |prime5_file|
    File.open(fasta, 'w') do |fasta_file|
      each_cluster do |cluster|
        next if cluster.nil? || cluster.empty?

        gene_name = cluster.first.first_hit.name
        cluster.each do |contig|
          first_hsp = contig.first_hit.first_hsp
          next unless first_hsp.s_beg <= 10

          prime5_end = first_hsp.q_beg
          prime5_file.puts "#{gene_name}\t#{contig.name}\t#{prime5_end}"
          seq = contig.seq[0..prime5_end]
          # NOTE(review): the header is written without a leading ">", unlike
          # the fasta method -- confirm whether that is intended.
          fasta_file.puts "#{gene_name}\n#{seq}" unless seq.nil?
        end
      end
    end
  end
  nil
end
info_clusters()
click to toggle source
# File lib/gene_assembler/dataset.rb, line 144
# Prints information about @clusters when $verbose is set: first every
# cluster as a list of "first-hit name<TAB>contig name" lines, then a map
# section in which each contig draws itself (original note: a diagram of
# the alignment in aa). Prints nothing when $verbose is not set.
def info_clusters
  return unless $verbose

  each_cluster do |members|
    puts '............................'
    members.each { |member| puts "#{member.first_hit.name}\t#{member.name}" }
    puts "............................"
  end

  each_cluster do |members|
    puts "\n********************MAP*************************\n"
    members.each(&:draw)
  end

  puts "\n"
end
load_references(references_file)
click to toggle source
# File lib/gene_assembler/dataset.rb, line 274
# Loads every reference of +references_file+ into @references_hash as
# Contig objects.
#
# Each line holds a contig name, whitespace, and one or more gene models
# separated by "|". A model is a ";"-separated list of "start-end" exon
# ranges on the query. Subject coordinates are derived by accumulating
# (exon length / 3) and carrying the remainder to the next exon.
# Lines without a second field are ignored.
#
# references_file - path of the references file; a missing file yields an
#                   empty hash.
#
# Returns the Hash (contig name => Array of Contig models) also stored in
# @references_hash.
def load_references(references_file)
  hash = {}
  if File.exist?(references_file)
    # File.foreach closes the file when iteration ends.
    File.foreach(references_file) do |line|
      contig_name, models_field = line.split
      next if models_field.nil?

      all_models = []
      models_field.split('|').each do |structure|
        contig = Contig.new(contig_name)
        contig.add_hit(contig_name, 0, 1, :nt)
        s_end = 0
        nt_add = 0 # nucleotides left over from the previous exon
        structure.split(';').each do |hsp|
          coords = hsp.split('-')
          q_beg = coords[0].to_i
          q_end = coords[1].to_i
          s_beg = s_end + 1
          exon_length = q_end - q_beg + nt_add
          s_end += exon_length / 3
          nt_add = exon_length.modulo(3)
          contig.first_hit.add_hsp(q_beg, q_end, s_beg, s_end, 0, 0, 0, 0)
        end
        contig.length = contig.first_hit.last_hsp.q_end
        all_models << contig
      end
      hash[contig_name] = all_models
    end
  end
  @references_hash = hash
end
load_seq(hash)
click to toggle source
# File lib/gene_assembler/dataset.rb, line 189
# Loads sequences into the contigs of @contigs.
#
# hash - maps contig name to sequence. The stored sequence is upcased in
#        place with upcase!.
def load_seq(hash)
  each_contig do |contig|
    contig.seq = hash[contig.name]
    contig.seq.upcase!
  end
end
missing_cluster_transfer(dataset)
click to toggle source
# File lib/gene_assembler/dataset.rb, line 312
# Fills in missing clusters with clusters taken from +dataset+.
#
# When this dataset has no clusters, every cluster of +dataset+ is
# transferred and +dataset+ is cleared. Otherwise only the clusters of
# +dataset+ whose first-hit name matches none of this dataset's clusters
# are transferred (as shallow copies) and then removed from +dataset+.
#
# dataset - source dataset (original note: the single-hsp clusters).
def missing_cluster_transfer(dataset)
  if clusters_empty?
    dataset.each_cluster { |clust| transfer_cluster(clust) }
    dataset.clear_clusters
  else
    add = []
    delete = []
    dataset.each_cluster_with_index do |uni_cluster, ind|
      is_cluster = false
      # Check whether a cluster for the same gene already exists here.
      each_cluster do |cluster|
        if uni_cluster.first.first_hit.name == cluster.first.first_hit.name
          is_cluster = true
          break
        end
      end
      # No such cluster: schedule the single-hsp cluster for transfer.
      unless is_cluster
        add << uni_cluster
        delete << ind
      end
    end
    add.each { |clust| transfer_cluster(clust.dup) }
    # Delete from the highest index down so earlier deletions do not shift later ones.
    delete.sort!
    delete.reverse_each { |ind| dataset.delete_cluster_at(ind) }
  end
end
missing_contigs_transfer(dataset)
click to toggle source
# File lib/gene_assembler/dataset.rb, line 474
# Moves into this dataset's clusters the contigs of +dataset+ that belong to
# the same cluster (same first-hit name) but align with none of its contigs.
#
# A contig counts as aligned when compare against some contig of the
# cluster returns a position greater than -1. Moved contigs are appended to
# the matching cluster here and deleted from the cluster in +dataset+.
# Nil or empty clusters on either side are skipped.
#
# dataset - source dataset (original note: the single-hsp dataset).
def missing_contigs_transfer(dataset)
  each_cluster_with_index do |self_cluster, s|
    # An empty cluster has no first contig to take the gene name from.
    next if self_cluster.nil? || self_cluster.empty?

    dataset.each_cluster do |dataset_cluster|
      next if dataset_cluster.nil? || dataset_cluster.empty?
      # Only clusters of the same gene are considered.
      next unless self_cluster.first.first_hit.name == dataset_cluster.first.first_hit.name

      # Collect first and move afterwards, so neither array changes while it is scanned.
      unaligned = dataset_cluster.reject do |dataset_contig|
        self_cluster.any? do |self_contig|
          position, = dataset_contig.compare(self_contig)
          position > -1
        end
      end
      unaligned.each do |contig|
        transfer_contig_to_cluster(contig, s)
        dataset_cluster.delete(contig)
      end
    end
  end
end
multiple_align_contigs(array_contig_base,mod_contig_base=FALSE)
click to toggle source
# File lib/gene_assembler/dataset.rb, line 431
# Aligns the contigs of the dataset against each reference contig in turn
# and then applies one global shift so that no coordinate stays negative.
#
# array_contig_base - reference contigs; each one is passed to
#                     correct_left_side_contigs and align_contigs.
# mod_contig_base   - accepted for backward compatibility and currently
#                     unused. NOTE(review): it used to be forwarded to
#                     align_contigs, which takes a single argument, so the
#                     call raised ArgumentError; confirm the intended
#                     meaning of this flag.
#
# Returns the global shift applied (0 when no correction was needed).
def multiple_align_contigs(array_contig_base, mod_contig_base = false)
  correct = 0
  array_contig_base.each do |contig_base|
    local_correct = correct_left_side_contigs(contig_base)
    correct = local_correct if local_correct > correct
    align_contigs(contig_base)
  end

  # Correct the model by the general shift computed over all fragments,
  # on top of the local shift already applied by align_contigs.
  if correct > 0
    array_contig_base.each do |contig|
      contig.modified_coordenates(correct)
      contig.length += correct
    end
    each_contig do |contig|
      contig.modified_coordenates(correct)
      contig.length += correct
    end
  end
  correct
end
n_contigs?()
click to toggle source
# File lib/gene_assembler/dataset.rb, line 71
# Returns the number of contigs in @contigs (an Integer, despite the "?" suffix).
def n_contigs?
  @contigs.size
end
parse_stops()
click to toggle source
# File lib/gene_assembler/dataset.rb, line 202
# Calls stop_codon_search on every contig of the dataset.
def parse_stops
  each_contig do |contig|
    contig.stop_codon_search
  end
end
rev_comp()
click to toggle source
# File lib/gene_assembler/dataset.rb, line 196
# Calls rev_comp_if_hit on every contig of the dataset (original note:
# produces the reverse-complement sequence in @contigs and @uni_hsp).
def rev_comp
  each_contig do |contig|
    contig.rev_comp_if_hit
  end
end
score_correction(factor)
click to toggle source
# File lib/gene_assembler/dataset.rb, line 348
# Adds (number of introns * factor) to the score of every hsp of each
# contig's first hit.
#
# factor - multiplier applied to the contig's n_intron value.
def score_correction(factor)
  each_contig do |contig|
    introns = contig.n_intron
    contig.first_hit.each_hsp do |hsp|
      hsp.score += factor * introns
    end
  end
end
sort_cluster(cluster)
click to toggle source
# File lib/gene_assembler/dataset.rb, line 270
# Sorts the contigs of +cluster+ in place by their position on the subject,
# i.e. by the s_beg of the first hsp of their first hit.
#
# Returns the same (now sorted) array.
def sort_cluster(cluster)
  cluster.sort! do |left, right|
    left.first_hit.first_hsp.s_beg <=> right.first_hit.first_hsp.s_beg
  end
end
sort_cont_clust()
click to toggle source
# File lib/gene_assembler/dataset.rb, line 263
# Sorts the contigs inside every cluster of @clusters in ascending order of
# their first hsp, by handing each cluster to sort_cluster.
def sort_cont_clust
  each_cluster do |members|
    sort_cluster(members)
  end
  # Disabled in the original: @clusters=sort_clusters(@clusters)
end
transfer_cluster(cluster)
click to toggle source
# File lib/gene_assembler/dataset.rb, line 34
# Appends +cluster+ (the same object, not a copy) to @clusters.
# Returns @clusters.
def transfer_cluster(cluster)
  @clusters.push(cluster)
end
transfer_contig_to_cluster(contig,n_cluster)
click to toggle source
# File lib/gene_assembler/dataset.rb, line 506
# Appends +contig+ to the cluster stored at index +n_cluster+ of @clusters.
# Returns that cluster.
def transfer_contig_to_cluster(contig, n_cluster)
  @clusters[n_cluster].push(contig)
end
transfer_contigs(add_contigs,limit=0)
click to toggle source
# File lib/gene_assembler/dataset.rb, line 18
# Adds contigs to @contigs.
#
# add_contigs - a contig or an array of contigs.
# limit       - 0 (default) adds everything and flattens @contigs;
#               otherwise at most +limit+ leading elements of the array
#               are added.
#
# NOTE(review): with a non-zero limit a non-array argument is ignored, as in
# the original -- confirm whether a single contig should be added in that case.
def transfer_contigs(add_contigs, limit = 0)
  if limit == 0
    @contigs << add_contigs
    @contigs.flatten!
  elsif add_contigs.is_a?(Array) # is_a? also accepts Array subclasses
    add_contigs.each_with_index do |contig, i|
      break if i == limit

      @contigs << contig
    end
  end
end
transfer_n_contigs_def_hit_type(dataset,cluster,new_hit_type,limit)
click to toggle source
# File lib/gene_assembler/dataset.rb, line 457
# Looks in +dataset+ for the clusters of the same gene as +cluster+ (same
# first-hit name), marks the first hit of each of their contigs as
# 'pseudogene' and hands them to transfer_contigs with +limit+.
#
# dataset      - dataset whose clusters are searched (original note: the
#                single-hsp clusters).
# cluster      - cluster being worked on; nothing happens when it is nil
#                or empty.
# new_hit_type - NOTE(review): currently unused; the type is hard-coded to
#                'pseudogene' -- confirm whether it should be this value.
# limit        - forwarded to transfer_contigs.
def transfer_n_contigs_def_hit_type(dataset, cluster, new_hit_type, limit)
  # Test nil before empty?, and require both: an empty cluster has no first contig.
  return if cluster.nil? || cluster.empty?

  dataset.each_cluster do |dat_cluster|
    next if dat_cluster.nil? || dat_cluster.empty?
    # Find the cluster that belongs to the gene being worked on.
    next unless dat_cluster.first.first_hit.name == cluster.first.first_hit.name

    dat_cluster.each { |contig| contig.first_hit.type = 'pseudogene' }
    transfer_contigs(dat_cluster, limit)
  end
end