class Dataset

Attributes

clusters[RW]
contigs[RW]
references_hash[RW]
type[RW]

Public Class Methods

new(type) click to toggle source
# File lib/gene_assembler/dataset.rb, line 5
# Builds an empty dataset: no contigs, no clusters, and an empty string as
# the initial value of @references_hash.
#
# type - value stored as-is in @type. The original author's note says it is
#        defined but not used.
def initialize(type)
  @contigs = []
  @clusters = []
  @type = type
  @references_hash = ''
end

Public Instance Methods

add_contig(name) click to toggle source
# File lib/gene_assembler/dataset.rb, line 12
# Creates a Contig named +name+, appends it to @contigs and returns the new
# contig so the caller can keep filling it in.
def add_contig(name)
  Contig.new(name).tap { |contig| @contigs << contig }
end
align_contigs(contig_base) click to toggle source
# File lib/gene_assembler/dataset.rb, line 388
# Shifts the coordinates of the contigs so that they line up, either with the
# preceding contig or with a reference contig.
#
# contig_base - reference contig to align every contig against, or nil to
#               chain each contig onto the one before it.
#
# Without a reference the first contig is not compared with anything and the
# offset accumulates from contig to contig: a contig that does not overlap
# its predecessor is pushed past it by the predecessor's length, otherwise
# the offset grows by whatever coord_prot returns for the two overlapping
# hsps. With a reference the offset is reset after every contig, and contigs
# that do not overlap the reference are left untouched.
#
# As used here, Contig#compare returns a pair whose first element is the
# index of the overlapping hsp, or -1 when there is no overlap. coord_prot is
# defined outside this method.
#
# The literals true/false replace the TRUE/FALSE constants, which were
# deprecated in Ruby 2.4 and no longer exist in Ruby 3.2+.
def align_contigs(contig_base)
  use_reference = !contig_base.nil?
  # With a reference even the first contig (index 0) has to be compared.
  limit = use_reference ? -1 : 0
  last_contig = contig_base
  add = 0
  align = true
  each_contig_with_index do |contig, i|
    if i > limit
      # Offset of this contig relative to the previous one / the reference.
      overlap_exon_with_last, = contig.compare(last_contig)
      if overlap_exon_with_last == -1
        if use_reference
          align = false
        else
          add += last_contig.length
        end
      else
        overlap_exon_current, = last_contig.compare(contig)
        add += coord_prot(last_contig.hsp_at(overlap_exon_with_last), contig.hsp_at(overlap_exon_current))
        align = true if use_reference
      end
    end

    # Move the contig when there is no reference, or when it aligned with it.
    if align || !use_reference
      contig.modified_coordenates(add)
      contig.length += add
    end

    if use_reference
      last_contig = contig_base
      add = 0 # each contig is placed independently against the reference
    else
      last_contig = contig
    end
  end
end
attrib_recover(dataset) click to toggle source
# File lib/gene_assembler/dataset.rb, line 103
# Copies attributes from the contigs of +dataset+ onto the contigs of this
# dataset that have the same name. Original author's note: the exonerate
# dataset loses these attributes along the way, so they are recovered from
# the blast dataset.
#
# For every contig of self, the first contig of +dataset+ with the same name
# supplies length and seq, and the s_length of its first hit is written to
# every hit of the contig of self.
def attrib_recover(dataset)
  each_contig do |own|
    dataset.each_contig do |donor|
      next unless own.name == donor.name

      own.length = donor.length
      own.seq = donor.seq
      own.each_hit do |hit|
        hit.s_length = donor.first_hit.s_length
      end
      break # only the first contig with a matching name is used
    end
  end
end
clear_clusters() click to toggle source
# File lib/gene_assembler/dataset.rb, line 344
# Drops every cluster by pointing @clusters at a brand-new empty array (the
# previous array object is not modified).
def clear_clusters
  @clusters = Array.new
end
clr_contigs() click to toggle source
# File lib/gene_assembler/dataset.rb, line 76
# Empties the dataset's contig list by replacing @contigs with a new empty
# array (the previous array object is not modified).
def clr_contigs
  @contigs = Array.new
end
cluster_count() click to toggle source
# File lib/gene_assembler/dataset.rb, line 98
# Returns the number of clusters currently held in @clusters.
def cluster_count
  @clusters.size
end
clustering() click to toggle source
# File lib/gene_assembler/dataset.rb, line 124
# Groups the contigs by the name of their first hit and appends one array
# per distinct name to @clusters.
#
# Clusters appear in the order in which each name is first seen, and contigs
# keep their dataset order inside a cluster. Existing entries of @clusters
# are kept; the new clusters are added after them.
#
# A single pass with a name => cluster lookup replaces the previous nested
# scan over all contigs for every contig.
def clustering
  cluster_for = {}
  each_contig do |contig|
    name = contig.first_hit.name
    unless cluster_for.key?(name)
      cluster_for[name] = []
      @clusters << cluster_for[name]
    end
    cluster_for[name] << contig
  end
end
clusters_empty?() click to toggle source
# File lib/gene_assembler/dataset.rb, line 80
# Returns true when the dataset holds no clusters at all, false otherwise.
#
# Uses Array#empty? instead of the previous counter loop, which relied on the
# TRUE/FALSE constants (deprecated in Ruby 2.4, absent from Ruby 3.2+).
def clusters_empty?
  @clusters.empty?
end
contig_count() click to toggle source
# File lib/gene_assembler/dataset.rb, line 93
# Returns how many contigs are stored in @contigs.
def contig_count
  @contigs.size
end
correct_hsp_contigs(blast_coor_type) click to toggle source
# File lib/gene_assembler/dataset.rb, line 118
# Calls correct_hsps(blast_coor_type) on every contig of the dataset.
#
# blast_coor_type - forwarded unchanged to Contig#correct_hsps.
def correct_hsp_contigs(blast_coor_type)
  each_contig do |item|
    item.correct_hsps(blast_coor_type)
  end
end
correct_left_side_contigs(contig_base) click to toggle source
# File lib/gene_assembler/dataset.rb, line 357
# Computes the offset needed to get rid of negative coordinates (original
# author's note: negative indices in the gff).
#
# contig_base - reference contig every contig is compared with, or nil to
#               compare each contig with the one before it (the first contig
#               is then not compared with anything).
#
# For each overlapping pair (Contig#compare gives an index greater than -1)
# the displacement returned by coord_prot is examined and the smallest one
# below zero is remembered. Returns that minimum with its sign flipped, i.e.
# 0 when no displacement was negative.
def correct_left_side_contigs(contig_base)
  has_reference = !contig_base.nil?
  previous = has_reference ? contig_base : nil
  first_compared = has_reference ? 0 : 1
  lowest = 0
  each_contig_with_index do |contig, index|
    if index >= first_compared
      hit_on_previous, _exons = contig.compare(previous)
      if hit_on_previous > -1
        hit_on_current, _exons = previous.compare(contig)
        shift = coord_prot(previous.hsp_at(hit_on_previous), contig.hsp_at(hit_on_current))
        lowest = shift if shift < lowest
      end
    end
    previous = has_reference ? contig_base : contig
  end
  -lowest
end
delete_cluster(cluster) click to toggle source
# File lib/gene_assembler/dataset.rb, line 38
# Removes from @clusters every entry equal to +cluster+ (Array#delete) and
# returns what Array#delete returns: the deleted item, or nil if none matched.
def delete_cluster(cluster)
  @clusters.delete(cluster)
end
delete_cluster_at(ind) click to toggle source
# File lib/gene_assembler/dataset.rb, line 42
# Removes the cluster stored at position +ind+ of @clusters and returns it
# (nil when the index is out of range, as Array#delete_at does).
def delete_cluster_at(ind)
  @clusters.delete_at(ind)
end
each_cluster() { |cluster| ... } click to toggle source
# File lib/gene_assembler/dataset.rb, line 59
# Iterator over the clusters: yields each element of @clusters in order.
def each_cluster
  @clusters.each { |group| yield group }
end
each_cluster_with_index() { |cluster,i| ... } click to toggle source
# File lib/gene_assembler/dataset.rb, line 65
# Iterator over the clusters that also yields the position of each cluster
# inside @clusters (block receives cluster, index).
def each_cluster_with_index
  @clusters.each_with_index { |group, position| yield group, position }
end
each_contig() { |contig| ... } click to toggle source

Iterator: yields each contig stored in the dataset.

# File lib/gene_assembler/dataset.rb, line 47
# Iterator over the contigs: yields each element of @contigs in order.
def each_contig
  @contigs.each { |item| yield item }
end
each_contig_with_index() { |contig,i| ... } click to toggle source
# File lib/gene_assembler/dataset.rb, line 53
# Iterator over the contigs that also yields the position of each contig
# inside @contigs (block receives contig, index).
def each_contig_with_index
  @contigs.each_with_index { |item, position| yield item, position }
end
fasta(fasta_file) click to toggle source
# File lib/gene_assembler/dataset.rb, line 208
# Writes the contigs of the dataset to +fasta_file+ in FASTA layout: a
# ">name" header line followed by the contig's seq.
#
# fasta_file - path of the file to create (an existing file is overwritten).
#
# The block form of File.open guarantees the handle is closed even if a
# contig raises while being written. Returns nil.
def fasta(fasta_file)
  File.open(fasta_file, 'w') do |out|
    each_contig do |contig|
      out.print ">#{contig.name}\n"
      out.puts contig.seq
    end
  end
  nil
end
filtering() click to toggle source
# File lib/gene_assembler/dataset.rb, line 164
# Runs the battery of filters over the contigs.
#
# A contig is dropped when, checked in this order, it is mixed, has a single
# hsp, is gapped, is truncated, or hsp_minor_than?(15) is true (15 is in
# nucleotides according to the original note). Single-hsp contigs are not
# lost: they are collected and returned. Whatever survives replaces
# @contigs; with $verbose set each survivor is printed as
# "<first hit name>\t<contig name>".
#
# Returns the array of single-hsp contigs.
def filtering
  kept = []
  single_hsp = []
  each_contig do |contig|
    next if contig.mixed?

    if contig.is_one_hsp?
      single_hsp << contig # set aside for later processing
      next
    end
    next if contig.is_gapped? || contig.is_truncated? || contig.hsp_minor_than?(15)

    kept << contig
    puts "#{contig.first_hit.name}\t#{contig.name}" if $verbose
  end
  @contigs = kept
  return single_hsp
end
filtering_clust() click to toggle source
# File lib/gene_assembler/dataset.rb, line 217
# Runs the battery of filters over the contigs of every cluster and, with
# $verbose set, prints a report per cluster.
#
# A contig is rejected when, checked in this order, it is mixed, truncated,
# has a single hsp, or is gapped. Single-hsp contigs are additionally
# collected for later processing. Each cluster is replaced by the array of
# its accepted contigs (which may be empty) and @clusters is rebuilt from
# those arrays.
#
# Returns the array of single-hsp contigs found in all clusters.
def filtering_clust
  surviving = []
  single_hsp = []
  each_cluster do |members|
    puts "\n********************CLUSTER*************************\n" if $verbose
    accepted = []
    rejected = []
    members.each do |contig|
      discard =
        if contig.mixed? || contig.is_truncated?
          true
        elsif contig.is_one_hsp?
          single_hsp << contig
          true
        else
          contig.is_gapped?
        end
      if discard
        rejected << ["#{contig.first_hit.name}\t#{contig.name}"]
      else
        accepted << contig
      end
    end
    if $verbose
      accepted.each do |contig|
        # Every hsp carries the same score, so it really describes the whole alignment.
        puts "#{contig.first_hit.name}\t#{contig.name}\t\t\tsc:#{contig.first_hit.first_hsp.score}"
      end
      puts ',,,,,,,,,,,,,REJECTED,,,,,,,,,,,,,'
      rejected.each { |entry| puts entry }
      puts "\n= = = = = = = = = =MAP= = = = = = = = = = = =\n"
      accepted.each { |contig| contig.draw }
    end
    surviving << accepted
  end
  @clusters = surviving
  return single_hsp
end
generate_file_5_prime(file, fasta) click to toggle source
# File lib/gene_assembler/dataset.rb, line 510
# Writes a report of the contigs whose first hsp starts within the first 10
# positions of the subject (s_beg <= 10), together with the leading part of
# their sequence.
#
# file  - path of the tab-separated report; one line per selected contig:
#         "<gene name>\t<contig name>\t<q_beg of the first hsp>".
# fasta - path of the sequence file; for each selected contig the gene name
#         on one line and seq[0..q_beg] on the next.
#
# The gene name is taken from the first hit of the first contig of each
# cluster. nil and empty clusters are skipped. Both files are always created
# and, thanks to the block form of File.open, always closed, even when an
# exception interrupts the writing. Returns nil.
#
# NOTE(review): the header line written to +fasta+ has no leading ">", unlike
# the one produced by #fasta; confirm whether downstream tools expect that.
def generate_file_5_prime(file, fasta)
  File.open(file, 'w') do |prime5_file|
    File.open(fasta, 'w') do |fasta_file|
      each_cluster do |cluster|
        next if cluster.nil? || cluster.empty?

        gene_name = cluster.first.first_hit.name
        cluster.each do |contig|
          first_hsp = contig.first_hit.first_hsp
          next unless first_hsp.s_beg <= 10

          prime5_end = first_hsp.q_beg
          prime5_file.puts "#{gene_name}\t#{contig.name}\t#{prime5_end}"
          seq = contig.seq[0..prime5_end]
          fasta_file.puts "#{gene_name}\n#{seq}" unless seq.nil?
        end
      end
    end
  end
  nil
end
info_clusters() click to toggle source
# File lib/gene_assembler/dataset.rb, line 144
# Prints information about @clusters when $verbose is set; does nothing
# otherwise.
#
# First, for every cluster, a dotted frame containing one
# "<first hit name>\t<contig name>" line per contig; then, for every cluster,
# a MAP banner followed by each contig's own drawing (Contig#draw).
def info_clusters
  return unless $verbose

  each_cluster do |members|
    puts '............................'
    members.each do |member|
      puts "#{member.first_hit.name}\t#{member.name}"
    end
    puts "............................"
  end

  each_cluster do |members|
    puts "\n********************MAP*************************\n"
    members.each { |member| member.draw }
  end
  puts "\n"
end
load_references(references_file) click to toggle source
# File lib/gene_assembler/dataset.rb, line 274
# Loads the reference gene models from +references_file+ into
# @references_hash as Contig objects.
#
# Each line is split on whitespace: the first field is the contig name, the
# second a list of models separated by "|". A model is a list of
# "q_beg-q_end" ranges separated by ";". Lines without a second field are
# ignored. For every model a Contig with a single hit is built and one hsp is
# added per range; subject coordinates are derived by accumulating
# (range length / 3), carrying the remainder over to the next range. The
# contig length is set to the q_end of its last hsp.
#
# Result: @references_hash maps contig name => array of model contigs. A
# missing file yields an empty hash. Returns the hash.
#
# Changes with respect to the previous version: File.exist? replaces
# File.exists? (removed in Ruby 3.2), File.foreach closes the file once it
# has been read, and an unreachable nil check on the split result is gone.
#
# NOTE(review): the range length is computed as q_end - q_beg (no +1);
# confirm that this matches the coordinate convention of the input file.
def load_references(references_file)
  hash = {}
  if File.exist?(references_file)
    File.foreach(references_file) do |line|
      fields = line.split
      contig_name = fields[0]
      next if fields[1].nil?

      hash[contig_name] = fields[1].split('|').map do |structure|
        contig = Contig.new(contig_name)
        contig.add_hit(contig_name, 0, 1, :nt)
        s_end = 0
        nt_add = 0 # nucleotides left over from the previous range
        structure.split(';').each do |hsp|
          coords = hsp.split('-')
          q_beg = coords[0].to_i
          q_end = coords[1].to_i
          s_beg = s_end + 1
          exon_length = q_end - q_beg + nt_add
          s_end += exon_length / 3
          nt_add = exon_length.modulo(3)
          contig.first_hit.add_hsp(q_beg, q_end, s_beg, s_end, 0, 0, 0, 0)
        end
        contig.length = contig.first_hit.last_hsp.q_end
        contig
      end
    end
  end
  @references_hash = hash
end
load_seq(hash) click to toggle source
# File lib/gene_assembler/dataset.rb, line 189
# Loads the sequences into the contigs.
#
# hash - lookup indexed by contig name whose values are the sequences.
#
# Each contig receives hash[contig.name] as its seq, and that sequence is
# then upcased in place (String#upcase!).
def load_seq(hash)
  each_contig do |contig|
    sequence = hash[contig.name]
    contig.seq = sequence
    contig.seq.upcase!
  end
end
missing_cluster_transfer(dataset) click to toggle source
# File lib/gene_assembler/dataset.rb, line 312
# Fills in the clusters this dataset is missing with clusters taken from
# +dataset+.
#
# * When self has no clusters at all, every cluster of +dataset+ is
#   transferred and +dataset+ is emptied with clear_clusters.
# * Otherwise a cluster of +dataset+ is transferred (as a shallow copy) only
#   when no cluster of self has the same gene, i.e. the same name on the
#   first hit of its first contig. Transferred clusters are then removed from
#   +dataset+, highest index first so the remaining indices stay valid.
#
# The literals true/false replace the TRUE/FALSE constants, which were
# deprecated in Ruby 2.4 and no longer exist in Ruby 3.2+.
def missing_cluster_transfer(dataset)
  if clusters_empty?
    dataset.each_cluster { |clust| transfer_cluster(clust) }
    dataset.clear_clusters
  else
    add = []
    delete = []
    dataset.each_cluster_with_index do |uni_cluster, ind|
      is_cluster = false
      each_cluster do |cluster| # is this gene already present in self?
        if uni_cluster.first.first_hit.name == cluster.first.first_hit.name
          is_cluster = true
          break
        end
      end
      next if is_cluster

      add << uni_cluster
      delete << ind
    end
    add.each { |clust| transfer_cluster(clust.dup) }
    delete.sort.reverse_each { |ind| dataset.delete_cluster_at(ind) }
  end
end
missing_contigs_transfer(dataset) click to toggle source
# File lib/gene_assembler/dataset.rb, line 474
# Moves into the clusters of self those contigs of +dataset+ (the single-hsp
# dataset, per the original note) that belong to the same gene but do not
# align with any contig already in the cluster.
#
# Two clusters are the same gene when the first hit of their first contig
# has the same name. A contig of +dataset+ "aligns" when Contig#compare
# against some contig of the cluster of self gives a position greater than
# -1. Contigs that do not align are appended to the cluster of self and
# deleted from the cluster of +dataset+.
#
# nil or empty clusters are skipped on both sides; previously an empty
# cluster of self (filtering_clust can leave such clusters behind) raised
# NoMethodError. true/false literals replace the TRUE/FALSE constants, which
# were deprecated in Ruby 2.4 and no longer exist in Ruby 3.2+.
def missing_contigs_transfer(dataset)
  each_cluster_with_index do |self_cluster, s|
    next if self_cluster.nil? || self_cluster.empty?

    dataset.each_cluster do |dataset_cluster|
      next if dataset_cluster.nil? || dataset_cluster.empty?
      next unless self_cluster.first.first_hit.name == dataset_cluster.first.first_hit.name

      # Collect first, move afterwards: neither array changes while scanned.
      unaligned = dataset_cluster.reject do |dataset_contig|
        self_cluster.any? do |self_contig|
          position, = dataset_contig.compare(self_contig)
          position > -1
        end
      end
      unaligned.each do |contig|
        transfer_contig_to_cluster(contig, s)
        dataset_cluster.delete(contig)
      end
    end
  end
end
multiple_align_contigs(array_contig_base,mod_contig_base=FALSE) click to toggle source
# File lib/gene_assembler/dataset.rb, line 431
# Aligns the contigs of the dataset against each reference contig in turn and
# then applies one global shift so that no coordinate ends up negative.
#
# array_contig_base - array of reference contigs.
# mod_contig_base   - accepted for backward compatibility and currently
#                     ignored (see the note below).
#
# For every reference the local correction from correct_left_side_contigs is
# computed, the largest one is remembered, and the contigs are aligned with
# align_contigs. If the largest correction is positive, every reference and
# every contig of the dataset is shifted by it (coordinates and length).
#
# Returns the correction that was applied (0 when none was needed).
#
# NOTE(review): this method used to call align_contigs(contig_base,
# mod_contig_base), but align_contigs takes a single argument, so every call
# raised ArgumentError. The extra argument is no longer passed; confirm
# whether mod_contig_base was meant to change how the reference is handled.
# The default is now the literal false: the FALSE constant was deprecated in
# Ruby 2.4 and no longer exists in Ruby 3.2+.
def multiple_align_contigs(array_contig_base, mod_contig_base=false)
  correct = 0
  array_contig_base.each do |contig_base|
    local_correct = correct_left_side_contigs(contig_base)
    correct = local_correct if local_correct > correct
    align_contigs(contig_base)
  end

  # Apply the global displacement to the references and to the dataset.
  if correct > 0
    array_contig_base.each do |contig|
      contig.modified_coordenates(correct)
      contig.length += correct
    end
    each_contig do |contig|
      contig.modified_coordenates(correct)
      contig.length += correct
    end
  end

  correct
end
n_contigs?() click to toggle source
# File lib/gene_assembler/dataset.rb, line 71
# Returns the number of contigs in the dataset. Despite the trailing "?",
# the result is an Integer, not a boolean.
def n_contigs?
  @contigs.size
end
parse_stops() click to toggle source
# File lib/gene_assembler/dataset.rb, line 202
# Calls stop_codon_search on every contig of the dataset.
def parse_stops
  each_contig do |item|
    item.stop_codon_search
  end
end
rev_comp() click to toggle source
# File lib/gene_assembler/dataset.rb, line 196
# Calls rev_comp_if_hit on every contig of the dataset (original note: builds
# the reverse-complement sequence of the contigs).
def rev_comp
  each_contig do |item|
    item.rev_comp_if_hit
  end
end
score_correction(factor) click to toggle source
# File lib/gene_assembler/dataset.rb, line 348
# Adds (number of introns * factor) to the score of every hsp of the first
# hit of each contig.
#
# factor - multiplier applied to the contig's n_intron.
def score_correction(factor)
  each_contig do |contig|
    introns = contig.n_intron
    contig.first_hit.each_hsp do |hsp|
      hsp.score = hsp.score + factor * introns
    end
  end
end
sort_cluster(cluster) click to toggle source
# File lib/gene_assembler/dataset.rb, line 270
# Sorts +cluster+ in place by position on the subject, i.e. by the s_beg of
# the first hsp of each contig's first hit, ascending. Returns the cluster.
def sort_cluster(cluster)
  cluster.sort! do |left, right|
    left.first_hit.first_hsp.s_beg <=> right.first_hit.first_hsp.s_beg
  end
end
sort_cont_clust() click to toggle source
# File lib/gene_assembler/dataset.rb, line 263
# Sorts the contigs inside every cluster of @clusters with sort_cluster
# (ascending by the start of their first hsp). The order of the clusters
# themselves is left alone; a sort_clusters call for that was commented out
# in the original.
def sort_cont_clust
  each_cluster do |members|
    sort_cluster(members)
  end
end
transfer_cluster(cluster) click to toggle source
# File lib/gene_assembler/dataset.rb, line 34
# Appends +cluster+ (the very same object, not a copy) to @clusters and
# returns @clusters.
def transfer_cluster(cluster)
  @clusters.push(cluster)
end
transfer_contig_to_cluster(contig,n_cluster) click to toggle source
# File lib/gene_assembler/dataset.rb, line 506
# Appends +contig+ to the cluster stored at index +n_cluster+ of @clusters
# and returns that cluster.
def transfer_contig_to_cluster(contig,n_cluster)
  target = @clusters[n_cluster]
  target << contig
end
transfer_contigs(add_contigs,limit=0) click to toggle source
# File lib/gene_assembler/dataset.rb, line 18
# Adds contigs to @contigs.
#
# add_contigs - a contig or an array of contigs.
# limit       - 0 (default) adds everything; otherwise at most +limit+
#               contigs are taken from the front of the array.
#
# With limit == 0 the argument is appended and @contigs is flattened, so a
# single contig, an array and nested arrays are all accepted. With any other
# limit only arrays are processed; the array check now uses is_a?, so
# subclasses of Array are accepted as well (the previous comparison of the
# class name with the string 'Array' rejected them).
#
# NOTE(review): a single, non-array contig combined with a non-zero limit is
# silently ignored, as before; confirm that this is intended.
def transfer_contigs(add_contigs,limit=0)
  if limit == 0
    @contigs << add_contigs
    @contigs.flatten!
  elsif add_contigs.is_a?(Array)
    add_contigs.each_with_index do |contig, i|
      break if i == limit

      @contigs << contig
    end
  end
end
transfer_n_contigs_def_hit_type(dataset,cluster,new_hit_type,limit) click to toggle source
# File lib/gene_assembler/dataset.rb, line 457
# Looks in +dataset+ for the cluster of the same gene as +cluster+, tags the
# first hit of every contig in it as 'pseudogene' and transfers up to +limit+
# of its contigs into this dataset through transfer_contigs.
#
# dataset      - dataset whose clusters are searched (the single-hsp
#                clusters, per the original note).
# cluster      - cluster of the gene being worked on; the gene is identified
#                by the name of the first hit of its first contig.
# new_hit_type - currently unused (see the note below).
# limit        - forwarded to transfer_contigs.
#
# Nothing happens when +cluster+ is nil or empty, and nil or empty clusters
# of +dataset+ are skipped. The previous guards tested empty? before nil? and
# joined the outer pair with ||, so a nil cluster raised NoMethodError and an
# empty one got past the guard and failed on cluster.first.
#
# NOTE(review): the hit type is hard-coded to 'pseudogene' and new_hit_type
# is ignored; this looks unintended, but it is left as it was because callers
# may rely on it. Confirm before wiring the parameter in.
def transfer_n_contigs_def_hit_type(dataset,cluster,new_hit_type,limit)
  return if cluster.nil? || cluster.empty?

  gene_name = cluster.first.first_hit.name
  dataset.each_cluster do |dat_cluster|
    next if dat_cluster.nil? || dat_cluster.empty?
    next unless dat_cluster.first.first_hit.name == gene_name

    dat_cluster.each do |contig|
      contig.first_hit.type = 'pseudogene'
    end
    transfer_contigs(dat_cluster, limit)
  end
end