class Rebuild

Public Class Methods

new(dataset,dataset_uni_hsp,path) click to toggle source
# File lib/gene_assembler/rebuild.rb, line 7
def initialize(dataset,dataset_uni_hsp,path) #La clase ha de recibr objetos dataset
        @dataset=dataset
        @dataset_uni_hsp=dataset_uni_hsp
        @path=path
        @db_seqs=fasta_hash(path[:exonerate_db])
end

Public Instance Methods

add_uni_hsp(model,cluster) click to toggle source
# File lib/gene_assembler/rebuild.rb, line 248
def add_uni_hsp(model,cluster)#Compara contigs uni-hsp con contig modelo para determinar pseudogenes
  contigs_uni_hsp=''
  is_contig=0
  pseudogenes=[]
  @clusters_uni_hsp.each do |contigs|
    if contigs.first.first_hit.name==cluster.first.first_hit.name
      contigs_uni_hsp=contigs
      is_contig=1
      break
    end
  end
  
  if is_contig==1 #Si se ha encontrado contigs uni-hsp se realiza la comparacion
    if model.class.to_s!='Array'
      model=[model]
    end
    model.each do |item|
      contigs_uni_hsp.each do |contig|
        start,exons=item.compare(contig)
        if exons>1 && !pseudogenes.include?(contig)
          pseudogenes << contig
        end
      end
    end
  end
      return pseudogenes
end
array_contigs_to_contig(array_contigs) click to toggle source
# File lib/gene_assembler/rebuild.rb, line 852
def array_contigs_to_contig(array_contigs)
  contig=Contig.new(array_contigs.first.name)
  array_contigs.each do |cn|
      contig.transfer_contig_hits(cn)
  end
  contig.length=array_contigs.last.length
  return contig
end
build_gene_array(contigs,reference=nil) click to toggle source
# File lib/gene_assembler/rebuild.rb, line 276
def build_gene_array(contigs,reference=nil) #GEnera un array que representa la posicion relativa de todos los contigs entre si a nivel de los exones y de intrones
        gene_array=[]
        gene_array_introns=[]
        last_contig=''
        if !reference.nil?
                last_contig=reference
        end
        contigs.each do |contig|
                        array_contig=[]
                        array_contig_introns=[]
                        n_exon=contig.first_hit.hsp_count #Contamos cantidad de hsps en el contig
                        #Determinar posiciones vacias
                        if !gene_array.empty?||reference 
                                first_exon,ex=contig.compare(last_contig) #Comparamos el contig actual con el que se ha estudiado en la iteracion anterior
                                if reference && first_exon==-1 # Abortar alineamiento cuando un contig no coincide con la referencia
                                  if $verbose
                                    puts "\n#{contig.name} alignment step OUT OF RANGE"
                                  end
                                  gene_array=[]
                                  gene_array_introns=[]
                                  break
                                end
                                if first_exon==-1
                                        gene_array.last.count.times do #Posiciones vacias cuando NO hay overlapping
                                                array_contig << 0 # Marca ausencia de exon para esa posicion
                                                array_contig_introns << 0 # Marca ausencia de intron para esa posicion
                                        end
                                else
                                        if reference # ASignamiento de la posicion del contig respecto a la referencia
                                                void_positions=first_exon
                                        else
                                                void_positions=first_exon+gene_array.last.count(0)
                                        end
                                        void_positions.times do #Posiciones vacias cuando HAY overlapping
                                                array_contig << 0
                                                array_contig_introns << 0
                                        end
                                end
                        end
                        #Agregar exones e intrones del contig
                        exones=contig.exones_s
                        introns=contig.intrones_q
                        array_contig << exones # Marca presencia de exon para esa posicion
                        array_contig_introns << introns # Marca presencia de exon para esa posicion
                        gene_array << array_contig.flatten!
                        gene_array_introns << array_contig_introns.flatten!
                        if reference.nil?
                                last_contig=contig
                        end
        end
        return gene_array, gene_array_introns
end
contig_compact(contigs) click to toggle source
# File lib/gene_assembler/rebuild.rb, line 430
  def contig_compact(contigs) # Toma un conjunto de contigs, busca los q son correlativos, los fusiona, pasa por el exonerate y devuelve un array con los nuevos contig
          cn_def=[]
          cn_backup=contigs.dup
          #Determinar contigs a fusionar
          cn_to_merge=[]
          s_end=nil
          last_position_ref=nil
          position_overlap=nil
          last_contig=nil
          fusion=[]
          contigs.length.times do
                  fusion << FALSE
          end
          #Marcaje de contigs correlativos no solapantes
          contigs.each_with_index do |contig,i|
                  if i>0                       
        diference=contig.first_hit.first_hsp.s_beg-s_end
                          if diference==0 || diference==1
                                  fusion[i]=TRUE
                          end
                  end
                  s_end=contig.first_hit.last_hsp.s_end
          end
          
          if fusion.include?(TRUE)
                  
          #Construccion array contigs a fusionar y guardado de los solapantes
          fusion_contigs=[]
          count=0     # Marca la posicion de las fusiones
          fusion.each_with_index do |cont,i|
                  if cont
                          if !fusion_contigs.include?(contigs[i-1])
                                  fusion_contigs << contigs[i-1]
                          end
                          if !fusion_contigs.include?(contigs[i])
                                  fusion_contigs << contigs[i]
                          end
                  else
                          if !fusion_contigs.empty?#Marcar fusiones
                                  cn_to_merge << fusion_contigs
                                  fusion_contigs=[]
                                  cn_def << count
                                  count+=1
                          end
                          if !fusion[i+1]||fusion[i+1].nil?#Guardar contigs que no participan en las fusiones
                                  cn_def << contigs[i]
                          end
                  end
                  if i+1==fusion.length && !fusion_contigs.empty? #Control fin de bucle
                          cn_to_merge << fusion_contigs
                          cn_def << count
                          count+=1
                  end
          end
          
          #Generar fasta de los contig fusionados
          contigs_merge=contigs_seq_merge(cn_to_merge)
          if !contigs_merge.empty?
          temp=File.open(File.join(@path[:local],contigs.first.first_hit.name+'.fasta'),'w')
          contigs_merge.each_with_index do |seq,i|
                  temp.puts ">Fusion_#{i}\n#{seq}"
          end
          temp.close
  
          temp_db=File.open(File.join(@path[:local],contigs.first.first_hit.name+'.db'),'w')
          temp_db.puts ">#{contigs.first.first_hit.name}\n#{@db_seqs[contigs.first.first_hit.name]}"
          temp_db.close
          
end
          
          #Exonerating
          cmd="exonerate -q #{File.join(@path[:local],contigs.first.first_hit.name+'.db')} -t #{File.join(@path[:local],contigs.first.first_hit.name+'.fasta')} -Q protein -T dna -m protein2genome --percent 1 --showalignment 0 --useaatla 1 --showvulgar > #{File.join(@path[:local],contigs.first.first_hit.name+'.ex')}" #LINUX command line
          system(cmd)
          
          #Parsing exonerate
          local = ParserExonerate.new('contig','nucleotide_match', File.join(@path[:local],"#{contigs.first.first_hit.name}.ex"))
          store_local_ex = local.dataset
    #store_local_ex.each_contig {|ite| puts ite.name+' '+ite.first_hit.name; ite.indices}
          store_local_ex.score_correction(30)
          #puts "#{store_local_ex.contig_count}\t#{contigs_merge.length}"
          if store_local_ex.contig_count==contigs_merge.length
                  #Recuperar atributos en contigs y cargar array con contigs def
                  store_local_ex.each_contig_with_index{|contig,i|
                          contig.seq=contigs_merge[i]
                          contig.length=contigs_merge[i].length
                          contig.first_hit.s_length=contigs.first.first_hit.s_length
                          cn_def.each_with_index do |contig_def,j| # Busqueda de la posicion de la fusion y asignacion en el array de contigs definitivos
                                  if contig_def==i
                                          cn_def[j]=contig
                                  end
                          end
                  }
          else
                  cn_def=cn_backup
          end
          else
            cn_def=contigs
          end

          return cn_def

  end
contigs_seq_merge(contigs) click to toggle source
# File lib/gene_assembler/rebuild.rb, line 533
def contigs_seq_merge(contigs) #Devuelve un array con las secuencias fusionadas a partir del array contigs donde se le proporciona los arrays a fusionar
        cn=[]
        seq=''
        contigs.each do |contigs_to_merge|
                contigs_to_merge.each do |contig|
                        if seq.empty?
                                seq=contig.seq
                        else
                                seq=seq+'n'*10+contig.seq
                        end
                end
                cn << seq
                seq=''
        end
        return cn
end
correct_model(model, length_model, gff_dataset, sequences_hash) click to toggle source
# File lib/gene_assembler/rebuild.rb, line 954
def correct_model(model, length_model, gff_dataset, sequences_hash)
  correct_add_Ns=0
  if model.class.to_s=='Array'
    model=array_contigs_to_contig(model)
    model.name=model.name.gsub('_0','')
    model.length=length_model
  end
  
  correct_add_Ns=gff_dataset.correct_left_side_contigs(model)
  model.modified_coordenates(correct_add_Ns)
  model.length+=correct_add_Ns
  gff_dataset.align_contigs(model)

       #Corregir secuencia para que alinee con las features generadas
       if correct_add_Ns>0
    sequences_hash[model.name.gsub('_model','')]='n'*correct_add_Ns+sequences_hash[model.name.gsub('_model','')]
       end
      
        return model
end
eval_model(local_model, length) click to toggle source
# File lib/gene_assembler/rebuild.rb, line 171
def eval_model(local_model, length)#modifica model asi q se le ha de pasar una copia
  recover=0
  overlap=0
  if local_model.class.to_s=='Array'
    local_model=array_contigs_to_contig(local_model)
    local_model.length=length
  end
  recover=recover_test(local_model,FALSE)
  overlap=overlap_test(local_model,FALSE)    
  return recover, overlap
end
format_model(model) click to toggle source
# File lib/gene_assembler/rebuild.rb, line 944
def format_model(model)
  if model.n_hits?>1
    model.each_hit_with_index{|hit,i|
      hit.name=hit.name+"_gene_#{i}"
    }
  else
    model.first_hit.name=model.first_hit.name+'_gene'
  end
end
gene_array_and_compact(rebuild,cluster,reference) click to toggle source
# File lib/gene_assembler/rebuild.rb, line 221
def gene_array_and_compact(rebuild,cluster,reference)
  # Contruir array de exones (a partir del cluster) con los hsps de forma que los solapantes se alineen en las mismas columnas
  #---------------------------------------------------------------------------------------------------------------------------
  if rebuild
    gene_array,gene_array_introns=build_gene_array(cluster,reference) #Con referencia
    if $verbose
      gene_exons=gene_stadistics(gene_array)
      gene_stadistics_report(gene_exons,'EXONS')
      gene_introns=gene_stadistics(gene_array_introns)
      gene_stadistics_report(gene_introns,'INTRONS')
    end
  end

  # Seleccion de contigs para modelado de gen
  #-----------------------------------------------------
  if $verbose #Info cluster before compact array contigs
    gene_array_report(gene_array,cluster,rebuild)
  end
  if rebuild
    gene_compact(gene_array,cluster) #Se descartan los contigs redundantes y quedan aquellos que cubren todo el gen para formar el modelo
  end
  if $verbose && rebuild #Info cluster after compact array contigs
    gene_array_report(gene_array,cluster,rebuild)
  end
  return cluster, gene_array
end
gene_array_report(gene_array,contigs,act_array) click to toggle source
# File lib/gene_assembler/rebuild.rb, line 356
def gene_array_report(gene_array,contigs,act_array) #Muestra el array de la funncion build_gene_array y una representacion de las secuencias
        if act_array 
                puts "\nGENE ARRAY"
                gene_array.each_with_index do |fila,c|
                        print "#{contigs[c].name.center(24)}\t "
                        print "#{contigs[c].completed}\t"
                        fila.each do |item|
                                print "#{item}\t"
                        end
                        puts "\n"
                end
        end
        
        puts "\nMAP"
        contigs.each do |contig|
                print "#{contig.name.center(25)}"
                print contig.draw
        end
end
gene_compact(gene_array, contigs) click to toggle source
# File lib/gene_assembler/rebuild.rb, line 376
def gene_compact(gene_array, contigs) # Generacion modelo del gen quitando todas las secuencias redundantes posibles
        gene_array.each_with_index do |contig,c1|
                if !contig
                        next
                end
                c1_len=contig.length
                n_exons=contig.count{|x| x>0}
                gene_array.each_with_index do |contig2,c2|
                        if !contig2 ||c1==c2 #Saltamos contigs a nil o autocomparacion
                                next
                        end
                        c2_len=contig2.length
                        
                        # IGUAL
                        if c1_len==c2_len
                                if contig2.count{|x| x>0}==n_exons
                                        if contigs[c1].first_hit.first_hsp.score>=contigs[c2].first_hit.first_hsp.score
                                                gene_array[c2]=nil
                                                contigs[c2]=nil
                                        else 
                                                gene_array[c1]=nil
                                                contigs[c1]=nil
                                                break
                                        end
                                elsif contig2.count{|x| x>0}>n_exons 
                                        gene_array[c1]=nil
                                        contigs[c1]=nil
                                        break
                                else
                                        gene_array[c2]=nil
                                        contigs[c2]=nil
                                end

                        # MAYOR QUE
                        elsif c1_len>c2_len
                                if contig.count(0)<=contig2.count(0)
                                        gene_array[c2]=nil
                                        contigs[c2]=nil
                                end

                        # MENOR QUE
                        elsif c1_len<c2_len
                                if contig.count(0)==contig2.count(0)
                                        gene_array[c1]=nil
                                        contigs[c1]=nil
                                        break
                                end
                        end
                end #end contig2
        end #end contig
        gene_array.compact!
        contigs.compact!
end
gene_error(e, gene_name, file_error, cluster, model) click to toggle source
# File lib/gene_assembler/rebuild.rb, line 861
    def gene_error(e, gene_name, file_error, cluster, model) #e is a ruby exception object
puts gene_name+' ERROR'
file_error.puts "\n"+gene_name+"\n.............................."
file_error.puts e.message
e.backtrace.each do |line|
  file_error.puts line
end
file_error.puts ',,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,'
cluster.each do |contig|
  file_error.puts contig.name
  #puts contig.name
  #contig.indices
end
file_error.puts '----------------------------------------------------------------------------------------'
    end
gene_model_cut(contigs, reference=nil) click to toggle source
# File lib/gene_assembler/rebuild.rb, line 550
      def gene_model_cut(contigs, reference=nil) #Genera un modelo por corte y empalme de contigs, genera un gff y devuelve un array con objetos contig
              q_beg=[]
              q_end=[]
              s_beg=[]
              s_end=[]
              seq=[]
              last_contig=nil
              last_score=0
              length_model=0
              multiple_lengths=[]
              add_length=TRUE
              add_last=0
              last_position_ref=nil
              last_position=nil
              lengthy=[]
              out_of_range=FALSE
              contigs.each do |contig|
                      score = contig.first_hit.first_hsp.score/contig.length*contig.exon_acumulative
                      n_exones = contig.first_hit.hsp_count
                      
                      # FIRST CONTIG
                      #-------------------------------------------------------
                      if last_contig.nil?
                              q_end_seq=nil #SEQ
                              contig.first_hit.each_hsp_with_index{|hsp,i|
                                      q_beg << hsp.q_beg
                                      q_end << hsp.q_end
                                      s_beg << hsp.s_beg
                                      s_end << hsp.s_end
                                      #SEQ.................................
                                      if i==0
                                              seq << contig.seq[0..contig.first_hit.first_hsp.q_end-1] 
                                      elsif i+1==n_exones
                                              seq << contig.seq[contig.first_hit.hsps[i-1].q_end..contig.length-1]
                                      else
                                              seq << contig.seq[q_end_seq..hsp.q_end-1]
                                      end
                                      q_end_seq=hsp.q_end
                                      # ...................................
                              }
                              length_model+=contig.length
                              if !reference.nil? #Posicionamiento del primer contig en la referencia
                                      last_position_ref,ex=contig.compare(reference)
                                      if last_position==-1 || last_position_ref+contig.first_hit.hsp_count-1 > reference.first_hit.hsp_count #Abortar modelado en caso de qun contig no alinee con la referencia o la sobrepase
                                        puts contig.name+' OUT OF RANGE'
                                        out_of_range=TRUE
                                        break
                                      end
                              end
                              
                      # OTHER CONTIG
                      #--------------------------------------------------------
                      else
                              position_overlap,ex=contig.compare(last_contig)

                              #Correccion posicion del contig en base a una referencia
                              if !reference.nil?
                                      last_position_ref,position_overlap=position_reference_guided(contig,last_contig,last_position_ref,reference)
        if last_position_ref==-1 || last_position_ref+contig.first_hit.hsp_count-1 > reference.first_hit.hsp_count
          out_of_range=TRUE
          puts contig.name+' OUT OF RANGE'
          break
        end
                              end
      
                              # NOT OVERLAP
                              #..........................
                              if position_overlap==-1 || contig.first_hit.hsp_count==1
                                      if contig.first_hit.first_hsp.s_beg-last_contig.first_hit.last_hsp.s_end>1 # Marcar discontinuidad en caso de que el contig no sea correlativo al anterior
                                              q_beg << 0
                                              q_end << 0
                                              s_beg << 0
                                              s_end << 0
                                              multiple_lengths << length_model
                                              length_model=contig.length
                                              last=length_model
                                              add_length=FALSE
                                              seq.last << 'n'*10 #SEQ Indicacion de GAP
                                      else
                                              last=length_model #Guardamos longitud anterior para poder desplazar las coordenadas del contig correctamente
                                              length_model+=contig.length
                                      end
       
                                      q_end_seq=nil #SEQ
                                      contig.first_hit.hsps.each_with_index do |hsp,i|
                                              add_no=last
                                              if !add_length
                                                      add_no=0
                                              end
                                              q_beg << hsp.q_beg+add_no # Se acumula a las coordenadas la longitud del modelo
                                              q_end << hsp.q_end+add_no
                                              s_beg << hsp.s_beg
                                              s_end << hsp.s_end
                                              #SEQ.................................
                                              if i==0
                                                      cn=contig.seq[0..contig.first_hit.first_hsp.q_end-1]
                                                      cs="#{cn[0..1].swapcase!}#{cn[2..-1]}"
                                                      seq << cs                                        
                                              elsif i+1==n_exones
                                                      seq << contig.seq[contig.first_hit.hsps[i-1].q_end..contig.length-1]                                              
                                              else
                                                      seq << contig.seq[q_end_seq..hsp.q_end-1]
                                              end
                                              q_end_seq=hsp.q_end
                                              # ...................................
                                      end
                                      
                              # OVERLAP
                              #..........................
                              else
                                      if last_position==-1
                                              add_last=length_model-last_contig.length
                                      end
                                      overlap=last_contig.first_hit.hsp_count-position_overlap
                                      if last_contig.first_hit.hsp_count ==1
                                        overlap=1
                                      end
        #puts "#{overlap} = #{last_contig.first_hit.hsp_count} - #{position_overlap}"
                              add=0
                                      dif=0
                                      if last_score>=score
                                              add=last_contig.first_hit.last_hsp.q_end-contig.first_hit.first_hsp.q_end
                                              dif=add
                                              if overlap>1 #eliminamos ultimo exon de 'last contig' para reemplazar por el segundo de 'contig' q es mas fiable por ser interno
                                                      add=last_contig.first_hit.hsp_at(last_contig.first_hit.hsp_count-overlap).q_end-contig.first_hit.first_hsp.q_end #Como se dropea el ultimo exon se alinea por el penultimo
                                                      #puts "hsp:#{contig.first_hit.hsp_count}\toverlap:#{overlap}"
                                                      dif=contig.first_hit.hsp_at(overlap-1).q_end     
                                                      q_beg=q_beg.reverse.drop(1).reverse
                                                      q_end=q_end.reverse.drop(1).reverse
                                                      s_beg=s_beg.reverse.drop(1).reverse
                                                      s_end=s_end.reverse.drop(1).reverse
                                                      seq=seq.reverse.drop(1).reverse #SEQ
                                              end
                                              if overlap==1
                                                      overlap=2
                                              end
                                              (contig.first_hit.hsp_count-(overlap-1)).times do |n| #Añadimos el resto de exones del contig al modelo
                                                      q_beg << contig.first_hit.hsp_at(n+overlap-1).q_beg+add+add_last
                                                      q_end << contig.first_hit.hsp_at(n+overlap-1).q_end+add+add_last
                                                      s_beg << contig.first_hit.hsp_at(n+overlap-1).s_beg
                                                      s_end << contig.first_hit.hsp_at(n+overlap-1).s_end
                                                      #SEQ.......................................
                                                      position_hsp=n+overlap-2
                                                      if position_hsp <0
                                                              position_hsp= 0
                                                      end
                                                      position_next_hsp=n+overlap-1
                                                      if position_next_hsp < 0
                                                              position_next_hsp =0
                                                      end

                                                      if n==0  
                                                              cn=contig.seq[contig.first_hit.hsp_at(position_hsp).q_end..contig.first_hit.hsp_at(position_next_hsp).q_end-1]
                                                              cs=cn[0..1].swapcase!+cn[2..-1] 
                                                              seq << cs
                                                      elsif position_next_hsp==contig.first_hit.hsp_count-1
                                                              seq << contig.seq[contig.first_hit.hsp_at(position_hsp).q_end..contig.length-1]
                                                      else
                                                              seq << contig.seq[contig.first_hit.hsp_at(position_hsp).q_end..contig.first_hit.hsp_at(position_next_hsp).q_end-1]
                                                      end
                                                      #............................................
                                             end
                                      else
                                              hsp_position=last_contig.first_hit.hsp_count-2
                                              if hsp_position<0 #para los casos de los contigs q solo poseen un hsp
                                                      hsp_position=0
                                              end
                                              add=last_contig.first_hit.hsp_at(hsp_position).q_end
                                              dif=last_contig.length-add
                                              drop=1
                                              correction=0
                                              if overlap>1
                                                      drop=overlap-1
                                                      correction=1
                                                      add=last_contig.first_hit.hsp_at(position_overlap).q_end-contig.first_hit.first_hsp.q_end
                                                      dif=length_model-(add+add_last)
                                              end                               
                                              # Eliminamos exones malos de 'last_contig' (mantenemos el primero del overlap)
                                              q_beg=q_beg.reverse.drop(drop).reverse
                                              q_end=q_end.reverse.drop(drop).reverse
                                              s_beg=s_beg.reverse.drop(drop).reverse
                                              s_end=s_end.reverse.drop(drop).reverse
                                              seq=seq.reverse.drop(drop).reverse #SEQ
      
                                              # Añadimos los exones de 'contig' (excepto el primero)
                                              (contig.first_hit.hsp_count-correction).times do |n| #Añadimos el resto de exones del contig al modelo
                                                      q_beg << contig.first_hit.hsp_at(n+correction).q_beg+add+add_last
                                                      q_end << contig.first_hit.hsp_at(n+correction).q_end+add+add_last
                                                      s_beg << contig.first_hit.hsp_at(n+correction).s_beg
                                                      s_end << contig.first_hit.hsp_at(n+correction).s_end
                                                      #SEQ.............................................................
                                                      if n+1==(contig.first_hit.hsp_count-correction)
                                                              n_correction=n+correction-1
                                                              if n_correction < 0
                                                                      n_correction=0
                                                              end
                                                              seq << contig.seq[contig.first_hit.hsp_at(n_correction).q_end..contig.length-1]
                                                      elsif n==0
                                                              if n+correction==0
                                                                      cn=contig.seq[0..contig.first_hit.hsp_at(n+correction).q_end-1] # Si n+corr empieza en el primer exon del contig
                                                              else
                                                                      cn=contig.seq[contig.first_hit.hsp_at(n+correction-1).q_end..contig.first_hit.hsp_at(n+correction).q_end-1]
                                                              end
                                                              cs=cn[0..1].swapcase!+cn[2..-1] 
                                                              seq << cs
                                                      else
                                                              seq << contig.seq[contig.first_hit.hsp_at(n+correction-1).q_end..contig.first_hit.hsp_at(n+correction).q_end-1]
                                                      end
                                                      #................................................................
                                              end
                                      end
                                      length_model+=(contig.length-dif)
                                      add_length=TRUE
                                      add_last+=add
                              end
                      end
                      last_position=position_overlap
                      last_contig=contig
                      last_score=score
                      lengthy << length_model
              end
              if !multiple_lengths.empty?
                      multiple_lengths << length_model
                      length_model=multiple_lengths
              end
              
              model=nil
              if !out_of_range #Generar modelo si todos los contigs han alineado con la referencia
    model=void_contig(contigs.first.first_hit.name+'_model',length_model,contigs.first.first_hit.s_length,q_beg,q_end,s_beg,s_end,'contig','gene','exon')
    #Merge contigs under sequence reference
    model_length=nil
    if model.class.to_s=='Array'
      add=0
      model.each_with_index do |contig,i|
        contig.modified_coordenates(add)
        add+=contig.length
        if i<model.length-1
          add+=10
        end
      end
      model_length=add+model.last.length
    end
  
    model_n_exones=seq.length
      final_seq=seq.join
else #No generar modelo si al menos un contig no alinea contra la referencia
     model_length=nil
     final_seq=nil
  end

              return model, model_length, final_seq
      end
gene_stadistics(gene_array) click to toggle source
# File lib/gene_assembler/rebuild.rb, line 329
def gene_stadistics(gene_array) #Calcula el nº exones diferentes que hay por cada posicion del gene_array
        exons=[]
        length=length2D(gene_array)
        length.times do |column|
                exon=[]
                gene_array.each_with_index.each do |item,row|
                        if !exon.include?(gene_array[row][column]) && gene_array[row][column]!=0
                                exon << gene_array[row][column]
                        end
                end
                exons << exon
        end
        exons_stadistic=[]
        exons.each do |ex|
                exons_stadistic << ex.compact.length
        end
        return exons_stadistic        
end
gene_stadistics_report(exons_stadistic,tag) click to toggle source
# File lib/gene_assembler/rebuild.rb, line 348
def gene_stadistics_report(exons_stadistic,tag) #Muestra estadisticas de intrones o exones
        print "\n#{tag}\t"
        exons_stadistic.each do |item|
                print "#{item}\t"
        end
        print "\n"
end
iterative_modeling_gene_w_reference(cluster,references_hash,options,sequences_hash) click to toggle source

end main method

# File lib/gene_assembler/rebuild.rb, line 93
def iterative_modeling_gene_w_reference(cluster,references_hash,options,sequences_hash)
  # Model atributes
  model=nil
  length=0
  seq=nil
  cluster_length=0
  length_cluster=0
  prot_reference=cluster.first.first_hit.name
  array_references=references_hash[prot_reference]

  # Model parameters
  recover=0
  overlap=0

  #Modelo de gen en ciego
  if $verbose
    puts "\n",'||||||||||  BLIND MODELING  ||||||||||'
  end
  model, length, seq, length_cluster= modeling_gene(cluster.dup,nil,options)

  recover, overlap=eval_model(model.dup, length)

  if $verbose
    puts "\nRecover: #{recover} Overlap: #{overlap}"
  end
  
  guided=FALSE
  #Modelo de gen guiado
  if !array_references.nil?
    array_references.each do |ref|
      if $verbose
        puts "\n",'||||||||||  GUIDED MODELING  ||||||||||'
      end

      guided_model, guided_length, guided_seq, guided_length_cluster = modeling_gene(cluster.dup,ref,options)
      if guided_model.nil? # Si algun modelo sale mal se ignora
        next
      end
      guided_recover, guided_overlap= eval_model(guided_model.dup, guided_length)
      if $verbose
        puts "\nRecover: #{guided_recover} Overlap: #{guided_overlap}"
      end
      
      #Arbol de decisiones
      if guided_overlap <= 15 #Si el overlap es menor del 15 %
        if guided_overlap >= overlap-overlap*0.05 && guided_overlap <= overlap+overlap*0.05 # A mismo overlap
          if guided_recover > recover
            guided=TRUE
          end        
        else # A distinto overlap
          recover_dif=guided_recover-recover
          if recover_dif < 0  # Si el guided_model tiene menos recuperacion q el anterior
            if recover_dif.abs >= overlap-overlap*0.05 && recover_dif.abs <= overlap+overlap*0.05 #Si la reduccion de la recuperacion se debe a la desaparicion del overlap
              guided=TRUE
            end
          elsif recover_dif> guided_overlap+guided_overlap*0.05 # Comprobar que la diferencia de recover no se debe a u aumento del overlap en la misma magnitud
            guided=TRUE      
          end
        end
      elsif guided_overlap < overlap # Quedarnos siempre con los overlap mas bajos aun en situacion de overlap alto
        guided=TRUE
      end
      
      if guided
        model=guided_model
        length=guided_length
        seq=guided_seq
        length_cluster=guided_length_cluster
        recover=guided_recover
        overlap=guided_overlap
      end
    end
  end
  
  sequences_hash[prot_reference]=seq    
  return model, length, length_cluster
end
modeling_gene(cluster, reference, rebuild) click to toggle source
# File lib/gene_assembler/rebuild.rb, line 183
def modeling_gene(cluster, reference, rebuild) #Funcion que devuelve un objeto contig con el modelo de gen, los contigs q se han seleccionado y genera un gff del modelo
        model=nil
        model_length=nil
        seq=nil
        length_cluster=0
        # Reduccion iterativa de los contig para seleccionar los que van a formar parte del modelo de gen, elimina fragmentos menores que se puedan tomar como nuevos exones
        #--------------------------------------------------------------------------------------------------
        gene_array_length_before=nil
        continue=TRUE
        gene_array=[]
                
        while continue
              cluster,gene_array=gene_array_and_compact(rebuild,cluster,reference)
              gene_array_length_after=length2D(gene_array)
              if gene_array_length_after == gene_array_length_before
                continue=FALSE
              end
              gene_array_length_before=gene_array_length_after
        end
        length_cluster=cluster.length

        # Modelado del gen
        #----------------------------------------------------
        if rebuild && !cluster.empty? && !gene_array.empty?
                if cluster.length >1
                        cluster_comp=contig_compact(cluster) #Fusiona contigs contiguos y devuelve el array correspondiente
                else
                        cluster_comp=cluster
                end
                if !cluster_comp.nil?
                        model, model_length, seq=gene_model_cut(cluster_comp, reference)
                else
                        puts cluster.first.first_hit.name+"\tGENE MODEL ABORTED"
                end
        end
        return model, model_length, seq, length_cluster
end
overlap_test(model,output=TRUE) click to toggle source
# File lib/gene_assembler/rebuild.rb, line 877
    def overlap_test(model,output=TRUE)
perc_overlap=0
overlap=model.overlap
total=0
if !overlap.empty?
  if output
    print 'WARNING: overlap/s '
  end
  overlap.each do |length_overlap|
    if output
      print (length_overlap*-3).to_s+', '
    end
    total+=length_overlap
  end
  perc_overlap=(total*-100.0/model.first_hit.s_length).round(2)
  if output
    puts 'nt. % Total overlap '+perc_overlap.to_s
  end
end
return perc_overlap    
    end
position_reference_guided(contig,last_contig,last_position_ref,reference) click to toggle source
# File lib/gene_assembler/rebuild.rb, line 840
def position_reference_guided(contig,last_contig,last_position_ref,reference)# Si no existe overlap devuelve -1
        position_ref,ex=contig.compare(reference)
        if !last_position_ref.nil?
                if position_ref<=last_position_ref+(last_contig.first_hit.hsp_count-1) #Overlap
                        position_overlap=(last_position_ref-position_ref).abs
                else #No overlap
                        position_overlap=-1
                end
        end
        return position_ref,position_overlap
end
rebuild(options) click to toggle source

MAIN METHOD

# File lib/gene_assembler/rebuild.rb, line 17
def rebuild(options) #Genera contigs modelo,gff y busca pseudogenes
        gff_dataset_model=Dataset.new(:mix) #Object for save info of GeneAssembler's output
        gff_dataset=Dataset.new(:mix) #Object for save info of GeneAssembler's output
        file_error=File.open(@path[:error],'w')
        file_web=web_header(options[:web],@path[:html])
        sequences_hash={}
        gene_name=nil
        model=nil
        statistics={:genes => 0, :total_recovered => 0, :total_overlap => 0, :total_fragmentation => 0}
        puts "\nMODELING GENE",'*******************************************'
        @dataset.each_cluster{|cluster|
                begin
                        if !cluster.nil?
                                gene_name=cluster.first.first_hit.name
                        end
                        cluster_complete=cluster.dup
                        model, length_model, length_cluster = iterative_modeling_gene_w_reference(cluster,@dataset.references_hash,options[:rebuild],sequences_hash) #Realiza la reconstruccion del gen (alineado,descarte y montaje del gen)

                       # GeneAssembler output (gff for Gbrowse)
                       #--------------------------------------------------------
                       if !model.nil?
                               #Format Contigs children of model
                               gff_dataset.clr_contigs
                               gff_dataset.transfer_contigs(cluster_complete)
                               gff_dataset.transfer_n_contigs_def_hit_type(@dataset_uni_hsp,cluster,'pseudogene',50) #Transferir pseudogenes al report
                               
                                # Convertir arrays a contig y ajustar alineamiento añadiendo Ns
                               model=correct_model(model, length_model, gff_dataset, sequences_hash)

                               # Comprobaciones en el modelo
                               exones=model.exones_s.length # N exones
                               puts 'Exones: '+ exones.to_s
                               recovered=recover_test(model)
                               overlap=overlap_test(model)
                               fragmentation=((length_cluster-1.00)/exones).round(2)
                               puts 'Fragmentation: ' + fragmentation.to_s
                        
                               # HTML index
                               if !file_web.nil?
                                       gene_link=html_link(model.first_hit.name, @path[:gbrowse_link]+model.first_hit.name)
                                       html_row(file_web, [gene_link, cluster.first.first_hit.s_length, exones, recovered, overlap, fragmentation])
                                end
                
                               #Format Model for Gbrowse
                               gff_dataset_model.clr_contigs
                               format_model(model) #Añade la particula _gene al modelo
                               gff_dataset_model.transfer_contigs(model)
                        
                               #Write
                               write_gbrowse_gff(gff_dataset_model, gff_dataset, @path[:gff], model.name)
                        
                               #General statistics
                               statistics[:genes]+=1
                               statistics[:total_recovered]+=recovered
                               statistics[:total_overlap]+=overlap
                               statistics[:total_fragmentation]+=fragmentation
                       end
                rescue Exception => e
                        gene_error(e, gene_name, file_error, cluster_complete, model)
                end
                puts '* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *'             
        }
        file_error.close
        web_body(file_web)

        puts  "\nFINAL STATISTICS\n",
                'Recovered genes: '+ statistics[:genes].to_s,
                'Mean recover: ' + (statistics[:total_recovered]/statistics[:genes]).to_s,
                'Mean overlap: ' + (statistics[:total_overlap]/statistics[:genes]).to_s,
                'Mean fragmentation: ' + (statistics[:total_fragmentation]/statistics[:genes]).to_s
        write_model_fasta(sequences_hash,@path[:fasta])
end
recover_test(model,output=TRUE) click to toggle source
# File lib/gene_assembler/rebuild.rb, line 899
    def recover_test(model,output=TRUE)
            recovered=0
            model.exones_s.each do |exon|
                    recovered+=exon
            end           
recovered=(recovered*100.0/model.first_hit.s_length).round(2)
if output
              puts "Recovered\t"+model.first_hit.name+"\t#{recovered}"
            end
            return recovered
    end
void_contig(contig_name,contig_length,s_length,q_beg,q_end,s_beg,s_end,contig_type,hit_type,hsp_type,single=FALSE) click to toggle source
# File lib/gene_assembler/rebuild.rb, line 803
def void_contig(contig_name,contig_length,s_length,q_beg,q_end,s_beg,s_end,contig_type,hit_type,hsp_type,single=FALSE) #Genera un objeto contig con los datos proporcionados
        contigs=[]
        is_contig=1
        contig=nil
        n=0
        q_beg.each_with_index do |item,ind|
                if item>0 ||single
                        if contig==nil
                                if contig_length.class.to_s=='Array'
                                        length=contig_length[n]
                                        name="#{contig_name}_#{n}"
                                else
                                        length=contig_length
                                        name=contig_name
                                end
                                contig=Contig.new(name)
                                contig.length=length
                                contig.type=contig_type
                                hit_v=contig.add_hit(contig_name,s_length,1,:prot)
                                hit_v.type=hit_type
                        end
                        hsp_v=contig.first_hit.add_hsp(q_beg[ind], q_end[ind], s_beg[ind], s_end[ind], 0, 0, 0, 0)
                        hsp_v.type=hsp_type
                end
                if item==0 && contig!=nil && !single||q_beg.length-1==ind
                        if single ||!q_beg.include?(0)
                                contigs=contig
                        else
                                contigs << contig
                        end
                        n+=1
                        contig=nil
                end
        end
        return contigs
end
web_body(file_web) click to toggle source
# File lib/gene_assembler/rebuild.rb, line 921
def web_body(file_web)
  if !file_web.nil?
    html_table_footer(file_web)
    html_footer(file_web)
    file_web.close
  end
end
web_header(web, path) click to toggle source
# File lib/gene_assembler/rebuild.rb, line 911
def web_header(web, path)
  file_web=nil
  if web
    file_web=File.open(path,'w')
    html_header(file_web,'Gene index')
    html_table_header(file_web,1,['Gene model name', 'Protein length', 'Num exon', '% recovered protein', '% overlapping sequence', 'Fragmentation'])
  end
  return file_web
end
write_gbrowse_gff(gff_dataset_model, gff_dataset, path, name) click to toggle source
# File lib/gene_assembler/rebuild.rb, line 937
def write_gbrowse_gff(gff_dataset_model, gff_dataset, path, name)
  gff_model=ReportGff.new(gff_dataset_model,path,'s')
  gff_model.create('a')
  gff=ReportGff.new(gff_dataset,path,'s')
  gff.create('a',name)
end
write_model_fasta(sequences_hash, path) click to toggle source
# File lib/gene_assembler/rebuild.rb, line 929
def write_model_fasta(sequences_hash, path)
  model_file=File.open(path,'w')
  sequences_hash.each do |model|
    model_file.puts '>'+model[0]+"_model\n"+model[1]
  end
  model_file.close
end