class Cheripic::Pileup

An extension of Bio::DB::Pileup object to process pileup information at a given position

Public Class Methods

new(string) click to toggle source

creates a Pileup object using a pileup information as string @param string [String] pileup information line for a given position

Calls superclass method
# File lib/cheripic/pileup.rb, line 15
def initialize(string)
  super(string)
  adj_read_bases
  @indelbases = 'acgtryswkmbdhvnACGTRYSWKMBDHVN'
end

Public Instance Methods

adj_read_bases() click to toggle source

removes mapping quality information

# File lib/cheripic/pileup.rb, line 22
def adj_read_bases
  # mapping quality after '^' symbol is substituted
  # to avoid splitting at non indel + or - characters
  # read ends marking by '$' symbol is substituted
  # insertion and deletion marking by '*' symbol is substituted
  self.read_bases.gsub!(/\^./, '')
  self.read_bases.delete! '$'
  self.read_bases.delete! '*'
  # warn about reads with ambiguous codes
  # if self.read_bases.match(/[^atgcATGC,\.\+\-0-9]/)
  #   warn "Ambiguous nucleotide\t#{self.read_bases}"
  # end
end
bases_hash() click to toggle source

count bases matching reference and non-reference from snp variant and make a hash of bases with counts for indels return the read bases information instead

# File lib/cheripic/pileup.rb, line 39
def bases_hash
  if self.read_bases =~ /\+/
    bases_hash = indels_to_hash('+')
  elsif self.read_bases =~ /-/
    bases_hash = indels_to_hash('-')
  else
    bases_hash = snp_base_hash(self.read_bases)
  end
  # some indels will have ref base in the read and using
  # sum of hash values is going to give wrong additional coverage
  # from indels so including actual coverage from pileup
  # bases_hash keys are :A, :C, :G, :T, :N, :ref and :indel
  bases_hash
end
count_indel_bases(delimiter) click to toggle source

count bases from indels array of pileup bases is split at + / - and number after each + / - is counted

# File lib/cheripic/pileup.rb, line 57
def count_indel_bases(delimiter)
  array = self.read_bases.split(delimiter)
  number = 0
  array.shift
  array.each do |element|
    # deletions in reference could contain ambiguous codes,
    number += /^(\d+)[#{@indelbases}]/.match(element)[1].to_i
  end
  number
end
is_var() click to toggle source

check if the pileup has the parameters we are looking for

# File lib/cheripic/pileup.rb, line 83
def is_var
  ignore_reference_n = Options.ignore_reference_n
  min_depth  = Options.mindepth
  min_non_ref_count = Options.min_non_ref_count

  return false if self.ref_base == '*'
  return false if ignore_reference_n and self.ref_base =~ /^[nN]$/
  return true if self.coverage >= min_depth and self.non_ref_count >= min_non_ref_count
  false
end
non_ref_count() click to toggle source

count bases matching reference and non-reference and calculate ratio of non_ref allele to total bases

# File lib/cheripic/pileup.rb, line 70
def non_ref_count
  read_bases = self.read_bases
  if read_bases =~ /\+/
    non_ref_count = indel_non_ref_count('+')
  elsif read_bases =~ /-/
    non_ref_count = indel_non_ref_count('-')
  else
    non_ref_count = read_bases.count('atgcATGC')
  end
  non_ref_count
end
non_ref_ratio() click to toggle source

count bases matching reference and non-reference and calculate ratio of non_ref allele to total bases

# File lib/cheripic/pileup.rb, line 96
def non_ref_ratio
  self.non_ref_count.to_f / self.coverage.to_f
end
var_base_frac() click to toggle source

form hash of base information, [ATGC] counts for snp a hash of base proportion is calculated base proportion hash below a selected depth is empty base proportion below or equal to a noise factor are discarded

# File lib/cheripic/pileup.rb, line 118
def var_base_frac
  hash = self.bases_hash
  snp_hash = {}
  coverage = self.coverage
  return snp_hash if coverage < Options.mindepth
  # calculate proportion of each base in coverage
  hash.each_key do | base |
    freq = hash[base].to_f/coverage.to_f
    next if freq <= Options.noise
    snp_hash[base] = freq
  end
  snp_hash
end

Private Instance Methods

indel_non_ref_count(delimitter) click to toggle source
# File lib/cheripic/pileup.rb, line 167
def indel_non_ref_count(delimitter)
  read_bases = self.read_bases
  non_ref_count = read_bases.count(@indelbases)
  indelcounts = read_bases.count(delimitter)
  indel_bases = count_indel_bases(delimitter)
  non_ref_count + indelcounts - indel_bases
end
indels_to_hash(delimiter) click to toggle source

count number of indels and number non-indel base and return a hash with bases and indel counts

# File lib/cheripic/pileup.rb, line 137
def indels_to_hash(delimiter)
  non_indel_bases = String.new
  array = self.read_bases.split(delimiter)
  non_indel_bases << array.shift
  array.each do |element|
    # get number of nucleotides inserted or deleted
    number = /^(\d+)[#{@indelbases}]/.match(element)[1].to_i
    # capture remaining nucleotides
    non_indel_bases << element.gsub(/^#{number}\w{#{number}}/, '')
  end
  bases_hash = snp_base_hash(non_indel_bases)
  # check at least three reads are supporting indel
  indel_count = self.read_bases.count(delimiter)
  if indel_count >= Options.min_indel_count_support
    bases_hash[:indel] = indel_count
  end
  bases_hash
end
snp_base_hash(readbases) click to toggle source
# File lib/cheripic/pileup.rb, line 156
def snp_base_hash(readbases)
  non_indel_base_hash = {}
  non_indel_base_hash[:ref] = readbases.count('.,')
  non_indel_base_hash[:A] = readbases.count('aA')
  non_indel_base_hash[:C] = readbases.count('cC')
  non_indel_base_hash[:G] = readbases.count('gG')
  non_indel_base_hash[:T] = readbases.count('tT')
  # non_indel_base_hash[:N] = read_bases.count('nN')
  non_indel_base_hash
end