class Bio::FastQC::Parser

Public Class Methods

new(fastqc_data_txt) click to toggle source
# File lib/bio/fastqc/parser.rb, line 5
# Stores the raw report text and eagerly parses it.
#
# fastqc_data_txt - text kept in @data and split by parse_modules
#                   (presumably the contents of a fastqc_data.txt file; confirm with callers).
def initialize(fastqc_data_txt)
  @data = fastqc_data_txt
  # Order matters: parse_modules reads @data, basic_statistics reads @module_results.
  @module_results = parse_modules
  @basic_statistics = basic_statistics
end

Public Instance Methods

adapter_content() click to toggle source
# File lib/bio/fastqc/parser.rb, line 110
# Rows of the "Adapter Content" module as returned by get_module_matrix,
# with the first row of the module skipped.
def adapter_content
  module_name = "Adapter Content"
  get_module_matrix(module_name, 1)
end
basic_statistics() click to toggle source

Basic Statistics module

# File lib/bio/fastqc/parser.rb, line 21
# Basic Statistics module as a Hash of first column => second column.
#
# Built from the first parsed module (@module_results[0]); every row of that
# module becomes one entry, so any row preceding the measures (for example a
# "##FastQC" row) is included as well.
#
# Rows are paired one at a time rather than via Hash[*rows.flatten]: a row
# whose value is missing (split drops a trailing empty field) now maps to nil
# instead of raising ArgumentError or shifting every later key/value pair.
# Extra columns beyond the second are ignored. Empty rows are skipped.
#
# Returns a Hash; {} when no module was parsed.
def basic_statistics
  rows = @module_results[0] || []
  rows.each_with_object({}) do |row, stats|
    next if row.empty? # blank line inside the module
    stats[row[0]] = row[1]
  end
end
encoding() click to toggle source
# File lib/bio/fastqc/parser.rb, line 37
# Quality encoding method for the input file type: the "Encoding" entry of
# the basic statistics hash (nil when that entry is absent).
def encoding
  key = "Encoding"
  @basic_statistics[key]
end
fastqc_version() click to toggle source
# File lib/bio/fastqc/parser.rb, line 25
# Software version of FastQC: the value stored under the "##FastQC" key of
# the basic statistics hash (nil when that entry is absent).
def fastqc_version
  key = "##FastQC"
  @basic_statistics[key]
end
file_type() click to toggle source
# File lib/bio/fastqc/parser.rb, line 33
# Input file type: the "File type" entry of the basic statistics hash
# (nil when that entry is absent).
def file_type
  key = "File type"
  @basic_statistics[key]
end
filename() click to toggle source
# File lib/bio/fastqc/parser.rb, line 29
# Input filename given to the FastQC program: the "Filename" entry of the
# basic statistics hash (nil when that entry is absent).
def filename
  key = "Filename"
  @basic_statistics[key]
end
filtered_sequences() click to toggle source
# File lib/bio/fastqc/parser.rb, line 49
# Number of sequence reads filtered out, converted with to_i
# (so a missing "Filtered Sequences" entry yields 0).
def filtered_sequences
  raw_count = @basic_statistics["Filtered Sequences"]
  raw_count.to_i
end
get_module_matrix(module_name, num_of_header_rows) click to toggle source

Other modules

# File lib/bio/fastqc/parser.rb, line 65
# Looks up one parsed module by name and returns its rows.
#
# module_name        - module title without the leading ">>"; it is compared
#                      against the first cell of each module's first row.
# num_of_header_rows - number of leading rows to drop from the match.
#
# Modules without any rows (e.g. produced by blank lines after the last
# terminator) are skipped instead of raising NoMethodError.
#
# Returns an Array of rows (each an Array of Strings), or nil when no module
# with that title exists.
def get_module_matrix(module_name, num_of_header_rows)
  title = ">>#{module_name}"
  mod = @module_results.find do |m|
    first_row = m[0]
    first_row && first_row[0] == title
  end
  mod.drop(num_of_header_rows) if mod
end
kmer_content() click to toggle source
# File lib/bio/fastqc/parser.rb, line 114
# Rows of the "Kmer Content" module as returned by get_module_matrix,
# with the first row of the module skipped.
def kmer_content
  module_name = "Kmer Content"
  get_module_matrix(module_name, 1)
end
max_length() click to toggle source
# File lib/bio/fastqc/parser.rb, line 126
# Upper bound of sequence_length as an Integer: a leading "<digits>-" prefix
# is stripped (so "35-76" gives 76 and "76" gives 76) before to_i.
def max_length
  upper_bound = sequence_length.sub(/^\d+-/,"")
  upper_bound.to_i
end
mean_sequence_length() click to toggle source
# File lib/bio/fastqc/parser.rb, line 162
# Mean read length estimated from the sequence length distribution.
#
# Each row is read as [length_label, count]; a label is either a single
# length ("101") or a range ("30-39"), and every row is represented by the
# midpoint of its label, weighted by its count.
#
# A lone row now also uses the midpoint: previously "30-39" was passed
# straight to to_f and yielded 30.0, disagreeing with the multi-row branch.
# A missing module, an empty distribution or a zero total count gives 0.0
# instead of raising.
#
# Returns a Float.
def mean_sequence_length
  dist = (sequence_length_distribution || []).drop(1) # drop column header
  midpoint = lambda do |label|
    (label.sub(/-\d+$/,"").to_f + label.sub(/^\d+-/,"").to_f) / 2
  end

  # Single row: return the midpoint directly, no multiply/divide round trip.
  return midpoint.call(dist[0][0]) if dist.size == 1

  total = dist.map { |l_c| l_c[1].to_f }.reduce(0.0, :+)
  return 0.0 if total.zero?

  weighted = dist.map { |l_c| midpoint.call(l_c[0]) * l_c[1].to_f }.reduce(0.0, :+)
  weighted / total
end
median_sequence_length() click to toggle source
# File lib/bio/fastqc/parser.rb, line 176
# Median read length estimated from the sequence length distribution.
#
# Rows are read as [length_label, count]. The rows are walked in order while
# subtracting counts from half of the total count; the row in which that
# remainder is used up is taken as the median row and its label midpoint
# ("30-39" -> 34.5, "101" -> 101.0) is returned.
#
# A lone row now also uses the midpoint: previously "30-39" was passed
# straight to to_f and yielded 30.0. A missing module or empty distribution
# gives 0.0 instead of raising, and the no-row-found fallback is a Float like
# every other return value.
#
# Returns a Float.
def median_sequence_length
  dist = (sequence_length_distribution || []).drop(1) # drop column header
  return 0.0 if dist.empty?

  k = dist.map { |l_c| l_c[1].to_f }.reduce(:+) / 2 # position of median
  median_row = dist.find do |l_c|
    c = l_c[1].to_f # count of reads in this length range
    if k > c
      k -= c # median lies beyond this row
      false
    else
      true
    end
  end
  return 0.0 unless median_row

  l = median_row[0]
  (l.sub(/-\d+$/,"").to_f + l.sub(/^\d+-/,"").to_f) / 2
end
min_length() click to toggle source

Custom modules

# File lib/bio/fastqc/parser.rb, line 122
# Lower bound of sequence_length as an Integer: a trailing "-<digits>" suffix
# is stripped (so "35-76" gives 35 and "76" gives 76) before to_i.
def min_length
  lower_bound = sequence_length.sub(/-\d+$/,"")
  lower_bound.to_i
end
overall_mean_quality_score() click to toggle source
# File lib/bio/fastqc/parser.rb, line 148
# Overall quality score computed by overall_quality_score from the :mean
# column of the per-base quality rows.
def overall_mean_quality_score
  statistic = :mean
  overall_quality_score(statistic)
end
overall_median_quality_score() click to toggle source
# File lib/bio/fastqc/parser.rb, line 152
# Overall quality score computed by overall_quality_score from the :median
# column of the per-base quality rows.
def overall_median_quality_score
  statistic = :median
  overall_quality_score(statistic)
end
overall_n_content() click to toggle source
# File lib/bio/fastqc/parser.rb, line 156
# Unweighted average of the second column of the per-base N content rows.
#
# per_base_n_content only strips the module title row, so its first row is
# the column header. That row is dropped here, as overall_quality_score and
# the sequence-length methods already do; previously the header text was
# converted with to_f (0.0) and counted as a data row, dragging the average
# down and inflating the divisor by one.
#
# Returns a Float; 0.0 when the module is missing or has no data rows.
def overall_n_content
  rows = (per_base_n_content || []).drop(1) # drop column header
  return 0.0 if rows.empty?

  v = rows.map { |c| c[1].to_f }
  v.reduce(:+) / v.size
end
overall_quality_score(mean_or_median) click to toggle source
# File lib/bio/fastqc/parser.rb, line 139
# Collapses the per-base quality rows into one score.
#
# mean_or_median - selector handed to per_base_quality_column to pick which
#                  column of each row is read.
#
# Every selected value q is turned into 10**(-q/10), those values are
# averaged with equal weight per row, and the average is mapped back with
# -10 * log10 (this looks like the Phred score/error-probability relation;
# confirm against the FastQC format).
#
# Returns a Float.
def overall_quality_score(mean_or_median)
  data_rows = per_base_sequence_quality.drop(1) # first row is the column header
  index = per_base_quality_column(mean_or_median)
  error_rates = data_rows.map { |row| 10**(row[index].to_f / -10) }
  mean_error_rate = error_rates.inject { |acc, rate| acc + rate } / error_rates.size
  -10 * Math.log10(mean_error_rate)
end
overrepresented_sequences() click to toggle source
# File lib/bio/fastqc/parser.rb, line 106
# Rows of the "Overrepresented sequences" module as returned by
# get_module_matrix, with the first row of the module skipped.
def overrepresented_sequences
  module_name = "Overrepresented sequences"
  get_module_matrix(module_name, 1)
end
parse() click to toggle source
# File lib/bio/fastqc/parser.rb, line 201
# Collects every reader of this parser into one Hash.
#
# Each symbol listed below names a method on this object; the method is
# called in list order and its return value is stored under that symbol, so
# both the keys and their order match the list.
#
# Returns a Hash of Symbol => value.
def parse
  fields = [
    # basic statistics
    :fastqc_version, :filename, :file_type, :encoding,
    :total_sequences, :sequences_flagged_as_poor_quality,
    :filtered_sequences, :sequence_length, :percent_gc,
    # module tables
    :per_base_sequence_quality, :per_tile_sequence_quality,
    :per_sequence_quality_scores, :per_base_sequence_content,
    :per_sequence_gc_content, :per_base_n_content,
    :sequence_length_distribution, :total_duplicate_percentage,
    :sequence_duplication_levels, :overrepresented_sequences,
    :adapter_content, :kmer_content,
    # derived values
    :min_length, :max_length,
    :overall_mean_quality_score, :overall_median_quality_score,
    :overall_n_content, :mean_sequence_length, :median_sequence_length
  ]
  fields.each_with_object({}) { |field, report| report[field] = send(field) }
end
parse_modules() click to toggle source
# File lib/bio/fastqc/parser.rb, line 11
# Splits the raw report text (@data) into modules, rows and cells.
#
# The text is cut at each ">>END_MODULE" terminator line, every chunk is cut
# into lines, and every line into tab-separated cells.
#
# Both LF and CRLF line endings are accepted, and a final terminator that is
# not followed by a newline is still recognised. Previously CRLF input never
# matched ">>END_MODULE\n" (the whole report became one module with "\r" left
# in the cells), and an unterminated final ">>END_MODULE" leaked into the
# last module as a data row.
#
# Returns an Array of modules, each an Array of rows, each an Array of
# Strings.
def parse_modules
  @data.split(/>>END_MODULE(?:\r?\n|\z)/).map do |mod|
    mod.split(/\r?\n/).map { |line| line.split("\t") }
  end
end
per_base_n_content() click to toggle source
# File lib/bio/fastqc/parser.rb, line 90
# Rows of the "Per base N content" module as returned by get_module_matrix,
# with the first row of the module skipped.
def per_base_n_content
  module_name = "Per base N content"
  get_module_matrix(module_name, 1)
end
per_base_quality_column(mean_or_median) click to toggle source
# File lib/bio/fastqc/parser.rb, line 130
# Maps a statistic selector to a column index of a per-base quality row.
#
# mean_or_median - :mean or :median.
#
# Returns 1 for :mean, 2 for :median, and nil for anything else.
def per_base_quality_column(mean_or_median)
  column_for = { mean: 1, median: 2 }
  column_for[mean_or_median]
end
per_base_sequence_content() click to toggle source
# File lib/bio/fastqc/parser.rb, line 82
# Rows of the "Per base sequence content" module as returned by
# get_module_matrix, with the first row of the module skipped.
def per_base_sequence_content
  module_name = "Per base sequence content"
  get_module_matrix(module_name, 1)
end
per_base_sequence_quality() click to toggle source
# File lib/bio/fastqc/parser.rb, line 70
# Rows of the "Per base sequence quality" module as returned by
# get_module_matrix, with the first row of the module skipped.
def per_base_sequence_quality
  module_name = "Per base sequence quality"
  get_module_matrix(module_name, 1)
end
per_sequence_gc_content() click to toggle source
# File lib/bio/fastqc/parser.rb, line 86
# Rows of the "Per sequence GC content" module as returned by
# get_module_matrix, with the first row of the module skipped.
def per_sequence_gc_content
  module_name = "Per sequence GC content"
  get_module_matrix(module_name, 1)
end
per_sequence_quality_scores() click to toggle source
# File lib/bio/fastqc/parser.rb, line 78
# Rows of the "Per sequence quality scores" module as returned by
# get_module_matrix, with the first row of the module skipped.
def per_sequence_quality_scores
  module_name = "Per sequence quality scores"
  get_module_matrix(module_name, 1)
end
per_tile_sequence_quality() click to toggle source
# File lib/bio/fastqc/parser.rb, line 74
# Rows of the "Per tile sequence quality" module as returned by
# get_module_matrix, with the first row of the module skipped.
def per_tile_sequence_quality
  module_name = "Per tile sequence quality"
  get_module_matrix(module_name, 1)
end
percent_gc() click to toggle source
# File lib/bio/fastqc/parser.rb, line 57
# Overall percentage of GC content: the "%GC" entry of the basic statistics
# hash converted with to_f (so a missing entry yields 0.0).
def percent_gc
  raw_percentage = @basic_statistics["%GC"]
  raw_percentage.to_f
end
sequence_duplication_levels() click to toggle source
# File lib/bio/fastqc/parser.rb, line 102
# Rows of the "Sequence Duplication Levels" module as returned by
# get_module_matrix, with the first two rows of the module skipped.
def sequence_duplication_levels
  module_name = "Sequence Duplication Levels"
  get_module_matrix(module_name, 2)
end
sequence_length() click to toggle source
# File lib/bio/fastqc/parser.rb, line 53
# Raw "Sequence length" entry of the basic statistics hash. It is kept as a
# String because the value can be a range (nil when the entry is absent).
def sequence_length
  key = "Sequence length"
  @basic_statistics[key]
end
sequence_length_distribution() click to toggle source
# File lib/bio/fastqc/parser.rb, line 94
# Rows of the "Sequence Length Distribution" module as returned by
# get_module_matrix, with the first row of the module skipped.
def sequence_length_distribution
  module_name = "Sequence Length Distribution"
  get_module_matrix(module_name, 1)
end
sequences_flagged_as_poor_quality() click to toggle source
# File lib/bio/fastqc/parser.rb, line 45
# Number of sequence reads flagged as poor quality, converted with to_i
# (so a missing "Sequences flagged as poor quality" entry yields 0).
def sequences_flagged_as_poor_quality
  raw_count = @basic_statistics["Sequences flagged as poor quality"]
  raw_count.to_i
end
summary() click to toggle source
# File lib/bio/fastqc/parser.rb, line 197
# Second name for parse: calls it and returns its result unchanged.
def summary
  full_report = parse
  full_report
end
total_duplicate_percentage() click to toggle source
# File lib/bio/fastqc/parser.rb, line 98
# Second cell of the first row that follows the title row of the
# "Sequence Duplication Levels" module, converted with to_f (presumably the
# module's "#Total ... Percentage" summary line; confirm against the FastQC
# report format).
#
# get_module_matrix returns nil for a report without that module; that case,
# and a module with no row after its title, now yield 0.0 instead of raising
# NoMethodError, matching the numeric readers that fall back to zero.
#
# Returns a Float.
def total_duplicate_percentage
  matrix = get_module_matrix("Sequence Duplication Levels", 1)
  summary_row = matrix && matrix[0]
  summary_row ? summary_row[1].to_f : 0.0
end
total_sequences() click to toggle source
# File lib/bio/fastqc/parser.rb, line 41
# Total number of sequence reads, converted with to_i
# (so a missing "Total Sequences" entry yields 0).
def total_sequences
  raw_count = @basic_statistics["Total Sequences"]
  raw_count.to_i
end