class BioInterchange::Genomics::VCFReader
Public Class Methods
Creates a new instance of a Variant Call Format (VCF) reader.
name
-
Optional name of the person who generated the VCF file.
name_uri
-
Optional e-mail address of the person who generated the VCF file.
date
-
Optional date of when the VCF file was produced.
BioInterchange::Genomics::GFF3Reader::new
# File lib/biointerchange/genomics/vcf_reader.rb, line 26
# Builds a reader by handing every constructor argument on to the superclass.
#
# name::       optional name of the person who generated the file
# name_uri::   optional e-mail address of the person who generated the file
# date::       optional date of when the file was produced
# batch_size:: optional value that is forwarded untouched; its meaning is
#              defined by the superclass
def initialize(name = nil, name_uri = nil, date = nil, batch_size = nil)
  # Spelled-out equivalent of a bare `super`, which would implicitly pass
  # all arguments of initialize.
  super(name, name_uri, date, batch_size)
end
Protected Instance Methods
Records a comment line; the column header line that precedes the VCF features is not stored as a comment (its sample column names are remembered instead). Comments are added on a line-by-line basis.
feature_set
-
VCF feature set to which the comment line is being added
comment
-
comment line in the VCF file
# File lib/biointerchange/genomics/vcf_reader.rb, line 116
# Handles one comment line of a VCF file.
#
# The column header line ("CHROM\tPOS\tID\tREF\tALT...") is not kept as a
# comment; the names found from the tenth column onwards are remembered in
# @samples instead. Every other line is appended to @comment.
#
# feature_set:: VCF feature set the comment belongs to (not used by this
#               method's body)
# comment::     comment line in the VCF file
#
# NOTE(review): @comment is not initialised in this method -- presumably
# the superclass sets it up; confirm.
def add_comment(feature_set, comment)
  # Ordinary comment: keep it as it is.
  return @comment << comment unless comment.start_with?("CHROM\tPOS\tID\tREF\tALT")

  # Header line: everything after the nine fixed columns names a sample.
  @samples = comment.split("\t")[9..-1]
  # The slice is nil when the header has fewer than nine columns.
  @samples = [] if @samples.nil?
end
Adds a VCF feature to a VCF feature set.
feature_set
-
feature set to which the feature should be added
line
-
line from the VCF that describes the feature
# File lib/biointerchange/genomics/vcf_reader.rb, line 130
# Parses one tab-separated VCF data line and adds the resulting
# BioInterchange::Genomics::VCFFeature to the feature set.
#
# feature_set:: feature set to which the feature should be added (must
#               respond to +add+)
# line::        line from the VCF that describes the feature; it is not
#               modified
#
# The feature is constructed with, in this order: chrom, pos, id, ref,
# alt (array), qual, filter (array), info (hash), samples (array of hashes).
def add_feature(feature_set, line)
  # The first nine columns are fixed; every further column is one sample.
  chrom, pos, id, ref, alt, qual, filter, info, format, *samples = line.chomp.split("\t")

  # Replace an unknown ID by nil, so that feature coordinates are used during serialization:
  id = nil if id == '.'

  #
  # Split composite fields
  #

  # Alternative alleles:
  alt = alt.split(',')

  # Filters:
  filter = filter.split(';')

  # Feature information: "key=v1,v2" becomes key => [ v1, v2 ], a bare "key"
  # (flag) becomes key => true.
  info = info.split(';').each_with_object({}) do |key_value_pair, fields|
    key, value = key_value_pair.split('=', 2)
    # An empty entry (e.g. caused by ";;") carries no key at all.
    next if key.nil?

    fields[key] = value ? value.split(',') : true
  end

  # Format for following sample columns (absent when the line has no genotype data):
  format = format ? format.split(':') : []

  # Sample columns (need to be further split in the writer -- depends on format):
  samples = samples.map do |value|
    # Dot: no data provided for the sample
    value == '.' ? {} : Hash[format.zip(value.split(':'))]
  end

  feature_set.add(BioInterchange::Genomics::VCFFeature.new(chrom, pos, id, ref, alt, qual, filter, info, samples))
end
# File lib/biointerchange/genomics/vcf_reader.rb, line 37
# Interprets one VCF meta-information ("##name=value") line and stores it as
# a pragma of the feature set.
#
# feature_set:: feature set that receives the pragma (+set_pragma+ is called
#               on it)
# line::        the complete pragma line including the leading "##"; it is
#               not modified
#
# A line without "=" is treated as having an empty value.
#
# NOTE(review): 'INFO' is stored via set_pragma, so a later INFO line
# replaces an earlier one, whereas 'FILTER' and 'FORMAT' accumulate --
# confirm whether that is intended.
def add_pragma(feature_set, line)
  name, value = line.chomp[2..-1].split(/=/, 2)
  value = value.to_s.strip

  # Interpret pragmas (GFF3 has inherently different pragma statements, so
  # nothing is delegated to the superclass):
  case name
  when 'assembly'
    # attributes = split_attributes(value)
    # structured_attributes = feature_set.pragma(name)
    # structured_attributes = { name => [] } unless structured_attributes
    # structured_attributes[name] << attributes
    # feature_set.set_pragma(name, structured_attributes)
  when 'contig', 'FILTER', 'FORMAT', 'PEDIGREE'
    # These pragmas can appear multiple times; they are collected in an array.
    add_vcf_pragma(feature_set, name, value)
  when 'fileDate'
    feature_set.set_pragma(name, { name => Date.parse(value) })
  when 'fileformat'
    feature_set.set_pragma(name, { name => value.sub(/^VCFv/, '').to_f })
  when 'INFO'
    feature_set.set_pragma(name, vcf_mapping(value))
  when 'reference'
    # 'reference' is not specified in VCF 4.1, but used in examples and real-world
    # VCF files nevertheless.
    # TODO What if reference already set?
    feature_set.set_pragma(name, value)
  when 'center', 'geneAnno', 'ID', 'Number', 'phasing', 'SAMPLE',
       'tcgaversion', 'Type', 'vcfProcessLog'
    # Recognized, but deliberately not stored.
  else
    feature_set.set_pragma(name, { name => value })
  end
end
Adds pragma information where the pragma can appear multiple times in the input (application: VCF). Each pragma information is still a hash, which is stored in an array.
feature_set
-
feature set to which the pragma information is added
name
-
name of the pragma under which the information is being stored
value
-
raw pragma value, i.e. the textual key/value mapping (will be turned into a hash by a
vcf_mapping
call)
# File lib/biointerchange/genomics/vcf_reader.rb, line 101
# Stores a pragma that may occur several times in the input. Each occurrence
# is converted by vcf_mapping and the results are kept in one array per
# pragma name.
#
# feature_set:: feature set to which the pragma information is added
# name::        name of the pragma under which the information is stored
# value::       pragma value as handed to vcf_mapping
def add_vcf_pragma(feature_set, name, value)
  collected = feature_set.pragma(name)
  entry = vcf_mapping(value)

  # First occurrence starts a new array, later ones extend the existing one.
  collected = collected ? (collected << entry) : [entry]

  feature_set.set_pragma(name, collected)
end
# File lib/biointerchange/genomics/vcf_reader.rb, line 33
# Returns a fresh, empty BioInterchange::Genomics::VCFFeatureSet.
def create_feature_set
  BioInterchange::Genomics::VCFFeatureSet.new
end
Private Instance Methods
Takes a VCF meta-information string and returns a key-value mapping.
value
-
value of a meta-information assignment in VCF (key/value mappings of the form “<ID=value,…>”)
# File lib/biointerchange/genomics/vcf_reader.rb, line 181
# Takes a VCF meta-information string and returns a key-value mapping.
#
# value:: value of a meta-information assignment in VCF (key/value mappings
#         of the form "<ID=value,...>"); the first and last character are
#         dropped unconditionally
#
# Returns a Hash from key to value, both Strings. Values may be enclosed in
# double quotes, in which case they can contain commas. A key that has no
# "=" is mapped to the empty string.
def vcf_mapping(value)
  # Drop the enclosing angle brackets; slicing an empty string yields nil.
  content = value[1..-2] || ''

  mapping = {}
  identifier = ''
  assignment = ''
  state = :id

  content.each_char do |character|
    # The first character after "=" decides between a quoted and a plain value.
    if state == :value
      if character == '"'
        state = :quoted
        next
      end
      state = :plain
    end

    # A comma ends a plain value (but is literal text inside quotes).
    state = :separator if state == :plain && character == ','

    case state
    when :id
      if character == '='
        state = :value
        assignment = ''
      else
        identifier << character
      end
    when :separator
      if character == ','
        # The pair is committed only here (or at the end of the input), so a
        # quoted value followed by a comma is stored exactly once.
        mapping[identifier] = assignment unless identifier.empty?
        identifier = ''
        # Reset so that a following key without "=" does not inherit this value.
        assignment = ''
        state = :id
      else
        # TODO Format error.
      end
    when :quoted
      if character == '"'
        state = :separator
      else
        assignment << character
      end
    when :plain
      assignment << character
    else
      # TODO Whoops. Report error.
    end
  end

  # Commit the last pair, which is not followed by a comma.
  mapping[identifier] = assignment unless identifier.empty?
  mapping
end