class BioTCM::Apps::StringProcessor

Extracts the PPI (protein-protein interaction) network of a single species from STRING data files.

Example Usage

BioTCM::Apps::StringProcessor.new(
  'protein.links.detailed.v10.txt',
  'species.v10.txt'
).extract_by_species(
  'protein.links.detailed.v10.homo.sapiens.txt', 'Homo sapiens'
)

Constants

VERSION

Version of StringProcessor

Public Class Methods

new(protein_links_filepath, species_filepath) click to toggle source

Open STRING data files @param protein_links_filepath [String] @param species_filepath [String]

# File lib/biotcm/apps/string_processor.rb, line 18
# Open both STRING data files for reading and keep the handles on the
# instance (links file first, then species file).
#
# @param protein_links_filepath [String] path to the protein links file
# @param species_filepath [String] path to the species file
def initialize(protein_links_filepath, species_filepath)
  @f_protein_links, @f_species =
    [protein_links_filepath, species_filepath].map { |path| File.open(path) }
end

Public Instance Methods

check() click to toggle source

Check given STRING network file

# File lib/biotcm/apps/string_processor.rb, line 24
# Scan the whole protein-links file and print a summary: one progress line
# each time the leading "<taxon_id>." prefix changes, then the number of
# species seen and the total number of lines read.
#
# Lines whose first column has no "<digits>." prefix (e.g. a header or a
# blank line) are counted as lines but never registered as a species.
def check
  species = []
  counter = 0

  @f_protein_links.pos = 0
  @f_protein_links.each do |line|
    # Non-destructive chomp: `chomp!` returns nil when nothing was removed,
    # which happens for a final line without a trailing newline.
    col = line.chomp.split("\t")
    # Regexp literal on the left-hand side so the named capture is assigned
    # to the local variable `id` (nil when there is no match).
    /^(?<id>\d+)\./ =~ col[0]
    if id && id != species.last
      puts "Processing Species No.#{id}..."
      species << id
    end
    counter += 1
  end

  puts "Total #{species.size} kinds of species"
  puts "Total #{counter} lines"
end
extract_by_species(filepath, species = 'Homo sapiens') click to toggle source

Extract the PPI network of one species. @param filepath [String] path of the output file. @param species [String/Integer] species name or ID

# File lib/biotcm/apps/string_processor.rb, line 46
# Extract the protein-protein interactions of one species into a
# tab-separated file, stripping the "<taxon_id>." prefix from both protein
# identifiers.
#
# The protein-links file is expected to be grouped by species in ascending
# numeric taxon-ID order: rows with a smaller ID are skipped and the scan
# stops at the first row with a larger ID.
#
# @param filepath [String] path to output
# @param species [String/Integer] species name or ID
# @raise [ArgumentError] if the species cannot be resolved to a positive ID
def extract_by_species(filepath, species = 'Homo sapiens')
  # Resolve the species before opening the output so an unknown species does
  # not leave an empty file behind.
  species = find_species_id(species).to_i
  raise ArgumentError, 'Illegal species given' unless species > 0

  # Coarse seek: sample one line roughly every 500 KB until a row of the
  # target species is hit. Stop at EOF instead of looping forever when no
  # sample lands inside the species' block (small file, small block).
  marker = /^#{species}\./
  @f_protein_links.pos = 0
  sampled_hit = false
  while (sample = @f_protein_links.gets)
    if sample =~ marker
      sampled_hit = true
      break
    end
    @f_protein_links.pos += 500_000
    @f_protein_links.gets # finish reading current line
  end

  # Step back a little more than one sampling stride so the scan starts
  # before the first row of the block. Without a hit, or when stepping back
  # would pass the beginning of the file, scan from the very start.
  rewind_to = sampled_hit ? @f_protein_links.pos - 501_000 : 0
  if rewind_to > 0
    @f_protein_links.pos = rewind_to
    @f_protein_links.gets # discard the partial line we landed in
  else
    @f_protein_links.pos = 0
  end

  counter = 0
  # Block form closes the output even if the scan raises.
  File.open(filepath, 'w') do |fout|
    @f_protein_links.each do |line|
      col = line.chomp.split(' ')
      # Explicit MatchData: named captures only become local variables when
      # the regexp literal is on the left-hand side of `=~`.
      head = /(?<species_id>\d+)\.(?<protein_id>.*)$/.match(col[0].to_s)
      species_id = head ? head[:species_id].to_i : 0 # header/blank rows => 0
      next if species_id < species
      break if species_id > species

      # Handle proteins' names
      col[0] = head[:protein_id]
      col[1] = col[1].to_s[/\d+\.(.*)$/, 1]

      fout.puts col.join("\t")
      counter += 1
    end
  end

  puts "Total #{counter} PPIs extracted"
end

Private Instance Methods

find_species_id(species) click to toggle source

Find species id by taxon_id, STRING_name_compact or official_name_NCBI

# File lib/biotcm/apps/string_processor.rb, line 88
# Look up a species in the species file and return its ID (first column).
#
# Each data row is matched on columns 0, 2 and 3 (taxon_id,
# STRING_name_compact and official_name_NCBI according to the description
# above); the first matching row wins. The first line of the file is skipped
# as a title line.
#
# @param species [String, Regexp, Integer] a pattern matched as a regular
#   expression, or an Integer that must equal a column exactly
# @return [String, nil] the first column of the first matching row, or nil
def find_species_id(species)
  # `Regexp.new` rejects Integers, so turn a numeric ID into an exact-match
  # pattern (also avoids 960 matching "9606").
  matcher = species.is_a?(Integer) ? /\A#{species}\z/ : Regexp.new(species)

  @f_species.pos = 0
  @f_species.gets # Title line
  @f_species.each do |line|
    col = line.chomp.split("\t")
    # Missing columns come back as nil, which simply never matches.
    return col[0] if col.values_at(0, 2, 3).any? { |str| matcher =~ str }
  end
  nil
end