class BioTCM::Apps::PubmedGeneMiner

To mine gene relationships from PubMed

Example Usage

miner = BioTCM::Apps::PubmedGeneMiner.new
miner.mine_offline('(IBD[Title/Abstract]) OR (CRC[Title/Abstract]) AND Cancer[Title/Abstract] AND Obesity[Title/Abstract] AND IL-2[Title/Abstract]')
# => {"MID1"=>["25975416"], "IL2"=>["25975416"], "HR"=>["25975416"], "NDUFB6"=>["25975416"], "CXCL8"=>["25975416"]}

Constants

VERSION

Current version

Public Class Methods

new(params = {}) click to toggle source

Set up a new miner. @param params options for the new miner @option params :gene_set [Array] ([…]) the set of genes of interest @option params :branching_point [Fixnum] (10000) number of abstracts above which the online strategy is used instead of the offline one @option params :mining_strategy [Symbol] (nil) use the given strategy regardless of the number of abstracts; possible values are:

- nil
- :online
- :offline
# File lib/biotcm/apps/pubmed_gene_miner.rb, line 21
# Set up a new miner
#
# @param params [Hash] options for the new miner
# @option params :gene_set [Array] genes to restrict results to; when absent,
#   the set is derived from the HGNC symbol tables
# @option params :branching_point [Fixnum] (10000) stored for later use by #mine
# @option params :mining_strategy [Symbol] (nil) stored for later use by #mine
def initialize(params = {})
  BioTCM::Databases::HGNC.ensure

  @branching_point = params[:branching_point] || 10_000
  @mining_strategy = params[:mining_strategy]

  requested = params[:gene_set]
  @gene_set =
    if requested
      # Caller-supplied genes: run them through to_formal_symbols
      # (presumably maps to official symbols -- confirm) and drop empty strings
      requested.to_formal_symbols - ['']
    else
      hgnc = BioTCM::Databases::HGNC
      # Each blacklist entry becomes one parenthesised alternative of a single regexp
      blacklist = Regexp.new("(#{BioTCM::Apps::GeneDetector::DEFAULT_GENE_BLACKLIST.join(')|(')})")
      # All known symbols minus the ambiguous ones, minus anything blacklisted
      unambiguous = hgnc.symbol2hgncid.keys - hgnc.ambiguous_symbol.keys
      unambiguous.reject { |symbol| symbol =~ blacklist }
    end
end

Public Instance Methods

mine(query) click to toggle source

Mine gene relationships

Mining strategy will be decided by given :mining_strategy or given :branching_point

# File lib/biotcm/apps/pubmed_gene_miner.rb, line 40
# Mine gene relationships
#
# Uses the strategy fixed by @mining_strategy when it is :online or :offline.
# Otherwise the strategy is picked from the size of the query result: more
# than @branching_point abstracts means online, anything else offline.
#
# @param query passed through to #as_medline, #mine_online and #mine_offline
# @return whatever the selected mining method returns
def mine(query)
  strategy = @mining_strategy

  # No explicit (or an unrecognised) strategy: decide from the abstract count
  unless %i[online offline].include?(strategy)
    strategy = as_medline(query).count > @branching_point ? :online : :offline
  end

  strategy == :online ? mine_online(query) : mine_offline(query)
end
mine_offline(query) click to toggle source

Mine gene relationships offline

Download abstracts and then count genes

# File lib/biotcm/apps/pubmed_gene_miner.rb, line 67
# Mine gene relationships offline
#
# Downloads the abstracts matching the query into a text file, then scans that
# file line by line: a "PMID- <digits>" line starts a new article, an "AB  -"
# line starts an abstract, and following lines that begin with whitespace are
# continuations of that abstract. Every completed abstract is split into
# sentences, each sentence is handed to the gene detector, and the current
# PubMed ID is recorded under every distinct gene detected.
#
# The data file is always closed, an abstract that is the very last field of
# the file is processed instead of raising on EOF, and the line that ends an
# abstract is parsed normally rather than being dropped.
#
# @param query passed to #as_medline; the result must respond to
#   download_abstracts(path)
# @return [Hash] detected gene => Array of PubMed IDs, restricted to genes
#   contained in @gene_set
def mine_offline(query)
  @gene_detector = BioTCM::Apps::GeneDetector.new

  datapath = BioTCM.path_to("tmp/PubmedGeneMiner.#{BioTCM.stamp}.txt")
  as_medline(query).download_abstracts(datapath)

  res = {}
  counter = 0
  pubmed_id = nil
  abstract = nil # non-nil only while the lines of an AB field are being collected

  # Detect genes in a completed abstract and file them under its article
  record = lambda do |text, id|
    # Split into sentences, then identify genes sentence by sentence
    genes = text.split(/[.?!][\s]?/).map { |sentence| @gene_detector.detect(sentence) }
    genes.flatten.uniq.each { |gene| (res[gene] ||= []) << id }
  end

  # Block form guarantees the handle is closed even if detection raises
  File.open(datapath, 'r:utf-8') do |f_abstracts|
    f_abstracts.each_line do |raw_line|
      line = raw_line.chomp

      if abstract
        if line =~ /^\s/
          # Continuation line: collapse its leading indentation to one space
          abstract << line.sub(/^\s+/, ' ')
          next
        end

        # The AB field has ended; this line still needs normal handling below
        record.call(abstract, pubmed_id)
        abstract = nil
      end

      if /^PMID- +(?<pmid>\d+)/ =~ line
        counter += 1
        pubmed_id = pmid
        BioTCM.logger.info('PubmedGeneMiner') { "Analyzing article \##{counter}..." }
      elsif line =~ /^AB  -/
        abstract = line.sub(/^AB  -\s*/, '')
      end
    end
  end

  # The file may end while an abstract is still open
  record.call(abstract, pubmed_id) if abstract

  res.select { |gene, _ids| @gene_set.include?(gene) }
end
mine_online(query) click to toggle source

Mine gene relationships online

Query Medline by enumerating all term-gene pairs

# File lib/biotcm/apps/pubmed_gene_miner.rb, line 56
# Mine gene relationships online
#
# Issues one Medline query per gene in @gene_set, each combining the original
# search term with that gene restricted to Title/Abstract.
#
# @param query passed to #as_medline; the result must respond to term
# @return [Hash] gene => result of fetch_pubmed_ids for the combined query
def mine_online(query)
  term = as_medline(query).term

  @gene_set.each_with_object({}) do |gene, hits|
    combined = "(#{term}) AND #{gene}[Title/Abstract]"
    hits[gene] = BioTCM::Databases::Medline.new(combined).fetch_pubmed_ids
  end
end

Private Instance Methods

as_medline(query) click to toggle source
# File lib/biotcm/apps/pubmed_gene_miner.rb, line 107
# Coerce a query into a Medline object
#
# @param query an existing BioTCM::Databases::Medline (returned untouched) or
#   anything accepted by BioTCM::Databases::Medline.new
# @return [BioTCM::Databases::Medline]
def as_medline(query)
  return query if query.is_a?(BioTCM::Databases::Medline)

  BioTCM::Databases::Medline.new(query)
end