class Reader

Attributes

names[RW]
pubchem_compound_ids[RW]
pubchem_substance_ids[RW]

Public Class Methods

new(names_filename=nil, pubchem_substance_ids_filename=nil, pubchem_compound_ids_filename=nil) click to toggle source
# File lib/pubchem/reader.rb, line 12
# Creates a reader with a native Jaro-Winkler matcher.
#
# The three filenames are handed to initialize_from_files; when that call
# returns a falsy value (no filenames were supplied) the reader starts out
# with three empty indexes instead.
#
# names_filename                 - file holding the names index (optional)
# pubchem_substance_ids_filename - file holding the substance id index (optional)
# pubchem_compound_ids_filename  - file holding the compound id index (optional)
def initialize(names_filename=nil,
               pubchem_substance_ids_filename=nil,
               pubchem_compound_ids_filename=nil)
  @fuzzy_matcher = FuzzyStringMatch::JaroWinkler.create(:native)

  restored = initialize_from_files(names_filename,
                                   pubchem_substance_ids_filename,
                                   pubchem_compound_ids_filename)

  unless restored
    # Each index hands out a fresh Set the first time a key is accessed.
    @names = Hash.new { |index, key| index[key] = Set.new }
    @pubchem_substance_ids = Hash.new { |index, key| index[key] = Set.new }
    @pubchem_compound_ids = Hash.new { |index, key| index[key] = Set.new }
  end
end

Public Instance Methods

add_name(name) click to toggle source
# File lib/pubchem/reader.rb, line 140
# Records +name+ for the record currently being parsed.
#
# The name is filed in @names under its short code, and @pubchem_id is added
# to the id index selected by @current_type ("substance" or "compound").
# Nil and empty names are ignored.
#
# Raises RuntimeError ("Unknown substance") when @current_type is neither
# "substance" nor "compound". The id index is chosen before anything is
# stored, so a failed call leaves every index untouched.
def add_name(name)
  return if name.nil? || name.empty?

  ids_by_name =
    case @current_type
    when "substance" then @pubchem_substance_ids
    when "compound"  then @pubchem_compound_ids
    else raise "Unknown substance"
    end

  # Names are bucketed by short code to keep lookups small. `||=` also
  # covers an index that has no default block.
  code = short_code(name)
  (@names[code] ||= Set.new).add(name)
  (ids_by_name[name] ||= Set.new).add(@pubchem_id)
end
fuzzy_name_lookup(lookup_name, threshold) click to toggle source
# File lib/pubchem/reader.rb, line 156
# Finds the indexed name that best matches +lookup_name+.
#
# Only names stored under the same short code as +lookup_name+ are
# considered.
#
# lookup_name - the name to search for
# threshold   - similarity a fuzzy match must exceed to be accepted; a value
#               of 1.0 (or more) restricts the search to exact matches
#
# Returns +lookup_name+ itself on an exact match, otherwise the closest
# indexed name whose similarity exceeds +threshold+, otherwise nil. The
# result is always a name (never an id) because match_list_of_names stores
# it and retrieve_ids later uses it as a key into the id indexes.
def fuzzy_name_lookup(lookup_name, threshold)
  return nil if lookup_name.nil? || lookup_name.empty?

  # fetch instead of [] so that a miss does not create an empty bucket in an
  # index built with a default block.
  candidates = @names.fetch(short_code(lookup_name), nil)
  return nil if candidates.nil?

  # Optimistically check for an exact name match.
  return lookup_name if candidates.include?(lookup_name)
  return nil if threshold >= 1.0

  closest_name = nil
  closest_distance = 0.0

  candidates.each do |name|
    distance = @fuzzy_matcher.getDistance(lookup_name, name)
    next unless distance > closest_distance

    closest_name = name
    closest_distance = distance
  end

  closest_distance > threshold ? closest_name : nil
end
initialize_from_files(names_filename, pubchem_substance_ids_filename, pubchem_compound_ids_filename) click to toggle source
# File lib/pubchem/reader.rb, line 30
# Loads the three indexes from files with Ox.load_file.
#
# names_filename                 - file holding the names index
# pubchem_substance_ids_filename - file holding the substance id index
# pubchem_compound_ids_filename  - file holding the compound id index
#
# Returns nil without touching any state when no filename is given;
# otherwise returns whatever Ox.load_file returned for the compound id file
# (initialize treats a truthy result as "loaded").
#
# Raises RuntimeError when only some of the three filenames are given.
def initialize_from_files(names_filename,
                          pubchem_substance_ids_filename,
                          pubchem_compound_ids_filename)
  filenames = [names_filename,
               pubchem_substance_ids_filename,
               pubchem_compound_ids_filename]

  return nil if filenames.none?
  raise "All three filenames required" unless filenames.all?

  @names = Ox.load_file(names_filename)
  @pubchem_substance_ids = Ox.load_file(pubchem_substance_ids_filename)
  @pubchem_compound_ids = Ox.load_file(pubchem_compound_ids_filename)
end
match_list_of_names(names, threshold=0.99) click to toggle source
# File lib/pubchem/reader.rb, line 182
# Runs fuzzy_name_lookup for every entry of +names+.
#
# names     - enumerable of names to look up
# threshold - value passed through to fuzzy_name_lookup (default 0.99)
#
# Returns a Hash mapping each input name to its lookup result; the same Hash
# is kept in @matched_names for the retrieve_* methods.
def match_list_of_names(names, threshold=0.99)
  @matched_names = names.each_with_object({}) do |candidate, matches|
    matches[candidate] = fuzzy_name_lookup(candidate, threshold)
  end
end
parse_compound(compound) click to toggle source
# File lib/pubchem/reader.rb, line 87
# Reads one compound node: stores its id in @pubchem_id, then passes every
# "PC-Compound_props" node to parse_property.
#
# compound - node responding to #css
def parse_compound(compound)
  # The line breaks and indentation inside the literal are part of the
  # selector string and are kept exactly as they were.
  cid_nodes = compound.css("PC-Compound_id
                              PC-CompoundType
                              PC-CompoundType_id
                              PC-CompoundType_id_cid")
  @pubchem_id = cid_nodes.text.to_i

  property_nodes = compound.css("PC-Compound_props")
  property_nodes.each { |props| parse_property(props) }
end
parse_info_data(info_data) click to toggle source
# File lib/pubchem/reader.rb, line 122
# Extracts a name from one "PC-InfoData" node and passes it to add_name.
#
# The string value is used as the name only when the URN label is "SMILES"
# or "IUPAC Name"; for every other label add_name receives nil.
#
# info_data - node responding to #css
def parse_info_data(info_data)
  # The line breaks and indentation inside the literals are part of the
  # selector strings and are kept exactly as they were.
  label_nodes = info_data.css("PC-InfoData_urn
                             PC-Urn
                             PC-Urn_label")
  label = label_nodes.first.text

  name =
    if ["SMILES", "IUPAC Name"].include?(label)
      info_data.css("PC-InfoData_value
                          PC-InfoData_value_sval").first.text
    end

  add_name(name)
end
parse_property(property) click to toggle source
# File lib/pubchem/reader.rb, line 114
# Passes every "PC-InfoData" node found under +property+ to parse_info_data.
#
# property - node responding to #css
def parse_property(property)
  info_nodes = property.css("PC-InfoData")
  info_nodes.each { |node| parse_info_data(node) }
end
parse_substance(substance) click to toggle source
# File lib/pubchem/reader.rb, line 100
# Reads one substance node: stores its id in @pubchem_id, then registers the
# text of every synonym node through add_name.
#
# substance - node responding to #css
def parse_substance(substance)
  # The line breaks and indentation inside the literals are part of the
  # selector strings and are kept exactly as they were.
  sid_nodes = substance.css("PC-Substance_sid
                               PC-ID
                               PC-ID_id")
  @pubchem_id = sid_nodes.text.to_i

  synonym_nodes = substance.css("PC-Substance_synonyms
                 PC-Substance_synonyms_E")
  synonym_nodes.each { |synonym| add_name(synonym.text) }
end
read(xml_filepath, type: nil) click to toggle source
# File lib/pubchem/reader.rb, line 57
# Parses a PubChem XML file and feeds every record to the matching parser.
#
# xml_filepath - path of the XML file
# type         - :compound or :substance (Symbol or String). When nil, the
#                type is inferred from the file's basename, which must start
#                with "compound" or "substance" (case-insensitive).
#
# Raises RuntimeError ("Cannot infer pubchem type") when no type is given
# and the basename has neither prefix, and RuntimeError ("Unknown type")
# when an explicit type is not supported. Both checks run before the file is
# opened.
def read(xml_filepath, type: nil)
  # Infer only when the caller did not choose a type; an explicit type must
  # never fall into the "cannot infer" branch.
  if type.nil?
    basename = File.basename(xml_filepath).downcase
    type = ["compound", "substance"].find { |prefix| basename.start_with?(prefix) }
    raise "Cannot infer pubchem type" if type.nil?
  end

  type = type.to_s.to_sym
  raise "Unknown type" unless [:compound, :substance].include?(type)

  # Block form closes the file even when parsing raises.
  doc = File.open(xml_filepath) { |file| Nokogiri::XML(file) }

  # add_name reads @current_type to pick the id index.
  @current_type = type.to_s

  case type
  when :compound
    doc.css("PC-Compounds PC-Compound").each do |compound|
      parse_compound(compound)
    end
  when :substance
    doc.css("PC-Substances PC-Substance").each do |substance|
      parse_substance(substance)
    end
  end
end
retrieve_compound_ids() click to toggle source
# File lib/pubchem/reader.rb, line 215
# Resolves the matched names against the compound id index.
#
# Returns the result of retrieve_ids for @pubchem_compound_ids.
def retrieve_compound_ids
  retrieve_ids(@pubchem_compound_ids)
end
retrieve_ids(collection) click to toggle source
# File lib/pubchem/reader.rb, line 189
# Maps every input name in @matched_names to an id from +collection+.
#
# collection - Hash mapping a name to a collection of ids
#
# Returns a Hash of input name => first id stored for its matched name.
# Input names without a match, and matched names that have no ids in
# +collection+, are left out. A warning is printed when a matched name has
# more than one id.
#
# Raises RuntimeError when match_list_of_names has not been called yet.
def retrieve_ids(collection)
  unless @matched_names
    raise "@matched_names required, see #{self.class}#match_list_of_names"
  end

  @matched_names.each_with_object({}) do |(input_name, matched_name), found|
    next unless matched_name

    # fetch instead of []: a miss must neither create an empty entry in an
    # index built with a default block nor hand back nil to call #size on.
    ids = collection.fetch(matched_name, nil)
    next if ids.nil? || ids.empty?

    puts "WARNING: Multiple matching sets" if ids.size > 1

    collection_id = ids.first
    found[input_name] = collection_id if collection_id
  end
end
retrieve_substance_ids() click to toggle source
# File lib/pubchem/reader.rb, line 211
# Resolves the matched names against the substance id index.
#
# Returns the result of retrieve_ids for @pubchem_substance_ids.
def retrieve_substance_ids
  retrieve_ids(@pubchem_substance_ids)
end
save(names_filename, pubchem_substance_ids_filename, pubchem_compound_ids_filename) click to toggle source
# File lib/pubchem/reader.rb, line 47
# Writes the three indexes to files with Ox.to_file, using indent 0.
#
# names_filename                 - destination for @names
# pubchem_substance_ids_filename - destination for @pubchem_substance_ids
# pubchem_compound_ids_filename  - destination for @pubchem_compound_ids
def save(names_filename,
         pubchem_substance_ids_filename,
         pubchem_compound_ids_filename)
  dump_options = { indent: 0 }

  Ox.to_file(names_filename, @names, dump_options)
  Ox.to_file(pubchem_substance_ids_filename, @pubchem_substance_ids, dump_options)
  Ox.to_file(pubchem_compound_ids_filename, @pubchem_compound_ids, dump_options)
end
short_code(name) click to toggle source
# File lib/pubchem/reader.rb, line 219
# Returns the bucket key for +name+: its first three characters (fewer when
# the name is shorter), lower-cased.
def short_code(name)
  prefix = name[0, 3]
  prefix.downcase
end