class BibCard::Crawler

Constants

SPARQL_ENDPOINTS

Public Class Methods

new(uri, repository) click to toggle source
# File lib/bib_card/crawler.rb, line 4
def initialize(uri, repository)
  @subject = RDF::URI(uri)
  @repository = repository
end

Public Instance Methods

alma_maters() click to toggle source
# File lib/bib_card/crawler.rb, line 328
def alma_maters
  sparql = "
  #{self.wikidata_sparql_prefixes}

  SELECT DISTINCT ?inst ?instLabel ?statement ?reference ?source ?sourceLabel
  WHERE
  {
    <#{self.wikidata_uri.to_s}> p:P69 ?statement .
    ?statement ps:P69 ?inst .
    ?inst rdfs:label ?instLabel .
    FILTER(langMatches(lang(?instLabel), \"en\"))
    OPTIONAL {
      ?statement prov:wasDerivedFrom ?reference .
      ?reference pref:P248 ?source .
      ?source rdfs:label ?sourceLabel .
      FILTER(langMatches(lang(?sourceLabel), \"en\"))
    }
  }
  "
  get_data(sparql, :wikidata)
end
birth_date() click to toggle source
# File lib/bib_card/crawler.rb, line 15
def birth_date
  stmt = @repository.query({subject: @subject, predicate: SCHEMA_BIRTHDATE}).first
  stmt.nil? ? nil : stmt.object
end
brief_bio() click to toggle source
# File lib/bib_card/crawler.rb, line 350
def brief_bio
  sparql = "
  #{self.wikidata_sparql_prefixes}

  SELECT DISTINCT ?description ?workLocation ?workLocationLabel
  WHERE
  {
    <#{self.wikidata_uri.to_s}> schema:description ?description .
    OPTIONAL {
      <#{self.wikidata_uri.to_s}> wdt:P937 ?workLocation .
    }
    SERVICE wikibase:label {
      bd:serviceParam wikibase:language \"en\" .
    }
    FILTER(langMatches(lang(?description), \"en\"))
  }
  "
  get_data(sparql, :wikidata).first
end
creator_graph() click to toggle source
# File lib/bib_card/crawler.rb, line 45
def creator_graph
  graph = RDF::Graph.new
  if @repository.size > 0
    @repository.query({subject: @subject, predicate: RDF.type}).each {|stmt| graph << stmt}
    @repository.query({subject: @subject, predicate: SCHEMA_NAME}).each {|stmt| graph << stmt}
    graph << [@subject, SCHEMA_BIRTHDATE, self.birth_date] if self.birth_date
    graph << [@subject, SCHEMA_DEATHDATE, self.death_date] if self.death_date
    graph << [@subject, SCHEMA_SAME_AS, self.loc_uri] if self.loc_uri
    graph << [@subject, SCHEMA_SAME_AS, self.dbpedia_uri] if self.dbpedia_uri
    graph << [@subject, SCHEMA_SAME_AS, self.getty_uri] if self.getty_uri
    graph << [@subject, SCHEMA_SAME_AS, self.wikidata_uri] if self.wikidata_uri
    graph << dbpedia_graph if self.dbpedia_uri
    graph << getty_note_graph if self.getty_uri
    graph << wikidata_graph if self.wikidata_uri
  end
  graph
end
dbpedia_graph() click to toggle source
# File lib/bib_card/crawler.rb, line 63
def dbpedia_graph
  graph = RDF::Graph.new
  begin
    graph << profile_graph
    graph << influence_graph
    graph << film_graph
  rescue RestClient::RequestTimeout
    BibCard.logger.warn "DBPedia failed to respond. SPARQL query request timed out after 5 seconds for #{@current_query}."
  rescue Exception => e
    BibCard.logger.warn "DBPedia failed to respond. Processing data for SPARQL request: #{@current_query}. Error: #{e.message}"
  end
  graph
end
dbpedia_profile() click to toggle source
# File lib/bib_card/crawler.rb, line 189
def dbpedia_profile
  sparql = "
  #{self.dbpedia_sparql_prefixes}

  SELECT ?abstract ?foundedDate ?location ?thumbnail ?depiction
  WHERE {
    OPTIONAL { <#{self.dbpedia_uri}> dbo:abstract ?abstract . }
    OPTIONAL {<#{self.dbpedia_uri}> dbp:location ?location . }
    OPTIONAL { <#{self.dbpedia_uri}> dbp:foundedDate ?foundedDate . }
    OPTIONAL { <#{self.dbpedia_uri}> dbo:thumbnail ?thumbnail . }
    OPTIONAL { <#{self.dbpedia_uri}> foaf:depiction ?depiction . }
    FILTER(langMatches(lang(?abstract), \"en\"))
  }
  "
  get_data(sparql, :dbpedia).first
end
dbpedia_uri() click to toggle source
# File lib/bib_card/crawler.rb, line 30
def dbpedia_uri
  stmt = @repository.query({subject: @subject, predicate: SCHEMA_SAME_AS}).select {|s| s.object.to_s.match('http://dbpedia.org/resource')}.first
  stmt.nil? ? nil : stmt.object
end
death_date() click to toggle source
# File lib/bib_card/crawler.rb, line 20
def death_date
  stmt = @repository.query({subject: @subject, predicate: SCHEMA_DEATHDATE}).first
  stmt.nil? ? nil : stmt.object
end
film_appearances() click to toggle source
# File lib/bib_card/crawler.rb, line 206
def film_appearances
  sparql = "
  #{self.dbpedia_sparql_prefixes}

  SELECT ?film ?filmName ?filmAbstract
  WHERE {
    ?film dbo:starring <#{self.dbpedia_uri}> .
    ?film rdfs:label ?filmName .
    ?film dbo:abstract ?filmAbstract .
    FILTER(langMatches(lang(?filmName), \"en\"))
    FILTER(langMatches(lang(?filmAbstract), \"en\"))
  }
  "
  get_data(sparql, :dbpedia)
end
film_graph() click to toggle source
# File lib/bib_card/crawler.rb, line 104
def film_graph
  @current_query = "film graph"
  graph = RDF::Graph.new
  self.film_appearances.each do |appearance|
    film = RDF::URI.new(appearance["film"]["value"])
    graph << [film, DBO_STARRING, self.dbpedia_uri]
    graph << [film, RDF::RDFS.label, appearance["filmName"]["value"]]
    graph << [film, DBO_ABSTRACT, appearance["filmAbstract"]["value"]]
  end
  graph
end
getty_note_graph() click to toggle source
# File lib/bib_card/crawler.rb, line 222
def getty_note_graph
  @current_query = "getty note graph"
  graph = RDF::Graph.new
  begin
    getty_subject = self.getty_uri
    self.getty_scope_notes.each do |scope_note|
      # Add the scope note itself
      scope_note_uri = RDF::URI.new(scope_note["scopeNote"]["value"])
      graph << [getty_subject, SKOS_SCOPE_NOTE, scope_note_uri]
      graph << [scope_note_uri, RDF.value, scope_note["scopeNoteValue"]["value"]]

      # Add the sources/citations for the scope note
      source_uri = RDF::URI.new(scope_note["source"]["value"])
      graph << [scope_note_uri, DC_SOURCE, source_uri]
      if scope_note["sourceShortTitle"]
        graph << [source_uri, BIBO_SHORT_TITLE, scope_note["sourceShortTitle"]["value"]]
      else
        parent_uri = RDF::URI.new(scope_note["parent"]["value"])
        graph << [source_uri, DC_IS_PART_OF, parent_uri]
        graph << [source_uri, RDF.type, BIBO_DOCUMENT_PART]
        graph << [parent_uri, BIBO_SHORT_TITLE, scope_note["parentShortTitle"]["value"]]
      end
    end
  rescue RestClient::RequestTimeout
    BibCard.logger.warn "Getty failed to respond. SPARQL query request timed out after 5 seconds for #{@current_query}."
  rescue Exception => e
    BibCard.logger.warn "Getty failed to respond. Processing data for SPARQL request: #{@current_query}. Error: #{e.message}"
  end
  graph
end
getty_scope_notes() click to toggle source
# File lib/bib_card/crawler.rb, line 253
def getty_scope_notes
  sparql = "
  PREFIX ulan: <http://vocab.getty.edu/ulan/>
  PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
  PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
  PREFIX dct: <http://purl.org/dc/terms/>
  PREFIX bibo: <http://purl.org/ontology/bibo/>

  SELECT ?scopeNote ?scopeNoteValue ?source ?sourceShortTitle ?parent ?parentShortTitle
  WHERE {
    <#{self.getty_uri.to_s}> skos:scopeNote ?scopeNote .
    ?scopeNote rdf:value ?scopeNoteValue .
    ?scopeNote dct:source ?source .
    OPTIONAL { ?source bibo:shortTitle ?sourceShortTitle . }
    OPTIONAL {
      ?source dct:isPartOf ?parent .
      ?parent bibo:shortTitle ?parentShortTitle .
    }
  }
  "
  get_data(sparql, :getty)
end
getty_uri() click to toggle source
# File lib/bib_card/crawler.rb, line 35
def getty_uri
  stmt = @repository.query({subject: @subject, predicate: SCHEMA_SAME_AS}).select {|s| s.object.to_s.match('vocab.getty.edu')}.first
  stmt.nil? ? nil : RDF::URI.new( stmt.object.to_s.gsub('-agent', '') )
end
influence_graph() click to toggle source
# File lib/bib_card/crawler.rb, line 77
def influence_graph
  graph = RDF::Graph.new
  [:influences, :influenced].each do |relationship|
    m = self.method(relationship)
    m.call.each do |influence|
      if relationship == :influences
        field = "influence"
        predicate = DBO_INFLUENCED_BY
      else
        field = "influenced"
        predicate = DBO_INFLUENCED
      end
      influence_entity = RDF::URI.new(influence[field]["value"])
      graph << [self.dbpedia_uri, predicate, influence_entity]
      graph << [influence_entity, RDFS_LABEL, influence["#{field}Label"]["value"]]
      if influence["#{field}GivenName"] and influence["#{field}Surname"]
        graph << [influence_entity, FOAF_GIVEN_NAME, influence["#{field}GivenName"]["value"]]
        graph << [influence_entity, FOAF_SURNAME, influence["#{field}Surname"]["value"]]
      end
      if influence["influenceSameAs"]
        graph << [influence_entity, RDF::OWL.sameAs, influence["#{field}SameAs"]["value"]]
      end
    end
  end
  graph
end
influenced() click to toggle source
# File lib/bib_card/crawler.rb, line 160
def influenced
  @current_query = "influence upon graph"
  sparql = "
  #{self.dbpedia_sparql_prefixes}

  SELECT ?influenced ?influencedGivenName ?influencedSurname ?influencedSameAs ?influencedLabel
  WHERE {
    {
      <#{self.dbpedia_uri}> dbo:influenced ?influenced .
    }
    UNION
    {
      ?influenced dbo:influencedBy <#{self.dbpedia_uri}> .
    }
    ?influenced rdfs:label ?influencedLabel .
    OPTIONAL {
      ?influenced foaf:givenName ?influencedGivenName .
      ?influenced foaf:surname ?influencedSurname .
    }
    OPTIONAL {
      ?influenced owl:sameAs ?influencedSameAs .
      FILTER regex(STR(?influencedSameAs), \"viaf.org\").
    }
    FILTER (lang(?influencedLabel) = 'en')
  }
  "
  get_data(sparql, :dbpedia)
end
influences() click to toggle source
# File lib/bib_card/crawler.rb, line 131
def influences
  @current_query = "influences graph"
  sparql = "
  #{self.dbpedia_sparql_prefixes}

  SELECT DISTINCT ?influence ?influenceGivenName ?influenceSurname ?influenceSameAs ?influenceLabel
  WHERE {
    {
      ?influence dbo:influenced <#{self.dbpedia_uri}> .
    }
    UNION
    {
      <#{self.dbpedia_uri}> dbo:influencedBy ?influence .
    }
    ?influence rdfs:label ?influenceLabel .
    OPTIONAL {
      ?influence foaf:givenName ?influenceGivenName .
      ?influence foaf:surname ?influenceSurname .
    }
    OPTIONAL {
      ?influence owl:sameAs ?influenceSameAs .
      FILTER regex(STR(?influenceSameAs), \"viaf.org\").
    }
    FILTER (lang(?influenceLabel) = 'en')
  }
  "
  get_data(sparql, :dbpedia)
end
loc_uri() click to toggle source
# File lib/bib_card/crawler.rb, line 25
def loc_uri
  stmt = @repository.query({subject: @subject, predicate: SCHEMA_SAME_AS}).select {|s| s.object.to_s.match('http://id.loc.gov/authorities/names/')}.first
  stmt.nil? ? nil : stmt.object
end
notable_works() click to toggle source
# File lib/bib_card/crawler.rb, line 370
def notable_works
  sparql = "
  #{self.wikidata_sparql_prefixes}

  SELECT DISTINCT ?notableWork ?notableWorkLabel ?isbn ?oclcNumber
  WHERE
  {
    <#{self.wikidata_uri.to_s}> wdt:P800 ?notableWork .
            OPTIONAL {
        ?notableWork wdt:P212 ?isbn .
        ?notableWork wdt:P243 ?oclcNumber .
      }
    SERVICE wikibase:label {
            bd:serviceParam wikibase:language \"en\" .
    }
  }
  "
  notable_works = get_data(sparql, :wikidata)
  notable_works.select {|work| work["notableWorkLabel"] != nil and !work["notableWorkLabel"]["value"].match(/^Q\d+$/)}
end
profile_graph() click to toggle source
# File lib/bib_card/crawler.rb, line 116
def profile_graph
  @current_query = "profile graph"
  graph = RDF::Graph.new
  dbpedia_subject = self.dbpedia_uri
  profile = self.dbpedia_profile
  if profile
    graph << [dbpedia_subject, DBO_ABSTRACT, profile["abstract"]["value"]] if profile["abstract"]
    graph << [dbpedia_subject, DBP_FOUNDED, profile["foundedDate"]["value"]] if profile["foundedDate"]
    graph << [dbpedia_subject, DBP_LOCATION, profile["location"]["value"]] if profile["location"]
    graph << [dbpedia_subject, DBO_THUMBNAIL, profile["thumbnail"]["value"]] if profile["thumbnail"]
    graph << [dbpedia_subject, FOAF_DEPICTION, profile["depiction"]["value"]] if profile["depiction"]
  end
  graph
end
wikidata_graph() click to toggle source
# File lib/bib_card/crawler.rb, line 276
def wikidata_graph
  graph = RDF::Graph.new
  begin
    wikidata_subject = self.wikidata_uri
    self.alma_maters.each do |alma_mater|
      @current_query = "alma maters graph"
      am_inst_uri   = RDF::URI.new(alma_mater["inst"]["value"])
      am_edu_stmt   = RDF::URI.new(alma_mater["statement"]["value"])

      graph << [wikidata_subject, WDT_EDUCATED_AT, am_inst_uri]
      graph << [am_inst_uri, RDF::RDFS.label, alma_mater["instLabel"]["value"]]
      graph << [wikidata_subject, WDP_EDUCATED_AT, am_edu_stmt]
      graph << [am_edu_stmt, WDPS_STMT_EDU_AT, am_inst_uri]

      # Not all assertions have references/citations
      if alma_mater["reference"]
        am_stmt_ref   = RDF::URI.new(alma_mater["reference"]["value"])
        am_ref_source = RDF::URI.new(alma_mater["source"]["value"])

        graph << [am_edu_stmt, PROV_DERIVED_FROM, am_stmt_ref]
        graph << [am_stmt_ref, WDR_STATED_IN, am_ref_source]
        graph << [am_ref_source, RDF::RDFS.label, alma_mater["sourceLabel"]["value"]]
      end
    end

    bio = self.brief_bio
    if bio
      @current_query = "brief bio graph"
      graph << [wikidata_subject, SCHEMA_DESCRIPTION, bio["description"]["value"]] if bio["description"]
      if bio["workLocation"]
        work_loc_uri = RDF::URI.new(bio["workLocation"]["value"])
        graph << [wikidata_subject, WDT_WORK_LOCATION, work_loc_uri]
        graph << [work_loc_uri, RDF::RDFS.label, bio["workLocationLabel"]["value"]]
      end
    end

    self.notable_works.each do |work|
      @current_query = "notable works graph"
      work_uri = RDF::URI.new(work["notableWork"]["value"])
      graph << [wikidata_subject, WDT_NOTABLE_WORKS, work_uri]
      graph << [work_uri, RDF::RDFS.label, work["notableWorkLabel"]["value"]]
      graph << [work_uri, WDT_ISBN, work["isbn"]["value"]] if work["isbn"]
      graph << [work_uri, WDT_OCLC_NUMBER, work["oclcNumber"]["value"]] if work["oclcNumber"]
    end
  rescue RestClient::RequestTimeout
    BibCard.logger.warn "WikiData failed to respond. SPARQL query request timed out after 5 seconds for #{@current_query}."
  rescue Exception => e
    BibCard.logger.warn "WikiData failed to respond. Processing data for SPARQL request: #{@current_query}. Error: #{e.message}"
  end
  graph
end
wikidata_uri() click to toggle source
# File lib/bib_card/crawler.rb, line 40
def wikidata_uri
  stmt = @repository.query({subject: @subject, predicate: SCHEMA_SAME_AS}).select {|s| s.object.to_s.match('http://www.wikidata.org/entity')}.first
  stmt.nil? ? nil : stmt.object
end

Protected Instance Methods

dbpedia_sparql_prefixes() click to toggle source
# File lib/bib_card/crawler.rb, line 410
def dbpedia_sparql_prefixes
  "
  PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
  PREFIX owl: <http://www.w3.org/2002/07/owl#>
  PREFIX foaf: <http://xmlns.com/foaf/0.1/>
  PREFIX dbo: <http://dbpedia.org/ontology/>
  "
end
get_data(sparql, source) click to toggle source
# File lib/bib_card/crawler.rb, line 393
def get_data(sparql, source)
  url = SPARQL_ENDPOINTS[source] + URI::encode_www_form_component(sparql.gsub(/\n/, ' '))
  data = RestClient::Request.execute(method: :get, url: url, headers: {accept: "application/sparql-results+json"}, timeout: 5)
  parsed_data = JSON.parse data
  parsed_data["results"]["bindings"]
end
wikidata_sparql_prefixes() click to toggle source
# File lib/bib_card/crawler.rb, line 400
def wikidata_sparql_prefixes
  "
  PREFIX wikibase: <http://wikiba.se/ontology#>
  PREFIX p: <http://www.wikidata.org/prop/>
  PREFIX pref: <http://www.wikidata.org/prop/reference/>
  PREFIX ps: <http://www.wikidata.org/prop/statement/>
  PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
  "
end