class GeoCombine::GeoBlacklightHarvester

A class to harvest and index results from GeoBlacklight sites. You can configure the sites to be harvested via a configure command:

GeoCombine::GeoBlacklightHarvester.configure do

{
  SITE: { host: 'https://example.com', params: { f: { dct_provenance_s: ['SITE'] } } }
}

end

The class configuration also allows for various other things to be configured:

- A debug parameter to print out details of what is being harvested and indexed
- crawl delays for each page of results (globally or on a per site basis)
- Solr's commitWithin parameter (defaults to 5000)
- A document transformer proc to modify a document before indexing (defaults to removing _version_, score, and timestamp)

Example: GeoCombine::GeoBlacklightHarvester.new('SITE').index

Attributes

document_transformer[W]
site[R]
site_key[R]

Public Class Methods

config() click to toggle source
# File lib/geo_combine/geo_blacklight_harvester.rb, line 28
# Returns the stored configuration, or an empty hash when nothing (or a
# falsy value) has been stored in @config.
def config
  return {} unless @config

  @config
end
configure() { |block| ... } click to toggle source
# File lib/geo_combine/geo_blacklight_harvester.rb, line 24
# Runs the given block and stores whatever it returns as the configuration.
# The block itself is passed to the block as its single argument.
# Raises LocalJumpError (from yield) when no block is given.
def configure(&block)
  settings = yield(block)
  @config = settings
end
document_transformer() click to toggle source
# File lib/geo_combine/geo_blacklight_harvester.rb, line 32
# Returns the transformer stored in @document_transformer, or a default
# one-argument lambda that deletes the '_version_', 'score' and 'timestamp'
# keys from the document (in place) and returns that same document.
def document_transformer
  return @document_transformer if @document_transformer

  lambda do |doc|
    %w[_version_ score timestamp].each { |field| doc.delete(field) }
    doc
  end
end
new(site_key) click to toggle source
# File lib/geo_combine/geo_blacklight_harvester.rb, line 44
# Builds a harvester for the site registered under +site_key+ in the class
# configuration.
#
# Raises ArgumentError when the class configuration has no truthy entry for
# that key.
def initialize(site_key)
  @site_key = site_key
  @site = self.class.config[site_key]
  return if @site

  raise ArgumentError, "Site key #{@site_key.inspect} is not configured for #{self.class.name}"
end

Public Instance Methods

index() click to toggle source
# File lib/geo_combine/geo_blacklight_harvester.rb, line 51
# Harvests the configured site and indexes the results into Solr.
#
# Fetches page 1 of the site's JSON results, asks
# BlacklightResponseVersionFactory for the matching response class, and then
# walks every batch of documents that the response object yields. Each batch
# is run through the class-level document transformer, posted to Solr as
# JSON, and followed by a pause when a crawl delay is configured.
#
# Documents for which the transformer returns nil are dropped from the batch
# before it is counted and sent to Solr.
def index
  url = base_url
  first_page_url = "#{url}&page=1"

  puts "Fetching page 1 @ #{first_page_url}" if self.class.config[:debug]
  response = JSON.parse(Net::HTTP.get(URI(first_page_url)))
  response_class = BlacklightResponseVersionFactory.call(response)

  # Look the transformer up once rather than twice per document.
  transformer = self.class.document_transformer

  response_class.new(response: response, base_url: url).documents.each do |docs|
    docs.map! { |document| transformer.call(document) if transformer }
    # compact! (not compact) so that nil entries are actually removed from
    # the batch; the non-mutating form would discard its result.
    docs.compact!

    puts "Adding #{docs.count} documents to solr" if self.class.config[:debug]
    solr_connection.update params: { commitWithin: commit_within, overwrite: true },
                           data: docs.to_json,
                           headers: { 'Content-Type' => 'application/json' }

    sleep(crawl_delay.to_i) if crawl_delay
  end
end

Private Instance Methods

base_url() click to toggle source
# File lib/geo_combine/geo_blacklight_harvester.rb, line 179
# Builds the site's search URL: the configured host, a '?', and the default
# params serialized with #to_query.
def base_url
  host = site[:host]
  query = default_params.to_query

  "#{host}?#{query}"
end
commit_within() click to toggle source
# File lib/geo_combine/geo_blacklight_harvester.rb, line 189
# Returns the :commit_within value from the class configuration, or the
# string '5000' when none is set.
def commit_within
  configured = self.class.config[:commit_within]
  return configured if configured

  '5000'
end
crawl_delay() click to toggle source
# File lib/geo_combine/geo_blacklight_harvester.rb, line 193
# Returns the crawl delay for this harvester: the site's own :crawl_delay
# when set, otherwise the :crawl_delay from the class configuration (which
# may be nil).
def crawl_delay
  per_site = site[:crawl_delay]
  return per_site if per_site

  self.class.config[:crawl_delay]
end
default_params() click to toggle source
# File lib/geo_combine/geo_blacklight_harvester.rb, line 197
# Returns the query parameters used for the site's search request: the
# built-in defaults (100 results per page, JSON format) overlaid with the
# site's own :params. Site params win on key conflicts.
#
# A site configured without :params simply gets the defaults, instead of
# failing inside Hash#merge with a TypeError.
def default_params
  {
    per_page: 100,
    format: :json
  }.merge(site[:params] || {})
end
solr_connection() click to toggle source
# File lib/geo_combine/geo_blacklight_harvester.rb, line 183
# Opens an RSolr connection, using the :net_http_persistent adapter, to the
# URL in the SOLR_URL environment variable, or to the local blacklight-core
# URL when that variable is unset.
def solr_connection
  RSolr.connect(
    url: ENV.fetch('SOLR_URL', 'http://127.0.0.1:8983/solr/blacklight-core'),
    adapter: :net_http_persistent
  )
end