class GeoCombine::GeoBlacklightHarvester
A class to harvest and index results from GeoBlacklight sites.
You can configure the sites to be harvested via a configure command:
GeoCombine::GeoBlacklightHarvester.configure do
  { SITE: { host: 'https://example.com', params: { f: { dct_provenance_s: ['SITE'] } } } }
end
The class configuration also allows for various other things to be configured:
- A debug parameter to print out details of what is being harvested and indexed
- Crawl delays for each page of results (globally or on a per-site basis)
- Solr's commitWithin parameter (defaults to 5000)
- A document transformer proc to modify a document before indexing (defaults to removing _version_, score, and timestamp)
Example: GeoCombine::GeoBlacklightHarvester.new('SITE').index
Attributes
document_transformer[W]
site[R]
site_key[R]
Public Class Methods
config()
click to toggle source
# File lib/geo_combine/geo_blacklight_harvester.rb, line 28
# Returns the configuration stored by +configure+, or an empty Hash when
# nothing has been configured yet.
def config
  return @config if @config

  {}
end
configure() { |block| ... }
click to toggle source
# File lib/geo_combine/geo_blacklight_harvester.rb, line 24
# Stores the value returned by the given block as the class configuration
# and returns that value. The block itself is passed to the block as its
# single argument.
def configure(&block)
  settings = yield(block)
  @config = settings
end
document_transformer()
click to toggle source
# File lib/geo_combine/geo_blacklight_harvester.rb, line 32
# Returns the configured document transformer, or a default lambda that
# deletes the '_version_', 'score' and 'timestamp' keys from the document
# it is given and returns that same document.
def document_transformer
  return @document_transformer if @document_transformer

  lambda do |document|
    %w[_version_ score timestamp].each { |field| document.delete(field) }
    document
  end
end
new(site_key)
click to toggle source
# File lib/geo_combine/geo_blacklight_harvester.rb, line 44
# Builds a harvester for one configured site.
#
# site_key - the key of the site in the class configuration. An exact key
#            match is tried first; if that finds nothing, a key with the
#            same name is accepted regardless of String/Symbol type, so
#            new('SITE') works against a config keyed with SITE:.
#
# Raises ArgumentError when no configuration entry matches the key.
def initialize(site_key)
  @site_key = site_key
  config = self.class.config
  @site = config[site_key]

  # Fall back to a name-based match so String and Symbol keys are
  # interchangeable; compares via to_s rather than converting input to_sym.
  if !@site && config.respond_to?(:find)
    key_name = site_key.to_s
    match = config.find { |key, _value| key.to_s == key_name }
    @site = match.last if match
  end

  raise ArgumentError, "Site key #{@site_key.inspect} is not configured for #{self.class.name}" unless @site
end
Public Instance Methods
index()
click to toggle source
# File lib/geo_combine/geo_blacklight_harvester.rb, line 51
# Harvests the configured site and sends its documents to Solr.
#
# Fetches page 1 of the site's JSON results, picks a response class via
# BlacklightResponseVersionFactory, and then, for every batch of documents
# that response object yields (presumably one batch per results page --
# confirm against the response classes):
#   * runs each document through the class-level document transformer,
#   * drops documents for which the transformer returned nil,
#   * posts the batch to Solr as JSON with commitWithin/overwrite params,
#   * sleeps for the crawl delay (in seconds) when one is configured.
#
# Progress is printed with puts when the :debug config entry is truthy.
def index
  debug = self.class.config[:debug]
  first_page_url = "#{base_url}&page=1"

  puts "Fetching page 1 @ #{first_page_url}" if debug
  response = JSON.parse(Net::HTTP.get(URI(first_page_url)))
  response_class = BlacklightResponseVersionFactory.call(response)
  # document_transformer always returns a callable (it falls back to a
  # default lambda), so it can be looked up once and needs no nil guard.
  transformer = self.class.document_transformer

  response_class.new(response: response, base_url: base_url).documents.each do |docs|
    docs.map! { |document| transformer.call(document) }
    # compact! (not compact) so that nil results are actually removed from
    # the batch that is counted and sent to Solr.
    docs.compact!

    puts "Adding #{docs.count} documents to solr" if debug
    solr_connection.update params: { commitWithin: commit_within, overwrite: true },
                           data: docs.to_json,
                           headers: { 'Content-Type' => 'application/json' }

    sleep(crawl_delay.to_i) if crawl_delay
  end
end
Private Instance Methods
base_url()
click to toggle source
# File lib/geo_combine/geo_blacklight_harvester.rb, line 179
# Builds the request URL for the site: its :host followed by '?' and the
# default params serialized with to_query.
def base_url
  host = site[:host]
  query = default_params.to_query
  "#{host}?#{query}"
end
commit_within()
click to toggle source
# File lib/geo_combine/geo_blacklight_harvester.rb, line 189
# Returns the :commit_within entry from the class configuration, or the
# string '5000' when that entry is missing or falsy.
def commit_within
  configured = self.class.config[:commit_within]
  return configured if configured

  '5000'
end
crawl_delay()
click to toggle source
# File lib/geo_combine/geo_blacklight_harvester.rb, line 193
# Returns the crawl delay to use for this site: the site's own
# :crawl_delay when it is set, otherwise the :crawl_delay entry of the
# class configuration (which may be nil).
def crawl_delay
  per_site = site[:crawl_delay]
  return per_site if per_site

  self.class.config[:crawl_delay]
end
default_params()
click to toggle source
# File lib/geo_combine/geo_blacklight_harvester.rb, line 197
# Returns the query parameters for the site: per_page 100 and format :json,
# overridden and extended by the site's :params entry.
#
# A site configured without :params simply gets the defaults; merging nil
# directly would raise a TypeError.
def default_params
  { per_page: 100, format: :json }.merge(site[:params] || {})
end
solr_connection()
click to toggle source
# File lib/geo_combine/geo_blacklight_harvester.rb, line 183
# Opens an RSolr connection using the net_http_persistent adapter. The
# target URL comes from the SOLR_URL environment variable, falling back to
# the local blacklight-core URL when the variable is not set.
def solr_connection
  fallback_url = 'http://127.0.0.1:8983/solr/blacklight-core'
  target_url = ENV['SOLR_URL'] || fallback_url
  RSolr.connect(url: target_url, adapter: :net_http_persistent)
end