class Rake::BlacklightSitemapTask

Attributes

base_filename[RW]

Base filename for the generated sitemap files. Set this when the files will be moved to a location that hosts other sitemaps, so that these sitemaps do not overwrite the others.

changefreq[RW]

value for changefreq for each page listed

gzip[RW]

Whether the sitemap files should be gzipped. Requires the command-line tool gzip.

lastmod_field[RW]

Solr field containing the date used as the lastmod date for the page. Currently the value must be a string in W3C Datetime format or YYYY-MM-DD.

max[RW]

The maximum number of resources to list within a single sitemap. Defaults to 50,000.

priority_field[RW]

Solr field to use to provide a priority for this resource

public_url[RW]

base url used for public directory where sitemaps will be placed

qt[RW]

Solr request handler to use; sent to Solr as the qt parameter.

resource_url[RW]

base url used for locations of resources

sort[RW]

Solr sort option

Public Class Methods

new() { |self| ... } click to toggle source
# File lib/blacklight-sitemap.rb, line 47
# Sets every option to its default, hands the new instance to the optional
# block so the caller can override settings, and then registers the rake
# tasks by calling #define.
def initialize
  # Where the sitemaps are published and how the files are named.
  @public_url    = 'http://localhost:3000'
  @resource_url  = 'http://localhost:3000/catalog'
  @base_filename = 'blacklight'
  @gzip          = false

  # Optional per-entry sitemap metadata.
  @changefreq     = nil
  @priority_field = nil
  @lastmod_field  = 'timestamp'

  # Solr query settings.
  @qt   = 'standard'
  @sort = '_docid_ asc' # http://osdir.com/ml/solr-user.lucene.apache.org/2010-03/msg01371.html
  @max  = 50_000 # default cap on the number of locs per sitemap file

  yield(self) if block_given?
  define
end

Public Instance Methods

define() click to toggle source
# File lib/blacklight-sitemap.rb, line 63
# Registers the rake tasks that build and remove the sitemap files:
#
# * blacklight:sitemap         - runs sitemap:clobber, then sitemap:create
# * blacklight:sitemap:create  - queries Solr in batches of @max documents and
#   writes one "<base_filename>-sitemapN.xml" file per batch plus an index
#   file "<base_filename>-sitemap.xml" into Rails.root/public
# * blacklight:sitemap:clobber - deletes every file in Rails.root/public whose
#   name starts with "<base_filename>-sitemap"
#
# Problems that should not abort the run (oversized files, a failed gzip) are
# collected as warnings and printed after the timing summary.
def define
  namespace :blacklight do
    desc 'clobber then create sitemap files for blacklight'
    task :sitemap => ['sitemap:clobber', 'sitemap:create']

    namespace :sitemap do

      desc 'create a sitemap for blacklight'
      task :create => :environment do
        start_time = Time.now

        # collect warnings here rather than raise an error
        warnings = []

        # size above which a written file is reported as exceeding the 10MB limit
        max_sitemap_bytes = 10_485_760

        blacklight_config = CatalogController.blacklight_config

        puts 'Creating a sitemap...'
        # only request the fields that end up in the sitemap
        fl = ['id', @lastmod_field, @priority_field].compact.join(',')
        base_solr_parameters = {:qt => @qt, :fq => 'id:[* TO *]', :fl => fl}

        # a one-row query is enough to read the total number of matches
        response = Blacklight.solr.get(blacklight_config.solr_path, :params => base_solr_parameters.merge(:rows => 1))
        number_of_resources = response['response']['numFound']
        puts 'Number of resources: ' + number_of_resources.to_s
        batches = (number_of_resources / @max.to_f).ceil
        puts 'Total sitemap to create: ' + batches.to_s
        master_sitemap = ''
        base_solr_parameters.merge!(:sort => @sort) if @sort

        # create a hash of batches with lastmod dates so that the most recent
        # lastmod date shows up associated with that batch. This will feed
        # into the lastmod for each sitemap in the index sitemap.
        batch_lastmods = {}

        batches.times do |batch_number|
          current_page = batch_number + 1
          start = batch_number * @max
          puts 'Processing batch # ' + current_page.to_s
          response = Blacklight.solr.get(blacklight_config.solr_path, :params => base_solr_parameters.merge(:rows => @max, :start => start))['response']
          sitemap_builder = Nokogiri::XML::Builder.new do |xml|
            xml.urlset "xmlns" => "http://www.sitemaps.org/schemas/sitemap/0.9" do
              response['docs'].each do |doc|
                xml.url do
                  # FIXME through config
                  # to_s guards against ids that are not returned as strings,
                  # which File.join would reject
                  xml.loc File.join(@resource_url.to_s, doc['id'].to_s)

                  lastmod = @lastmod_field && doc[@lastmod_field]
                  if lastmod
                    xml.lastmod lastmod.to_s
                    # remember the most recent lastmod seen in this batch
                    newest = batch_lastmods[batch_number]
                    batch_lastmods[batch_number] = lastmod if newest.blank? || newest < lastmod
                  end

                  priority = @priority_field && doc[@priority_field]
                  xml.priority priority if priority
                  xml.changefreq @changefreq if @changefreq
                end
              end
            end
          end
          sitemap_filename = File.join(Rails.root, 'public', @base_filename + '-sitemap' + batch_number.to_s + '.xml')
          File.open(sitemap_filename, 'w') do |fh|
            fh.puts sitemap_builder.to_xml
          end
          # the size check runs on the uncompressed file, before any gzip
          if File.size(sitemap_filename) > max_sitemap_bytes
            warnings << 'WARNING Sitemap is over 10MB limit: ' + sitemap_filename
          end
          if @gzip
            # Pass the path as its own argument so that no shell interprets it;
            # paths containing spaces or shell metacharacters stay intact.
            # system returns false/nil when gzip fails or cannot be run.
            unless system('gzip', sitemap_filename)
              warnings << 'WARNING gzip failed for: ' + sitemap_filename
            end
          end
        end

        puts 'Creating sitemap index...'
        # fallback lastmod for batches in which no document supplied one
        rake_run_lastmod = DateTime.now.utc.strftime("%Y-%m-%dT%H:%M:%S+00:00")
        sitemap_index_builder = Nokogiri::XML::Builder.new do |xml|
          xml.sitemapindex 'xmlns' => 'http://www.sitemaps.org/schemas/sitemap/0.9' do
            batches.times do |batch|
              # public location of the batch file written above
              sitemap_filename = File.join(@public_url.to_s, @base_filename + '-sitemap' + batch.to_s + '.xml')
              sitemap_filename << '.gz' if @gzip
              xml.sitemap{
                xml.loc sitemap_filename
                xml.lastmod((batch_lastmods[batch] || rake_run_lastmod).to_s)
              }
            end
          end
        end #sitemap_index_builder
        index_sitemap_filename = File.join(Rails.root, 'public', @base_filename + '-sitemap.xml')
        File.open(index_sitemap_filename, 'w') do |fh|
          fh.puts sitemap_index_builder.to_xml
        end
        if File.size(index_sitemap_filename) > max_sitemap_bytes
          warnings << 'WARNING Index sitemap is over 10MB limit: ' + index_sitemap_filename
        end
        puts 'Done.'
        end_time = Time.now
        puts 'Create start time: ' + start_time.to_s
        puts 'Create end time:   ' + end_time.to_s
        puts 'Execution time in seconds: ' + (end_time - start_time).to_s
        puts warnings.join("\n")
      end # task :create

      desc 'clobber sitemap files'
      # NOTE(review): this task has no :environment prerequisite but reads
      # Rails.root, so it presumably relies on Rails already being loaded by
      # the Rakefile - confirm.
      task :clobber do
        puts "Deleting all sitemap files..."
        # the glob matches the index file, every batch file and any .gz variants
        Dir.glob(File.join(Rails.root, 'public', @base_filename + '-sitemap*')).each do |sitemap|
          FileUtils.rm(sitemap)
        end
      end

    end # namespace :sitemap
  end # namespace :blacklight
end