class Maltese::Sitemap

Constants

SLACK_ICON_URL

URL of the icon shown next to Slack messages.

Attributes

access_key[R]
logger[R]
rack_env[R]
region[R]
secret_key[R]
sitemap_bucket[R]
slack_webhook_url[R]

Public Class Methods

new(attributes={}) click to toggle source
# File lib/maltese/sitemap.rb, line 30
# Sets up the generator's configuration and logger.
#
# Each setting is taken from +attributes+ when a non-blank value is given,
# otherwise from the matching environment variable; the bucket and the
# rack environment additionally fall back to hard-coded defaults.
#
# @param attributes [Hash] optional overrides (:sitemap_bucket, :rack_env,
#   :access_key, :secret_key, :region, :slack_webhook_url)
def initialize(attributes={})
  # Non-blank value supplied by the caller for +key+, or nil.
  given = ->(key) { attributes[key].presence }

  @sitemap_bucket    = given[:sitemap_bucket] || "search.test.datacite.org"
  @rack_env          = given[:rack_env] || ENV['RACK_ENV'] || "stage"
  @access_key        = given[:access_key] || ENV['AWS_ACCESS_KEY_ID']
  @secret_key        = given[:secret_key] || ENV['AWS_SECRET_ACCESS_KEY']
  @region            = given[:region] || ENV['AWS_REGION']
  @slack_webhook_url = given[:slack_webhook_url] || ENV['SLACK_WEBHOOK_URL']

  @logger = LogStashLogger.new(type: :stdout)
end

Public Instance Methods

get_data(url) click to toggle source
# File lib/maltese/sitemap.rb, line 164
# Fetches +url+ via Maremma with a timeout of 300 (presumably seconds).
#
# @param url [String] address to request
# @return whatever Maremma.get returns for the request
def get_data(url)
  request_options = { timeout: 300 }
  Maremma.get(url, **request_options)
end
get_query_url(options={}) click to toggle source
# File lib/maltese/sitemap.rb, line 103
# Builds the URL for the first page of a scrolling DOI query.
#
# @param options [Hash] optional settings; only +:size+ is read
# @option options [Integer] :size page size; defaults to job_batch_size
# @return [String] search_path followed by the URL-encoded query parameters
def get_query_url(options={})
  # Read the page size into a local instead of writing it back into
  # +options+, so the caller's hash (possibly shared or frozen) is untouched.
  page_size = options[:size] || job_batch_size

  params = {
    "fields[dois]" => "doi,updated",
    "exclude-registration-agencies" => "true",
    "page[scroll]" => "7m",
    "page[size]" => page_size
  }
  search_path + URI.encode_www_form(params)
end
get_total(options={}) click to toggle source
# File lib/maltese/sitemap.rb, line 96
# Asks the search API how many records match, requesting a single-item page.
#
# @param options [Hash] forwarded to get_query_url (with size forced to 1)
#   and, unchanged, to Maremma.get
# @return the value found under "meta" -> "total" in the response body,
#   or nil when that path is absent
def get_total(options={})
  single_hit_options = options.merge(size: 1)
  response = Maremma.get(get_query_url(single_hit_options), options)
  response.body.dig("meta", "total")
end
job_batch_size() click to toggle source
# File lib/maltese/sitemap.rb, line 61
# Default page size used by get_query_url when no :size option is given.
#
# @return [Integer] 1000
def job_batch_size
  1_000
end
parse_data(result) click to toggle source
# File lib/maltese/sitemap.rb, line 168
# Adds one sitemap entry per record found under "data" in the response body.
#
# @param result [#body] response whose body is a Hash
# @return the link count of the sitemap currently being filled
def parse_data(result)
  records = Array.wrap(result.body.fetch("data", nil))

  records.each do |record|
    doi = record.dig("attributes", "doi")
    location = "/doi.org/" + doi
    last_modified = record.dig("attributes", "updated")

    sitemap.add(location, changefreq: "weekly", lastmod: last_modified)
  end

  sitemap.sitemap.link_count
end
process_data(options = {}) click to toggle source
# File lib/maltese/sitemap.rb, line 115
# Walks the paginated search results starting at options[:url], adds every
# page to the sitemap via parse_data, and finally hands over to push_data.
#
# Note that +options+ is modified in place: :start_time is set on entry and
# :url is advanced to the next page (or set to nil to stop the loop).
#
# @param options [Hash] must carry :url for any page to be fetched
# @return the return value of push_data
def process_data(options = {})
  options[:start_time] = Time.now
  link_count = 0

  # walk through paginated results
  while options[:url] do
    begin
      response = nil

      # speed up tests
      base_interval = rack_env == "test" ? 0.1 : 10

      # retry on transient errors (status codes 408, 500 and 502): raising
      # inside the block is what signals Retriable to try again
      Retriable.retriable(base_interval: base_interval, multiplier: 2) do
        response = get_data(options[:url])

        # InternalServerError and BadGatewayError are not defined in this
        # view of the file; presumably project error classes
        raise Timeout::Error, "A timeout error occured for URL #{options[:url]}." if response.status == 408
        raise InternalServerError, "An internal server error occured for URL #{options[:url]}." if response.status == 500
        raise BadGatewayError, "A bad gateway error occured for URL #{options[:url]}." if response.status == 502
      end

      if response.status == 200
        link_count = parse_data(response)
        logger.info "#{(link_count + sitemap.sitemap_index.total_link_count).to_s(:delimited)} DOIs parsed."
        # follow the "next" link; a missing link yields nil and ends the loop
        options[:url] = response.body.dig("links", "next")
      else
        # any other status: log it and stop paging
        logger.error "An error occured for URL #{options[:url]}."
        logger.error "Error: #{response.body.fetch("errors").inspect}" if response.body.fetch("errors", nil).present?
        options[:url] = nil
      end
    rescue => exception
      # an error escaped the block above: log it, report the progress made
      # so far to Slack (except in test), and stop paging
      logger.error "Error: #{exception.message}"
      fields = [
        { title: "Error", value: exception.message },
        { title: "Number of DOIs", value: sitemap.sitemap_index.total_link_count.to_s(:delimited), short: true },
        { title: "Number of Sitemaps", value: sitemap.sitemap_index.link_count.to_s(:delimited), short: true },
        { title: "Time Taken", value: "#{((Time.now - options[:start_time])/ 60.0).ceil} min", short: true }
      ]
      send_notification_to_slack(nil, title: slack_title + ": Sitemaps Not Updated", level: "danger", fields: fields) unless rack_env == "test"
      options[:url] = nil
    ensure
      # don't loop when testing
      break if rack_env == "test"
    end  
  end

  # push_data runs regardless of whether the loop ended normally or on error
  push_data(options)
end
push_data(options={}) click to toggle source
# File lib/maltese/sitemap.rb, line 176
# Finalizes the sitemap, records summary statistics and, outside the test
# environment, announces the result on Slack.
#
# @param options [Hash] :start_time is read, and set to the current time
#   when missing
# @return the link count of the current sitemap
def push_data(options={})
  links = sitemap
  links.finalize!
  options[:start_time] ||= Time.now
  links.sitemap_index.stats_summary(time_taken: Time.now - options[:start_time])

  url_field = { title: "URL", value: links.sitemap_index_url }
  doi_field = { title: "Number of DOIs", value: links.sitemap_index.total_link_count.to_s(:delimited), short: true }
  sitemaps_field = { title: "Number of Sitemaps", value: links.sitemap_index.link_count.to_s(:delimited), short: true }
  elapsed_minutes = ((Time.now - options[:start_time]) / 60.0).ceil
  time_field = { title: "Time Taken", value: "#{elapsed_minutes} min", short: true }
  fields = [url_field, doi_field, sitemaps_field, time_field]

  if rack_env != "test"
    send_notification_to_slack(nil, title: "#{slack_title}: Sitemaps Updated", level: "good", fields: fields)
  end

  links.sitemap.link_count
end
queue_jobs(options={}) click to toggle source
# File lib/maltese/sitemap.rb, line 81
# Looks up the number of matching works and, when there are any, starts
# processing them from the first query page.
#
# @param options [Hash] forwarded to get_total and (merged with :total and
#   :url) to process_data
# @return [Integer] number of works queued; 0 when the total is unknown
def queue_jobs(options={})
  works_found = get_total(options)

  # no total available: report it and bail out with zero works queued
  if works_found.nil?
    logger.error "An error occured."
    return 0
  end

  if works_found > 0
    job_options = options.merge(total: works_found, url: get_query_url)
    process_data(job_options)
  else
    logger.info "No works found."
  end

  works_found.to_i
end
s3_adapter() click to toggle source
# File lib/maltese/sitemap.rb, line 74
# Creates the sitemap_generator AWS SDK adapter for the configured bucket,
# using the configured access key, secret key and region.
#
# @return [SitemapGenerator::AwsSdkAdapter]
def s3_adapter
  credentials = {
    aws_access_key_id: access_key,
    aws_secret_access_key: secret_key,
    aws_region: region
  }
  SitemapGenerator::AwsSdkAdapter.new(sitemap_bucket, **credentials)
end
search_path() click to toggle source
# File lib/maltese/sitemap.rb, line 53
# Base URL (ending in "?") of the DOI search endpoint: the production API
# when rack_env is "production", the stage API for any other value.
#
# @return [String]
def search_path
  return "https://api.datacite.org/dois?" if rack_env == "production"

  "https://api.stage.datacite.org/dois?"
end
send_notification_to_slack(text, options={}) click to toggle source
# File lib/maltese/sitemap.rb, line 191
# Posts a single-attachment message to the configured Slack webhook.
# Does nothing (returns nil) when no webhook URL is configured.
#
# @param text [String, nil] attachment text; left out of the attachment when nil
# @param options [Hash] :title (default "Fabrica Message"), :level used as
#   the attachment color (default "good"), and :fields
# @return the body of the first response from the notifier; when sending
#   raises, the error message is logged and the logger call's result is returned
def send_notification_to_slack(text, options={})
  webhook = slack_webhook_url
  return unless webhook.present?

  title = options[:title] || "Fabrica Message"
  color = options[:level] || "good"
  # compact drops the keys whose value is nil (e.g. text or fields)
  attachment = { title: title, text: text, color: color, fields: options[:fields] }.compact

  begin
    slack = Slack::Notifier.new(webhook, username: "Fabrica", icon_url: SLACK_ICON_URL)
    slack.ping(attachments: [attachment]).first.body
  rescue => error
    logger.error error.message
  end
end
sitemap() click to toggle source
# File lib/maltese/sitemap.rb, line 65
# Memoized SitemapGenerator::LinkSet for this run. It is created on first
# use with the environment's host, the sitemaps path and the S3 adapter,
# and with automatic finalization switched off.
#
# @return [SitemapGenerator::LinkSet]
def sitemap
  @sitemap ||= begin
    link_set_options = {
      default_host: sitemap_url,
      sitemaps_host: sitemap_url,
      sitemaps_path: sitemaps_path,
      adapter: s3_adapter,
      finalize: false
    }
    SitemapGenerator::LinkSet.new(**link_set_options)
  end
end
sitemap_url() click to toggle source
# File lib/maltese/sitemap.rb, line 41
# Host the sitemap links point at: the production Commons site when
# rack_env is "production", the stage site for any other value.
#
# @return [String]
def sitemap_url
  production = rack_env == "production"
  production ? "https://commons.datacite.org/" : "https://commons.stage.datacite.org/"
end
sitemaps_path() click to toggle source
# File lib/maltese/sitemap.rb, line 49
# Path passed to the link set as +sitemaps_path+.
#
# @return [String] "sitemaps/"
def sitemaps_path
  'sitemaps/'
end
slack_title() click to toggle source
# File lib/maltese/sitemap.rb, line 45
# Prefix used for the titles of Slack notifications; carries a "Stage"
# suffix whenever rack_env is anything other than "production".
#
# @return [String]
def slack_title
  if rack_env == "production"
    "DataCite Commons"
  else
    "DataCite Commons Stage"
  end
end
timeout() click to toggle source
# File lib/maltese/sitemap.rb, line 57
# Fixed timeout value (presumably seconds; the unit is not stated here).
# NOTE(review): get_data passes its own hard-coded timeout and does not
# call this method.
#
# @return [Integer] 60
def timeout
  60
end