class EverTools::ElasticsearchS3Backup

Constants

VERSION

Attributes

backup_repo[R]
conf[R]
snapshot_label[R]

Public Class Methods

new() click to toggle source

rubocop:enable Metrics/AbcSize, Lint/RescueException

# File lib/elasticsearch_s3_backup.rb, line 74
# Loads the settings from /etc/s3_backup.yml, configures Raven when a Sentry
# DSN is set, and derives this run's test-index, repository and snapshot names
# from the current UTC time.
def initialize
  settings = YAML.load_file('/etc/s3_backup.yml')
  @conf = OpenStruct.new(settings)

  if sentry_dsn
    Raven.configure do |raven|
      raven.dsn    = sentry_dsn
      raven.logger = logger
    end
  end

  started_at = Time.new.utc
  epoch      = started_at.to_i

  # Both test indexes share the same epoch suffix.
  @backup_test_index  = "backup_test_#{epoch}"
  @restore_test_index = "restore_test_#{epoch}"
  # One repository per month, one snapshot label per minute.
  @backup_repo        = started_at.strftime('%m-%Y')
  @snapshot_label     = started_at.strftime('%m-%d_%H%M')
end

Public Instance Methods

run() click to toggle source

rubocop:disable Metrics/AbcSize, Lint/RescueException

# File lib/elasticsearch_s3_backup.rb, line 28
# Runs one full backup cycle: exits unless this node is the master, waits for
# the cluster state check, writes test data, snapshots, restores and verifies
# the test index, removes expired repositories, and resolves any open
# Pagerduty incident.
#
# Interrupts and signals are turned into exit codes (99 and 2); any other
# exception is passed to #notify, logged at fatal level and re-raised.
def run
  unless master?
    logger.info 'This node is not the currently elected master. Exiting.'
    exit
  end

  wait_for_cluster_state!

  cleanup_test_indexes
  insert_test_data

  # Create a new repo if none exists (typically at beginning of month)
  known_repos = es_api.snapshot.get_repository
  create_repo unless known_repos[backup_repo]
  create_snapshot

  restore_test_index
  # Compare each doc in the original backup_test index to the restored index
  logger.info "Verifying the newly-restored #{@backup_test_index}…"
  test_size.times do |doc_id|
    compare_index_item!(doc_id)
  end
  logger.info 'Successfully verified the test data!'
  delete_test_indexes

  remove_expired_backups

  # Resolve any open Pagerduty incidents from previous backup failures
  open_incident = pagerduty.get_incident("#{cluster_name} elasticsearch backup")
  open_incident.resolve if open_incident

  logger.info 'Finished'
rescue Interrupt => interrupt
  puts "Received #{interrupt.class}"
  exit 99
rescue SignalException => signal
  logger.info "Received: #{signal.signm} (#{signal.signo})"
  exit 2
rescue SystemExit => exit_request
  # Propagate the status of an `exit` issued above.
  exit exit_request.status
rescue Exception => error # Need to rescue "Exception" so that Sentry gets it
  notify error
  logger.fatal error.message
  logger.fatal error.backtrace.join("\n")
  raise error
end

Private Instance Methods

cleanup_test_indexes() click to toggle source
# File lib/elasticsearch_s3_backup.rb, line 237
# Deletes leftover backup test indexes.
#
# Asks Elasticsearch for everything matching 'backup_test_*' and deletes each
# returned index whose name really starts with 'backup_test_'. The prefix
# check guards against names that merely contain the text (the previous
# unanchored regex accepted e.g. 'prod_backup_test_9').
def cleanup_test_indexes
  logger.info 'Removing remnant test indexes...'
  # Gather backup test indices
  es_api.indices.get(index: 'backup_test_*').each do |test_index, _value|
    # Check again that they are backup test indices
    next unless test_index.start_with?('backup_test_')

    logger.info "Removing test index: #{test_index}"
    es_api.indices.delete index: test_index
  end
end
compare_index_item!(i) click to toggle source
# File lib/elasticsearch_s3_backup.rb, line 220
# Compares document +i+ of the backup test index with the same document in
# the restored index.
#
# Returns the result of the equality check when the two values match and
# raises a RuntimeError describing both values otherwise.
def compare_index_item!(i)
  original_doc = index_item(@backup_test_index, i)
  restored_doc = index_item(@restore_test_index, i)

  same = original_doc == restored_doc
  return same if same

  raise "Item #{i} in test restore doesn’t match.\n" \
        "Original: #{original_doc}\n" \
        "Restored: #{restored_doc}"
end
create_repo() click to toggle source
# File lib/elasticsearch_s3_backup.rb, line 158
# Registers the current backup repository as an 's3' snapshot repository.
#
# The repository settings are new_repo_params plus a base path built from the
# cluster name, the configured env and the repository name, with server-side
# encryption switched on.
def create_repo
  logger.info 'Creating a new monthly ES backup repo…'

  snapshot_api = es_api.snapshot
  repo_name    = backup_repo
  repo_settings = new_repo_params.merge(
    base_path: "/elasticsearch/#{cluster_name}/#{conf['env']}/#{backup_repo}",
    server_side_encryption: true
  )

  snapshot_api.create_repository(
    repository: repo_name,
    body: { type: 's3', settings: repo_settings }
  )
end
create_snapshot() click to toggle source
# File lib/elasticsearch_s3_backup.rb, line 191
# Creates the snapshot for this run and waits for it to complete.
#
# Raises a RuntimeError when the response lists any failures, or when it
# reports a state other than 'SUCCESS'. The state check catches a snapshot
# reported as failed with an empty failures list, which would otherwise be
# logged as complete. A response without a 'state' or 'failures' entry is
# judged on whichever of the two is present.
def create_snapshot
  # Make a backup (full on new month, incremental otherwise)
  logger.info "Starting a new backup (#{backup_repo}/#{snapshot_label})…"
  r = es_api.snapshot.create repository: backup_repo,
                             snapshot: snapshot_label,
                             wait_for_completion: true
  snapshot = r['snapshot']
  state    = snapshot['state']

  # Array() turns a missing failures entry into an empty list.
  has_failures = Array(snapshot['failures']).any?
  bad_state    = state && state != 'SUCCESS'
  fail "Snapshot failed! #{r.inspect}" if has_failures || bad_state

  logger.info 'Snapshot complete. Time: ' \
              "#{snapshot['duration_in_millis'].to_i / 1000} seconds " \
              "Results: #{snapshot['shards'].inspect}"
end
dated_repos() click to toggle source
# File lib/elasticsearch_s3_backup.rb, line 178
# Returns the names of the registered snapshot repositories that pass
# valid_date?.
def dated_repos
  repo_names = es_api.snapshot.get_repository.keys
  repo_names.select { |name| valid_date?(name) }
end
delete_test_indexes() click to toggle source
# File lib/elasticsearch_s3_backup.rb, line 230
# Deletes this run's restore test index and then its backup test index,
# logging each removal.
def delete_test_indexes
  doomed = [@restore_test_index, @backup_test_index]
  doomed.each do |name|
    logger.info "Removing test index: #{name}"
    es_api.indices.delete(index: name)
  end
end
es_api() click to toggle source
# File lib/elasticsearch_s3_backup.rb, line 114
# Returns the memoized Elasticsearch client.
#
# The client talks to port 9200 on the configured 'es_host' ('localhost' when
# unset) and uses es_transport_timeout as its request timeout, falling back
# to 2400.
def es_api
  return @es_api if @es_api

  host = @conf['es_host'] || 'localhost'
  @es_api = Elasticsearch::Client.new(
    host: "http://#{host}:9200",
    transport_options: { request: { timeout: (es_transport_timeout || 2400) } }
  )
end
index_item(index, id) click to toggle source
# File lib/elasticsearch_s3_backup.rb, line 216
# Fetches the 'dummy' document +id+ from +index+ and returns the 'test_value'
# field of its '_source'.
def index_item(index, id)
  document = es_api.get(index: index, type: 'dummy', id: id)
  document['_source']['test_value']
end
insert_test_data() click to toggle source
# File lib/elasticsearch_s3_backup.rb, line 146
# Creates test_size 'dummy' documents in the backup test index, with ids
# counting up from 0 and a pseudo-random string as each 'test_value'.
def insert_test_data
  logger.info 'Generating test data using math…'
  test_size.times do |doc_id|
    es_api.create(index: @backup_test_index,
                  type: 'dummy',
                  id: doc_id,
                  body: { test_value: pseudo_random_string })
  end
end
logger() click to toggle source
# File lib/elasticsearch_s3_backup.rb, line 93
# Returns the memoized Logger, which writes to the configured 'log' target
# and uses 's3_backup' as its program name.
def logger
  return @logger if @logger

  log = Logger.new(conf['log'])
  log.progname = 's3_backup'
  @logger = log
end
master?() click to toggle source
# File lib/elasticsearch_s3_backup.rb, line 126
# Reports whether the node named node_name is the cluster's master: looks up
# the 'master_node' id from the cluster state in the nodes info and compares
# that node's 'name'.
def master?
  nodes     = es_api.nodes.info['nodes']
  master_id = es_api.cluster.state['master_node']
  nodes[master_id]['name'] == node_name
end
notify(e) click to toggle source
# File lib/elasticsearch_s3_backup.rb, line 101
# Reports exception +e+: triggers a Pagerduty incident when the configured
# env is 'prod', and sends the exception to Raven when a Sentry DSN is set.
def notify(e)
  if conf['env'] == 'prod'
    pager = pagerduty
    pager.trigger(
      "#{cluster_name} Elasticsearch backup failed",
      incident_key: "#{cluster_name} elasticsearch backup",
      client: node_name,
      details: "#{e.message}\n\n#{e.backtrace}"
    )
  end

  return unless sentry_dsn

  Raven.capture_exception(e)
end
pagerduty() click to toggle source
# File lib/elasticsearch_s3_backup.rb, line 97
# Returns the memoized Pagerduty client built from pagerduty_api_key.
def pagerduty
  return @pagerduty if @pagerduty

  @pagerduty = Pagerduty.new(pagerduty_api_key)
end
pseudo_random_string() click to toggle source
# File lib/elasticsearch_s3_backup.rb, line 142
# Returns the letter 'a' followed by the decimal digits of a random integer
# below 10**100.
def pseudo_random_string
  "a#{rand(10**100)}"
end
remove_expired_backups() click to toggle source
# File lib/elasticsearch_s3_backup.rb, line 182
# Deletes every dated repository whose name parses to a time earlier than
# three months ago.
#
# The cutoff is computed once, so the threshold written to the log is the
# same instant that every repository is compared against.
#
# NOTE(review): repository names carry only month and year, so a repository
# for the cutoff month itself usually compares as older and is removed too —
# confirm that this is the intended retention.
def remove_expired_backups
  # Remove 3 month old repos
  cutoff = 3.months.ago
  logger.info "Removing backups older than #{cutoff.strftime '%m-%Y'}"

  expired = dated_repos.select { |repo| Time.strptime(repo, '%m-%Y') < cutoff }
  expired.each do |repo|
    logger.info "Removing #{repo}"
    es_api.snapshot.delete_repository repository: repo
  end
end
restore_test_index() click to toggle source
# File lib/elasticsearch_s3_backup.rb, line 203
# Restores only the backup test index from this run's snapshot, renaming it
# to the restore test index, and waits for the restore to complete.
def restore_test_index
  # Restore just the backup_test index to a new index
  logger.info "Restoring the #{@backup_test_index} index to #{@restore_test_index}…"

  snapshot_api = es_api.snapshot
  snapshot_api.restore(
    repository: backup_repo,
    snapshot: snapshot_label,
    wait_for_completion: true,
    body: {
      indices: @backup_test_index,
      rename_pattern: @backup_test_index,
      rename_replacement: @restore_test_index
    }
  )
end
valid_date?(date) click to toggle source
# File lib/elasticsearch_s3_backup.rb, line 172
# Tries to parse +date+ with the '%m-%Y' format.
#
# Returns the parsed Time (truthy) on success and false when parsing raises
# any StandardError.
def valid_date?(date)
  Time.strptime(date, '%m-%Y')
rescue StandardError
  false
end
wait_for_cluster_state!() click to toggle source
# File lib/elasticsearch_s3_backup.rb, line 130
# Blocks until the transient cluster.routing.allocation.enable setting is
# either unset or 'all'.
#
# The settings are re-read every 10 seconds. After five waits, the next
# failed check raises a RuntimeError instead of waiting again.
def wait_for_cluster_state!
  attempts = 0

  # Fetches fresh cluster settings on every call.
  allocation_enabled = lambda do
    settings = Hashie::Mash.new(es_api.cluster.get_settings)
    [nil, 'all'].include?(settings.transient_.cluster_.routing_.allocation_.enable)
  end

  until allocation_enabled.call
    if attempts >= 5
      fail 'Shard reallocation is disabled. Snapshot cannot proceed because creating the test ' \
           'index in this state will put the cluster into RED state.'
    end
    logger.warn 'Waiting for shard reallocation to be re-enabled'
    sleep 10
    attempts += 1
  end
end