class EverTools::ElasticsearchS3Backup
Constants
- VERSION
Attributes
backup_repo[R]
conf[R]
snapshot_label[R]
Public Class Methods
new()
click to toggle source
rubocop:enable Metrics/AbcSize, Lint/RescueException
# File lib/elasticsearch_s3_backup.rb, line 74 def initialize @conf = OpenStruct.new(YAML.load_file('/etc/s3_backup.yml')) if sentry_dsn Raven.configure do |config| config.dsn = sentry_dsn config.logger = logger end end now = Time.new.utc @backup_test_index = "backup_test_#{now.to_i}" @restore_test_index = "restore_test_#{now.to_i}" @backup_repo = now.strftime '%m-%Y' @snapshot_label = now.strftime '%m-%d_%H%M' end
Public Instance Methods
run()
click to toggle source
rubocop:disable Metrics/AbcSize, Lint/RescueException
# File lib/elasticsearch_s3_backup.rb, line 28 def run unless master? logger.info 'This node is not the currently elected master. Exiting.' exit end wait_for_cluster_state! cleanup_test_indexes insert_test_data # Create a new repo if none exists (typically at beginning of month) create_repo unless es_api.snapshot.get_repository[backup_repo] create_snapshot restore_test_index # Compare each doc in the original backup_test index to the restored index logger.info "Verifying the newly-restored #{@backup_test_index}…" test_size.times { |i| compare_index_item! i } logger.info 'Successfully verified the test data!' delete_test_indexes remove_expired_backups # Resolve any open Pagerduty incidents from previous backup failures if (pd_incident = pagerduty.get_incident("#{cluster_name} elasticsearch backup")) pd_incident.resolve end logger.info 'Finished' rescue Interrupt => e puts "Received #{e.class}" exit 99 rescue SignalException => e logger.info "Received: #{e.signm} (#{e.signo})" exit 2 rescue SystemExit => e exit e.status rescue Exception => e # Need to rescue "Exception" so that Sentry gets it notify e logger.fatal e.message logger.fatal e.backtrace.join("\n") raise e end
Private Instance Methods
cleanup_test_indexes()
click to toggle source
# File lib/elasticsearch_s3_backup.rb, line 237
# Deletes any backup_test_* indexes left behind by earlier runs.
def cleanup_test_indexes
  logger.info 'Removing remnant test indexes...'
  # Gather backup test indices; the guard re-checks each name returned
  # by the wildcard lookup before anything is deleted.
  es_api.indices.get(index: 'backup_test_*').each do |index_name, _|
    next unless index_name =~ /backup_test_(.*)/
    logger.info "Removing test index: #{index_name}"
    es_api.indices.delete index: index_name
  end
end
compare_index_item!(i)
click to toggle source
# File lib/elasticsearch_s3_backup.rb, line 220
# Raises when doc +i+ differs between the original and restored test
# indexes; truthy when they match.
def compare_index_item!(i)
  original = index_item(@backup_test_index, i)
  restored = index_item(@restore_test_index, i)
  return true if original == restored
  fail "Item #{i} in test restore doesn’t match.\n" \
       "Original: #{original}\n" \
       "Restored: #{restored}"
end
create_repo()
click to toggle source
# File lib/elasticsearch_s3_backup.rb, line 158
# Registers this month's S3 snapshot repository, namespaced by cluster
# and environment, with server-side encryption enabled.
def create_repo
  logger.info 'Creating a new monthly ES backup repo…'
  base_path = "/elasticsearch/#{cluster_name}/#{conf['env']}/#{backup_repo}"
  repo_settings = new_repo_params.merge(
    base_path: base_path,
    server_side_encryption: true
  )
  es_api.snapshot.create_repository(
    repository: backup_repo,
    body: { type: 's3', settings: repo_settings }
  )
end
create_snapshot()
click to toggle source
# File lib/elasticsearch_s3_backup.rb, line 191
# Make a backup (full on new month, incremental otherwise) and block
# until Elasticsearch reports completion; raises on shard failures.
def create_snapshot
  logger.info "Starting a new backup (#{backup_repo}/#{snapshot_label})…"
  result = es_api.snapshot.create(repository: backup_repo,
                                  snapshot: snapshot_label,
                                  wait_for_completion: true)
  snapshot_report = result['snapshot']
  fail "Snapshot failed! #{result.inspect}" if snapshot_report['failures'].any?
  logger.info 'Snapshot complete. Time: ' \
              "#{snapshot_report['duration_in_millis'].to_i / 1000} seconds " \
              "Results: #{snapshot_report['shards'].inspect}"
end
dated_repos()
click to toggle source
# File lib/elasticsearch_s3_backup.rb, line 178
# Names of the snapshot repositories that follow the MM-YYYY scheme.
def dated_repos
  all_repos = es_api.snapshot.get_repository.keys
  all_repos.select { |repo_name| valid_date? repo_name }
end
delete_test_indexes()
click to toggle source
# File lib/elasticsearch_s3_backup.rb, line 230
# Drops this run's scratch indexes (restored copy first, then original).
def delete_test_indexes
  scratch_indexes = [@restore_test_index, @backup_test_index]
  scratch_indexes.each do |name|
    logger.info "Removing test index: #{name}"
    es_api.indices.delete index: name
  end
end
es_api()
click to toggle source
# File lib/elasticsearch_s3_backup.rb, line 114
# Memoized Elasticsearch client for the configured host (defaults to
# localhost) with a long request timeout to cover slow snapshots.
def es_api
  return @es_api if @es_api
  es_host = @conf['es_host'] || 'localhost'
  request_timeout = es_transport_timeout || 2400
  @es_api = Elasticsearch::Client.new(
    host: "http://#{es_host}:9200",
    transport_options: { request: { timeout: request_timeout } }
  )
end
index_item(index, id)
click to toggle source
# File lib/elasticsearch_s3_backup.rb, line 216
# Fetches the test_value field of doc +id+ from +index+.
def index_item(index, id)
  doc = es_api.get(index: index, type: 'dummy', id: id)
  doc['_source']['test_value']
end
insert_test_data()
click to toggle source
# File lib/elasticsearch_s3_backup.rb, line 146
# Seeds the backup test index with test_size docs of random content.
def insert_test_data
  logger.info 'Generating test data using math…'
  test_size.times do |doc_id|
    payload = { test_value: pseudo_random_string }
    es_api.create(index: @backup_test_index, type: 'dummy', id: doc_id, body: payload)
  end
end
logger()
click to toggle source
# File lib/elasticsearch_s3_backup.rb, line 93
# Memoized Logger writing to the configured log destination, tagged
# with the s3_backup program name.
def logger
  return @logger if @logger
  @logger = Logger.new(conf['log'])
  @logger.progname = 's3_backup'
  @logger
end
master?()
click to toggle source
# File lib/elasticsearch_s3_backup.rb, line 126
# True when this node currently holds the elected-master role.
def master?
  master_id = es_api.cluster.state['master_node']
  master_name = es_api.nodes.info['nodes'][master_id]['name']
  master_name == node_name
end
notify(e)
click to toggle source
# File lib/elasticsearch_s3_backup.rb, line 101
# Fans a failure out to the alerting integrations: PagerDuty in prod,
# Sentry whenever a DSN is configured.
def notify(e)
  if conf['env'] == 'prod'
    failure_details = "#{e.message}\n\n#{e.backtrace}"
    pagerduty.trigger(
      "#{cluster_name} Elasticsearch backup failed",
      incident_key: "#{cluster_name} elasticsearch backup",
      client: node_name,
      details: failure_details
    )
  end
  Raven.capture_exception(e) if sentry_dsn
end
pagerduty()
click to toggle source
# File lib/elasticsearch_s3_backup.rb, line 97
# Memoized PagerDuty client built from the configured API key.
def pagerduty
  return @pagerduty if @pagerduty
  @pagerduty = Pagerduty.new(pagerduty_api_key)
end
pseudo_random_string()
click to toggle source
# File lib/elasticsearch_s3_backup.rb, line 142
# A long random digit string prefixed with 'a' — the prefix keeps the
# value textual rather than numeric. Up to 100 digits.
def pseudo_random_string
  "a#{rand(10**100)}"
end
remove_expired_backups()
click to toggle source
# File lib/elasticsearch_s3_backup.rb, line 182
# Remove 3 month old repos: deletes every snapshot repository whose
# MM-YYYY name falls before the retention cutoff.
def remove_expired_backups
  # Compute the cutoff exactly once so the logged month always agrees
  # with the comparison, and `3.months.ago` isn't re-evaluated for
  # every repository in the select.
  cutoff = 3.months.ago
  logger.info "Removing backups older than #{cutoff.strftime '%m-%Y'}"
  expired = dated_repos.select { |repo| Time.strptime(repo, '%m-%Y') < cutoff }
  expired.each do |repo|
    logger.info "Removing #{repo}"
    es_api.snapshot.delete_repository repository: repo
  end
end
restore_test_index()
click to toggle source
# File lib/elasticsearch_s3_backup.rb, line 203
# Restore just the backup_test index to a new index, renamed so it
# never collides with the live one, and wait for the restore to finish.
def restore_test_index
  logger.info "Restoring the #{@backup_test_index} index to #{@restore_test_index}…"
  rename_rules = {
    indices: @backup_test_index,
    rename_pattern: @backup_test_index,
    rename_replacement: @restore_test_index
  }
  es_api.snapshot.restore(repository: backup_repo,
                          snapshot: snapshot_label,
                          wait_for_completion: true,
                          body: rename_rules)
end
valid_date?(date)
click to toggle source
# File lib/elasticsearch_s3_backup.rb, line 172
# Whether +date+ parses as an MM-YYYY repository name.
#
# Returns a real boolean (the old version returned the truthy Time on
# success) and rescues only the errors Time.strptime raises on bad
# input, instead of a bare rescue modifier that swallowed every
# StandardError.
def valid_date?(date)
  Time.strptime(date, '%m-%Y')
  true
rescue ArgumentError, TypeError
  false
end
wait_for_cluster_state!()
click to toggle source
# File lib/elasticsearch_s3_backup.rb, line 130 def wait_for_cluster_state! tries = 0 until (cluster_settings = Hashie::Mash.new es_api.cluster.get_settings) && [nil, 'all'].include?(cluster_settings.transient_.cluster_.routing_.allocation_.enable) fail 'Shard reallocation is disabled. Snapshot cannot proceed because creating the test ' \ 'index in this state will put the cluster into RED state.' if tries >= 5 logger.warn 'Waiting for shard reallocation to be re-enabled' sleep 10 tries += 1 end end