class Bosh::Monitor::Plugins::Resurrector

Attributes

url[R]

Public Class Methods

new(options={}) click to toggle source
Calls superclass method Bosh::Monitor::Plugins::Base::new
# File lib/bosh/monitor/plugins/resurrector.rb, line 11
# Builds the resurrector plugin from the monitor's plugin options.
#
# Reads the 'director' entry from @options (not assigned here; presumably
# set by the superclass initializer -- confirm against Base) and derives the
# director endpoint URL, the event processor and the alert tracker from it.
#
# @param options [Hash] plugin options, forwarded to the superclass
# @raise [ArgumentError] when no 'director' options are configured
def initialize(options={})
  super(options)
  director = @options['director']
  # The comma matters: `raise ArgumentError 'msg'` is parsed as a call to a
  # method named ArgumentError and fails with NoMethodError instead.
  raise ArgumentError, 'director options not set' unless director

  @url              = URI(director['endpoint'])
  @director_options = director
  @processor        = Bhm.event_processor
  @alert_tracker    = ResurrectorHelper::AlertTracker.new(@options)
end

Public Instance Methods

process(alert) click to toggle source
# File lib/bosh/monitor/plugins/resurrector.rb, line 31
# Handles a single alert and, when it identifies an instance, asks the
# director to scan and fix it.
#
# Alerts lacking any of 'deployment', 'job' or 'instance_id' are only logged.
# Otherwise the alert is recorded in the alert tracker, and then either a
# meltdown alert is emitted through the event processor (when the tracker
# reports a meltdown for the deployment) or a scan_and_fix PUT request is
# sent to the director.
#
# @param alert [#attributes, #created_at] the alert to handle
# @return the result of the last logging or HTTP call, or nil when the
#   director info could not be obtained
def process(alert)
  attrs = alert.attributes
  deployment = attrs['deployment']
  job = attrs['job']
  id = attrs['instance_id']

  # only when the agent times out do we add deployment, job & id to the alert
  # attributes, so this won't trigger a recreate for other types of alerts
  unless deployment && job && id
    return logger.warn("(Resurrector) event did not have deployment, job and id: #{alert}")
  end

  instance_key = ResurrectorHelper::JobInstanceKey.new(deployment, job, id)
  @alert_tracker.record(instance_key, alert.created_at)

  info = director_info
  unless info
    logger.error("(Resurrector) director is not responding with the status")
    return
  end

  headers = {
    'Content-Type' => 'application/json',
    'authorization' => auth_provider(info).auth_header
  }
  request = {
    head: headers,
    body: Yajl::Encoder.encode({'jobs' => {job => [id]}})
  }

  # Note: this rewrites the path on the shared @url object.
  @url.path = "/deployments/#{deployment}/scan_and_fix"

  if @alert_tracker.melting_down?(deployment)
    # freak out
    now = Time.now.to_i
    @processor.process(:alert,
                       severity: 1,
                       source: "HM plugin resurrector",
                       title: "We are in meltdown.",
                       created_at: now)

    return logger.error("(Resurrector) we are in meltdown.")
  end

  # queue instead, and only queue if it isn't already in the queue
  # what if we can't keep up with the failure rate?
  # - maybe not, maybe the meltdown detection takes care of the rate issue
  logger.warn("(Resurrector) notifying director to recreate unresponsive VM: #{deployment} #{job}/#{id}")

  send_http_put_request(url.to_s, request)
end
run() click to toggle source
# File lib/bosh/monitor/plugins/resurrector.rb, line 22
# Announces that the plugin is running, provided the EventMachine reactor
# is up.
#
# @return [false] when the reactor is not running (an error is logged);
#   otherwise the result of the info log call
def run
  if EM.reactor_running?
    logger.info("Resurrector is running...")
  else
    logger.error("Resurrector plugin can only be started when event loop is running")
    false
  end
end

Private Instance Methods

auth_provider(director_info) click to toggle source
# File lib/bosh/monitor/plugins/resurrector.rb, line 85
# Returns the memoized AuthProvider, creating it on first use from the given
# director info, the stored director options and the logger.
#
# Once created, the same instance is returned on later calls regardless of
# the argument passed.
#
# @param director_info the director info handed to AuthProvider.new
# @return [AuthProvider]
def auth_provider(director_info)
  return @auth_provider if @auth_provider

  @auth_provider = AuthProvider.new(director_info, @director_options, logger)
end
director_info() click to toggle source
# File lib/bosh/monitor/plugins/resurrector.rb, line 89
# Fetches the director's /info document and memoizes it.
#
# The request goes to a copy of @url with the path set to '/info', so @url
# itself is left untouched. Nothing is cached unless the response status is
# 200, so a later call will try again.
#
# @return the parsed response body, or nil when the status code is not 200
def director_info
  cached = @director_info
  return cached if cached

  info_uri = @url.dup.tap { |uri| uri.path = '/info' }
  response = send_http_get_request(info_uri.to_s)

  # A modifier `if` evaluates to nil when the condition is false.
  @director_info = Yajl::Parser.parse(response.body) if response.status_code == 200
end