class KubernetesHealthChecker::Runner

Constants

ALERT_STATUSES
RUN_INTERVAL
TIMED_ALERT_STATUSES

Threshold times are in seconds.

Public Instance Methods

construct_message(output) click to toggle source
# File bin/kubernetes_health_checker, line 125
# Builds the alert text for one polling cycle from raw `kubectl get pods`
# output, and records each pod's latest status and restart count in
# @pods_data so the next cycle can detect changes.
#
# @param output [String] whitespace-aligned table text, one pod per line
# @return [String] concatenated alert lines; empty when nothing needs alerting
def construct_message(output)
  message = +''

  output.split("\n").each_with_index do |line, index|
    columns = line.split(' ')

    # The first line is a header, and get_cli_output appends one table per
    # namespace, so further header rows can show up in the middle of the text.
    next if index.zero? || columns.first == 'NAME'
    # Skip blank or truncated lines instead of failing on missing columns.
    next if columns.size < 5

    pod_name = columns[0]
    status = columns[2]
    restarts = columns[3].to_i
    # AGE is the final column. Reading it from the end keeps it correct when
    # RESTARTS carries a suffix such as "3 (5m ago)", which adds extra columns.
    age = get_age_in_seconds(columns.last)

    @pods_data[pod_name] ||= {}
    message << get_pod_message(pod_name: pod_name,
                               new_status: status,
                               new_restarts: restarts,
                               age: age)

    # remember this observation so the next cycle can compare against it
    @pods_data[pod_name][:status] = status
    @pods_data[pod_name][:restarts] = restarts
  end

  message
end
get_age_in_seconds(raw_age) click to toggle source
# File bin/kubernetes_health_checker, line 152
# Converts a kubectl AGE value into a number of seconds.
#
# Accepts single-unit values ("45s", "10m", "3h", "1d") as well as the
# compound values kubectl prints for intermediate ages ("3m20s", "2d5h").
#
# @param raw_age [String, nil] the AGE column text
# @return [Integer, nil] total seconds, or nil when the value is missing,
#   malformed, or uses an unknown unit
def get_age_in_seconds(raw_age)
  seconds_per_unit = {
    's' => 1,
    'm' => 60,
    'h' => 60 * 60,
    'd' => 60 * 60 * 24,
    'y' => 60 * 60 * 24 * 365
  }

  text = raw_age.to_s
  # The whole value must be a run of <digits><letters> pairs; anything else
  # (header text, "<invalid>", an empty cell) is not an age.
  return nil unless text =~ /\A(?:\d+[A-Za-z]+)+\z/

  total = 0
  text.scan(/(\d+)([A-Za-z]+)/) do |amount, unit|
    factor = seconds_per_unit[unit]
    return nil unless factor

    total += amount.to_i * factor
  end
  total
end
get_cli_output() click to toggle source
# File bin/kubernetes_health_checker, line 73
# Returns the pod listing text to analyse for this cycle.
#
# In test mode the text comes from a mock file located next to this file:
# the first mock while @pods_data is still empty, the second one afterwards,
# so a state change between two cycles can be exercised. Otherwise kubectl
# is run once per entry of @namespace and the outputs are concatenated.
#
# @return [String] the combined `kubectl get pods` output
def get_cli_output
  if @test
    file_name = @pods_data.empty? ? '../mock_output.txt' : '../mock_output_two.txt'
    return File.read(File.expand_path(file_name, __FILE__))
  end

  listings = @namespace.map do |namespace|
    # The argv form runs kubectl directly, without a shell, so a namespace
    # value can never be interpreted as shell syntax. Stripping keeps entries
    # such as " staging" (from "default, staging") working as they did when
    # the shell discarded the extra whitespace.
    IO.popen(['kubectl', 'get', 'pods', '--namespace', namespace.to_s.strip], &:read)
  end

  listings.join
end
get_pod_message(pod_name:, new_status:, new_restarts:, age:) click to toggle source
# File bin/kubernetes_health_checker, line 104
# Decides whether a pod's current observation warrants an alert line.
#
# Rules, in priority order:
# * status listed in ALERT_STATUSES: alert whenever the pod's status or
#   restart count differs from the previous observation;
# * status listed in TIMED_ALERT_STATUSES: alert once when the pod's age
#   exceeds that status's threshold, even if the pod was first observed
#   below the threshold and has not changed since;
# * otherwise: alert when the state changed and the restart count exceeds
#   @alert_threshold.
#
# Stores a :timed_alert_sent flag in the pod's @pods_data entry so a timed
# alert is not repeated every cycle; the flag is cleared on any state change.
#
# @param pod_name [String] key of the pod in @pods_data (entry must exist)
# @param new_status [String] status reported in this cycle
# @param new_restarts [Integer, nil] restart count reported in this cycle
# @param age [Integer, nil] pod age in seconds, nil when it could not be parsed
# @return [String] one alert line ending in a newline, or '' for no alert
def get_pod_message(pod_name:, new_status:, new_restarts:, age:)
  pod = @pods_data[pod_name]
  status_key = new_status.downcase
  unchanged = !pod.empty? && pod[:restarts] == new_restarts && pod[:status] == new_status

  # A different status or restart count opens a fresh timed-alert window.
  pod.delete(:timed_alert_sent) unless unchanged

  text = ''

  if ALERT_STATUSES.include?(status_key)
    text = "Pod *#{pod_name}* is in status: *#{new_status}*.\n" unless unchanged
  elsif TIMED_ALERT_STATUSES.key?(status_key)
    alert_threshold = TIMED_ALERT_STATUSES[status_key]
    # Deliberately evaluated for unchanged pods too: the age keeps growing
    # while the status stays the same, and that is exactly when the
    # threshold gets crossed.
    if !pod[:timed_alert_sent] && age && age > alert_threshold
      text = "Pod #{pod_name} has been in status: *#{new_status}* for #{age} seconds :grimacing:.\n"
      pod[:timed_alert_sent] = true
    end
  elsif !unchanged && new_restarts && new_restarts > @alert_threshold
    text = "Pod *#{pod_name}* has restarted *#{new_restarts}* times and has status: *#{new_status}*.\n"
  end

  text
end
run_validations() click to toggle source
# File bin/kubernetes_health_checker, line 69
# Ensures a destination URL was configured before any alert is attempted.
#
# @raise [RuntimeError] when @url is nil or false
def run_validations
  return if @url

  raise 'must provide a url to POST alert to'
end
send_alert(message) click to toggle source
# File bin/kubernetes_health_checker, line 90
# Prints the alert and, outside test mode, POSTs it to @url with curl as a
# form-encoded `payload` field containing the JSON message description.
#
# @param message [String] alert text to deliver
# @return [String, nil] curl's standard output, or nil in test mode
def send_alert(message)
  puts "Sending message: #{message}"
  return if @test

  payload = {
    channel: @slack_channel,
    username: 'kubernetes_health_check_bot',
    text: message,
    icon_emoji: ':face_with_thermometer:'
  }

  # Arguments are handed to curl as an argv array rather than a shell string,
  # so quotes in the message or characters like "&" in the URL reach curl
  # verbatim instead of being interpreted by a shell.
  command = ['curl', '-X', 'POST',
             '--data-urlencode', "payload=#{JSON.generate(payload)}",
             @url.to_s]
  IO.popen(command, &:read)
end
set_defaults(options) click to toggle source
# File bin/kubernetes_health_checker, line 59
# Initialises the runner's settings from an options hash, applying defaults
# for anything not supplied, and resets the per-pod state store.
#
# Recognised keys (string or symbol): alert_threshold, channel, namespace
# (comma-separated list), test, url.
#
# @param options [Hash] raw options
# @return [Hash] the freshly reset @pods_data
def set_defaults(options)
  opts = options.each_with_object({}) { |(key, value), memo| memo[key.to_sym] = value }

  @alert_threshold = opts[:alert_threshold] || 5
  @slack_channel = opts[:channel] || '@rohan'

  # Tolerate "a, b" and stray commas, and never end up with an empty list:
  # an empty list would make every cycle silently check nothing.
  namespaces = opts[:namespace].to_s.split(',').map(&:strip).reject(&:empty?)
  @namespace = namespaces.empty? ? ['default'] : namespaces

  @test = opts[:test] || false
  @url = opts[:url]
  @pods_data = {}
end
start() click to toggle source
# File bin/kubernetes_health_checker, line 34
# Entry point: applies the options, validates them unless in test mode, then
# repeats forever: fetch the pod listing, build the alert text, send it when
# non-empty, and sleep for RUN_INTERVAL. Any StandardError raised during
# setup or a cycle is written to syslog, which ends the loop.
def start
  logger = Syslog::Logger.new('kubernetes_health_checker')

  begin
    set_defaults(options)
    run_validations unless @test

    while true
      puts 'starting...'
      message = construct_message(get_cli_output)
      send_alert(message) unless message.empty?
      puts 'sleeping...'
      sleep(RUN_INTERVAL)
    end
  rescue StandardError => error
    logger.error(error)
  end
end
version() click to toggle source
# File bin/kubernetes_health_checker, line 54
# Reports the tool's name and version string through `say`.
def version
  say('KubernetesHealthChecker 0.0.0')
end