class KubernetesHealthChecker::Runner
Constants
- ALERT_STATUSES
- RUN_INTERVAL
- TIMED_ALERT_STATUSES
Maps a pod status to how long, in seconds, a pod may remain in that status before an alert is sent.
Public Instance Methods
construct_message(output)
click to toggle source
# File bin/kubernetes_health_checker, line 125
# Builds the alert text for one polling cycle from raw `kubectl get pods` output.
#
# Each pod row is expected to look like `NAME READY STATUS RESTARTS AGE`.
# For every pod the message from get_pod_message is appended, and the pod's
# latest status and restart count are then recorded in @pods_data.
#
# @param output [String] concatenated stdout of one or more `kubectl get pods` runs
# @return [String] the combined alert text; empty when nothing needs reporting
def construct_message(output)
  messages = []

  output.each_line.with_index do |line, index|
    columns = line.split(' ')

    # The first line is always treated as the header. When several namespaces
    # are queried their outputs are concatenated, so further header rows show
    # up mid-stream and must be skipped as well.
    next if index.zero? || columns[0] == 'NAME'
    # Blank or truncated rows carry no usable pod data.
    next if columns.length < 5

    pod_name = columns[0]
    status = columns[2]
    restarts = columns[3].to_i
    # AGE is the final column. Reading it from the end keeps it correct when
    # kubectl appends a "(5m ago)" annotation to the RESTARTS column.
    age = get_age_in_seconds(columns.last)

    @pods_data[pod_name] ||= {}
    messages << get_pod_message(pod_name: pod_name, new_status: status,
                                new_restarts: restarts, age: age)

    # update our data store for the pods
    @pods_data[pod_name][:status] = status
    @pods_data[pod_name][:restarts] = restarts
  end

  messages.join
end
get_age_in_seconds(raw_age)
click to toggle source
# File bin/kubernetes_health_checker, line 152
# Converts an age string such as "45s", "3m", "2h", "1d" or a compound value
# like "2d5h" / "4m13s" into a number of seconds.
#
# @param raw_age [String, nil] age text, a sequence of <number><unit> pairs
#   where unit is one of s, m, h, d, y (a year is counted as 365 days)
# @return [Integer, nil] total seconds, or nil when the text is missing or is
#   not made up entirely of recognised <number><unit> pairs
def get_age_in_seconds(raw_age)
  seconds_per_unit = {
    's' => 1,
    'm' => 60,
    'h' => 60 * 60,
    'd' => 60 * 60 * 24,
    'y' => 60 * 60 * 24 * 365
  }

  text = raw_age.to_s
  # Reject anything that is not purely number/unit pairs (e.g. the "AGE"
  # header cell or an unknown unit) rather than guessing a value.
  return nil unless text =~ /\A(?:\d+[smhdy])+\z/

  text.scan(/(\d+)([smhdy])/).inject(0) do |total, (number, unit)|
    total + number.to_i * seconds_per_unit[unit]
  end
end
get_cli_output()
click to toggle source
# File bin/kubernetes_health_checker, line 73
# Returns the pod listing to analyse for this polling cycle.
#
# In test mode (@test truthy) the listing is read from a mock file next to this
# script; otherwise `kubectl get pods` is run once per entry in @namespace and
# the outputs are concatenated.
#
# @return [String] raw pod listing text
def get_cli_output
  if @test
    # so hacky, but pulls from second mock output if
    # it's the second time running to test state changes
    file_name = @pods_data.empty? ? '../mock_output.txt' : '../mock_output_two.txt'
    return File.read(File.expand_path(file_name, __FILE__))
  end

  listings = @namespace.map do |namespace|
    name = namespace.to_s.strip
    # A blank namespace would only make kubectl fail on a missing flag value.
    next '' if name.empty?

    # Argument-array form: kubectl is executed directly, so the namespace value
    # is never interpreted by a shell.
    IO.popen(['kubectl', 'get', 'pods', '--namespace', name], &:read)
  end

  listings.join
end
get_pod_message(pod_name:, new_status:, new_restarts:, age:)
click to toggle source
# File bin/kubernetes_health_checker, line 104
# Decides whether a pod's current state warrants an alert and returns its text.
#
# Rules, in order:
# * a state (status + restart count) that was already handled is skipped;
# * a status listed in ALERT_STATUSES alerts immediately;
# * a status listed in TIMED_ALERT_STATUSES alerts once the pod's age exceeds
#   the configured number of seconds;
# * otherwise a restart count above @alert_threshold alerts.
#
# The previous observation is read from @pods_data[pod_name]. This method also
# stores :timed_alert_status there to remember that a timed alert has been sent
# for the current status.
#
# @param pod_name [String] pod identifier used as the key into @pods_data
# @param new_status [String] status reported in this cycle
# @param new_restarts [Integer, nil] restart count reported in this cycle
# @param age [Integer, nil] pod age in seconds, nil when it could not be parsed
# @return [String] alert line ending in a newline, or '' when no alert is due
def get_pod_message(pod_name:, new_status:, new_restarts:, age:)
  pod = (@pods_data[pod_name] ||= {})
  status_key = new_status.downcase
  immediate = ALERT_STATUSES.include?(status_key)
  timed = !immediate && TIMED_ALERT_STATUSES.key?(status_key)

  # A timed alert only counts as sent while the pod stays in that same status.
  pod.delete(:timed_alert_status) unless pod[:timed_alert_status] == new_status

  unchanged = !pod.empty? && pod[:restarts] == new_restarts && pod[:status] == new_status
  # An unchanged state is skipped only if it was really dealt with. A pod first
  # seen in a timed status while still below the threshold has not alerted yet
  # and must keep being checked on later cycles.
  return '' if unchanged && (!timed || pod[:timed_alert_status] == new_status)

  if immediate
    "Pod *#{pod_name}* is in status: *#{new_status}*.\n"
  elsif timed
    alert_threshold = TIMED_ALERT_STATUSES[status_key]
    if !age.nil? && age > alert_threshold
      pod[:timed_alert_status] = new_status
      "Pod #{pod_name} has been in status: *#{new_status}* for #{age} seconds :grimacing:.\n"
    else
      ''
    end
  elsif !new_restarts.nil? && new_restarts > @alert_threshold
    "Pod *#{pod_name}* has restarted *#{new_restarts}* times and has status: *#{new_status}*.\n"
  else
    ''
  end
end
run_validations()
click to toggle source
# File bin/kubernetes_health_checker, line 69
# Verifies that an alert destination has been configured.
#
# An empty or whitespace-only URL is rejected as well as a missing one: an
# empty string is truthy in Ruby and would otherwise pass the check.
#
# @raise [RuntimeError] when @url is nil or blank
# @return [nil]
def run_validations
  raise 'must provide a url to POST alert to' if @url.nil? || @url.to_s.strip.empty?
end
send_alert(message)
click to toggle source
# File bin/kubernetes_health_checker, line 90
# Prints the alert and, outside test mode, POSTs it to @url with curl as a
# URL-encoded `payload` form field containing the JSON message.
#
# @param message [String] alert text to deliver
# @return [String, nil] curl's standard output, or nil in test mode
def send_alert(message)
  puts "Sending message: #{message}"
  return if @test

  payload = {
    channel: @slack_channel,
    username: 'kubernetes_health_check_bot',
    text: message,
    icon_emoji: ':face_with_thermometer:'
  }

  # Argument-array form: curl is executed directly, so quotes or other shell
  # metacharacters in the message, channel or URL cannot break the command.
  command = ['curl', '-X', 'POST', '--data-urlencode',
             "payload=#{JSON.generate(payload)}", @url.to_s]
  IO.popen(command, &:read)
end
set_defaults(options)
click to toggle source
# File bin/kubernetes_health_checker, line 59
# Initialises the runner's state from the given options, applying defaults.
#
# Option keys may be strings or symbols. Recognised keys:
# * :alert_threshold - restart count above which a pod alerts (default 5)
# * :channel         - channel placed in the alert payload (default '@rohan')
# * :namespace       - comma-separated namespace list (default 'default')
# * :test            - truthy to read mock output and skip posting (default false)
# * :url             - URL the alert is POSTed to (no default)
#
# @param options [Hash] raw options
# @return [Hash] the freshly reset, empty pod data store
def set_defaults(options)
  opts = options.each_with_object({}) { |(key, value), memo| memo[key.to_sym] = value }

  # Trim whitespace around each entry and drop empty ones, so "a, b" and "a,,b"
  # both behave. Fall back to the default namespace if nothing usable is left,
  # because an empty list would mean no pods are ever checked.
  namespaces = opts[:namespace].to_s.split(',').map(&:strip).reject(&:empty?)

  @alert_threshold = opts[:alert_threshold] || 5
  @slack_channel = opts[:channel] || '@rohan'
  @namespace = namespaces.empty? ? ['default'] : namespaces
  @test = opts[:test] || false
  @url = opts[:url]
  @pods_data = {}
end
start()
click to toggle source
# File bin/kubernetes_health_checker, line 34
# Entry point: configures the runner, then polls forever.
#
# Each cycle fetches the pod listing, builds the alert text, sends it when it
# is non-empty, and sleeps for RUN_INTERVAL. Any StandardError raised during
# setup or polling is written to syslog and ends the loop.
def start
  log = Syslog::Logger.new 'kubernetes_health_checker'

  begin
    set_defaults(options)
    # Validation is skipped in test mode.
    run_validations unless @test

    while true
      puts 'starting...'
      message = construct_message(get_cli_output)
      send_alert(message) unless message.empty?
      puts 'sleeping...'
      sleep RUN_INTERVAL
    end
  rescue StandardError => e
    log.error(e)
  end
end
version()
click to toggle source
# File bin/kubernetes_health_checker, line 54
# Outputs the program name and version string via `say`.
def version
  say('KubernetesHealthChecker 0.0.0')
end