sensu-plugins/sensu-plugins-mesos

View on GitHub
bin/check-mesos-cpu-balance.rb

Summary

Maintainability
C
1 day
Test Coverage
#! /usr/bin/env ruby
# frozen_string_literal: false

#
#   check-mesos-cpu-balance
#
# DESCRIPTION:
#   This plugin checks that there is less CPU imbalance than specified on a certain mesos cluster
#
# OUTPUT:
#   plain text
#
# PLATFORMS:
#   Linux
#
# DEPENDENCIES:
#   gem: sensu-plugin
#   gem: rest-client
#   gem: json
#
# USAGE:
#   #YELLOW
#
# NOTES:
#
# LICENSE:
#   Released under the same terms as Sensu (the MIT license); see LICENSE
#   for details.
#

require 'sensu-plugin/check/cli'
require 'rest-client'
require 'json'

class MesosCpuBalanceCheck < Sensu::Plugin::Check::CLI
  check_name 'MesosCpuBalanceCheck'
  @metrics_name = 'slaves'.freeze
  CHECK_TYPE = 'cpus'.freeze

  class << self
    attr_reader :metrics_name
  end

  option :server,
         description: 'Mesos server',
         short: '-s SERVER',
         long: '--server SERVER',
         default: 'localhost'

  option :port,
         description: 'port (default 5050)',
         short: '-p PORT',
         long: '--port PORT',
         default: 5050,
         required: false

  option :uri,
         description: 'Endpoint URI',
         short: '-u URI',
         long: '--uri URI',
         default: '/master/slaves'

  option :timeout,
         description: 'timeout in seconds',
         short: '-t TIMEOUT',
         long: '--timeout TIMEOUT',
         proc: proc(&:to_i),
         default: 5

  option :crit,
         description: 'Critical value to check against',
         short: '-c VALUE',
         long: '--critical VALUE',
         proc: proc(&:to_i),
         default: 0,
         required: false

  option :warn,
         description: 'Warning value to check against',
         short: '-w VALUE',
         long: '--warning VALUE',
         proc: proc(&:to_i),
         default: 0,
         required: false

  def run
    if config[:crit].negative? || config[:warn].negative?
      unknown "Thresholds cannot be negative, crit: #{config[:crit]}, warn: #{config[:warn]}"
    end

    server = config[:server]
    port = config[:port]
    uri = config[:uri]
    timeout = config[:timeout]
    crit = config[:crit]
    warn = config[:warn]

    begin
      server = get_leader_url server, port
      r = RestClient::Resource.new("#{server}#{uri}", timeout).get
      compare = get_check_diff(get_slaves(r))
      if compare['diff'] >= crit
        critical "There is a CPU diff of #{compare['diff']} bigger than #{crit} " + compare['msg']
      end
      if compare['diff'] >= warn
        warning "There is a CPU diff of #{compare['diff']} bigger than #{warn} " + compare['msg']
      end
    rescue Errno::ECONNREFUSED, RestClient::ResourceNotFound, SocketError
      unknown  "Mesos #{server} is not responding"
    rescue RestClient::RequestTimeout
      unknown  "Mesos #{server} connection timed out"
    end
    ok
  end

  # Redirects server call to discover the Leader
  # @param server [String] Server address
  # @param port [Number] api port
  # @return [Url] Url representing the Leader

  def get_leader_url(server, port)
    RestClient::Resource.new("http://#{server}:#{port}/redirect").get.request.url
  end

  # Parses JSON data as returned from Mesos's metrics API
  # @param data [String] Server response
  # @return [Integer] Number of failed tasks in Mesos
  def get_slaves(data)
    begin
      slaves = JSON.parse(data)[MesosCpuBalanceCheck.metrics_name]
    rescue JSON::ParserError
      raise "Could not parse JSON response: #{data}"
    end

    if slaves.nil?
      raise "No metrics for [#{MesosCpuBalanceCheck.metrics_name}] in server response: #{data}"
    end

    slaves
  end

  def get_check_diff(slavelist)
    begin
      usages = {}
      check_diff = {}
      slavelist.each do |slaveinfo|
        usages.store(slaveinfo['hostname'], slaveinfo['used_resources'][CHECK_TYPE] * 100 / slaveinfo['resources'][CHECK_TYPE])
      end
      sorted = usages.sort_by { |_hostname, total| total }
      max = usages.length - 1
      check_diff['diff'] = sorted[max][1] - sorted[0][1]
      check_diff['msg'] = "Hostname #{sorted[0][0]} uses #{sorted[0][1]}% and Hostname #{sorted[max][0]} uses #{sorted[max][1]}%"
    end
    check_diff
  end
end