sensu-plugins/sensu-plugins-zookeeper

View on GitHub
bin/check-zookeeper-cluster.rb

Summary

Maintainability
A
0 mins
Test Coverage
#!/usr/bin/env ruby
#
#  check-zookeeper-cluster
#
# DESCRIPTION:
#   Check if a exhibitor managed Zookeeper cluster is OK.
#
#   This check will get exhibitor status information,
#   and check each seperate node for ruok.
#   This check also will compare cluster node count with a spesific value,
#   avg latency from mntr for a threshold value, and If the cluster
#   has a leader.
#
# PLATFORMS:
#   All
#
# DEPENDENCIES:
#   gem: sensu-plugin
#
# USAGE:
#  Check if a node has Zookeeper running and responds with imok.
#
#  Cluster should have 3 nodes, exhibitor status endpoint is not default, and average latency threshold should be 10
#  ./check-zookeeeper-cluster.rb -c 3 -e http://localhost:8181/exhibitor/v1/cluster/status -l 10
#

require 'sensu-plugin/check/cli'
require 'socket'
require 'net/http'
require 'json'

class CheckZookeeperCluster < Sensu::Plugin::Check::CLI
  option :count,
         description: 'Zookeeper cluster node count',
         short: '-c count',
         long: '--count count',
         default: 3,
         proc: proc(&:to_i)

  option :zk_port,
         description: 'Zookeeper nodes\' listen port',
         short: '-p port',
         long: '--port port',
         default: 2181,
         proc: proc(&:to_i)

  option :exhibitor,
         description: 'exhibitor end node for status checks',
         short: '-e Exhibitor status end point',
         long: '--exhibitor status end point',
         default: 'http://localhost/exhibitor/v1/cluster/status'

  option :latency,
         description: 'Critical threshold for Zookeeper average latency',
         short: '-l TICKS',
         long: '--latency TICKS',
         proc: proc(&:to_i),
         default: 10

  option :timeout,
         description: 'How long to wait for a reply in seconds.',
         short: '-t SECS',
         long: '--timeout SECS',
         proc: proc(&:to_i),
         default: 5

  def zookeeper_latency(server, port)
    l = 0
    TCPSocket.open(server, port) do |socket|
      socket.write 'mntr'
      ready = IO.select([socket], nil, nil, config[:timeout])
      if ready.nil?
        critical %(Zookeeper did not respond to 'mntr' within #{config[:timeout]} seconds)
      end
      l = ready.first.first.read.chomp.split("\n")[1].split("\t")[1].to_i
    end
  end

  def check_ruok(server, port)
    result = false
    TCPSocket.open(server, port) do |socket|
      socket.write 'ruok'
      ready = IO.select([socket], nil, nil, config[:timeout])

      if ready.nil?
        critical %(Zookeeper did not respond to 'ruok' within #{config[:timeout]} seconds)
      end

      result = ready.first.first.read.chomp
    end
    result == 'imok'
  end

  def _leader_count(json)
    l_count = max_latency = 0
    json.each do |zk|
      l_count += 1 if zk['isLeader']
      l = zookeeper_latency(zk['hostname'], config[:zk_port])
      max_latency = [max_latency, l].max
    end
    [l_count, max_latency]
  end

  def _check_leader(json)
    e = []
    l_count, max_latency = _leader_count(json)
    unless l_count == 1
      e.push("cluster should have a leader (#{l_count})")
    end
    if max_latency > config[:latency]
      e.push("cluster should have a lower latecy #{max_latency}")
    end
    return [true, e] if l_count == 1 && max_latency < config[:latency]
    [false, e]
  end

  def check_exhibitor_endpoint(response)
    json = JSON.parse(response.body)
    e = []
    r = false
    if json.length == config[:count]
      r, e = _check_leader(json)
    else
      e = ["cluster size mismatch (#{json.length}!=#{config[:count]})"]
    end
    [r, json, e]
  end

  def check_exhibitor
    response = ''
    json = ''
    url = URI.parse(config[:exhibitor])
    req = Net::HTTP::Get.new(url.path)
    Net::HTTP.new(url.host, url.port).start do |http|
      response = http.request(req)
    end
    return [false, json, ['exhibitor status is not http 200']] unless
      response.is_a? Net::HTTPSuccess
    r, json, e = check_exhibitor_endpoint(response)
    [r, json, e]
  end

  def check_each_zk(json)
    r = true
    hosts = []
    json.each do |zk|
      r = check_ruok(zk['hostname'], config[:zk_port])
      hosts.push(zk['hostname'])
      return [false, ["#{zk['hostname']} is not ok"]] unless r
    end
    [true, hosts]
  end

  def run
    result, json, errors = check_exhibitor
    result, errors = check_each_zk(json) if result
    message errors.join(', ')
    ok if result
    critical
  end
end