bin/check-ceph.rb
#! /usr/bin/env ruby
#
# check-ceph
#
# DESCRIPTION:
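# Checks the health of a Ceph cluster by running 'ceph health' and
# reporting HEALTH_OK as OK, HEALTH_WARN as WARNING and anything else
# as CRITICAL.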
#
# OUTPUT:
# plain text
#
# PLATFORMS:
# Linux
#
# DEPENDENCIES:
# gem: sensu-plugin
# gem: english
# ceph client
#
# USAGE:
# check-ceph.rb [-k KEY] [-m MON] [-c NAME] [-n NAME] [-t SEC]
#               [-i FLAG[,FLAG]] [-d] [-o]
#
# NOTES:
# Runs 'ceph health' command(s) to report health status of ceph
# cluster. May need read access to ceph keyring and/or root access
# for authentication.
#
# Using -i (--ignore-flags) option allows specific options that are
# normally considered Ceph warnings to be overlooked and considered
# as 'OK' (e.g. noscrub,nodeep-scrub).
#
# Using -d (--detailed) and/or -o (--osd-tree) will dramatically increase
# the verbosity of warning/error reports; however, they may provide
# additional insight into cluster-related problems during notification.
#
# LICENSE:
# Copyright 2013 Brian Clark <brian.clark@cloudapt.com>
# Released under the same terms as Sensu (the MIT license); see LICENSE
# for details.
#
require 'sensu-plugin/check/cli'
require 'timeout'
require 'English'
#
# Check Ceph Health
#
# Sensu check that shells out to the `ceph` client and maps the reported
# health string onto a check status: output starting with HEALTH_OK is
# reported as ok, HEALTH_WARN as warning, and anything else as critical.
class CheckCephHealth < Sensu::Plugin::Check::CLI
  # The connection-related options below use a proc that turns the raw value
  # into a ready-to-append command-line fragment (note the leading space);
  # #run_cmd concatenates those fragments onto the ceph command verbatim.
  option :keyring,
         description: 'Path to cephx authentication keyring file',
         short: '-k KEY',
         long: '--keyring',
         proc: proc { |k| " -k #{k}" }

  option :monitor,
         description: 'Optional monitor IP',
         short: '-m MON',
         long: '--monitor',
         proc: proc { |m| " -m #{m}" }

  option :cluster,
         description: 'Optional cluster name',
         short: '-c NAME',
         long: '--cluster',
         proc: proc { |c| " --cluster=#{c}" }

  option :name,
         description: 'Optional client name',
         short: '-n NAME',
         long: '--name',
         proc: proc { |n| " --name=#{n}" }

  option :timeout,
         description: 'Timeout (default 10)',
         short: '-t SEC',
         long: '--timeout',
         proc: proc(&:to_i),
         default: 10

  option :ignore_flags,
         description: 'Optional ceph warning flags to ignore',
         short: '-i FLAG[,FLAG]',
         long: '--ignore-flags',
         proc: proc { |f| f.split(',') }

  option :show_detail,
         description: 'Show ceph health detail on warns/errors (verbose!)',
         short: '-d',
         long: '--detailed',
         boolean: true,
         default: false

  option :osd_tree,
         description: 'Show OSD tree on warns/errors (verbose!)',
         short: '-o',
         long: '--osd-tree',
         boolean: true,
         default: false

  # Runs +cmd+ through the shell with the configured cluster/keyring/monitor/
  # name fragments appended and stderr folded into stdout.
  #
  # cmd - base command line, e.g. 'ceph health'.
  #
  # Returns the command's combined output as a String. Reports critical when
  # the command exceeds config[:timeout] seconds, produces no output, or
  # finishes with a non-zero exit status.
  def run_cmd(cmd)
    cmd += config[:cluster] if config[:cluster]
    cmd += config[:keyring] if config[:keyring]
    cmd += config[:monitor] if config[:monitor]
    cmd += config[:name] if config[:name]
    cmd += ' 2>&1'

    child_pid = nil
    output = nil
    status = nil
    begin
      Timeout.timeout(config[:timeout]) do
        pipe = IO.popen(cmd)
        child_pid = pipe.pid
        # Drain the pipe BEFORE reaping the child: waiting first deadlocks
        # (until the timeout fires) as soon as the child writes more than
        # the OS pipe buffer holds, which verbose ceph output can do.
        output = pipe.read
        # Closing a popen'ed IO waits for the child and sets $CHILD_STATUS.
        pipe.close
        status = $CHILD_STATUS.exitstatus
      end
    rescue Timeout::Error
      terminate_child(child_pid)
      critical 'Execution timed out'
    end

    critical "Command '#{cmd}' returned no output" if output.to_s.empty?
    # `== 0` rather than `zero?`: exitstatus is nil when the child was killed
    # by a signal, and that must be treated as a failure, not raise.
    critical output unless status == 0
    output
  end

  # Decides whether a HEALTH_WARN is caused solely by flags listed in
  # config[:ignore_flags].
  #
  # result - raw output of the ceph health command.
  #
  # Returns +result+ with HEALTH_WARN rewritten to HEALTH_OK when nothing is
  # left after removing the status word, the "flag(s) set" suffix, newlines
  # and every ignored flag; otherwise returns +result+ unchanged.
  def strip_warns(result)
    # Non-destructive gsub/delete on purpose: the bang variants return nil
    # when nothing was replaced, so chaining them raised NoMethodError for
    # any output lacking one of the patterns (e.g. a plain HEALTH_OK).
    remainder = result.gsub(/HEALTH_WARN\ /, '')
                      .gsub(/\ ?flag\(s\) set/, '')
                      .delete("\n")
    # Each flag is interpolated into the pattern as-is, together with an
    # optional comma on either side so list separators disappear with it.
    config[:ignore_flags].each do |flag|
      remainder = remainder.gsub(/,?#{flag},?/, '')
    end

    if remainder.empty?
      result.gsub(/HEALTH_WARN/, 'HEALTH_OK')
    else
      result
    end
  end

  # Check entry point: fetches the health report, applies the ignore list,
  # and reports the matching status.
  #
  # NOTE(review): this relies on sensu-plugin's ok/warning/critical ending
  # the process; the explicit return only guards against that not holding.
  def run
    result = check_ceph_health
    result = strip_warns(result) if config[:ignore_flags]
    return ok(result) if result.start_with?('HEALTH_OK')

    # The OSD tree is documented as an aid for warn/error reports, so it is
    # only collected once the cluster is known not to be healthy.
    result += run_cmd('ceph osd tree') if config[:osd_tree]
    if result.start_with?('HEALTH_WARN')
      warning result
    else
      critical result
    end
  end

  private

  # Runs 'ceph health detail' when --detailed was given, plain 'ceph health'
  # otherwise, and returns the command output.
  def check_ceph_health
    if config[:show_detail]
      run_cmd('ceph health detail')
    else
      run_cmd('ceph health')
    end
  end

  # Force-kills and reaps a timed-out child process. A nil pid (the timeout
  # fired before the child was spawned) is ignored.
  def terminate_child(pid)
    return unless pid

    Process.kill(9, pid)
    Process.wait(pid)
  rescue Errno::ESRCH, Errno::EPERM, Errno::ECHILD
    # The child already exited, was already reaped, or is not ours to kill;
    # the caller reports the timeout either way.
    nil
  end
end