bin/check-graphite.rb
#! /usr/bin/env ruby
#
# <script name>
#
# DESCRIPTION:
# Get time series values from Graphite and create events based on values
#
# OUTPUT:
# plain text
#
# PLATFORMS:
# Linux
#
# DEPENDENCIES:
# gem: sensu-plugin
# gem: array_stats
#
# USAGE:
# #YELLOW
#
# NOTES:
#
# LICENSE:
# Copyright 2012 Ulf Mansson @ Recorded Future
# Modifications by Chris Jansen to support wildcard targets
# Released under the same terms as Sensu (the MIT license); see LICENSE
# for details.
#
require 'sensu-plugin/check/cli'
require 'json'
require 'net/http'
require 'net/https'
require 'socket'
require 'array_stats'
class Graphite < Sensu::Plugin::Check::CLI
option :host,
short: '-h HOST',
long: '--host HOST',
description: 'Graphite host to connect to, include port',
required: true
option :target,
description: 'The graphite metric name. Could be a comma separated list of metric names.',
short: '-t TARGET',
long: '--target TARGET',
required: true
option :complex_target,
description: 'Allows complex targets which contain functions. Disables splitting on comma.',
short: '-x',
long: '--complex_target',
default: false
option :period,
description: 'The period back in time to extract from Graphite and compare with. Use 24hours,2days etc, same format as in Graphite',
short: '-p PERIOD',
long: '--period PERIOD',
default: '2hours'
option :updated_since,
description: 'The graphite value should have been updated within UPDATED_SINCE seconds, default to 600 seconds',
short: '-u UPDATED_SINCE',
long: '--updated_since UPDATED_SINCE',
default: 600
option :acceptable_diff_percentage,
description: 'The acceptable diff from max values in percentage, used in check_function_increasing',
short: '-D ACCEPTABLE_DIFF_PERCENTAGE',
long: '--acceptable_diff_percentage ACCEPTABLE_DIFF_PERCENTAGE',
default: 0
option :check_function_increasing,
description: 'Check that value is increasing or equal over time (use acceptable_diff_percentage if it should allow to be lower)',
short: '-i',
long: '--check_function_increasing',
default: false,
boolean: true
option :greater_than,
description: 'Change whether value is greater than or less than check',
short: '-g',
long: '--greater_than',
default: false
option :check_last,
description: 'Check that the last value in GRAPHITE is greater/less than VALUE',
short: '-l VALUE',
long: '--last VALUE',
default: nil
option :ignore_nulls,
description: 'Do not error on null values, used in check_function_increasing',
short: '-n',
long: '--ignore_nulls',
default: false,
boolean: true
option :concat_output,
description: 'Include warning messages in output even if overall status is critical',
short: '-c',
long: '--concat_output',
default: false,
boolean: true
option :short_output,
description: 'Report only the highest status per series in output',
short: '-s',
long: '--short_output',
default: false,
boolean: true
option :check_average,
description: 'MAX_VALUE should be greater than the average of Graphite values from PERIOD',
short: '-a MAX_VALUE',
long: '--average_value MAX_VALUE'
option :data_points,
description: 'Number of data points to include in average check (smooths out spikes)',
short: '-d VALUE',
long: '--data_points VALUE',
default: 1
option :check_average_percent,
description: 'MAX_VALUE% should be greater than the average of Graphite values from PERIOD',
short: '-b MAX_VALUE',
long: '--average_percent_value MAX_VALUE'
option :percentile,
description: 'Percentile value, should be used in conjunction with percentile_value, defaults to 90',
long: '--percentile PERCENTILE',
default: 90
option :check_percentile,
description: 'Values should not be greater than the VALUE of Graphite values from PERIOD',
long: '--percentile_value VALUE'
option :http_user,
description: 'Basic HTTP authentication user',
short: '-U USER',
long: '--http-user USER',
default: nil
option :http_password,
description: 'Basic HTTP authentication password',
short: '-P PASSWORD',
long: '--http-password USER',
default: nil
def initialize
super
@graphite_cache = {}
end
def graphite_cache(target = nil)
# #YELLOW
if @graphite_cache.key?(target)
graphite_value = @graphite_cache[target].select { |value| value[:period] == @period }
graphite_value if graphite_value.size > 0
end
end
# Create a graphite url from params
#
#
def graphite_url(target = nil)
url = "#{config[:host]}/render/"
url = 'http://' + url unless url[0..3] == 'http'
# #YELLOW
url = url + "?target=#{target}" if target # rubocop:disable Style/SelfAssignment
URI.parse(url)
end
def get_levels(config_param)
values = config_param.split(',')
i = 0
levels = {}
%w(warning error fatal).each do |type|
levels[type] = values[i] if values[i]
i += 1
end
levels
end
def get_graphite_values(target)
cache_value = graphite_cache target
return cache_value if cache_value
params = {
target: target,
from: "-#{@period}",
format: 'json'
}
req = Net::HTTP::Post.new(graphite_url.path)
# If the basic http authentication credentials have been provided, then use them
if !config[:http_user].nil? && !config[:http_password].nil?
req.basic_auth(config[:http_user], config[:http_password])
end
req.set_form_data(params)
nethttp = Net::HTTP.new(graphite_url.host, graphite_url.port)
if graphite_url.scheme == 'https'
nethttp.use_ssl = true
end
resp = nethttp.start { |http| http.request(req) }
data = JSON.parse(resp.body)
@graphite_cache[target] = []
if data.size > 0
data.each { |d| @graphite_cache[target] << { target: d['target'], period: @period, datapoints: d['datapoints'] } }
graphite_cache target
end
end
# Will give max values for [0..-2]
def max_graphite_value(target)
max_values = {}
values = get_graphite_values target
if values
values.each do |val|
max = get_max_value(val[:datapoints])
max_values[val[:target]] = max
end
end
max_values
end
def get_max_value(values)
if values
values.map { |i| i[0] ? i[0] : 0 }[0..-2].max
end
end
def last_graphite_metric(target, count = 1)
last_values = {}
values = get_graphite_values target
if values
values.each do |val|
last = get_last_metric(val[:datapoints], count)
last_values[val[:target]] = last
end
end
last_values
end
def get_last_metric(values, count = 1)
if values
ret = []
values_size = values.size
count = values_size if count > values_size
while count > 0
values_size -= 1
break if values[values_size].nil?
count -= 1 if values[values_size][0]
ret.push(values[values_size]) if values[values_size][0]
end
ret
end
end
def last_graphite_value(target, count = 1)
last_metrics = last_graphite_metric(target, count)
last_values = {}
if last_metrics
last_metrics.each do |target_name, metrics|
last_values[target_name] = metrics.map { |metric| metric[0] }.mean
end
end
last_values
end
def been_updated_since(target, time, updated_since)
last_time_stamp = last_graphite_metric target
warnings = []
if last_time_stamp
last_time_stamp.each do |target_name, value|
last_time_stamp_bool = value[1] > time.to_i ? true : false
warnings << "The metric #{target_name} has not been updated in #{updated_since} seconds" unless last_time_stamp_bool
end
end
warnings
end
def greater_less
return 'greater' if config[:greater_than]
return 'less' unless config[:greater_than]
end
def check_increasing(target)
updated_since = config[:updated_since].to_i
time_to_be_updated_since = Time.now - updated_since
critical_errors = []
warnings = []
max_gv = max_graphite_value target
last_gv = last_graphite_value target
if last_gv.is_a?(Hash) && max_gv.is_a?(Hash)
# #YELLOW
last_gv.each do |target_name, value|
if value && max_gv[target_name]
last = value
max = max_gv[target_name]
if max > last * (1 + config[:acceptable_diff_percentage].to_f / 100)
msg = "The metric #{target} with last value #{last} is less than max value #{max} during #{config[:period]} period"
critical_errors << msg
end
end
end
else
warnings << "Could not found any value in Graphite for metric #{target}, see #{graphite_url(target)}"
end
unless config[:ignore_nulls]
warnings.concat(been_updated_since(target, time_to_be_updated_since, updated_since))
end
[warnings, critical_errors, []]
end
def check_average_percent(target, max_values, data_points = 1)
values = get_graphite_values target
last_values = last_graphite_value(target, data_points)
return [[], [], []] unless values
warnings = []
criticals = []
fatal = []
values.each do |data|
target = data[:target]
values_pair = data[:datapoints]
values_array = values_pair.select(&:first).map { |v| v.first unless v.first.nil? }
# #YELLOW
avg_value = values_array.reduce { |sum, el| sum + el if el }.to_f / values_array.size # rubocop:disable SingleLineBlockParams
last_value = last_values[target]
percent = last_value / avg_value unless last_value.nil? || avg_value.nil?
# #YELLOW
%w(fatal error warning).each do |type|
next unless max_values.key?(type)
max_value = max_values[type]
var1 = config[:greater_than] ? percent : max_value.to_f
var2 = config[:greater_than] ? max_value.to_f : percent
if !percent.nil? && var1 > var2 && (values_array.size > 0 || !config[:ignore_nulls])
text = "The last value of metric #{target} is #{percent}% #{greater_less} than allowed #{max_value}% of the average value #{avg_value}"
case type
when 'warning'
warnings << text
when 'error'
criticals << text
when 'fatal'
fatal << text
else
raise "Unknown type #{type}"
end
break if config[:short_output]
end
end
end
[warnings, criticals, fatal]
end
def check_average(target, max_values)
values = get_graphite_values target
return [[], [], []] unless values
warnings = []
criticals = []
fatal = []
values.each do |data|
target = data[:target]
values_pair = data[:datapoints]
values_array = values_pair.select(&:first).map { |v| v.first unless v.first.nil? }
# #YELLOW
avg_value = values_array.reduce { |sum, el| sum + el if el }.to_f / values_array.size # rubocop:disable SingleLineBlockParams
# YELLOW
%w(fatal error warning).each do |type|
next unless max_values.key?(type)
max_value = max_values[type]
var1 = config[:greater_than] ? avg_value : max_value.to_f
var2 = config[:greater_than] ? max_value.to_f : avg_value
if var1 > var2 && (values_array.size > 0 || !config[:ignore_nulls])
text = "The average value of metric #{target} is #{avg_value} that is #{greater_less} than allowed average of #{max_value}"
case type
when 'warning'
warnings << text
when 'error'
criticals << text
when 'fatal'
fatal << text
else
raise "Unknown type #{type}"
end
break if config[:short_output]
end
end
end
[warnings, criticals, fatal]
end
def check_percentile(target, max_values, percentile, data_points = 1)
values = get_graphite_values target
last_values = last_graphite_value(target, data_points)
return [[], [], []] unless values
warnings = []
criticals = []
fatal = []
values.each do |data|
target = data[:target]
values_pair = data[:datapoints]
values_array = values_pair.select(&:first).map { |v| v.first unless v.first.nil? }
percentile_value = values_array.percentile(percentile)
last_value = last_values[target]
percent = last_value / percentile_value unless last_value.nil? || percentile_value.nil?
# #YELLOW
%w(fatal error warning).each do |type|
next unless max_values.key?(type)
max_value = max_values[type]
var1 = config[:greater_than] ? percent : max_value.to_f
var2 = config[:greater_than] ? max_value.to_f : percent
if !percentile_value.nil? && var1 > var2
text = "The percentile value of metric #{target} (#{last_value}) is #{greater_less} than the
#{percentile}th percentile (#{percentile_value}) by more than #{max_value}%"
case type
when 'warning'
warnings << text
when 'error'
criticals << text
when 'fatal'
fatal << text
else
raise "Unknown type #{type}"
end
break if config[:short_output]
end
end
end
[warnings, criticals, fatal]
end
def check_last(target, max_values)
last_targets = last_graphite_value target
return [[], [], []] unless last_targets
warnings = []
criticals = []
fatal = []
# #YELLOW
last_targets.each do |target_name, last_value|
unless last_value.nil?
# #YELLOW
%w(fatal error warning).each do |type|
next unless max_values.key?(type)
max_value = max_values[type]
var1 = config[:greater_than] ? last_value : max_value.to_f
var2 = config[:greater_than] ? max_value.to_f : last_value
if var1 > var2
text = "The metric #{target_name} is #{last_value} that is #{greater_less} than last allowed #{max_value}"
case type
when 'warning'
warnings << text
when 'error'
criticals << text
when 'fatal'
fatal << text
else
raise "Unknown type #{type}"
end
break if config[:short_output]
end
end
end
end
[warnings, criticals, fatal]
end
def run # rubocop:disable AbcSize
targets = config[:complex_target] ? [config[:target]] : config[:target].split(',')
@period = config[:period]
critical_errors = []
warnings = []
fatals = []
# #YELLOW
targets.each do |target|
if config[:check_function_increasing]
inc_warnings, inc_critical, inc_fatal = check_increasing target
warnings += inc_warnings
critical_errors += inc_critical
fatals += inc_fatal
end
if config[:check_last]
max_values = get_levels config[:check_last]
lt_warnings, lt_critical, lt_fatal = check_last(target, max_values)
warnings += lt_warnings
critical_errors += lt_critical
fatals += lt_fatal
end
if config[:check_average]
max_values = get_levels config[:check_average]
avg_warnings, avg_critical, avg_fatal = check_average(target, max_values)
warnings += avg_warnings
critical_errors += avg_critical
fatals += avg_fatal
end
if config[:check_average_percent]
max_values = get_levels config[:check_average_percent]
avg_warnings, avg_critical, avg_fatal = check_average_percent(target, max_values, config[:data_points].to_i)
warnings += avg_warnings
critical_errors += avg_critical
fatals += avg_fatal
end
if config[:check_percentile]
max_values = get_levels config[:check_percentile]
pct_warnings, pct_critical, pct_fatal = check_percentile(target, max_values, config[:percentile].to_i, config[:data_points].to_i)
warnings += pct_warnings
critical_errors += pct_critical
fatals += pct_fatal
end
end
fatals_string = fatals.size > 0 ? fatals.join("\n") : ''
criticals_string = critical_errors.size > 0 ? critical_errors.join("\n") : ''
warnings_string = warnings.size > 0 ? warnings.join("\n") : ''
if config[:concat_output]
fatals_string = fatals_string + "\n" + criticals_string if critical_errors.size > 0
fatals_string = fatals_string + "\nGraphite WARNING: " + warnings_string if warnings.size > 0
criticals_string = criticals_string + "\nGraphite WARNING: " + warnings_string if warnings.size > 0
critical fatals_string if fatals.size > 0
critical criticals_string if critical_errors.size > 0
warning warnings_string if warnings.size > 0 # rubocop:disable Style/IdenticalConditionalBranches
else
critical fatals_string if fatals.size > 0
critical criticals_string if critical_errors.size > 0
warning warnings_string if warnings.size > 0 # rubocop:disable Style/IdenticalConditionalBranches
end
ok
end
end