bin/metrics-es-node-graphite.rb
#! /usr/bin/env ruby
#
# es-node-graphite
#
# DESCRIPTION:
# This check creates node metrics from the elasticsearch API
#
# OUTPUT:
# metric data
#
# PLATFORMS:
# Linux, Windows, BSD, Solaris, etc
#
# DEPENDENCIES:
# gem: sensu-plugin
# gem: rest-client
#
# USAGE:
# #YELLOW
#
# NOTES:
# 2014/04
# Modifid by Vincent Janelle @randomfrequency http://github.com/vjanelle
# Add more metrics, fix es 1.x URLs, translate graphite stats from
# names directly
#
# 2012/12 - Modified by Zach Dunn @SillySophist http://github.com/zadunn
# To add more metrics, and correct for new versins of ES. Tested on
# ES Version 0.19.8
#
# LICENSE:
# Copyright 2013 Vincent Janelle <randomfrequency@gmail.com>
# Copyright 2012 Sonian, Inc <chefs@sonian.net>
# Released under the same terms as Sensu (the MIT license); see LICENSE
# for details.
#
require 'sensu-plugin/metric/cli'
require 'rest-client'
require 'json'
require 'base64'
#
# ES Node Graphite Metrics
#
class ESNodeGraphiteMetrics < Sensu::Plugin::Metric::CLI::Graphite
option :scheme,
description: 'Metric naming scheme, text to prepend to queue_name.metric',
short: '-s SCHEME',
long: '--scheme SCHEME',
default: "#{Socket.gethostname}.elasticsearch"
option :server,
description: 'Elasticsearch server host.',
short: '-h HOST',
long: '--host HOST',
default: 'localhost'
option :port,
description: 'Elasticsearch port.',
short: '-p PORT',
long: '--port PORT',
proc: proc(&:to_i),
default: 9200
option :timeout,
description: 'Request timeout to elasticsearch',
short: '-t TIMEOUT',
long: '--timeout TIMEOUT',
proc: proc(&:to_i),
default: 30
option :disable_jvm_stats,
description: 'Disable JVM statistics',
long: '--disable-jvm-stats',
boolean: true,
default: false
option :disable_os_stats,
description: 'Disable OS Stats',
long: '--disable-os-stat',
boolean: true,
default: false
option :disable_process_stats,
description: 'Disable process statistics',
long: '--disable-process-stats',
boolean: true,
default: false
option :disable_thread_pool_stats,
description: 'Disable thread-pool statistics',
long: '--disable-thread-pool-stats',
boolean: true,
default: false
option :disable_fs_stats,
description: 'Disable filesystem statistics',
long: '--disable-fs-stats',
boolean: true,
default: false
option :user,
description: 'Elasticsearch User',
short: '-u USER',
long: '--user USER'
option :password,
description: 'Elasticsearch Password',
short: '-P PASS',
long: '--password PASS'
option :https,
description: 'Enables HTTPS',
short: '-e',
long: '--https'
option :cert_file,
description: 'Cert file to use',
long: '--cert-file CERT_FILE'
def get_es_resource(resource)
headers = {}
if config[:user] && config[:password]
auth = 'Basic ' + Base64.strict_encode64("#{config[:user]}:#{config[:password]}").chomp
headers = { 'Authorization' => auth }
end
protocol = if config[:https]
'https'
else
'http'
end
r = if config[:cert_file]
RestClient::Resource.new("#{protocol}://#{config[:server]}:#{config[:port]}#{resource}?pretty",
ssl_ca_file: config[:cert_file].to_s,
timeout: config[:timeout],
headers: headers)
else
RestClient::Resource.new("#{protocol}://#{config[:server]}:#{config[:port]}#{resource}?pretty",
timeout: config[:timeout],
headers: headers)
end
::JSON.parse(r.get)
rescue Errno::ECONNREFUSED
warning 'Connection refused'
rescue RestClient::RequestTimeout
warning 'Connection timed out'
end
def acquire_es_version
info = get_es_resource('/')
info['version']['number']
end
def run
# invert various stats depending on if some flags are set
os_stat = !config[:disable_os_stats]
process_stats = !config[:disable_process_stats]
jvm_stats = !config[:disable_jvm_stats]
tp_stats = !config[:disable_thread_pool_stats]
fs_stats = !config[:disable_fs_stats]
es_version = Gem::Version.new(acquire_es_version)
if es_version >= Gem::Version.new('3.0.0')
stats_query_array = %w[indices http transport]
stats_query_array.push('jvm') if jvm_stats == true
stats_query_array.push('os') if os_stat == true
stats_query_array.push('process') if process_stats == true
stats_query_array.push('thread_pool') if tp_stats == true
stats_query_array.push('fs') if fs_stats == true
stats_query_string = stats_query_array.join(',')
elsif es_version >= Gem::Version.new('1.0.0')
stats_query_array = %w[indices http network transport thread_pool]
stats_query_array.push('jvm') if jvm_stats == true
stats_query_array.push('os') if os_stat == true
stats_query_array.push('process') if process_stats == true
stats_query_array.push('tp_stats') if tp_stats == true
stats_query_array.push('fs_stats') if fs_stats == true
stats_query_string = stats_query_array.join(',')
else
stats_query_string = [
'clear=true',
'indices=true',
'http=true',
"jvm=#{jvm_stats}",
'network=true',
"os=#{os_stat}",
"process=#{process_stats}",
"thread_pool=#{tp_stats}",
'transport=true',
'thread_pool=true',
"fs=#{fs_stats}"
].join('&')
end
stats = if es_version >= Gem::Version.new('3.0.0')
get_es_resource("/_nodes/_local/stats/#{stats_query_string}")
elsif es_version >= Gem::Version.new('1.0.0')
get_es_resource("/_nodes/_local/stats?#{stats_query_string}")
else
get_es_resource("/_cluster/nodes/_local/stats?#{stats_query_string}")
end
timestamp = Time.now.to_i
node = stats['nodes'].values.first
metrics = {}
if os_stat
if es_version >= Gem::Version.new('2.0.0')
metrics['os.load_average'] = node['os']['load_average']
else
metrics['os.load_average'] = node['os']['load_average'][0]
metrics['os.load_average.1'] = node['os']['load_average'][0]
metrics['os.load_average.5'] = node['os']['load_average'][1]
metrics['os.load_average.15'] = node['os']['load_average'][2]
metrics['os.cpu.sys'] = node['os']['cpu']['sys']
metrics['os.cpu.user'] = node['os']['cpu']['user']
metrics['os.cpu.idle'] = node['os']['cpu']['idle']
metrics['os.cpu.usage'] = node['os']['cpu']['usage']
metrics['os.cpu.stolen'] = node['os']['cpu']['stolen']
metrics['os.uptime'] = node['os']['uptime_in_millis']
end
metrics['os.mem.free_in_bytes'] = node['os']['mem']['free_in_bytes']
end
if process_stats
metrics['process.cpu.percent'] = node['process']['cpu']['percent']
metrics['process.mem.resident_in_bytes'] = node['process']['mem']['resident_in_bytes'] if node['process']['mem']['resident_in_bytes']
end
if jvm_stats
metrics['jvm.mem.heap_used_in_bytes'] = node['jvm']['mem']['heap_used_in_bytes']
metrics['jvm.mem.non_heap_used_in_bytes'] = node['jvm']['mem']['non_heap_used_in_bytes']
metrics['jvm.mem.max_heap_size_in_bytes'] = 0
node['jvm']['mem']['pools'].each do |k, v|
metrics["jvm.mem.#{k.tr(' ', '_')}.max_in_bytes"] = v['max_in_bytes']
metrics['jvm.mem.max_heap_size_in_bytes'] += v['max_in_bytes']
end
# This makes absolutely no sense - not sure what it's trying to measure - @vjanelle
# metrics['jvm.gc.collection_time_in_millis'] = node['jvm']['gc']['collection_time_in_millis'] + \
# node['jvm']['mem']['pools']['CMS Old Gen']['max_in_bytes']
node['jvm']['gc']['collectors'].each do |gc, gc_value|
gc_value.each do |k, v|
# this contains stupid things like '28ms' and '2s', and there's already
# something that counts in millis, which makes more sense
unless k.end_with? 'collection_time'
metrics["jvm.gc.collectors.#{gc}.#{k}"] = v
end
end
end
metrics['jvm.threads.count'] = node['jvm']['threads']['count']
metrics['jvm.threads.peak_count'] = node['jvm']['threads']['peak_count']
metrics['jvm.uptime'] = node['jvm']['uptime_in_millis']
end
node['indices'].each do |type, index|
index.each do |k, v|
# #YELLOW
unless k =~ /(_time$)/ || v =~ /\d+/
metrics["indices.#{type}.#{k}"] = v
end
end
end
node['transport'].each do |k, v|
# #YELLOW
unless k =~ /(_size$)/
metrics["transport.#{k}"] = v
end
end
metrics['http.current_open'] = node['http']['current_open']
metrics['http.total_opened'] = node['http']['total_opened']
if node['network']
metrics['network.tcp.active_opens'] = node['network']['tcp']['active_opens']
metrics['network.tcp.passive_opens'] = node['network']['tcp']['passive_opens']
metrics['network.tcp.in_segs'] = node['network']['tcp']['in_segs']
metrics['network.tcp.out_segs'] = node['network']['tcp']['out_segs']
metrics['network.tcp.retrans_segs'] = node['network']['tcp']['retrans_segs']
metrics['network.tcp.attempt_fails'] = node['network']['tcp']['attempt_fails']
metrics['network.tcp.in_errs'] = node['network']['tcp']['in_errs']
metrics['network.tcp.out_rsts'] = node['network']['tcp']['out_rsts']
metrics['network.tcp.curr_estab'] = node['network']['tcp']['curr_estab']
metrics['network.tcp.estab_resets'] = node['network']['tcp']['estab_resets']
end
if tp_stats
node['thread_pool'].each do |pool, stat|
stat.each do |k, v|
metrics["thread_pool.#{pool}.#{k}"] = v
end
end
end
if fs_stats
node['fs'].each do |fs, fs_value|
unless fs =~ /(timestamp|data)/
fs_value.each do |k, v|
metrics["fs.#{fs}.#{k}"] = v
end
end
end
end
metrics.each do |k, v|
output([config[:scheme], k].join('.'), v, timestamp)
end
ok
end
end