sensu-plugins/sensu-plugins-elasticsearch

View on GitHub
bin/check-es-heap.rb

Summary

Maintainability
C
7 hrs
Test Coverage
#! /usr/bin/env ruby
#
#   check-es-heap
#
# DESCRIPTION:
#   This plugin checks ElasticSearch's Java heap usage using its API.
#
# OUTPUT:
#   plain text
#
# PLATFORMS:
#   Linux
#
# DEPENDENCIES:
#   gem: sensu-plugin
#   gem: rest-client
#
# USAGE:
#   example commands
#
# NOTES:
#
# LICENSE:
#  Copyright 2012 Sonian, Inc <chefs@sonian.net>
#   Released under the same terms as Sensu (the MIT license); see LICENSE
#   for details.
#

require 'sensu-plugin/check/cli'
require 'rest-client'
require 'json'
require 'base64'

#
# ES Heap
#
class ESHeap < Sensu::Plugin::Check::CLI
  option :host,
         description: 'Elasticsearch host',
         short: '-h HOST',
         long: '--host HOST',
         default: 'localhost'

  option :port,
         description: 'Elasticsearch port',
         short: '-p PORT',
         long: '--port PORT',
         proc: proc(&:to_i),
         default: 9200

  option :warn,
         short: '-w N',
         long: '--warn N',
         description: 'Heap used in bytes WARNING threshold',
         proc: proc(&:to_i),
         default: 0

  option :timeout,
         description: 'Sets the connection timeout for REST client',
         short: '-t SECS',
         long: '--timeout SECS',
         proc: proc(&:to_i),
         default: 30

  option :crit,
         short: '-c N',
         long: '--crit N',
         description: 'Heap used in bytes CRITICAL threshold',
         proc: proc(&:to_i),
         default: 0

  option :percentage,
         short: '-P',
         long: '--percentage',
         description: 'Use the WARNING and CRITICAL threshold numbers as percentage indicators of the total heap available',
         default: false

  option :user,
         description: 'Elasticsearch User',
         short: '-u USER',
         long: '--user USER'

  option :password,
         description: 'Elasticsearch Password',
         short: '-W PASS',
         long: '--password PASS'

  option :https,
         description: 'Enables HTTPS',
         short: '-e',
         long: '--https'

  option :cert_file,
         description: 'Cert file to use',
         long: '--cert-file CERT'

  option :all,
         description: 'Check all nodes in the ES cluster',
         short: '-a',
         long: '--all',
         default: false

  def acquire_es_version
    info = acquire_es_resource('/')
    info['version']['number']
  end

  def acquire_es_resource(resource)
    headers = {}
    if config[:user] && config[:password]
      auth = 'Basic ' + Base64.strict_encode64("#{config[:user]}:#{config[:password]}").chomp
      headers = { 'Authorization' => auth }
    end

    protocol = if config[:https]
                 'https'
               else
                 'http'
               end

    r = if config[:cert_file]
          RestClient::Resource.new("#{protocol}://#{config[:host]}:#{config[:port]}#{resource}",
                                   ssl_ca_file: config[:cert_file].to_s,
                                   timeout: config[:timeout],
                                   headers: headers)
        else
          RestClient::Resource.new("#{protocol}://#{config[:host]}:#{config[:port]}#{resource}",
                                   timeout: config[:timeout],
                                   headers: headers)
        end
    JSON.parse(r.get)
  rescue Errno::ECONNREFUSED
    warning 'Connection refused'
  rescue RestClient::RequestTimeout
    warning 'Connection timed out'
  rescue RestClient::ServiceUnavailable
    warning 'Service is unavailable'
  rescue JSON::ParserError
    warning 'Elasticsearch API returned invalid JSON'
  end

  def acquire_stats
    if Gem::Version.new(acquire_es_version) >= Gem::Version.new('1.0.0')
      if config[:all]
        acquire_es_resource('/_nodes/stats')
      else
        acquire_es_resource('/_nodes/_local/stats')
      end
    elsif config[:all]
      acquire_es_resource('/_cluster/nodes/stats')
    else
      acquire_es_resource('/_cluster/nodes/_local/stats')
    end
  end

  def acquire_heap_data(node)
    return node['jvm']['mem']['heap_used_in_bytes'], node['jvm']['mem']['heap_max_in_bytes']
  rescue StandardError
    warning 'Failed to obtain heap used in bytes'
  end

  def acquire_heap_usage(heap_used, heap_max, node_name)
    if config[:percentage]
      heap_usage = ((100 * heap_used) / heap_max).to_i
      output = if config[:all]
                 "Node #{node_name}: Heap used in bytes #{heap_used} (#{heap_usage}% full)\n"
               else
                 "Heap used in bytes #{heap_used} (#{heap_usage}% full)"
               end
    else
      heap_usage = heap_used
      output = config[:all] ? "Node #{node_name}: Heap used in bytes #{heap_used}\n" : "Heap used in bytes #{heap_used}"
    end
    [heap_usage, output]
  end

  def run
    stats = acquire_stats
    status = { crit: '', warn: '', ok: '' }

    # Check all the nodes in the cluster, alert if any of the nodes have heap usage above thresholds
    stats['nodes'].each_value do |node|
      heap_used, heap_max = acquire_heap_data(node)
      heap_usage, output = acquire_heap_usage(heap_used, heap_max, node['name'])
      if heap_usage >= config[:crit]
        status[:crit] += output
      elsif heap_usage >= config[:warn]
        status[:warn] += output
      elsif !config[:all]
        status[:ok] += output
      end
    end

    if !status[:crit].empty?
      message status[:crit]
      critical
    elsif !status[:warn].empty?
      message status[:warn]
      warning
    else
      message status[:ok]
      ok
    end
  end
end