sensu-plugins/sensu-plugins-chef

View on GitHub
bin/check-chef-nodes.rb

Summary

Maintainability
A
2 hrs
Test Coverage
#! /usr/bin/env ruby
# frozen_string_literal: false

#   check-chef-nodes
#
# DESCRIPTION:
#   Reports the nodes in your cluster that were last seen more than a given number of seconds ago.
#   Set CRITICAL-TIMESPAN to roughly interval + splay + <average chef kitchen run time>.
#   Released under the same terms as Sensu (the MIT license); see LICENSE
#   for details.
#
# OUTPUT:
#   <output> plain text, metric data, etc
#
# PLATFORMS:
#   Linux, Windows, BSD, Solaris, etc
#
# DEPENDENCIES:
#   gem: sensu-plugin
#   gem: <?>
#
# USAGE:
#   Look for nodes that haven't checked in for 1 or more hours
#   ./check-chef-nodes.rb -t 3600 -U https://api.opscode.com/organizations/<org> -K /path/to/org.pem
#   ./check-chef-nodes.rb -t 3600 -U https://api.opscode.com/organizations/<org> -K /path/to/org.pem -e "^sensu.*$"
#
# NOTES:
#
# LICENSE:
#   Copyright 2014 Sonian, Inc. and contributors. <support@sensuapp.org>
#   Released under the same terms as Sensu (the MIT license); see LICENSE
#   for details.
#

require 'sensu-plugin/check/cli'
require 'ridley'

# Suppress the THOUSANDS of useless warnings: Hashie is handed a logger
# that was created with a nil log device.
require 'hashie'
require 'hashie/logger'
Hashie.logger = Logger.new(nil)

#
# Chef Nodes Status Checker
#
# Sensu check that asks a Chef server for all of its nodes and goes CRITICAL
# when at least one node has not converged (judged by its `ohai_time`) within
# the configured timespan. Nodes matching the exclude pattern, nodes that do
# not report an uptime, and nodes whose uptime is still within the grace
# period are never reported.
#
class ChefNodesStatusChecker < Sensu::Plugin::Check::CLI
  option :critical_timespan,
         description: 'Amount of seconds after which node considered as stuck',
         short: '-t CRITICAL-TIMESPAN',
         long: '--timespan CRITICAL-TIMESPAN',
         default: (1800 + 300.0 + 180)

  option :chef_server_url,
         description: 'URL of Chef server',
         short: '-U CHEF-SERVER-URL',
         long: '--url CHEF-SERVER-URL'

  # defaults to the equivalent of `hostname --fqdn`
  # (the default expression is evaluated once, when this class body is loaded)
  option :client_name,
         description: 'Client name',
         short: '-C CLIENT-NAME',
         long: '--client CLIENT-NAME',
         required: true,
         default: Socket.gethostbyname(Socket.gethostname).first

  option :key,
         description: 'Client\'s key',
         short: '-K CLIENT-KEY',
         long: '--keys CLIENT-KEY'

  # Interpolated into a regular expression and matched against node names;
  # the default '^$' only matches an empty name.
  option :exclude_nodes,
         description: 'Node to excludes',
         short: '-e EXCLUDE-NODES',
         long: '--exclude-nodes EXCLUDE-NODES',
         default: '^$'

  option :grace_period,
         description: 'The amount of time before a node should be evaluated for failed convergence',
         long: '--grace-period SECONDS',
         # default 5 minutes, which seems like a good but not great default
         default: (60 * 5),
         proc: proc(&:to_i)

  option :ignore_ssl_verification,
         description: 'Ignore SSL certificate verification',
         short: '-i',
         long: '--ignore-ssl',
         default: false,
         boolean: true

  # Memoized Chef API client, built on first use.
  def connection
    @connection ||= chef_api_connection
  end

  # Fetches every node from the Chef server, drops the excluded ones and
  # evaluates the remaining ones.
  #
  # @return [Array<Hash>] one single-entry hash per checked node, mapping the
  #   node name to true (convergence overdue) or false (fine / not evaluated)
  def nodes_last_seen
    # Build the pattern once instead of once per node. `to_s` keeps the
    # behaviour of string interpolation should the option ever be nil.
    excluded = Regexp.new(config[:exclude_nodes].to_s)
    nodes = connection.node.all.reject { |node| node.name =~ excluded }

    nodes.map do |node|
      # The code below reads the node's 'automatic' attributes after this
      # reload; NOTE(review): presumably the listing alone does not carry
      # them - confirm against Ridley.
      node.reload
      { node['name'] => convergence_overdue?(node['automatic']) }
    end
  end

  # Check entry point: CRITICAL with the list of overdue nodes, OK otherwise.
  def run
    if any_node_stuck?
      critical "The following nodes cannot be provisioned: #{failed_nodes_names}"
    else
      ok 'Chef Server API is ok, all nodes reporting'
    end
  end

  private

  # Builds the Ridley client from the command line options.
  def chef_api_connection
    # Verify certificates unless the user explicitly opted out. The option is
    # a boolean that defaults to false, so it is never nil; testing it with
    # `nil?` would switch verification off unconditionally.
    verify_ssl = !config[:ignore_ssl_verification]

    Celluloid.boot
    Ridley.new(
      server_url: config[:chef_server_url],
      client_name: config[:client_name],
      client_key: config[:key],
      ssl: { verify: verify_ssl }
    )
  end

  # Decides whether a node's last convergence is older than allowed.
  #
  # @param automatic [#[], nil] the node's 'automatic' attribute set
  # @return [Boolean]
  def convergence_overdue?(automatic)
    # A node without any automatic attributes is handled like one without
    # an uptime instead of raising on nil.
    automatic ||= {}
    uptime = automatic['uptime_seconds']

    # no uptime: node might have not finished convergence -> won't check
    return false unless uptime
    # won't check if node's uptime is still within grace period
    return false unless uptime > config[:grace_period]

    last_run = automatic['ohai_time']
    # uptime is known but the node never reported an ohai_time -> overdue
    return true unless last_run

    # compute elapsed time since last convergence
    (Time.now - Time.at(last_run)) > config[:critical_timespan].to_i
  end

  # Memoized result of #nodes_last_seen, so the Chef server is queried only
  # once no matter which predicate/helper is called first.
  def node_statuses
    @nodes_last_seen ||= nodes_last_seen
  end

  # @return [Boolean] true when at least one node is overdue
  def any_node_stuck?
    node_statuses.any? { |status| status.value?(true) }
  end

  # @return [Array] names of all overdue nodes
  def failed_nodes_names
    node_statuses.flat_map do |status|
      status.select { |_name, stuck| stuck == true }.keys
    end
  end
end