sensu-plugins/sensu-plugins-aws

View on GitHub
bin/check-emr-cluster.rb

Summary

Maintainability
A
3 hrs
Test Coverage
#! /usr/bin/env ruby
#
# check-emr-cluster
#
# DESCRIPTION:
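#   This plugin checks that exactly one EMR cluster with the given name was
#   created within the last 24 hours, that it did not terminate with errors,
#   and that its age is within the configured warning/critical thresholds.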
#
# OUTPUT:
#   plain-text
#
# PLATFORMS:
#   Linux
#
# DEPENDENCIES:
#   gem: aws-sdk
#   gem: sensu-plugin
#
# USAGE:
#   ./check-emr-cluster.rb --cluster-name MyCluster --aws-region eu-west-1 --use-iam --warning-over 14400 --critical-over 21600
#
# NOTES:
#
# LICENSE:
#   Copyright (c) 2015, Olivier Bazoud, olivier.bazoud@gmail.com
#   Released under the same terms as Sensu (the MIT license); see LICENSE
#   for details.
#

require 'sensu-plugin/check/cli'
require 'aws-sdk'

# Sensu check that looks up an EMR cluster by name among the clusters created
# during the last 24 hours and reports on its state and age.
#
# Results:
# * CRITICAL when the cluster is missing, appears more than once, is in the
#   'TERMINATED_WITH_ERRORS' state, or violates a critical age threshold.
# * WARNING when a warning age threshold is violated.
# * OK otherwise.
#
# Age thresholds are expressed in seconds. A negative threshold (the default,
# -1) disables the corresponding comparison.
class CheckEMRCluster < Sensu::Plugin::Check::CLI
  # Only clusters created within this many seconds before "now" are considered.
  LOOKBACK_SECONDS = 24 * 60 * 60

  option :aws_access_key,
         short: '-a AWS_ACCESS_KEY',
         long: '--aws-access-key AWS_ACCESS_KEY',
         description: "AWS Access Key. Either set ENV['AWS_ACCESS_KEY'] or provide it as an option",
         default: ENV['AWS_ACCESS_KEY']

  option :aws_secret_access_key,
         short: '-k AWS_SECRET_KEY',
         long: '--aws-secret-access-key AWS_SECRET_KEY',
         description: "AWS Secret Access Key. Either set ENV['AWS_SECRET_KEY'] or provide it as an option",
         default: ENV['AWS_SECRET_KEY']

  option :aws_region,
         short: '-r AWS_REGION',
         long: '--aws-region REGION',
         description: 'AWS Region (defaults to us-east-1).',
         default: 'us-east-1'

  option :use_iam_role,
         short: '-u',
         long: '--use-iam',
         description: 'Use IAM role authentication. Instance must have IAM role assigned for this to work'

  option :cluster_name,
         short: '-b CLUSTER_NAME',
         long: '--cluster-name CLUSTER_NAME',
         description: 'The name of the EMR cluster',
         required: true

  option :warning_over,
         description: 'Warn if cluster\'s age is greater than provided age in seconds',
         short: '-w SECONDS',
         long: '--warning-over SECONDS',
         default: -1,
         proc: proc(&:to_i)

  option :critical_over,
         description: 'Critical if cluster\'s age is greater than provided age in seconds',
         short: '-c SECONDS',
         long: '--critical-over SECONDS',
         default: -1,
         proc: proc(&:to_i)

  # NOTE: the short flag used to be '-w', which collided with --warning-over.
  # '-W' mirrors the '-C' used by --critical-under.
  option :warning_under,
         description: 'Warn if cluster\'s age is lower than provided age in seconds',
         short: '-W SECONDS',
         long: '--warning-under SECONDS',
         default: -1,
         proc: proc(&:to_i)

  option :critical_under,
         description: 'Critical if cluster\'s age is lower than provided age in seconds',
         short: '-C SECONDS',
         long: '--critical-under SECONDS',
         default: -1,
         proc: proc(&:to_i)

  # Builds the AWS client options from the explicit key/secret/region options.
  #
  # @return [Hash] :access_key_id, :secret_access_key and :region
  def aws_config
    { access_key_id: config[:aws_access_key],
      secret_access_key: config[:aws_secret_access_key],
      region: config[:aws_region] }
  end

  # Renders a duration as "D days H hours M minutes S seconds", omitting the
  # leading units that are not reached.
  #
  # @param secs [Numeric] duration in seconds
  # @return [String] human readable duration; '0 seconds' for non-positive input
  def humanize(secs)
    # Without this guard every unit is skipped and the result is an empty string.
    return '0 seconds' unless secs > 0

    [[60, :seconds], [60, :minutes], [24, :hours], [1000, :days]].map do |count, name|
      if secs > 0
        secs, n = secs.divmod(count)
        "#{n.to_i} #{name}"
      end
    end.compact.reverse.join(' ')
  end

  # Entry point of the check; always terminates through ok/warning/critical.
  def run
    emr = emr_client
    begin
      clusters = recent_clusters(emr).select { |c| c.name == config[:cluster_name] }

      critical "EMR cluster #{config[:cluster_name]} appears #{clusters.size} times" if clusters.size > 1
      critical "EMR cluster #{config[:cluster_name]} not found" if clusters.empty?

      cluster = clusters.first
      state = cluster.status.state
      critical "EMR cluster #{config[:cluster_name]} state is '#{state}'" if state == 'TERMINATED_WITH_ERRORS'

      check_age(cluster, state)
    rescue StandardError => e
      critical "EMR cluster #{config[:cluster_name]} - #{e.message}"
    end
  end

  private

  # Creates the EMR client. With --use-iam no static credentials are passed to
  # the client; only the region is set.
  def emr_client
    options = config[:use_iam_role] ? { region: config[:aws_region] } : aws_config
    Aws::EMR::Client.new(options)
  end

  # Collects the clusters created during the lookback window, following
  # pagination so that clusters beyond the first page are not missed.
  def recent_clusters(emr)
    now = Time.now
    found = []
    emr.list_clusters(created_after: now - LOOKBACK_SECONDS, created_before: now).each_page do |page|
      found.concat(page.clusters)
    end
    found
  end

  # Compares the cluster's age with the configured thresholds and reports the
  # result. The age runs from creation to termination, or to now when the
  # timeline has no end date. "Under" thresholds only apply to clusters in the
  # 'TERMINATED' state.
  def check_age(cluster, state)
    timeline = cluster.status.timeline
    finished_at = timeline.end_date_time || Time.now
    age = finished_at.to_i - timeline.creation_date_time.to_i
    terminated = state == 'TERMINATED'
    label = "EMR cluster #{config[:cluster_name]} - #{humanize(age)}"

    if over?(age, :critical_over)
      critical "#{label} vs. #{humanize(config[:critical_over])}"
    elsif over?(age, :warning_over)
      warning "#{label} vs. #{humanize(config[:warning_over])}"
    elsif terminated && under?(age, :critical_under)
      critical "#{label} vs. #{humanize(config[:critical_under])}"
    elsif terminated && under?(age, :warning_under)
      warning "#{label} vs. #{humanize(config[:warning_under])}"
    else
      ok label
    end
  end

  # True when the threshold is enabled (non-negative) and age has reached it.
  # The guard matters: with the default of -1, `age >= -1` is always true.
  def over?(age, key)
    limit = config[key]
    limit >= 0 && age >= limit
  end

  # True when the threshold is enabled (non-negative) and age is at or below it.
  def under?(age, key)
    limit = config[key]
    limit >= 0 && age <= limit
  end
end