sensu-plugins/sensu-plugins-aws

View on GitHub
bin/check-emr-steps.rb

Summary

Maintainability
A
45 mins
Test Coverage
#! /usr/bin/env ruby
#
# check-emr-steps
#
# DESCRIPTION:
#   Alerts on any failed steps for a cluster in the past 10 minutes.
#
# OUTPUT:
#   plain-text
#
# DEPENDENCIES:
#   gem: aws-sdk
#   gem: sensu-plugin
#
# USAGE:
#   check-emr-steps.rb -r us-west-2 -b 'My Cluster' -t FAILED -c 0
#
#   This will alert if any step on the most recently created cluster named
#   'My Cluster' has failed within the past 10 minutes.
#
# NOTES:
#
# LICENSE:
#   Bryan Absher <bryan.absher@gmail.com>
#   Released under the same terms as Sensu (the MIT license); see LICENSE
#   for details.

require 'sensu-plugins-aws'
require 'sensu-plugin/check/cli'
require 'aws-sdk'

# Sensu check that looks up the most recently created EMR cluster with a given
# name and goes critical when more than `--count` of its steps are in the
# requested state (FAILED by default) within the recent time window.
class CheckEMRSteps < Sensu::Plugin::Check::CLI
  include Common

  # Steps whose end time is older than this many seconds are ignored.
  RECENT_WINDOW_SECONDS = 10 * 60

  option :scheme,
         description: 'Metric naming scheme, text to prepend to metric',
         short: '-s SCHEME',
         long: '--scheme SCHEME',
         default: 'sensu.aws.emr'

  option :aws_region,
         short: '-r AWS_REGION',
         long: '--aws-region REGION',
         description: 'AWS Region (defaults to us-east-1).',
         default: 'us-east-1'

  option :cluster_name,
         short: '-b CLUSTER_NAME',
         long: '--cluster-name CLUSTER_NAME',
         description: 'The name of the EMR cluster',
         required: true

  option :status,
         short: '-t STEP_STATUS',
         long: '--step-status STEP_STATUS',
         description: 'Step status to check, [PENDING RUNNING COMPLETED CANCELLED FAILED INTERRUPTED]',
         default: 'FAILED'

  option :count,
         short: '-c COUNT',
         long: '--count COUNT',
         description: 'Max number of steps with this status.',
         proc: proc(&:to_i),
         default: 0

  # Check entry point.
  #
  # Exits critical when the named cluster cannot be found, or when the number
  # of recent steps in the configured state exceeds `config[:count]`;
  # otherwise exits ok.
  def run
    emr = Aws::EMR::Client.new(aws_config)

    cluster = latest_cluster(emr)
    critical "EMR cluster #{config[:cluster_name]} not found" if cluster.nil?

    # Only the first page of results is examined. Per the EMR API reference,
    # ListSteps returns steps in reverse order (newest first), so the recent
    # steps this check cares about are expected to be on that page.
    steps = emr.list_steps(
      cluster_id: cluster.id,
      step_states: [config[:status]]
    ).steps

    recent = recent_steps(steps, Time.now)
    # The emptiness guard keeps a negative --count from alerting on zero steps.
    if recent.size > config[:count] && !recent.empty?
      messages = recent.map do |step|
        "Step #{step.id} '#{step.name}' has failed on cluster #{cluster.id} '#{cluster.name}'"
      end
      critical("#{messages.count} #{messages.count > 1 ? 'steps have' : 'step has'} failed: #{messages.join(',')}")
    end
    ok
  end

  private

  # Returns the most recently created cluster named `config[:cluster_name]`,
  # or nil when no cluster has that name.
  #
  # Every result page is walked: ListClusters returns at most 50 clusters per
  # call in unsorted order, so the wanted cluster may not be on the first page.
  def latest_cluster(emr)
    matching = []
    emr.list_clusters.each_page do |page|
      matching.concat(page.clusters.select { |c| c.name == config[:cluster_name] })
    end
    matching.max_by { |c| c.status.timeline.creation_date_time }
  end

  # Selects the steps that fall inside the recent window relative to `now`.
  #
  # A step with no end time has not finished (e.g. PENDING or RUNNING), so it
  # is still in the requested state and is treated as recent rather than
  # raising on `now - nil`.
  def recent_steps(steps, now)
    steps.select do |step|
      ended_at = step.status.timeline.end_date_time
      ended_at.nil? || now - ended_at < RECENT_WINDOW_SECONDS
    end
  end
end