sensu-plugins/sensu-plugins-aws: bin/check-rds.rb

#! /usr/bin/env ruby
#
# check-rds
#
# DESCRIPTION:
#   Check RDS instance status via the RDS and CloudWatch APIs.
#
# OUTPUT:
#   plain-text
#
# PLATFORMS:
#   Linux
#
# DEPENDENCIES:
#   gem: aws-sdk
#   gem: sensu-plugin
#
# USAGE:
#   Critical if DB instance "sensu-admin-db" is not in ap-northeast-1a
#   check-rds -i sensu-admin-db --availability-zone-critical ap-northeast-1a
#
#   Warning if CPUUtilization is over 80%, critical if over 90%
#   check-rds -i sensu-admin-db --cpu-warning-over 80 --cpu-critical-over 90
#
#   Critical if CPUUtilization is over 90%, maximum over the last hour
#   check-rds -i sensu-admin-db --cpu-critical-over 90 --statistics maximum --period 3600
#
#   Warning if DatabaseConnections are over 100, critical if over 120
#   check-rds -i sensu-admin-db --connections-critical-over 120 --connections-warning-over 100 --statistics maximum --period 3600
#
#   Warning if IOPS are over 100, critical if over 200
#   check-rds -i sensu-admin-db --iops-critical-over 200 --iops-warning-over 100 --period 300
#
#   Warning if memory usage is over 80%, maximum of the last 2 hours
#   Specifying "minimum" is intentional: usage is derived from the CloudWatch "FreeableMemory" metric,
#   so the minimum of free memory corresponds to the maximum memory usage.
#   check-rds -i sensu-admin-db --memory-warning-over 80 --statistics minimum --period 7200
#
#   Disk usage, same as memory
#   check-rds -i sensu-admin-db --disk-warning-over 80 --period 7200
#
#   You can check multiple metrics simultaneously. The highest severity will be reported
#   check-rds -i sensu-admin-db --cpu-warning-over 80 --cpu-critical-over 90 --memory-warning-over 60 --memory-critical-over 80
#
#   You can accept nil values returned from CloudWatch for a time period as OK. Amazon occasionally
#   falls behind in its metrics, and this prevents false positives
#   check-rds -i sensu-admin-db --cpu-critical-over 90 -n
#
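#   If no DB instance identifier is given, every RDS instance in the region is checked
#   check-rds --cpu-warning-over 80 --cpu-critical-over 90
#
#   Check the current writer instance of a DB cluster (e.g. Aurora); the cluster name is an example
#   check-rds -l sensu-admin-cluster --cpu-critical-over 90
#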
# NOTES:
#
# LICENSE:
#   Copyright 2014 github.com/y13i
#   Released under the same terms as Sensu (the MIT license); see LICENSE
#   for details.
#

require 'sensu-plugin/check/cli'
require 'aws-sdk'
require 'time'

class CheckRDS < Sensu::Plugin::Check::CLI
  option :aws_access_key,
         short:       '-a AWS_ACCESS_KEY',
         long:        '--aws-access-key AWS_ACCESS_KEY',
         description: "AWS Access Key. Either set ENV['AWS_ACCESS_KEY'] or provide it as an option",
         default:     ENV['AWS_ACCESS_KEY']

  option :aws_secret_access_key,
         short:       '-k AWS_SECRET_KEY',
         long:        '--aws-secret-access-key AWS_SECRET_KEY',
         description: "AWS Secret Access Key. Either set ENV['AWS_SECRET_KEY'] or provide it as an option",
         default:     ENV['AWS_SECRET_KEY']

  option :role_arn,
         long:        '--role-arn ROLE_ARN',
         description: 'ARN of the AWS IAM role to assume (e.g. a role in a third-party account)',
         default:     false

  option :aws_region,
         short:       '-r AWS_REGION',
         long:        '--aws-region REGION',
         description: 'AWS Region (defaults to us-east-1).',
         default:     'us-east-1'

  option :db_instance_id,
         short:       '-i N',
         long:        '--db-instance-id NAME',
         description: 'DB instance identifier'

  option :db_cluster_id,
         short:       '-l N',
         long:        '--db-cluster-id NAME',
         description: 'DB cluster identifier'

  option :end_time,
         short:       '-t T',
         long:        '--end-time TIME',
         default:     Time.now,
         proc:        proc { |a| Time.parse a },
         description: 'CloudWatch metric statistics end time'

  option :period,
         short:       '-p N',
         long:        '--period SECONDS',
         default:     180,
         proc:        proc(&:to_i),
         description: 'CloudWatch metric statistics period'

  option :statistics,
         short:       '-S N',
         long:        '--statistics NAME',
         default:     :average,
         proc:        proc { |a| a.downcase.intern },
         description: 'CloudWatch statistics method'

  option :accept_nil,
         short: '-n',
         long: '--accept_nil',
         description: 'Continue if CloudWatch provides no metrics for the time period',
         default: false

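  # Dynamically define the threshold options:
  # --availability-zone-{warning,critical} AZ and
  # --{cpu,memory,disk,connections,iops}-{warning,critical}-over N.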
  %w[warning critical].each do |severity|
    option :"availability_zone_#{severity}",
           long:        "--availability-zone-#{severity} AZ",
           description: "Trigger a #{severity} if the availability zone differs from the given argument"

    %w[cpu memory disk connections iops].each do |item|
      option :"#{item}_#{severity}_over",
             long:        "--#{item}-#{severity}-over N",
             proc:        proc(&:to_f),
             description: "Trigger a #{severity} if #{item} usage is over the given threshold"
    end
  end

  def aws_config
    { access_key_id: config[:aws_access_key],
      secret_access_key: config[:aws_secret_access_key],
      region: config[:aws_region] }
  end

  def role_credentials
    @role_credentials = Aws::AssumeRoleCredentials.new(
      client: Aws::STS::Client.new(aws_config),
      role_arn: config[:role_arn],
      role_session_name: "role@#{Time.now.to_i}"
    )
  end

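  # API clients are memoized; when --role-arn is given, both clients use
  # temporary credentials obtained by assuming that role via STS.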
  def rds
    @rds ||= config[:role_arn] ? Aws::RDS::Client.new(credentials: role_credentials, region: aws_config[:region]) : Aws::RDS::Client.new(aws_config)
  end

  def cloud_watch
    @cloud_watch ||= config[:role_arn] ? Aws::CloudWatch::Client.new(credentials: role_credentials, region: aws_config[:region]) : Aws::CloudWatch::Client.new(aws_config)
  end

  def find_db_instance(id)
    db = rds.describe_db_instances.db_instances.detect { |db_instance| db_instance.db_instance_identifier == id }
    unknown 'DB instance not found.' if db.nil?
    db
  end

  def find_db_cluster_writer(id)
    cluster = rds.describe_db_clusters(db_cluster_identifier: id).db_clusters.first
    unknown 'DB cluster not found.' if cluster.nil?
    cluster.db_cluster_members.detect(&:is_cluster_writer).db_instance_identifier
  end

  def cloud_watch_metric(metric_name, unit)
    cloud_watch.get_metric_statistics(
      namespace: 'AWS/RDS',
      metric_name: metric_name,
      dimensions: [
        {
          name: 'DBInstanceIdentifier',
          value: @db_instance.db_instance_identifier
        }
      ],
      start_time: config[:end_time] - config[:period],
      end_time: config[:end_time],
      statistics: [config[:statistics].to_s.capitalize],
      period: config[:period],
      unit: unit
    )
  end

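  # Each returned datapoint is expected to expose a :timestamp and the requested
  # statistic (e.g. :average), which is how the value is extracted below;
  # an illustrative datapoint: { timestamp: <Time>, average: 42.0, unit: 'Percent' }.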
  def latest_value(metric)
    values = metric.datapoints.sort_by { |datapoint| datapoint[:timestamp] }

    # Handle time periods that are too small to return usable values.
    if values.empty?
      if config[:accept_nil]
        ok('CloudWatch returned no results for the time period; --accept_nil passed, so OK')
      else
        unknown('Requested time period did not return values from CloudWatch. Try increasing your period.')
      end
    else
      values.last[config[:statistics]]
    end
  end

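  # Approximate total memory per DB instance class, in GiB, used to turn the
  # FreeableMemory metric into a usage percentage; unknown classes raise a KeyError.
  # e.g. memory_total_bytes('db.m5.large') #=> 8589934592.0 (8 GiB)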
  def memory_total_bytes(instance_class)
    memory_total_gigabytes = {
      'db.cr1.8xlarge'  => 244.0,
      'db.m1.small'     => 1.7,
      'db.m1.medium'    => 3.75,
      'db.m1.large'     => 7.5,
      'db.m1.xlarge'    => 15.0,
      'db.m2.xlarge'    => 17.1,
      'db.m2.2xlarge'   => 34.2,
      'db.m2.4xlarge'   => 68.4,
      'db.m3.medium'    => 3.75,
      'db.m3.large'     => 7.5,
      'db.m3.xlarge'    => 15.0,
      'db.m3.2xlarge'   => 30.0,
      'db.m4.large'     => 8.0,
      'db.m4.xlarge'    => 16.0,
      'db.m4.2xlarge'   => 32.0,
      'db.m4.4xlarge'   => 64.0,
      'db.m4.10xlarge'  => 160.0,
      'db.m4.16xlarge'  => 256.0,
      'db.m5.large'     => 8.0,
      'db.m5.xlarge'    => 16.0,
      'db.m5.2xlarge'   => 32.0,
      'db.m5.4xlarge'   => 64.0,
      'db.m5.12xlarge'  => 192.0,
      'db.m5.24xlarge'  => 384.0,
      'db.r3.large'     => 15.0,
      'db.r3.xlarge'    => 30.5,
      'db.r3.2xlarge'   => 61.0,
      'db.r3.4xlarge'   => 122.0,
      'db.r3.8xlarge'   => 244.0,
      'db.r4.large'     => 15.25,
      'db.r4.xlarge'    => 30.5,
      'db.r4.2xlarge'   => 61.0,
      'db.r4.4xlarge'   => 122.0,
      'db.r4.8xlarge'   => 244.0,
      'db.r4.16xlarge'  => 488.0,
      'db.r5.large'     => 16.0,
      'db.r5.xlarge'    => 32.0,
      'db.r5.2xlarge'   => 64.0,
      'db.r5.4xlarge'   => 128.0,
      'db.r5.12xlarge'  => 384.0,
      'db.r5.24xlarge'  => 768.0,
      'db.t1.micro'     => 0.615,
      'db.t2.micro'     => 1.0,
      'db.t2.small'     => 2.0,
      'db.t2.medium'    => 4.0,
      'db.t2.large'     => 8.0,
      'db.t2.xlarge'    => 16.0,
      'db.t2.2xlarge'   => 32.0,
      'db.t3.micro'     => 1.0,
      'db.t3.small'     => 2.0,
      'db.t3.medium'    => 4.0,
      'db.t3.large'     => 8.0,
      'db.t3.xlarge'    => 16.0,
      'db.t3.2xlarge'   => 32.0,
      'db.x1.16xlarge'  => 976.0,
      'db.x1.32xlarge'  => 1952.0,
      'db.x1e.xlarge'   => 122.0,
      'db.x1e.2xlarge'  => 244.0,
      'db.x1e.4xlarge'  => 488.0,
      'db.x1e.8xlarge'  => 976.0,
      'db.x1e.16xlarge' => 1952.0,
      'db.x1e.32xlarge' => 3904.0
    }

    memory_total_gigabytes.fetch(instance_class) * 1024**3
  end

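  # Each check_* helper returns nil when the metric is within bounds; otherwise it
  # flags the given severity in @severities and returns a message fragment.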
  def check_az(severity, expected_az)
    return if @db_instance.availability_zone == expected_az
    @severities[severity] = true
    "; AZ is #{@db_instance.availability_zone} (expected #{expected_az})"
  end

  def check_cpu(severity, expected_lower_than)
    cpu_metric ||= cloud_watch_metric 'CPUUtilization', 'Percent'
    cpu_metric_value ||= latest_value cpu_metric
    return if cpu_metric_value < expected_lower_than
    @severities[severity] = true
    "; CPUUtilization is #{sprintf '%.2f', cpu_metric_value}% (expected lower than #{expected_lower_than}%)"
  end

  def check_memory(severity, expected_lower_than)
    memory_metric ||= cloud_watch_metric 'FreeableMemory', 'Bytes'
    memory_metric_value ||= latest_value memory_metric
    memory_total_bytes ||= memory_total_bytes @db_instance.db_instance_class
    memory_usage_bytes ||= memory_total_bytes - memory_metric_value
    memory_usage_percentage ||= memory_usage_bytes / memory_total_bytes * 100
    return if memory_usage_percentage < expected_lower_than
    @severities[severity] = true
    "; Memory usage is #{sprintf '%.2f', memory_usage_percentage}% (expected lower than #{expected_lower_than}%)"
  end

  def check_disk(severity, expected_lower_than)
    disk_metric ||= cloud_watch_metric 'FreeStorageSpace', 'Bytes'
    disk_metric_value ||= latest_value disk_metric
    disk_total_bytes ||= @db_instance.allocated_storage * 1024**3
    disk_usage_bytes ||= disk_total_bytes - disk_metric_value
    disk_usage_percentage ||= disk_usage_bytes / disk_total_bytes * 100
    return if disk_usage_percentage < expected_lower_than
    @severities[severity] = true
    "; Disk usage is #{sprintf '%.2f', disk_usage_percentage}% (expected lower than #{expected_lower_than}%)"
  end

  def check_connections(severity, expected_lower_than)
    connections_metric ||= cloud_watch_metric 'DatabaseConnections', 'Count'
    connections_metric_value ||= latest_value connections_metric
    return if connections_metric_value < expected_lower_than
    @severities[severity] = true
    "; DatabaseConnections are #{sprintf '%d', connections_metric_value} (expected lower than #{expected_lower_than})"
  end

  def check_iops(severity, expected_lower_than)
    read_iops_metric ||= cloud_watch_metric 'ReadIOPS', 'Count/Second'
    read_iops_metric_value ||= latest_value read_iops_metric
    write_iops_metric ||= cloud_watch_metric 'WriteIOPS', 'Count/Second'
    write_iops_metric_value ||= latest_value write_iops_metric
    iops_metric_value ||= read_iops_metric_value + write_iops_metric_value
    return if iops_metric_value < expected_lower_than
    @severities[severity] = true
    "; IOPS are #{sprintf '%d', iops_metric_value} (expected lower than #{expected_lower_than})"
  end

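  # Resolve the target instances (cluster writer, a single instance, or every
  # instance in the region), collect results for each, and exit with the
  # highest severity found.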
  def run
    instances = []
    if config[:db_cluster_id]
      db_cluster_writer_id = find_db_cluster_writer(config[:db_cluster_id])
      instances << find_db_instance(db_cluster_writer_id)
    elsif config[:db_instance_id].nil? || config[:db_instance_id].empty?
      rds.describe_db_instances.db_instances.each { |db| instances << db }
    else
      instances << find_db_instance(config[:db_instance_id])
    end

    messages = ''
    severities = {
      critical: false,
      warning:  false
    }
    instances.each do |instance|
      @db_instance = instance
      result = collect(instance)
      if result[1][:critical]
        messages += result[0]
        severities[:critical] = true
      elsif result[1][:warning]
        severities[:warning] = true
        messages += result[0]
      end
    end

    if severities[:critical]
      critical messages
    elsif severities[:warning]
      warning messages
    else
      ok messages
    end
  end

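  # Builds the status message for one instance and returns
  # [message, { critical: true/false, warning: true/false }].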
  def collect(instance)
    message = "\n#{instance[:db_instance_identifier]}: "
    @severities = {
      critical: false,
      warning:  false
    }

    @severities.each_key do |severity|
      message += check_az(severity, config[:"availability_zone_#{severity}"]).to_s if config[:"availability_zone_#{severity}"]

      %w[cpu memory disk connections iops].each do |item|
        result = send "check_#{item}", severity, config[:"#{item}_#{severity}_over"] if config[:"#{item}_#{severity}_over"]
        message += result unless result.nil?
      end
    end

    if %w[cpu memory disk connections iops].any? { |item| %w[warning critical].any? { |severity| config[:"#{item}_#{severity}_over"] } }
      message += "(#{config[:statistics].to_s.capitalize} within #{config[:period]}s "
      message += "between #{config[:end_time] - config[:period]} and #{config[:end_time]})"
    end
    [message, @severities]
  end
end