sensu-plugins/sensu-plugins-mesos

View on GitHub
bin/check-marathon-apps.rb

Summary

Maintainability
C
1 day
Test Coverage
#! /usr/bin/env ruby
# frozen_string_literal: false

#
#   check-marathon-apps
#
# DESCRIPTION:
#   This check script creates checks results for each Marathon app that is running,
#   and reports the status of the app based on Marathon Application Status Reference.
#   https://mesosphere.github.io/marathon/docs/marathon-ui.html#application-status-reference
#
# OUTPUT:
#   plain text
#
# PLATFORMS:
#   Linux
#
# DEPENDENCIES:
#   gem: sensu-plugin
#
# USAGE:
#   Exclude apps matching "test"
#   check-marathon-apps.rb -x test
#   CheckMarathonApps OK: Marathon Apps Status and Health check is running properly
#
#   Only apps matching "test"
#   check-marathon-apps.rb -m test
#   CheckMarathonApps OK: Marathon Apps Status and Health check is running properly
#
# NOTES:
#
# LICENSE:
#   Copyright 2018, Sensu Plugins
#   Released under the same terms as Sensu (the MIT license); see LICENSE
#   for details.
#

require 'sensu-plugin/check/cli'
require 'rest-client'
require 'json'

# This plugin checks Marathon apps based on https://mesosphere.github.io/marathon/docs/marathon-ui.html#application-status-reference
#
# It produces a check result for `health` and another check result for `status`
# Check results can be customised by providing default values via '--default-check-config JSON' or by setting Marathon app labels, for example:
# SENSU_MARATHON_HEALTH_UNSCHEDULED_TTL = 10
# SENSU_MARATHON_HEALTH_UNSCHEDULED_SOURCE = my_source
#
# Those labels override the default values and produce the following check result when the app health is Unscheduled:
# {
#   "name": "check_marathon_app_test_health",
#   "executed": 1519305736,
#   "marathon": {
#     "id": "/test",
#     "version": "2018-02-20T15:09:43.086Z",
#     "versionInfo": {
#       "lastScalingAt": "2018-02-20T15:09:43.086Z",
#       "lastConfigChangeAt": "2018-02-20T15:09:43.086Z"
#     },
#     "tasksStaged": 0,
#     "tasksRunning": 0,
#     "tasksHealthy": 0,
#     "tasksUnhealthy": 0
#   },
#   "source": "my_source",
#   "output": "HEALTH Unscheduled - tasksRunning(0), tasksStaged(0), tasksHealthy(0), tasksUnhealthy(0)",
#   "ttl": 10,
#   "status": 2
# }
#
class MarathonAppsCheck < Sensu::Plugin::Check::CLI
  # The two kinds of check result published for every Marathon app.
  REFERENCES = %w[health status].freeze

  # State names recognised for the `status` reference.
  STATUS_STATES = %w[waiting delayed suspended deploying running].freeze

  # State names recognised for the `health` reference.
  HEALTH_STATES = %w[unscheduled overcapacity staged unknown unhealthy healthy].freeze

  # Values sent as repeated `embed=` query parameters when listing apps.
  # NOTE(review): Marathon's REST API documentation appears to spell the first
  # two as `apps.tasks` and `apps.counts` -- confirm the singular forms used
  # here are actually honoured by the server.
  APPS_EMBED_RESOURCES = %w[task count deployments lastTaskFailure failures taskStats]
                         .map { |resource| "apps.#{resource}" }
                         .freeze

  # JSON text used as the check configuration when neither
  # `--default-check-config` nor `--default-check-config-file` is supplied.
  # The top-level "_" entry applies to every result; the nested entries map a
  # state name to the fields merged into the result for that state.
  DEFAULT_CHECK_CONFIG = <<~CONFIG.freeze
    {
      "_": {"ttl": 70},
      "status":{
        "running":   {"status": 0},
        "delayed":   {"status": 1},
        "deploying": {"status": 1},
        "suspended": {"status": 1},
        "waiting":   {"status": 1}
      },
      "health":{
        "healthy":      {"status": 0},
        "overcapacity": {"status": 1},
        "staged":       {"status": 1},
        "unhealthy":    {"status": 2},
        "unscheduled":  {"status": 2},
        "unknown":      {"status": 0}
      }
    }
  CONFIG

  # Name reported by this check.
  check_name 'CheckMarathonApps'

  # Help text shown ahead of the option list. Each heredoc line carries a `|`
  # margin; the gsub strips the leading whitespace together with that margin.
  # DEFAULT_CHECK_CONFIG is interpolated verbatim before the gsub runs.
  banner <<-BANNER.gsub(/^\s+\|/, '')
    |Usage: ./check-marathon-apps.rb (options)
    |
    |This check will always return OK and publish two check results (health and status) per Marathon app.
    |Marathon applications can override default_check_config by using labels as described below.
    |
    |Some example labels that can be used in Marathon application manifests:
    |
    |# Publish '"aggregate": "component"' field in the check results for all health and status checks.
    |- SENSU_MARATHON_AGGREGATE=component
    |# similar for some other fields
    |- SENSU_MARATHON_CONTACT=support
    |
    |# Publish '"aggregate": "component"' field in the check results only for status checks.
    |- SENSU_MARATHON_STATUS_AGGREGATE=component
    |# Don't handle status check results
    |- SENSU_MARATHON_STATUS_HANDLE=false
    |
    |# Generate UNKNOWN result for unknown health status, rather then the default OK result
    |- SENSU_MARATHON_HEALTH_UNKNOWN_STATUS=3
    |
    |Similar logic could be applied for the provided default_check_config. Values
    |under the special '_' key will be used as a default value for deeper levels.
    |
    |Here is the default value for default_check_config:
    |#{DEFAULT_CHECK_CONFIG}
    |
    |Options:
  BANNER

  # Command-line options. Each one is read back through `config[:name]`.

  # Base URL of the Marathon API.
  # The short flag `-u` used to be declared here AND on :username, so the two
  # options collided. It is dropped from :url and kept on :username.
  # NOTE(review): this assumes the option parser keeps the last switch
  # registered for a short flag (so `-u` already meant --username) -- confirm.
  option :url,
         description: 'Marathon API URL',
         long: '--url url',
         default: 'http://localhost:8080'

  # Credentials for the Marathon API; `run` requires both or neither.
  option :username,
         short: '-u USERNAME',
         long: '--username USERNAME',
         default: nil,
         required: false,
         description: 'Marathon API username'

  option :password,
         long: '--password PASSWORD',
         default: nil,
         required: false,
         description: 'Marathon API password'

  # App filters; both are interpolated into regular expressions by `run`.
  option :match_pattern,
         short: '-m PATTERN',
         long: '--match-pattern PATTERN',
         required: false,
         description: 'Match app names against a pattern, exlude pattern takes precedence if both provided'

  option :exclude_pattern,
         short: '-x PATTERN',
         long: '--exclude-pattern PATTERN',
         required: false,
         description: 'Exclude apps that match a pattern, takes precedence over match pattern'

  # Comma-separated list of app attributes copied into each check result.
  option :marathon_keys,
         long: '--marathon-keys KEY1,KEY2,KEY3',
         default: 'id,version,versionInfo,tasksStaged,tasksRunning,tasksHealthy,tasksUnhealthy,lastTaskFailure',
         required: false,
         description: 'Keys retrieved from Marathon API that will be included in the output'

  # Check configuration sources, in order of precedence inside `run`:
  # inline JSON, then a file, then DEFAULT_CHECK_CONFIG.
  option :default_check_config,
         long: '--default-check-config "{"status":{"running":{"valid":"json"}},"health":{"healthy":{"valid":"json"}}}"',
         required: false,
         description: 'Default values to be used while creating the check results, '\
                      'can be overridden in a per-marathon application config via Marathon labels.'

  option :default_check_config_file,
         long: '--default-check-config-file CONFIG_FILE',
         required: false,
         description: 'Similar to `--default-check-config` but read from given file. If both parameters are provided  '\
                      '`--default-check-config` will override this one.'

  # JSON merged on top of whichever default check config was selected.
  option :check_config_overrides,
         long: '--check-config-overrides CHECK_CONFIG_OVERRIDES',
         description: 'Instead of providing whole default-check-config if you just want to introduce some new fields '\
                      'to the check config without having to provide whole config, this will be merged to the '\
                      'default-check-config.',
         default: '{}'

  # Base URL that check results are POSTed to (`<url>/results`).
  option :sensu_client_url,
         description: 'Sensu client HTTP URL',
         long: '--sensu-client-url url',
         default: 'http://localhost:3031'

  # Timeout in seconds, converted to an Integer on parse.
  option :timeout,
         description: 'timeout in seconds',
         short: '-T TIMEOUT',
         long: '--timeout TIMEOUT',
         proc: proc(&:to_i),
         default: 5
  # Entry point of the check.
  #
  # Fetches the app list and the launch queue from Marathon, resolves the
  # effective check configuration, filters the apps by the match / exclude
  # patterns and publishes the per-app results via `process_app_results`.
  # Finishes with `ok` when every result was pushed and with `critical`
  # (including the number of failing apps) otherwise.
  def run
    # Credentials are only usable as a pair: reject "one without the other".
    if config[:username].nil? != config[:password].nil?
      unknown 'You must provide both username and password to authenticate on Marathon API'
    end

    # Get Marathon API apps
    apps = fetch_apps

    # Get Marathon API queue
    queue = fetch_queue

    # Pick the check config source: inline JSON wins over the file, which wins
    # over the built-in default.
    check_config_str = if !config[:default_check_config].nil?
                         config[:default_check_config]
                       elsif !config[:default_check_config_file].nil?
                         File.read(config[:default_check_config_file])
                       else
                         DEFAULT_CHECK_CONFIG
                       end
    default_check_config = parse_json(check_config_str)
    check_config_overrides = parse_json(config[:check_config_overrides])
    # Top-level (shallow) merge: an override key replaces the whole subtree.
    check_config = default_check_config.merge(check_config_overrides)

    # Filter apps. Exclusion runs last, so it overrides the match pattern.
    # The exclude filter must read :exclude_pattern -- the key the option is
    # declared under -- otherwise `--exclude-pattern` is silently ignored.
    apps.keep_if { |app| app['id'][/#{config[:match_pattern]}/] } if config[:match_pattern]
    apps.delete_if { |app| app['id'][/#{config[:exclude_pattern]}/] } if config[:exclude_pattern]

    # Every app is processed; count those whose results could not be pushed.
    failed_apps_to_be_reported = apps.count { |app| !process_app_results(app, queue, check_config) }

    if failed_apps_to_be_reported.positive?
      critical "#{failed_apps_to_be_reported} apps are failed to be reported to sensu"
    else
      ok 'Marathon Apps Status and Health check is running properly'
    end
  end

  # Builds and publishes one check result per entry of REFERENCES for `app`.
  #
  # app          - Hash describing one Marathon app.
  # queue        - Array of Marathon launch-queue entries (each with an
  #                'app' => { 'id' => ... } member).
  # check_config - Hash with the parsed check configuration.
  #
  # Precedence of merged settings, lowest to highest: check_config '_',
  # check_config[reference]['_'], check_config[reference][state], then the
  # same three levels taken from the app's SENSU_MARATHON_* labels.
  #
  # Returns true when every result was posted, false if any post failed.
  def process_app_results(app, queue, check_config)
    # Find this app's queue entry by exact id. Interpolating the id into a
    # regexp would let metacharacters such as '.' match a different app.
    app_queue = queue.to_a.find { |entry| entry['app']['id'] == app['id'] }

    # Template shared by both results; copied per reference below.
    scaffold = check_result_scaffold(app)

    # Parse Marathon app labels
    labels_config = parse_app_labels(app['labels'].to_h)

    all_results_pushed = true

    REFERENCES.each do |reference|
      # Start from a fresh copy so that settings merged for one reference
      # (and values already coerced by enforce_sensu_field_types) never leak
      # into the result of the next reference.
      check_result = scaffold.dup

      # '/' is an invalid character in the name, replace it with '_'
      check_result['name'] = "check_marathon_app#{app['id'].tr('/', '_')}_#{reference}"

      state = case reference
              when 'health'
                get_marathon_app_health(app)
              when 'status'
                get_marathon_app_status(app, app_queue.to_h)
              end

      # Merge the user provided check config first, then the label-derived
      # config, each from the most generic to the most specific level.
      [check_config, labels_config].each do |settings|
        check_result.merge!(settings.dig('_').to_h)
        check_result.merge!(settings.dig(reference, '_').to_h)
        check_result.merge!(settings.dig(reference, state).to_h)
      end

      # Build check result output
      check_result['output'] = "#{reference.upcase} #{state.capitalize} - "\
        "tasksRunning(#{app['tasksRunning'].to_i}), tasksStaged(#{app['tasksStaged'].to_i}), "\
        "tasksHealthy(#{app['tasksHealthy'].to_i}), tasksUnhealthy(#{app['tasksUnhealthy'].to_i})"

      # Make sure that check result data types are correct
      enforce_sensu_field_types(check_result)

      # Send the result; remember a failure but keep posting the other one.
      all_results_pushed = false unless post_check_result(check_result)
    end
    all_results_pushed
  end

  # Returns the skeleton of a check result for `app`.
  #
  # Only the app attributes named in the comma-separated `--marathon-keys`
  # option are copied under 'marathon'. 'name' and 'output' start empty and
  # 'status' starts at 3; callers fill them in afterwards.
  def check_result_scaffold(app)
    wanted_keys = config[:marathon_keys].split(',')
    marathon_details = app.select { |key, _value| wanted_keys.include?(key) }

    scaffold = {}
    scaffold['name'] = ''
    scaffold['executed'] = Time.now.to_i
    scaffold['marathon'] = marathon_details
    scaffold['source'] = 'marathon'
    scaffold['output'] = ''
    scaffold['status'] = 3
    scaffold
  end

  # Coerces well-known check result fields to the data type expected for them.
  # https://sensuapp.org/docs/latest/reference/checks.html#example-check-definition
  # https://sensuapp.org/docs/latest/reference/checks.html#check-result-specification
  #
  # check_result - Hash of check result fields; modified in place.
  #
  # Boolean fields become true only when their value reads 'true'.
  # Integer fields go through Integer(), which raises on non-numeric input.
  # List fields accept either a comma-separated String or an Array; an Array
  # (e.g. a native JSON array from the check config) is kept as it is instead
  # of failing on a String-only `split`.
  #
  # Returns the same Hash.
  def enforce_sensu_field_types(check_result)
    boolean_fields = %w[publish standalone auto_resolve force_resolve handle truncate_output]
    integer_fields = %w[status interval issued executed timeout ttl ttl_status low_flap_threshold
                        high_flap_threshold truncate_output_length]
    list_fields = %w[subscribers handlers aggregates]

    # Only values of existing keys are reassigned, which is safe while iterating.
    check_result.each do |field, value|
      if boolean_fields.include?(field)
        check_result[field] = value.to_s.eql?('true')
      elsif integer_fields.include?(field)
        check_result[field] = Integer(value)
      elsif list_fields.include?(field)
        check_result[field] = value.is_a?(Array) ? value : value.to_s.split(',')
      end
    end
  end

  # Performs a GET against the Marathon API and returns the response body.
  #
  # path - String appended to the configured `--url`.
  #
  # Credentials and timeout are passed as request options through
  # RestClient::Request.execute. With the `RestClient.get(url, hash)` shortcut
  # the hash is treated as HTTP headers, so basic auth and the timeout were
  # never applied and the password travelled as a plain header.
  #
  # An HTTP error response is reported through `critical`.
  def rest_client(path)
    RestClient::Request.execute(method: :get,
                                url: "#{config[:url]}#{path}",
                                user: config[:username],
                                password: config[:password],
                                timeout: config[:timeout],
                                headers: { accept: 'application/json' }).body
  rescue RestClient::ExceptionWithResponse => e
    critical "Error while trying to GET (#{config[:url]}#{path}): #{e.response}"
  end

  # Returns the 'apps' member of Marathon's `/v2/apps` response, requesting
  # every resource listed in APPS_EMBED_RESOURCES via `embed=` parameters.
  # http://mesosphere.github.io/marathon/api-console/index.html
  def fetch_apps
    embed_params = APPS_EMBED_RESOURCES.map { |name| "embed=#{name}" }
    response_body = rest_client("/v2/apps?#{embed_params.join('&')}")
    payload = parse_json(response_body)
    payload['apps']
  end

  # Returns the 'queue' member of Marathon's `/v2/queue` response.
  # http://mesosphere.github.io/marathon/api-console/index.html
  def fetch_queue
    response_body = rest_client('/v2/queue')
    payload = parse_json(response_body)
    payload['queue']
  end

  # POSTs one check result, serialised as JSON, to `<sensu-client-url>/results`.
  #
  # data - Hash holding the check result.
  #
  # The timeout is handed over as a request option through
  # RestClient::Request.execute. With the `RestClient.post(url, payload, hash)`
  # shortcut the hash is treated as HTTP headers, so the timeout was sent as a
  # header and never enforced.
  #
  # Returns true on success. On an HTTP error response a message is written
  # with `warn` and false is returned so the caller can carry on.
  def post_check_result(data)
    RestClient::Request.execute(method: :post,
                                url: "#{config[:sensu_client_url]}/results",
                                payload: data.to_json,
                                headers: { content_type: 'application/json' },
                                timeout: config[:timeout])
    true
  rescue RestClient::ExceptionWithResponse => e
    # print a message about failing POST but keep going
    warn "Error while trying to POST check result for #{data} (#{config[:sensu_client_url]}/results): #{e.response}"
    false
  end

  # Parses `json` (converted with to_s first) and returns the resulting Ruby
  # structure. A JSON::ParserError is reported through `critical`, with both
  # the parser message and the offending input in the text.
  def parse_json(json)
    text = json.to_s
    JSON.parse(text)
  rescue JSON::ParserError => parse_error
    critical "Failed to parse JSON: #{parse_error}\nJSON => #{json}"
  end

  # Turns an app's SENSU_MARATHON_* labels into a nested configuration Hash.
  #
  # app_labels - Hash of label name => value (anything responding to to_h).
  #
  # A label name is split on '_', the SENSU / MARATHON parts are dropped and
  # the rest is downcased. An optional leading reference (see REFERENCES) and
  # an optional state of that reference select the nesting level; whatever
  # remains is re-joined with '_' to form the field name, so
  # SENSU_MARATHON_AUTO_RESOLVE yields the field 'auto_resolve'.
  #
  # Returns a Hash shaped like
  #   { '_' => {...}, 'health' => { '_' => {...}, 'unknown' => {...} }, ... }
  # where '_' holds the values that apply to the whole level.
  def parse_app_labels(app_labels)
    # Named `parsed` so the local does not shadow the plugin's `config` method.
    parsed = {}
    # Only grab labels that start with SENSU_MARATHON
    labels = app_labels.to_h.select { |label, _value| /^SENSU_MARATHON/.match(label) }

    labels.each do |label, value|
      config_keys = label.split('_')

      # Delete SENSU and MARATHON element
      config_keys.delete_if { |k| /^SENSU$|^MARATHON$/.match(k) }

      # Downcase
      config_keys.map!(&:downcase)

      reference = config_keys.shift if REFERENCES.include? config_keys[0]
      if (reference == 'health' && HEALTH_STATES.include?(config_keys[0])) ||
         (reference == 'status' && STATUS_STATES.include?(config_keys[0]))
        state = config_keys.shift
      end

      # Multi-word field names (auto_resolve, ttl_status, ...) are written
      # with underscores, so the remaining parts are joined with '_'.
      key = config_keys.join('_')

      # Pick the nesting level: global, per reference, or per reference state.
      target = if reference.nil?
                 parsed['_'] ||= {}
               else
                 (parsed[reference] ||= {})[state || '_'] ||= {}
               end
      target[key] = value
    end
    parsed
  end

  # Derives the Marathon application status of `app`.
  # https://mesosphere.github.io/marathon/docs/marathon-ui.html#application-status-reference
  #
  # app       - Hash describing the app.
  # app_queue - launch-queue entry of the app (anything responding to to_h).
  #
  # Rules are tried in order: a queue entry whose delay is overdue means
  # 'waiting', one that is not overdue means 'delayed'; then 'suspended',
  # 'deploying' and 'running' are decided from the app itself.
  # Returns '' when no rule applies.
  def get_marathon_app_status(app, app_queue)
    # true / false are told apart from "no delay information" (nil).
    overdue = app_queue.to_h.dig('delay', 'overdue')
    return 'waiting' if overdue == true
    return 'delayed' if overdue == false

    instances = app['instances'].to_i
    return 'suspended' if instances.zero? && app['tasksRunning'].to_i.zero?
    return 'deploying' if app['deployments'].to_a.any?
    return 'running' if instances == app['tasksRunning'].to_i

    ''
  end

  # Derives the Marathon application health of `app`.
  # https://mesosphere.github.io/marathon/docs/marathon-ui.html#application-health-reference
  #
  # Rules are tried in order and the first match wins: 'unscheduled' (no tasks
  # and no deployments), 'overcapacity' (more running tasks than instances),
  # 'staged', 'unknown' (no health checks defined), 'unhealthy', 'healthy'.
  # Returns '' when no rule applies.
  def get_marathon_app_health(app)
    return 'unscheduled' if app['tasks'].to_a.empty? && app['deployments'].to_a.empty?
    return 'overcapacity' if app['tasksRunning'].to_i > app['instances'].to_i
    return 'staged' if app['tasksStaged'].to_i > 0
    return 'unknown' if app['healthChecks'].to_a.empty?
    return 'unhealthy' if app['tasksUnhealthy'].to_i > 0
    return 'healthy' if app['healthChecks'].to_a.any? && app['tasksHealthy'].to_i > 0

    ''
  end
end