sensu-plugins/sensu-plugins-cassandra

View on GitHub
bin/metrics-cassandra-graphite.rb

Summary

Maintainability
C
1 day
Test Coverage
#! /usr/bin/env ruby
# frozen_string_literal: true

#
#   cassandra-graphite
#
# DESCRIPTION:
#   This plugin uses Apache Cassandra's `nodetool` to collect metrics
#   from an instance of Cassandra. Default is localhost and port 7199.
#   Use 8080 for Cassandra < 0.8.
#
#   By default, 'info', 'compactionstats' and 'tpstats' metrics will be
#   output; each can be disabled with `--no-info`, `--no-compactionstats`
#   or `--no-tpstats`.
#
#   Use `--cfstats` to get detailed metrics on keyspaces and column
#   families.
#
#   Only column-families matching a regex will be output if the
#   `--filter REGEX` flag is used.
#
# OUTPUT:
#   metric data
#
# PLATFORMS:
#   Linux
#
# DEPENDENCIES:
#   gem: sensu-plugin
#   Cassandra's nodetool
#
# USAGE:
#   info and tpstats
#   ----------------
#
#     $ ./cassandra-metrics.rb
#
#      host.cassandra.load  75696701.44 1344547246
#      host.cassandra.uptime  580640  1344547246
#      host.cassandra.heap.used 88332042.24 1344547246
#      host.cassandra.heap.total  408944640.0 1344547246
#      host.cassandra.exceptions  0 1344547246
#      host.cassandra.threadpool.ReadStage.active 0 1344547246
#      host.cassandra.threadpool.ReadStage.pending  0 1344547246
#      ...
#
#   All metrics, including keyspaces and column families
#   ----------------------------------------------------
#
#     $ ./cassandra-metrics.rb --cfstats
#
#   Show metrics for column-families matching '.*user.*' regex
#   ----------------------------------------------------------
#
#     $ ./cassandra-metrics.rb  --cfstats --filter .*user.*
#
#   Show keyspace metrics, but not column family metrics
#   ----------------------------------------------------
#
#     $ ./cassandra-metrics.rb --cfstats --filter NOTHING_SHOULD_MATCH_THIS_REGEX
#
# NOTES:
#   Heavily inspired by Datadog's python plugin:
#   https://github.com/miketheman/dd-agent/blob/master/checks/cassandra.py
#
# LICENSE:
#   Copyright 2012 Joe Miller https://github.com/joemiller
#   Released under the same terms as Sensu (the MIT license); see LICENSE
#   for details.
#

require 'sensu-plugin/metric/cli'
require 'socket'

# Multiplier that turns a size expressed in the given unit into bytes.
# Decimal-style (KB, MB, ...) and binary-style (KiB, MiB, ...) suffixes are
# both treated as powers of 1024.
UNITS_FACTOR = {
  'bytes' => 1,
  'KB' => 1024,
  'KiB' => 1024,
  'MB' => 1024**2,
  'MiB' => 1024**2,
  'GB' => 1024**3,
  'GiB' => 1024**3,
  'TB' => 1024**4,
  'TiB' => 1024**4
}.freeze

#
# Cassandra Metrics
#
class CassandraMetrics < Sensu::Plugin::Metric::CLI::Graphite
  # Command-line options. The parsed values are read back through
  # `config[...]` by the methods of this class.

  # Host handed to nodetool via `-h`.
  option :hostname,
         short: '-h HOSTNAME',
         long: '--host HOSTNAME',
         description: 'cassandra hostname',
         default: 'localhost'

  # Port handed to nodetool via `-p`.
  option :port,
         short: '-P PORT',
         long: '--port PORT',
         description: 'cassandra JMX port',
         default: '7199'

  # Prefix of every emitted metric name; defaults to
  # "<local hostname>.cassandra".
  option :scheme,
         description: 'Metric naming scheme, text to prepend to metric',
         short: '-s SCHEME',
         long: '--scheme SCHEME',
         default: "#{Socket.gethostname}.cassandra"

  # When set, parse_cfstats only emits column-family metrics for column
  # families whose name matches this pattern.
  option :filter_regex,
         description: 'regular expression for filtering column families (use with --cfstats)',
         on: :tail,
         short: '-f REGEX',
         long: '--filter REGEX'

  # Enables parse_info in #run (on by default).
  option :info,
         description: 'output high-level Cassandra "info" metrics (default: yes)',
         on: :tail,
         short: '-i',
         long: '--[no-]info',
         boolean: true,
         default: true

  # Enables parse_compactionstats in #run (on by default).
  option :compactionstats,
         description: 'output Cassandra "compactionstats" metrics (default: yes)',
         on: :tail,
         short: '-o',
         long: '--[no-]compactionstats',
         boolean: true,
         default: true

  # Enables parse_tpstats in #run (on by default).
  option :tpstats,
         description: 'output Cassandra threadPool metrics (default: yes)',
         on: :tail,
         short: '-t',
         long: '--[no-]tpstats',
         boolean: true,
         default: true

  # Enables parse_cfstats in #run (off by default).
  option :cfstats,
         description: 'output metrics on keyspaces and column families (default: no)',
         on: :tail,
         short: '-c',
         long: '--[no-]cfstats',
         boolean: true,
         default: false

  # convert_to_bytes(512, 'KB') => 524288.0
  # convert_to_bytes(1, 'MB') => 1048576.0
  # Converts a size expressed in +unit+ into bytes.
  #
  # @param size [String, Numeric] magnitude; coerced with #to_f
  # @param unit [String] one of the UNITS_FACTOR keys (e.g. 'KB', 'MiB', 'bytes')
  # @return [Float] the size in bytes
  # @raise [ArgumentError] if +unit+ is not a key of UNITS_FACTOR
  def convert_to_bytes(size, unit)
    # fetch (rather than []) so an unexpected unit is reported by name instead
    # of surfacing as "nil can't be coerced into Float".
    factor = UNITS_FACTOR.fetch(unit) do
      raise ArgumentError, "unknown size unit: #{unit.inspect}"
    end
    size.to_f * factor
  end

  # execute cassandra's nodetool and return output as string
  # Runs `nodetool -h <host> -p <port> <cmd>` using the configured host and
  # port and returns what it wrote to standard output.
  #
  # The command is started from an argument vector, without a shell, so a
  # hostname or port containing spaces or shell metacharacters reaches
  # nodetool as a single argument instead of being interpreted.
  #
  # @param cmd [String] nodetool sub-command, optionally followed by
  #   whitespace-separated arguments (e.g. 'info')
  # @return [String] standard output of the nodetool process
  def nodetool_cmd(cmd)
    argv = ['nodetool', '-h', config[:hostname].to_s, '-p', config[:port].to_s, *cmd.to_s.split]
    IO.popen(argv, &:read)
  end

  # nodetool -h localhost info:
  # v 0.7
  #
  # 36299342986353445520010708318471778930
  # Load             : 457.02 KB
  # Generation No    : 1295816448
  # Uptime (seconds) : 95
  # Heap Memory (MB) : 521.86 / 1019.88
  #
  # v 0.8
  # Token            : 51022655878160265769426795515063697984
  # Gossip active    : True
  # Load             : 283.87 GB
  # Generation No    : 1331653944
  # Uptime (seconds) : 188319
  # Heap Memory (MB) : 2527.04 / 3830.00
  # Data Center      : 283
  # Rack             : 76
  # Exceptions       : 0
  #
  # v 1.1
  # Token            : 141784319550391026443072753096570088106
  # Gossip active    : true
  # Thrift active    : true
  # Load             : 821.59 GB
  # Generation No    : 1345535280
  # Uptime (seconds) : 34269
  # Heap Memory (MB) : 2382.02 / 3032.00
  # Data Center      : datacenter1
  # Rack             : rack1
  # Exceptions       : 0
  # Key Cache        : size 28141776 (bytes), capacity 104857584 (bytes), 9489268 hits, 9676043 requests, 0.987 recent hit rate, 14400 save period in seconds
  # Row Cache        : size 7947581 (bytes), capacity 1048576000 (bytes), 84005 hits, 104727 requests, 0.701 recent hit rate, 0 save period in seconds
  #
  # According to io/util/FileUtils.java units for load are:
  # TB/GB/MB/KB/bytes
  #
  # Runs `nodetool info` and emits exception count, load, uptime, heap usage
  # and key/row cache metrics, each prefixed with the configured scheme.
  #
  # Lines that match none of the known layouts are ignored, which is what
  # lets output from different Cassandra versions go through one parser.
  def parse_info
    scheme = config[:scheme]

    nodetool_cmd('info').each_line do |line|
      case line
      when /^Exceptions\s*:\s+([0-9]+)$/
        output "#{scheme}.exceptions", Regexp.last_match(1), @timestamp
      when /^Load\s*:\s+([0-9.]+)\s+([KMGT]i?B|bytes)$/
        amount, unit = Regexp.last_match.captures
        output "#{scheme}.load", convert_to_bytes(amount, unit), @timestamp
      when /^Uptime[^:]+:\s+(\d+)$/
        output "#{scheme}.uptime", Regexp.last_match(1), @timestamp
      when %r{^Heap Memory[^:]+:\s+([0-9.]+)\s+/\s+([0-9.]+)$}
        # nodetool reports both heap figures in MB
        used, total = Regexp.last_match.captures
        output "#{scheme}.heap.used", convert_to_bytes(used, 'MB'), @timestamp
        output "#{scheme}.heap.total", convert_to_bytes(total, 'MB'), @timestamp
      when /^Key Cache/
        output_cache_metrics(line, 'Key Cache', "#{scheme}.key_cache")
      when /^Row Cache/
        output_cache_metrics(line, 'Row Cache', "#{scheme}.row_cache")
      end
    end
  end

  # Emits size, capacity, hits and requests (plus hit_rate when it is a
  # number) for one "Key Cache" / "Row Cache" line of `nodetool info`.
  #
  # Two layouts are recognised:
  #   v1.1: "<label> : size N (bytes), capacity N (bytes), N hits, N requests, ..."
  #   v3.0: "<label> : entries N, size X UNIT, capacity X UNIT, N hits,
  #          N requests, R recent hit rate, ..."
  # In the v3.0 layout R may be "NaN"; the other four values are still
  # emitted and only hit_rate is left out.
  #
  # @param line [String] one line of nodetool output
  # @param label [String] literal line prefix, 'Key Cache' or 'Row Cache'
  # @param prefix [String] metric name prefix, e.g. "<scheme>.key_cache"
  def output_cache_metrics(line, label, prefix)
    number = /[-+]?[0-9]*\.?[0-9]+/
    unit = /[KMGT]i?B|bytes/
    legacy = /^#{label}[^:]+: size ([0-9]+) \(bytes\), capacity ([0-9]+) \(bytes\), ([0-9]+) hits, ([0-9]+) requests/
    modern = /^#{label}[^:]+: entries ([0-9]+), size (#{number}) (#{unit}), capacity (#{number}) (#{unit}), ([0-9]+) hits, ([0-9]+) requests, (#{number}|NaN) recent hit rate/ # rubocop:disable Layout/LineLength

    if (m = line.match(legacy))
      size, capacity, hits, requests = m.captures
      hit_rate = nil
    elsif (m = line.match(modern))
      _entries, raw_size, size_unit, raw_capacity, capacity_unit, hits, requests, hit_rate = m.captures
      size = convert_to_bytes(raw_size, size_unit)
      capacity = convert_to_bytes(raw_capacity, capacity_unit)
    else
      return
    end

    output "#{prefix}.size", size, @timestamp
    output "#{prefix}.capacity", capacity, @timestamp
    output "#{prefix}.hits", hits, @timestamp
    output "#{prefix}.requests", requests, @timestamp
    output "#{prefix}.hit_rate", hit_rate, @timestamp unless hit_rate.nil? || hit_rate == 'NaN'
  end

  # nodetool -h localhost tpstats:
  # Pool Name                    Active   Pending      Completed   Blocked  All time blocked
  # ReadStage                         0         0         282971         0                 0
  # RequestResponseStage              0         0          32926         0                 0
  # MutationStage                     0         0        3216105         0                 0
  # ReadRepairStage                   0         0              0         0                 0
  # ReplicateOnWriteStage             0         0              0         0                 0
  # GossipStage                       0         0              0         0                 0
  # AntiEntropyStage                  0         0              0         0                 0
  # MigrationStage                    0         0            188         0                 0
  # MemtablePostFlusher               0         0            110         0                 0
  # StreamStage                       0         0              0         0                 0
  # FlushWriter                       0         0            110         0                 0
  # MiscStage                         0         0              0         0                 0
  # InternalResponseStage             0         0            179         0                 0
  # HintedHandoff                     0         0              0         0                 0
  #
  # Message type           Dropped
  # RANGE_SLICE                  0
  # READ_REPAIR                  0
  # BINARY                       0
  # READ                         0
  # MUTATION                     0
  # REQUEST_RESPONSE             0
  # Runs `nodetool tpstats` and emits, per thread pool, the active, pending,
  # completed and blocked counters, and per message type the dropped count.
  #
  # Pool names may contain hyphens (e.g. "Native-Transport-Requests"), so the
  # name pattern accepts them; matching on \w+ alone silently skipped such
  # pools. The sixth column ("All time blocked") is matched but not emitted.
  def parse_tpstats
    scheme = config[:scheme]

    nodetool_cmd('tpstats').each_line do |line|
      # skip the two table header rows
      next if line.start_with?('Pool Name', 'Message type')

      if (m = line.match(/^([\w-]+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)$/))
        pool, active, pending, completed, blocked = m.captures

        output "#{scheme}.threadpool.#{pool}.active", active, @timestamp
        output "#{scheme}.threadpool.#{pool}.pending", pending, @timestamp
        output "#{scheme}.threadpool.#{pool}.completed", completed, @timestamp
        output "#{scheme}.threadpool.#{pool}.blocked", blocked, @timestamp
      elsif (m = line.match(/^(\w+)\s+(\d+)$/))
        message_type, dropped = m.captures
        output "#{scheme}.message_type.#{message_type}.dropped", dropped, @timestamp
      end
    end
  end

  # nodetool -h localhost compactionstats
  # pending tasks: 1
  #    compaction type        keyspace   column family bytes compacted     bytes total  progress
  #     ....
  #
  # note: we are only capturing the 'pending tasks' stats
  # Emits the pending compaction task count reported by
  # `nodetool compactionstats`; all other lines of the output are ignored.
  def parse_compactionstats
    nodetool_cmd('compactionstats').each_line do |line|
      pending = line[/^pending tasks:\s+([0-9]+)/, 1]
      next if pending.nil?

      output "#{config[:scheme]}.compactionstats.pending_tasks", pending, @timestamp
    end
  end

  # nodetool -h localhost cfstats
  # Keyspace: system
  #   Read Count: 216
  #   Read Latency: 1.4066805555555557 ms.
  #   Write Count: 36
  #   Write Latency: 0.32755555555555554 ms.
  #   Pending Tasks: 0
  #     Column Family: NodeIdInfo
  #     SSTable count: 0
  #     Space used (live): 0
  #     Space used (total): 0
  #     Number of Keys (estimate): 0
  #     Memtable Columns Count: 0
  #     Memtable Data Size: 0
  #     Memtable Switch Count: 0
  #     Read Count: 0
  #     Read Latency: NaN ms.
  #     Write Count: 0
  #     Write Latency: NaN ms.
  #     Pending Tasks: 0
  #     Bloom Filter False Postives: 0
  #     Bloom Filter False Ratio: 0.00000
  #     Bloom Filter Space Used: 0
  #     Key cache capacity: 1
  #     Key cache size: 0
  #     Key cache hit rate: NaN
  #     Row cache: disabled
  #     Compacted row minimum size: 0
  #     Compacted row maximum size: 0
  #     Compacted row mean size: 0
  #
  # some notes on parsing cfstats output:
  # - a line preceded by 1 tab contains keyspace metrics
  # - a line preceded by 2 tabs contains column family metrics
  # Runs `nodetool cfstats` and emits keyspace-level metrics
  # ("<scheme>.<keyspace>.<metric>") and column-family-level metrics
  # ("<scheme>.<keyspace>.<cf>.<metric>").
  #
  # Structure is inferred from tab indentation: one tab marks a keyspace
  # metric, two tabs a column family metric. A column family section starts at
  # a "Column Family: <name>" header; the "Table: <name>" spelling is accepted
  # as well. When the filter option is set, only column families whose name
  # matches it are emitted. Lines that are not "name: value" pairs and
  # values equal to 'disabled' (column family level only) are skipped.
  def parse_cfstats
    keyspace = nil
    cf = nil

    nodetool_cmd('cfstats').each_line do |line|
      num_indents = line.count("\t")

      if (m = line.match(/^Keyspace\s?:\s+(\w+)$/))
        keyspace = m[1]
        # a new keyspace never inherits the previous keyspace's column family
        cf = nil
      elsif (m = line.match(/\t\t(?:Column Family|Table)[^:]*:\s+(\w+)$/))
        cf = m[1]
      elsif num_indents.zero?
        # separator or other unindented line: the column family section ended
        cf = nil
      elsif num_indents == 2 && !cf.nil?
        next if config[:filter_regex] && !cf.match(config[:filter_regex])

        metric, value = get_metric(line)
        next if metric.nil? || value == 'disabled'

        output "#{config[:scheme]}.#{keyspace}.#{cf}.#{metric}", value, @timestamp
      elsif num_indents == 1 && !keyspace.nil?
        metric, value = get_metric(line)
        next if metric.nil?

        output "#{config[:scheme]}.#{keyspace}.#{metric}", value, @timestamp
      end
    end
  end

  # Splits one "Name: value" line of cfstats output into a Graphite-safe
  # metric name and its value. The argument is not modified.
  #
  # The name has every non-alphanumeric character replaced by "_", trailing
  # underscores removed, runs of underscores collapsed and is lower-cased.
  # A trailing " ms." / " ms" unit is removed from the value.
  #
  # @param string [String] one line of cfstats output
  # @return [Array] [metric, value], or [nil, nil] when the line does not
  #   split into a name and a value on ": "
  def get_metric(string)
    metric, value = string.strip.split(': ')
    return [nil, nil] if metric.nil? || value.nil?

    metric = metric.gsub(/[^a-zA-Z0-9]/, '_').gsub(/_*$/, '').gsub(/_{2,}/, '_').downcase
    # some versions of nodetool omit the '.' following the 'ms' unit
    [metric, value.chomp(' ms.').chomp(' ms')]
  end

  # Entry point: stamps this run with the current epoch second, invokes the
  # parser for every enabled section (info, compactionstats, tpstats,
  # cfstats, in that order) and finishes with an OK status.
  def run
    @timestamp = Time.now.to_i

    %i[info compactionstats tpstats cfstats].each do |section|
      send("parse_#{section}") if config[section]
    end

    ok
  end
end