sensu-plugins/sensu-plugins-process-checks

View on GitHub
bin/check-process.rb

Summary

Maintainability
C
1 day
Test Coverage
#! /usr/bin/env ruby
#
# check-process
#
# DESCRIPTION:
#   Finds processes matching various filters (name, state, etc). Will not
#   match itself by default. The number of processes found will be tested
#   against the warning/critical thresholds. By default, fails with a
#   CRITICAL if fewer than one process matches (-W and -C both default to
#   1 and there is no default upper bound) -- specify values for -W/-C
#   and/or -w/-c to override this.
#
#   Attempts to work on Cygwin (where ps does not have the features we
#   need) by parsing the output of `ps -aWl`, but this is not well tested.
#
# OUTPUT:
#   plain text
#
# PLATFORMS:
#   Linux
#
# DEPENDENCIES:
#   gem: sensu-plugin
#   gem: english
#
# USAGE:
#   # chef-client is running
#   check-process.rb -p chef-client -W 1
#
#   # there are not too many zombies
#   check-process.rb -s Z -w 5 -c 10
#
# NOTES:
#
# LICENSE:
#   Copyright 2011 Sonian, Inc <chefs@sonian.net>
#   Released under the same terms as Sensu (the MIT license); see LICENSE
#   for details.
#

require 'sensu-plugin/check/cli'
require 'English'

#
# Check Processes
#
class CheckProcess < Sensu::Plugin::Check::CLI
  # Pattern applied to the duration columns read from ps (etime and time):
  # an optional "DD-" day prefix, an optional "HH:" hour prefix, then MM:SS.
  DURATION_PATTERN = /(\d+-)?(\d\d:)?(\d\d):(\d\d)/

  option :warn_over,
         short: '-w N',
         long: '--warn-over N',
         description: 'Trigger a warning if over a number',
         proc: proc(&:to_i)

  option :crit_over,
         short: '-c N',
         long: '--critical-over N',
         description: 'Trigger a critical if over a number',
         proc: proc(&:to_i)

  option :warn_under,
         short: '-W N',
         long: '--warn-under N',
         description: 'Trigger a warning if under a number',
         proc: proc(&:to_i),
         default: 1

  option :crit_under,
         short: '-C N',
         long: '--critical-under N',
         description: 'Trigger a critical if under a number',
         proc: proc(&:to_i),
         default: 1

  option :metric,
         short: '-t METRIC',
         long: '--metric METRIC',
         description: 'Trigger a critical if there are METRIC procs',
         proc: proc(&:to_sym)

  option :match_self,
         short: '-m',
         long: '--match-self',
         description: 'Match itself',
         boolean: true,
         default: false

  option :match_parent,
         short: '-M',
         long: '--match-parent',
         description: 'Match parent process it uses ruby {process.ppid}',
         boolean: true,
         default: false

  option :cmd_pat,
         short: '-p PATTERN',
         long: '--pattern PATTERN',
         description: 'Match a command against this pattern'

  option :exclude_pat,
         short: '-x PATTERN',
         long: '--exclude-pattern PATTERN',
         description: "Don't match against a pattern to prevent false positives"

  option :file_pid,
         short: '-f PID',
         long: '--file-pid PID',
         description: 'Check against a specific PID'

  option :file_pid_crit,
         short: '-F',
         long: '--file-pid-crit',
         description: 'Trigger a critical if pid file is specified but non-existent'

  option :vsz,
         short: '-z VSZ',
         long: '--virtual-memory-size VSZ',
         description: 'Trigger on a Virtual Memory size is bigger than this',
         proc: proc(&:to_i)

  option :rss,
         short: '-r RSS',
         long: '--resident-set-size RSS',
         description: 'Trigger on a Resident Set size is bigger than this',
         proc: proc(&:to_i)

  option :cpu_utilization,
         short: '-P xx',
         long: '--cpu-utilization xx',
         description: 'Trigger on a CPU utilization is bigger than this',
         proc: proc(&:to_f)

  option :thcount,
         short: '-T THCOUNT',
         long: '--thread-count THCOUNT',
         description: 'Trigger on a Thread Count is bigger than this',
         proc: proc(&:to_i)

  option :state,
         short: '-s STATE',
         long: '--state STATE',
         description: 'Trigger on a specific state, example: Z for zombie',
         proc: proc { |a| a.split(',') }

  option :user,
         short: '-u USER',
         long: '--user USER',
         description: 'Trigger on a specific user',
         proc: proc { |a| a.split(',') }

  option :esec_over,
         short: '-e SECONDS',
         long: '--esec-over SECONDS',
         proc: proc(&:to_i),
         description: 'Match processes that are older than this, in SECONDS'

  option :esec_under,
         short: '-E SECONDS',
         long: '--esec-under SECONDS',
         proc: proc(&:to_i),
         description: 'Match process that are younger than this, in SECONDS'

  option :cpu_over,
         short: '-i SECONDS',
         long: '--cpu-over SECONDS',
         proc: proc(&:to_i),
         description: 'Match processes cpu time that is older than this, in SECONDS'

  option :cpu_under,
         short: '-I SECONDS',
         long: '--cpu-under SECONDS',
         proc: proc(&:to_i),
         description: 'Match processes cpu time that is younger than this, in SECONDS'

  option :encoding,
         description: 'Explicit encoding when reading process list',
         long: '--encoding ENCODING',
         default: 'ASCII-8BIT'

  # Read the pid stored in a pid file.
  #
  # When the file does not exist the check is terminated through `unknown`,
  # or through `critical` when the --file-pid-crit flag was given.
  #
  # @param path [String] the path to the pid file, including the file
  # @return [Integer] the file content, stripped and converted with to_i
  def read_pid(path)
    if File.exist?(path)
      File.read(path).strip.to_i
    elsif config[:file_pid_crit].nil?
      unknown "Could not read pid file #{path}"
    else
      critical "Could not read pid file #{path}"
    end
  end

  # Run a command and return its output split into lines.
  #
  # stderr is redirected into stdout, so error output of the command ends
  # up in the returned lines as well. The output is read using the encoding
  # configured through --encoding.
  #
  # @param cmd [String] the command to read the output from
  # @return [Array<String>] the output, one element per line
  def read_lines(cmd)
    IO.popen(cmd + ' 2>&1', external_encoding: config[:encoding]) do |child|
      child.read.split("\n")
    end
  end

  # Create a hash from one line of command output.
  #
  # The line is split on whitespace into at most cols.size fields, so the
  # last column keeps any embedded whitespace. Columns without a matching
  # field map to nil.
  #
  # @param line [String] a single output line
  # @param cols [Array<Symbol>] the keys, in column order
  # @return [Hash] column name => field text
  def line_to_hash(line, *cols)
    Hash[cols.zip(line.strip.split(/\s+/, cols.size))]
  end

  # Is this running on cygwin?
  #
  # Decided by whether `ps -W` exits with status 0.
  #
  # @return [Boolean]
  def on_cygwin?
    `ps -W 2>&1`
    $CHILD_STATUS.exitstatus.zero?
  end

  # Acquire all the processes on a system for further analysis.
  #
  # The header line of the ps output is dropped; every remaining line
  # becomes one hash keyed by column name.
  #
  # @return [Array<Hash>] one hash per ps output line
  def acquire_procs
    if on_cygwin?
      read_lines('ps -aWl').drop(1).map do |line|
        # Horrible hack because cygwin's ps has no o option, every
        # format includes the STIME column (which may contain spaces),
        # and the process state (which isn't actually a column) can be
        # blank. As of revision 1.35, the format is:
        # const char *lfmt = "%c %7d %7d %7d %10u %4s %4u %8s %s\n";
        #
        # NOTE(review): that format has 9 fields; once the state and STIME
        # are sliced out 7 remain, yet 9 column names are passed below --
        # confirm the column mapping on a real Cygwin host.
        state = line.slice!(0..0)
        _stime = line.slice!(45..53)
        line_to_hash(line, :pid, :ppid, :pgid, :winpid, :tty, :uid, :etime, :command, :time).merge(state: state)
      end
    else
      read_lines('ps axwwo user,pid,vsz,rss,pcpu,nlwp,state,etime,time,command').drop(1).map do |line|
        line_to_hash(line, :user, :pid, :vsz, :rss, :cpu, :thcount, :state, :etime, :time, :command)
      end
    end
  end

  # Convert an elapsed-time field into seconds.
  #
  # @param etime [String] the etime column of a process
  # @return [Integer] the elapsed time in seconds
  def etime_to_esec(etime)
    duration_to_seconds(etime)
  end

  # Convert a cpu-time field into seconds.
  #
  # @param time [String] the time column of a process
  # @return [Integer] the cpu time in seconds
  def cputime_to_csec(time)
    duration_to_seconds(time)
  end

  # The main function: collect the processes, apply the configured filters
  # and report ok/warning/critical from the resulting count.
  #
  # The count is the number of matching processes, or -- with --metric --
  # the sum of that column (converted with to_i) over the matching processes.
  def run
    procs = filter_procs(acquire_procs)
    msg = summary_message(procs.size)

    if config[:metric]
      # Seed the reduction with 0: without a seed, reduce returns nil for
      # an empty list and the threshold comparisons below would raise.
      count = procs.map { |p| p[config[:metric]].to_i }.reduce(0, :+)
      msg << "; #{config[:metric]} == #{count}"
    else
      count = procs.size
    end

    # A threshold that is not set (nil) is skipped; critical wins over warning.
    if config[:crit_under] && count < config[:crit_under]
      critical msg
    elsif config[:crit_over] && count > config[:crit_over]
      critical msg
    elsif config[:warn_under] && count < config[:warn_under]
      warning msg
    elsif config[:warn_over] && count > config[:warn_over]
      warning msg
    else
      ok msg
    end
  end

  private

  # Convert a ps duration field into seconds using DURATION_PATTERN.
  #
  # Missing optional parts (days, hours) count as 0 because nil.to_i is 0;
  # String#to_i ignores the trailing "-" / ":" of the captured prefixes.
  #
  # @param field [String] a duration such as "03:25" or "2-01:03:25"
  # @return [Integer] the duration in seconds
  def duration_to_seconds(field)
    m = DURATION_PATTERN.match(field)
    m[1].to_i * 86_400 + m[2].to_i * 3600 + m[3].to_i * 60 + m[4].to_i
  end

  # Apply every configured filter to the process list, in place.
  #
  # @param procs [Array<Hash>] the processes from acquire_procs
  # @return [Array<Hash>] the same array, reduced to the matching processes
  def filter_procs(procs)
    if config[:file_pid] && (file_pid = read_pid(config[:file_pid]))
      procs.select! { |p| p[:pid].to_i == file_pid }
    end

    procs.reject! { |p| p[:pid].to_i == $PROCESS_ID } unless config[:match_self]

    unless config[:match_parent]
      parent_pid = Process.ppid
      procs.reject! { |p| p[:pid].to_i == parent_pid }
    end

    # Compile the user supplied patterns once instead of once per process.
    if config[:exclude_pat]
      exclude = Regexp.new(config[:exclude_pat])
      procs.reject! { |p| p[:command] =~ exclude }
    end
    if config[:cmd_pat]
      wanted = Regexp.new(config[:cmd_pat])
      procs.select! { |p| p[:command] =~ wanted }
    end

    procs.select! { |p| p[:vsz].to_f > config[:vsz] } if config[:vsz]
    procs.select! { |p| p[:rss].to_f > config[:rss] } if config[:rss]
    procs.select! { |p| p[:cpu].to_f > config[:cpu_utilization] } if config[:cpu_utilization]
    procs.select! { |p| p[:thcount].to_i > config[:thcount] } if config[:thcount]
    procs.reject! { |p| etime_to_esec(p[:etime]) >= config[:esec_under] } if config[:esec_under]
    procs.reject! { |p| etime_to_esec(p[:etime]) <= config[:esec_over] } if config[:esec_over]
    procs.reject! { |p| cputime_to_csec(p[:time]) >= config[:cpu_under] } if config[:cpu_under]
    procs.reject! { |p| cputime_to_csec(p[:time]) <= config[:cpu_over] } if config[:cpu_over]
    procs.select! { |p| config[:state].include?(p[:state]) } if config[:state]
    procs.select! { |p| config[:user].include?(p[:user]) } if config[:user]
    procs
  end

  # Build the status text: the number of matches followed by one fragment
  # for every filter that was configured.
  #
  # @param found [Integer] the number of matching processes
  # @return [String] the status message
  def summary_message(found)
    msg = "Found #{found} matching processes"
    msg << "; cmd /#{config[:cmd_pat]}/" if config[:cmd_pat]
    msg << "; state #{config[:state].join(',')}" if config[:state]
    msg << "; user #{config[:user].join(',')}" if config[:user]
    msg << "; vsz > #{config[:vsz]}" if config[:vsz]
    msg << "; rss > #{config[:rss]}" if config[:rss]
    msg << "; cpu > #{config[:cpu_utilization]}" if config[:cpu_utilization]
    msg << "; thcount > #{config[:thcount]}" if config[:thcount]
    msg << "; esec < #{config[:esec_under]}" if config[:esec_under]
    msg << "; esec > #{config[:esec_over]}" if config[:esec_over]
    msg << "; csec < #{config[:cpu_under]}" if config[:cpu_under]
    msg << "; csec > #{config[:cpu_over]}" if config[:cpu_over]
    msg << "; pid #{config[:file_pid]}" if config[:file_pid]
    msg
  end
end