sensu-plugins/sensu-plugins-postgres

View on GitHub
bin/check-postgres-replication.rb

Summary

Maintainability
A
3 hrs
Test Coverage
#! /usr/bin/env ruby
# frozen_string_literal: false

#
#   check-postgres-replication
#
# DESCRIPTION:
#
#   This plugin checks PostgreSQL replication lag (in MB) between a master
#   host and a slave host, and alerts on warning/critical thresholds
#
# OUTPUT:
#   plain text
#
# PLATFORMS:
#   Linux
#
# DEPENDENCIES:
#   gem: sensu-plugin
#   gem: pg
#
# USAGE:
#   ./check-postgres-replication.rb -m master_host -s slave_host -P port -d db -u db_user -p db_pass -w warn_threshold -c crit_threshold
#
# NOTES:
#
# LICENSE:
#   Released under the same terms as Sensu (the MIT license); see LICENSE
#   for details.
#

require 'sensu-plugins-postgres/pgpass'
require 'sensu-plugins-postgres/pgutil'
require 'sensu-plugin/check/cli'
require 'pg'

# Sensu check that reads the current WAL location from a PostgreSQL master and
# the last WAL location reported by a slave, converts the difference to MB and
# compares it against the configured thresholds.
#
# Exit states:
#   critical - master and slave are the same host, the slave reports no WAL
#              location, or lag >= --critical
#   warning  - lag >= --warning
#   ok       - otherwise
class CheckPostgresReplicationStatus < Sensu::Plugin::Check::CLI
  # Bit shifts that convert the unit suffix printed by `SHOW wal_segment_size`
  # into bytes (e.g. "16MB" => 16 << 20, "1GB" => 1 << 30).
  WAL_SEGMENT_UNIT_SHIFTS = { 'kB' => 10, 'MB' => 20, 'GB' => 30, 'TB' => 40 }.freeze

  option :pgpass,
         description: 'Pgpass file',
         short: '-f FILE',
         long: '--pgpass',
         default: ENV['PGPASSFILE'] || "#{ENV['HOME']}/.pgpass"

  option(:master_host,
         short: '-m',
         long: '--master-host=HOST',
         required: true,
         description: 'PostgreSQL master HOST')

  option(:slave_host,
         short: '-s',
         long: '--slave-host=HOST',
         required: true,
         description: 'PostgreSQL slave HOST')

  option(:port,
         short: '-P',
         long: '--port=PORT',
         description: 'PostgreSQL port')

  option(:database,
         short: '-d',
         long: '--database=NAME',
         description: 'Database NAME')

  option(:user,
         short: '-u',
         long: '--user=USER',
         description: 'Database USER')

  option(:password,
         short: '-p',
         long: '--password=PASSWORD',
         description: 'Database PASSWORD')

  option(:ssl,
         short: '-S',
         long: '--ssl',
         boolean: true,
         description: 'Require SSL')

  option(:warn,
         short: '-w',
         long: '--warning=VALUE',
         description: 'Warning threshold for replication lag (in MB)',
         default: 900,
         # #YELLOW
         proc: lambda { |s| s.to_i }) # rubocop:disable Lambda

  option(:crit,
         short: '-c',
         long: '--critical=VALUE',
         description: 'Critical threshold for replication lag (in MB)',
         default: 1800,
         # #YELLOW
         proc: lambda { |s| s.to_i }) # rubocop:disable Lambda

  option(:timeout,
         short: '-T TIMEOUT',
         long: '--timeout=TIMEOUT',
         default: 2,
         description: 'Connection timeout (seconds)',
         proc: proc(&:to_i))

  include Pgpass
  include PgUtil

  # Entry point of the check: queries both hosts, computes the lag and exits
  # through ok / warning / critical with a descriptive message.
  def run
    ssl_mode = config[:ssl] ? 'require' : 'prefer'

    critical 'Master and slave cannot be the same host' if config[:master_host] == config[:slave_host]

    # Provided by the Pgpass mixin; presumably fills in connection settings
    # from the pgpass file before connecting -- confirm against Pgpass.
    pgpass

    # The master connection is fully closed before the slave one is opened.
    master, m_segbytes = with_connection(config[:master_host], ssl_mode) do |conn|
      [master_wal_location(conn), wal_segment_bytes(conn)]
    end

    slave = with_connection(config[:slave_host], ssl_mode) do |conn|
      slave_wal_location(conn)
    end

    # The slave-side functions return NULL when the server is not in recovery;
    # report that explicitly rather than handing nil to compute_lag.
    unless slave
      critical "Slave #{config[:slave_host]} reported no WAL location (is it running as a standby?)"
    end

    # Computing lag (compute_lag comes from the PgUtil mixin)
    lag = compute_lag(master, slave, m_segbytes)
    lag_in_mb = (lag.to_f / 1024 / 1024).abs

    message = "replication delayed by #{lag_in_mb}MB :: master:#{master} slave:#{slave} m_segbytes:#{m_segbytes}"

    if lag_in_mb >= config[:crit]
      critical message
    elsif lag_in_mb >= config[:warn]
      warning message
    else
      ok message
    end
  end

  private

  # Opens a connection to +host+ using the shared CLI settings, yields it and
  # returns the block's value. The connection is closed even when the block
  # raises (including the SystemExit raised by ok/warning/critical).
  def with_connection(host, ssl_mode)
    conn = PG.connect(host: host,
                      dbname: config[:database],
                      user: config[:user],
                      password: config[:password],
                      port: config[:port],
                      sslmode: ssl_mode,
                      connect_timeout: config[:timeout])
    yield conn
  ensure
    conn.close if conn
  end

  # Current WAL write location of the master, as returned by the server.
  #
  # NOTE(review): the PgUtil helper's true branch selects the older *xlog*
  # function name, which looks inverted relative to the helper's name --
  # confirm against PgUtil before changing anything here.
  def master_wal_location(conn)
    query = if check_vsn_newer_than_postgres9(conn)
              'SELECT pg_current_xlog_location()'
            else
              'SELECT pg_current_wal_lsn()'
            end
    conn.exec(query).getvalue(0, 0)
  end

  # Last WAL location reported by the slave; nil when the query yields NULL.
  #
  # NOTE(review): the xlog branch reads the *receive* location while the wal
  # branch reads the *replay* location -- kept as-is, confirm this is intended.
  def slave_wal_location(conn)
    query = if check_vsn_newer_than_postgres9(conn)
              'SELECT pg_last_xlog_receive_location()'
            else
              'SELECT pg_last_wal_replay_lsn()'
            end
    conn.exec(query).getvalue(0, 0)
  end

  # WAL segment size in bytes, parsed from `SHOW wal_segment_size`.
  # The unit suffix is honoured so that e.g. "1GB" is not read as 1 MB;
  # a value without a recognised suffix is treated as MB, as before.
  def wal_segment_bytes(conn)
    setting = conn.exec('SHOW wal_segment_size').getvalue(0, 0)
    shift = WAL_SEGMENT_UNIT_SHIFTS.fetch(setting[/[kMGT]B/], 20)
    setting.sub(/\D+/, '').to_i << shift
  end
end