sammcj/nagios

View on GitHub
check_postgres_replication.sh

Summary

Maintainability
Test Coverage
#!/bin/bash
#
# Usage:
# ./check_postgres_replication.sh int-docker-pg-vip int-docker-pg-standby mycoolapp
#
# Warning and Critical values are the size of wal files e.g.
# 83886080 = 5 * 16MB, (default) size of 5 WAL files
# 16777216 = 16 MB, (default) size of 1 WAL file

STATE_OK=0
STATE_WARNING=1
STATE_CRITICAL=2
STATE_UNKNOWN=3

CONFIG_FILE=/usr/local/etc/vipchange.cfg

function usage () {
    cat <<-EOF

    Usage:
        ${0##*/} [-c --config config_file] [-p --port 5432] [-W --warn 16777216] [-C --crit 83886080] [--help]
            --config ... path to config file, default: $CONFIG_FILE

        Example:
            ${0##*/} --config $CONFIG_FILE -W 16777216 -C 83886080
    EOF
}

GETOPT_PARSED=$(getopt -o hvW:C:p:c: --long help,verbose,config:,port:,warn:,crit: -- "$@" )
if [[ $? != 0 ]] ; then "aborting..." >&2; usage ; exit 1 ; fi

eval set -- "$GETOPT_PARSED"
while true ; do
    case "$1" in
        -h|--help) usage; exit 1 ;;
        -v|--verbose) VERBOSE=1; shift ;;
        -c|--config) CONFIG_FILE=$2 ; shift 2 ;;
        -p|--port) PORT=$2 ; shift 2 ;;
        -W|--warn) WARN=$2 ; shift 2 ;;
        -C|--crit) CRIT=$2 ; shift 2 ;;
        --) shift ; break ;;
        *) echo "Internal error!" ; exit 1 ;;
    esac
done

if [[ -f "$CONFIG_FILE" ]]; then
    . "$CONFIG_FILE"
fi


## Master (p_) and Slave (s_) DB Server Information
p_host=${POSTGRES_ACTIVE_IP:-$1}
p_port=${PORT:-5432}
s_host=${POSTGRES_STANDBY_IP:-$2}
s_port=${PORT:-5432}
database=${3:-postgres}

## Limits
critical_limit=${CRIT:-83886080} # 5 * 16MB, size of 5 WAL files
warning_limit=${WARN:-16777216} # 16 MB, size of 1 WAL file

function bytes() {
    bytes=$1
    if [[ $bytes -gt 2097152 ]]; then
        echo "$(( $bytes / 1048576 )) MiB"
    return
    elif [[ $bytes -gt 2048 ]]; then
        echo "$(( $bytes / 1024 )) KB"
    return
    else
        echo "$bytes B"
        return
    fi
}

# Human-readable format:
critical_limit_M=$( bytes ${critical_limit} )
warning_limit_M=$( bytes ${warning_limit} )

# These 3 values form a bit of a 'race-condition' as they are sampled at different times
# Typically, the replay > slave > master, so we sample them in this order
replay_xlog_loc=$(psql --no-psqlrc -U replicator -h $s_host -p $s_port -A -t -c "SELECT pg_xlog_location_diff(pg_last_xlog_replay_location(), '0/0') AS replay" $database)
slave_xlog_loc=$(psql --no-psqlrc -U replicator  -h $s_host -p $s_port -A -t -c "SELECT pg_xlog_location_diff(pg_last_xlog_receive_location(), '0/0') AS receive" $database)
master_xlog_loc=$(psql --no-psqlrc -U replicator -h $p_host -p $p_port -A -t -c "SELECT pg_xlog_location_diff(pg_current_xlog_location(), '0/0') AS offset" $database)
replay_lag_s=$(psql --no-psqlrc -U replicator -h $s_host -p $s_port -A -t -c "SELECT CASE WHEN pg_last_xlog_receive_location() = pg_last_xlog_replay_location() THEN 0.0 ELSE EXTRACT(EPOCH FROM now() - pg_last_xact_replay_timestamp()) END" $database)

# pg_last_xact_replay_timestamp() is the timestamp of the last transaction replayed - it does not represent a 'lag' at all
# in normal operation, it represents the time since 'something happened' on the secondary
#replay_timediff=$(psql --no-psqlrc -U replicator -h $s_host -p $s_port -A -t -c "SELECT -EXTRACT(EPOCH FROM (pg_last_xact_replay_timestamp() - NOW() ))" $database)

if [[ "$replay_lag_s" = '' ]]; then
    # This is normal if nothing has been replayed yet - set to U for performance data
    replay_lag_s=U
fi

if [[ $master_xlog_loc -eq '' || $slave_xlog_loc -eq '' || $replay_xlog_loc -eq '' ]]; then
    echo "CRITICAL: Stream has no value to compare (is replication configured or connectivity problem?)"
    exit $STATE_CRITICAL
fi

master_replay_lag=$( bc <<< "$master_xlog_loc-$replay_xlog_loc" )
master_slave_lag=$( bc <<< "$master_xlog_loc-$slave_xlog_loc" )
PERFDATA="replay_bytes=${master_xlog_loc}c stream_lag=$master_slave_lag replay_lag=$master_replay_lag lag_s=${replay_lag_s}s"

master_slave_lag_M=$( bytes $( bc <<< "($master_xlog_loc-$slave_xlog_loc)" ) )
master_replay_lag_M=$( bytes $( bc <<< "($master_xlog_loc-$replay_xlog_loc)" ) )

if [[ "$master_slave_lag" -gt "$critical_limit" ]]; then
    MESSAGE="Stream beyond critical limit ($master_slave_lag_M > $critical_limit_M )"
    EXIT_CODE=$STATE_CRITICAL
elif [[ "$master_slave_lag" -gt "$warning_limit" ]]; then
    MESSAGE="Stream beyond warning limit ($master_slave_lag_M > $warning_limit_M )"
    EXIT_CODE=$STATE_WARNING
elif [[ "$master_replay_lag" -gt "$warning_limit" ]]; then
    MESSAGE="Replay beyond warning limit ($master_replay_lag_M > $warning_limit_M )"
    EXIT_CODE=$STATE_WARNING
elif [[ "$master_replay_lag" -gt 0 || "$master_slave_lag" -gt 0 ]]; then
    MESSAGE="Lagging within limits: Stream lag: $master_slave_lag_M, Replay lag: $master_replay_lag_M"
    EXIT_CODE=$STATE_OK
elif [[ $master_xlog_loc -eq $slave_xlog_loc && $master_xlog_loc -eq $replay_xlog_loc && $slave_xlog_loc -eq $replay_xlog_loc ]] ; then
    MESSAGE="No lag, MASTER:$master_xlog_loc Slave:$slave_xlog_loc Replay:$replay_xlog_loc"
    EXIT_CODE=$STATE_OK
else
    # Unreachable under normal conditions
    MESSAGE=" MASTER:$master_xlog_loc Slave:$slave_xlog_loc Replay:$replay_xlog_loc Master-slave lag: $master_slave_lag, Master-replay lag: $master_replay_lag"
    EXIT_CODE=3
fi


case "$EXIT_CODE" in
    0) MESSAGE="OK: $MESSAGE";;
    1) MESSAGE="WARNING: $MESSAGE";;
    2) MESSAGE="CRITICAL: $MESSAGE";;
    *) MESSAGE="UNKNOWN: $MESSAGE";;
esac

if [[ "$replay_lag_s" != U ]]; then
    MESSAGE="$MESSAGE,  Replay lag: ${replay_lag_s}s"
fi
echo "${MESSAGE}|${PERFDATA}"
exit $EXIT_CODE