Timeout dans le fichier d'alerts_wrapper

This commit is contained in:
William Hirigoyen 2024-02-09 18:26:23 +01:00
parent 6a051b4672
commit c37880626c
5 changed files with 301 additions and 139 deletions

140
nagios-nrpe/files/alerts_switch Normal file → Executable file
View file

@ -1,45 +1,57 @@
#!/bin/bash
# https://forge.evolix.org/projects/evolix-private/repository
#
# You should not alter this file.
# If you need to, create and customize a copy.
# Source:
# https://gitea.evolix.org/evolix/ansible-roles/src/branch/stable/nagios-nrpe
#
# Abort on the first unhandled command failure.
set -e

# Script name and directory. "$0" is quoted so that an installation path
# containing spaces is handled; assignment and 'readonly' are separated so
# that a failing command substitution is not masked by 'readonly'.
PROGNAME=$(basename -- "$0")
readonly PROGNAME
PROGDIR=$(readlink -m -- "$(dirname -- "$0")")
readonly PROGDIR
# All arguments joined into one string (with the default IFS this is what
# assigning "$@" to a scalar produced).
ARGS="$*"
readonly ARGS

# Directory holding the "<name>_alerts_disabled" files, and the log file.
readonly base_dir="/var/lib/monitoringctl"
readonly log_file="/var/log/monitoringctl.log"

# Default disable duration
duration="1h"
# Print the help text on stdout. Reads the global PROGNAME.
usage() {
    cat <<EOF
$PROGNAME disables or enables NRPE alerts wrapped by the script 'alerts_wrapper' in NRPE configuration.
Usage: $PROGNAME enable|disable <NAME>
Usage: $PROGNAME disable [-d|--duration <DURATION>] <NAME>
 $PROGNAME enable <NAME>
 $PROGNAME help
NAME: one of the names given to '--names' option of 'alerts_wrapper'.
DURATION: Duration of alert disabling.
 Can be '1d' for 1 day, '5m' for 5 minutes or more complex expressions like '1w2d10m42s' (if no time unit is provided, hour is assumed)
 Default value: 1h
EOF
}
disable_alerts () {
disabled_file="$1_disabled"
enabled_file="$1_enabled"
if [ -e "${enabled_file}" ]; then
mv "${enabled_file}" "${disabled_file}"
# Convert a human-written duration into seconds and print it on stdout.
# $1: duration; either a sequence of <number><unit> groups with the units
#     w, d, h, m, s (e.g. '1w2d10m42s'), or a bare number, which is
#     multiplied by 3600 (i.e. read as a number of hours).
# Any other value prints an error on stderr, shows the usage and exits 1.
time_in_seconds() {
if echo "${1}" | grep -E -q '^([0-9]+[wdhms])+$'; then
# Rewrite every unit as "* <factor> +", drop the trailing '+', and let
# expr evaluate the resulting sum.
echo "${1}" | sed 's/w/ * 604800 + /g; s/d/ * 86400 + /g; s/h/ * 3600 + /g; s/m/ * 60 + /g; s/s/ + /g; s/+ $//' | xargs expr
elif echo "${1}" | grep -E -q '^([0-9]+$)'; then
echo "${1} * 3600" | xargs expr
else
# NOTE(review): the touch/chmod pair below works on ${disabled_file}, which
# this function never sets; it looks like leftover lines from another
# function — confirm before relying on it.
touch "${disabled_file}"
chmod 0644 "${disabled_file}"
>&2 echo "Invalid duration: '${1}'."
usage
exit 1
fi
}
# Print, one per line, sorted and de-duplicated, the disable names known to
# the NRPE configuration: the built-in name 'all' plus every name given to
# the -n/--name/--names option on a configuration line mentioning
# 'alerts_wrapper'.
# $1 (optional): configuration directory to scan (default: /etc/nagios/).
# NOTE(review): an '-n' belonging to the wrapped check command itself would
# also be picked up; this was already the case before.
get_disable_names() {
    local conf_dir="${1:-/etc/nagios/}"
    {
        echo "all"
        # -h: without it every line is prefixed with "file:", so the
        # comment filter below could never match.
        grep -Rhs -- "alerts_wrapper" "${conf_dir}" \
            | grep -vE '^[[:space:]]*#' \
            | awk '{
                for (i = 1; i <= NF; i++) {
                    if ($i ~ /^(-n|--names?)$/) {
                        # Value given as the next word.
                        if (i < NF) print $(i + 1)
                    } else if ($i ~ /^--names?=./) {
                        # Value attached with "=".
                        value = $i
                        sub(/^--names?=/, "", value)
                        print value
                    }
                }
            }'
    } | tr ',' '\n' | grep -v '^$' | sort -u
}
# Record the disable duration for the selected name and log the action.
# Reads the globals duration_sec (seconds) and disable_file_path (file
# that receives the duration); calls log_disable.
disable_alerts () {
    # The redirection below fails if the state directory does not exist yet.
    mkdir -p -- "$(dirname -- "${disable_file_path}")"
    printf '%s\n' "${duration_sec}" > "${disable_file_path}"
    chmod 0644 "${disable_file_path}"
    log_disable
}
enable_alerts () {
disabled_file="$1_disabled"
enabled_file="$1_enabled"
if [ -e "${disabled_file}" ]; then
mv "${disabled_file}" "${enabled_file}"
else
touch "${enabled_file}"
chmod 0644 "${enabled_file}"
if [ -e "${disable_file_path}" ]; then
rm "${disable_file_path}"
fi
log_enable
}
now () {
@ -47,41 +59,71 @@ now () {
}
# Append a timestamped "alerts disabled" line naming the invoking user
# (logname, or "unknown" when logname fails).
log_disable () {
# NOTE(review): this first line appends to the file named by $1; when the
# function is called without argument the redirection has no target. It
# looks like a leftover of an earlier signature — confirm.
echo "$(now) - alerts_switch: alerts disabled by $(logname || echo unknown)" >> $1
# Same message including the global disable_name, appended to $log_file.
echo "$(now) - alerts_switch: ${disable_name} alerts disabled by $(logname || echo unknown)" >> $log_file
}
# Append a timestamped "alerts enabled" line naming the invoking user
# (logname, or "unknown" when logname fails).
log_enable () {
# NOTE(review): this first line appends to the file named by $1; when the
# function is called without argument the redirection has no target. It
# looks like a leftover of an earlier signature — confirm.
echo "$(now) - alerts_switch: alerts enabled by $(logname || echo unknown)" >> $1
# Same message including the global disable_name, appended to $log_file.
echo "$(now) - alerts_switch: ${disable_name} alerts enabled by $(logname || echo unknown)" >> $log_file
}
main () {
local action=$1
local prefix=$2
disable_file_path="${base_dir}/${disable_name}_alerts_disabled"
local base_dir="/var/lib/misc"
mkdir -p "${base_dir}"
local file_path="${base_dir}/${prefix}_alerts"
local log_file="/var/log/monitoringctl.log"
case "$action" in
enable)
enable_alerts ${file_path}
log_enable ${log_file}
;;
disable)
disable_alerts ${file_path}
log_disable ${log_file}
;;
help)
if [ "${action}" == 'enable' ]; then
enable_alerts
elif [ "${action}" == 'disable' ]; then
duration_sec=$(time_in_seconds "${duration}")
disable_alerts
elif [ "${action}" == 'help' ]; then
usage
;;
*)
>&2 echo "Unknown action '$action'"
usage
exit 1
;;
esac
fi
}
main $ARGS
# Parse the command line:
#   enable|disable|help   -> action
#   -d|--duration VALUE   -> duration
#   any other word        -> NAME, which must be one of the names printed by
#                            get_disable_names and may be given only once.
# Parsing stops when the arguments are exhausted.
while :; do
    case "$1" in
        enable|disable|help)
            action="$1"
            shift;;
        -d|--duration)
            if [ "$#" -gt 1 ]; then
                duration="$2"
            else
                >&2 echo "Missing --duration argument."
                usage
                exit 1
            fi
            shift; shift;;
        *)
            if [ -z "${action}" ]; then
                >&2 echo "Missing action argument."
                usage
                exit 1
            fi
            if [ -z "$1" ]; then
                break
            fi
            # The lookup is done inside the 'if' condition: as a plain
            # statement, a non-matching grep would make 'set -e' abort the
            # script before the error message below could be printed.
            # -F -x compares the whole line literally, so NAME is never
            # interpreted as a regular expression.
            if [ -z "${disable_name}" ] && get_disable_names | grep --quiet -F -x -- "$1"; then
                disable_name="$1"
            else
                >&2 echo "Unknown argument '$1', or NAME not defined in NRPE configuration."
                usage
                exit 1
            fi
            shift;;
    esac
done

# Every action except 'help' needs a NAME.
if [ -z "${disable_name}" ] && [ "${action}" != 'help' ] ; then
    >&2 echo "Missing NAME argument."
    usage
    exit 1
fi

main

132
nagios-nrpe/files/alerts_wrapper Normal file → Executable file
View file

@ -1,12 +1,16 @@
#!/bin/bash
# https://forge.evolix.org/projects/evolix-private/repository
#
# You should not alter this file.
# If you need to, create and customize a copy.
# Source:
# https://gitea.evolix.org/evolix/ansible-roles/src/branch/stable/nagios-nrpe
#
# Script version.
readonly VERSION="21.04"

# Directory holding the "<name>_alerts_disabled" files.
readonly base_dir="/var/lib/monitoringctl"

# Default maximum allowed disable time
readonly disable_max_time_default="1d"
# base functions
@ -27,17 +31,20 @@ show_help() {
cat <<END
alerts_wrapper wraps an NRPE command and overrides the return code.
Usage: alerts_wrapper [--limit 1d] --names check_name[,other_disable_names,...] <check command with optional arguments>
Usage: alerts_wrapper [--maximum 1d] [--names check,other_disable_name,...]] <check command with optional arguments>
Usage: alerts_wrapper disable_name <check command with optional arguments>
Options
--limit max age of the "check file" ;
can be "1d" for 1 day, "5m" for 5 minutes…
or more complex expressions like "1w2d10m42s"
--names disable name (shoud contain at least the check name)
--name (deprecated) disable name (kept for backward compatibility)
-h, --help print this message and exit
-V, --version print version and exit
-m|--max|--maximum Max age of the disable file.
Can be "1d" for 1 day, "5m" for 5 minutes…
or more complex expressions like "1w2d10m42s"
If no time unit is provided, hour is assumed.
--limit (deprecated) Same as above (kept for backward compatibility).
-n|--names Disable name (recommanded: set at least the check name).
Special name: 'all' is already defined (and cannot be removed).
--name (deprecated) Disable name (kept for backward compatibility).
-h, --help Print this message and exit.
-V, --version Print version and exit.
END
}
@ -51,13 +58,20 @@ time_in_seconds() {
fi
}
delay_from_alerts_disabled_file() {
# Print on stdout the number of seconds left before the disable file $1
# expires: the file's change time (stat %Z) minus "now - disable duration".
# The disable duration is read from the file and capped to the maximum
# given by disable_max_time (falling back to disable_max_time_default when
# that value cannot be converted).
delay_from_disable_file() {
# $1: disabled file
last_change=$(stat -c %Z "$1")
# NOTE(review): the next three lines use wrapper_limit / wrapper_limit_default
# and print a first value; they look like leftovers of the previous
# implementation — confirm, since the caller captures this function's stdout.
limit_seconds=$(time_in_seconds "${wrapper_limit}" || time_in_seconds "${wrapper_limit_default}")
limit_date=$(date --date "${limit_seconds} seconds ago" +"%s")
echo $(( last_change - limit_date ))
# First whitespace-separated field of the first non-comment line that
# contains a digit.
disable_secs=$(grep -v -E "^\s*#" "${1}" | grep -E "[0-9]+" | head -n1 | awk '{print$1}')
disable_max_secs=$(time_in_seconds "${disable_max_time}" || time_in_seconds "${disable_max_time_default}")
# NOTE(review): if the file holds no number, disable_secs is empty and the
# numeric comparison below gets an empty operand — verify how such files
# should be handled.
if [ "${disable_secs}" -gt "${disable_max_secs}" ]; then
disable_secs="${disable_max_secs}"
fi
disable_date=$(date --date "${disable_secs} seconds ago" +"%s")
echo $(( last_change - disable_date ))
}
enable_checks() {
@ -70,30 +84,52 @@ enable_checks() {
}
main() {
${check_command} > "${check_stdout}"
check_stdout=$(timeout 9 ${check_command})
check_rc=$?
readonly check_rc
if [ "${check_rc}" -eq 124 ] && [ -z "${check_stdout}" ]; then
check_stdout="Check timeout (9 sec)"
fi
delay=0
disabled="False"
for disable_name in ${disable_names}; do
alerts_disabled_file="/var/lib/misc/${disable_name}_alerts_disabled"
if [ -e "${alerts_disabled_file}" ]; then
delay=$(delay_from_alerts_disabled_file "${alerts_disabled_file}")
disable_file="${base_dir}/${disable_name}_alerts_disabled"
if [ -e "${disable_file}" ]; then
delay=$(delay_from_disable_file "${disable_file}")
if [ "${delay}" -le "0" ]; then
enable_checks "${disable_name}"
fi
fi
if [ -e "${alerts_disabled_file}" ]; then
formatted_last_change=$(date --date "@$(stat -c %Z "${alerts_disabled_file}")" +'%c')
readonly formatted_last_change
if [ -e "${disable_file}" ]; then
disabled="True"
formatted_last_change=$(date --date "@$(stat -c %Z "${disable_file}")" +'%c')
echo "ALERTS DISABLED for ${disable_names} (since ${formatted_last_change}, delay: ${delay} sec) - $(cat "${check_stdout}")"
# Split the remaining delay (in seconds) into days, hours, minutes and
# seconds; a component equal to zero is left out of the message.
delay_days="$(( delay /86400 ))"
if [ "${delay_days}" -eq 0 ]; then delay_days=""
else delay_days="${delay_days}d "; fi
delay_hours="$(( (delay %86400) /3600 ))"
if [ "${delay_hours}" -eq 0 ]; then delay_hours=""
else delay_hours="${delay_hours}h "; fi
delay_minutes="$(( ((delay %86400) %3600) /60 ))"
if [ "${delay_minutes}" -eq 0 ]; then delay_minutes=""
else delay_minutes="${delay_minutes}m "; fi
delay_seconds="$(( ((delay %86400) %3600) %60 ))"
if [ "${delay_seconds}" -eq 0 ]; then delay_seconds=""
# The unit suffix goes on the seconds component itself (it used to be
# appended to delay_days, leaving the seconds without unit).
else delay_seconds="${delay_seconds}s "; fi
delay_text="${delay_days}${delay_hours}${delay_minutes}${delay_seconds}"
# "% " drops the separator left after the last component.
echo "ALERTS DISABLED for ${disable_names} (since ${formatted_last_change}, delay: ${delay_text% })."
fi
done
echo "${check_stdout}"
if [ "${disabled}" == "True" ]; then
if [ ${check_rc} = 0 ]; then
# Nagios OK
exit 0
@ -102,14 +138,10 @@ main() {
exit 1
fi
else
cat "${check_stdout}"
exit ${check_rc}
fi
}
# Default: 1 day before re-enabling the check
wrapper_limit_default="1d"
readonly wrapper_limit_default
if [[ "${1}" =~ -.* ]]; then
# parse options
@ -125,27 +157,27 @@ if [[ "${1}" =~ -.* ]]; then
exit 0
;;
--limit)
-m|--max|--maximum|--limit)
# with value separated by space
if [ -n "$2" ]; then
wrapper_limit=$2
disable_max_time=$2
shift
else
printf 'ERROR: "--limit" requires a non-empty option argument.\n' >&2
printf 'ERROR: "--maximum" requires a non-empty option argument.\n' >&2
exit 1
fi
;;
--limit=?*)
-m|--max|--maximum|--limit=?*)
# with value speparated by =
wrapper_limit=${1#*=}
disable_max_time=${1#*=}
;;
--limit=)
-m|--max|--maximum|--limit=)
# without value
printf 'ERROR: "--limit" requires a non-empty option argument.\n' >&2
printf 'ERROR: "--maximum" requires a non-empty option argument.\n' >&2
exit 1
;;
--name|--names)
-n|--name|--names)
# with value separated by space
if [ -n "$2" ]; then
disable_names=$2
@ -155,11 +187,11 @@ if [[ "${1}" =~ -.* ]]; then
exit 1
fi
;;
--name=?*|--names=?*)
-n|--name=?*|--names=?*)
# with value speparated by =
disable_names=${1#*=}
;;
--name=|--names=)
-n|--name=|--names=)
# without value
printf 'ERROR: "--name" requires a non-empty option argument.\n' >&2
exit 1
@ -194,13 +226,13 @@ else
fi
# Default values or errors
if [ -z "${wrapper_limit}" ]; then
wrapper_limit="${wrapper_limit_default}"
fi
if [ -z "${disable_names}" ]; then
printf 'ERROR: You must specify a check name, with --names.\n' >&2
exit 1
if [ -z "${disable_max_time}" ]; then
disable_max_time="${disable_max_time_default}"
fi
#if [ -z "${disable_names}" ]; then
# printf 'ERROR: You must specify a check name, with --names.\n' >&2
# exit 1
#fi
if [ -z "${check_command}" ]; then
printf 'ERROR: You must specify a command to execute.\n' >&2
exit 1
@ -209,12 +241,6 @@ fi
disable_names="all $(echo "${disable_names}" | tr ',' ' ')"
readonly disable_names
readonly check_command
readonly wrapper_limit
check_stdout=$(mktemp --tmpdir=/tmp "${disable_names}_stdout.XXXX")
readonly check_stdout
# shellcheck disable=SC2064
trap "rm ${check_stdout}" EXIT
readonly disable_max_time
main

5
nagios-nrpe/files/check-local_completion Normal file → Executable file
View file

@ -1,5 +1,6 @@
#!/usr/bin/env bash
# List of available checks
_check_local_dynamic_completion() {
local cur;
cur=${COMP_WORDS[COMP_CWORD]};
@ -7,6 +8,10 @@ _check_local_dynamic_completion() {
COMPREPLY=( $( compgen -W '$(grep "\[check_" -Rs /etc/nagios/ | grep -vE "^[[:blank:]]*#" | awk -F"[\\\[\\\]=]" "{print \$2}" | sed "s/check_//" | sort | uniq)' -- $cur ) );
}
# List of available disable names of alerts_wrapper
# grep "alerts_wrapper" -Rs /etc/nagios/ | grep -vE "^\s*#" | awk '{ for (i=1; i<=NF; i++) { if ($i ~ /--name[s]?/) print $(i+1) } }' | tr ',' '\n' | sort | uniq
# TODO: also add the special name 'all' to the list
complete -F _check_local_dynamic_completion check-local

0
nagios-nrpe/files/check_async Normal file → Executable file
View file

View file

@ -2,8 +2,9 @@
#set -x
log_path="/var/log/monitoringctl.log"
conf_path="/etc/nagios/nrpe.cfg"
readonly base_dir="/var/lib/monitoringctl"
readonly log_path="/var/log/monitoringctl.log"
readonly conf_path="/etc/nagios/nrpe.cfg"
function show_help {
cat <<EOF
@ -156,10 +157,10 @@ function check {
exit 1
fi
server_address=$(echo "$conf_lines" | grep "server_address" | cut -d'=' -f2)
server_address=$(echo "$conf_lines" | grep "server_address" | cut -d'=' -f2)
if [ -z "${server_address}" ]; then server_address="127.0.0.1"; fi
server_port=$(echo "$conf_lines" | grep "server_port" | cut -d'=' -f2)
server_port=$(echo "$conf_lines" | grep "server_port" | cut -d'=' -f2)
if [ -z "${server_port}" ]; then server_port="5666"; fi
check_commands=$(get_check_commands "$1")
@ -224,17 +225,17 @@ function filter_duration {
# Check that NRPE commands are wrapped by alerts_wrapper script.
# For every check printed by get_checks_list, the last line printed by
# get_check_commands is searched for "alerts_wrapper"; when it is absent a
# warning and the command line are written to stderr. Nothing is returned
# to the caller besides these messages.
function is_nrpe_wrapped {
for check in $(get_checks_list); do
# NOTE(review): the lookup is performed twice, once into 'command' and once
# into 'cmd', and the command line is printed twice below; one of each pair
# looks like a leftover — confirm which variable name is the intended one.
command=$(get_check_commands "${check}" | tail -n1)
echo "${command}" | grep --quiet --no-messages alerts_wrapper
cmd=$(get_check_commands "${check}" | tail -n1)
echo "${cmd}" | grep --quiet --no-messages alerts_wrapper
# Exit status of the grep just above: non-zero means no wrapper was found.
rc=$?
if [ "${rc}" -ne 0 ]; then
>&2 echo "Warning: check '${check}' has no alerts_wrapper, it will not be disabled:"
>&2 echo " ${command}"
>&2 echo " ${cmd}"
fi
done
}
function disable-alerts {
function disable_alerts {
# $1: comment
if ! command -v alerts_switch &> /dev/null; then
@ -243,10 +244,12 @@ function disable-alerts {
exit 1
fi
# Are alerts already disabled ?
if [ -f /var/lib/misc/all_alerts_disabled ]; then
fi
#TODO Are alerts already disabled ?
# -> poor indicator: the file's presence alone ignores the timeout stored inside it and the maximum allowed by the alerts_wrapper command
#if [ -f "${base_dir}/all_alerts_disabled" ]; then
# echo "All alerts are already disabled."
# alerts-status
#fi
default_msg="."
if [ "${default_duration}" = "True" ]; then
@ -284,13 +287,9 @@ EOF
#done
log "Executing 'alerts_switch disable all'"
alerts_switch disable all
alerts_switch disable all --duration "${duration}"
#TODO remove previous units if any
#TODO systemd-run --quiet --unit="" --on-calendar="" -- monitoringctl enable-alerts "[AUTO] $1"
echo "Alerts are now disabled for ${duration}."
echo "All alerts are now disabled for ${duration}."
}
function enable-alerts {
@ -299,18 +298,101 @@ function enable-alerts {
log "Action enable-alerts requested by user $(logname || echo unknown): '${1}'"
log "Executing 'alerts_switch enable all'"
alerts_switch enable all
echo "Alerts are now re-enabled (stub)."
#TODO ou: echo "Alerts were already enabled."
echo "All alerts are now enabled."
}
### ALERTS-STATUS ACTION ##########################
function alerts-status {
# TODO
true
# Converts a human-written duration into seconds, printed on stdout.
# $1: duration; either a sequence of <number><unit> groups with the units
#     w (week), d (day), h (hour), m (minute), s (second), e.g.
#     '1w2d10m42s', or a bare number, which is read as hours.
# Returns 0 and prints the number of seconds when $1 is valid (including a
# result of zero), returns 1 and prints nothing otherwise.
function duration_to_seconds {
    local input="$1"
    local total=0 number unit
    local bare_re='^[0-9]+$'
    local full_re='^([0-9]+[wdhms])+$'
    local group_re='^([0-9]+)([wdhms])(.*)$'

    if [[ "${input}" =~ ${bare_re} ]]; then
        # "10#" forces base 10, so that a leading zero is not read as octal.
        echo "$(( 10#${input} * 3600 ))"
        return 0
    fi
    if ! [[ "${input}" =~ ${full_re} ]]; then
        return 1
    fi
    # Consume one <number><unit> group per iteration.
    while [[ "${input}" =~ ${group_re} ]]; do
        number="${BASH_REMATCH[1]}"
        unit="${BASH_REMATCH[2]}"
        input="${BASH_REMATCH[3]}"
        case "${unit}" in
            w) total=$(( total + 10#${number} * 604800 )) ;;
            d) total=$(( total + 10#${number} * 86400 )) ;;
            h) total=$(( total + 10#${number} * 3600 )) ;;
            m) total=$(( total + 10#${number} * 60 )) ;;
            s) total=$(( total + 10#${number} )) ;;
        esac
    done
    echo "${total}"
    return 0
}
# Converts seconds into a human readable duration, printed on stdout.
# $1: integer (seconds)
# The output lists the non-zero components among days, hours, minutes and
# seconds, separated by single spaces (e.g. "1d 2h 5s"); it is an empty
# line when $1 is 0. No caller variable is modified.
function seconds_to_duration {
    local delay="$1"
    local days hours minutes seconds
    local -a parts=()
    # Components are joined with the first character of IFS.
    local IFS=' '

    days=$(( delay / 86400 ))
    hours=$(( (delay % 86400) / 3600 ))
    minutes=$(( (delay % 3600) / 60 ))
    seconds=$(( delay % 60 ))

    if [ "${days}" -ne 0 ]; then parts+=("${days}d"); fi
    if [ "${hours}" -ne 0 ]; then parts+=("${hours}h"); fi
    if [ "${minutes}" -ne 0 ]; then parts+=("${minutes}m"); fi
    if [ "${seconds}" -ne 0 ]; then parts+=("${seconds}s"); fi

    echo "${parts[*]}"
}
# Get from NRPE / alerts_wrapper options the maximum duration of disable.
# If different values are found, the lowest is kept.
# For each check printed by get_checks_list, the last line printed by
# get_check_commands is scanned for -m/--max/--maximum/--limit, given either
# as "OPTION VALUE" or "OPTION=VALUE"; the first occurrence on the line is
# used. Values that duration_to_seconds cannot convert are ignored.
# Prints the retained duration as written in the configuration, or "1d"
# when no usable value is found.
function get_max_disable_duration {
    local check cmd max_duration max_sec
    local min_of_max_duration=""
    local min_of_max_sec=""

    for check in $(get_checks_list); do
        cmd=$(get_check_commands "${check}" | tail -n1)
        # awk fields never contain a space, so the option word must be
        # matched on its own and the value read from the next field, or
        # from what follows "=" in the same field.
        max_duration=$(echo "${cmd}" | awk '{
            for (i = 1; i <= NF; i++) {
                if ($i ~ /^(-m|--max|--maximum|--limit)$/) {
                    if (i < NF) { print $(i + 1); exit }
                } else if ($i ~ /^(--max|--maximum|--limit)=./) {
                    value = $i
                    sub(/^[^=]*=/, "", value)
                    print value
                    exit
                }
            }
        }')
        if [ -z "${max_duration}" ]; then
            continue
        fi
        max_sec=$(duration_to_seconds "${max_duration}") || true
        # An unconvertible value would otherwise reach the numeric
        # comparison below as an empty operand.
        if [ -z "${max_sec}" ]; then
            continue
        fi
        if [ -z "${min_of_max_sec}" ] || [ "${max_sec}" -lt "${min_of_max_sec}" ]; then
            min_of_max_sec="${max_sec}"
            min_of_max_duration="${max_duration}"
        fi
    done
    echo "${min_of_max_duration:-"1d"}" # 1d is alerts_wrapper default --max
}
# Print the number of seconds during which all alerts remain disabled.
# Reads "${base_dir}/all_alerts_disabled": its first non-comment line whose
# first field is an integer gives the disable duration in seconds, counted
# from the file's change time (stat %Z) and capped to the value derived
# from get_max_disable_duration.
# Prints 0 when the file does not exist or when the duration has elapsed.
function disabled_secs_left {
    local disabled_file="${base_dir}/all_alerts_disabled"
    local max_disable_duration max_disable_secs disable_secs
    local last_change now secs_left

    if [ ! -e "${disabled_file}" ]; then
        echo 0
        return
    fi

    max_disable_duration="$(get_max_disable_duration)"
    max_disable_secs="$(duration_to_seconds "${max_disable_duration}")" || true
    # 86400 s = 1d, the default returned by get_max_disable_duration.
    max_disable_secs="${max_disable_secs:-86400}"

    disable_secs="$(awk '!/^[[:space:]]*#/ && $1 ~ /^[0-9]+$/ { print $1; exit }' "${disabled_file}")"
    # NOTE(review): a file without any duration is assumed to be disabled
    # for the maximum allowed time — confirm this is the wanted behaviour.
    if [ -z "${disable_secs}" ] || [ "${disable_secs}" -gt "${max_disable_secs}" ]; then
        disable_secs="${max_disable_secs}"
    fi

    last_change=$(stat -c %Z "${disabled_file}")
    now=$(date +"%s")
    secs_left=$(( last_change + disable_secs - now ))
    # An expired file must not be reported as a negative remaining time.
    if [ "${secs_left}" -lt 0 ]; then
        secs_left=0
    fi
    echo "${secs_left}"
}
# Print whether all alerts are currently enabled or, when they are
# disabled, for how long they remain so and when they will be re-enabled.
# Uses disabled_secs_left for the remaining time and seconds_to_duration to
# format it.
function alerts_status {
    local secs_left duration_left enable_date

    secs_left=$(disabled_secs_left)
    # Zero, negative (already expired) or missing: nothing is disabled.
    if [ -z "${secs_left}" ] || [ "${secs_left}" -le 0 ]; then
        echo "All alerts are enabled."
        return
    fi

    duration_left="$(seconds_to_duration "${secs_left}")"
    enable_date=$(date --date "+${secs_left} seconds" "+%d %h %Y at %H:%M:%S")
    echo "All alerts are still disabled for ${duration_left}."
    echo "They will be re-enabled the ${enable_date}."
}
@ -328,13 +410,16 @@ if [ "$#" = "0" ]; then
exit 1
fi
debian_major_version=$(cut -d "." -f 1 < /etc/debian_version)
conf_lines=$(get_conf_from_file "${conf_path}")
# Default arguments and options
action=""
comment=""
verbose="False"
duration="1h"
default_duration="True"
bypass_nrpe="False"
default_duration="True"
# Parse arguments and options
while :; do
@ -363,31 +448,35 @@ while :; do
action="$1"
shift;;
*)
break;;
if [ "${action}" = "check" ] && [ -n "$1" ]; then
if get_checks_list | grep --quiet -E "^$1$"; then
check_name=$1
shift
else
usage_error "Action check: unknown argument '$1'."
fi
else
# Other arguments are the comment
break
fi
;;
esac
done
debian_major_version=$(cut -d "." -f 1 < /etc/debian_version)
conf_lines=$(get_conf_from_file "${conf_path}")
if [ -z "${action}" ]; then
usage_error "Missing or invalid ACTION argument."
fi
if [ "${action}" = "check" ]; then
if [ "$#" = 0 ]; then
usage_error "Action check: missing CHECK_NAME argument."
fi
if [ "$#" -gt 1 ]; then
if [ "$#" -gt 0 ]; then
usage_error "Action check: too many arguments."
fi
if [ "${default_duration}" = "False" ]; then
usage_error "Action check: there is no --duration option."
fi
check_name="$1"
check "$check_name"
elif [ "${action}" = "enable-alerts" ]; then
@ -415,13 +504,13 @@ elif [ "${action}" = "disable-alerts" ]; then
is_nrpe_wrapped
comment="$1"
disable-alerts "${comment}"
disable_alerts "${comment}"
elif [ "${action}" = "alerts-status" ]; then
if [ "$#" -gt 0 ]; then
usage_error "Action alerts-status: too many arguments."
fi
alerts-status
alerts_status
fi