#!/usr/bin/env bash
set -euo pipefail

# This check_hpraid is a fork from check_cciss v0.15 written by Simone Rosa.
# Upstream now at:
# https://gitea.evolix.org/evolix/ansible-roles/src/branch/stable/nagios-nrpe/files/plugins
# Source of the fork:
# https://exchange.nagios.org/directory/Plugins/Hardware/Storage-Systems/RAID-Controllers/check_cciss--2D-HP-and-Compaq-Smart-Array-Hardware-status/details
#
# Description:
#
# This plugin checks hardware status for Smart Array Controllers,
# using HPE Smart Storage Administrator. It should support Debian 9 and over.
# (Array, controller, cache, battery, etc...)
#
# NOTE:
#
# You need to install the proprietary tool HPE Smart Storage Administrator (ssacli) from:
# https://downloads.linux.hpe.com/SDR/repo/mcp
# Also NRPE needs to launch ssacli as root.
#
# Please add this line to /etc/sudoers :
# --------------------------------------------------
# nagios ALL=NOPASSWD: /usr/sbin/ssacli
#
# Examples:
#
# ./check_hpraid
# ----------------
# RAID OK
#
# ./check_hpraid -v
# -------------------
# RAID OK: Smart Array 6i in Slot 0 array A logicaldrive 1 (67.8 GB, RAID 1+0, OK)
# [Controller Status: OK Cache Status: OK Battery Status: OK]
#
# RAID CRITICAL - HP Smart Array Failed: Smart Array 6i in Slot 0 (Embedded) \
# array A logicaldrive 1 (33.9 GB, RAID 1, Interim Recovery Mode) \
# physicaldrive 1:0 (port 1:id 0 , Parallel SCSI, --- GB, Failed)
#
# RAID WARNING - HP Smart Array Rebuilding: Smart Array 6i in Slot 0 (Embedded) \
# array A logicaldrive 1 (33.9 GB, RAID 1, Recovering, 26% complete) \
# physicaldrive 1:0 (port 1:id 0 , Parallel SCSI, 36.4 GB, Rebuilding)
#
# ./check_hpraid -v -p
# --------------------
# RAID OK: Smart Array 6i in Slot 0 (Embedded) array A logicaldrive 1 (33.9 GB, RAID 1, OK)
# physicaldrive 2:0 (port 2:id 0 , Parallel SCSI, 36.4 GB, OK)
# physicaldrive 2:1 (port 2:id 1 , Parallel SCSI, 36.4 GB, OK)
# physicaldrive 1:5 (port 1:id 5 , Parallel SCSI, 72.8 GB, OK, spare)
# [Controller Status: OK Cache Status: OK Battery/Capacitor Status: OK]
#
# RAID CRITICAL - HP Smart Array Failed: Smart Array 6i in Slot 0 (Embedded) \
# array A logicaldrive 1 (33.9 GB, RAID 1, Interim Recovery Mode) \
# physicaldrive 1:0 (port 1:id 0 , Parallel SCSI, --- GB, Failed) \
# physicaldrive 1:1 (port 1:id 1 , Parallel SCSI, 36.4 GB, OK)
#
# RAID WARNING - HP Smart Array Rebuilding: Smart Array 6i in Slot 0 (Embedded) \
# array A logicaldrive 1 (33.9 GB, RAID 1, Recovering, 26% complete) \
# physicaldrive 1:0 (port 1:id 0 , Parallel SCSI, 36.4 GB, Rebuilding) \
# physicaldrive 1:1 (port 1:id 1 , Parallel SCSI, 36.4 GB, OK)
#
# ./check_hpraid -v -b
# ----------------
#
# RAID OK: Smart Array 6i in Slot 0 (Embedded) array A logicaldrive 1 (33.9 GB, RAID 1, OK) [Controller Status: OK]
#
# [instead of]
# RAID CRITICAL - HP Smart Array Failed: Smart Array 6i in Slot 0 (Embedded) \
# Controller Status: OK Cache Status: Temporarily Disabled \
# Battery/Capacitor Status: Failed (Replace Batteries/Capacitors)

PROGNAME=$(basename "$0")
NAGIOS_PLUGINS="/usr/lib/nagios/plugins"
REVISION='0.16-evolix'
DEBUG="0"
VERBOSE="0"
PHYSICAL_DRIVE=0
# These keep their value when already present in the environment.
EXCLUDE_SLOT=${EXCLUDE_SLOT:-0}
EXCLUDE_BATTERY=${EXCLUDE_BATTERY:-0}
excludeslot=${excludeslot:-}
# Under "set -e" a bare $(command -v ...) would abort silently when the tool
# is missing; keep going so the dedicated check below can report it.
ssacli=$(command -v ssacli || true)

# utils.sh provides the STATE_* exit codes, print_revision and support.
if [[ ! -r "${NAGIOS_PLUGINS}/utils.sh" ]]; then
    echo "RAID UNKNOWN - ${NAGIOS_PLUGINS}/utils.sh not found"
    exit 3
fi
# shellcheck source=/dev/null
. "${NAGIOS_PLUGINS}/utils.sh"

# Print the command line synopsis and the list of options on stdout.
print_usage() {
    echo ""
    echo "Usage: $PROGNAME [-v] [-p] [-e <slot>] [-b] [-d]"
    echo "Usage: $PROGNAME [-h]"
    echo "Usage: $PROGNAME [-V]"
    echo ""
    echo " -v = show status and informations about RAID"
    echo " -p = show detail for physical drives"
    echo " -e <slot> = exclude slot number"
    echo " -b = exclude battery/capacitor/cache status check"
    echo " -d = use for debug (command line mode)"
    echo " -h = help information"
    echo " -V = version information"
    echo ""
    echo " ============="
}

# Print revision, usage and a short description, then exit 0.
print_help() {
    print_revision "$PROGNAME" "$REVISION"
    echo ""
    print_usage
    echo ""
    echo "This plugin checks hardware status for Smart Array Controllers,"
    echo "using HPE Smart Storage Administrator."
    echo ""
    support
    exit 0
}

# Print a debug section when -d was given.
# Arguments: $1 - section title, $2 - optional payload printed below the title
debug() {
    [[ "$DEBUG" == "1" ]] || return 0
    printf '### %s >>>\n' "$1"
    if (( $# > 1 )); then
        printf '%s\n\n' "$2"
    fi
}

# Collapse every run of whitespace (including newlines) in $1 into a single
# space and trim both ends, so multi-line tool output fits on the single
# status line the plugin prints. Unlike an unquoted "echo $var", no pathname
# expansion is applied to the text.
flatten() {
    local -a words=()
    # read returns non-zero at end of input when no NUL delimiter is found.
    read -r -d '' -a words <<<"$1" || true
    printf '%s' "${words[*]-}"
}

# Run ssacli as root with the given arguments, stderr merged into stdout.
run_ssacli() {
    sudo -u root "$ssacli" "$@" 2>&1
}

# Report that ssacli failed and exit UNKNOWN.
# Arguments: $1 - output captured from the failed command
ssacli_failed() {
    echo "RAID UNKNOWN - $ssacli did not execute properly : $(flatten "$1")"
    exit "$STATE_UNKNOWN"
}

while getopts "N:cvpbsde:Vh" options; do
    case "$options" in
        # Accepted but ignored; -s is listed in the option string and had no
        # handler, which made it print the usage and exit 0.
        N | c | s) ;;
        v) VERBOSE=1 ;;
        p) PHYSICAL_DRIVE=1 ;;
        d) DEBUG=1 ;;
        e)
            EXCLUDE_SLOT=1
            excludeslot="$OPTARG"
            ;;
        b) EXCLUDE_BATTERY=1 ;;
        V)
            print_revision "$PROGNAME" "$REVISION"
            exit 0
            ;;
        h)
            print_help
            exit 0
            ;;
        *)
            # Invalid usage must not be reported as OK to the monitoring.
            print_usage
            exit "$STATE_UNKNOWN"
            ;;
    esac
done

# Use smartpqi driver
# https://manpages.debian.org/buster/manpages/smartpqi.4.en.html
DRIVER="/sys/bus/pci/drivers/smartpqi"
if [[ -d "$DRIVER" ]]; then
    driverPresent='YES.'
else
    driverPresent='No!'
fi
debug "Check if \"HP Smart Array\" ($DRIVER) is present" "$driverPresent"
if [[ "$driverPresent" == "No!" ]]; then
    echo "RAID UNKNOWN - HP Smart Array not found"
    exit "$STATE_UNKNOWN"
fi

# Check if "HP Array Utility CLI" is present
debug "Check if \"ssacli\" is present"
if [[ -z "$ssacli" || ! -x "$ssacli" ]]; then
    echo "ERROR: ssacli tools should be installed and with right sudoers/permissions (see the notes above)"
    exit "$STATE_UNKNOWN"
fi
debug "\"ssacli\" is present"

# Check if "HP Controller" work correctly
# The "|| status=$?" form keeps "set -e" from aborting before we can report.
status=0
check=$(run_ssacli controller all show status) || status=$?
debug "Check if \"HP Controller\" work correctly" "$check"
if (( status != 0 )); then
    ssacli_failed "$check"
fi

# Get "Slot" & exclude slot needed
# "|| true": with pipefail a non-matching grep would otherwise end the script.
slots=$(grep -Eo 'Slot [[:alnum:]_]+' <<<"$check" | awk '{print $NF}' || true)
if [[ "$EXCLUDE_SLOT" == "1" ]]; then
    # Whole-line fixed-string match so that excluding slot 1 keeps slot 11.
    slots=$(grep -vxF -- "$excludeslot" <<<"$slots" || true)
fi
debug "Get \"Slot\" & exclude slot not needed" "$slots"

# Accumulated logical/physical drive report for every inspected slot.
check2=""
for slot in $slots; do
    # Get "logicaldrive" for slot
    status=0
    logical=$(run_ssacli controller "slot=$slot" logicaldrive all show) || status=$?
    if (( status != 0 )); then
        ssacli_failed "$logical"
    fi
    check2+="${logical}"$'\n'
    debug "Get \"logicaldrive\" for slot" "$logical"

    # Get "physicaldrive" for slot
    status=0
    physical=$(run_ssacli controller "slot=$slot" physicaldrive all show) || status=$?
    if (( status != 0 )); then
        ssacli_failed "$physical"
    fi
    # Keep only the drive lines, with "?" placeholders turned into "-".
    physical=$(sed -e 's/[?]/-/g' <<<"$physical" | grep 'physicaldrive' || true)
    if [[ "$PHYSICAL_DRIVE" != "1" && "$DEBUG" != "1" ]]; then
        # Without -p/-d only drives in a problem state are reported.
        physical=$(grep -E 'Failure|Failed|Rebuilding' <<<"$physical" || true)
    fi
    check2+="${physical}"$'\n'
    debug "Get \"physicaldrive\" for slot" "$physical"
done

# Check STATUS
debug "Check STATUS"

# Omit battery/capacitor/cache status check if requested
if [[ "$EXCLUDE_BATTERY" == "1" ]]; then
    check=$(grep -vF \
        -e 'Battery/Capacitor Status: Failed (Replace Batteries/Capacitors)' \
        -e 'Cache Status: Temporarily Disabled' <<<"$check" || true)
fi

controller_report=$(flatten "$check")
drive_report=$(flatten "$check2")

# First match wins. Any "Disabled" in the controller status is CRITICAL, so a
# separate WARNING for "Cache Status: Temporarily Disabled" could never be
# reached and is not listed here.
if grep -q 'Failed' <<<"$check"; then
    echo "RAID CRITICAL - HP Smart Array Failed: ${controller_report}"
    exit "$STATE_CRITICAL"
elif grep -q 'Disabled' <<<"$check"; then
    echo "RAID CRITICAL - HP Smart Array Problem: ${controller_report}"
    exit "$STATE_CRITICAL"
elif grep -q 'Failed' <<<"$check2"; then
    echo "RAID CRITICAL - HP Smart Array Failed: ${drive_report}"
    exit "$STATE_CRITICAL"
elif grep -q 'Failure' <<<"$check2"; then
    echo "RAID WARNING - Component Failure: ${drive_report}"
    exit "$STATE_WARNING"
elif grep -q 'Rebuild' <<<"$check2"; then
    echo "RAID WARNING - HP Smart Array Rebuilding: ${drive_report}"
    exit "$STATE_WARNING"
elif grep -q 'Recover' <<<"$check2"; then
    echo "RAID WARNING - HP Smart Array Recovering: ${drive_report}"
    exit "$STATE_WARNING"
elif grep -q 'FIRMWARE' <<<"$check"; then
    echo "RAID WARNING - ${controller_report}"
    exit "$STATE_WARNING"
fi

if [[ "$DEBUG" == "1" || "$VERBOSE" == "1" ]]; then
    check3=$(grep 'Status' <<<"$check" || true)
    echo "RAID OK: ${drive_report} [$(flatten "$check3")]"
else
    echo "RAID OK"
fi
exit "$STATE_OK"