rabbitmq: NRPE check and config

This commit is contained in:
Jérémy Lecour 2017-11-26 19:30:24 +01:00 committed by Jérémy Lecour
parent f21ce97903
commit f1063cce94
5 changed files with 277 additions and 0 deletions

View File

@ -0,0 +1,4 @@
---
# Thresholds substituted into the NRPE command for check_rabbitmq's
# "connection_count" action (-C for critical, -W for warning).
rabbitmq_connections_critical: 200
rabbitmq_connections_warning: 150

View File

@ -0,0 +1,226 @@
#!/usr/bin/env python2
from optparse import OptionParser
import shlex
import subprocess
import sys
import requests
import json
if "check_output" not in dir( subprocess ): # duck punch it in!
def f(*popenargs, **kwargs):
if 'stdout' in kwargs:
raise ValueError('stdout argument not allowed, it will be overridden.')
process = subprocess.Popen(stdout=subprocess.PIPE, *popenargs, **kwargs)
output, unused_err = process.communicate()
retcode = process.poll()
if retcode:
cmd = kwargs.get("args")
if cmd is None:
cmd = popenargs[0]
raise subprocess.CalledProcessError(retcode, cmd)
return output
subprocess.check_output = f
class RabbitCmdWrapper(object):
    """Run ``sudo rabbitmqctl`` sub-commands and return their parsed output.

    Typically this means you need root privileges for this to work.
    Kept as its own class so it can be reused by other monitoring tools.
    """

    # Banner lines rabbitmqctl prints before the actual rows.
    _BANNER_LINES = ("Listing connections ...", "Listing queues ...")

    @classmethod
    def _run(cls, subcommand):
        """Run ``sudo rabbitmqctl <subcommand>`` and return the parsed rows.

        Raises:
            subprocess.CalledProcessError: if the command exits non-zero.
        """
        args = ["sudo", "rabbitmqctl", subcommand]
        cmd_result = subprocess.check_output(args).strip()
        return cls._parse_list_results(cmd_result)

    @classmethod
    def list_connections(cls):
        """Return one row (list of columns) per line of ``list_connections``."""
        return cls._run("list_connections")

    @classmethod
    def list_queues(cls):
        """Return one row (list of columns) per line of ``list_queues``."""
        return cls._run("list_queues")

    @classmethod
    def status(cls):
        """Return the lines of ``rabbitmqctl status``, each split on tabs."""
        return cls._run("status")

    @classmethod
    def _parse_list_results(cls, result_string):
        """Split command output into rows of tab-separated columns.

        The known banner lines are dropped, and so are blank lines, so that
        empty output yields an empty list instead of one phantom row.
        """
        lines = result_string.strip().split('\n')
        # remove text fluff
        for banner in cls._BANNER_LINES:
            if banner in lines:
                lines.remove(banner)
        return [line.split('\t') for line in lines if line.strip()]
def check_connection_count(critical=0, warning=0):
    """Check that the number of connections is within the given thresholds.

    Prints a Nagios status line and exits with 0 (OK), 1 (WARNING) or
    2 (CRITICAL). A failure to obtain the connection list is CRITICAL.

    Args:
        critical: count at or above which the result is CRITICAL.
        warning: count at or above which the result is WARNING.
            ``None`` for either threshold means "use the default".

    NOTE: with the default thresholds of 0 every count is reported as
    CRITICAL, so callers are expected to pass real thresholds.
    """
    # The command line hands over None for thresholds that were not given.
    if critical is None:
        critical = 0
    if warning is None:
        warning = 0

    try:
        count = len(RabbitCmdWrapper.list_connections())
    except Exception as err:
        print("CRITICAL - %s" % err)
        # Without an explicit exit the plugin would end with status 0 (OK).
        sys.exit(2)

    if count >= critical:
        print("CRITICAL - Connection Count %d" % count)
        sys.exit(2)
    if count >= warning:
        print("WARNING - Connection Count %d" % count)
        sys.exit(1)
    print("OK - Connection Count %d" % count)
    sys.exit(0)
def check_queues_count(critical=1000, warning=1000):
    """Blanket check that every queue's message count is within thresholds.

    Prints a Nagios status line listing the offending queues and exits with
    0 (OK), 1 (WARNING) or 2 (CRITICAL). A failure to obtain the queue list
    is CRITICAL.

    Args:
        critical: message count at or above which a queue is CRITICAL.
        warning: message count at or above which a queue is WARNING.
            ``None`` for either threshold means "use the default".

    TODO: Possibly break this out so test can be done on individual queues.
    """
    # The command line hands over None for thresholds that were not given.
    if critical is None:
        critical = 1000
    if warning is None:
        warning = 1000

    critical_q = []
    warning_q = []
    try:
        for queue in RabbitCmdWrapper.list_queues():
            # A queue entry is exactly [name, message_count]; any other row
            # (banner text, column headers) is not a queue and is skipped.
            if len(queue) != 2 or not queue[1].strip().isdigit():
                continue
            count = int(queue[1])
            if count >= critical:
                critical_q.append("%s: %s" % (queue[0], count))
            elif count >= warning:
                warning_q.append("%s: %s" % (queue[0], count))
    except Exception as err:
        print("CRITICAL - %s" % err)
        sys.exit(2)

    if critical_q:
        print("CRITICAL - %s" % ", ".join(critical_q))
        sys.exit(2)
    if warning_q:
        print("WARNING - %s" % ", ".join(warning_q))
        sys.exit(1)
    print("OK - NO QUEUES EXCEED THRESHOLDS")
    sys.exit(0)
def check_mem_usage(critical=75, warning=50):
    """Check RabbitMQ memory use as a percentage of its ``vm_memory_limit``.

    Both figures are scraped from ``rabbitmqctl status``: the used amount is
    taken from the line following the one containing ``memory,`` and the
    limit from the line containing ``vm_memory_limit``. Prints a Nagios
    status line and exits with 0 (OK), 1 (WARNING) or 2 (CRITICAL); any
    failure to obtain or parse the figures is CRITICAL.

    Args:
        critical: percentage above which the result is CRITICAL.
        warning: percentage above which the result is WARNING.
            ``None`` for either threshold means "use the default".
    """
    # The command line hands over None for thresholds that were not given.
    if critical is None:
        critical = 75
    if warning is None:
        warning = 50

    try:
        results = RabbitCmdWrapper.status()
        mem_used_raw = None
        mem_limit_raw = None
        for idx, val in enumerate(results):
            row_text = str(val)
            if "memory," in row_text:
                # The figure sits on the line after the "memory," marker.
                mem_used_raw = str(results[idx + 1])
            if "vm_memory_limit" in row_text:
                mem_limit_raw = row_text
        if mem_used_raw is None or mem_limit_raw is None:
            raise ValueError("memory figures not found in rabbitmqctl status output")
        # Keep only the digits of each line to obtain the byte counts.
        memory_used = float("".join(ch for ch in mem_used_raw if ch.isdigit()))
        memory_limit = float("".join(ch for ch in mem_limit_raw if ch.isdigit()))
        percent_usage = int(memory_used / memory_limit * 100)
    except Exception as err:
        print("Critical - %s" % err)
        sys.exit(2)

    if percent_usage > critical:
        print("CRITICAL - RABBITMQ RAM USAGE at %s%% of max" % percent_usage)
        sys.exit(2)
    if percent_usage > warning:
        print("WARNING - RABBITMQ RAM USAGE at %s%% of max" % percent_usage)
        sys.exit(1)
    print("OK - RABBITMQ RAM USAGE OK at %s%% of max" % percent_usage)
    sys.exit(0)
def check_aliveness(username, password, timeout, cluster):
    """Query the management API's aliveness test for the default vhost.

    Per the API, the endpoint declares a test queue, then publishes and
    consumes a message; it is intended for use by monitoring tools. Prints a
    Nagios status line and exits with 0 (OK) on HTTP 200, otherwise with
    2 (CRITICAL), including when the request itself fails.

    Args:
        username: management API user.
        password: management API password.
        timeout: request timeout in seconds.
        cluster: host name or IP of the node to query (port 15672).
    """
    try:
        r = requests.get("http://%s:15672/api/aliveness-test/%%2F" % cluster,
                         auth=(username, password), timeout=timeout)
    except requests.exceptions.RequestException as e:  # Throw error if rabbitmq is down
        print("Critical - %s" % e)
        sys.exit(2)

    if r.status_code == 200:
        print("OK - RABBITMQ Aliveness Test Returns: %s" % r)
        sys.exit(0)
    # Any other status is a failure; there is no third outcome to report.
    print("CRITICAL - RabbitMQ Error: %s" % r.content)
    sys.exit(2)
def check_cluster(username, password, timeout, cluster):
    """Check cluster health from the management API's ``/api/nodes`` list.

    A node whose ``running`` field is false counts as failed. Prints a
    Nagios status line and exits with 0 (OK) when every node is running,
    1 (WARNING) when exactly one node is down, and 2 (CRITICAL) when two or
    more are down or when the API cannot be queried or understood.

    Args:
        username: management API user.
        password: management API password.
        timeout: request timeout in seconds.
        cluster: host name or IP of the node to query (port 15672).
    """
    try:
        url = "http://%s:15672/api/nodes" % cluster
        r = requests.get(url, auth=(username, password), timeout=timeout)
    except requests.exceptions.RequestException as e:  # Throw error if no response
        print("Critical - %s" % e)
        sys.exit(2)

    # An error reply (bad credentials, ...) is not a node list; report it
    # instead of letting the parsing below crash with a traceback.
    if r.status_code != 200:
        print("CRITICAL: RabbitMQ API error: %s" % r.content)
        sys.exit(2)

    running_nodes = []
    failed_nodes = []
    try:
        for node in json.loads(r.text):
            if node['running']:
                running_nodes.append(node['name'])
            else:
                failed_nodes.append(node['name'])
    except (ValueError, TypeError, KeyError) as e:
        print("CRITICAL: unexpected RabbitMQ API response: %s" % e)
        sys.exit(2)

    if len(failed_nodes) == 1:
        print("WARNING: RabbitMQ cluster is degraged: Not running %s" % failed_nodes[0])
        sys.exit(1)
    if len(failed_nodes) >= 2:
        print("CRITICAL: RabbitMQ cluster is critical: Not running %s" % failed_nodes)
        sys.exit(2)
    print("OK: RabbitMQ cluster members: %s" % (" ".join(running_nodes)))
    sys.exit(0)
# Help text given to OptionParser and printed when the action is not recognised.
USAGE = """Usage: ./check_rabbitmq -a [action] -C [critical] -W [warning]
Actions:
- connection_count
checks the number of connection in rabbitmq's list_connections
- queues_count
checks the count in each of the queues in rabbitmq's list_queues
- mem_usage
checks to ensure mem usage of rabbitmq process does not exceed 50%
- aliveness
Use the /api/aliveness-test API to send/receive a message. (requires -u username -p password args)
- cluster_status
Parse /api/nodes to check the cluster status. (requires -u username -p password"""


if __name__ == "__main__":
    # Parse the command line and dispatch to the check named by --action.
    parser = OptionParser(USAGE)
    parser.add_option("-a", "--action", dest="action",
                      help="Action to Check")
    parser.add_option("-C", "--critical", dest="critical",
                      type="int", help="Critical Threshold")
    parser.add_option("-W", "--warning", dest="warning",
                      type="int", help="Warning Threshold")
    parser.add_option("-u", "--username", dest="username", default="guest",
                      type="string", help="RabbitMQ username, Default guest")
    parser.add_option("-p", "--password", dest="password", default="guest",
                      type="string", help="RabbitMQ password, Default guest")
    parser.add_option("-t", "--timeout", dest="timeout", default=1,
                      type="int", help="Request Timeout, defaults to 1 second")
    parser.add_option("-c", "--cluster", dest="cluster", default="localhost",
                      type="string", help="Cluster IP/DNS name, defaults to localhost")
    (options, args) = parser.parse_args()

    if options.action == "connection_count":
        check_connection_count(options.critical, options.warning)
    elif options.action == "queues_count":
        check_queues_count(options.critical, options.warning)
    elif options.action == "mem_usage":
        check_mem_usage(options.critical, options.warning)
    elif options.action == "aliveness":
        check_aliveness(options.username, options.password, options.timeout, options.cluster)
    elif options.action == "cluster_status":
        check_cluster(options.username, options.password, options.timeout, options.cluster)
    else:
        print("Invalid action: %s" % options.action)
        print(USAGE)
        # Exit with the Nagios UNKNOWN status; falling off the end would
        # return 0 and make a misconfigured check look healthy.
        sys.exit(3)

View File

@ -4,3 +4,8 @@
name: rabbitmq-server
state: restarted
# Handler: restarts the nagios-nrpe-server service. Notified by the task that
# writes the check_rabbitmq command into the NRPE configuration.
- name: restart nagios-nrpe-server
service:
name: nagios-nrpe-server
state: restarted

View File

@ -27,3 +27,14 @@
lineinfile:
dest: /etc/default/rabbitmq-server
line: ulimit -n 2048
# Look for the Evolix NRPE configuration file and keep the result in
# nrpe_evolix_config. "check_mode: no" makes the stat run even in check mode.
- name: is NRPE present ?
stat:
path: /etc/nagios/nrpe.d/evolix.cfg
check_mode: no
register: nrpe_evolix_config
tags:
- nrpe
# Only run the NRPE tasks when that configuration file exists.
- include: nrpe.yml
when: nrpe_evolix_config.stat.exists

31
rabbitmq/tasks/nrpe.yml Normal file
View File

@ -0,0 +1,31 @@
---
# Installs the check_rabbitmq Nagios plugin and exposes its connection_count
# action through NRPE.

# The plugin imports the "requests" Python module.
- name: check_rabbitmq dependencies
  apt:
    name: python-requests
    # "present" is the documented state; "installed" is a deprecated alias.
    state: present

# https://raw.githubusercontent.com/CaptPhunkosis/check_rabbitmq/master/check_rabbitmq
- name: check_rabbitmq is installed
  copy:
    src: check_rabbitmq
    dest: /usr/local/lib/nagios/plugins/check_rabbitmq
    owner: root
    group: root
    mode: "0755"
    force: yes

# The command runs the plugin through sudo, with thresholds taken from the
# rabbitmq_connections_critical / rabbitmq_connections_warning variables.
- name: check_rabbitmq is available for NRPE
  lineinfile:
    dest: /etc/nagios/nrpe.d/evolix.cfg
    regexp: 'command\[check_rab_connection_count\]'
    line: 'command[check_rab_connection_count]=sudo /usr/local/lib/nagios/plugins/check_rabbitmq -a connection_count -C {{ rabbitmq_connections_critical }} -W {{ rabbitmq_connections_warning }}'
  notify: restart nagios-nrpe-server

# Lets the nagios user run the plugin via sudo without a password; the edited
# file is validated with visudo before it replaces the original.
- name: sudo without password for nagios
  lineinfile:
    dest: /etc/sudoers.d/evolinux
    regexp: 'check_rabbitmq'
    line: 'nagios ALL = NOPASSWD: /usr/local/lib/nagios/plugins/check_rabbitmq'
    insertafter: '^nagios'
    validate: "visudo -cf %s"