forked from evolix/ansible-roles
227 lines
9 KiB
Python
227 lines
9 KiB
Python
#!/usr/bin/env python3
|
|
from optparse import OptionParser
|
|
import shlex
|
|
import subprocess
|
|
import sys
|
|
import requests
|
|
import json
|
|
|
|
if not hasattr(subprocess, "check_output"):  # duck punch it in!
    # Backport for interpreters whose subprocess module has no check_output.
    def f(*popenargs, **kwargs):
        """Run a command and return what it wrote to stdout.

        Arguments are forwarded to ``subprocess.Popen``. ``stdout`` may not
        be supplied because it is always replaced by a pipe.

        Raises:
            ValueError: if a ``stdout`` keyword argument is given.
            subprocess.CalledProcessError: if the command exits non-zero.
        """
        if 'stdout' in kwargs:
            raise ValueError('stdout argument not allowed, it will be overridden.')
        child = subprocess.Popen(stdout=subprocess.PIPE, *popenargs, **kwargs)
        captured = child.communicate()[0]
        status = child.poll()
        if not status:
            return captured
        # Report the command the same way Popen received it.
        failed_cmd = kwargs.get("args")
        if failed_cmd is None:
            failed_cmd = popenargs[0]
        raise subprocess.CalledProcessError(status, failed_cmd)

    subprocess.check_output = f
|
|
|
|
|
|
class RabbitCmdWrapper(object):
    """Run ``sudo rabbitmqctl`` sub-commands and return their parsed output.

    Every command is invoked through ``sudo``, so the calling user needs the
    matching sudo rights. Kept as its own class so it can be reused by other
    monitoring tools if desired.
    """

    # Banner lines that rabbitmqctl prints ahead of the data rows.
    _BANNERS = ("Listing connections ...", "Listing queues ...")

    @classmethod
    def _run(cls, subcommand):
        """Run ``sudo rabbitmqctl <subcommand>`` and return the parsed rows.

        Raises:
            subprocess.CalledProcessError: if the command exits non-zero.
        """
        args = shlex.split("sudo rabbitmqctl %s" % subcommand)
        cmd_result = subprocess.check_output(args, text=True).strip()
        return cls._parse_list_results(cmd_result)

    @classmethod
    def list_connections(cls):
        """Return the parsed rows of ``rabbitmqctl list_connections``."""
        return cls._run("list_connections")

    @classmethod
    def list_queues(cls):
        """Return the parsed rows of ``rabbitmqctl list_queues``."""
        return cls._run("list_queues")

    @classmethod
    def status(cls):
        """Return the lines of ``rabbitmqctl status``, each split on tabs."""
        return cls._run("status")

    @classmethod
    def _parse_list_results(cls, result_string):
        """Split command output into rows of tab-separated fields.

        Known banner lines and blank lines are dropped, so empty output
        yields an empty list rather than a single row holding an empty
        string.

        Args:
            result_string: raw text printed by rabbitmqctl.

        Returns:
            A list with one list of string fields per remaining line.
        """
        return_data = []
        for row in result_string.strip().split('\n'):
            # Skip text fluff and blank lines; they are not data rows.
            if not row.strip() or row in cls._BANNERS:
                continue
            return_data.append(row.split('\t'))
        return return_data
|
|
|
|
|
|
def check_connection_count(critical=0, warning=0):
    """Check that the number of connections is within the given thresholds.

    Prints a one-line status and terminates the process with exit code
    0 (OK), 1 (WARNING) or 2 (CRITICAL). Any failure while listing the
    connections is reported as CRITICAL with exit code 2.

    Args:
        critical: connection count at or above which the check is CRITICAL.
        warning: connection count at or above which the check is WARNING.

    Note that with the default thresholds of 0 every count is reported as
    CRITICAL, so callers should supply real values.
    """
    # Callers may pass None for an unset threshold; fall back to the default.
    if critical is None:
        critical = 0
    if warning is None:
        warning = 0

    try:
        count = len(RabbitCmdWrapper.list_connections())
    except Exception as err:
        print("CRITICAL - %s" % err)
        # Without an explicit exit the failure would end with status 0 (OK).
        sys.exit(2)

    if count >= critical:
        print("CRITICAL - Connection Count %d" % count)
        sys.exit(2)
    elif count >= warning:
        print("WARNING - Connection Count %d" % count)
        sys.exit(1)
    else:
        print("OK - Connection Count %d" % count)
        sys.exit(0)
|
|
|
|
|
|
def check_queues_count(critical=1000, warning=1000):
    """Blanket check that every queue's message count is within thresholds.

    Prints a one-line status and terminates the process with exit code
    0 (OK), 1 (WARNING) or 2 (CRITICAL). Any failure while listing the
    queues is reported as CRITICAL with exit code 2.

    Args:
        critical: message count at or above which a queue is CRITICAL.
        warning: message count at or above which a queue is WARNING.

    TODO: Possibly break this out so test can be done on individual queues.
    """
    # Callers may pass None for an unset threshold; fall back to the default.
    if critical is None:
        critical = 1000
    if warning is None:
        warning = 1000

    critical_q = []
    warning_q = []
    try:
        for queue in RabbitCmdWrapper.list_queues():
            # Only rows of the form [name, count] are evaluated. Rows whose
            # second field is not a non-negative integer (such as a header
            # line) are skipped.
            if len(queue) != 2 or not queue[1].strip().isdigit():
                continue
            count = int(queue[1])
            if count >= critical:
                critical_q.append("%s: %s" % (queue[0], count))
            elif count >= warning:
                warning_q.append("%s: %s" % (queue[0], count))
    except Exception as err:
        print("CRITICAL - %s" % err)
        sys.exit(2)

    if critical_q:
        print("CRITICAL - %s" % ", ".join(critical_q))
        sys.exit(2)
    elif warning_q:
        print("WARNING - %s" % ", ".join(warning_q))
        sys.exit(1)
    else:
        print("OK - NO QUEUES EXCEED THRESHOLDS")
        sys.exit(0)
|
|
|
|
def check_mem_usage(critical=75, warning=50):
    """Check the RabbitMQ memory usage as a percentage of its memory limit.

    The figures are scraped from the ``rabbitmqctl status`` lines: the line
    following one that contains ``memory,`` supplies the used amount, and
    the line containing ``vm_memory_limit`` supplies the limit. Only the
    digits on those lines are kept. If several lines match, the last match
    wins.

    Prints a one-line status and terminates the process with exit code
    0 (OK), 1 (WARNING) or 2 (CRITICAL). Any failure, including missing
    figures, is reported as critical with exit code 2.

    Args:
        critical: percentage above which the check is CRITICAL.
        warning: percentage above which the check is WARNING.
    """
    # Callers may pass None for an unset threshold; fall back to the default.
    if critical is None:
        critical = 75
    if warning is None:
        warning = 50

    try:
        results = RabbitCmdWrapper.status()

        mem_used_raw = None
        mem_limit_raw = None
        for idx, val in enumerate(results):
            line = str(val)
            if "memory," in line:
                mem_used_raw = str(results[idx + 1])
            if "vm_memory_limit" in line:
                mem_limit_raw = line

        if mem_used_raw is None or mem_limit_raw is None:
            raise ValueError("memory figures not found in rabbitmqctl status output")

        # filter() returns an iterator on Python 3, so the digits must be
        # joined back into a string before converting to a number.
        memory_used = float("".join(filter(str.isdigit, mem_used_raw)))
        memory_limit = float("".join(filter(str.isdigit, mem_limit_raw)))
        percent_usage = int(memory_used / memory_limit * 100)
    except Exception as err:
        print("Critical - %s" % err)
        sys.exit(2)

    if percent_usage > critical:
        print("CRITICAL - RABBITMQ RAM USAGE at %s%% of max" % percent_usage)
        sys.exit(2)
    elif percent_usage > warning:
        print("WARNING - RABBITMQ RAM USAGE at %s%% of max" % percent_usage)
        sys.exit(1)
    else:
        print("OK - RABBITMQ RAM USAGE OK at %s%% of max" % percent_usage)
        sys.exit(0)
|
|
|
|
def check_aliveness(username, password, timeout, cluster):
    """Query the management API aliveness test and report the result.

    Sends a GET to ``http://<cluster>:15672/api/aliveness-test/%2F``. Per
    the original description, that endpoint declares a test queue, then
    publishes and consumes a message, and answers HTTP 200 when everything
    works.

    Prints a one-line status and terminates the process with exit code
    0 when the response status is 200, and 2 for any other status or when
    the request itself fails.

    Args:
        username: user for HTTP basic authentication.
        password: password for HTTP basic authentication.
        timeout: request timeout passed to ``requests.get``.
        cluster: host name or IP of the management API.
    """
    try:
        r = requests.get("http://%s:15672/api/aliveness-test/%%2F" % cluster,
                         auth=(username, password), timeout=timeout)
    except requests.exceptions.RequestException as e:  # Throw error if rabbitmq is down
        print("Critical - %s" % e)
        sys.exit(2)

    if r.status_code == 200:
        print("OK - RABBITMQ Aliveness Test Returns: %s" % r)
        sys.exit(0)

    # Every non-200 status is a failure. Use the decoded body so the message
    # is not printed as a bytes repr.
    print("CRITICAL - RabbitMQ Error: %s" % r.text)
    sys.exit(2)
|
|
|
|
def check_cluster(username, password, timeout, cluster):
    """Check cluster health from the management API node list.

    Sends a GET to ``http://<cluster>:15672/api/nodes`` and classifies each
    returned node by its ``running`` flag.

    Prints a one-line status and terminates the process with exit code:
    0 when no node is down, 1 when exactly one node is down, 2 when two or
    more nodes are down, the request fails, the response status is not 200,
    or the body is not the expected JSON list of nodes.

    Args:
        username: user for HTTP basic authentication.
        password: password for HTTP basic authentication.
        timeout: request timeout passed to ``requests.get``.
        cluster: host name or IP of the management API.
    """
    url = "http://%s:15672/api/nodes" % cluster
    try:
        r = requests.get(url, auth=(username, password), timeout=timeout)
    except requests.exceptions.RequestException as e:  # Throw error if no response
        print("Critical - %s" % e)
        sys.exit(2)

    # An error response does not carry a node list; report it instead of
    # failing later while indexing into the payload.
    if r.status_code != 200:
        print("CRITICAL - RabbitMQ Error: %s" % r.text)
        sys.exit(2)

    try:
        nodes = json.loads(r.text)
        running_nodes = [node['name'] for node in nodes if node['running']]
        failed_nodes = [node['name'] for node in nodes if not node['running']]
    except (ValueError, KeyError, TypeError) as e:
        print("CRITICAL - unexpected response from %s: %s" % (url, e))
        sys.exit(2)

    if len(failed_nodes) == 1:
        print("WARNING: RabbitMQ cluster is degraded: Not running %s" % failed_nodes[0])
        sys.exit(1)
    elif len(failed_nodes) >= 2:
        print("CRITICAL: RabbitMQ cluster is critical: Not running %s" % failed_nodes)
        sys.exit(2)
    else:
        print("OK: RabbitMQ cluster members: %s" % (" ".join(running_nodes)))
        sys.exit(0)
|
|
|
|
|
|
# Usage text given to OptionParser and printed again for an invalid action.
USAGE = """Usage: ./check_rabbitmq -a [action] -C [critical] -W [warning]
Actions:
  - connection_count
    checks the number of connections in rabbitmq's list_connections
  - queues_count
    checks the count in each of the queues in rabbitmq's list_queues
  - mem_usage
    checks to ensure mem usage of rabbitmq process does not exceed 50%
  - aliveness
    Use the /api/aliveness-test API to send/receive a message. (requires -u username -p password args)
  - cluster_status
    Parse /api/nodes to check the cluster status. (requires -u username -p password)"""
|
|
|
|
if __name__ == "__main__":
    # Parse the command line and dispatch to the check named by -a/--action.
    parser = OptionParser(USAGE)
    parser.add_option("-a", "--action", dest="action",
                      help="Action to Check")
    parser.add_option("-C", "--critical", dest="critical",
                      type="int", help="Critical Threshold")
    parser.add_option("-W", "--warning", dest="warning",
                      type="int", help="Warning Threshold")
    parser.add_option("-u", "--username", dest="username", default="guest",
                      type="string", help="RabbitMQ username, Default guest")
    parser.add_option("-p", "--password", dest="password", default="guest",
                      type="string", help="RabbitMQ password, Default guest")
    parser.add_option("-t", "--timeout", dest="timeout", default=1,
                      type="int", help="Request Timeout, defaults to 1 second")
    parser.add_option("-c", "--cluster", dest="cluster", default="localhost",
                      type="string", help="Cluster IP/DNS name, defaults to localhost")
    (options, args) = parser.parse_args()

    # Forward only the thresholds that were actually supplied, so each
    # check's own defaults apply instead of being overridden by None.
    thresholds = {}
    if options.critical is not None:
        thresholds["critical"] = options.critical
    if options.warning is not None:
        thresholds["warning"] = options.warning

    if options.action == "connection_count":
        check_connection_count(**thresholds)
    elif options.action == "queues_count":
        check_queues_count(**thresholds)
    elif options.action == "mem_usage":
        check_mem_usage(**thresholds)
    elif options.action == "aliveness":
        check_aliveness(options.username, options.password, options.timeout, options.cluster)
    elif options.action == "cluster_status":
        check_cluster(options.username, options.password, options.timeout, options.cluster)
    else:
        print("Invalid action: %s" % options.action)
        print(USAGE)
        # Exit 3 (UNKNOWN) so a misconfigured check is not reported as OK.
        sys.exit(3)
|