227 lines
8.9 KiB
Plaintext
227 lines
8.9 KiB
Plaintext
|
#!/usr/bin/env python2
|
||
|
from optparse import OptionParser
|
||
|
import shlex
|
||
|
import subprocess
|
||
|
import sys
|
||
|
import requests
|
||
|
import json
|
||
|
|
||
|
# Interpreters whose subprocess module has no check_output get a stand-in
# installed under that name, so the rest of the script can call it directly.
if not hasattr(subprocess, "check_output"):
    def f(*popenargs, **kwargs):
        """Run a command and return everything it wrote to stdout.

        Arguments are forwarded to subprocess.Popen.  ``stdout`` may not be
        supplied because it is always redirected to a pipe.

        Raises:
            ValueError: if ``stdout`` is passed as a keyword argument.
            subprocess.CalledProcessError: if the command exits non-zero.
        """
        if 'stdout' in kwargs:
            raise ValueError('stdout argument not allowed, it will be overridden.')
        child = subprocess.Popen(stdout=subprocess.PIPE, *popenargs, **kwargs)
        captured = child.communicate()[0]
        status = child.poll()
        if not status:
            return captured
        # Report the command the same way Popen received it: the explicit
        # ``args`` keyword if present, otherwise the first positional.
        command = kwargs.get("args")
        if command is None:
            command = popenargs[0]
        raise subprocess.CalledProcessError(status, command)
    subprocess.check_output = f
|
||
|
|
||
|
|
||
|
class RabbitCmdWrapper(object):
|
||
|
"""So basically this just runs rabbitmqctl commands and returns parsed output.
|
||
|
Typically this means you need root privs for this to work.
|
||
|
Made this it's own class so it could be used in other monitoring tools
|
||
|
if desired."""
|
||
|
|
||
|
@classmethod
|
||
|
def list_connections(cls):
|
||
|
args = shlex.split("sudo rabbitmqctl list_connections")
|
||
|
cmd_result = subprocess.check_output(args).strip()
|
||
|
results = cls._parse_list_results(cmd_result)
|
||
|
return results
|
||
|
|
||
|
@classmethod
|
||
|
def list_queues(cls):
|
||
|
args = shlex.split('sudo rabbitmqctl list_queues')
|
||
|
cmd_result = subprocess.check_output(args).strip()
|
||
|
results = cls._parse_list_results(cmd_result)
|
||
|
return results
|
||
|
|
||
|
@classmethod
|
||
|
def status(cls):
|
||
|
args = shlex.split('sudo rabbitmqctl status')
|
||
|
cmd_result = subprocess.check_output(args).strip()
|
||
|
results = cls._parse_list_results(cmd_result)
|
||
|
return results
|
||
|
|
||
|
@classmethod
|
||
|
def _parse_list_results(cls, result_string):
|
||
|
results = result_string.strip().split('\n')
|
||
|
#remove text fluff
|
||
|
if "Listing connections ..." in results: results.remove("Listing connections ...")
|
||
|
if "Listing queues ..." in results: results.remove("Listing queues ...")
|
||
|
return_data = []
|
||
|
for row in results:
|
||
|
return_data.append(row.split('\t'))
|
||
|
return return_data
|
||
|
|
||
|
|
||
|
def check_connection_count(critical=0, warning=0, rows=None):
    """Check that the number of connections is within the thresholds.

    Prints one status line and terminates the process through sys.exit().

    Args:
        critical: connection count at or above which the check is CRITICAL.
        warning: connection count at or above which the check is WARNING.
        rows: optional pre-fetched connection rows; when None they are read
            from RabbitCmdWrapper.list_connections().

    Exit status:
        2 -- count >= critical, or the count could not be determined
        1 -- count >= warning
        0 -- otherwise

    Note that with the default thresholds of 0 every count satisfies
    ``count >= critical``, so callers are expected to pass real thresholds.
    """
    try:
        if rows is None:
            rows = RabbitCmdWrapper.list_connections()
        count = len(rows)
        if count >= critical:
            print("CRITICAL - Connection Count %d" % count)
            sys.exit(2)
        elif count >= warning:
            print("WARNING - Connection Count %d" % count)
            sys.exit(1)
        else:
            print("OK - Connection Count %d" % count)
            sys.exit(0)
    except Exception as err:
        # sys.exit() raises SystemExit, which is not an Exception subclass,
        # so only genuine failures land here.  They must exit non-zero or
        # the failed check would be reported as healthy.
        print("CRITICAL - %s" % err)
        sys.exit(2)
|
||
|
|
||
|
|
||
|
def check_queues_count(critical=1000, warning=1000, rows=None):
    """
    A blanket check to make sure all queues are within count parameters.

    Prints one status line and terminates the process through sys.exit().

    Args:
        critical: message count at or above which a queue is CRITICAL;
            None falls back to 1000.
        warning: message count at or above which a queue is WARNING;
            None falls back to 1000.
        rows: optional pre-fetched queue rows; when None they are read from
            RabbitCmdWrapper.list_queues().

    Exit status:
        2 -- at least one queue is critical, or the queues could not be read
        1 -- at least one queue is at warning level (and none critical)
        0 -- no queue exceeds a threshold

    TODO: Possibly break this out so test can be done on individual queues.
    """
    try:
        if critical is None:
            critical = 1000
        if warning is None:
            warning = 1000
        if rows is None:
            rows = RabbitCmdWrapper.list_queues()

        critical_q = []
        warning_q = []
        for queue in rows:
            # Only two-field rows (name, count) are evaluated.  The row is a
            # list, so its length must be tested with len(); ``queue.count``
            # is the list method and never equals 2.
            if len(queue) != 2:
                continue
            try:
                count = int(queue[1])
            except ValueError:
                # Second field is not a number (presumably a header or
                # informational line) -- nothing to compare.
                continue
            label = "%s: %s" % (queue[0], count)
            if count >= critical:
                critical_q.append(label)
            elif count >= warning:
                warning_q.append(label)
    except Exception as err:
        print("CRITICAL - %s" % err)
        sys.exit(2)

    if critical_q:
        print("CRITICAL - %s" % ", ".join(critical_q))
        sys.exit(2)
    elif warning_q:
        print("WARNING - %s" % ", ".join(warning_q))
        sys.exit(1)
    else:
        print("OK - NO QUEUES EXCEED THRESHOLDS")
        sys.exit(0)
|
||
|
|
||
|
def check_mem_usage(critical=75, warning=50, rows=None):
    """Check RabbitMQ memory use as a percentage of its configured limit.

    The figures are scraped from the status output: the digits on the line
    following the one containing ``memory,`` are taken as memory used, and
    the digits on the line containing ``vm_memory_limit`` as the limit.
    Prints one status line and terminates the process through sys.exit().

    Args:
        critical: percentage above which the check is CRITICAL; None falls
            back to 75.
        warning: percentage above which the check is WARNING; None falls
            back to 50.
        rows: optional pre-fetched status rows; when None they are read from
            RabbitCmdWrapper.status().

    Exit status:
        2 -- usage > critical, or the figures could not be determined
        1 -- usage > warning
        0 -- otherwise
    """
    try:
        if critical is None:
            critical = 75
        if warning is None:
            warning = 50
        if rows is None:
            rows = RabbitCmdWrapper.status()

        mem_used_raw = None
        mem_limit_raw = None
        for idx, val in enumerate(rows):
            text = str(val)
            if "memory," in text:
                # The value sits on the line after the "memory," marker.
                mem_used_raw = str(rows[idx + 1])
            if "vm_memory_limit" in text:
                mem_limit_raw = text

        if mem_used_raw is None or mem_limit_raw is None:
            raise ValueError("memory usage or vm_memory_limit not found in status output")

        # Keep only the digits; a join works on both Python 2 and 3, whereas
        # float(filter(...)) relies on Python 2's string-returning filter.
        memory_used = float("".join(ch for ch in mem_used_raw if ch.isdigit()))
        memory_limit = float("".join(ch for ch in mem_limit_raw if ch.isdigit()))
        percent_usage = int(memory_used / memory_limit * 100)
    except Exception as err:
        print("Critical - %s" % err)
        sys.exit(2)

    if percent_usage > critical:
        print("CRITICAL - RABBITMQ RAM USAGE at %s%% of max" % percent_usage)
        sys.exit(2)
    elif percent_usage > warning:
        print("WARNING - RABBITMQ RAM USAGE at %s%% of max" % percent_usage)
        sys.exit(1)
    else:
        print("OK - RABBITMQ RAM USAGE OK at %s%% of max" % percent_usage)
        sys.exit(0)
|
||
|
|
||
|
def check_aliveness(username, password, timeout, cluster):
    """Run the management API aliveness test and report the result.

    Issues GET http://<cluster>:15672/api/aliveness-test/%2F with HTTP basic
    auth.  According to the original author's notes the endpoint declares a
    test queue, then publishes and consumes a message, and answers with HTTP
    status 200 when everything is working.  Intended for use by monitoring
    tools.

    Args:
        username: management API user.
        password: management API password.
        timeout: request timeout, passed straight to requests.get().
        cluster: host name or IP of the node to query.

    Exit status (the function always terminates via sys.exit()):
        0 -- the API answered with status 200
        2 -- the request failed or any other status was returned
    """
    try:
        # "%%2F" renders as "%2F", the URL-encoded "/".
        r = requests.get("http://%s:15672/api/aliveness-test/%%2F" % cluster,
                         auth=(username, password), timeout=timeout)
    except requests.exceptions.RequestException as e:  # Throw error if rabbitmq is down
        print("Critical - %s" % e)
        sys.exit(2)

    # The status is either 200 or it is not; no third outcome exists, so
    # there is no UNKNOWN branch here.
    if r.status_code == 200:
        print("OK - RABBITMQ Aliveness Test Returns: %s" % r)
        sys.exit(0)
    else:
        print("CRITICAL - RabbitMQ Error: %s" % r.content)
        sys.exit(2)
|
||
|
|
||
|
def check_cluster(username, password, timeout, cluster):
    """Check cluster health by counting nodes that are not running.

    Issues GET http://<cluster>:15672/api/nodes with HTTP basic auth and
    inspects the ``running`` and ``name`` fields of every returned node.

    Args:
        username: management API user.
        password: management API password.
        timeout: request timeout, passed straight to requests.get().
        cluster: host name or IP of the node to query.

    Exit status (the function always terminates via sys.exit()):
        0 -- every listed node is running
        1 -- exactly one node is not running
        2 -- two or more nodes are not running, the request failed, the API
             answered with a non-200 status, or the payload was not the
             expected list of node objects
    """
    try:
        url = "http://%s:15672/api/nodes" % cluster
        r = requests.get(url, auth=(username, password), timeout=timeout)
    except requests.exceptions.RequestException as e:  # Throw error if no response
        print("Critical - %s" % e)
        sys.exit(2)

    # An error response is not a node list; report it instead of trying to
    # iterate over it.
    if r.status_code != 200:
        print("CRITICAL - RabbitMQ Error: %s" % r.content)
        sys.exit(2)

    running_nodes = []
    failed_nodes = []
    try:
        nodes = json.loads(r.text)
        for node in nodes:
            if node['running']:
                running_nodes.append(node['name'])
            else:
                failed_nodes.append(node['name'])
    except (ValueError, TypeError, KeyError) as e:
        # ValueError: body is not JSON; TypeError/KeyError: JSON of an
        # unexpected shape.
        print("CRITICAL - unexpected /api/nodes response: %s" % e)
        sys.exit(2)

    if len(failed_nodes) == 1:
        print("WARNING: RabbitMQ cluster is degraged: Not running %s" % failed_nodes[0])
        sys.exit(1)
    elif len(failed_nodes) >= 2:
        print("CRITICAL: RabbitMQ cluster is critical: Not running %s" % failed_nodes)
        sys.exit(2)
    else:
        print("OK: RabbitMQ cluster members: %s" % (" ".join(running_nodes)))
        sys.exit(0)
|
||
|
|
||
|
|
||
|
# Help text handed to OptionParser and printed when the action is invalid.
USAGE = """Usage: ./check_rabbitmq -a [action] -C [critical] -W [warning]
Actions:
- connection_count
  checks the number of connections in rabbitmq's list_connections
- queues_count
  checks the count in each of the queues in rabbitmq's list_queues
- mem_usage
  checks to ensure mem usage of rabbitmq process does not exceed 50%
- aliveness
  Use the /api/aliveness-test API to send/receive a message. (requires -u username -p password args)
- cluster_status
  Parse /api/nodes to check the cluster status. (requires -u username -p password args)"""


def main(argv=None):
    """Parse the command line and run the requested check.

    Args:
        argv: argument list without the program name; None makes optparse
            read sys.argv[1:].

    The selected check terminates the process itself.  An unknown or missing
    action prints the usage text and exits with status 3, so the monitoring
    system does not mistake a misconfigured check for a healthy one.
    """
    parser = OptionParser(USAGE)
    parser.add_option("-a", "--action", dest="action",
                      help="Action to Check")
    parser.add_option("-C", "--critical", dest="critical",
                      type="int", help="Critical Threshold")
    parser.add_option("-W", "--warning", dest="warning",
                      type="int", help="Warning Threshold")
    parser.add_option("-u", "--username", dest="username", default="guest",
                      type="string", help="RabbitMQ username, Default guest")
    parser.add_option("-p", "--password", dest="password", default="guest",
                      type="string", help="RabbitMQ password, Default guest")
    parser.add_option("-t", "--timeout", dest="timeout", default=1,
                      type="int", help="Request Timeout, defaults to 1 second")
    parser.add_option("-c", "--cluster", dest="cluster", default="localhost",
                      type="string", help="Cluster IP/DNS name, defaults to localhost")
    (options, args) = parser.parse_args(argv)

    # Forward only the thresholds actually given; an omitted -C/-W is None
    # and would otherwise override the check's own default.
    thresholds = {}
    if options.critical is not None:
        thresholds["critical"] = options.critical
    if options.warning is not None:
        thresholds["warning"] = options.warning

    api_args = (options.username, options.password, options.timeout, options.cluster)

    # Lambdas defer the call until the action has been selected.
    actions = {
        "connection_count": lambda: check_connection_count(**thresholds),
        "queues_count": lambda: check_queues_count(**thresholds),
        "mem_usage": lambda: check_mem_usage(**thresholds),
        "aliveness": lambda: check_aliveness(*api_args),
        "cluster_status": lambda: check_cluster(*api_args),
    }

    run_check = actions.get(options.action)
    if run_check is None:
        print("Invalid action: %s" % options.action)
        print(USAGE)
        sys.exit(3)
    run_check()


if __name__ == "__main__":
    main()
|