diff --git a/rabbitmq/defaults/main.yml b/rabbitmq/defaults/main.yml
new file mode 100644
index 00000000..f08eeed9
--- /dev/null
+++ b/rabbitmq/defaults/main.yml
@@ -0,0 +1,4 @@
+---
+
+rabbitmq_connections_critical: 200
+rabbitmq_connections_warning: 150
diff --git a/rabbitmq/files/check_rabbitmq b/rabbitmq/files/check_rabbitmq
new file mode 100644
index 00000000..4969cc5a
--- /dev/null
+++ b/rabbitmq/files/check_rabbitmq
@@ -0,0 +1,279 @@
+#!/usr/bin/env python2
+"""Nagios/NRPE checks for a RabbitMQ node or cluster.
+
+Every check prints one status line and exits with the standard Nagios
+plugin return code: 0 (OK), 1 (WARNING), 2 (CRITICAL) or 3 (UNKNOWN).
+"""
+from optparse import OptionParser
+import shlex
+import subprocess
+import sys
+import requests
+import json
+
+# Nagios plugin return codes.
+STATE_OK = 0
+STATE_WARNING = 1
+STATE_CRITICAL = 2
+STATE_UNKNOWN = 3
+
+if "check_output" not in dir(subprocess):  # duck punch it in!
+    # subprocess.check_output only exists since Python 2.7.
+    def f(*popenargs, **kwargs):
+        if 'stdout' in kwargs:
+            raise ValueError('stdout argument not allowed, it will be overridden.')
+        process = subprocess.Popen(stdout=subprocess.PIPE, *popenargs, **kwargs)
+        output, unused_err = process.communicate()
+        retcode = process.poll()
+        if retcode:
+            cmd = kwargs.get("args")
+            if cmd is None:
+                cmd = popenargs[0]
+            raise subprocess.CalledProcessError(retcode, cmd)
+        return output
+    subprocess.check_output = f
+
+
+def nagios_exit(state, message):
+    """Print the status line and exit with the given Nagios return code."""
+    print(message)
+    sys.exit(state)
+
+
+class RabbitCmdWrapper(object):
+    """Run rabbitmqctl commands and return their parsed output.
+
+    rabbitmqctl is invoked through sudo, so the calling user typically
+    needs root privileges for this to work.
+
+    Kept as its own class so it can be reused by other monitoring tools.
+    """
+
+    # Informational lines rabbitmqctl prints around the actual listing.
+    _FLUFF = ("Listing connections ...", "Listing queues ...", "...done.")
+
+    @classmethod
+    def _run(cls, command):
+        """Run a command line and return its output parsed into rows."""
+        output = subprocess.check_output(shlex.split(command))
+        return cls._parse_list_results(output)
+
+    @classmethod
+    def list_connections(cls):
+        """Return one row per open connection."""
+        return cls._run("sudo rabbitmqctl list_connections")
+
+    @classmethod
+    def list_queues(cls):
+        """Return one row of tab-separated fields per queue."""
+        return cls._run("sudo rabbitmqctl list_queues")
+
+    @classmethod
+    def status(cls):
+        """Return the output of `rabbitmqctl status`, one row per line."""
+        return cls._run("sudo rabbitmqctl status")
+
+    @classmethod
+    def _parse_list_results(cls, result_string):
+        """Split raw command output into rows of tab-separated fields.
+
+        Blank lines and rabbitmqctl's informational lines are dropped, so
+        an empty listing yields [] instead of one bogus empty row.
+        """
+        rows = []
+        for line in result_string.strip().split('\n'):
+            if not line.strip() or line.strip() in cls._FLUFF:
+                continue
+            rows.append(line.split('\t'))
+        return rows
+
+
+def check_connection_count(critical=0, warning=0):
+    """Check that the number of open connections stays below the thresholds.
+
+    A count greater than or equal to `critical` (resp. `warning`) raises a
+    CRITICAL (resp. WARNING) alert; failing to query RabbitMQ is CRITICAL.
+    """
+    try:
+        count = len(RabbitCmdWrapper.list_connections())
+    except Exception as err:
+        nagios_exit(STATE_CRITICAL, "CRITICAL - %s" % err)
+    if count >= critical:
+        nagios_exit(STATE_CRITICAL, "CRITICAL - Connection Count %d" % count)
+    elif count >= warning:
+        nagios_exit(STATE_WARNING, "WARNING - Connection Count %d" % count)
+    nagios_exit(STATE_OK, "OK - Connection Count %d" % count)
+
+
+def check_queues_count(critical=1000, warning=1000):
+    """Check that no queue holds more messages than the thresholds allow.
+
+    A blanket check applied to every queue.
+    TODO: Possibly break this out so test can be done on individual queues.
+    """
+    critical_q = []
+    warning_q = []
+    try:
+        for queue in RabbitCmdWrapper.list_queues():
+            # Only two-field rows (name, count) describe a queue.
+            if len(queue) != 2:
+                continue
+            count = int(queue[1])
+            if count >= critical:
+                critical_q.append("%s: %s" % (queue[0], count))
+            elif count >= warning:
+                warning_q.append("%s: %s" % (queue[0], count))
+    except Exception as err:
+        nagios_exit(STATE_CRITICAL, "CRITICAL - %s" % err)
+    if critical_q:
+        nagios_exit(STATE_CRITICAL, "CRITICAL - %s" % ", ".join(critical_q))
+    elif warning_q:
+        nagios_exit(STATE_WARNING, "WARNING - %s" % ", ".join(warning_q))
+    nagios_exit(STATE_OK, "OK - NO QUEUES EXCEED THRESHOLDS")
+
+
+def _digits(text):
+    """Return the number formed by all the digits found in `text`."""
+    return float("".join(ch for ch in text if ch.isdigit()))
+
+
+def check_mem_usage(critical=75, warning=50):
+    """Check RabbitMQ's memory usage as a percentage of its memory limit.
+
+    The figures are scraped from `rabbitmqctl status`: the line following
+    "memory," is taken as the used amount and the "vm_memory_limit" line as
+    the limit. Usage above `critical`/`warning` percent raises the alert.
+    """
+    try:
+        results = RabbitCmdWrapper.status()
+        mem_used_raw = None
+        mem_limit_raw = None
+        for idx, val in enumerate(results):
+            if "memory," in str(val):
+                mem_used_raw = str(results[idx + 1])
+            if "vm_memory_limit" in str(val):
+                mem_limit_raw = str(val)
+        if mem_used_raw is None or mem_limit_raw is None:
+            raise ValueError("memory figures not found in rabbitmqctl status")
+        percent_usage = int(_digits(mem_used_raw) / _digits(mem_limit_raw) * 100)
+    except Exception as err:
+        nagios_exit(STATE_CRITICAL, "Critical - %s" % err)
+    if percent_usage > critical:
+        nagios_exit(STATE_CRITICAL,
+                    "CRITICAL - RABBITMQ RAM USAGE at %s%% of max" % percent_usage)
+    elif percent_usage > warning:
+        nagios_exit(STATE_WARNING,
+                    "WARNING - RABBITMQ RAM USAGE at %s%% of max" % percent_usage)
+    nagios_exit(STATE_OK,
+                "OK - RABBITMQ RAM USAGE OK at %s%% of max" % percent_usage)
+
+
+def check_aliveness(username, password, timeout, cluster):
+    """Run the management API aliveness test against `cluster`.
+
+    The API declares a test queue, then publishes and consumes a message;
+    it answers HTTP 200 when everything works. Any other answer, or a
+    failed request, is reported as CRITICAL.
+    """
+    try:
+        r = requests.get("http://%s:15672/api/aliveness-test/%%2F" % cluster,
+                         auth=(username, password), timeout=timeout)
+    except requests.exceptions.RequestException as e:  # RabbitMQ is down
+        nagios_exit(STATE_CRITICAL, "Critical - %s" % e)
+    if r.status_code == 200:
+        nagios_exit(STATE_OK, "OK - RABBITMQ Aliveness Test Returns: %s" % r)
+    nagios_exit(STATE_CRITICAL, "CRITICAL - RabbitMQ Error: %s" % r.content)
+
+
+def check_cluster(username, password, timeout, cluster):
+    """Check the cluster health through the management API /api/nodes.
+
+    One node not running is a WARNING, two or more are CRITICAL. A failed
+    request or an unexpected answer is CRITICAL as well.
+    """
+    try:
+        url = "http://%s:15672/api/nodes" % cluster
+        r = requests.get(url, auth=(username, password), timeout=timeout)
+    except requests.exceptions.RequestException as e:  # no response
+        nagios_exit(STATE_CRITICAL, "Critical - %s" % e)
+    if r.status_code != 200:
+        nagios_exit(STATE_CRITICAL, "CRITICAL - RabbitMQ Error: %s" % r.content)
+    try:
+        nodes = json.loads(r.text)
+        failed_nodes = [node['name'] for node in nodes if not node['running']]
+        running_nodes = [node['name'] for node in nodes if node['running']]
+    except (ValueError, KeyError, TypeError) as err:
+        nagios_exit(STATE_CRITICAL,
+                    "CRITICAL - unexpected /api/nodes answer: %s" % err)
+    if len(failed_nodes) == 1:
+        nagios_exit(STATE_WARNING,
+                    "WARNING: RabbitMQ cluster is degraged: Not running %s"
+                    % failed_nodes[0])
+    elif len(failed_nodes) >= 2:
+        nagios_exit(STATE_CRITICAL,
+                    "CRITICAL: RabbitMQ cluster is critical: Not running %s"
+                    % failed_nodes)
+    nagios_exit(STATE_OK,
+                "OK: RabbitMQ cluster members: %s" % (" ".join(running_nodes)))
+
+
+USAGE = """Usage: ./check_rabbitmq -a [action] -C [critical] -W [warning]
+    Actions:
+    - connection_count
+      checks the number of connection in rabbitmq's list_connections
+    - queues_count
+      checks the count in each of the queues in rabbitmq's list_queues
+    - mem_usage
+      checks to ensure mem usage of rabbitmq process does not exceed 50%
+    - aliveness
+      Use the /api/aliveness-test API to send/receive a message. (requires -u username -p password args)
+    - cluster_status
+      Parse /api/nodes to check the cluster status. (requires -u username -p password args)"""
+
+
+def main():
+    """Parse the command line and run the requested check."""
+    parser = OptionParser(USAGE)
+    parser.add_option("-a", "--action", dest="action",
+                      help="Action to Check")
+    parser.add_option("-C", "--critical", dest="critical",
+                      type="int", help="Critical Threshold")
+    parser.add_option("-W", "--warning", dest="warning",
+                      type="int", help="Warning Threshold")
+    parser.add_option("-u", "--username", dest="username", default="guest",
+                      type="string", help="RabbitMQ username, Default guest")
+    parser.add_option("-p", "--password", dest="password", default="guest",
+                      type="string", help="RabbitMQ password, Default guest")
+    parser.add_option("-t", "--timeout", dest="timeout", default=1,
+                      type="int", help="Request Timeout, defaults to 1 second")
+    parser.add_option("-c", "--cluster", dest="cluster", default="localhost",
+                      type="string", help="Cluster IP/DNS name, defaults to localhost")
+    (options, args) = parser.parse_args()
+
+    # Only forward the thresholds given on the command line, so that each
+    # check falls back to its own defaults instead of comparing with None.
+    thresholds = {}
+    if options.critical is not None:
+        thresholds["critical"] = options.critical
+    if options.warning is not None:
+        thresholds["warning"] = options.warning
+
+    if options.action == "connection_count":
+        check_connection_count(**thresholds)
+    elif options.action == "queues_count":
+        check_queues_count(**thresholds)
+    elif options.action == "mem_usage":
+        check_mem_usage(**thresholds)
+    elif options.action == "aliveness":
+        check_aliveness(options.username, options.password, options.timeout, options.cluster)
+    elif options.action == "cluster_status":
+        check_cluster(options.username, options.password, options.timeout, options.cluster)
+    else:
+        # Unknown action: report UNKNOWN rather than exiting 0 (OK).
+        print("Invalid action: %s" % options.action)
+        print(USAGE)
+        sys.exit(STATE_UNKNOWN)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/rabbitmq/handlers/main.yml b/rabbitmq/handlers/main.yml
index ee19e00e..4163ca25 100644
--- a/rabbitmq/handlers/main.yml
+++ b/rabbitmq/handlers/main.yml
@@ -4,3 +4,8 @@
     name: rabbitmq-server
     state: restarted
+
+- name: restart nagios-nrpe-server
+  service:
+    name: nagios-nrpe-server
+    state: restarted
 
diff --git a/rabbitmq/tasks/main.yml b/rabbitmq/tasks/main.yml
index 336af58e..0002ed26 100644
--- a/rabbitmq/tasks/main.yml
+++ b/rabbitmq/tasks/main.yml
@@ -27,3 +27,14 @@
   lineinfile:
     dest: /etc/default/rabbitmq-server
     line: ulimit -n 2048
+
+- name: is NRPE present ?
+  stat:
+    path: /etc/nagios/nrpe.d/evolix.cfg
+  check_mode: no
+  register: nrpe_evolix_config
+  tags:
+    - nrpe
+
+- include: nrpe.yml
+  when: nrpe_evolix_config.stat.exists
diff --git a/rabbitmq/tasks/nrpe.yml b/rabbitmq/tasks/nrpe.yml
new file mode 100644
index 00000000..e0efe8fb
--- /dev/null
+++ b/rabbitmq/tasks/nrpe.yml
@@ -0,0 +1,31 @@
+---
+
+- name: check_rabbitmq dependencies
+  apt:
+    name: python-requests
+    state: present
+
+# https://raw.githubusercontent.com/CaptPhunkosis/check_rabbitmq/master/check_rabbitmq
+- name: check_rabbitmq is installed
+  copy:
+    src: check_rabbitmq
+    dest: /usr/local/lib/nagios/plugins/check_rabbitmq
+    owner: root
+    group: root
+    mode: "0755"
+    force: yes
+
+- name: check_rabbitmq is available for NRPE
+  lineinfile:
+    dest: /etc/nagios/nrpe.d/evolix.cfg
+    regexp: 'command\[check_rab_connection_count\]'
+    line: 'command[check_rab_connection_count]=sudo /usr/local/lib/nagios/plugins/check_rabbitmq -a connection_count -C {{ rabbitmq_connections_critical }} -W {{ rabbitmq_connections_warning }}'
+  notify: restart nagios-nrpe-server
+
+- name: sudo without password for nagios
+  lineinfile:
+    dest: /etc/sudoers.d/evolinux
+    regexp: 'check_rabbitmq'
+    line: 'nagios ALL = NOPASSWD: /usr/local/lib/nagios/plugins/check_rabbitmq'
+    insertafter: '^nagios'
+    validate: "visudo -cf %s"