From e0ba847e9cead9b40d80d28fe93a81c14acb2855 Mon Sep 17 00:00:00 2001 From: William Hirigoyen Date: Wed, 14 Sep 2022 12:19:55 +0200 Subject: [PATCH] nagios-nrpe: upgrade check_mongo --- nagios-nrpe/files/plugins/check_mongodb | 653 ++++++++++++++++-------- 1 file changed, 436 insertions(+), 217 deletions(-) mode change 100755 => 100644 nagios-nrpe/files/plugins/check_mongodb diff --git a/nagios-nrpe/files/plugins/check_mongodb b/nagios-nrpe/files/plugins/check_mongodb old mode 100755 new mode 100644 index bc6278ac..cce3a76d --- a/nagios-nrpe/files/plugins/check_mongodb +++ b/nagios-nrpe/files/plugins/check_mongodb @@ -17,24 +17,29 @@ # - Dag Stockstad # - @Andor on github # - Steven Richards - Captainkrtek on github -# - Max Vernimmen +# - Max Vernimmen - @mvernimmen-CG / @mvernimmen on github +# - Kris Nova - @kris@nivenly.com github.com/kris-nova +# - Jan Kantert - firstname@lastname.net # # USAGE # # See the README.md # +from __future__ import print_function +from __future__ import division import sys import time import optparse -import textwrap import re import os +import numbers +import socket try: import pymongo -except ImportError, e: - print e +except ImportError as e: + print(e) sys.exit(2) # As of pymongo v 1.9 the SON API is part of the BSON package, therefore attempt @@ -78,37 +83,35 @@ def performance_data(perf_data, params): def numeric_type(param): - if ((type(param) == float or type(param) == int or param == None)): - return True - return False + return param is None or isinstance(param, numbers.Real) def check_levels(param, warning, critical, message, ok=[]): if (numeric_type(critical) and numeric_type(warning)): if param >= critical: - print "CRITICAL - " + message + print("CRITICAL - " + message) sys.exit(2) elif param >= warning: - print "WARNING - " + message + print("WARNING - " + message) sys.exit(1) else: - print "OK - " + message + print("OK - " + message) sys.exit(0) else: if param in critical: - print "CRITICAL - " + message + print("CRITICAL - " + message) sys.exit(2) if param in warning: - print "WARNING - " + message + print("WARNING - " + message) sys.exit(1) if param in ok: - print "OK - " + message + print("OK - " + message) sys.exit(0) # unexpected param value - print "CRITICAL - Unexpected value : %d" % param + "; " + message + print("CRITICAL - Unexpected value : %d" % param + "; " + message) return 2 @@ -120,21 +123,32 @@ def get_server_status(con): data = con.admin.command(son.SON([('serverStatus', 1)])) return data +def split_host_port(string): + if not string.rsplit(':', 1)[-1].isdigit(): + return (string, None) + string = string.rsplit(':', 1) + host = string[0] # 1st index is always host + port = int(string[1]) + return (host, port) + def main(argv): p = optparse.OptionParser(conflict_handler="resolve", description="This Nagios plugin checks the health of mongodb.") p.add_option('-H', '--host', action='store', type='string', dest='host', default='127.0.0.1', help='The hostname you want to connect to') - p.add_option('-P', '--port', action='store', type='int', dest='port', default=27017, help='The port mongodb is runnung on') + p.add_option('-h', '--host-to-check', action='store', type='string', dest='host_to_check', default=None, help='The hostname you want to check (if this is different from the host you are connecting)') + p.add_option('--rdns-lookup', action='store_true', dest='rdns_lookup', default=False, help='RDNS(PTR) lookup on given host/host-to-check, to convert ip-address to fqdn') + p.add_option('-P', '--port', action='store', type='int', dest='port', default=27017, help='The port mongodb is running on') + p.add_option('--port-to-check', action='store', type='int', dest='port_to_check', default=None, help='The port you want to check (if this is different from the port you are connecting)') p.add_option('-u', '--user', action='store', type='string', dest='user', default=None, help='The username you want to login as') p.add_option('-p', '--pass', action='store', type='string', dest='passwd', default=None, help='The password you want to use for that user') - p.add_option('-W', '--warning', action='store', dest='warning', default=None, help='The warning threshold we want to set') - p.add_option('-C', '--critical', action='store', dest='critical', default=None, help='The critical threshold we want to set') + p.add_option('-W', '--warning', action='store', dest='warning', default=None, help='The warning threshold you want to set') + p.add_option('-C', '--critical', action='store', dest='critical', default=None, help='The critical threshold you want to set') p.add_option('-A', '--action', action='store', type='choice', dest='action', default='connect', help='The action you want to take', choices=['connect', 'connections', 'replication_lag', 'replication_lag_percent', 'replset_state', 'memory', 'memory_mapped', 'lock', - 'flushing', 'last_flush_time', 'index_miss_ratio', 'databases', 'collections', 'database_size', 'database_indexes', 'collection_indexes', 'collection_size', - 'queues', 'oplog', 'journal_commits_in_wl', 'write_data_files', 'journaled', 'opcounters', 'current_lock', 'replica_primary', 'page_faults', - 'asserts', 'queries_per_second', 'page_faults', 'chunks_balance', 'connect_primary', 'collection_state', 'row_count', 'replset_quorum']) + 'flushing', 'last_flush_time', 'index_miss_ratio', 'databases', 'collections', 'database_size', 'database_indexes', 'collection_documents', 'collection_indexes', 'collection_size', + 'collection_storageSize', 'queues', 'oplog', 'journal_commits_in_wl', 'write_data_files', 'journaled', 'opcounters', 'current_lock', 'replica_primary', + 'page_faults', 'asserts', 'queries_per_second', 'page_faults', 'chunks_balance', 'connect_primary', 'collection_state', 'row_count', 'replset_quorum']) p.add_option('--max-lag', action='store_true', dest='max_lag', default=False, help='Get max replication lag (for replication_lag action only)') p.add_option('--mapped-memory', action='store_true', dest='mapped_memory', default=False, help='Get mapped memory instead of resident (if resident memory can not be read)') p.add_option('-D', '--perf-data', action='store_true', dest='perf_data', default=False, help='Enable output of Nagios performance data') @@ -145,12 +159,28 @@ def main(argv): p.add_option('-q', '--querytype', action='store', dest='query_type', default='query', help='The query type to check [query|insert|update|delete|getmore|command] from queries_per_second') p.add_option('-c', '--collection', action='store', dest='collection', default='admin', help='Specify the collection to check') p.add_option('-T', '--time', action='store', type='int', dest='sample_time', default=1, help='Time used to sample number of pages faults') + p.add_option('-M', '--mongoversion', action='store', type='choice', dest='mongo_version', default='2', help='The MongoDB version you are talking with, either 2 or 3', + choices=['2','3']) + p.add_option('-a', '--authdb', action='store', type='string', dest='authdb', default='admin', help='The database you want to authenticate against') + p.add_option('--insecure', action='store_true', dest='insecure', default=False, help="Don't verify SSL/TLS certificates") + p.add_option('--ssl-ca-cert-file', action='store', type='string', dest='ssl_ca_cert_file', default=None, help='Path to Certificate Authority file for SSL') + p.add_option('-f', '--ssl-cert-file', action='store', type='string', dest='cert_file', default=None, help='Path to PEM encoded key and cert for client authentication') + p.add_option('-m','--auth-mechanism', action='store', type='choice', dest='auth_mechanism', default=None, help='Auth mechanism used for auth with mongodb', + choices=['MONGODB-X509','SCRAM-SHA-256','SCRAM-SHA-1']) + p.add_option('--disable_retry_writes', dest='retry_writes_disabled', default=False, action='callback', callback=optional_arg(True), help='Disable retryWrites feature') options, arguments = p.parse_args() host = options.host + host_to_check = options.host_to_check if options.host_to_check else options.host + rdns_lookup = options.rdns_lookup + if (rdns_lookup): + host_to_check = socket.getnameinfo((host_to_check, 0), 0)[0] port = options.port + port_to_check = options.port_to_check if options.port_to_check else options.port user = options.user passwd = options.passwd + authdb = options.authdb + query_type = options.query_type collection = options.collection sample_time = options.sample_time @@ -164,9 +194,15 @@ def main(argv): action = options.action perf_data = options.perf_data max_lag = options.max_lag + mongo_version = options.mongo_version database = options.database ssl = options.ssl replicaset = options.replicaset + insecure = options.insecure + ssl_ca_cert_file = options.ssl_ca_cert_file + cert_file = options.cert_file + auth_mechanism = options.auth_mechanism + retry_writes_disabled = options.retry_writes_disabled if action == 'replica_primary' and replicaset is None: return "replicaset must be passed in when using replica_primary check" @@ -177,31 +213,35 @@ def main(argv): # moving the login up here and passing in the connection # start = time.time() - err, con = mongo_connect(host, port, ssl, user, passwd, replicaset) + err, con = mongo_connect(host, port, ssl, user, passwd, replicaset, authdb, insecure, ssl_ca_cert_file, cert_file, auth_mechanism, retry_writes_disabled=retry_writes_disabled) + if err != 0: + return err + + # Autodetect mongo-version and force pymongo to let us know if it can connect or not. + err, mongo_version = check_version(con) if err != 0: return err conn_time = time.time() - start - conn_time = round(conn_time, 0) if action == "connections": return check_connections(con, warning, critical, perf_data) elif action == "replication_lag": - return check_rep_lag(con, host, port, warning, critical, False, perf_data, max_lag, user, passwd) + return check_rep_lag(con, host_to_check, port_to_check, rdns_lookup, warning, critical, False, perf_data, max_lag, ssl, user, passwd, replicaset, authdb, insecure, ssl_ca_cert_file, cert_file, auth_mechanism, retry_writes_disabled=retry_writes_disabled) elif action == "replication_lag_percent": - return check_rep_lag(con, host, port, warning, critical, True, perf_data, max_lag, user, passwd) + return check_rep_lag(con, host_to_check, port_to_check, rdns_lookup, warning, critical, True, perf_data, max_lag, ssl, user, passwd, replicaset, authdb, insecure, ssl_ca_cert_file, cert_file, auth_mechanism, retry_writes_disabled=retry_writes_disabled) elif action == "replset_state": return check_replset_state(con, perf_data, warning, critical) elif action == "memory": - return check_memory(con, warning, critical, perf_data, options.mapped_memory) + return check_memory(con, warning, critical, perf_data, options.mapped_memory, host) elif action == "memory_mapped": return check_memory_mapped(con, warning, critical, perf_data) elif action == "queues": return check_queues(con, warning, critical, perf_data) elif action == "lock": - return check_lock(con, warning, critical, perf_data) + return check_lock(con, warning, critical, perf_data, mongo_version) elif action == "current_lock": - return check_current_lock(con, host, warning, critical, perf_data) + return check_current_lock(con, host, port, warning, critical, perf_data) elif action == "flushing": return check_flushing(con, warning, critical, True, perf_data) elif action == "last_flush_time": @@ -223,22 +263,26 @@ def main(argv): return check_database_size(con, database, warning, critical, perf_data) elif action == "database_indexes": return check_database_indexes(con, database, warning, critical, perf_data) + elif action == "collection_documents": + return check_collection_documents(con, database, collection, warning, critical, perf_data) elif action == "collection_indexes": return check_collection_indexes(con, database, collection, warning, critical, perf_data) elif action == "collection_size": return check_collection_size(con, database, collection, warning, critical, perf_data) + elif action == "collection_storageSize": + return check_collection_storageSize(con, database, collection, warning, critical, perf_data) elif action == "journaled": return check_journaled(con, warning, critical, perf_data) elif action == "write_data_files": return check_write_to_datafiles(con, warning, critical, perf_data) elif action == "opcounters": - return check_opcounters(con, host, warning, critical, perf_data) + return check_opcounters(con, host, port, warning, critical, perf_data) elif action == "asserts": - return check_asserts(con, host, warning, critical, perf_data) + return check_asserts(con, host, port, warning, critical, perf_data) elif action == "replica_primary": - return check_replica_primary(con, host, warning, critical, perf_data, replicaset) + return check_replica_primary(con, host, warning, critical, perf_data, replicaset, mongo_version) elif action == "queries_per_second": - return check_queries_per_second(con, query_type, warning, critical, perf_data) + return check_queries_per_second(con, query_type, warning, critical, perf_data, mongo_version) elif action == "page_faults": check_page_faults(con, sample_time, warning, critical, perf_data) elif action == "chunks_balance": @@ -255,30 +299,73 @@ def main(argv): return check_connect(host, port, warning, critical, perf_data, user, passwd, conn_time) -def mongo_connect(host=None, port=None, ssl=False, user=None, passwd=None, replica=None): +def mongo_connect(host=None, port=None, ssl=False, user=None, passwd=None, replica=None, authdb="admin", insecure=False, ssl_ca_cert_file=None, ssl_cert=None, auth_mechanism=None, retry_writes_disabled=False): + from pymongo.errors import ConnectionFailure + from pymongo.errors import PyMongoError + import ssl as SSL + + con_args = dict() + + if ssl: + if insecure: + con_args['ssl_cert_reqs'] = SSL.CERT_NONE + else: + con_args['ssl_cert_reqs'] = SSL.CERT_REQUIRED + con_args['ssl'] = ssl + if ssl_ca_cert_file: + con_args['ssl_ca_certs'] = ssl_ca_cert_file + if ssl_cert: + con_args['ssl_certfile'] = ssl_cert + + if retry_writes_disabled: + con_args['retryWrites'] = False + try: # ssl connection for pymongo > 2.3 if pymongo.version >= "2.3": if replica is None: - con = pymongo.MongoClient(host, port) + con = pymongo.MongoClient(host, port, **con_args) else: - con = pymongo.Connection(host, port, read_preference=pymongo.ReadPreference.SECONDARY, ssl=ssl, replicaSet=replica, network_timeout=10) + con = pymongo.MongoClient(host, port, read_preference=pymongo.ReadPreference.SECONDARY, replicaSet=replica, **con_args) else: if replica is None: con = pymongo.Connection(host, port, slave_okay=True, network_timeout=10) else: con = pymongo.Connection(host, port, slave_okay=True, network_timeout=10) - #con = pymongo.Connection(host, port, slave_okay=True, replicaSet=replica, network_timeout=10) + + # we must authenticate the connection, otherwise we won't be able to perform certain operations + if ssl_cert and ssl_ca_cert_file and user and auth_mechanism == 'SCRAM-SHA-256': + con.the_database.authenticate(user, mechanism='SCRAM-SHA-256') + elif ssl_cert and ssl_ca_cert_file and user and auth_mechanism == 'SCRAM-SHA-1': + con.the_database.authenticate(user, mechanism='SCRAM-SHA-1') + elif ssl_cert and ssl_ca_cert_file and user and auth_mechanism == 'MONGODB-X509': + con.the_database.authenticate(user, mechanism='MONGODB-X509') + + try: + result = con.admin.command("ismaster") + except ConnectionFailure: + print("CRITICAL - Connection to Mongo server on %s:%s has failed" % (host, port) ) + sys.exit(2) + + if 'arbiterOnly' in result and result['arbiterOnly'] == True: + print("OK - State: 7 (Arbiter on port %s)" % (port)) + sys.exit(0) if user and passwd: - db = con["admin"] - if not db.authenticate(user, passwd): + db = con[authdb] + try: + db.authenticate(user, password=passwd) + except PyMongoError: sys.exit("Username/Password incorrect") - except Exception, e: + + # Ping to check that the server is responding. + con.admin.command("ping") + + except Exception as e: if isinstance(e, pymongo.errors.AutoReconnect) and str(e).find(" is an arbiter") != -1: # We got a pymongo AutoReconnect exception that tells us we connected to an Arbiter Server # This means: Arbiter is reachable and can answer requests/votes - this is all we need to know from an arbiter - print "OK - State: 7 (Arbiter)" + print("OK - State: 7 (Arbiter)") sys.exit(0) return exit_with_general_critical(e), None return 0, con @@ -288,7 +375,7 @@ def exit_with_general_warning(e): if isinstance(e, SystemExit): return e else: - print "WARNING - General MongoDB warning:", e + print("WARNING - General MongoDB warning:", e) return 1 @@ -296,19 +383,27 @@ def exit_with_general_critical(e): if isinstance(e, SystemExit): return e else: - print "CRITICAL - General MongoDB Error:", e + print("CRITICAL - General MongoDB Error:", e) return 2 def set_read_preference(db): - if pymongo.version >= "2.1": + if pymongo.version >= "2.2": + pymongo.read_preferences.Secondary + else: db.read_preference = pymongo.ReadPreference.SECONDARY +def check_version(con): + try: + server_info = con.server_info() + except Exception as e: + return exit_with_general_critical(e), None + return 0, int(server_info['version'].split('.')[0].strip()) def check_connect(host, port, warning, critical, perf_data, user, passwd, conn_time): warning = warning or 3 critical = critical or 6 - message = "Connection took %i seconds" % conn_time + message = "Connection took %.3f seconds" % conn_time message += performance_data(perf_data, [(conn_time, "connection_time", warning, critical)]) return check_levels(conn_time, warning, critical, message) @@ -330,13 +425,17 @@ def check_connections(con, warning, critical, perf_data): (available, "available_connections")]) return check_levels(used_percent, warning, critical, message) - except Exception, e: + except Exception as e: return exit_with_general_critical(e) -def check_rep_lag(con, host, port, warning, critical, percent, perf_data, max_lag, user, passwd): +def check_rep_lag(con, host, port, rdns_lookup, warning, critical, percent, perf_data, max_lag, ssl=False, user=None, passwd=None, replicaset=None, authdb="admin", insecure=None, ssl_ca_cert_file=None, cert_file=None, auth_mechanism=None, retry_writes_disabled=False): # Get mongo to tell us replica set member name when connecting locally if "127.0.0.1" == host: + if not "me" in list(con.admin.command("ismaster","1").keys()): + print("UNKNOWN - This is not replicated MongoDB") + return 3 + host = con.admin.command("ismaster","1")["me"].split(':')[0] if percent: @@ -348,16 +447,15 @@ def check_rep_lag(con, host, port, warning, critical, percent, perf_data, max_la rs_status = {} slaveDelays = {} try: - set_read_preference(con.admin) + #set_read_preference(con.admin) # Get replica set status try: rs_status = con.admin.command("replSetGetStatus") - except pymongo.errors.OperationFailure, e: - if e.code == None and str(e).find('failed: not running with --replSet"'): - print "OK - Not running with replSet" - return 0 - + except pymongo.errors.OperationFailure as e: + if ((e.code == None and str(e).find('failed: not running with --replSet"')) or (e.code == 76 and str(e).find('not running with --replSet"'))): + print("UNKNOWN - Not running with replSet") + return 3 serverVersion = tuple(con.server_info()['version'].split('.')) if serverVersion >= tuple("2.0.0".split(".")): # @@ -377,24 +475,32 @@ def check_rep_lag(con, host, port, warning, critical, percent, perf_data, max_la for member in rs_status["members"]: if member["stateStr"] == "PRIMARY": primary_node = member - if member["name"].split(':')[0] == host and int(member["name"].split(':')[1]) == port: + + # if rdns_lookup is true then lookup both values back to their rdns value so we can compare hostname vs fqdn + if rdns_lookup: + member_host, member_port = split_host_port(member.get('name')) + member_host = "{0}:{1}".format(socket.getnameinfo((member_host, 0), 0)[0], member_port) + if member_host == "{0}:{1}".format(socket.getnameinfo((host, 0), 0)[0], port): + host_node = member + # Exact match + elif member.get('name') == "{0}:{1}".format(host, port): host_node = member # Check if we're in the middle of an election and don't have a primary if primary_node is None: - print "WARNING - No primary defined. In an election?" + print("WARNING - No primary defined. In an election?") return 1 # Check if we failed to find the current host # below should never happen if host_node is None: - print "CRITICAL - Unable to find host '" + host + "' in replica set." + print("CRITICAL - Unable to find host '" + host + "' in replica set.") return 2 # Is the specified host the primary? if host_node["stateStr"] == "PRIMARY": if max_lag == False: - print "OK - This is the primary." + print("OK - This is the primary.") return 0 else: #get the maximal replication lag @@ -407,7 +513,7 @@ def check_rep_lag(con, host, port, warning, critical, percent, perf_data, max_la data = data + member['name'] + " lag=%d;" % replicationLag maximal_lag = max(maximal_lag, replicationLag) if percent: - err, con = mongo_connect(primary_node['name'].split(':')[0], int(primary_node['name'].split(':')[1]), False, user, passwd) + err, con = mongo_connect(split_host_port(primary_node['name'])[0], int(split_host_port(primary_node['name'])[1]), ssl, user, passwd, replicaset, authdb, insecure, ssl_ca_cert_file, cert_file, auth_mechanism, retry_writes_disabled=retry_writes_disabled) if err != 0: return err primary_timediff = replication_get_time_diff(con) @@ -419,8 +525,8 @@ def check_rep_lag(con, host, port, warning, critical, percent, perf_data, max_la message += performance_data(perf_data, [(maximal_lag, "replication_lag", warning, critical)]) return check_levels(maximal_lag, warning, critical, message) elif host_node["stateStr"] == "ARBITER": - print "OK - This is an arbiter" - return 0 + print("UNKNOWN - This is an arbiter") + return 3 # Find the difference in optime between current node and PRIMARY @@ -439,7 +545,7 @@ def check_rep_lag(con, host, port, warning, critical, percent, perf_data, max_la lag = float(optime_lag.seconds + optime_lag.days * 24 * 3600) if percent: - err, con = mongo_connect(primary_node['name'].split(':')[0], int(primary_node['name'].split(':')[1]), False, user, passwd) + err, con = mongo_connect(split_host_port(primary_node['name'])[0], int(split_host_port(primary_node['name'])[1]), ssl, user, passwd, replicaset, authdb, insecure, ssl_ca_cert_file, cert_file, auth_mechanism, retry_writes_disabled=retry_writes_disabled) if err != 0: return err primary_timediff = replication_get_time_diff(con) @@ -471,19 +577,19 @@ def check_rep_lag(con, host, port, warning, critical, percent, perf_data, max_la # Check if we're in the middle of an election and don't have a primary if primary_node is None: - print "WARNING - No primary defined. In an election?" + print("WARNING - No primary defined. In an election?") sys.exit(1) # Is the specified host the primary? if host_node["stateStr"] == "PRIMARY": - print "OK - This is the primary." + print("OK - This is the primary.") sys.exit(0) # Find the difference in optime between current node and PRIMARY optime_lag = abs(primary_node[1] - host_node["optimeDate"]) lag = optime_lag.seconds if percent: - err, con = mongo_connect(primary_node['name'].split(':')[0], int(primary_node['name'].split(':')[1])) + err, con = mongo_connect(split_host_port(primary_node['name'])[0], int(split_host_port(primary_node['name'])[1]), ssl, user, passwd, replicaset, authdb, insecure, ssl_ca_cert_file, cert_file, auth_mechanism, retry_writes_disabled=retry_writes_disabled) if err != 0: return err primary_timediff = replication_get_time_diff(con) @@ -495,26 +601,34 @@ def check_rep_lag(con, host, port, warning, critical, percent, perf_data, max_la message += performance_data(perf_data, [(lag, "replication_lag", warning, critical)]) return check_levels(lag, warning, critical, message) - except Exception, e: + except Exception as e: return exit_with_general_critical(e) - -def check_memory(con, warning, critical, perf_data, mapped_memory): - # - # These thresholds are basically meaningless, and must be customized to your system's ram - # - - # Get the total system merory and calculate based on that how much memory used by Mongodb is ok or not. +# +# Check the memory usage of mongo. Alerting on this may be hard to get right +# because it'll try to get as much memory as it can. And that's probably +# a good thing. +# +def check_memory(con, warning, critical, perf_data, mapped_memory, host): + # Get the total system memory of this system (This is totally bogus if you + # are running this command remotely) and calculate based on that how much + # memory used by Mongodb is ok or not. meminfo = open('/proc/meminfo').read() matched = re.search(r'^MemTotal:\s+(\d+)', meminfo) - if matched: + if matched: mem_total_kB = int(matched.groups()[0]) - # Old way - #critical = critical or 16 - # The new way. if using >80% then warn, if >90% then critical level - warning = warning or (mem_total_kB * 0.8) / 1024.0 / 1024.0 - critical = critical or (mem_total_kB * 0.9) / 1024.0 / 1024.0 + if host != "127.0.0.1" and not warning: + # Running remotely and value was not set by user, use hardcoded value + warning = 12 + else: + # running locally or user provided value + warning = warning or (mem_total_kB * 0.8) / 1024.0 / 1024.0 + + if host != "127.0.0.1" and not critical: + critical = 16 + else: + critical = critical or (mem_total_kB * 0.9) / 1024.0 / 1024.0 # debugging #print "mem total: {0}kb, warn: {1}GB, crit: {2}GB".format(mem_total_kB,warning, critical) @@ -522,7 +636,7 @@ def check_memory(con, warning, critical, perf_data, mapped_memory): try: data = get_server_status(con) if not data['mem']['supported'] and not mapped_memory: - print "OK - Platform not supported for memory info" + print("OK - Platform not supported for memory info") return 0 # # convert to gigs @@ -559,7 +673,7 @@ def check_memory(con, warning, critical, perf_data, mapped_memory): else: return check_levels(mem_resident, warning, critical, message) - except Exception, e: + except Exception as e: return exit_with_general_critical(e) @@ -572,7 +686,7 @@ def check_memory_mapped(con, warning, critical, perf_data): try: data = get_server_status(con) if not data['mem']['supported']: - print "OK - Platform not supported for memory info" + print("OK - Platform not supported for memory info") return 0 # # convert to gigs @@ -589,38 +703,45 @@ def check_memory_mapped(con, warning, critical, perf_data): message += " %.2fGB mappedWithJournal" % mem_mapped_journal except: mem_mapped_journal = 0 - message += performance_data(perf_data, [("%.2f" % mem_mapped, "memory_mapped"), ("%.2f" % mem_mapped_journal, "mappedWithJournal")]) + message += performance_data(perf_data, [("%.2f" % mem_mapped, "memory_mapped", warning, critical), ("%.2f" % mem_mapped_journal, "mappedWithJournal")]) if not mem_mapped == -1: return check_levels(mem_mapped, warning, critical, message) else: - print "OK - Server does not provide mem.mapped info" + print("OK - Server does not provide mem.mapped info") return 0 - except Exception, e: + except Exception as e: return exit_with_general_critical(e) -def check_lock(con, warning, critical, perf_data): +# +# Return the percentage of the time there was a global Lock +# +def check_lock(con, warning, critical, perf_data, mongo_version): warning = warning or 10 critical = critical or 30 - try: - data = get_server_status(con) - # - # calculate percentage - # - lockTime = data['globalLock']['lockTime'] - totalTime = data['globalLock']['totalTime'] - if lockTime > totalTime: - lock_percentage = 0.00 - else: - lock_percentage = float(lockTime) / float(totalTime) * 100 - message = "Lock Percentage: %.2f%%" % lock_percentage - message += performance_data(perf_data, [("%.2f" % lock_percentage, "lock_percentage", warning, critical)]) - return check_levels(lock_percentage, warning, critical, message) - - except Exception, e: - return exit_with_general_critical(e) + if mongo_version == 2: + try: + data = get_server_status(con) + lockTime = data['globalLock']['lockTime'] + totalTime = data['globalLock']['totalTime'] + # + # calculate percentage + # + if lockTime > totalTime: + lock_percentage = 0.00 + else: + lock_percentage = float(lockTime) / float(totalTime) * 100 + message = "Lock Percentage: %.2f%%" % lock_percentage + message += performance_data(perf_data, [("%.2f" % lock_percentage, "lock_percentage", warning, critical)]) + return check_levels(lock_percentage, warning, critical, message) + except Exception as e: + print("Couldn't get globalLock lockTime info from mongo, are you sure you're not using version 3? See the -M option.") + return exit_with_general_critical(e) + else: + print("OK - MongoDB version 3 doesn't report on global locks") + return 0 def check_flushing(con, warning, critical, avg, perf_data): @@ -632,19 +753,24 @@ def check_flushing(con, warning, critical, avg, perf_data): critical = critical or 15000 try: data = get_server_status(con) - if avg: - flush_time = float(data['backgroundFlushing']['average_ms']) - stat_type = "Average" - else: - flush_time = float(data['backgroundFlushing']['last_ms']) - stat_type = "Last" + try: + data['backgroundFlushing'] + if avg: + flush_time = float(data['backgroundFlushing']['average_ms']) + stat_type = "Average" + else: + flush_time = float(data['backgroundFlushing']['last_ms']) + stat_type = "Last" - message = "%s Flush Time: %.2fms" % (stat_type, flush_time) - message += performance_data(perf_data, [("%.2fms" % flush_time, "%s_flush_time" % stat_type.lower(), warning, critical)]) + message = "%s Flush Time: %.2fms" % (stat_type, flush_time) + message += performance_data(perf_data, [("%.2fms" % flush_time, "%s_flush_time" % stat_type.lower(), warning, critical)]) - return check_levels(flush_time, warning, critical, message) + return check_levels(flush_time, warning, critical, message) + except Exception: + print("OK - flushing stats not available for this storage engine") + return 0 - except Exception, e: + except Exception as e: return exit_with_general_critical(e) @@ -655,6 +781,7 @@ def index_miss_ratio(con, warning, critical, perf_data): data = get_server_status(con) try: + data['indexCounters'] serverVersion = tuple(con.server_info()['version'].split('.')) if serverVersion >= tuple("2.4.0".split(".")): miss_ratio = float(data['indexCounters']['missRatio']) @@ -662,19 +789,24 @@ def index_miss_ratio(con, warning, critical, perf_data): miss_ratio = float(data['indexCounters']['btree']['missRatio']) except KeyError: not_supported_msg = "not supported on this platform" - if data['indexCounters'].has_key('note'): - print "OK - MongoDB says: " + not_supported_msg + try: + data['indexCounters'] + if 'note' in data['indexCounters']: + print("OK - MongoDB says: " + not_supported_msg) + return 0 + else: + print("WARNING - Can't get counter from MongoDB") + return 1 + except Exception: + print("OK - MongoDB says: " + not_supported_msg) return 0 - else: - print "WARNING - Can't get counter from MongoDB" - return 1 message = "Miss Ratio: %.2f" % miss_ratio message += performance_data(perf_data, [("%.2f" % miss_ratio, "index_miss_ratio", warning, critical)]) return check_levels(miss_ratio, warning, critical, message) - except Exception, e: + except Exception as e: return exit_with_general_critical(e) def check_replset_quorum(con, perf_data): @@ -698,7 +830,7 @@ def check_replset_quorum(con, perf_data): message = "Cluster is not quorate and cannot operate" return check_levels(state, warning, critical, message) - except Exception, e: + except Exception as e: return exit_with_general_critical(e) @@ -713,44 +845,63 @@ def check_replset_state(con, perf_data, warning="", critical=""): except: critical = [8, 4, -1] - ok = range(-1, 8) # should include the range of all posiible values + ok = list(range(-1, 8)) # should include the range of all posiible values try: + worst_state = -2 + message = "" try: try: set_read_preference(con.admin) data = con.admin.command(pymongo.son_manipulator.SON([('replSetGetStatus', 1)])) except: data = con.admin.command(son.SON([('replSetGetStatus', 1)])) - state = int(data['myState']) - except pymongo.errors.OperationFailure, e: - if e.code == None and str(e).find('failed: not running with --replSet"'): - state = -1 + members = data['members'] + my_state = int(data['myState']) + worst_state = my_state + for member in members: + their_state = int(member['state']) + message += " %s: %i (%s)" % (member['name'], their_state, state_text(their_state)) + if state_is_worse(their_state, worst_state, warning, critical): + worst_state = their_state + message += performance_data(perf_data, [(my_state, "state")]) - if state == 8: - message = "State: %i (Down)" % state - elif state == 4: - message = "State: %i (Fatal error)" % state - elif state == 0: - message = "State: %i (Starting up, phase1)" % state - elif state == 3: - message = "State: %i (Recovering)" % state - elif state == 5: - message = "State: %i (Starting up, phase2)" % state - elif state == 1: - message = "State: %i (Primary)" % state - elif state == 2: - message = "State: %i (Secondary)" % state - elif state == 7: - message = "State: %i (Arbiter)" % state - elif state == -1: - message = "Not running with replSet" - else: - message = "State: %i (Unknown state)" % state - message += performance_data(perf_data, [(state, "state")]) - return check_levels(state, warning, critical, message, ok) - except Exception, e: + except pymongo.errors.OperationFailure as e: + if ((e.code == None and str(e).find('failed: not running with --replSet"')) or (e.code == 76 and str(e).find('not running with --replSet"'))): + worst_state = -1 + + return check_levels(worst_state, warning, critical, message, ok) + except Exception as e: return exit_with_general_critical(e) +def state_is_worse(state, worst_state, warning, critical): + if worst_state in critical: + return False + if worst_state in warning: + return state in critical + return (state in warning) or (state in critical) + +def state_text(state): + if state == 8: + return "Down" + elif state == 4: + return "Fatal error" + elif state == 0: + return "Starting up, phase1" + elif state == 3: + return "Recovering" + elif state == 5: + return "Starting up, phase2" + elif state == 1: + return "Primary" + elif state == 2: + return "Secondary" + elif state == 7: + return "Arbiter" + elif state == -1: + return "Not running with replSet" + else: + return "Unknown state" + def check_databases(con, warning, critical, perf_data=None): try: @@ -764,7 +915,7 @@ def check_databases(con, warning, critical, perf_data=None): message = "Number of DBs: %.0f" % count message += performance_data(perf_data, [(count, "databases", warning, critical, message)]) return check_levels(count, warning, critical, message) - except Exception, e: + except Exception as e: return exit_with_general_critical(e) @@ -786,7 +937,7 @@ def check_collections(con, warning, critical, perf_data=None): message += performance_data(perf_data, [(count, "collections", warning, critical, message)]) return check_levels(count, warning, critical, message) - except Exception, e: + except Exception as e: return exit_with_general_critical(e) @@ -823,21 +974,21 @@ def check_database_size(con, database, warning, critical, perf_data): try: set_read_preference(con.admin) data = con[database].command('dbstats') - storage_size = data['storageSize'] / 1024 / 1024 + storage_size = data['storageSize'] // 1024 // 1024 if perf_data: perfdata += " | database_size=%i;%i;%i" % (storage_size, warning, critical) #perfdata += " database=%s" %(database) if storage_size >= critical: - print "CRITICAL - Database size: %.0f MB, Database: %s%s" % (storage_size, database, perfdata) + print("CRITICAL - Database size: %.0f MB, Database: %s%s" % (storage_size, database, perfdata)) return 2 elif storage_size >= warning: - print "WARNING - Database size: %.0f MB, Database: %s%s" % (storage_size, database, perfdata) + print("WARNING - Database size: %.0f MB, Database: %s%s" % (storage_size, database, perfdata)) return 1 else: - print "OK - Database size: %.0f MB, Database: %s%s" % (storage_size, database, perfdata) + print("OK - Database size: %.0f MB, Database: %s%s" % (storage_size, database, perfdata)) return 0 - except Exception, e: + except Exception as e: return exit_with_general_critical(e) @@ -851,20 +1002,42 @@ def check_database_indexes(con, database, warning, critical, perf_data): try: set_read_preference(con.admin) data = con[database].command('dbstats') - index_size = data['indexSize'] / 1024 / 1024 + index_size = data['indexSize'] / 1024 // 1024 if perf_data: perfdata += " | database_indexes=%i;%i;%i" % (index_size, warning, critical) if index_size >= critical: - print "CRITICAL - %s indexSize: %.0f MB %s" % (database, index_size, perfdata) + print("CRITICAL - %s indexSize: %.0f MB %s" % (database, index_size, perfdata)) return 2 elif index_size >= warning: - print "WARNING - %s indexSize: %.0f MB %s" % (database, index_size, perfdata) + print("WARNING - %s indexSize: %.0f MB %s" % (database, index_size, perfdata)) return 1 else: - print "OK - %s indexSize: %.0f MB %s" % (database, index_size, perfdata) + print("OK - %s indexSize: %.0f MB %s" % (database, index_size, perfdata)) return 0 - except Exception, e: + except Exception as e: + return exit_with_general_critical(e) + + +def check_collection_documents(con, database, collection, warning, critical, perf_data): + perfdata = "" + try: + set_read_preference(con.admin) + data = con[database].command('collstats', collection) + documents = data['count'] + if perf_data: + perfdata += " | collection_documents=%i;%i;%i" % (documents, warning, critical) + + if documents >= critical: + print("CRITICAL - %s.%s documents: %s %s" % (database, collection, documents, perfdata)) + return 2 + elif documents >= warning: + print("WARNING - %s.%s documents: %s %s" % (database, collection, documents, perfdata)) + return 1 + else: + print("OK - %s.%s documents: %s %s" % (database, collection, documents, perfdata)) + return 0 + except Exception as e: return exit_with_general_critical(e) @@ -883,15 +1056,15 @@ def check_collection_indexes(con, database, collection, warning, critical, perf_ perfdata += " | collection_indexes=%i;%i;%i" % (total_index_size, warning, critical) if total_index_size >= critical: - print "CRITICAL - %s.%s totalIndexSize: %.0f MB %s" % (database, collection, total_index_size, perfdata) + print("CRITICAL - %s.%s totalIndexSize: %.0f MB %s" % (database, collection, total_index_size, perfdata)) return 2 elif total_index_size >= warning: - print "WARNING - %s.%s totalIndexSize: %.0f MB %s" % (database, collection, total_index_size, perfdata) + print("WARNING - %s.%s totalIndexSize: %.0f MB %s" % (database, collection, total_index_size, perfdata)) return 1 else: - print "OK - %s.%s totalIndexSize: %.0f MB %s" % (database, collection, total_index_size, perfdata) + print("OK - %s.%s totalIndexSize: %.0f MB %s" % (database, collection, total_index_size, perfdata)) return 0 - except Exception, e: + except Exception as e: return exit_with_general_critical(e) @@ -908,7 +1081,7 @@ def check_queues(con, warning, critical, perf_data): message += performance_data(perf_data, [(total_queues, "total_queues", warning, critical), (readers_queues, "readers_queues"), (writers_queues, "writers_queues")]) return check_levels(total_queues, warning, critical, message) - except Exception, e: + except Exception as e: return exit_with_general_critical(e) def check_collection_size(con, database, collection, warning, critical, perf_data): @@ -923,18 +1096,43 @@ def check_collection_size(con, database, collection, warning, critical, perf_dat perfdata += " | collection_size=%i;%i;%i" % (size, warning, critical) if size >= critical: - print "CRITICAL - %s.%s size: %.0f MB %s" % (database, collection, size, perfdata) + print("CRITICAL - %s.%s size: %.0f MB %s" % (database, collection, size, perfdata)) return 2 elif size >= warning: - print "WARNING - %s.%s size: %.0f MB %s" % (database, collection, size, perfdata) + print("WARNING - %s.%s size: %.0f MB %s" % (database, collection, size, perfdata)) return 1 else: - print "OK - %s.%s size: %.0f MB %s" % (database, collection, size, perfdata) + print("OK - %s.%s size: %.0f MB %s" % (database, collection, size, perfdata)) return 0 - except Exception, e: + except Exception as e: return exit_with_general_critical(e) -def check_queries_per_second(con, query_type, warning, critical, perf_data): + +def check_collection_storageSize(con, database, collection, warning, critical, perf_data): + warning = warning or 100 + critical = critical or 1000 + perfdata = "" + try: + set_read_preference(con.admin) + data = con[database].command('collstats', collection) + storageSize = data['storageSize'] / 1024 / 1024 + if perf_data: + perfdata += " | collection_storageSize=%i;%i;%i" % (storageSize, warning, critical) + + if storageSize >= critical: + print("CRITICAL - %s.%s storageSize: %.0f MB %s" % (database, collection, storageSize, perfdata)) + return 2 + elif storageSize >= warning: + print("WARNING - %s.%s storageSize: %.0f MB %s" % (database, collection, storageSize, perfdata)) + return 1 + else: + print("OK - %s.%s storageSize: %.0f MB %s" % (database, collection, storageSize, perfdata)) + return 0 + except Exception as e: + return exit_with_general_critical(e) + + +def check_queries_per_second(con, query_type, warning, critical, perf_data, mongo_version): warning = warning or 250 critical = critical or 500 @@ -955,10 +1153,17 @@ def check_queries_per_second(con, query_type, warning, critical, perf_data): diff_query = num - last_count['data'][query_type]['count'] diff_ts = ts - last_count['data'][query_type]['ts'] + if diff_ts == 0: + message = "diff_query = " + str(diff_query) + " diff_ts = " + str(diff_ts) + return check_levels(0, warning, critical, message) + query_per_sec = float(diff_query) / float(diff_ts) # update the count now - db.nagios_check.update({u'_id': last_count['_id']}, {'$set': {"data.%s" % query_type: {'count': num, 'ts': int(time.time())}}}) + if mongo_version == 2: + db.nagios_check.update({u'_id': last_count['_id']}, {'$set': {"data.%s" % query_type: {'count': num, 'ts': int(time.time())}}}) + else: + db.nagios_check.update_one({u'_id': last_count['_id']}, {'$set': {"data.%s" % query_type: {'count': num, 'ts': int(time.time())}}}) message = "Queries / Sec: %f" % query_per_sec message += performance_data(perf_data, [(query_per_sec, "%s_per_sec" % query_type, warning, critical, message)]) @@ -967,17 +1172,24 @@ def check_queries_per_second(con, query_type, warning, critical, perf_data): # since it is the first run insert it query_per_sec = 0 message = "First run of check.. no data" - db.nagios_check.update({u'_id': last_count['_id']}, {'$set': {"data.%s" % query_type: {'count': num, 'ts': int(time.time())}}}) + if mongo_version == 2: + db.nagios_check.update({u'_id': last_count['_id']}, {'$set': {"data.%s" % query_type: {'count': num, 'ts': int(time.time())}}}) + else: + db.nagios_check.update_one({u'_id': last_count['_id']}, {'$set': {"data.%s" % query_type: {'count': num, 'ts': int(time.time())}}}) + except TypeError: # # since it is the first run insert it query_per_sec = 0 message = "First run of check.. no data" - db.nagios_check.insert({'check': 'query_counts', 'data': {query_type: {'count': num, 'ts': int(time.time())}}}) + if mongo_version == 2: + db.nagios_check.insert({'check': 'query_counts', 'data': {query_type: {'count': num, 'ts': int(time.time())}}}) + else: + db.nagios_check.insert_one({'check': 'query_counts', 'data': {query_type: {'count': num, 'ts': int(time.time())}}}) return check_levels(query_per_sec, warning, critical, message) - except Exception, e: + except Exception as e: return exit_with_general_critical(e) @@ -1024,7 +1236,7 @@ def check_oplog(con, warning, critical, perf_data): message += performance_data(perf_data, [("%.2f" % hours_in_oplog, 'oplog_time', warning, critical), ("%.2f " % approx_level, 'oplog_time_100_percent_used')]) return check_levels(-approx_level, -warning, -critical, message) - except Exception, e: + except Exception as e: return exit_with_general_critical(e) @@ -1042,7 +1254,7 @@ Under very high write situations it is normal for this value to be nonzero. """ message += performance_data(perf_data, [(j_commits_in_wl, "j_commits_in_wl", warning, critical)]) return check_levels(j_commits_in_wl, warning, critical, message) - except Exception, e: + except Exception as e: return exit_with_general_critical(e) @@ -1058,7 +1270,7 @@ def check_journaled(con, warning, critical, perf_data): message += performance_data(perf_data, [("%.2f" % journaled, "journaled", warning, critical)]) return check_levels(journaled, warning, critical, message) - except Exception, e: + except Exception as e: return exit_with_general_critical(e) @@ -1075,11 +1287,11 @@ than the amount physically written to disk.""" message += performance_data(perf_data, [("%.2f" % writes, "write_to_data_files", warning, critical)]) return check_levels(writes, warning, critical, message) - except Exception, e: + except Exception as e: return exit_with_general_critical(e) -def get_opcounters(data, opcounters_name, host): +def get_opcounters(data, opcounters_name, host, port): try: insert = data[opcounters_name]['insert'] query = data[opcounters_name]['query'] @@ -1087,21 +1299,21 @@ def get_opcounters(data, opcounters_name, host): delete = data[opcounters_name]['delete'] getmore = data[opcounters_name]['getmore'] command = data[opcounters_name]['command'] - except KeyError, e: + except KeyError as e: return 0, [0] * 100 total_commands = insert + query + update + delete + getmore + command new_vals = [total_commands, insert, query, update, delete, getmore, command] - return maintain_delta(new_vals, host, opcounters_name) + return maintain_delta(new_vals, host, port, opcounters_name) -def check_opcounters(con, host, warning, critical, perf_data): +def check_opcounters(con, host, port, warning, critical, perf_data): """ A function to get all opcounters delta per minute. In case of a replication - gets the opcounters+opcountersRepl""" warning = warning or 10000 critical = critical or 15000 data = get_server_status(con) - err1, delta_opcounters = get_opcounters(data, 'opcounters', host) - err2, delta_opcounters_repl = get_opcounters(data, 'opcountersRepl', host) + err1, delta_opcounters = get_opcounters(data, 'opcounters', host, port) + err2, delta_opcounters_repl = get_opcounters(data, 'opcountersRepl', host, port) if err1 == 0 and err2 == 0: delta = [(x + y) for x, y in zip(delta_opcounters, delta_opcounters_repl)] delta[0] = delta_opcounters[0] # only the time delta shouldn't be summarized @@ -1109,14 +1321,14 @@ def check_opcounters(con, host, warning, critical, perf_data): message = "Test succeeded , old values missing" message = "Opcounters: total=%d,insert=%d,query=%d,update=%d,delete=%d,getmore=%d,command=%d" % tuple(per_minute_delta) message += performance_data(perf_data, ([(per_minute_delta[0], "total", warning, critical), (per_minute_delta[1], "insert"), - (per_minute_delta[2], "query"), (per_minute_delta[3], "update"), (per_minute_delta[5], "delete"), + (per_minute_delta[2], "query"), (per_minute_delta[3], "update"), (per_minute_delta[4], "delete"), (per_minute_delta[5], "getmore"), (per_minute_delta[6], "command")])) return check_levels(per_minute_delta[0], warning, critical, message) else: return exit_with_general_critical("problem reading data from temp file") -def check_current_lock(con, host, warning, critical, perf_data): +def check_current_lock(con, host, port, warning, critical, perf_data): """ A function to get current lock percentage and not a global one, as check_lock function does""" warning = warning or 10 critical = critical or 30 @@ -1125,7 +1337,7 @@ def check_current_lock(con, host, warning, critical, perf_data): lockTime = float(data['globalLock']['lockTime']) totalTime = float(data['globalLock']['totalTime']) - err, delta = maintain_delta([totalTime, lockTime], host, "locktime") + err, delta = maintain_delta([totalTime, lockTime], host, port, "locktime") if err == 0: lock_percentage = delta[2] / delta[1] * 100 # lockTime/totalTime*100 message = "Current Lock Percentage: %.2f%%" % lock_percentage @@ -1135,7 +1347,7 @@ def check_current_lock(con, host, warning, critical, perf_data): return exit_with_general_warning("problem reading data from temp file") -def check_page_faults(con, host, warning, critical, perf_data): +def check_page_faults(con, host, port, warning, critical, perf_data): """ A function to get page_faults per second from the system""" warning = warning or 10 critical = critical or 30 @@ -1147,7 +1359,7 @@ def check_page_faults(con, host, warning, critical, perf_data): # page_faults unsupported on the underlaying system return exit_with_general_critical("page_faults unsupported on the underlaying system") - err, delta = maintain_delta([page_faults], host, "page_faults") + err, delta = maintain_delta([page_faults], host, port, "page_faults") if err == 0: page_faults_ps = delta[1] / delta[0] message = "Page faults : %.2f ps" % page_faults_ps @@ -1157,7 +1369,7 @@ def check_page_faults(con, host, warning, critical, perf_data): return exit_with_general_warning("problem reading data from temp file") -def check_asserts(con, host, warning, critical, perf_data): +def check_asserts(con, host, port, warning, critical, perf_data): """ A function to get asserts from the system""" warning = warning or 1 critical = critical or 10 @@ -1172,7 +1384,7 @@ def check_asserts(con, host, warning, critical, perf_data): user = asserts['user'] rollovers = asserts['rollovers'] - err, delta = maintain_delta([regular, warning_asserts, msg, user, rollovers], host, "asserts") + err, delta = maintain_delta([regular, warning_asserts, msg, user, rollovers], host, port, "asserts") if err == 0: if delta[5] != 0: @@ -1206,7 +1418,7 @@ def get_stored_primary_server_name(db): return stored_primary_server -def check_replica_primary(con, host, warning, critical, perf_data, replicaset): +def check_replica_primary(con, host, warning, critical, perf_data, replicaset, mongo_version): """ A function to check if the primary server of a replica set has changed """ if warning is None and critical is None: warning = 1 @@ -1229,7 +1441,10 @@ def check_replica_primary(con, host, warning, critical, perf_data, replicaset): saved_primary = "None" if current_primary != saved_primary: last_primary_server_record = {"server": current_primary} - db.last_primary_server.update({"_id": "last_primary"}, {"$set": last_primary_server_record}, upsert=True, safe=True) + if mongo_version == 2: + db.last_primary_server.update({"_id": "last_primary"}, {"$set": last_primary_server_record}, upsert=True) + else: + db.last_primary_server.update_one({"_id": "last_primary"}, {"$set": last_primary_server_record}, upsert=True) message = "Primary server has changed from %s to %s" % (saved_primary, current_primary) primary_status = 1 return check_levels(primary_status, warning, critical, message) @@ -1251,9 +1466,9 @@ def check_page_faults(con, sample_time, warning, critical, perf_data): try: #on linux servers only - page_faults = (int(data2['extra_info']['page_faults']) - int(data1['extra_info']['page_faults'])) / sample_time + page_faults = (int(data2['extra_info']['page_faults']) - int(data1['extra_info']['page_faults'])) // sample_time except KeyError: - print "WARNING - Can't get extra_info.page_faults counter from MongoDB" + print("WARNING - Can't get extra_info.page_faults counter from MongoDB") sys.exit(1) message = "Page Faults: %i" % (page_faults) @@ -1261,7 +1476,7 @@ def check_page_faults(con, sample_time, warning, critical, perf_data): message += performance_data(perf_data, [(page_faults, "page_faults", warning, critical)]) check_levels(page_faults, warning, critical, message) - except Exception, e: + except Exception as e: exit_with_general_critical(e) @@ -1277,35 +1492,35 @@ def chunks_balance(con, database, collection, warning, critical): shards = col.distinct("shard") except: - print "WARNING - Can't get chunks infos from MongoDB" + print("WARNING - Can't get chunks infos from MongoDB") sys.exit(1) if nscount == 0: - print "WARNING - Namespace %s is not sharded" % (nsfilter) + print("WARNING - Namespace %s is not sharded" % (nsfilter)) sys.exit(1) - avgchunksnb = nscount / len(shards) - warningnb = avgchunksnb * warning / 100 - criticalnb = avgchunksnb * critical / 100 + avgchunksnb = nscount // len(shards) + warningnb = avgchunksnb * warning // 100 + criticalnb = avgchunksnb * critical // 100 for shard in shards: delta = abs(avgchunksnb - col.find({"ns": nsfilter, "shard": shard}).count()) message = "Namespace: %s, Shard name: %s, Chunk delta: %i" % (nsfilter, shard, delta) if delta >= criticalnb and delta > 0: - print "CRITICAL - Chunks not well balanced " + message + print("CRITICAL - Chunks not well balanced " + message) sys.exit(2) elif delta >= warningnb and delta > 0: - print "WARNING - Chunks not well balanced " + message + print("WARNING - Chunks not well balanced " + message) sys.exit(1) - print "OK - Chunks well balanced across shards" + print("OK - Chunks well balanced across shards") sys.exit(0) - except Exception, e: + except Exception as e: exit_with_general_critical(e) - print "OK - Chunks well balanced across shards" + print("OK - Chunks well balanced across shards") sys.exit(0) @@ -1321,7 +1536,7 @@ def check_connect_primary(con, warning, critical, perf_data): data = con.admin.command(son.SON([('isMaster', 1)])) if data['ismaster'] == True: - print "OK - This server is primary" + print("OK - This server is primary") return 0 phost = data['primary'].split(':')[0] @@ -1339,17 +1554,17 @@ def check_connect_primary(con, warning, critical, perf_data): return check_levels(pconn_time, warning, critical, message) - except Exception, e: + except Exception as e: return exit_with_general_critical(e) def check_collection_state(con, database, collection): try: con[database][collection].find_one() - print "OK - Collection %s.%s is reachable " % (database, collection) + print("OK - Collection %s.%s is reachable " % (database, collection)) return 0 - except Exception, e: + except Exception as e: return exit_with_general_critical(e) @@ -1361,14 +1576,18 @@ def check_row_count(con, database, collection, warning, critical, perf_data): return check_levels(count, warning, critical, message) - except Exception, e: + except Exception as e: return exit_with_general_critical(e) -def build_file_name(host, action): +def build_file_name(host, port, action): #done this way so it will work when run independently and from shell module_name = re.match('(.*//*)*(.*)\..*', __file__).group(2) - return "/tmp/" + module_name + "_data/" + host + "-" + action + ".data" + + if (port == 27017): + return "/tmp/" + module_name + "_data/" + host + "-" + action + ".data" + else: + return "/tmp/" + module_name + "_data/" + host + "-" + str(port) + "-" + action + ".data" def ensure_dir(f): @@ -1381,7 +1600,7 @@ def write_values(file_name, string): f = None try: f = open(file_name, 'w') - except IOError, e: + except IOError as e: #try creating if (e.errno == 2): ensure_dir(file_name) @@ -1400,11 +1619,11 @@ def read_values(file_name): data = f.read() f.close() return 0, data - except IOError, e: + except IOError as e: if (e.errno == 2): #no previous data return 1, '' - except Exception, e: + except Exception as e: return 2, None @@ -1420,8 +1639,8 @@ def calc_delta(old, new): return 0, delta -def maintain_delta(new_vals, host, action): - file_name = build_file_name(host, action) +def maintain_delta(new_vals, host, port, action): + file_name = build_file_name(host, port, action) err, data = read_values(file_name) old_vals = data.split(';') new_vals = [str(int(time.time()))] + new_vals @@ -1442,8 +1661,8 @@ def replication_get_time_diff(con): col = 'oplog.$main' firstc = local[col].find().sort("$natural", 1).limit(1) lastc = local[col].find().sort("$natural", -1).limit(1) - first = firstc.next() - last = lastc.next() + first = next(firstc) + last = next(lastc) tfirst = first["ts"] tlast = last["ts"] delta = tlast.time - tfirst.time