nagios-nrpe: upgrade check_mongo
gitea/ansible-roles/pipeline/head This commit looks good Details

This commit is contained in:
William Hirigoyen 2022-09-14 12:19:55 +02:00
parent 6ce3004818
commit e0ba847e9c
1 changed file with 436 additions and 217 deletions

653
nagios-nrpe/files/plugins/check_mongodb Executable file → Normal file
View File

@ -17,24 +17,29 @@
# - Dag Stockstad <dag.stockstad@gmail.com>
# - @Andor on github
# - Steven Richards - Captainkrtek on github
# - Max Vernimmen
# - Max Vernimmen - @mvernimmen-CG / @mvernimmen on github
# - Kris Nova - @kris@nivenly.com github.com/kris-nova
# - Jan Kantert - firstname@lastname.net
#
# USAGE
#
# See the README.md
#
from __future__ import print_function
from __future__ import division
import sys
import time
import optparse
import textwrap
import re
import os
import numbers
import socket
try:
import pymongo
except ImportError, e:
print e
except ImportError as e:
print(e)
sys.exit(2)
# As of pymongo v 1.9 the SON API is part of the BSON package, therefore attempt
@ -78,37 +83,35 @@ def performance_data(perf_data, params):
def numeric_type(param):
    """Report whether *param* is usable as a numeric threshold.

    ``None`` is deliberately treated as numeric, so an unset threshold does
    not by itself make this return False.

    Returns True for ``None`` and for any real number (``int``, ``float``,
    and other ``numbers.Real`` implementations, which includes ``bool``);
    False for everything else (strings, lists, ...).
    """
    return param is None or isinstance(param, numbers.Real)
def check_levels(param, warning, critical, message, ok=()):
    """Print the Nagios status line for *param* and finish with its code.

    Two modes, selected by ``numeric_type`` on both thresholds:

    * numeric thresholds: *param* is compared with ``>=`` against
      *critical* first, then *warning*;
    * otherwise *critical*, *warning* and *ok* are treated as containers
      and *param* is looked up in each of them, in that order.

    Every matched level prints ``"<LEVEL> - <message>"`` and terminates the
    process through ``sys.exit`` (2 = critical, 1 = warning, 0 = ok).

    Returns 2, without exiting, only when container thresholds are in use
    and *param* is found in none of them.
    """
    if numeric_type(critical) and numeric_type(warning):
        if param >= critical:
            print("CRITICAL - " + message)
            sys.exit(2)
        elif param >= warning:
            print("WARNING - " + message)
            sys.exit(1)
        else:
            print("OK - " + message)
            sys.exit(0)

    # Container thresholds: the first container holding param decides.
    if param in critical:
        print("CRITICAL - " + message)
        sys.exit(2)

    if param in warning:
        print("WARNING - " + message)
        sys.exit(1)

    if param in ok:
        print("OK - " + message)
        sys.exit(0)

    # unexpected param value
    print("CRITICAL - Unexpected value : %d" % param + "; " + message)
    return 2
@ -120,21 +123,32 @@ def get_server_status(con):
data = con.admin.command(son.SON([('serverStatus', 1)]))
return data
def split_host_port(string):
    """Split a ``"host:port"`` string into ``(host, port)``.

    The split happens on the LAST colon. The port is returned as an ``int``.
    When there is no colon, or the text after the last colon is not made of
    digits only, the input is returned unchanged as the host together with
    ``None`` as the port.

    An all-digit string without any colon (e.g. ``"12345"``) is a host with
    no port; it must not be mistaken for a port.
    """
    host, colon, port = string.rpartition(':')
    if not colon or not port.isdigit():
        return (string, None)
    return (host, int(port))
def main(argv):
p = optparse.OptionParser(conflict_handler="resolve", description="This Nagios plugin checks the health of mongodb.")
p.add_option('-H', '--host', action='store', type='string', dest='host', default='127.0.0.1', help='The hostname you want to connect to')
p.add_option('-P', '--port', action='store', type='int', dest='port', default=27017, help='The port mongodb is runnung on')
p.add_option('-h', '--host-to-check', action='store', type='string', dest='host_to_check', default=None, help='The hostname you want to check (if this is different from the host you are connecting)')
p.add_option('--rdns-lookup', action='store_true', dest='rdns_lookup', default=False, help='RDNS(PTR) lookup on given host/host-to-check, to convert ip-address to fqdn')
p.add_option('-P', '--port', action='store', type='int', dest='port', default=27017, help='The port mongodb is running on')
p.add_option('--port-to-check', action='store', type='int', dest='port_to_check', default=None, help='The port you want to check (if this is different from the port you are connecting)')
p.add_option('-u', '--user', action='store', type='string', dest='user', default=None, help='The username you want to login as')
p.add_option('-p', '--pass', action='store', type='string', dest='passwd', default=None, help='The password you want to use for that user')
p.add_option('-W', '--warning', action='store', dest='warning', default=None, help='The warning threshold we want to set')
p.add_option('-C', '--critical', action='store', dest='critical', default=None, help='The critical threshold we want to set')
p.add_option('-W', '--warning', action='store', dest='warning', default=None, help='The warning threshold you want to set')
p.add_option('-C', '--critical', action='store', dest='critical', default=None, help='The critical threshold you want to set')
p.add_option('-A', '--action', action='store', type='choice', dest='action', default='connect', help='The action you want to take',
choices=['connect', 'connections', 'replication_lag', 'replication_lag_percent', 'replset_state', 'memory', 'memory_mapped', 'lock',
'flushing', 'last_flush_time', 'index_miss_ratio', 'databases', 'collections', 'database_size', 'database_indexes', 'collection_indexes', 'collection_size',
'queues', 'oplog', 'journal_commits_in_wl', 'write_data_files', 'journaled', 'opcounters', 'current_lock', 'replica_primary', 'page_faults',
'asserts', 'queries_per_second', 'page_faults', 'chunks_balance', 'connect_primary', 'collection_state', 'row_count', 'replset_quorum'])
'flushing', 'last_flush_time', 'index_miss_ratio', 'databases', 'collections', 'database_size', 'database_indexes', 'collection_documents', 'collection_indexes', 'collection_size',
'collection_storageSize', 'queues', 'oplog', 'journal_commits_in_wl', 'write_data_files', 'journaled', 'opcounters', 'current_lock', 'replica_primary',
'page_faults', 'asserts', 'queries_per_second', 'page_faults', 'chunks_balance', 'connect_primary', 'collection_state', 'row_count', 'replset_quorum'])
p.add_option('--max-lag', action='store_true', dest='max_lag', default=False, help='Get max replication lag (for replication_lag action only)')
p.add_option('--mapped-memory', action='store_true', dest='mapped_memory', default=False, help='Get mapped memory instead of resident (if resident memory can not be read)')
p.add_option('-D', '--perf-data', action='store_true', dest='perf_data', default=False, help='Enable output of Nagios performance data')
@ -145,12 +159,28 @@ def main(argv):
p.add_option('-q', '--querytype', action='store', dest='query_type', default='query', help='The query type to check [query|insert|update|delete|getmore|command] from queries_per_second')
p.add_option('-c', '--collection', action='store', dest='collection', default='admin', help='Specify the collection to check')
p.add_option('-T', '--time', action='store', type='int', dest='sample_time', default=1, help='Time used to sample number of pages faults')
p.add_option('-M', '--mongoversion', action='store', type='choice', dest='mongo_version', default='2', help='The MongoDB version you are talking with, either 2 or 3',
choices=['2','3'])
p.add_option('-a', '--authdb', action='store', type='string', dest='authdb', default='admin', help='The database you want to authenticate against')
p.add_option('--insecure', action='store_true', dest='insecure', default=False, help="Don't verify SSL/TLS certificates")
p.add_option('--ssl-ca-cert-file', action='store', type='string', dest='ssl_ca_cert_file', default=None, help='Path to Certificate Authority file for SSL')
p.add_option('-f', '--ssl-cert-file', action='store', type='string', dest='cert_file', default=None, help='Path to PEM encoded key and cert for client authentication')
p.add_option('-m','--auth-mechanism', action='store', type='choice', dest='auth_mechanism', default=None, help='Auth mechanism used for auth with mongodb',
choices=['MONGODB-X509','SCRAM-SHA-256','SCRAM-SHA-1'])
p.add_option('--disable_retry_writes', dest='retry_writes_disabled', default=False, action='callback', callback=optional_arg(True), help='Disable retryWrites feature')
options, arguments = p.parse_args()
host = options.host
host_to_check = options.host_to_check if options.host_to_check else options.host
rdns_lookup = options.rdns_lookup
if (rdns_lookup):
host_to_check = socket.getnameinfo((host_to_check, 0), 0)[0]
port = options.port
port_to_check = options.port_to_check if options.port_to_check else options.port
user = options.user
passwd = options.passwd
authdb = options.authdb
query_type = options.query_type
collection = options.collection
sample_time = options.sample_time
@ -164,9 +194,15 @@ def main(argv):
action = options.action
perf_data = options.perf_data
max_lag = options.max_lag
mongo_version = options.mongo_version
database = options.database
ssl = options.ssl
replicaset = options.replicaset
insecure = options.insecure
ssl_ca_cert_file = options.ssl_ca_cert_file
cert_file = options.cert_file
auth_mechanism = options.auth_mechanism
retry_writes_disabled = options.retry_writes_disabled
if action == 'replica_primary' and replicaset is None:
return "replicaset must be passed in when using replica_primary check"
@ -177,31 +213,35 @@ def main(argv):
# moving the login up here and passing in the connection
#
start = time.time()
err, con = mongo_connect(host, port, ssl, user, passwd, replicaset)
err, con = mongo_connect(host, port, ssl, user, passwd, replicaset, authdb, insecure, ssl_ca_cert_file, cert_file, auth_mechanism, retry_writes_disabled=retry_writes_disabled)
if err != 0:
return err
# Autodetect mongo-version and force pymongo to let us know if it can connect or not.
err, mongo_version = check_version(con)
if err != 0:
return err
conn_time = time.time() - start
conn_time = round(conn_time, 0)
if action == "connections":
return check_connections(con, warning, critical, perf_data)
elif action == "replication_lag":
return check_rep_lag(con, host, port, warning, critical, False, perf_data, max_lag, user, passwd)
return check_rep_lag(con, host_to_check, port_to_check, rdns_lookup, warning, critical, False, perf_data, max_lag, ssl, user, passwd, replicaset, authdb, insecure, ssl_ca_cert_file, cert_file, auth_mechanism, retry_writes_disabled=retry_writes_disabled)
elif action == "replication_lag_percent":
return check_rep_lag(con, host, port, warning, critical, True, perf_data, max_lag, user, passwd)
return check_rep_lag(con, host_to_check, port_to_check, rdns_lookup, warning, critical, True, perf_data, max_lag, ssl, user, passwd, replicaset, authdb, insecure, ssl_ca_cert_file, cert_file, auth_mechanism, retry_writes_disabled=retry_writes_disabled)
elif action == "replset_state":
return check_replset_state(con, perf_data, warning, critical)
elif action == "memory":
return check_memory(con, warning, critical, perf_data, options.mapped_memory)
return check_memory(con, warning, critical, perf_data, options.mapped_memory, host)
elif action == "memory_mapped":
return check_memory_mapped(con, warning, critical, perf_data)
elif action == "queues":
return check_queues(con, warning, critical, perf_data)
elif action == "lock":
return check_lock(con, warning, critical, perf_data)
return check_lock(con, warning, critical, perf_data, mongo_version)
elif action == "current_lock":
return check_current_lock(con, host, warning, critical, perf_data)
return check_current_lock(con, host, port, warning, critical, perf_data)
elif action == "flushing":
return check_flushing(con, warning, critical, True, perf_data)
elif action == "last_flush_time":
@ -223,22 +263,26 @@ def main(argv):
return check_database_size(con, database, warning, critical, perf_data)
elif action == "database_indexes":
return check_database_indexes(con, database, warning, critical, perf_data)
elif action == "collection_documents":
return check_collection_documents(con, database, collection, warning, critical, perf_data)
elif action == "collection_indexes":
return check_collection_indexes(con, database, collection, warning, critical, perf_data)
elif action == "collection_size":
return check_collection_size(con, database, collection, warning, critical, perf_data)
elif action == "collection_storageSize":
return check_collection_storageSize(con, database, collection, warning, critical, perf_data)
elif action == "journaled":
return check_journaled(con, warning, critical, perf_data)
elif action == "write_data_files":
return check_write_to_datafiles(con, warning, critical, perf_data)
elif action == "opcounters":
return check_opcounters(con, host, warning, critical, perf_data)
return check_opcounters(con, host, port, warning, critical, perf_data)
elif action == "asserts":
return check_asserts(con, host, warning, critical, perf_data)
return check_asserts(con, host, port, warning, critical, perf_data)
elif action == "replica_primary":
return check_replica_primary(con, host, warning, critical, perf_data, replicaset)
return check_replica_primary(con, host, warning, critical, perf_data, replicaset, mongo_version)
elif action == "queries_per_second":
return check_queries_per_second(con, query_type, warning, critical, perf_data)
return check_queries_per_second(con, query_type, warning, critical, perf_data, mongo_version)
elif action == "page_faults":
check_page_faults(con, sample_time, warning, critical, perf_data)
elif action == "chunks_balance":
@ -255,30 +299,73 @@ def main(argv):
return check_connect(host, port, warning, critical, perf_data, user, passwd, conn_time)
def mongo_connect(host=None, port=None, ssl=False, user=None, passwd=None, replica=None, authdb="admin", insecure=False, ssl_ca_cert_file=None, ssl_cert=None, auth_mechanism=None, retry_writes_disabled=False):
    """Open a pymongo connection, authenticate it and check it responds.

    Parameters:
        host, port: address handed to the pymongo client.
        ssl: when truthy, SSL/TLS options are passed to the client.
        user, passwd: when both are set, authenticate against *authdb*.
        replica: replica set name; when given, the client is created with
            a SECONDARY read preference.
        authdb: database used for user/password authentication.
        insecure: with *ssl*, skip certificate verification.
        ssl_ca_cert_file, ssl_cert: CA file and client PEM file for SSL.
        auth_mechanism: one of 'SCRAM-SHA-256', 'SCRAM-SHA-1',
            'MONGODB-X509'; only used together with *user*, *ssl_cert* and
            *ssl_ca_cert_file*.
        retry_writes_disabled: when truthy, pass ``retryWrites=False``.

    Returns:
        ``(0, connection)`` on success, or
        ``(exit_with_general_critical(e), None)`` on an unexpected error.

    Exits the process directly when the ``ismaster`` command fails with
    ``ConnectionFailure`` (exit 2), when the server is an arbiter (exit 0),
    or when user/password authentication fails.
    """
    from pymongo.errors import ConnectionFailure
    from pymongo.errors import PyMongoError
    import ssl as SSL

    con_args = dict()

    if ssl:
        con_args['ssl_cert_reqs'] = SSL.CERT_NONE if insecure else SSL.CERT_REQUIRED
        con_args['ssl'] = ssl
        if ssl_ca_cert_file:
            con_args['ssl_ca_certs'] = ssl_ca_cert_file
        if ssl_cert:
            con_args['ssl_certfile'] = ssl_cert

    if retry_writes_disabled:
        con_args['retryWrites'] = False

    try:
        # Compare (major, minor) numerically: comparing the version strings
        # directly would sort e.g. "10.0" before "2.3".
        client_version = tuple(int(part) for part in re.findall(r"\d+", pymongo.version)[:2])

        # ssl connection for pymongo > 2.3
        if client_version >= (2, 3):
            if replica is None:
                con = pymongo.MongoClient(host, port, **con_args)
            else:
                con = pymongo.MongoClient(host, port, read_preference=pymongo.ReadPreference.SECONDARY, replicaSet=replica, **con_args)
        else:
            # Legacy clients get the same connection with or without a
            # replica set name.
            con = pymongo.Connection(host, port, slave_okay=True, network_timeout=10)

        # we must authenticate the connection, otherwise we won't be able to perform certain operations
        if ssl_cert and ssl_ca_cert_file and user and auth_mechanism in ('SCRAM-SHA-256', 'SCRAM-SHA-1', 'MONGODB-X509'):
            con.the_database.authenticate(user, mechanism=auth_mechanism)

        try:
            result = con.admin.command("ismaster")
        except ConnectionFailure:
            print("CRITICAL - Connection to Mongo server on %s:%s has failed" % (host, port) )
            sys.exit(2)

        if 'arbiterOnly' in result and result['arbiterOnly'] == True:
            print("OK - State: 7 (Arbiter on port %s)" % (port))
            sys.exit(0)

        if user and passwd:
            db = con[authdb]
            try:
                db.authenticate(user, password=passwd)
            except PyMongoError:
                sys.exit("Username/Password incorrect")

        # Ping to check that the server is responding.
        con.admin.command("ping")

    except Exception as e:
        if isinstance(e, pymongo.errors.AutoReconnect) and str(e).find(" is an arbiter") != -1:
            # We got a pymongo AutoReconnect exception that tells us we connected to an Arbiter Server
            # This means: Arbiter is reachable and can answer requests/votes - this is all we need to know from an arbiter
            print("OK - State: 7 (Arbiter)")
            sys.exit(0)
        return exit_with_general_critical(e), None
    return 0, con
@ -288,7 +375,7 @@ def exit_with_general_warning(e):
if isinstance(e, SystemExit):
return e
else:
print "WARNING - General MongoDB warning:", e
print("WARNING - General MongoDB warning:", e)
return 1
@ -296,19 +383,27 @@ def exit_with_general_critical(e):
if isinstance(e, SystemExit):
return e
else:
print "CRITICAL - General MongoDB Error:", e
print("CRITICAL - General MongoDB Error:", e)
return 2
def set_read_preference(db):
    """Select secondary reads on *db* when running an old pymongo.

    For pymongo older than 2.2 this sets ``db.read_preference`` to
    ``pymongo.ReadPreference.SECONDARY``. For 2.2 and newer nothing is
    assigned to *db*.
    """
    # Compare (major, minor) numerically: comparing the version strings
    # directly would sort e.g. "10.0" before "2.2".
    client_version = tuple(int(part) for part in re.findall(r"\d+", pymongo.version)[:2])
    if client_version >= (2, 2):
        # NOTE(review): bare attribute access, kept from the original; it
        # configures nothing -- confirm whether a read preference should be
        # applied here.
        pymongo.read_preferences.Secondary
    else:
        db.read_preference = pymongo.ReadPreference.SECONDARY
def check_version(con):
    """Ask the server for its version and return the major number.

    Returns ``(0, major)`` where *major* is the integer before the first
    dot of the reported version string. When ``con.server_info()`` raises,
    returns ``(exit_with_general_critical(error), None)`` instead.
    """
    try:
        info = con.server_info()
    except Exception as err:
        return exit_with_general_critical(err), None

    # Only the leading component of "X.Y.Z" is of interest.
    major_text = info['version'].partition('.')[0]
    return 0, int(major_text.strip())
def check_connect(host, port, warning, critical, perf_data, user, passwd, conn_time):
    """Report how long the connection took, against time thresholds.

    *warning* and *critical* default to 3 and 6 when not supplied.
    *conn_time* is the measured connection time in seconds. The remaining
    parameters are accepted but not used here.

    The result comes from ``check_levels``, called with the connection time
    and a message that includes performance data from ``performance_data``.
    """
    warning = warning or 3
    critical = critical or 6
    message = "Connection took %.3f seconds" % conn_time
    message += performance_data(perf_data, [(conn_time, "connection_time", warning, critical)])

    return check_levels(conn_time, warning, critical, message)
@ -330,13 +425,17 @@ def check_connections(con, warning, critical, perf_data):
(available, "available_connections")])
return check_levels(used_percent, warning, critical, message)
except Exception, e:
except Exception as e:
return exit_with_general_critical(e)
def check_rep_lag(con, host, port, warning, critical, percent, perf_data, max_lag, user, passwd):
def check_rep_lag(con, host, port, rdns_lookup, warning, critical, percent, perf_data, max_lag, ssl=False, user=None, passwd=None, replicaset=None, authdb="admin", insecure=None, ssl_ca_cert_file=None, cert_file=None, auth_mechanism=None, retry_writes_disabled=False):
# Get mongo to tell us replica set member name when connecting locally
if "127.0.0.1" == host:
if not "me" in list(con.admin.command("ismaster","1").keys()):
print("UNKNOWN - This is not replicated MongoDB")
return 3
host = con.admin.command("ismaster","1")["me"].split(':')[0]
if percent:
@ -348,16 +447,15 @@ def check_rep_lag(con, host, port, warning, critical, percent, perf_data, max_la
rs_status = {}
slaveDelays = {}
try:
set_read_preference(con.admin)
#set_read_preference(con.admin)
# Get replica set status
try:
rs_status = con.admin.command("replSetGetStatus")
except pymongo.errors.OperationFailure, e:
if e.code == None and str(e).find('failed: not running with --replSet"'):
print "OK - Not running with replSet"
return 0
except pymongo.errors.OperationFailure as e:
if ((e.code == None and str(e).find('failed: not running with --replSet"')) or (e.code == 76 and str(e).find('not running with --replSet"'))):
print("UNKNOWN - Not running with replSet")
return 3
serverVersion = tuple(con.server_info()['version'].split('.'))
if serverVersion >= tuple("2.0.0".split(".")):
#
@ -377,24 +475,32 @@ def check_rep_lag(con, host, port, warning, critical, percent, perf_data, max_la
for member in rs_status["members"]:
if member["stateStr"] == "PRIMARY":
primary_node = member
if member["name"].split(':')[0] == host and int(member["name"].split(':')[1]) == port:
# if rdns_lookup is true then lookup both values back to their rdns value so we can compare hostname vs fqdn
if rdns_lookup:
member_host, member_port = split_host_port(member.get('name'))
member_host = "{0}:{1}".format(socket.getnameinfo((member_host, 0), 0)[0], member_port)
if member_host == "{0}:{1}".format(socket.getnameinfo((host, 0), 0)[0], port):
host_node = member
# Exact match
elif member.get('name') == "{0}:{1}".format(host, port):
host_node = member
# Check if we're in the middle of an election and don't have a primary
if primary_node is None:
print "WARNING - No primary defined. In an election?"
print("WARNING - No primary defined. In an election?")
return 1
# Check if we failed to find the current host
# below should never happen
if host_node is None:
print "CRITICAL - Unable to find host '" + host + "' in replica set."
print("CRITICAL - Unable to find host '" + host + "' in replica set.")
return 2
# Is the specified host the primary?
if host_node["stateStr"] == "PRIMARY":
if max_lag == False:
print "OK - This is the primary."
print("OK - This is the primary.")
return 0
else:
#get the maximal replication lag
@ -407,7 +513,7 @@ def check_rep_lag(con, host, port, warning, critical, percent, perf_data, max_la
data = data + member['name'] + " lag=%d;" % replicationLag
maximal_lag = max(maximal_lag, replicationLag)
if percent:
err, con = mongo_connect(primary_node['name'].split(':')[0], int(primary_node['name'].split(':')[1]), False, user, passwd)
err, con = mongo_connect(split_host_port(primary_node['name'])[0], int(split_host_port(primary_node['name'])[1]), ssl, user, passwd, replicaset, authdb, insecure, ssl_ca_cert_file, cert_file, auth_mechanism, retry_writes_disabled=retry_writes_disabled)
if err != 0:
return err
primary_timediff = replication_get_time_diff(con)
@ -419,8 +525,8 @@ def check_rep_lag(con, host, port, warning, critical, percent, perf_data, max_la
message += performance_data(perf_data, [(maximal_lag, "replication_lag", warning, critical)])
return check_levels(maximal_lag, warning, critical, message)
elif host_node["stateStr"] == "ARBITER":
print "OK - This is an arbiter"
return 0
print("UNKNOWN - This is an arbiter")
return 3
# Find the difference in optime between current node and PRIMARY
@ -439,7 +545,7 @@ def check_rep_lag(con, host, port, warning, critical, percent, perf_data, max_la
lag = float(optime_lag.seconds + optime_lag.days * 24 * 3600)
if percent:
err, con = mongo_connect(primary_node['name'].split(':')[0], int(primary_node['name'].split(':')[1]), False, user, passwd)
err, con = mongo_connect(split_host_port(primary_node['name'])[0], int(split_host_port(primary_node['name'])[1]), ssl, user, passwd, replicaset, authdb, insecure, ssl_ca_cert_file, cert_file, auth_mechanism, retry_writes_disabled=retry_writes_disabled)
if err != 0:
return err
primary_timediff = replication_get_time_diff(con)
@ -471,19 +577,19 @@ def check_rep_lag(con, host, port, warning, critical, percent, perf_data, max_la
# Check if we're in the middle of an election and don't have a primary
if primary_node is None:
print "WARNING - No primary defined. In an election?"
print("WARNING - No primary defined. In an election?")
sys.exit(1)
# Is the specified host the primary?
if host_node["stateStr"] == "PRIMARY":
print "OK - This is the primary."
print("OK - This is the primary.")
sys.exit(0)
# Find the difference in optime between current node and PRIMARY
optime_lag = abs(primary_node[1] - host_node["optimeDate"])
lag = optime_lag.seconds
if percent:
err, con = mongo_connect(primary_node['name'].split(':')[0], int(primary_node['name'].split(':')[1]))
err, con = mongo_connect(split_host_port(primary_node['name'])[0], int(split_host_port(primary_node['name'])[1]), ssl, user, passwd, replicaset, authdb, insecure, ssl_ca_cert_file, cert_file, auth_mechanism, retry_writes_disabled=retry_writes_disabled)
if err != 0:
return err
primary_timediff = replication_get_time_diff(con)
@ -495,26 +601,34 @@ def check_rep_lag(con, host, port, warning, critical, percent, perf_data, max_la
message += performance_data(perf_data, [(lag, "replication_lag", warning, critical)])
return check_levels(lag, warning, critical, message)
except Exception, e:
except Exception as e:
return exit_with_general_critical(e)
def check_memory(con, warning, critical, perf_data, mapped_memory):
#
# These thresholds are basically meaningless, and must be customized to your system's ram
#
# Get the total system memory and calculate based on that how much memory used by Mongodb is ok or not.
#
# Check the memory usage of mongo. Alerting on this may be hard to get right
# because it'll try to get as much memory as it can. And that's probably
# a good thing.
#
def check_memory(con, warning, critical, perf_data, mapped_memory, host):
# Get the total system memory of this system (This is totally bogus if you
# are running this command remotely) and calculate based on that how much
# memory used by Mongodb is ok or not.
meminfo = open('/proc/meminfo').read()
matched = re.search(r'^MemTotal:\s+(\d+)', meminfo)
if matched:
if matched:
mem_total_kB = int(matched.groups()[0])
# Old way
#critical = critical or 16
# The new way. if using >80% then warn, if >90% then critical level
warning = warning or (mem_total_kB * 0.8) / 1024.0 / 1024.0
critical = critical or (mem_total_kB * 0.9) / 1024.0 / 1024.0
if host != "127.0.0.1" and not warning:
# Running remotely and value was not set by user, use hardcoded value
warning = 12
else:
# running locally or user provided value
warning = warning or (mem_total_kB * 0.8) / 1024.0 / 1024.0
if host != "127.0.0.1" and not critical:
critical = 16
else:
critical = critical or (mem_total_kB * 0.9) / 1024.0 / 1024.0
# debugging
#print "mem total: {0}kb, warn: {1}GB, crit: {2}GB".format(mem_total_kB,warning, critical)
@ -522,7 +636,7 @@ def check_memory(con, warning, critical, perf_data, mapped_memory):
try:
data = get_server_status(con)
if not data['mem']['supported'] and not mapped_memory:
print "OK - Platform not supported for memory info"
print("OK - Platform not supported for memory info")
return 0
#
# convert to gigs
@ -559,7 +673,7 @@ def check_memory(con, warning, critical, perf_data, mapped_memory):
else:
return check_levels(mem_resident, warning, critical, message)
except Exception, e:
except Exception as e:
return exit_with_general_critical(e)
@ -572,7 +686,7 @@ def check_memory_mapped(con, warning, critical, perf_data):
try:
data = get_server_status(con)
if not data['mem']['supported']:
print "OK - Platform not supported for memory info"
print("OK - Platform not supported for memory info")
return 0
#
# convert to gigs
@ -589,38 +703,45 @@ def check_memory_mapped(con, warning, critical, perf_data):
message += " %.2fGB mappedWithJournal" % mem_mapped_journal
except:
mem_mapped_journal = 0
message += performance_data(perf_data, [("%.2f" % mem_mapped, "memory_mapped"), ("%.2f" % mem_mapped_journal, "mappedWithJournal")])
message += performance_data(perf_data, [("%.2f" % mem_mapped, "memory_mapped", warning, critical), ("%.2f" % mem_mapped_journal, "mappedWithJournal")])
if not mem_mapped == -1:
return check_levels(mem_mapped, warning, critical, message)
else:
print "OK - Server does not provide mem.mapped info"
print("OK - Server does not provide mem.mapped info")
return 0
except Exception, e:
except Exception as e:
return exit_with_general_critical(e)
def check_lock(con, warning, critical, perf_data, mongo_version):
    """Return the percentage of the time there was a global lock.

    *warning* and *critical* default to 10 and 30 (percent) when not
    supplied. The percentage is only computed when *mongo_version* is 2;
    for any other version an OK line is printed and 0 is returned.

    For version 2 the result comes from ``check_levels``. If the server
    status cannot be read or lacks the ``globalLock`` timings, a hint is
    printed and the result of ``exit_with_general_critical`` is returned.
    """
    warning = warning or 10
    critical = critical or 30

    if mongo_version != 2:
        print("OK - MongoDB version 3 doesn't report on global locks")
        return 0

    try:
        data = get_server_status(con)
        lockTime = data['globalLock']['lockTime']
        totalTime = data['globalLock']['totalTime']
        #
        # calculate percentage
        #
        # A zero total would divide by zero and be misreported as a general
        # error; report 0% for it, as for the other inconsistent reading
        # (lockTime > totalTime).
        if totalTime == 0 or lockTime > totalTime:
            lock_percentage = 0.00
        else:
            lock_percentage = float(lockTime) / float(totalTime) * 100
        message = "Lock Percentage: %.2f%%" % lock_percentage
        message += performance_data(perf_data, [("%.2f" % lock_percentage, "lock_percentage", warning, critical)])
        return check_levels(lock_percentage, warning, critical, message)
    except Exception as e:
        print("Couldn't get globalLock lockTime info from mongo, are you sure you're not using version 3? See the -M option.")
        return exit_with_general_critical(e)
def check_flushing(con, warning, critical, avg, perf_data):
@ -632,19 +753,24 @@ def check_flushing(con, warning, critical, avg, perf_data):
critical = critical or 15000
try:
data = get_server_status(con)
if avg:
flush_time = float(data['backgroundFlushing']['average_ms'])
stat_type = "Average"
else:
flush_time = float(data['backgroundFlushing']['last_ms'])
stat_type = "Last"
try:
data['backgroundFlushing']
if avg:
flush_time = float(data['backgroundFlushing']['average_ms'])
stat_type = "Average"
else:
flush_time = float(data['backgroundFlushing']['last_ms'])
stat_type = "Last"
message = "%s Flush Time: %.2fms" % (stat_type, flush_time)
message += performance_data(perf_data, [("%.2fms" % flush_time, "%s_flush_time" % stat_type.lower(), warning, critical)])
message = "%s Flush Time: %.2fms" % (stat_type, flush_time)
message += performance_data(perf_data, [("%.2fms" % flush_time, "%s_flush_time" % stat_type.lower(), warning, critical)])
return check_levels(flush_time, warning, critical, message)
return check_levels(flush_time, warning, critical, message)
except Exception:
print("OK - flushing stats not available for this storage engine")
return 0
except Exception, e:
except Exception as e:
return exit_with_general_critical(e)
@ -655,6 +781,7 @@ def index_miss_ratio(con, warning, critical, perf_data):
data = get_server_status(con)
try:
data['indexCounters']
serverVersion = tuple(con.server_info()['version'].split('.'))
if serverVersion >= tuple("2.4.0".split(".")):
miss_ratio = float(data['indexCounters']['missRatio'])
@ -662,19 +789,24 @@ def index_miss_ratio(con, warning, critical, perf_data):
miss_ratio = float(data['indexCounters']['btree']['missRatio'])
except KeyError:
not_supported_msg = "not supported on this platform"
if data['indexCounters'].has_key('note'):
print "OK - MongoDB says: " + not_supported_msg
try:
data['indexCounters']
if 'note' in data['indexCounters']:
print("OK - MongoDB says: " + not_supported_msg)
return 0
else:
print("WARNING - Can't get counter from MongoDB")
return 1
except Exception:
print("OK - MongoDB says: " + not_supported_msg)
return 0
else:
print "WARNING - Can't get counter from MongoDB"
return 1
message = "Miss Ratio: %.2f" % miss_ratio
message += performance_data(perf_data, [("%.2f" % miss_ratio, "index_miss_ratio", warning, critical)])
return check_levels(miss_ratio, warning, critical, message)
except Exception, e:
except Exception as e:
return exit_with_general_critical(e)
def check_replset_quorum(con, perf_data):
@ -698,7 +830,7 @@ def check_replset_quorum(con, perf_data):
message = "Cluster is not quorate and cannot operate"
return check_levels(state, warning, critical, message)
except Exception, e:
except Exception as e:
return exit_with_general_critical(e)
@ -713,44 +845,63 @@ def check_replset_state(con, perf_data, warning="", critical=""):
except:
critical = [8, 4, -1]
ok = range(-1, 8) # should include the range of all posiible values
ok = list(range(-1, 8)) # should include the range of all posiible values
try:
worst_state = -2
message = ""
try:
try:
set_read_preference(con.admin)
data = con.admin.command(pymongo.son_manipulator.SON([('replSetGetStatus', 1)]))
except:
data = con.admin.command(son.SON([('replSetGetStatus', 1)]))
state = int(data['myState'])
except pymongo.errors.OperationFailure, e:
if e.code == None and str(e).find('failed: not running with --replSet"'):
state = -1
members = data['members']
my_state = int(data['myState'])
worst_state = my_state
for member in members:
their_state = int(member['state'])
message += " %s: %i (%s)" % (member['name'], their_state, state_text(their_state))
if state_is_worse(their_state, worst_state, warning, critical):
worst_state = their_state
message += performance_data(perf_data, [(my_state, "state")])
if state == 8:
message = "State: %i (Down)" % state
elif state == 4:
message = "State: %i (Fatal error)" % state
elif state == 0:
message = "State: %i (Starting up, phase1)" % state
elif state == 3:
message = "State: %i (Recovering)" % state
elif state == 5:
message = "State: %i (Starting up, phase2)" % state
elif state == 1:
message = "State: %i (Primary)" % state
elif state == 2:
message = "State: %i (Secondary)" % state
elif state == 7:
message = "State: %i (Arbiter)" % state
elif state == -1:
message = "Not running with replSet"
else:
message = "State: %i (Unknown state)" % state
message += performance_data(perf_data, [(state, "state")])
return check_levels(state, warning, critical, message, ok)
except Exception, e:
except pymongo.errors.OperationFailure as e:
if ((e.code == None and str(e).find('failed: not running with --replSet"')) or (e.code == 76 and str(e).find('not running with --replSet"'))):
worst_state = -1
return check_levels(worst_state, warning, critical, message, ok)
except Exception as e:
return exit_with_general_critical(e)
def state_is_worse(state, worst_state, warning, critical):
    """Tell whether `state` is strictly more severe than `worst_state`.

    Severity is decided purely by membership: a state found in `critical`
    outranks a state found in `warning`, which in turn outranks every
    other state.

    state       -- candidate state to evaluate
    worst_state -- most severe state recorded so far
    warning     -- container of states treated as warning
    critical    -- container of states treated as critical

    Returns True when `state` belongs to a higher severity tier than
    `worst_state`, False otherwise.
    """
    def severity(value):
        # critical is tested first so that a state listed in both
        # containers counts as critical
        if value in critical:
            return 2
        if value in warning:
            return 1
        return 0

    return severity(state) > severity(worst_state)
def state_text(state):
    """Return the human-readable label for a replica set member state code.

    state -- numeric member state as reported by replSetGetStatus, or -1,
             the value this plugin uses when the server is not running
             with --replSet

    Returns the matching label, or "Unknown state" for any code that has
    no entry in the table below.
    """
    labels = {
        -1: "Not running with replSet",
        0: "Starting up, phase1",
        1: "Primary",
        2: "Secondary",
        3: "Recovering",
        4: "Fatal error",
        5: "Starting up, phase2",
        7: "Arbiter",
        8: "Down",
        # ROLLBACK and REMOVED are regular member states; previously they
        # were reported as "Unknown state"
        9: "Rollback",
        10: "Removed",
    }
    try:
        return labels.get(state, "Unknown state")
    except TypeError:
        # unhashable input cannot be a known state code
        return "Unknown state"
def check_databases(con, warning, critical, perf_data=None):
try:
@ -764,7 +915,7 @@ def check_databases(con, warning, critical, perf_data=None):
message = "Number of DBs: %.0f" % count
message += performance_data(perf_data, [(count, "databases", warning, critical, message)])
return check_levels(count, warning, critical, message)
except Exception, e:
except Exception as e:
return exit_with_general_critical(e)
@ -786,7 +937,7 @@ def check_collections(con, warning, critical, perf_data=None):
message += performance_data(perf_data, [(count, "collections", warning, critical, message)])
return check_levels(count, warning, critical, message)
except Exception, e:
except Exception as e:
return exit_with_general_critical(e)
@ -823,21 +974,21 @@ def check_database_size(con, database, warning, critical, perf_data):
try:
set_read_preference(con.admin)
data = con[database].command('dbstats')
storage_size = data['storageSize'] / 1024 / 1024
storage_size = data['storageSize'] // 1024 // 1024
if perf_data:
perfdata += " | database_size=%i;%i;%i" % (storage_size, warning, critical)
#perfdata += " database=%s" %(database)
if storage_size >= critical:
print "CRITICAL - Database size: %.0f MB, Database: %s%s" % (storage_size, database, perfdata)
print("CRITICAL - Database size: %.0f MB, Database: %s%s" % (storage_size, database, perfdata))
return 2
elif storage_size >= warning:
print "WARNING - Database size: %.0f MB, Database: %s%s" % (storage_size, database, perfdata)
print("WARNING - Database size: %.0f MB, Database: %s%s" % (storage_size, database, perfdata))
return 1
else:
print "OK - Database size: %.0f MB, Database: %s%s" % (storage_size, database, perfdata)
print("OK - Database size: %.0f MB, Database: %s%s" % (storage_size, database, perfdata))
return 0
except Exception, e:
except Exception as e:
return exit_with_general_critical(e)
@ -851,20 +1002,42 @@ def check_database_indexes(con, database, warning, critical, perf_data):
try:
set_read_preference(con.admin)
data = con[database].command('dbstats')
index_size = data['indexSize'] / 1024 / 1024
index_size = data['indexSize'] / 1024 // 1024
if perf_data:
perfdata += " | database_indexes=%i;%i;%i" % (index_size, warning, critical)
if index_size >= critical:
print "CRITICAL - %s indexSize: %.0f MB %s" % (database, index_size, perfdata)
print("CRITICAL - %s indexSize: %.0f MB %s" % (database, index_size, perfdata))
return 2
elif index_size >= warning:
print "WARNING - %s indexSize: %.0f MB %s" % (database, index_size, perfdata)
print("WARNING - %s indexSize: %.0f MB %s" % (database, index_size, perfdata))
return 1
else:
print "OK - %s indexSize: %.0f MB %s" % (database, index_size, perfdata)
print("OK - %s indexSize: %.0f MB %s" % (database, index_size, perfdata))
return 0
except Exception, e:
except Exception as e:
return exit_with_general_critical(e)
def check_collection_documents(con, database, collection, warning, critical, perf_data):
    """Check the document count of a collection against thresholds.

    Reads the 'count' field of the collstats command for
    `database`.`collection`, prints one Nagios status line and returns the
    matching status code.

    con        -- connection object, indexable by database name
    database   -- name of the database holding the collection
    collection -- name of the collection to inspect
    warning    -- document count at or above which WARNING is reported
    critical   -- document count at or above which CRITICAL is reported
    perf_data  -- when truthy, performance data is appended to the output

    Returns 2 (critical), 1 (warning) or 0 (ok); on any exception returns
    the result of exit_with_general_critical().
    """
    perfdata = ""
    try:
        set_read_preference(con.admin)
        stats = con[database].command('collstats', collection)
        documents = stats['count']
        if perf_data:
            perfdata += " | collection_documents=%i;%i;%i" % (documents, warning, critical)

        # critical is tested before warning so the most severe level wins
        if documents >= critical:
            template, status = "CRITICAL - %s.%s documents: %s %s", 2
        elif documents >= warning:
            template, status = "WARNING - %s.%s documents: %s %s", 1
        else:
            template, status = "OK - %s.%s documents: %s %s", 0

        print(template % (database, collection, documents, perfdata))
        return status
    except Exception as e:
        return exit_with_general_critical(e)
@ -883,15 +1056,15 @@ def check_collection_indexes(con, database, collection, warning, critical, perf_
perfdata += " | collection_indexes=%i;%i;%i" % (total_index_size, warning, critical)
if total_index_size >= critical:
print "CRITICAL - %s.%s totalIndexSize: %.0f MB %s" % (database, collection, total_index_size, perfdata)
print("CRITICAL - %s.%s totalIndexSize: %.0f MB %s" % (database, collection, total_index_size, perfdata))
return 2
elif total_index_size >= warning:
print "WARNING - %s.%s totalIndexSize: %.0f MB %s" % (database, collection, total_index_size, perfdata)
print("WARNING - %s.%s totalIndexSize: %.0f MB %s" % (database, collection, total_index_size, perfdata))
return 1
else:
print "OK - %s.%s totalIndexSize: %.0f MB %s" % (database, collection, total_index_size, perfdata)
print("OK - %s.%s totalIndexSize: %.0f MB %s" % (database, collection, total_index_size, perfdata))
return 0
except Exception, e:
except Exception as e:
return exit_with_general_critical(e)
@ -908,7 +1081,7 @@ def check_queues(con, warning, critical, perf_data):
message += performance_data(perf_data, [(total_queues, "total_queues", warning, critical), (readers_queues, "readers_queues"), (writers_queues, "writers_queues")])
return check_levels(total_queues, warning, critical, message)
except Exception, e:
except Exception as e:
return exit_with_general_critical(e)
def check_collection_size(con, database, collection, warning, critical, perf_data):
@ -923,18 +1096,43 @@ def check_collection_size(con, database, collection, warning, critical, perf_dat
perfdata += " | collection_size=%i;%i;%i" % (size, warning, critical)
if size >= critical:
print "CRITICAL - %s.%s size: %.0f MB %s" % (database, collection, size, perfdata)
print("CRITICAL - %s.%s size: %.0f MB %s" % (database, collection, size, perfdata))
return 2
elif size >= warning:
print "WARNING - %s.%s size: %.0f MB %s" % (database, collection, size, perfdata)
print("WARNING - %s.%s size: %.0f MB %s" % (database, collection, size, perfdata))
return 1
else:
print "OK - %s.%s size: %.0f MB %s" % (database, collection, size, perfdata)
print("OK - %s.%s size: %.0f MB %s" % (database, collection, size, perfdata))
return 0
except Exception, e:
except Exception as e:
return exit_with_general_critical(e)
def check_queries_per_second(con, query_type, warning, critical, perf_data):
def check_collection_storageSize(con, database, collection, warning, critical, perf_data):
    """Check the storage size of a collection against thresholds.

    Reads the 'storageSize' field of the collstats command for
    `database`.`collection`, divides it by 1024 twice (the output labels
    the result as MB), prints one Nagios status line and returns the
    matching status code.

    con        -- connection object, indexable by database name
    database   -- name of the database holding the collection
    collection -- name of the collection to inspect
    warning    -- size at or above which WARNING is reported; 100 when falsy
    critical   -- size at or above which CRITICAL is reported; 1000 when falsy
    perf_data  -- when truthy, performance data is appended to the output

    Returns 2 (critical), 1 (warning) or 0 (ok); on any exception returns
    the result of exit_with_general_critical().
    """
    warning = warning or 100
    critical = critical or 1000
    perfdata = ""
    try:
        set_read_preference(con.admin)
        stats = con[database].command('collstats', collection)
        size_mb = stats['storageSize'] / 1024 / 1024
        if perf_data:
            perfdata += " | collection_storageSize=%i;%i;%i" % (size_mb, warning, critical)

        # critical is tested before warning so the most severe level wins
        if size_mb >= critical:
            template, status = "CRITICAL - %s.%s storageSize: %.0f MB %s", 2
        elif size_mb >= warning:
            template, status = "WARNING - %s.%s storageSize: %.0f MB %s", 1
        else:
            template, status = "OK - %s.%s storageSize: %.0f MB %s", 0

        print(template % (database, collection, size_mb, perfdata))
        return status
    except Exception as e:
        return exit_with_general_critical(e)
def check_queries_per_second(con, query_type, warning, critical, perf_data, mongo_version):
warning = warning or 250
critical = critical or 500
@ -955,10 +1153,17 @@ def check_queries_per_second(con, query_type, warning, critical, perf_data):
diff_query = num - last_count['data'][query_type]['count']
diff_ts = ts - last_count['data'][query_type]['ts']
if diff_ts == 0:
message = "diff_query = " + str(diff_query) + " diff_ts = " + str(diff_ts)
return check_levels(0, warning, critical, message)
query_per_sec = float(diff_query) / float(diff_ts)
# update the count now
db.nagios_check.update({u'_id': last_count['_id']}, {'$set': {"data.%s" % query_type: {'count': num, 'ts': int(time.time())}}})
if mongo_version == 2:
db.nagios_check.update({u'_id': last_count['_id']}, {'$set': {"data.%s" % query_type: {'count': num, 'ts': int(time.time())}}})
else:
db.nagios_check.update_one({u'_id': last_count['_id']}, {'$set': {"data.%s" % query_type: {'count': num, 'ts': int(time.time())}}})
message = "Queries / Sec: %f" % query_per_sec
message += performance_data(perf_data, [(query_per_sec, "%s_per_sec" % query_type, warning, critical, message)])
@ -967,17 +1172,24 @@ def check_queries_per_second(con, query_type, warning, critical, perf_data):
# since it is the first run insert it
query_per_sec = 0
message = "First run of check.. no data"
db.nagios_check.update({u'_id': last_count['_id']}, {'$set': {"data.%s" % query_type: {'count': num, 'ts': int(time.time())}}})
if mongo_version == 2:
db.nagios_check.update({u'_id': last_count['_id']}, {'$set': {"data.%s" % query_type: {'count': num, 'ts': int(time.time())}}})
else:
db.nagios_check.update_one({u'_id': last_count['_id']}, {'$set': {"data.%s" % query_type: {'count': num, 'ts': int(time.time())}}})
except TypeError:
#
# since it is the first run insert it
query_per_sec = 0
message = "First run of check.. no data"
db.nagios_check.insert({'check': 'query_counts', 'data': {query_type: {'count': num, 'ts': int(time.time())}}})
if mongo_version == 2:
db.nagios_check.insert({'check': 'query_counts', 'data': {query_type: {'count': num, 'ts': int(time.time())}}})
else:
db.nagios_check.insert_one({'check': 'query_counts', 'data': {query_type: {'count': num, 'ts': int(time.time())}}})
return check_levels(query_per_sec, warning, critical, message)
except Exception, e:
except Exception as e:
return exit_with_general_critical(e)
@ -1024,7 +1236,7 @@ def check_oplog(con, warning, critical, perf_data):
message += performance_data(perf_data, [("%.2f" % hours_in_oplog, 'oplog_time', warning, critical), ("%.2f " % approx_level, 'oplog_time_100_percent_used')])
return check_levels(-approx_level, -warning, -critical, message)
except Exception, e:
except Exception as e:
return exit_with_general_critical(e)
@ -1042,7 +1254,7 @@ Under very high write situations it is normal for this value to be nonzero. """
message += performance_data(perf_data, [(j_commits_in_wl, "j_commits_in_wl", warning, critical)])
return check_levels(j_commits_in_wl, warning, critical, message)
except Exception, e:
except Exception as e:
return exit_with_general_critical(e)
@ -1058,7 +1270,7 @@ def check_journaled(con, warning, critical, perf_data):
message += performance_data(perf_data, [("%.2f" % journaled, "journaled", warning, critical)])
return check_levels(journaled, warning, critical, message)
except Exception, e:
except Exception as e:
return exit_with_general_critical(e)
@ -1075,11 +1287,11 @@ than the amount physically written to disk."""
message += performance_data(perf_data, [("%.2f" % writes, "write_to_data_files", warning, critical)])
return check_levels(writes, warning, critical, message)
except Exception, e:
except Exception as e:
return exit_with_general_critical(e)
def get_opcounters(data, opcounters_name, host):
def get_opcounters(data, opcounters_name, host, port):
try:
insert = data[opcounters_name]['insert']
query = data[opcounters_name]['query']
@ -1087,21 +1299,21 @@ def get_opcounters(data, opcounters_name, host):
delete = data[opcounters_name]['delete']
getmore = data[opcounters_name]['getmore']
command = data[opcounters_name]['command']
except KeyError, e:
except KeyError as e:
return 0, [0] * 100
total_commands = insert + query + update + delete + getmore + command
new_vals = [total_commands, insert, query, update, delete, getmore, command]
return maintain_delta(new_vals, host, opcounters_name)
return maintain_delta(new_vals, host, port, opcounters_name)
def check_opcounters(con, host, warning, critical, perf_data):
def check_opcounters(con, host, port, warning, critical, perf_data):
""" A function to get all opcounters delta per minute. In case of a replication - gets the opcounters+opcountersRepl"""
warning = warning or 10000
critical = critical or 15000
data = get_server_status(con)
err1, delta_opcounters = get_opcounters(data, 'opcounters', host)
err2, delta_opcounters_repl = get_opcounters(data, 'opcountersRepl', host)
err1, delta_opcounters = get_opcounters(data, 'opcounters', host, port)
err2, delta_opcounters_repl = get_opcounters(data, 'opcountersRepl', host, port)
if err1 == 0 and err2 == 0:
delta = [(x + y) for x, y in zip(delta_opcounters, delta_opcounters_repl)]
delta[0] = delta_opcounters[0] # only the time delta shouldn't be summarized
@ -1109,14 +1321,14 @@ def check_opcounters(con, host, warning, critical, perf_data):
message = "Test succeeded , old values missing"
message = "Opcounters: total=%d,insert=%d,query=%d,update=%d,delete=%d,getmore=%d,command=%d" % tuple(per_minute_delta)
message += performance_data(perf_data, ([(per_minute_delta[0], "total", warning, critical), (per_minute_delta[1], "insert"),
(per_minute_delta[2], "query"), (per_minute_delta[3], "update"), (per_minute_delta[5], "delete"),
(per_minute_delta[2], "query"), (per_minute_delta[3], "update"), (per_minute_delta[4], "delete"),
(per_minute_delta[5], "getmore"), (per_minute_delta[6], "command")]))
return check_levels(per_minute_delta[0], warning, critical, message)
else:
return exit_with_general_critical("problem reading data from temp file")
def check_current_lock(con, host, warning, critical, perf_data):
def check_current_lock(con, host, port, warning, critical, perf_data):
""" A function to get current lock percentage and not a global one, as check_lock function does"""
warning = warning or 10
critical = critical or 30
@ -1125,7 +1337,7 @@ def check_current_lock(con, host, warning, critical, perf_data):
lockTime = float(data['globalLock']['lockTime'])
totalTime = float(data['globalLock']['totalTime'])
err, delta = maintain_delta([totalTime, lockTime], host, "locktime")
err, delta = maintain_delta([totalTime, lockTime], host, port, "locktime")
if err == 0:
lock_percentage = delta[2] / delta[1] * 100 # lockTime/totalTime*100
message = "Current Lock Percentage: %.2f%%" % lock_percentage
@ -1135,7 +1347,7 @@ def check_current_lock(con, host, warning, critical, perf_data):
return exit_with_general_warning("problem reading data from temp file")
def check_page_faults(con, host, warning, critical, perf_data):
def check_page_faults(con, host, port, warning, critical, perf_data):
""" A function to get page_faults per second from the system"""
warning = warning or 10
critical = critical or 30
@ -1147,7 +1359,7 @@ def check_page_faults(con, host, warning, critical, perf_data):
# page_faults unsupported on the underlying system
return exit_with_general_critical("page_faults unsupported on the underlaying system")
err, delta = maintain_delta([page_faults], host, "page_faults")
err, delta = maintain_delta([page_faults], host, port, "page_faults")
if err == 0:
page_faults_ps = delta[1] / delta[0]
message = "Page faults : %.2f ps" % page_faults_ps
@ -1157,7 +1369,7 @@ def check_page_faults(con, host, warning, critical, perf_data):
return exit_with_general_warning("problem reading data from temp file")
def check_asserts(con, host, warning, critical, perf_data):
def check_asserts(con, host, port, warning, critical, perf_data):
""" A function to get asserts from the system"""
warning = warning or 1
critical = critical or 10
@ -1172,7 +1384,7 @@ def check_asserts(con, host, warning, critical, perf_data):
user = asserts['user']
rollovers = asserts['rollovers']
err, delta = maintain_delta([regular, warning_asserts, msg, user, rollovers], host, "asserts")
err, delta = maintain_delta([regular, warning_asserts, msg, user, rollovers], host, port, "asserts")
if err == 0:
if delta[5] != 0:
@ -1206,7 +1418,7 @@ def get_stored_primary_server_name(db):
return stored_primary_server
def check_replica_primary(con, host, warning, critical, perf_data, replicaset):
def check_replica_primary(con, host, warning, critical, perf_data, replicaset, mongo_version):
""" A function to check if the primary server of a replica set has changed """
if warning is None and critical is None:
warning = 1
@ -1229,7 +1441,10 @@ def check_replica_primary(con, host, warning, critical, perf_data, replicaset):
saved_primary = "None"
if current_primary != saved_primary:
last_primary_server_record = {"server": current_primary}
db.last_primary_server.update({"_id": "last_primary"}, {"$set": last_primary_server_record}, upsert=True, safe=True)
if mongo_version == 2:
db.last_primary_server.update({"_id": "last_primary"}, {"$set": last_primary_server_record}, upsert=True)
else:
db.last_primary_server.update_one({"_id": "last_primary"}, {"$set": last_primary_server_record}, upsert=True)
message = "Primary server has changed from %s to %s" % (saved_primary, current_primary)
primary_status = 1
return check_levels(primary_status, warning, critical, message)
@ -1251,9 +1466,9 @@ def check_page_faults(con, sample_time, warning, critical, perf_data):
try:
#on linux servers only
page_faults = (int(data2['extra_info']['page_faults']) - int(data1['extra_info']['page_faults'])) / sample_time
page_faults = (int(data2['extra_info']['page_faults']) - int(data1['extra_info']['page_faults'])) // sample_time
except KeyError:
print "WARNING - Can't get extra_info.page_faults counter from MongoDB"
print("WARNING - Can't get extra_info.page_faults counter from MongoDB")
sys.exit(1)
message = "Page Faults: %i" % (page_faults)
@ -1261,7 +1476,7 @@ def check_page_faults(con, sample_time, warning, critical, perf_data):
message += performance_data(perf_data, [(page_faults, "page_faults", warning, critical)])
check_levels(page_faults, warning, critical, message)
except Exception, e:
except Exception as e:
exit_with_general_critical(e)
@ -1277,35 +1492,35 @@ def chunks_balance(con, database, collection, warning, critical):
shards = col.distinct("shard")
except:
print "WARNING - Can't get chunks infos from MongoDB"
print("WARNING - Can't get chunks infos from MongoDB")
sys.exit(1)
if nscount == 0:
print "WARNING - Namespace %s is not sharded" % (nsfilter)
print("WARNING - Namespace %s is not sharded" % (nsfilter))
sys.exit(1)
avgchunksnb = nscount / len(shards)
warningnb = avgchunksnb * warning / 100
criticalnb = avgchunksnb * critical / 100
avgchunksnb = nscount // len(shards)
warningnb = avgchunksnb * warning // 100
criticalnb = avgchunksnb * critical // 100
for shard in shards:
delta = abs(avgchunksnb - col.find({"ns": nsfilter, "shard": shard}).count())
message = "Namespace: %s, Shard name: %s, Chunk delta: %i" % (nsfilter, shard, delta)
if delta >= criticalnb and delta > 0:
print "CRITICAL - Chunks not well balanced " + message
print("CRITICAL - Chunks not well balanced " + message)
sys.exit(2)
elif delta >= warningnb and delta > 0:
print "WARNING - Chunks not well balanced " + message
print("WARNING - Chunks not well balanced " + message)
sys.exit(1)
print "OK - Chunks well balanced across shards"
print("OK - Chunks well balanced across shards")
sys.exit(0)
except Exception, e:
except Exception as e:
exit_with_general_critical(e)
print "OK - Chunks well balanced across shards"
print("OK - Chunks well balanced across shards")
sys.exit(0)
@ -1321,7 +1536,7 @@ def check_connect_primary(con, warning, critical, perf_data):
data = con.admin.command(son.SON([('isMaster', 1)]))
if data['ismaster'] == True:
print "OK - This server is primary"
print("OK - This server is primary")
return 0
phost = data['primary'].split(':')[0]
@ -1339,17 +1554,17 @@ def check_connect_primary(con, warning, critical, perf_data):
return check_levels(pconn_time, warning, critical, message)
except Exception, e:
except Exception as e:
return exit_with_general_critical(e)
def check_collection_state(con, database, collection):
    """Check that a collection can be queried.

    Runs find_one() on con[database][collection]; the returned document is
    ignored, only the success of the call matters.

    con        -- connection object, indexable by database name
    database   -- name of the database holding the collection
    collection -- name of the collection to probe

    Returns 0 after printing an OK line when the query succeeds; on any
    exception returns the result of exit_with_general_critical().
    """
    try:
        con[database][collection].find_one()
        # print() call and "except ... as" form: valid on Python 2.6+ and 3
        print("OK - Collection %s.%s is reachable " % (database, collection))
        return 0
    except Exception as e:
        return exit_with_general_critical(e)
@ -1361,14 +1576,18 @@ def check_row_count(con, database, collection, warning, critical, perf_data):
return check_levels(count, warning, critical, message)
except Exception, e:
except Exception as e:
return exit_with_general_critical(e)
def build_file_name(host, port, action):
    """Build the path of the data file used to keep values between two runs.

    host   -- MongoDB host name, used verbatim in the file name
    port   -- MongoDB port; left out of the file name when it equals 27017
    action -- name of the check the data belongs to

    Returns "/tmp/<script>_data/<host>-<action>.data" for port 27017 and
    "/tmp/<script>_data/<host>-<port>-<action>.data" otherwise, where
    <script> is the name of this file without directory and without its
    last extension.
    """
    # Derived from __file__ so it works when run independently and from shell.
    # os.path is used rather than a regex that required a "." in the path:
    # that regex did not match an extension-less script such as
    # "check_mongodb" and the following .group() call raised AttributeError.
    module_name = os.path.splitext(os.path.basename(__file__))[0]
    prefix = "/tmp/" + module_name + "_data/" + host + "-"
    if port == 27017:
        return prefix + action + ".data"
    return prefix + str(port) + "-" + action + ".data"
def ensure_dir(f):
@ -1381,7 +1600,7 @@ def write_values(file_name, string):
f = None
try:
f = open(file_name, 'w')
except IOError, e:
except IOError as e:
#try creating
if (e.errno == 2):
ensure_dir(file_name)
@ -1400,11 +1619,11 @@ def read_values(file_name):
data = f.read()
f.close()
return 0, data
except IOError, e:
except IOError as e:
if (e.errno == 2):
#no previous data
return 1, ''
except Exception, e:
except Exception as e:
return 2, None
@ -1420,8 +1639,8 @@ def calc_delta(old, new):
return 0, delta
def maintain_delta(new_vals, host, action):
file_name = build_file_name(host, action)
def maintain_delta(new_vals, host, port, action):
file_name = build_file_name(host, port, action)
err, data = read_values(file_name)
old_vals = data.split(';')
new_vals = [str(int(time.time()))] + new_vals
@ -1442,8 +1661,8 @@ def replication_get_time_diff(con):
col = 'oplog.$main'
firstc = local[col].find().sort("$natural", 1).limit(1)
lastc = local[col].find().sort("$natural", -1).limit(1)
first = firstc.next()
last = lastc.next()
first = next(firstc)
last = next(lastc)
tfirst = first["ts"]
tlast = last["ts"]
delta = tlast.time - tfirst.time