nagios-nrpe: upgrade check_mongo
gitea/ansible-roles/pipeline/head This commit looks good
Details
gitea/ansible-roles/pipeline/head This commit looks good
Details
This commit is contained in:
parent
6ce3004818
commit
e0ba847e9c
|
@ -17,24 +17,29 @@
|
|||
# - Dag Stockstad <dag.stockstad@gmail.com>
|
||||
# - @Andor on github
|
||||
# - Steven Richards - Captainkrtek on github
|
||||
# - Max Vernimmen
|
||||
# - Max Vernimmen - @mvernimmen-CG / @mvernimmen on github
|
||||
# - Kris Nova - @kris@nivenly.com github.com/kris-nova
|
||||
# - Jan Kantert - firstname@lastname.net
|
||||
#
|
||||
# USAGE
|
||||
#
|
||||
# See the README.md
|
||||
#
|
||||
|
||||
from __future__ import print_function
|
||||
from __future__ import division
|
||||
import sys
|
||||
import time
|
||||
import optparse
|
||||
import textwrap
|
||||
import re
|
||||
import os
|
||||
import numbers
|
||||
import socket
|
||||
|
||||
try:
|
||||
import pymongo
|
||||
except ImportError, e:
|
||||
print e
|
||||
except ImportError as e:
|
||||
print(e)
|
||||
sys.exit(2)
|
||||
|
||||
# As of pymongo v 1.9 the SON API is part of the BSON package, therefore attempt
|
||||
|
@ -78,37 +83,35 @@ def performance_data(perf_data, params):
|
|||
|
||||
|
||||
def numeric_type(param):
    """Return True when *param* can be used as a numeric threshold.

    ``None`` counts as numeric (it means "threshold not set"), as does any
    ``numbers.Real`` instance (int, float, ...).
    """
    return param is None or isinstance(param, numbers.Real)


def check_levels(param, warning, critical, message, ok=None):
    """Compare *param* against the thresholds, print a Nagios line and exit.

    Two modes are supported:

    * numeric thresholds (both pass ``numeric_type``): CRITICAL when
      ``param >= critical``, WARNING when ``param >= warning``, else OK.
      A threshold of ``None`` is treated as "not set" and is never reached,
      instead of raising ``TypeError`` on Python 3.
    * container thresholds: *param* is looked up in *critical*, *warning*
      and *ok* (in that order) and the first match decides the state.

    Every matched state terminates the process through ``sys.exit`` with the
    Nagios code (0 OK, 1 WARNING, 2 CRITICAL).  Only the "unexpected value"
    case returns: it prints a CRITICAL line and returns 2.
    """
    # Avoid a shared mutable default; an omitted ``ok`` matches nothing.
    ok = () if ok is None else ok

    if numeric_type(critical) and numeric_type(warning):
        if critical is not None and param >= critical:
            print("CRITICAL - " + message)
            sys.exit(2)
        if warning is not None and param >= warning:
            print("WARNING - " + message)
            sys.exit(1)
        print("OK - " + message)
        sys.exit(0)

    if param in critical:
        print("CRITICAL - " + message)
        sys.exit(2)

    if param in warning:
        print("WARNING - " + message)
        sys.exit(1)

    if param in ok:
        print("OK - " + message)
        sys.exit(0)

    # Unexpected param value.  %s rather than %d so that a non-numeric value
    # is reported instead of raising TypeError while building the message.
    print("CRITICAL - Unexpected value : %s" % (param,) + "; " + message)
    return 2
|
||||
|
||||
|
||||
|
@ -120,21 +123,32 @@ def get_server_status(con):
|
|||
data = con.admin.command(son.SON([('serverStatus', 1)]))
|
||||
return data
|
||||
|
||||
def split_host_port(string):
    """Split ``"host:port"`` into ``(host, port)``.

    Returns ``(host, int(port))`` when *string* ends in ``:<digits>``.
    Returns ``(string, None)`` unchanged when there is no colon, when the
    text after the last colon is not all digits, or when the host part still
    contains a colon without being bracketed (a bare IPv6 literal such as
    ``"::1"``, where the trailing group is not a port).
    """
    host, sep, port = string.rpartition(':')

    # No separator at all (e.g. "12345") or a non-numeric tail: no port.
    if not sep or not port.isdigit():
        return (string, None)

    # Unbracketed IPv6 literal: the last group is part of the address.
    if ':' in host and not (host.startswith('[') and host.endswith(']')):
        return (string, None)

    return (host, int(port))
|
||||
|
||||
|
||||
def main(argv):
|
||||
p = optparse.OptionParser(conflict_handler="resolve", description="This Nagios plugin checks the health of mongodb.")
|
||||
|
||||
p.add_option('-H', '--host', action='store', type='string', dest='host', default='127.0.0.1', help='The hostname you want to connect to')
|
||||
p.add_option('-P', '--port', action='store', type='int', dest='port', default=27017, help='The port mongodb is runnung on')
|
||||
p.add_option('-h', '--host-to-check', action='store', type='string', dest='host_to_check', default=None, help='The hostname you want to check (if this is different from the host you are connecting)')
|
||||
p.add_option('--rdns-lookup', action='store_true', dest='rdns_lookup', default=False, help='RDNS(PTR) lookup on given host/host-to-check, to convert ip-address to fqdn')
|
||||
p.add_option('-P', '--port', action='store', type='int', dest='port', default=27017, help='The port mongodb is running on')
|
||||
p.add_option('--port-to-check', action='store', type='int', dest='port_to_check', default=None, help='The port you want to check (if this is different from the port you are connecting)')
|
||||
p.add_option('-u', '--user', action='store', type='string', dest='user', default=None, help='The username you want to login as')
|
||||
p.add_option('-p', '--pass', action='store', type='string', dest='passwd', default=None, help='The password you want to use for that user')
|
||||
p.add_option('-W', '--warning', action='store', dest='warning', default=None, help='The warning threshold we want to set')
|
||||
p.add_option('-C', '--critical', action='store', dest='critical', default=None, help='The critical threshold we want to set')
|
||||
p.add_option('-W', '--warning', action='store', dest='warning', default=None, help='The warning threshold you want to set')
|
||||
p.add_option('-C', '--critical', action='store', dest='critical', default=None, help='The critical threshold you want to set')
|
||||
p.add_option('-A', '--action', action='store', type='choice', dest='action', default='connect', help='The action you want to take',
|
||||
choices=['connect', 'connections', 'replication_lag', 'replication_lag_percent', 'replset_state', 'memory', 'memory_mapped', 'lock',
|
||||
'flushing', 'last_flush_time', 'index_miss_ratio', 'databases', 'collections', 'database_size', 'database_indexes', 'collection_indexes', 'collection_size',
|
||||
'queues', 'oplog', 'journal_commits_in_wl', 'write_data_files', 'journaled', 'opcounters', 'current_lock', 'replica_primary', 'page_faults',
|
||||
'asserts', 'queries_per_second', 'page_faults', 'chunks_balance', 'connect_primary', 'collection_state', 'row_count', 'replset_quorum'])
|
||||
'flushing', 'last_flush_time', 'index_miss_ratio', 'databases', 'collections', 'database_size', 'database_indexes', 'collection_documents', 'collection_indexes', 'collection_size',
|
||||
'collection_storageSize', 'queues', 'oplog', 'journal_commits_in_wl', 'write_data_files', 'journaled', 'opcounters', 'current_lock', 'replica_primary',
|
||||
'page_faults', 'asserts', 'queries_per_second', 'page_faults', 'chunks_balance', 'connect_primary', 'collection_state', 'row_count', 'replset_quorum'])
|
||||
p.add_option('--max-lag', action='store_true', dest='max_lag', default=False, help='Get max replication lag (for replication_lag action only)')
|
||||
p.add_option('--mapped-memory', action='store_true', dest='mapped_memory', default=False, help='Get mapped memory instead of resident (if resident memory can not be read)')
|
||||
p.add_option('-D', '--perf-data', action='store_true', dest='perf_data', default=False, help='Enable output of Nagios performance data')
|
||||
|
@ -145,12 +159,28 @@ def main(argv):
|
|||
p.add_option('-q', '--querytype', action='store', dest='query_type', default='query', help='The query type to check [query|insert|update|delete|getmore|command] from queries_per_second')
|
||||
p.add_option('-c', '--collection', action='store', dest='collection', default='admin', help='Specify the collection to check')
|
||||
p.add_option('-T', '--time', action='store', type='int', dest='sample_time', default=1, help='Time used to sample number of pages faults')
|
||||
p.add_option('-M', '--mongoversion', action='store', type='choice', dest='mongo_version', default='2', help='The MongoDB version you are talking with, either 2 or 3',
|
||||
choices=['2','3'])
|
||||
p.add_option('-a', '--authdb', action='store', type='string', dest='authdb', default='admin', help='The database you want to authenticate against')
|
||||
p.add_option('--insecure', action='store_true', dest='insecure', default=False, help="Don't verify SSL/TLS certificates")
|
||||
p.add_option('--ssl-ca-cert-file', action='store', type='string', dest='ssl_ca_cert_file', default=None, help='Path to Certificate Authority file for SSL')
|
||||
p.add_option('-f', '--ssl-cert-file', action='store', type='string', dest='cert_file', default=None, help='Path to PEM encoded key and cert for client authentication')
|
||||
p.add_option('-m','--auth-mechanism', action='store', type='choice', dest='auth_mechanism', default=None, help='Auth mechanism used for auth with mongodb',
|
||||
choices=['MONGODB-X509','SCRAM-SHA-256','SCRAM-SHA-1'])
|
||||
p.add_option('--disable_retry_writes', dest='retry_writes_disabled', default=False, action='callback', callback=optional_arg(True), help='Disable retryWrites feature')
|
||||
|
||||
options, arguments = p.parse_args()
|
||||
host = options.host
|
||||
host_to_check = options.host_to_check if options.host_to_check else options.host
|
||||
rdns_lookup = options.rdns_lookup
|
||||
if (rdns_lookup):
|
||||
host_to_check = socket.getnameinfo((host_to_check, 0), 0)[0]
|
||||
port = options.port
|
||||
port_to_check = options.port_to_check if options.port_to_check else options.port
|
||||
user = options.user
|
||||
passwd = options.passwd
|
||||
authdb = options.authdb
|
||||
|
||||
query_type = options.query_type
|
||||
collection = options.collection
|
||||
sample_time = options.sample_time
|
||||
|
@ -164,9 +194,15 @@ def main(argv):
|
|||
action = options.action
|
||||
perf_data = options.perf_data
|
||||
max_lag = options.max_lag
|
||||
mongo_version = options.mongo_version
|
||||
database = options.database
|
||||
ssl = options.ssl
|
||||
replicaset = options.replicaset
|
||||
insecure = options.insecure
|
||||
ssl_ca_cert_file = options.ssl_ca_cert_file
|
||||
cert_file = options.cert_file
|
||||
auth_mechanism = options.auth_mechanism
|
||||
retry_writes_disabled = options.retry_writes_disabled
|
||||
|
||||
if action == 'replica_primary' and replicaset is None:
|
||||
return "replicaset must be passed in when using replica_primary check"
|
||||
|
@ -177,31 +213,35 @@ def main(argv):
|
|||
# moving the login up here and passing in the connection
|
||||
#
|
||||
start = time.time()
|
||||
err, con = mongo_connect(host, port, ssl, user, passwd, replicaset)
|
||||
err, con = mongo_connect(host, port, ssl, user, passwd, replicaset, authdb, insecure, ssl_ca_cert_file, cert_file, auth_mechanism, retry_writes_disabled=retry_writes_disabled)
|
||||
if err != 0:
|
||||
return err
|
||||
|
||||
# Autodetect mongo-version and force pymongo to let us know if it can connect or not.
|
||||
err, mongo_version = check_version(con)
|
||||
if err != 0:
|
||||
return err
|
||||
|
||||
conn_time = time.time() - start
|
||||
conn_time = round(conn_time, 0)
|
||||
|
||||
if action == "connections":
|
||||
return check_connections(con, warning, critical, perf_data)
|
||||
elif action == "replication_lag":
|
||||
return check_rep_lag(con, host, port, warning, critical, False, perf_data, max_lag, user, passwd)
|
||||
return check_rep_lag(con, host_to_check, port_to_check, rdns_lookup, warning, critical, False, perf_data, max_lag, ssl, user, passwd, replicaset, authdb, insecure, ssl_ca_cert_file, cert_file, auth_mechanism, retry_writes_disabled=retry_writes_disabled)
|
||||
elif action == "replication_lag_percent":
|
||||
return check_rep_lag(con, host, port, warning, critical, True, perf_data, max_lag, user, passwd)
|
||||
return check_rep_lag(con, host_to_check, port_to_check, rdns_lookup, warning, critical, True, perf_data, max_lag, ssl, user, passwd, replicaset, authdb, insecure, ssl_ca_cert_file, cert_file, auth_mechanism, retry_writes_disabled=retry_writes_disabled)
|
||||
elif action == "replset_state":
|
||||
return check_replset_state(con, perf_data, warning, critical)
|
||||
elif action == "memory":
|
||||
return check_memory(con, warning, critical, perf_data, options.mapped_memory)
|
||||
return check_memory(con, warning, critical, perf_data, options.mapped_memory, host)
|
||||
elif action == "memory_mapped":
|
||||
return check_memory_mapped(con, warning, critical, perf_data)
|
||||
elif action == "queues":
|
||||
return check_queues(con, warning, critical, perf_data)
|
||||
elif action == "lock":
|
||||
return check_lock(con, warning, critical, perf_data)
|
||||
return check_lock(con, warning, critical, perf_data, mongo_version)
|
||||
elif action == "current_lock":
|
||||
return check_current_lock(con, host, warning, critical, perf_data)
|
||||
return check_current_lock(con, host, port, warning, critical, perf_data)
|
||||
elif action == "flushing":
|
||||
return check_flushing(con, warning, critical, True, perf_data)
|
||||
elif action == "last_flush_time":
|
||||
|
@ -223,22 +263,26 @@ def main(argv):
|
|||
return check_database_size(con, database, warning, critical, perf_data)
|
||||
elif action == "database_indexes":
|
||||
return check_database_indexes(con, database, warning, critical, perf_data)
|
||||
elif action == "collection_documents":
|
||||
return check_collection_documents(con, database, collection, warning, critical, perf_data)
|
||||
elif action == "collection_indexes":
|
||||
return check_collection_indexes(con, database, collection, warning, critical, perf_data)
|
||||
elif action == "collection_size":
|
||||
return check_collection_size(con, database, collection, warning, critical, perf_data)
|
||||
elif action == "collection_storageSize":
|
||||
return check_collection_storageSize(con, database, collection, warning, critical, perf_data)
|
||||
elif action == "journaled":
|
||||
return check_journaled(con, warning, critical, perf_data)
|
||||
elif action == "write_data_files":
|
||||
return check_write_to_datafiles(con, warning, critical, perf_data)
|
||||
elif action == "opcounters":
|
||||
return check_opcounters(con, host, warning, critical, perf_data)
|
||||
return check_opcounters(con, host, port, warning, critical, perf_data)
|
||||
elif action == "asserts":
|
||||
return check_asserts(con, host, warning, critical, perf_data)
|
||||
return check_asserts(con, host, port, warning, critical, perf_data)
|
||||
elif action == "replica_primary":
|
||||
return check_replica_primary(con, host, warning, critical, perf_data, replicaset)
|
||||
return check_replica_primary(con, host, warning, critical, perf_data, replicaset, mongo_version)
|
||||
elif action == "queries_per_second":
|
||||
return check_queries_per_second(con, query_type, warning, critical, perf_data)
|
||||
return check_queries_per_second(con, query_type, warning, critical, perf_data, mongo_version)
|
||||
elif action == "page_faults":
|
||||
check_page_faults(con, sample_time, warning, critical, perf_data)
|
||||
elif action == "chunks_balance":
|
||||
|
@ -255,30 +299,73 @@ def main(argv):
|
|||
return check_connect(host, port, warning, critical, perf_data, user, passwd, conn_time)
|
||||
|
||||
|
||||
def mongo_connect(host=None, port=None, ssl=False, user=None, passwd=None, replica=None, authdb="admin", insecure=False, ssl_ca_cert_file=None, ssl_cert=None, auth_mechanism=None, retry_writes_disabled=False):
    """Open a connection to MongoDB and authenticate if credentials are given.

    Parameters:
        host, port: address handed to the pymongo client.
        ssl: when truthy, TLS options are added to the client arguments.
        user, passwd: credentials; password authentication against *authdb*
            only happens when both are set.
        replica: replica set name; when set, the client is created with
            ``replicaSet=replica`` and a SECONDARY read preference.
        authdb: database used for user/password authentication.
        insecure: with *ssl*, use ``CERT_NONE`` instead of ``CERT_REQUIRED``.
        ssl_ca_cert_file, ssl_cert: optional CA bundle / client certificate.
        auth_mechanism: one of 'SCRAM-SHA-256', 'SCRAM-SHA-1', 'MONGODB-X509';
            only used when *ssl_cert*, *ssl_ca_cert_file* and *user* are set.
        retry_writes_disabled: when truthy, pass ``retryWrites=False``.

    Returns:
        ``(0, connection)`` on success, or
        ``(exit_with_general_critical(e), None)`` when an exception occurs.

    Exits the process directly (``sys.exit``) when the "ismaster" command
    fails with ConnectionFailure (2), when the server is an arbiter (0), or
    when user/password authentication raises PyMongoError.

    NOTE(review): ``Database.authenticate`` and the ``ssl_*`` keyword names
    used here were removed in PyMongo 4 -- confirm the deployed driver.
    """
    from pymongo.errors import ConnectionFailure
    from pymongo.errors import PyMongoError
    # The ``ssl`` parameter shadows the stdlib module name, hence the alias.
    import ssl as SSL

    con_args = dict()

    if ssl:
        con_args['ssl_cert_reqs'] = SSL.CERT_NONE if insecure else SSL.CERT_REQUIRED
        con_args['ssl'] = ssl
        if ssl_ca_cert_file:
            con_args['ssl_ca_certs'] = ssl_ca_cert_file
        if ssl_cert:
            con_args['ssl_certfile'] = ssl_cert

    if retry_writes_disabled:
        con_args['retryWrites'] = False

    try:
        # Compare the driver version numerically; comparing the raw strings
        # would order e.g. "10.0" before "2.3".
        found = re.match(r'(\d+)(?:\.(\d+))?', pymongo.version)
        driver_version = tuple(int(part or 0) for part in found.groups()) if found else (0, 0)

        # ssl connection for pymongo >= 2.3
        if driver_version >= (2, 3):
            if replica is None:
                con = pymongo.MongoClient(host, port, **con_args)
            else:
                con = pymongo.MongoClient(host, port, read_preference=pymongo.ReadPreference.SECONDARY, replicaSet=replica, **con_args)
        else:
            # Legacy drivers: same call whether or not a replica set is named.
            con = pymongo.Connection(host, port, slave_okay=True, network_timeout=10)

        # we must authenticate the connection, otherwise we won't be able to perform certain operations
        # NOTE(review): "the_database" is used literally as the database
        # name here -- confirm this is the intended authentication database.
        if ssl_cert and ssl_ca_cert_file and user and auth_mechanism in ('SCRAM-SHA-256', 'SCRAM-SHA-1', 'MONGODB-X509'):
            con.the_database.authenticate(user, mechanism=auth_mechanism)

        try:
            result = con.admin.command("ismaster")
        except ConnectionFailure:
            print("CRITICAL - Connection to Mongo server on %s:%s has failed" % (host, port) )
            sys.exit(2)

        # An arbiter holds no data; being reachable is all there is to check.
        if 'arbiterOnly' in result and result['arbiterOnly'] == True:
            print("OK - State: 7 (Arbiter on port %s)" % (port))
            sys.exit(0)

        if user and passwd:
            db = con[authdb]
            try:
                db.authenticate(user, password=passwd)
            except PyMongoError:
                sys.exit("Username/Password incorrect")

        # Ping to check that the server is responding.
        con.admin.command("ping")

    except Exception as e:
        if isinstance(e, pymongo.errors.AutoReconnect) and str(e).find(" is an arbiter") != -1:
            # We got a pymongo AutoReconnect exception that tells us we connected to an Arbiter Server
            # This means: Arbiter is reachable and can answer requests/votes - this is all we need to know from an arbiter
            print("OK - State: 7 (Arbiter)")
            sys.exit(0)
        return exit_with_general_critical(e), None
    return 0, con
|
||||
|
@ -288,7 +375,7 @@ def exit_with_general_warning(e):
|
|||
if isinstance(e, SystemExit):
|
||||
return e
|
||||
else:
|
||||
print "WARNING - General MongoDB warning:", e
|
||||
print("WARNING - General MongoDB warning:", e)
|
||||
return 1
|
||||
|
||||
|
||||
|
@ -296,19 +383,27 @@ def exit_with_general_critical(e):
|
|||
if isinstance(e, SystemExit):
|
||||
return e
|
||||
else:
|
||||
print "CRITICAL - General MongoDB Error:", e
|
||||
print("CRITICAL - General MongoDB Error:", e)
|
||||
return 2
|
||||
|
||||
|
||||
def set_read_preference(db):
    """Select the SECONDARY read preference on *db* for old pymongo drivers.

    For pymongo >= 2.2 nothing is assigned to *db*; for older drivers
    ``db.read_preference`` is set to ``pymongo.ReadPreference.SECONDARY``.
    """
    # Compare the driver version numerically; comparing the raw strings
    # would order e.g. "10.0" before "2.2".
    found = re.match(r'(\d+)(?:\.(\d+))?', pymongo.version)
    driver_version = tuple(int(part or 0) for part in found.groups()) if found else (0, 0)

    if driver_version >= (2, 2):
        # NOTE(review): this expression is evaluated and discarded, so it
        # does not change *db* -- kept as-is; confirm whether an assignment
        # was intended.
        pymongo.read_preferences.Secondary
    else:
        db.read_preference = pymongo.ReadPreference.SECONDARY
|
||||
|
||||
def check_version(con):
    """Detect the major version of the server behind *con*.

    Calls ``con.server_info()`` and parses the text before the first dot of
    its ``'version'`` entry as an integer.

    Returns:
        ``(0, major_version)`` on success, or
        ``(exit_with_general_critical(e), None)`` when the call or the
        parsing raises.
    """
    try:
        version = con.server_info()['version']
        # Parse inside the try so that a missing or malformed version field
        # is reported as a plugin status rather than as a traceback.
        major = int(version.split('.')[0].strip())
    except Exception as e:
        return exit_with_general_critical(e), None
    return 0, major
|
||||
|
||||
def check_connect(host, port, warning, critical, perf_data, user, passwd, conn_time):
    """Report how long the connection took against the thresholds.

    *warning* and *critical* fall back to 3 and 6 when unset or falsy.
    *host*, *port*, *user* and *passwd* are accepted but not read here.
    The message (plus the output of ``performance_data``) is handed to
    ``check_levels`` together with *conn_time*, and that result is returned.
    """
    warning = warning or 3
    critical = critical or 6
    message = "Connection took %.3f seconds" % conn_time
    message += performance_data(perf_data, [(conn_time, "connection_time", warning, critical)])

    return check_levels(conn_time, warning, critical, message)
|
||||
|
@ -330,13 +425,17 @@ def check_connections(con, warning, critical, perf_data):
|
|||
(available, "available_connections")])
|
||||
return check_levels(used_percent, warning, critical, message)
|
||||
|
||||
except Exception, e:
|
||||
except Exception as e:
|
||||
return exit_with_general_critical(e)
|
||||
|
||||
|
||||
def check_rep_lag(con, host, port, warning, critical, percent, perf_data, max_lag, user, passwd):
|
||||
def check_rep_lag(con, host, port, rdns_lookup, warning, critical, percent, perf_data, max_lag, ssl=False, user=None, passwd=None, replicaset=None, authdb="admin", insecure=None, ssl_ca_cert_file=None, cert_file=None, auth_mechanism=None, retry_writes_disabled=False):
|
||||
# Get mongo to tell us replica set member name when connecting locally
|
||||
if "127.0.0.1" == host:
|
||||
if not "me" in list(con.admin.command("ismaster","1").keys()):
|
||||
print("UNKNOWN - This is not replicated MongoDB")
|
||||
return 3
|
||||
|
||||
host = con.admin.command("ismaster","1")["me"].split(':')[0]
|
||||
|
||||
if percent:
|
||||
|
@ -348,16 +447,15 @@ def check_rep_lag(con, host, port, warning, critical, percent, perf_data, max_la
|
|||
rs_status = {}
|
||||
slaveDelays = {}
|
||||
try:
|
||||
set_read_preference(con.admin)
|
||||
#set_read_preference(con.admin)
|
||||
|
||||
# Get replica set status
|
||||
try:
|
||||
rs_status = con.admin.command("replSetGetStatus")
|
||||
except pymongo.errors.OperationFailure, e:
|
||||
if e.code == None and str(e).find('failed: not running with --replSet"'):
|
||||
print "OK - Not running with replSet"
|
||||
return 0
|
||||
|
||||
except pymongo.errors.OperationFailure as e:
|
||||
if ((e.code == None and str(e).find('failed: not running with --replSet"')) or (e.code == 76 and str(e).find('not running with --replSet"'))):
|
||||
print("UNKNOWN - Not running with replSet")
|
||||
return 3
|
||||
serverVersion = tuple(con.server_info()['version'].split('.'))
|
||||
if serverVersion >= tuple("2.0.0".split(".")):
|
||||
#
|
||||
|
@ -377,24 +475,32 @@ def check_rep_lag(con, host, port, warning, critical, percent, perf_data, max_la
|
|||
for member in rs_status["members"]:
|
||||
if member["stateStr"] == "PRIMARY":
|
||||
primary_node = member
|
||||
if member["name"].split(':')[0] == host and int(member["name"].split(':')[1]) == port:
|
||||
|
||||
# if rdns_lookup is true then lookup both values back to their rdns value so we can compare hostname vs fqdn
|
||||
if rdns_lookup:
|
||||
member_host, member_port = split_host_port(member.get('name'))
|
||||
member_host = "{0}:{1}".format(socket.getnameinfo((member_host, 0), 0)[0], member_port)
|
||||
if member_host == "{0}:{1}".format(socket.getnameinfo((host, 0), 0)[0], port):
|
||||
host_node = member
|
||||
# Exact match
|
||||
elif member.get('name') == "{0}:{1}".format(host, port):
|
||||
host_node = member
|
||||
|
||||
# Check if we're in the middle of an election and don't have a primary
|
||||
if primary_node is None:
|
||||
print "WARNING - No primary defined. In an election?"
|
||||
print("WARNING - No primary defined. In an election?")
|
||||
return 1
|
||||
|
||||
# Check if we failed to find the current host
|
||||
# below should never happen
|
||||
if host_node is None:
|
||||
print "CRITICAL - Unable to find host '" + host + "' in replica set."
|
||||
print("CRITICAL - Unable to find host '" + host + "' in replica set.")
|
||||
return 2
|
||||
|
||||
# Is the specified host the primary?
|
||||
if host_node["stateStr"] == "PRIMARY":
|
||||
if max_lag == False:
|
||||
print "OK - This is the primary."
|
||||
print("OK - This is the primary.")
|
||||
return 0
|
||||
else:
|
||||
#get the maximal replication lag
|
||||
|
@ -407,7 +513,7 @@ def check_rep_lag(con, host, port, warning, critical, percent, perf_data, max_la
|
|||
data = data + member['name'] + " lag=%d;" % replicationLag
|
||||
maximal_lag = max(maximal_lag, replicationLag)
|
||||
if percent:
|
||||
err, con = mongo_connect(primary_node['name'].split(':')[0], int(primary_node['name'].split(':')[1]), False, user, passwd)
|
||||
err, con = mongo_connect(split_host_port(primary_node['name'])[0], int(split_host_port(primary_node['name'])[1]), ssl, user, passwd, replicaset, authdb, insecure, ssl_ca_cert_file, cert_file, auth_mechanism, retry_writes_disabled=retry_writes_disabled)
|
||||
if err != 0:
|
||||
return err
|
||||
primary_timediff = replication_get_time_diff(con)
|
||||
|
@ -419,8 +525,8 @@ def check_rep_lag(con, host, port, warning, critical, percent, perf_data, max_la
|
|||
message += performance_data(perf_data, [(maximal_lag, "replication_lag", warning, critical)])
|
||||
return check_levels(maximal_lag, warning, critical, message)
|
||||
elif host_node["stateStr"] == "ARBITER":
|
||||
print "OK - This is an arbiter"
|
||||
return 0
|
||||
print("UNKNOWN - This is an arbiter")
|
||||
return 3
|
||||
|
||||
# Find the difference in optime between current node and PRIMARY
|
||||
|
||||
|
@ -439,7 +545,7 @@ def check_rep_lag(con, host, port, warning, critical, percent, perf_data, max_la
|
|||
lag = float(optime_lag.seconds + optime_lag.days * 24 * 3600)
|
||||
|
||||
if percent:
|
||||
err, con = mongo_connect(primary_node['name'].split(':')[0], int(primary_node['name'].split(':')[1]), False, user, passwd)
|
||||
err, con = mongo_connect(split_host_port(primary_node['name'])[0], int(split_host_port(primary_node['name'])[1]), ssl, user, passwd, replicaset, authdb, insecure, ssl_ca_cert_file, cert_file, auth_mechanism, retry_writes_disabled=retry_writes_disabled)
|
||||
if err != 0:
|
||||
return err
|
||||
primary_timediff = replication_get_time_diff(con)
|
||||
|
@ -471,19 +577,19 @@ def check_rep_lag(con, host, port, warning, critical, percent, perf_data, max_la
|
|||
|
||||
# Check if we're in the middle of an election and don't have a primary
|
||||
if primary_node is None:
|
||||
print "WARNING - No primary defined. In an election?"
|
||||
print("WARNING - No primary defined. In an election?")
|
||||
sys.exit(1)
|
||||
|
||||
# Is the specified host the primary?
|
||||
if host_node["stateStr"] == "PRIMARY":
|
||||
print "OK - This is the primary."
|
||||
print("OK - This is the primary.")
|
||||
sys.exit(0)
|
||||
|
||||
# Find the difference in optime between current node and PRIMARY
|
||||
optime_lag = abs(primary_node[1] - host_node["optimeDate"])
|
||||
lag = optime_lag.seconds
|
||||
if percent:
|
||||
err, con = mongo_connect(primary_node['name'].split(':')[0], int(primary_node['name'].split(':')[1]))
|
||||
err, con = mongo_connect(split_host_port(primary_node['name'])[0], int(split_host_port(primary_node['name'])[1]), ssl, user, passwd, replicaset, authdb, insecure, ssl_ca_cert_file, cert_file, auth_mechanism, retry_writes_disabled=retry_writes_disabled)
|
||||
if err != 0:
|
||||
return err
|
||||
primary_timediff = replication_get_time_diff(con)
|
||||
|
@ -495,26 +601,34 @@ def check_rep_lag(con, host, port, warning, critical, percent, perf_data, max_la
|
|||
message += performance_data(perf_data, [(lag, "replication_lag", warning, critical)])
|
||||
return check_levels(lag, warning, critical, message)
|
||||
|
||||
except Exception, e:
|
||||
except Exception as e:
|
||||
return exit_with_general_critical(e)
|
||||
|
||||
|
||||
def check_memory(con, warning, critical, perf_data, mapped_memory):
|
||||
#
|
||||
# These thresholds are basically meaningless, and must be customized to your system's ram
|
||||
#
|
||||
|
||||
# Get the total system merory and calculate based on that how much memory used by Mongodb is ok or not.
|
||||
#
|
||||
# Check the memory usage of mongo. Alerting on this may be hard to get right
|
||||
# because it'll try to get as much memory as it can. And that's probably
|
||||
# a good thing.
|
||||
#
|
||||
def check_memory(con, warning, critical, perf_data, mapped_memory, host):
|
||||
# Get the total system memory of this system (This is totally bogus if you
|
||||
# are running this command remotely) and calculate based on that how much
|
||||
# memory used by Mongodb is ok or not.
|
||||
meminfo = open('/proc/meminfo').read()
|
||||
matched = re.search(r'^MemTotal:\s+(\d+)', meminfo)
|
||||
if matched:
|
||||
if matched:
|
||||
mem_total_kB = int(matched.groups()[0])
|
||||
|
||||
# Old way
|
||||
#critical = critical or 16
|
||||
# The new way. if using >80% then warn, if >90% then critical level
|
||||
warning = warning or (mem_total_kB * 0.8) / 1024.0 / 1024.0
|
||||
critical = critical or (mem_total_kB * 0.9) / 1024.0 / 1024.0
|
||||
if host != "127.0.0.1" and not warning:
|
||||
# Running remotely and value was not set by user, use hardcoded value
|
||||
warning = 12
|
||||
else:
|
||||
# running locally or user provided value
|
||||
warning = warning or (mem_total_kB * 0.8) / 1024.0 / 1024.0
|
||||
|
||||
if host != "127.0.0.1" and not critical:
|
||||
critical = 16
|
||||
else:
|
||||
critical = critical or (mem_total_kB * 0.9) / 1024.0 / 1024.0
|
||||
|
||||
# debugging
|
||||
#print "mem total: {0}kb, warn: {1}GB, crit: {2}GB".format(mem_total_kB,warning, critical)
|
||||
|
@ -522,7 +636,7 @@ def check_memory(con, warning, critical, perf_data, mapped_memory):
|
|||
try:
|
||||
data = get_server_status(con)
|
||||
if not data['mem']['supported'] and not mapped_memory:
|
||||
print "OK - Platform not supported for memory info"
|
||||
print("OK - Platform not supported for memory info")
|
||||
return 0
|
||||
#
|
||||
# convert to gigs
|
||||
|
@ -559,7 +673,7 @@ def check_memory(con, warning, critical, perf_data, mapped_memory):
|
|||
else:
|
||||
return check_levels(mem_resident, warning, critical, message)
|
||||
|
||||
except Exception, e:
|
||||
except Exception as e:
|
||||
return exit_with_general_critical(e)
|
||||
|
||||
|
||||
|
@ -572,7 +686,7 @@ def check_memory_mapped(con, warning, critical, perf_data):
|
|||
try:
|
||||
data = get_server_status(con)
|
||||
if not data['mem']['supported']:
|
||||
print "OK - Platform not supported for memory info"
|
||||
print("OK - Platform not supported for memory info")
|
||||
return 0
|
||||
#
|
||||
# convert to gigs
|
||||
|
@ -589,38 +703,45 @@ def check_memory_mapped(con, warning, critical, perf_data):
|
|||
message += " %.2fGB mappedWithJournal" % mem_mapped_journal
|
||||
except:
|
||||
mem_mapped_journal = 0
|
||||
message += performance_data(perf_data, [("%.2f" % mem_mapped, "memory_mapped"), ("%.2f" % mem_mapped_journal, "mappedWithJournal")])
|
||||
message += performance_data(perf_data, [("%.2f" % mem_mapped, "memory_mapped", warning, critical), ("%.2f" % mem_mapped_journal, "mappedWithJournal")])
|
||||
|
||||
if not mem_mapped == -1:
|
||||
return check_levels(mem_mapped, warning, critical, message)
|
||||
else:
|
||||
print "OK - Server does not provide mem.mapped info"
|
||||
print("OK - Server does not provide mem.mapped info")
|
||||
return 0
|
||||
|
||||
except Exception, e:
|
||||
except Exception as e:
|
||||
return exit_with_general_critical(e)
|
||||
|
||||
|
||||
def check_lock(con, warning, critical, perf_data, mongo_version):
    """Check the percentage of time the global lock was held.

    *warning* and *critical* fall back to 10 and 30 when unset or falsy.
    Only evaluated when *mongo_version* equals 2: the percentage is computed
    from ``globalLock.lockTime`` / ``globalLock.totalTime`` of the server
    status and handed to ``check_levels``.  For any other version an OK
    line is printed and 0 is returned.

    Returns the result of ``check_levels``, 0 for other versions, or the
    result of ``exit_with_general_critical`` when reading the data fails.
    """
    warning = warning or 10
    critical = critical or 30

    if mongo_version != 2:
        print("OK - MongoDB version 3 doesn't report on global locks")
        return 0

    try:
        data = get_server_status(con)
        lockTime = data['globalLock']['lockTime']
        totalTime = data['globalLock']['totalTime']
        #
        # calculate percentage
        #
        # A lock time above the total is reported as 0, and so is a zero
        # total: dividing by it would raise and be reported as CRITICAL.
        if lockTime > totalTime or not totalTime:
            lock_percentage = 0.00
        else:
            lock_percentage = float(lockTime) / float(totalTime) * 100
        message = "Lock Percentage: %.2f%%" % lock_percentage
        message += performance_data(perf_data, [("%.2f" % lock_percentage, "lock_percentage", warning, critical)])
        return check_levels(lock_percentage, warning, critical, message)
    except Exception as e:
        print("Couldn't get globalLock lockTime info from mongo, are you sure you're not using version 3? See the -M option.")
        return exit_with_general_critical(e)
|
||||
|
||||
|
||||
def check_flushing(con, warning, critical, avg, perf_data):
|
||||
|
@ -632,19 +753,24 @@ def check_flushing(con, warning, critical, avg, perf_data):
|
|||
critical = critical or 15000
|
||||
try:
|
||||
data = get_server_status(con)
|
||||
if avg:
|
||||
flush_time = float(data['backgroundFlushing']['average_ms'])
|
||||
stat_type = "Average"
|
||||
else:
|
||||
flush_time = float(data['backgroundFlushing']['last_ms'])
|
||||
stat_type = "Last"
|
||||
try:
|
||||
data['backgroundFlushing']
|
||||
if avg:
|
||||
flush_time = float(data['backgroundFlushing']['average_ms'])
|
||||
stat_type = "Average"
|
||||
else:
|
||||
flush_time = float(data['backgroundFlushing']['last_ms'])
|
||||
stat_type = "Last"
|
||||
|
||||
message = "%s Flush Time: %.2fms" % (stat_type, flush_time)
|
||||
message += performance_data(perf_data, [("%.2fms" % flush_time, "%s_flush_time" % stat_type.lower(), warning, critical)])
|
||||
message = "%s Flush Time: %.2fms" % (stat_type, flush_time)
|
||||
message += performance_data(perf_data, [("%.2fms" % flush_time, "%s_flush_time" % stat_type.lower(), warning, critical)])
|
||||
|
||||
return check_levels(flush_time, warning, critical, message)
|
||||
return check_levels(flush_time, warning, critical, message)
|
||||
except Exception:
|
||||
print("OK - flushing stats not available for this storage engine")
|
||||
return 0
|
||||
|
||||
except Exception, e:
|
||||
except Exception as e:
|
||||
return exit_with_general_critical(e)
|
||||
|
||||
|
||||
|
@ -655,6 +781,7 @@ def index_miss_ratio(con, warning, critical, perf_data):
|
|||
data = get_server_status(con)
|
||||
|
||||
try:
|
||||
data['indexCounters']
|
||||
serverVersion = tuple(con.server_info()['version'].split('.'))
|
||||
if serverVersion >= tuple("2.4.0".split(".")):
|
||||
miss_ratio = float(data['indexCounters']['missRatio'])
|
||||
|
@ -662,19 +789,24 @@ def index_miss_ratio(con, warning, critical, perf_data):
|
|||
miss_ratio = float(data['indexCounters']['btree']['missRatio'])
|
||||
except KeyError:
|
||||
not_supported_msg = "not supported on this platform"
|
||||
if data['indexCounters'].has_key('note'):
|
||||
print "OK - MongoDB says: " + not_supported_msg
|
||||
try:
|
||||
data['indexCounters']
|
||||
if 'note' in data['indexCounters']:
|
||||
print("OK - MongoDB says: " + not_supported_msg)
|
||||
return 0
|
||||
else:
|
||||
print("WARNING - Can't get counter from MongoDB")
|
||||
return 1
|
||||
except Exception:
|
||||
print("OK - MongoDB says: " + not_supported_msg)
|
||||
return 0
|
||||
else:
|
||||
print "WARNING - Can't get counter from MongoDB"
|
||||
return 1
|
||||
|
||||
message = "Miss Ratio: %.2f" % miss_ratio
|
||||
message += performance_data(perf_data, [("%.2f" % miss_ratio, "index_miss_ratio", warning, critical)])
|
||||
|
||||
return check_levels(miss_ratio, warning, critical, message)
|
||||
|
||||
except Exception, e:
|
||||
except Exception as e:
|
||||
return exit_with_general_critical(e)
|
||||
|
||||
def check_replset_quorum(con, perf_data):
|
||||
|
@ -698,7 +830,7 @@ def check_replset_quorum(con, perf_data):
|
|||
message = "Cluster is not quorate and cannot operate"
|
||||
|
||||
return check_levels(state, warning, critical, message)
|
||||
except Exception, e:
|
||||
except Exception as e:
|
||||
return exit_with_general_critical(e)
|
||||
|
||||
|
||||
|
@ -713,44 +845,63 @@ def check_replset_state(con, perf_data, warning="", critical=""):
|
|||
except:
|
||||
critical = [8, 4, -1]
|
||||
|
||||
ok = range(-1, 8) # should include the range of all posiible values
|
||||
ok = list(range(-1, 8)) # should include the range of all posiible values
|
||||
try:
|
||||
worst_state = -2
|
||||
message = ""
|
||||
try:
|
||||
try:
|
||||
set_read_preference(con.admin)
|
||||
data = con.admin.command(pymongo.son_manipulator.SON([('replSetGetStatus', 1)]))
|
||||
except:
|
||||
data = con.admin.command(son.SON([('replSetGetStatus', 1)]))
|
||||
state = int(data['myState'])
|
||||
except pymongo.errors.OperationFailure, e:
|
||||
if e.code == None and str(e).find('failed: not running with --replSet"'):
|
||||
state = -1
|
||||
members = data['members']
|
||||
my_state = int(data['myState'])
|
||||
worst_state = my_state
|
||||
for member in members:
|
||||
their_state = int(member['state'])
|
||||
message += " %s: %i (%s)" % (member['name'], their_state, state_text(their_state))
|
||||
if state_is_worse(their_state, worst_state, warning, critical):
|
||||
worst_state = their_state
|
||||
message += performance_data(perf_data, [(my_state, "state")])
|
||||
|
||||
if state == 8:
|
||||
message = "State: %i (Down)" % state
|
||||
elif state == 4:
|
||||
message = "State: %i (Fatal error)" % state
|
||||
elif state == 0:
|
||||
message = "State: %i (Starting up, phase1)" % state
|
||||
elif state == 3:
|
||||
message = "State: %i (Recovering)" % state
|
||||
elif state == 5:
|
||||
message = "State: %i (Starting up, phase2)" % state
|
||||
elif state == 1:
|
||||
message = "State: %i (Primary)" % state
|
||||
elif state == 2:
|
||||
message = "State: %i (Secondary)" % state
|
||||
elif state == 7:
|
||||
message = "State: %i (Arbiter)" % state
|
||||
elif state == -1:
|
||||
message = "Not running with replSet"
|
||||
else:
|
||||
message = "State: %i (Unknown state)" % state
|
||||
message += performance_data(perf_data, [(state, "state")])
|
||||
return check_levels(state, warning, critical, message, ok)
|
||||
except Exception, e:
|
||||
except pymongo.errors.OperationFailure as e:
|
||||
if ((e.code == None and str(e).find('failed: not running with --replSet"')) or (e.code == 76 and str(e).find('not running with --replSet"'))):
|
||||
worst_state = -1
|
||||
|
||||
return check_levels(worst_state, warning, critical, message, ok)
|
||||
except Exception as e:
|
||||
return exit_with_general_critical(e)
|
||||
|
||||
def state_is_worse(state, worst_state, warning, critical):
    """Tell whether ``state`` is more severe than ``worst_state``.

    Severity is ranked critical > warning > anything else, where a state is
    critical/warning when it is contained in the respective collection.

    Args:
        state: candidate replica set state.
        worst_state: most severe state seen so far.
        warning: collection of states treated as warning.
        critical: collection of states treated as critical.

    Returns:
        True if ``state`` ranks strictly higher than ``worst_state``.
    """
    # Nothing can outrank a state that is already critical.
    if worst_state in critical:
        return False

    candidate_is_critical = state in critical
    # A warning-level worst state is only beaten by a critical one.
    if worst_state in warning:
        return candidate_is_critical

    # The worst state so far is harmless: any warning or critical state wins.
    return (state in warning) or candidate_is_critical
|
||||
|
||||
# Human readable labels for replica set member states.  -1 is not a MongoDB
# state: it is the value this plugin uses for "not running with --replSet".
_REPLSET_STATE_TEXT = {
    -1: "Not running with replSet",
    0: "Starting up, phase1",
    1: "Primary",
    2: "Secondary",
    3: "Recovering",
    4: "Fatal error",
    5: "Starting up, phase2",
    7: "Arbiter",
    8: "Down",
    9: "Rollback",
    10: "Removed",
}


def state_text(state):
    """Return the human readable label for a replica set member state.

    Args:
        state: integer member state (or -1 for "no replica set").

    Returns:
        The label for ``state``, or "Unknown state" when the value is not
        in the table (this includes MongoDB's own UNKNOWN state, 6).
    """
    return _REPLSET_STATE_TEXT.get(state, "Unknown state")
|
||||
|
||||
|
||||
def check_databases(con, warning, critical, perf_data=None):
|
||||
try:
|
||||
|
@ -764,7 +915,7 @@ def check_databases(con, warning, critical, perf_data=None):
|
|||
message = "Number of DBs: %.0f" % count
|
||||
message += performance_data(perf_data, [(count, "databases", warning, critical, message)])
|
||||
return check_levels(count, warning, critical, message)
|
||||
except Exception, e:
|
||||
except Exception as e:
|
||||
return exit_with_general_critical(e)
|
||||
|
||||
|
||||
|
@ -786,7 +937,7 @@ def check_collections(con, warning, critical, perf_data=None):
|
|||
message += performance_data(perf_data, [(count, "collections", warning, critical, message)])
|
||||
return check_levels(count, warning, critical, message)
|
||||
|
||||
except Exception, e:
|
||||
except Exception as e:
|
||||
return exit_with_general_critical(e)
|
||||
|
||||
|
||||
|
@ -823,21 +974,21 @@ def check_database_size(con, database, warning, critical, perf_data):
|
|||
try:
|
||||
set_read_preference(con.admin)
|
||||
data = con[database].command('dbstats')
|
||||
storage_size = data['storageSize'] / 1024 / 1024
|
||||
storage_size = data['storageSize'] // 1024 // 1024
|
||||
if perf_data:
|
||||
perfdata += " | database_size=%i;%i;%i" % (storage_size, warning, critical)
|
||||
#perfdata += " database=%s" %(database)
|
||||
|
||||
if storage_size >= critical:
|
||||
print "CRITICAL - Database size: %.0f MB, Database: %s%s" % (storage_size, database, perfdata)
|
||||
print("CRITICAL - Database size: %.0f MB, Database: %s%s" % (storage_size, database, perfdata))
|
||||
return 2
|
||||
elif storage_size >= warning:
|
||||
print "WARNING - Database size: %.0f MB, Database: %s%s" % (storage_size, database, perfdata)
|
||||
print("WARNING - Database size: %.0f MB, Database: %s%s" % (storage_size, database, perfdata))
|
||||
return 1
|
||||
else:
|
||||
print "OK - Database size: %.0f MB, Database: %s%s" % (storage_size, database, perfdata)
|
||||
print("OK - Database size: %.0f MB, Database: %s%s" % (storage_size, database, perfdata))
|
||||
return 0
|
||||
except Exception, e:
|
||||
except Exception as e:
|
||||
return exit_with_general_critical(e)
|
||||
|
||||
|
||||
|
@ -851,20 +1002,42 @@ def check_database_indexes(con, database, warning, critical, perf_data):
|
|||
try:
|
||||
set_read_preference(con.admin)
|
||||
data = con[database].command('dbstats')
|
||||
index_size = data['indexSize'] / 1024 / 1024
|
||||
index_size = data['indexSize'] / 1024 // 1024
|
||||
if perf_data:
|
||||
perfdata += " | database_indexes=%i;%i;%i" % (index_size, warning, critical)
|
||||
|
||||
if index_size >= critical:
|
||||
print "CRITICAL - %s indexSize: %.0f MB %s" % (database, index_size, perfdata)
|
||||
print("CRITICAL - %s indexSize: %.0f MB %s" % (database, index_size, perfdata))
|
||||
return 2
|
||||
elif index_size >= warning:
|
||||
print "WARNING - %s indexSize: %.0f MB %s" % (database, index_size, perfdata)
|
||||
print("WARNING - %s indexSize: %.0f MB %s" % (database, index_size, perfdata))
|
||||
return 1
|
||||
else:
|
||||
print "OK - %s indexSize: %.0f MB %s" % (database, index_size, perfdata)
|
||||
print("OK - %s indexSize: %.0f MB %s" % (database, index_size, perfdata))
|
||||
return 0
|
||||
except Exception, e:
|
||||
except Exception as e:
|
||||
return exit_with_general_critical(e)
|
||||
|
||||
|
||||
def check_collection_documents(con, database, collection, warning, critical, perf_data):
    """Check the document count of a collection against the thresholds.

    The count is read from the ``count`` field of the ``collstats`` command
    run on ``database`` for ``collection``.

    Args:
        con: MongoDB connection, indexable by database name.
        database: name of the database holding the collection.
        collection: name of the collection to inspect.
        warning: document count at or above which WARNING is reported.
        critical: document count at or above which CRITICAL is reported.
        perf_data: when truthy, performance data is appended to the output.

    Returns:
        2, 1 or 0 for CRITICAL, WARNING or OK; on any exception the result
        of ``exit_with_general_critical``.
    """
    perf_suffix = ""
    try:
        set_read_preference(con.admin)
        stats = con[database].command('collstats', collection)
        doc_count = stats['count']
        if perf_data:
            perf_suffix += " | collection_documents=%i;%i;%i" % (doc_count, warning, critical)

        # Critical is tested first so it wins when both thresholds are met.
        if doc_count >= critical:
            status, exit_code = "CRITICAL", 2
        elif doc_count >= warning:
            status, exit_code = "WARNING", 1
        else:
            status, exit_code = "OK", 0

        print("%s - %s.%s documents: %s %s" % (status, database, collection, doc_count, perf_suffix))
        return exit_code
    except Exception as e:
        return exit_with_general_critical(e)
|
||||
|
||||
|
||||
|
@ -883,15 +1056,15 @@ def check_collection_indexes(con, database, collection, warning, critical, perf_
|
|||
perfdata += " | collection_indexes=%i;%i;%i" % (total_index_size, warning, critical)
|
||||
|
||||
if total_index_size >= critical:
|
||||
print "CRITICAL - %s.%s totalIndexSize: %.0f MB %s" % (database, collection, total_index_size, perfdata)
|
||||
print("CRITICAL - %s.%s totalIndexSize: %.0f MB %s" % (database, collection, total_index_size, perfdata))
|
||||
return 2
|
||||
elif total_index_size >= warning:
|
||||
print "WARNING - %s.%s totalIndexSize: %.0f MB %s" % (database, collection, total_index_size, perfdata)
|
||||
print("WARNING - %s.%s totalIndexSize: %.0f MB %s" % (database, collection, total_index_size, perfdata))
|
||||
return 1
|
||||
else:
|
||||
print "OK - %s.%s totalIndexSize: %.0f MB %s" % (database, collection, total_index_size, perfdata)
|
||||
print("OK - %s.%s totalIndexSize: %.0f MB %s" % (database, collection, total_index_size, perfdata))
|
||||
return 0
|
||||
except Exception, e:
|
||||
except Exception as e:
|
||||
return exit_with_general_critical(e)
|
||||
|
||||
|
||||
|
@ -908,7 +1081,7 @@ def check_queues(con, warning, critical, perf_data):
|
|||
message += performance_data(perf_data, [(total_queues, "total_queues", warning, critical), (readers_queues, "readers_queues"), (writers_queues, "writers_queues")])
|
||||
return check_levels(total_queues, warning, critical, message)
|
||||
|
||||
except Exception, e:
|
||||
except Exception as e:
|
||||
return exit_with_general_critical(e)
|
||||
|
||||
def check_collection_size(con, database, collection, warning, critical, perf_data):
|
||||
|
@ -923,18 +1096,43 @@ def check_collection_size(con, database, collection, warning, critical, perf_dat
|
|||
perfdata += " | collection_size=%i;%i;%i" % (size, warning, critical)
|
||||
|
||||
if size >= critical:
|
||||
print "CRITICAL - %s.%s size: %.0f MB %s" % (database, collection, size, perfdata)
|
||||
print("CRITICAL - %s.%s size: %.0f MB %s" % (database, collection, size, perfdata))
|
||||
return 2
|
||||
elif size >= warning:
|
||||
print "WARNING - %s.%s size: %.0f MB %s" % (database, collection, size, perfdata)
|
||||
print("WARNING - %s.%s size: %.0f MB %s" % (database, collection, size, perfdata))
|
||||
return 1
|
||||
else:
|
||||
print "OK - %s.%s size: %.0f MB %s" % (database, collection, size, perfdata)
|
||||
print("OK - %s.%s size: %.0f MB %s" % (database, collection, size, perfdata))
|
||||
return 0
|
||||
except Exception, e:
|
||||
except Exception as e:
|
||||
return exit_with_general_critical(e)
|
||||
|
||||
def check_queries_per_second(con, query_type, warning, critical, perf_data):
|
||||
|
||||
def check_collection_storageSize(con, database, collection, warning, critical, perf_data):
    """Check the storage size of a collection against the thresholds.

    The ``storageSize`` field of the ``collstats`` command is divided by
    1024 twice and the result is reported as MB.

    Args:
        con: MongoDB connection, indexable by database name.
        database: name of the database holding the collection.
        collection: name of the collection to inspect.
        warning: size at or above which WARNING is reported; falsy
            values become 100.
        critical: size at or above which CRITICAL is reported; falsy
            values become 1000.
        perf_data: when truthy, performance data is appended to the output.

    Returns:
        2, 1 or 0 for CRITICAL, WARNING or OK; on any exception the result
        of ``exit_with_general_critical``.
    """
    warning = warning or 100
    critical = critical or 1000
    perf_suffix = ""
    try:
        set_read_preference(con.admin)
        collstats = con[database].command('collstats', collection)
        size_mb = collstats['storageSize'] / 1024 / 1024
        if perf_data:
            perf_suffix += " | collection_storageSize=%i;%i;%i" % (size_mb, warning, critical)

        report = "%s.%s storageSize: %.0f MB %s" % (database, collection, size_mb, perf_suffix)
        # Critical is tested first so it wins when both thresholds are met.
        if size_mb >= critical:
            print("CRITICAL - " + report)
            return 2
        if size_mb >= warning:
            print("WARNING - " + report)
            return 1
        print("OK - " + report)
        return 0
    except Exception as e:
        return exit_with_general_critical(e)
|
||||
|
||||
|
||||
def check_queries_per_second(con, query_type, warning, critical, perf_data, mongo_version):
|
||||
warning = warning or 250
|
||||
critical = critical or 500
|
||||
|
||||
|
@ -955,10 +1153,17 @@ def check_queries_per_second(con, query_type, warning, critical, perf_data):
|
|||
diff_query = num - last_count['data'][query_type]['count']
|
||||
diff_ts = ts - last_count['data'][query_type]['ts']
|
||||
|
||||
if diff_ts == 0:
|
||||
message = "diff_query = " + str(diff_query) + " diff_ts = " + str(diff_ts)
|
||||
return check_levels(0, warning, critical, message)
|
||||
|
||||
query_per_sec = float(diff_query) / float(diff_ts)
|
||||
|
||||
# update the count now
|
||||
db.nagios_check.update({u'_id': last_count['_id']}, {'$set': {"data.%s" % query_type: {'count': num, 'ts': int(time.time())}}})
|
||||
if mongo_version == 2:
|
||||
db.nagios_check.update({u'_id': last_count['_id']}, {'$set': {"data.%s" % query_type: {'count': num, 'ts': int(time.time())}}})
|
||||
else:
|
||||
db.nagios_check.update_one({u'_id': last_count['_id']}, {'$set': {"data.%s" % query_type: {'count': num, 'ts': int(time.time())}}})
|
||||
|
||||
message = "Queries / Sec: %f" % query_per_sec
|
||||
message += performance_data(perf_data, [(query_per_sec, "%s_per_sec" % query_type, warning, critical, message)])
|
||||
|
@ -967,17 +1172,24 @@ def check_queries_per_second(con, query_type, warning, critical, perf_data):
|
|||
# since it is the first run insert it
|
||||
query_per_sec = 0
|
||||
message = "First run of check.. no data"
|
||||
db.nagios_check.update({u'_id': last_count['_id']}, {'$set': {"data.%s" % query_type: {'count': num, 'ts': int(time.time())}}})
|
||||
if mongo_version == 2:
|
||||
db.nagios_check.update({u'_id': last_count['_id']}, {'$set': {"data.%s" % query_type: {'count': num, 'ts': int(time.time())}}})
|
||||
else:
|
||||
db.nagios_check.update_one({u'_id': last_count['_id']}, {'$set': {"data.%s" % query_type: {'count': num, 'ts': int(time.time())}}})
|
||||
|
||||
except TypeError:
|
||||
#
|
||||
# since it is the first run insert it
|
||||
query_per_sec = 0
|
||||
message = "First run of check.. no data"
|
||||
db.nagios_check.insert({'check': 'query_counts', 'data': {query_type: {'count': num, 'ts': int(time.time())}}})
|
||||
if mongo_version == 2:
|
||||
db.nagios_check.insert({'check': 'query_counts', 'data': {query_type: {'count': num, 'ts': int(time.time())}}})
|
||||
else:
|
||||
db.nagios_check.insert_one({'check': 'query_counts', 'data': {query_type: {'count': num, 'ts': int(time.time())}}})
|
||||
|
||||
return check_levels(query_per_sec, warning, critical, message)
|
||||
|
||||
except Exception, e:
|
||||
except Exception as e:
|
||||
return exit_with_general_critical(e)
|
||||
|
||||
|
||||
|
@ -1024,7 +1236,7 @@ def check_oplog(con, warning, critical, perf_data):
|
|||
message += performance_data(perf_data, [("%.2f" % hours_in_oplog, 'oplog_time', warning, critical), ("%.2f " % approx_level, 'oplog_time_100_percent_used')])
|
||||
return check_levels(-approx_level, -warning, -critical, message)
|
||||
|
||||
except Exception, e:
|
||||
except Exception as e:
|
||||
return exit_with_general_critical(e)
|
||||
|
||||
|
||||
|
@ -1042,7 +1254,7 @@ Under very high write situations it is normal for this value to be nonzero. """
|
|||
message += performance_data(perf_data, [(j_commits_in_wl, "j_commits_in_wl", warning, critical)])
|
||||
return check_levels(j_commits_in_wl, warning, critical, message)
|
||||
|
||||
except Exception, e:
|
||||
except Exception as e:
|
||||
return exit_with_general_critical(e)
|
||||
|
||||
|
||||
|
@ -1058,7 +1270,7 @@ def check_journaled(con, warning, critical, perf_data):
|
|||
message += performance_data(perf_data, [("%.2f" % journaled, "journaled", warning, critical)])
|
||||
return check_levels(journaled, warning, critical, message)
|
||||
|
||||
except Exception, e:
|
||||
except Exception as e:
|
||||
return exit_with_general_critical(e)
|
||||
|
||||
|
||||
|
@ -1075,11 +1287,11 @@ than the amount physically written to disk."""
|
|||
message += performance_data(perf_data, [("%.2f" % writes, "write_to_data_files", warning, critical)])
|
||||
return check_levels(writes, warning, critical, message)
|
||||
|
||||
except Exception, e:
|
||||
except Exception as e:
|
||||
return exit_with_general_critical(e)
|
||||
|
||||
|
||||
def get_opcounters(data, opcounters_name, host):
|
||||
def get_opcounters(data, opcounters_name, host, port):
|
||||
try:
|
||||
insert = data[opcounters_name]['insert']
|
||||
query = data[opcounters_name]['query']
|
||||
|
@ -1087,21 +1299,21 @@ def get_opcounters(data, opcounters_name, host):
|
|||
delete = data[opcounters_name]['delete']
|
||||
getmore = data[opcounters_name]['getmore']
|
||||
command = data[opcounters_name]['command']
|
||||
except KeyError, e:
|
||||
except KeyError as e:
|
||||
return 0, [0] * 100
|
||||
total_commands = insert + query + update + delete + getmore + command
|
||||
new_vals = [total_commands, insert, query, update, delete, getmore, command]
|
||||
return maintain_delta(new_vals, host, opcounters_name)
|
||||
return maintain_delta(new_vals, host, port, opcounters_name)
|
||||
|
||||
|
||||
def check_opcounters(con, host, warning, critical, perf_data):
|
||||
def check_opcounters(con, host, port, warning, critical, perf_data):
|
||||
""" A function to get all opcounters delta per minute. In case of a replication - gets the opcounters+opcountersRepl"""
|
||||
warning = warning or 10000
|
||||
critical = critical or 15000
|
||||
|
||||
data = get_server_status(con)
|
||||
err1, delta_opcounters = get_opcounters(data, 'opcounters', host)
|
||||
err2, delta_opcounters_repl = get_opcounters(data, 'opcountersRepl', host)
|
||||
err1, delta_opcounters = get_opcounters(data, 'opcounters', host, port)
|
||||
err2, delta_opcounters_repl = get_opcounters(data, 'opcountersRepl', host, port)
|
||||
if err1 == 0 and err2 == 0:
|
||||
delta = [(x + y) for x, y in zip(delta_opcounters, delta_opcounters_repl)]
|
||||
delta[0] = delta_opcounters[0] # only the time delta shouldn't be summarized
|
||||
|
@ -1109,14 +1321,14 @@ def check_opcounters(con, host, warning, critical, perf_data):
|
|||
message = "Test succeeded , old values missing"
|
||||
message = "Opcounters: total=%d,insert=%d,query=%d,update=%d,delete=%d,getmore=%d,command=%d" % tuple(per_minute_delta)
|
||||
message += performance_data(perf_data, ([(per_minute_delta[0], "total", warning, critical), (per_minute_delta[1], "insert"),
|
||||
(per_minute_delta[2], "query"), (per_minute_delta[3], "update"), (per_minute_delta[5], "delete"),
|
||||
(per_minute_delta[2], "query"), (per_minute_delta[3], "update"), (per_minute_delta[4], "delete"),
|
||||
(per_minute_delta[5], "getmore"), (per_minute_delta[6], "command")]))
|
||||
return check_levels(per_minute_delta[0], warning, critical, message)
|
||||
else:
|
||||
return exit_with_general_critical("problem reading data from temp file")
|
||||
|
||||
|
||||
def check_current_lock(con, host, warning, critical, perf_data):
|
||||
def check_current_lock(con, host, port, warning, critical, perf_data):
|
||||
""" A function to get current lock percentage and not a global one, as check_lock function does"""
|
||||
warning = warning or 10
|
||||
critical = critical or 30
|
||||
|
@ -1125,7 +1337,7 @@ def check_current_lock(con, host, warning, critical, perf_data):
|
|||
lockTime = float(data['globalLock']['lockTime'])
|
||||
totalTime = float(data['globalLock']['totalTime'])
|
||||
|
||||
err, delta = maintain_delta([totalTime, lockTime], host, "locktime")
|
||||
err, delta = maintain_delta([totalTime, lockTime], host, port, "locktime")
|
||||
if err == 0:
|
||||
lock_percentage = delta[2] / delta[1] * 100 # lockTime/totalTime*100
|
||||
message = "Current Lock Percentage: %.2f%%" % lock_percentage
|
||||
|
@ -1135,7 +1347,7 @@ def check_current_lock(con, host, warning, critical, perf_data):
|
|||
return exit_with_general_warning("problem reading data from temp file")
|
||||
|
||||
|
||||
def check_page_faults(con, host, warning, critical, perf_data):
|
||||
def check_page_faults(con, host, port, warning, critical, perf_data):
|
||||
""" A function to get page_faults per second from the system"""
|
||||
warning = warning or 10
|
||||
critical = critical or 30
|
||||
|
@ -1147,7 +1359,7 @@ def check_page_faults(con, host, warning, critical, perf_data):
|
|||
# page_faults unsupported on the underlying system
|
||||
return exit_with_general_critical("page_faults unsupported on the underlaying system")
|
||||
|
||||
err, delta = maintain_delta([page_faults], host, "page_faults")
|
||||
err, delta = maintain_delta([page_faults], host, port, "page_faults")
|
||||
if err == 0:
|
||||
page_faults_ps = delta[1] / delta[0]
|
||||
message = "Page faults : %.2f ps" % page_faults_ps
|
||||
|
@ -1157,7 +1369,7 @@ def check_page_faults(con, host, warning, critical, perf_data):
|
|||
return exit_with_general_warning("problem reading data from temp file")
|
||||
|
||||
|
||||
def check_asserts(con, host, warning, critical, perf_data):
|
||||
def check_asserts(con, host, port, warning, critical, perf_data):
|
||||
""" A function to get asserts from the system"""
|
||||
warning = warning or 1
|
||||
critical = critical or 10
|
||||
|
@ -1172,7 +1384,7 @@ def check_asserts(con, host, warning, critical, perf_data):
|
|||
user = asserts['user']
|
||||
rollovers = asserts['rollovers']
|
||||
|
||||
err, delta = maintain_delta([regular, warning_asserts, msg, user, rollovers], host, "asserts")
|
||||
err, delta = maintain_delta([regular, warning_asserts, msg, user, rollovers], host, port, "asserts")
|
||||
|
||||
if err == 0:
|
||||
if delta[5] != 0:
|
||||
|
@ -1206,7 +1418,7 @@ def get_stored_primary_server_name(db):
|
|||
return stored_primary_server
|
||||
|
||||
|
||||
def check_replica_primary(con, host, warning, critical, perf_data, replicaset):
|
||||
def check_replica_primary(con, host, warning, critical, perf_data, replicaset, mongo_version):
|
||||
""" A function to check if the primary server of a replica set has changed """
|
||||
if warning is None and critical is None:
|
||||
warning = 1
|
||||
|
@ -1229,7 +1441,10 @@ def check_replica_primary(con, host, warning, critical, perf_data, replicaset):
|
|||
saved_primary = "None"
|
||||
if current_primary != saved_primary:
|
||||
last_primary_server_record = {"server": current_primary}
|
||||
db.last_primary_server.update({"_id": "last_primary"}, {"$set": last_primary_server_record}, upsert=True, safe=True)
|
||||
if mongo_version == 2:
|
||||
db.last_primary_server.update({"_id": "last_primary"}, {"$set": last_primary_server_record}, upsert=True)
|
||||
else:
|
||||
db.last_primary_server.update_one({"_id": "last_primary"}, {"$set": last_primary_server_record}, upsert=True)
|
||||
message = "Primary server has changed from %s to %s" % (saved_primary, current_primary)
|
||||
primary_status = 1
|
||||
return check_levels(primary_status, warning, critical, message)
|
||||
|
@ -1251,9 +1466,9 @@ def check_page_faults(con, sample_time, warning, critical, perf_data):
|
|||
|
||||
try:
|
||||
#on linux servers only
|
||||
page_faults = (int(data2['extra_info']['page_faults']) - int(data1['extra_info']['page_faults'])) / sample_time
|
||||
page_faults = (int(data2['extra_info']['page_faults']) - int(data1['extra_info']['page_faults'])) // sample_time
|
||||
except KeyError:
|
||||
print "WARNING - Can't get extra_info.page_faults counter from MongoDB"
|
||||
print("WARNING - Can't get extra_info.page_faults counter from MongoDB")
|
||||
sys.exit(1)
|
||||
|
||||
message = "Page Faults: %i" % (page_faults)
|
||||
|
@ -1261,7 +1476,7 @@ def check_page_faults(con, sample_time, warning, critical, perf_data):
|
|||
message += performance_data(perf_data, [(page_faults, "page_faults", warning, critical)])
|
||||
check_levels(page_faults, warning, critical, message)
|
||||
|
||||
except Exception, e:
|
||||
except Exception as e:
|
||||
exit_with_general_critical(e)
|
||||
|
||||
|
||||
|
@ -1277,35 +1492,35 @@ def chunks_balance(con, database, collection, warning, critical):
|
|||
shards = col.distinct("shard")
|
||||
|
||||
except:
|
||||
print "WARNING - Can't get chunks infos from MongoDB"
|
||||
print("WARNING - Can't get chunks infos from MongoDB")
|
||||
sys.exit(1)
|
||||
|
||||
if nscount == 0:
|
||||
print "WARNING - Namespace %s is not sharded" % (nsfilter)
|
||||
print("WARNING - Namespace %s is not sharded" % (nsfilter))
|
||||
sys.exit(1)
|
||||
|
||||
avgchunksnb = nscount / len(shards)
|
||||
warningnb = avgchunksnb * warning / 100
|
||||
criticalnb = avgchunksnb * critical / 100
|
||||
avgchunksnb = nscount // len(shards)
|
||||
warningnb = avgchunksnb * warning // 100
|
||||
criticalnb = avgchunksnb * critical // 100
|
||||
|
||||
for shard in shards:
|
||||
delta = abs(avgchunksnb - col.find({"ns": nsfilter, "shard": shard}).count())
|
||||
message = "Namespace: %s, Shard name: %s, Chunk delta: %i" % (nsfilter, shard, delta)
|
||||
|
||||
if delta >= criticalnb and delta > 0:
|
||||
print "CRITICAL - Chunks not well balanced " + message
|
||||
print("CRITICAL - Chunks not well balanced " + message)
|
||||
sys.exit(2)
|
||||
elif delta >= warningnb and delta > 0:
|
||||
print "WARNING - Chunks not well balanced " + message
|
||||
print("WARNING - Chunks not well balanced " + message)
|
||||
sys.exit(1)
|
||||
|
||||
print "OK - Chunks well balanced across shards"
|
||||
print("OK - Chunks well balanced across shards")
|
||||
sys.exit(0)
|
||||
|
||||
except Exception, e:
|
||||
except Exception as e:
|
||||
exit_with_general_critical(e)
|
||||
|
||||
print "OK - Chunks well balanced across shards"
|
||||
print("OK - Chunks well balanced across shards")
|
||||
sys.exit(0)
|
||||
|
||||
|
||||
|
@ -1321,7 +1536,7 @@ def check_connect_primary(con, warning, critical, perf_data):
|
|||
data = con.admin.command(son.SON([('isMaster', 1)]))
|
||||
|
||||
if data['ismaster'] == True:
|
||||
print "OK - This server is primary"
|
||||
print("OK - This server is primary")
|
||||
return 0
|
||||
|
||||
phost = data['primary'].split(':')[0]
|
||||
|
@ -1339,17 +1554,17 @@ def check_connect_primary(con, warning, critical, perf_data):
|
|||
|
||||
return check_levels(pconn_time, warning, critical, message)
|
||||
|
||||
except Exception, e:
|
||||
except Exception as e:
|
||||
return exit_with_general_critical(e)
|
||||
|
||||
|
||||
def check_collection_state(con, database, collection):
    """Check that a collection can be queried.

    A single ``find_one()`` is issued against ``con[database][collection]``;
    its result is discarded, only success or failure matters.

    Args:
        con: MongoDB connection, indexable by database name.
        database: name of the database holding the collection.
        collection: name of the collection to probe.

    Returns:
        0 after printing an OK line; on any exception the result of
        ``exit_with_general_critical``.
    """
    try:
        target = con[database][collection]
        target.find_one()
        ok_line = "OK - Collection %s.%s is reachable " % (database, collection)
        print(ok_line)
        return 0
    except Exception as e:
        return exit_with_general_critical(e)
|
||||
|
||||
|
||||
|
@ -1361,14 +1576,18 @@ def check_row_count(con, database, collection, warning, critical, perf_data):
|
|||
|
||||
return check_levels(count, warning, critical, message)
|
||||
|
||||
except Exception, e:
|
||||
except Exception as e:
|
||||
return exit_with_general_critical(e)
|
||||
|
||||
|
||||
def build_file_name(host, port, action):
    """Build the path of the temp file holding previous values for a check.

    The file lives in ``/tmp/<module>_data/`` where ``<module>`` is this
    script's file name without directory and extension.  The port is only
    part of the file name when it is not 27017.

    Args:
        host: host name of the monitored server.
        port: port of the monitored server.
        action: name of the check whose values are stored.

    Returns:
        The path of the data file as a string.
    """
    # Derived from __file__ so it works when run independently and from a
    # shell.  os.path also copes with a script installed without an
    # extension, where the former regex found no match and crashed.
    module_name = os.path.splitext(os.path.basename(__file__))[0]
    data_dir = "/tmp/" + module_name + "_data/"

    if port == 27017:
        return data_dir + host + "-" + action + ".data"
    return data_dir + host + "-" + str(port) + "-" + action + ".data"
|
||||
|
||||
|
||||
def ensure_dir(f):
|
||||
|
@ -1381,7 +1600,7 @@ def write_values(file_name, string):
|
|||
f = None
|
||||
try:
|
||||
f = open(file_name, 'w')
|
||||
except IOError, e:
|
||||
except IOError as e:
|
||||
#try creating
|
||||
if (e.errno == 2):
|
||||
ensure_dir(file_name)
|
||||
|
@ -1400,11 +1619,11 @@ def read_values(file_name):
|
|||
data = f.read()
|
||||
f.close()
|
||||
return 0, data
|
||||
except IOError, e:
|
||||
except IOError as e:
|
||||
if (e.errno == 2):
|
||||
#no previous data
|
||||
return 1, ''
|
||||
except Exception, e:
|
||||
except Exception as e:
|
||||
return 2, None
|
||||
|
||||
|
||||
|
@ -1420,8 +1639,8 @@ def calc_delta(old, new):
|
|||
return 0, delta
|
||||
|
||||
|
||||
def maintain_delta(new_vals, host, action):
|
||||
file_name = build_file_name(host, action)
|
||||
def maintain_delta(new_vals, host, port, action):
|
||||
file_name = build_file_name(host, port, action)
|
||||
err, data = read_values(file_name)
|
||||
old_vals = data.split(';')
|
||||
new_vals = [str(int(time.time()))] + new_vals
|
||||
|
@ -1442,8 +1661,8 @@ def replication_get_time_diff(con):
|
|||
col = 'oplog.$main'
|
||||
firstc = local[col].find().sort("$natural", 1).limit(1)
|
||||
lastc = local[col].find().sort("$natural", -1).limit(1)
|
||||
first = firstc.next()
|
||||
last = lastc.next()
|
||||
first = next(firstc)
|
||||
last = next(lastc)
|
||||
tfirst = first["ts"]
|
||||
tlast = last["ts"]
|
||||
delta = tlast.time - tfirst.time
|
||||
|
|
Loading…
Reference in New Issue