diff --git a/nagios-nrpe/files/plugins/check_ceph_df b/nagios-nrpe/files/plugins/check_ceph_df new file mode 100755 index 00000000..0f798aa1 --- /dev/null +++ b/nagios-nrpe/files/plugins/check_ceph_df @@ -0,0 +1,232 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# Copyright (c) 2013 SWITCH http://www.switch.ch +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from __future__ import print_function +import argparse +import os +import subprocess +import sys + +__version__ = '1.7.1' + +# default ceph values +CEPH_COMMAND = '/usr/bin/ceph' + +# nagios exit code +STATUS_OK = 0 +STATUS_WARNING = 1 +STATUS_ERROR = 2 +STATUS_UNKNOWN = 3 + +def main(): + + # parse args + parser = argparse.ArgumentParser(description="'ceph df' nagios plugin.") + parser.add_argument('-e','--exe', help='ceph executable [%s]' % CEPH_COMMAND) + parser.add_argument('-c','--conf', help='alternative ceph conf file') + parser.add_argument('-m','--monaddress', help='ceph monitor address[:port]') + parser.add_argument('-i','--id', help='ceph client id') + parser.add_argument('-n','--name', help='ceph client name') + parser.add_argument('-k','--keyring', help='ceph client keyring file') + parser.add_argument('-p','--pool', help='ceph pool name') + parser.add_argument('-d','--detail', help="show pool details on warn and critical", action='store_true') + parser.add_argument('-W','--warn', help="warn above this percent RAW USED", type=float) + parser.add_argument('-C','--critical', help="critical 
alert above this percent RAW USED", type=float) + parser.add_argument('-V','--version', help='show version and exit', action='store_true') + args = parser.parse_args() + + # validate args + ceph_exec = args.exe if args.exe else CEPH_COMMAND + if not os.path.exists(ceph_exec): + print("ERROR: ceph executable '%s' doesn't exist" % ceph_exec) + return STATUS_UNKNOWN + + if args.version: + print('version %s' % __version__) + return STATUS_OK + + if args.conf and not os.path.exists(args.conf): + print("ERROR: ceph conf file '%s' doesn't exist" % args.conf) + return STATUS_UNKNOWN + + if args.keyring and not os.path.exists(args.keyring): + print("ERROR: keyring file '%s' doesn't exist" % args.keyring) + return STATUS_UNKNOWN + + if not args.warn or not args.critical or args.warn > args.critical: + print("ERROR: warn and critical level must be set and critical must be greater than warn") + return STATUS_UNKNOWN + + # build command + ceph_df = [ceph_exec] + if args.monaddress: + ceph_df.append('-m') + ceph_df.append(args.monaddress) + if args.conf: + ceph_df.append('-c') + ceph_df.append(args.conf) + if args.id: + ceph_df.append('--id') + ceph_df.append(args.id) + if args.name: + ceph_df.append('--name') + ceph_df.append(args.name) + if args.keyring: + ceph_df.append('--keyring') + ceph_df.append(args.keyring) + ceph_df.append('df') + + #print ceph_df + + # exec command + p = subprocess.Popen(ceph_df,stdout=subprocess.PIPE,stderr=subprocess.PIPE) + output, err = p.communicate() + + # parse output + # print "DEBUG: output:", output + # print "DEBUG: err:", err + if output: + output = output.decode('utf-8') + # parse output + # if detail switch was not set only show global values and compare to warning and critical + # otherwise show space for pools too + result=output.splitlines() + # values for GLOBAL are in 3rd line of output + globalline = result[2] + globalvals = globalline.split() + # Luminous vs Minic output (27.3TiB vs 27.3 TiB) + if len(globalvals) == 7: + gv = [] + 
gv.append("{}{}".format(globalvals[0], globalvals[1])) + gv.append("{}{}".format(globalvals[2], globalvals[3])) + gv.append("{}{}".format(globalvals[4], globalvals[5])) + gv.append(globalvals[6]) + globalvals = gv + #print "XXX: globalvals: {} {}".format(len(globalvals), globalvals) + # Nautilus output + if len(globalvals) == 10: + gv = [] + gv.append("{}{}".format(globalvals[1], globalvals[2])) + gv.append("{}{}".format(globalvals[3], globalvals[4])) + gv.append("{}{}".format(globalvals[5], globalvals[6])) + gv.append(globalvals[9]) + globalvals = gv + #print "XXX: globalvals: {} {}".format(len(globalvals), globalvals) + + # prepare pool values + # pool output starts in line 4 with the bare word POOLS: followed by the output + poollines = result[3:] + + if args.pool: + for line in poollines: + if args.pool in line: + poolvals = line.split() + # Luminous vs Minic output (27.3TiB vs 27.3 TiB) + if len(poolvals) == 8: + pv = [] + pv.append(poolvals[0]) # NAME + pv.append(poolvals[1]) # ID + pv.append("{}{}".format(poolvals[2], poolvals[3])) # USED 27.3 TiB + pv.append(poolvals[4]) # %USED + pv.append("{}{}".format(poolvals[5], poolvals[6])) # MAX AVAIL 27.3 TiB + # pv.append(poolvals[7]) # OBJECTS + poolvals = pv + #print "XXX: poolvals: {} {}".format(len(poolvals), poolvals) + # Nautilus output + if len(poolvals) == 10: + pv = [] + pv.append(poolvals[0]) # NAME + pv.append(poolvals[1]) # ID + pv.append("{}{}".format(poolvals[2], poolvals[3])) # USED 27.3 TiB + pv.append(poolvals[7]) # %USED + pv.append("{}{}".format(poolvals[8], poolvals[9])) # MAX AVAIL 27.3 TiB + # pv.append(poolvals[7]) # OBJECTS, not used + poolvals = pv + #print "XXX: poolvals: {} {}".format(len(poolvals), poolvals) + # Octopus >= v15.2.8 (pgs added to ceph-df) + if len(poolvals) == 11: + pv = [] + pv.append(poolvals[0]) # NAME + pv.append(poolvals[1]) # ID + #pv.append(poolvals[2]) # PGS, not used + pv.append("{}{}".format(poolvals[3], poolvals[4])) # USED 27.3 TiB + pv.append(poolvals[8]) # 
%USED + pv.append("{}{}".format(poolvals[9], poolvals[10])) # MAX AVAIL 27.3 TiB + # pv.append(poolvals[7]) # OBJECTS, not used + poolvals = pv + #print "XXX: poolvals: {} {}".format(len(poolvals), poolvals) + + + pool_used = poolvals[2] + pool_usage_percent = float(poolvals[3]) + pool_available_space = poolvals[4] + # pool_objects = float(poolvals[5]) # not used + + if pool_usage_percent > args.critical: + print('CRITICAL: %s%% usage in Pool \'%s\' is above %s%% (%s used) | Usage=%s%%;%s;%s;;' % (pool_usage_percent, args.pool, args.critical, pool_used, pool_usage_percent, args.warn, args.critical)) + return STATUS_ERROR + if pool_usage_percent > args.warn: + print('WARNING: %s%% usage in Pool \'%s\' is above %s%% (%s used) | Usage=%s%%;%s;%s;;' % (pool_usage_percent, args.pool, args.warn, pool_used, pool_usage_percent, args.warn, args.critical)) + return STATUS_WARNING + else: + print('%s%% usage in Pool \'%s\' | Usage=%s%%;%s;%s;;' % (pool_usage_percent, args.pool, pool_usage_percent, args.warn, args.critical)) + return STATUS_OK + else: + # print 'DEBUG:', globalvals + # finally 4th element contains percentual value + # print 'DEBUG USAGE:', globalvals[3] + global_usage_percent = float(globalvals[3]) + global_available_space = globalvals[1] + global_total_space = globalvals[0] + # print 'DEBUG WARNLEVEL:', args.warn + # print 'DEBUG CRITICALLEVEL:', args.critical + if global_usage_percent > args.critical: + if args.detail: + poollines.insert(0, '\n') + poolout = '\n '.join(poollines) + else: + poolout = '' + print('CRITICAL: global RAW usage of %s%% is above %s%% (%s of %s free)%s | Usage=%s%%;%s;%s;;' % (global_usage_percent, args.critical, global_available_space, global_total_space, poolout, global_usage_percent, args.warn, args.critical)) + return STATUS_ERROR + elif global_usage_percent > args.warn: + if args.detail: + poollines.insert(0, '\n') + poolout = '\n '.join(poollines) + else: + poolout = '' + print('WARNING: global RAW usage of %s%% is above %s%% 
(%s of %s free)%s | Usage=%s%%;%s;%s;;' % (global_usage_percent, args.warn, global_available_space, global_total_space, poolout, global_usage_percent, args.warn, args.critical)) + return STATUS_WARNING + else: + print('RAW usage %s%% | Usage=%s%%;%s;%s;;' % (global_usage_percent, global_usage_percent, args.warn, args.critical)) + return STATUS_OK + + #for + elif err: + # read only first line of error + one_line = err.split('\n')[0] + if '-1 ' in one_line: + idx = one_line.rfind('-1 ') + print('ERROR: %s: %s' % (ceph_exec, one_line[idx+len('-1 '):])) + else: + print(one_line) + + return STATUS_UNKNOWN + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/nagios-nrpe/files/plugins/check_ceph_health b/nagios-nrpe/files/plugins/check_ceph_health new file mode 100755 index 00000000..ede44914 --- /dev/null +++ b/nagios-nrpe/files/plugins/check_ceph_health @@ -0,0 +1,200 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# Copyright (c) 2013-2016 SWITCH http://www.switch.ch +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +from __future__ import print_function +import argparse +import os +import subprocess +import sys +import re +import json + +__version__ = '1.7.0' + +# default ceph values +CEPH_ADM_COMMAND = '/usr/sbin/cephadm' +CEPH_COMMAND = '/usr/bin/ceph' + +# nagios exit code +STATUS_OK = 0 +STATUS_WARNING = 1 +STATUS_ERROR = 2 +STATUS_UNKNOWN = 3 + + +def main(): + + # parse args + parser = argparse.ArgumentParser(description="'ceph health' nagios plugin.") + parser.add_argument('-e','--exe', help='ceph executable [%s]' % CEPH_COMMAND) + parser.add_argument('-A','--admexe', help='cephadm executable [%s]' % CEPH_ADM_COMMAND) + parser.add_argument('--cluster', help='ceph cluster name') + parser.add_argument('-c','--conf', help='alternative ceph conf file') + parser.add_argument('-m','--monaddress', help='ceph monitor address[:port]') + parser.add_argument('-i','--id', help='ceph client id') + parser.add_argument('-n','--name', help='ceph client name') + parser.add_argument('-k','--keyring', help='ceph client keyring file') + parser.add_argument('--check', help='regexp of which check(s) to check (luminous+) ' + "Can be inverted, e.g. 
'^((?!(PG_DEGRADED|OBJECT_MISPLACED)$).)*$'") + parser.add_argument('-w','--whitelist', help='whitelist regexp for ceph health warnings') + parser.add_argument('-d','--detail', help="exec 'ceph health detail'", action='store_true') + parser.add_argument('-V','--version', help='show version and exit', action='store_true') + parser.add_argument('-a','--cephadm', help='uses cephadm to execute the command', action='store_true') + parser.add_argument('-s','--skip-muted', help='skip muted checks', action='store_true') + args = parser.parse_args() + + # validate args + cephadm_exec = args.admexe if args.admexe else CEPH_ADM_COMMAND + ceph_exec = args.exe if args.exe else CEPH_COMMAND + + if args.cephadm: + if not os.path.exists(cephadm_exec): + print("ERROR: cephadm executable '%s' doesn't exist" % cephadm_exec) + return STATUS_UNKNOWN + else: + if not os.path.exists(ceph_exec): + print("ERROR: ceph executable '%s' doesn't exist" % ceph_exec) + return STATUS_UNKNOWN + + if args.version: + print('version %s' % __version__) + return STATUS_OK + + if args.conf and not os.path.exists(args.conf): + print("ERROR: ceph conf file '%s' doesn't exist" % args.conf) + return STATUS_UNKNOWN + + if args.keyring and not os.path.exists(args.keyring): + print("ERROR: keyring file '%s' doesn't exist" % args.keyring) + return STATUS_UNKNOWN + + # build command + ceph_health = [ceph_exec] + + if args.cephadm: + # Prepend the command with the cephadm binary and the shell command + ceph_health = [cephadm_exec, 'shell'] + ceph_health + + if args.monaddress: + ceph_health.append('-m') + ceph_health.append(args.monaddress) + if args.cluster: + ceph_health.append('--cluster') + ceph_health.append(args.cluster) + if args.conf: + ceph_health.append('-c') + ceph_health.append(args.conf) + if args.id: + ceph_health.append('--id') + ceph_health.append(args.id) + if args.name: + ceph_health.append('--name') + ceph_health.append(args.name) + if args.keyring: + ceph_health.append('--keyring') + 
ceph_health.append(args.keyring) + ceph_health.append('health') + if args.detail: + ceph_health.append('detail') + + ceph_health.append('--format') + ceph_health.append('json') + #print(ceph_health) + + # exec command + p = subprocess.Popen(ceph_health,stdout=subprocess.PIPE,stderr=subprocess.PIPE) + output, err = p.communicate() + try: + output = json.loads(output) + except ValueError: + output = dict() + + # parse output + # print "output:", output + #print "err:", err + if output: + ret = STATUS_OK + msg = "" + extended = [] + if 'checks' in output: + #luminous + for check,status in output['checks'].items(): + # skip check if not selected + if args.check and not re.search(args.check, check): + continue + + if args.skip_muted and ('muted' in status and status['muted']): + continue + + check_detail = "%s( %s )" % (check, status['summary']['message']) + + if status["severity"] == "HEALTH_ERR": + extended.append(msg) + msg = "CRITICAL: %s" % check_detail + ret = STATUS_ERROR + continue + + if args.whitelist and re.search(args.whitelist,status['summary']['message']): + continue + + check_msg = "WARNING: %s" % check_detail + if not msg: + msg = check_msg + ret = STATUS_WARNING + else: + extended.append(check_msg) + else: + #pre-luminous + for status in output["summary"]: + if status != "HEALTH_OK": + if status == "HEALTH_ERROR": + msg = "CRITICAL: %s" % status['summary'] + ret = STATUS_ERROR + continue + + if args.whitelist and re.search(args.whitelist,status['summary']): + continue + + if not msg: + msg = "WARNING: %s" % status['summary'] + ret = STATUS_WARNING + else: + extended.append("WARNING: %s" % status['summary']) + + if msg: + print(msg) + else: + print("HEALTH OK") + if extended: print('\n'.join(extended)) + return ret + + + elif err: + # read only first line of error + one_line = err.split('\n')[0] + if '-1 ' in one_line: + idx = one_line.rfind('-1 ') + print('ERROR: %s: %s' % (ceph_exec, one_line[idx+len('-1 '):])) + else: + print(one_line) + + return 
STATUS_UNKNOWN + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/nagios-nrpe/files/plugins/check_ceph_mds b/nagios-nrpe/files/plugins/check_ceph_mds new file mode 100755 index 00000000..4e654c05 --- /dev/null +++ b/nagios-nrpe/files/plugins/check_ceph_mds @@ -0,0 +1,188 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# Copyright (c) 2013 Catalyst IT http://www.catalyst.net.nz +# Copyright (c) 2015 SWITCH http://www.switch.ch +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +from __future__ import print_function +import argparse +import socket +import os +import re +import subprocess +import sys +import json + +__version__ = '1.6.0' + +# default ceph values +CEPH_EXEC = '/usr/bin/ceph' +CEPH_COMMAND = 'mds stat -f json' + +# nagios exit code +STATUS_OK = 0 +STATUS_WARNING = 1 +STATUS_ERROR = 2 +STATUS_UNKNOWN = 3 + +def main(): + # parse args + parser = argparse.ArgumentParser(description="'ceph mds stat' nagios plugin.") + parser.add_argument('-e','--exe', help='ceph executable [%s]' % CEPH_EXEC) + parser.add_argument('-c','--conf', help='alternative ceph conf file') + parser.add_argument('-m','--monaddress', help='ceph monitor to use for queries (address[:port])') + parser.add_argument('-i','--id', help='ceph client id') + parser.add_argument('-k','--keyring', help='ceph client keyring file') + parser.add_argument('-V','--version', help='show version and exit', action='store_true') + parser.add_argument('-n','--name', help='mds daemon name', required=True) + parser.add_argument('-f','--filesystem', help='mds filesystem name', required=True) + args = parser.parse_args() + + if args.version: + print('version %s' % __version__) + return STATUS_OK + + # validate args + ceph_exec = args.exe if args.exe else CEPH_EXEC + if not os.path.exists(ceph_exec): + print("MDS ERROR: ceph executable '%s' doesn't exist" % ceph_exec) + return STATUS_UNKNOWN + + if args.conf and not os.path.exists(args.conf): + print("MDS ERROR: ceph conf file '%s' doesn't exist" % args.conf) + return STATUS_UNKNOWN + + if args.keyring and not os.path.exists(args.keyring): + print("MDS ERROR: keyring file '%s' doesn't exist" % args.keyring) + return STATUS_UNKNOWN + + # build command + ceph_cmd = [ceph_exec] + if args.monaddress: + ceph_cmd.append('-m') + ceph_cmd.append(args.monaddress) + if args.conf: + ceph_cmd.append('-c') + ceph_cmd.append(args.conf) + if args.id: + ceph_cmd.append('--id') + ceph_cmd.append(args.id) + if args.keyring: + 
ceph_cmd.append('--keyring') + ceph_cmd.append(args.keyring) + ceph_cmd.extend(CEPH_COMMAND.split(' ')) + + # exec command + p = subprocess.Popen(ceph_cmd,stdout=subprocess.PIPE,stderr=subprocess.PIPE) + output, err = p.communicate() + + if p.returncode != 0 or not output: + print("MDS ERROR: %s" % err) + return STATUS_ERROR + + # load json output and parse + mds_stat = None + try: + mds_stat = json.loads(output) + except Exception as e: + print("MDS ERROR: could not parse '%s' output: %s: %s" % (CEPH_COMMAND,output,e)) + return STATUS_UNKNOWN + + return check_target_mds(mds_stat, args.filesystem, args.name) + +def check_target_mds(mds_stat, fs_name, name): + # find mds from standby list + standby_mdss = _get_standby_mds(mds_stat) + for mds in standby_mdss: + if mds.get_name() == name: + print("MDS OK: %s" % (mds)) + return STATUS_OK + + # find mds from active list + active_mdss = _get_active_mds(mds_stat, fs_name) + + if active_mdss: + for mds in active_mdss: + if mds.get_name() != name: + continue + # target mds in active list + print("MDS %s: %s" % ("WARN" if mds.is_laggy() else "OK", mds)) + return STATUS_WARNING if mds.is_laggy() else STATUS_OK + + # mds not found + print("MDS ERROR: MDS '%s' is not found (offline?)" % (name)) + return STATUS_ERROR + else: + # fs not found in map, perhaps user input error + print("MDS ERROR: FS '%s' is not found in fsmap" % (fs_name)) + return STATUS_ERROR + +def _get_standby_mds(mds_stat): + mds_array = [] + for mds in mds_stat['fsmap']['standbys']: + name = mds['name'] + state = mds['state'] + laggy_since = mds['laggy_since'] if 'laggy_since' in mds else None + mds_array.append(MDS(name, state)) + + return mds_array + +def _get_active_mds(mds_stat, fs_name): + mds_fs = mds_stat['fsmap']['filesystems'] + + # find filesystem in stat + for i in range(len(mds_fs)): + mdsmap = mds_fs[i]['mdsmap'] + if mdsmap['fs_name'] != fs_name: + continue + # put mds to array + mds_array = [] + infos = 
mds_stat['fsmap']['filesystems'][i]['mdsmap']['info'] + for gid in infos: + name = infos[gid]['name'] + state = infos[gid]['state'] + laggy_since = infos[gid]['laggy_since'] if 'laggy_since' in infos[gid] else None + mds_array.append(MDS(name, state, laggy_since)) + + return mds_array + + # no fs found + return None + +class MDS(object): + def __init__(self, name, state, laggy_since=None): + self.name = name + self.state = state + self.laggy_since = laggy_since + + def get_name(self): + return self.name + + def get_state(self): + return self.state + + def is_laggy(self): + return self.laggy_since is not None + + def __str__(self): + msg = "MDS '%s' is %s" % (self.name, self.state) + if self.laggy_since is not None: + msg += " (laggy or crashed)" + return msg + +# main +if __name__ == "__main__": + sys.exit(main()) diff --git a/nagios-nrpe/files/plugins/check_ceph_mgr b/nagios-nrpe/files/plugins/check_ceph_mgr new file mode 100755 index 00000000..019e4a3f --- /dev/null +++ b/nagios-nrpe/files/plugins/check_ceph_mgr @@ -0,0 +1,188 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# Copyright (c) 2018 SWITCH http://www.switch.ch +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +from __future__ import print_function +import argparse +import os +import subprocess +import sys +import json + +__version__ = '1.0.0' + +# default ceph values +CEPH_EXEC = '/usr/bin/ceph' +CEPH_COMMAND = 'mgr dump -f json' + +CEPH_MGR_DUMP_EXAMPLE = ''' +$ ceph --version +ceph version 12.2.7 (3ec878d1e53e1aeb47a9f619c49d9e7c0aa384d5) luminous (stable) +$ ceph mgr dump -f json|jq . +{ + "epoch": 165, + "active_gid": 248001409, + "active_name": "zhdk0013", + "active_addr": "10.10.10.9:6800/810408", + "available": true, + "standbys": [ + { + "gid": 247991934, + "name": "zhdk0009", + "available_modules": [ + "balancer", + "dashboard", + "influx", + "localpool", + "prometheus", + "restful", + "selftest", + "status", + "zabbix" + ] + }, + { + "gid": 248011196, + "name": "zhdk0025", + "available_modules": [ + "balancer", + "dashboard", + "influx", + "localpool", + "prometheus", + "restful", + "selftest", + "status", + "zabbix" + ] + } + ], + "modules": [ + "balancer", + "restful", + "status" + ], + "available_modules": [ + "balancer", + "dashboard", + "influx", + "localpool", + "prometheus", + "restful", + "selftest", + "status", + "zabbix" + ], + "services": {} +} +''' + +# nagios exit code +STATUS_OK = 0 +STATUS_WARNING = 1 +STATUS_ERROR = 2 +STATUS_UNKNOWN = 3 + + +def main(): + # parse args + parser = argparse.ArgumentParser(description="'ceph mgr dump' nagios plugin.") + parser.add_argument('-e', '--exe', help='ceph executable [%s]' % CEPH_EXEC) + parser.add_argument('-c', '--conf', help='alternative ceph conf file') + parser.add_argument('-m', '--monaddress', help='ceph monitor to use for queries (address[:port])') + parser.add_argument('-i', '--id', help='ceph client id') + parser.add_argument('-n', '--name', help='ceph client name') + parser.add_argument('-k', '--keyring', help='ceph client keyring file') + parser.add_argument('-V', '--version', help='show version and exit', action='store_true') + args = parser.parse_args() + + if args.version: + 
print("version {}".format(__version__)) + return STATUS_OK + + # validate args + ceph_exec = args.exe if args.exe else CEPH_EXEC + if not os.path.exists(ceph_exec): + print("MGR ERROR: ceph executable '{}' doesn't exist".format(ceph_exec)) + return STATUS_UNKNOWN + + if args.conf and not os.path.exists(args.conf): + print("MGR ERROR: ceph conf file '{}' doesn't exist".format(args.conf)) + return STATUS_UNKNOWN + + if args.keyring and not os.path.exists(args.keyring): + print("MGR ERROR: keyring file '{}' doesn't exist".format(args.keyring)) + return STATUS_UNKNOWN + + # build command + ceph_cmd = [ceph_exec] + if args.monaddress: + ceph_cmd.append('-m') + ceph_cmd.append(args.monaddress) + if args.conf: + ceph_cmd.append('-c') + ceph_cmd.append(args.conf) + if args.id: + ceph_cmd.append('--id') + ceph_cmd.append(args.id) + if args.name: + ceph_cmd.append('--name') + ceph_cmd.append(args.name) + if args.keyring: + ceph_cmd.append('--keyring') + ceph_cmd.append(args.keyring) + ceph_cmd.extend(CEPH_COMMAND.split(' ')) + + # exec command + p = subprocess.Popen(ceph_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + output, err = p.communicate() + + if p.returncode != 0 or not output: + print("MGR ERROR: {}".format(err)) + return STATUS_UNKNOWN + + # load json output and parse + mgr_dump = None + try: + mgr_dump = json.loads(output) + except Exception as e: + print("MGR ERROR: could not parse '{}' output: {}: {}".format(ceph_cmd, output, e)) + return STATUS_UNKNOWN + + # check active + if 'active_name' not in mgr_dump: + print("MGR CRITICAL: not active mgr found") + print("JSON: {}".format(json.dumps(mgr_dump))) + return STATUS_ERROR + + active_mgr_name = mgr_dump['active_name'] + # check standby + standby_mgr_names = [] + for standby_mgr in mgr_dump['standbys']: + standby_mgr_names.append(standby_mgr['name']) + + if len(standby_mgr_names) <= 0: + print("MGR WARN: active: {} but no standbys".format(active_mgr_name)) + return STATUS_WARNING + else: + print("MGR OK: 
active: {}, standbys: {}".format(active_mgr_name, + ", ".join(standby_mgr_names))) + return STATUS_OK + +# main +if __name__ == "__main__": + sys.exit(main()) diff --git a/nagios-nrpe/files/plugins/check_ceph_mon b/nagios-nrpe/files/plugins/check_ceph_mon new file mode 100755 index 00000000..db417676 --- /dev/null +++ b/nagios-nrpe/files/plugins/check_ceph_mon @@ -0,0 +1,163 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# Copyright (c) 2013 Catalyst IT http://www.catalyst.net.nz +# Copyright (c) 2015 SWITCH http://www.switch.ch +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +from __future__ import print_function +import argparse +import socket +import os +import re +import subprocess +import sys +import json + +__version__ = '1.5.0' + +# default ceph values +CEPH_EXEC = '/usr/bin/ceph' +CEPH_COMMAND = 'quorum_status' + +# nagios exit code +STATUS_OK = 0 +STATUS_WARNING = 1 +STATUS_ERROR = 2 +STATUS_UNKNOWN = 3 + +## +# ceph quorum_status output example +## +ceph_quorum_status_output_example = '''{ + "quorum_leader_name" : "s0001", + "monmap" : { + "mons" : [ + { + "name" : "s0001", + "addr" : "[2001:620:5ca1:8000::1001]:6789/0", + "rank" : 0 + }, + { + "name" : "s0003", + "addr" : "[2001:620:5ca1:8000::1003]:6789/0", + "rank" : 1 + } + ], + "created" : "2014-12-15 08:28:35.153650", + "epoch" : 2, + "modified" : "2014-12-15 08:28:40.371878", + "fsid" : "22348d2b-b69d-46cc-9a79-ca93cd6bae84" + }, + "quorum_names" : [ + "s0001", + "s0003" + ], + "quorum" : [ + 0, + 1 + ], + "election_epoch" : 24 +}''' + +def main(): + + # parse args + parser = argparse.ArgumentParser(description="'ceph quorum_status' nagios plugin.") + parser.add_argument('-e','--exe', help='ceph executable [%s]' % CEPH_EXEC) + parser.add_argument('-c','--conf', help='alternative ceph conf file') + parser.add_argument('-m','--monaddress', help='ceph monitor to use for queries (address[:port])') + parser.add_argument('-i','--id', help='ceph client id') + parser.add_argument('-k','--keyring', help='ceph client keyring file') + parser.add_argument('-V','--version', help='show version and exit', action='store_true') + parser.add_argument('-I','--monid', help='mon ID to be checked for availability') + args = parser.parse_args() + + if args.version: + print('version %s' % __version__) + return STATUS_OK + + # validate args + ceph_exec = args.exe if args.exe else CEPH_EXEC + if not os.path.exists(ceph_exec): + print("MON ERROR: ceph executable '%s' doesn't exist" % ceph_exec) + return STATUS_UNKNOWN + + if args.conf and not os.path.exists(args.conf): + print("MON ERROR: 
ceph conf file '%s' doesn't exist" % args.conf) + return STATUS_UNKNOWN + + if args.keyring and not os.path.exists(args.keyring): + print("MON ERROR: keyring file '%s' doesn't exist" % args.keyring) + return STATUS_UNKNOWN + + if not args.monid: + print("MON ERROR: no MON ID given, use -I/--monid parameter") + return STATUS_UNKNOWN + + # build command + ceph_cmd = [ceph_exec] + if args.monaddress: + ceph_cmd.append('-m') + ceph_cmd.append(args.monaddress) + if args.conf: + ceph_cmd.append('-c') + ceph_cmd.append(args.conf) + if args.id: + ceph_cmd.append('--id') + ceph_cmd.append(args.id) + if args.keyring: + ceph_cmd.append('--keyring') + ceph_cmd.append(args.keyring) + ceph_cmd.append(CEPH_COMMAND) + + # exec command + p = subprocess.Popen(ceph_cmd,stdout=subprocess.PIPE,stderr=subprocess.PIPE) + output, err = p.communicate() + + if p.returncode != 0 or not output: + print("MON ERROR: %s" % err) + return STATUS_ERROR + + # load json output and parse + quorum_status = False + try: + quorum_status = json.loads(output) + except Exception as e: + print("MON ERROR: could not parse '%s' output: %s: %s" % (CEPH_COMMAND,output,e)) + return STATUS_UNKNOWN + + #print "XXX: quorum_status['quorum_names']:", quorum_status['quorum_names'] + + # do our checks + is_monitor = False + for mon in quorum_status['monmap']['mons']: + if mon['name'] == args.monid: + is_monitor = True + if not is_monitor: + print("MON WARN: mon '%s' is not in monmap: %s" % (args.monid,quorum_status['monmap']['mons'])) + return STATUS_WARNING + + in_quorum = args.monid in quorum_status['quorum_names'] + if in_quorum: + print("MON OK") + return STATUS_OK + else: + print("MON WARN: no MON '%s' found in quorum" % args.monid) + return STATUS_WARNING + +# main +if __name__ == "__main__": + sys.exit(main()) diff --git a/nagios-nrpe/files/plugins/check_ceph_osd b/nagios-nrpe/files/plugins/check_ceph_osd new file mode 100755 index 00000000..88a37488 --- /dev/null +++ b/nagios-nrpe/files/plugins/check_ceph_osd @@ 
__version__ = '1.5.2'

# default ceph values
CEPH_COMMAND = '/usr/bin/ceph'

# nagios exit code
STATUS_OK = 0
STATUS_WARNING = 1
STATUS_ERROR = 2
STATUS_UNKNOWN = 3


def _to_text(data):
    """Return subprocess output as text (undecodable bytes are replaced)."""
    if isinstance(data, bytes):
        return data.decode('utf-8', 'replace')
    return data


def _print_osd_details(up, down_in, down_out, crit):
    """Print the per-state OSD lists followed by the Nagios perfdata line."""
    print("Up OSDs: " + " ".join(up))
    print("Down+In OSDs: " + " ".join(down_in))
    print("Down+Out OSDs: " + " ".join(down_out))
    print("| 'osd_up'=%d 'osd_down_in'=%d;;%d 'osd_down_out'=%d;;%d"
          % (len(up), len(down_in), crit, len(down_out), crit))


def main():
    """Check the up/down state of the OSDs of one host via 'ceph osd dump'.

    Command line options are read from sys.argv.

    Returns the Nagios exit code:
      * STATUS_UNKNOWN - invalid arguments, missing files or unresolvable host
      * STATUS_ERROR   - the ceph command failed, or at least --crit OSDs
                         are down
      * STATUS_WARNING - fewer than --crit OSDs are down, or no matching OSD
                         was found on the host
      * STATUS_OK      - matching OSDs were found and none is counted as down
    """

    # parse args
    parser = argparse.ArgumentParser(description="'ceph osd' nagios plugin.")
    parser.add_argument('-e', '--exe', help='ceph executable [%s]' % CEPH_COMMAND)
    parser.add_argument('-c', '--conf', help='alternative ceph conf file')
    parser.add_argument('-m', '--monaddress', help='ceph monitor address[:port]')
    parser.add_argument('-i', '--id', help='ceph client id')
    parser.add_argument('-k', '--keyring', help='ceph client keyring file')
    parser.add_argument('-V', '--version', help='show version and exit', action='store_true')
    parser.add_argument('-H', '--host', help='osd host', required=True)
    parser.add_argument('-I', '--osdid', help='osd id', required=False)
    parser.add_argument('-C', '--crit', help='Number of failed OSDs to trigger critical (default=2)', type=int, default=2, required=False)
    parser.add_argument('-o', '--out', help='check osds that are set OUT', default=False, action='store_true', required=False)
    args = parser.parse_args()

    # validate args
    ceph_exec = args.exe if args.exe else CEPH_COMMAND
    if not os.path.exists(ceph_exec):
        print("OSD ERROR: ceph executable '%s' doesn't exist" % ceph_exec)
        return STATUS_UNKNOWN

    if args.version:
        print('version %s' % __version__)
        return STATUS_OK

    if args.conf and not os.path.exists(args.conf):
        print("OSD ERROR: ceph conf file '%s' doesn't exist" % args.conf)
        return STATUS_UNKNOWN

    if args.keyring and not os.path.exists(args.keyring):
        print("OSD ERROR: keyring file '%s' doesn't exist" % args.keyring)
        return STATUS_UNKNOWN

    # Without an explicit id every OSD of the host is matched. The value is
    # interpolated into the regular expressions below as-is.
    if not args.osdid:
        args.osdid = '[^ ]*'

    if not args.host:
        print("OSD ERROR: no OSD hostname given")
        return STATUS_UNKNOWN

    # 'ceph osd dump' lists addresses, so the host name is resolved first;
    # IPv6 addresses are bracketed to match the form expected in the dump.
    try:
        addrinfo = socket.getaddrinfo(args.host, None, 0, socket.SOCK_STREAM)
        args.host = addrinfo[0][-1][0]
        if addrinfo[0][0] == socket.AF_INET6:
            args.host = "[%s]" % args.host
    except (socket.error, UnicodeError, IndexError):
        print('OSD ERROR: could not resolve %s' % args.host)
        return STATUS_UNKNOWN

    # build command
    ceph_cmd = [ceph_exec]
    if args.monaddress:
        ceph_cmd.extend(['-m', args.monaddress])
    if args.conf:
        ceph_cmd.extend(['-c', args.conf])
    if args.id:
        ceph_cmd.extend(['--id', args.id])
    if args.keyring:
        ceph_cmd.extend(['--keyring', args.keyring])
    ceph_cmd.extend(['osd', 'dump'])

    # exec command
    p = subprocess.Popen(ceph_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    output, err = p.communicate()
    output = _to_text(output)

    # Judge success by the exit status: text on stderr alone (for example a
    # warning printed by a successful command) is not a failure.
    if p.returncode != 0 or not output:
        print("OSD ERROR: %s" % _to_text(err).strip())
        return STATUS_ERROR

    # The address is matched literally ('.', '[' and ']' are regex
    # metacharacters), so it is escaped as a whole.
    osd_host = re.escape(args.host)

    def find_osds(state):
        """Return the names of this host's OSDs whose dump line matches state."""
        pattern = r"^(osd\.%s) %s.*%s:" % (args.osdid, state, osd_host)
        return re.findall(pattern, output, re.MULTILINE)

    up = find_osds("up")
    down_in = find_osds("down[ ]+in")
    down_out = find_osds("down[ ]+out")
    # OSDs that are down and OUT only count as failures when --out is given.
    down = find_osds("down") if args.out else down_in

    if down:
        critical = len(down) >= args.crit
        print("OSD %s: Down OSD%s on %s: %s" % ('CRITICAL' if critical else 'WARNING', 's' if len(down) > 1 else '', args.host, " ".join(down)))
        _print_osd_details(up, down_in, down_out, args.crit)
        return STATUS_ERROR if critical else STATUS_WARNING

    if up:
        print("OSD OK")
        _print_osd_details(up, down_in, down_out, args.crit)
        return STATUS_OK

    print("OSD WARN: no OSD.%s found on host %s" % (args.osdid, args.host))
    return STATUS_WARNING


if __name__ == "__main__":
    sys.exit(main())
CEPH_COMMAND = '/usr/bin/ceph'

STATUS_OK = 0
STATUS_CRITICAL = 2
STATUS_UNKNOWN = 3


def _to_text(data):
    """Return subprocess output as text (undecodable bytes are replaced)."""
    if isinstance(data, bytes):
        return data.decode('utf-8', 'replace')
    return data


def main():
    """Check the BlueFS DB usage of every OSD that is up on one host.

    The OSDs of the host are taken from 'ceph osd dump'; for each of them
    'ceph daemon <osd> perf dump' is queried and the percentage
    db_used_bytes / db_total_bytes is compared with the --critical threshold.

    Command line options are read from sys.argv.

    Returns the Nagios exit code: STATUS_UNKNOWN for invalid arguments or an
    unresolvable host, STATUS_CRITICAL when a ceph command fails or any OSD
    reaches the threshold, STATUS_OK otherwise.
    """
    parser = argparse.ArgumentParser(description="'ceph osd' nagios plugin.")

    parser.add_argument('-e', '--exe', help='ceph executable [%s]' % CEPH_COMMAND)
    parser.add_argument('-c', '--conf', help='alternative ceph conf file')
    parser.add_argument('-m', '--monaddress', help='ceph monitor address[:port]')
    parser.add_argument('-i', '--id', help='ceph client id')
    parser.add_argument('-k', '--keyring', help='ceph client keyring file')
    parser.add_argument('-H', '--host', help='osd host', required=True)
    # type=float is required: without it a threshold given on the command
    # line stays a string and cannot be compared with the computed percentage.
    parser.add_argument('-C', '--critical', help='critical threshold', type=float, default=60)

    args = parser.parse_args()

    ceph_exec = args.exe if args.exe else CEPH_COMMAND
    if not os.path.exists(ceph_exec):
        print("UNKNOWN: ceph executable '%s' doesn't exist" % ceph_exec)
        return STATUS_UNKNOWN

    if args.conf and not os.path.exists(args.conf):
        print("UNKNOWN: ceph conf file '%s' doesn't exist" % args.conf)
        return STATUS_UNKNOWN

    if args.keyring and not os.path.exists(args.keyring):
        print("UNKNOWN: keyring file '%s' doesn't exist" % args.keyring)
        return STATUS_UNKNOWN

    if not args.host:
        print("UNKNOWN: no OSD hostname given")
        return STATUS_UNKNOWN

    # 'ceph osd dump' lists addresses, so the host name is resolved first;
    # IPv6 addresses are bracketed to match the form expected in the dump.
    try:
        addrinfo = socket.getaddrinfo(args.host, None, 0, socket.SOCK_STREAM)
        args.host = addrinfo[0][-1][0]
        if addrinfo[0][0] == socket.AF_INET6:
            args.host = "[%s]" % args.host
    except Exception:
        print('UNKNOWN: could not resolve %s' % args.host)
        return STATUS_UNKNOWN

    ceph_cmd = [ceph_exec]
    if args.monaddress:
        ceph_cmd.extend(['-m', args.monaddress])
    if args.conf:
        ceph_cmd.extend(['-c', args.conf])
    if args.id:
        ceph_cmd.extend(['--id', args.id])
    if args.keyring:
        ceph_cmd.extend(['--keyring', args.keyring])
    ceph_cmd.extend(['osd', 'dump'])

    p = subprocess.Popen(ceph_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    output, err = p.communicate()

    # Judge success by the exit status rather than by text on stderr.
    if p.returncode != 0 or not output:
        print("CRITICAL: %s" % _to_text(err).strip())
        return STATUS_CRITICAL

    # The pattern below is a text pattern, so the raw bytes must be decoded
    # before matching.
    output = _to_text(output)

    # The address is matched literally, so regex metacharacters are escaped.
    osd_host = re.escape(args.host)
    osds_up = re.findall(r"^(osd\.[^ ]*) up.*%s:" % osd_host, output, re.MULTILINE)

    final_status = STATUS_OK
    lines = []

    for osd in osds_up:
        daemon_ceph_cmd = [ceph_exec, '--format', 'json', 'daemon', osd, 'perf', 'dump']

        p = subprocess.Popen(daemon_ceph_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        output, err = p.communicate()

        if p.returncode != 0 or not output:
            print("CRITICAL: %s" % _to_text(err).strip())
            return STATUS_CRITICAL

        try:
            data = json.loads(_to_text(output))
        except ValueError:
            print("CRITICAL: failed to load json")
            return STATUS_CRITICAL

        bluefs = data.get('bluefs') if isinstance(data, dict) else None
        if not bluefs:
            continue

        db_total_bytes = bluefs.get('db_total_bytes')
        db_used_bytes = bluefs.get('db_used_bytes')
        # A missing or zero total has no meaningful percentage; skip the OSD
        # instead of failing on the division.
        if not db_total_bytes or db_used_bytes is None:
            continue

        perc = float(db_used_bytes) / float(db_total_bytes) * 100
        if perc >= args.critical:
            final_status = STATUS_CRITICAL

        lines.append("%s=%.2f%%" % (osd, perc))

    if final_status == STATUS_OK:
        print("OK: %s" % (' '.join(lines)))
    else:
        print("CRITICAL: %s" % (' '.join(lines)))

    return final_status


if __name__ == "__main__":
    sys.exit(main())
# Semver
__version__ = '1.0.0'

# default ceph values
CEPH_COMMAND = '/usr/bin/ceph'

# nagios exit code
STATUS_OK = 0
STATUS_WARNING = 1
STATUS_ERROR = 2
STATUS_UNKNOWN = 3


def _to_text(data):
    """Return subprocess output as text (undecodable bytes are replaced)."""
    if isinstance(data, bytes):
        return data.decode('utf-8', 'replace')
    return data


def main():
    """Check the per-OSD utilization reported by 'ceph osd df --format=json'.

    Command line options are read from sys.argv.

    Returns the Nagios exit code:
      * STATUS_UNKNOWN - invalid arguments, unparsable output or a failed
                         ceph command
      * STATUS_ERROR   - at least one OSD is at or above --critical
      * STATUS_WARNING - at least one OSD is at or above --warn
      * STATUS_OK      - every OSD is below --warn
    """

    # parse args
    parser = argparse.ArgumentParser(description="'ceph osd df' nagios plugin.")
    parser.add_argument('-e', '--exe', help='ceph executable [%s]' % CEPH_COMMAND)
    parser.add_argument('-c', '--conf', help='alternative ceph conf file')
    parser.add_argument('-m', '--monaddress', help='ceph monitor address[:port]')
    parser.add_argument('-i', '--id', help='ceph client id')
    parser.add_argument('-n', '--name', help='ceph client name')
    parser.add_argument('-k', '--keyring', help='ceph client keyring file')
    parser.add_argument('-W', '--warn', help="warn above this percent USED", type=float)
    parser.add_argument('-C', '--critical', help="critical alert above this percent USED", type=float)
    parser.add_argument('-V', '--version', help='show version and exit', action='store_true')
    args = parser.parse_args()

    # validate args
    ceph_exec = args.exe if args.exe else CEPH_COMMAND
    if not os.path.exists(ceph_exec):
        print("ERROR: ceph executable '%s' doesn't exist" % ceph_exec)
        return STATUS_UNKNOWN

    if args.version:
        print('version %s' % __version__)
        return STATUS_OK

    if args.conf and not os.path.exists(args.conf):
        print("ERROR: ceph conf file '%s' doesn't exist" % args.conf)
        return STATUS_UNKNOWN

    if args.keyring and not os.path.exists(args.keyring):
        print("ERROR: keyring file '%s' doesn't exist" % args.keyring)
        return STATUS_UNKNOWN

    # Compare with None so that an explicit threshold of 0 is accepted.
    if args.warn is None or args.critical is None or args.warn > args.critical:
        print("ERROR: warn and critical level must be set and critical must be greater than warn")
        return STATUS_UNKNOWN

    # build command
    ceph_osd_df = [ceph_exec]
    if args.monaddress:
        ceph_osd_df.extend(['-m', args.monaddress])
    if args.conf:
        ceph_osd_df.extend(['-c', args.conf])
    if args.id:
        ceph_osd_df.extend(['--id', args.id])
    if args.name:
        ceph_osd_df.extend(['--name', args.name])
    if args.keyring:
        ceph_osd_df.extend(['--keyring', args.keyring])
    ceph_osd_df.extend(['osd', 'df', '--format=json'])

    # exec command
    p = subprocess.Popen(ceph_osd_df, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    output, err = p.communicate()

    if output:
        try:
            result = json.loads(_to_text(output))
            nodes_sorted = sorted(result["nodes"], key=itemgetter('utilization', 'id'))

            check_return_value = STATUS_OK
            warn_crit_osds = []

            # Highest utilization first. Every OSD at or above the warn
            # threshold is listed exactly once; the status is the worst
            # level reached by any of them.
            for node in reversed(nodes_sorted):
                utilization = node["utilization"]
                if utilization >= args.critical:
                    check_return_value = STATUS_ERROR
                elif utilization >= args.warn:
                    if check_return_value == STATUS_OK:
                        check_return_value = STATUS_WARNING
                else:
                    continue
                warn_crit_osds.append("{}={:04.2f}".format(node["name"], utilization))
        except Exception:
            # Last-resort boundary: an uncaught traceback would exit with a
            # code that Nagios interprets as WARNING.
            print("ERROR: {}".format(sys.exc_info()[0]))
            return STATUS_UNKNOWN

        if check_return_value == STATUS_WARNING:
            # The precision truncates the OSD list to keep the line bounded.
            print("WARNING: OSD usage above warn threshold: {:.4054}".format(", ".join(warn_crit_osds)))
            return STATUS_WARNING
        if check_return_value == STATUS_ERROR:
            print("CRITICAL: OSD usage above critical or warn threshold: {:.4041}".format(", ".join(warn_crit_osds)))
            return STATUS_ERROR
        print("OK: All OSDs within limits")
        return STATUS_OK

    err_text = _to_text(err)
    if err_text:
        # read only first line of error
        one_line = err_text.split('\n')[0]
        if '-1 ' in one_line:
            # keep only the text after the last '-1 ' marker on the line
            idx = one_line.rfind('-1 ')
            print('ERROR: %s: %s' % (ceph_exec, one_line[idx+len('-1 '):]))
        else:
            print(one_line)
    else:
        print("ERROR: %s: no output" % ceph_exec)

    return STATUS_UNKNOWN


if __name__ == "__main__":
    sys.exit(main())
__version__ = '1.5.1'

# default ceph values
RGW_COMMAND = '/usr/bin/radosgw-admin'

# nagios exit code
STATUS_OK = 0
STATUS_WARNING = 1
STATUS_ERROR = 2
STATUS_UNKNOWN = 3


def _to_text(data):
    """Return subprocess output as text (undecodable bytes are replaced)."""
    if isinstance(data, bytes):
        return data.decode('utf-8', 'replace')
    return data


def main():
    """Report bucket count and usage from 'radosgw-admin bucket stats'.

    Command line options are read from sys.argv. The status line carries the
    total usage as perfdata; with --detail one perfdata entry per bucket is
    appended.

    Returns the Nagios exit code: STATUS_UNKNOWN for invalid arguments or
    unparsable output, STATUS_ERROR when radosgw-admin fails, STATUS_OK
    otherwise.
    """

    # parse args
    parser = argparse.ArgumentParser(description="'radosgw-admin bucket stats' nagios plugin.")
    parser.add_argument('-d', '--detail', help='output perf data for all buckets', action='store_true')
    parser.add_argument('-B', '--byte', help='output perf data in Byte instead of KB', action='store_true')
    parser.add_argument('-e', '--exe', help='radosgw-admin executable [%s]' % RGW_COMMAND)
    parser.add_argument('-c', '--conf', help='alternative ceph conf file')
    parser.add_argument('-i', '--id', help='ceph client id')
    parser.add_argument('-n', '--name', help='ceph client name (type.id)')
    parser.add_argument('-V', '--version', help='show version and exit', action='store_true')
    args = parser.parse_args()

    # validate args
    rgw_exec = args.exe if args.exe else RGW_COMMAND
    if not os.path.exists(rgw_exec):
        print("RGW ERROR: radosgw-admin executable '%s' doesn't exist" % rgw_exec)
        return STATUS_UNKNOWN

    if args.version:
        print('version %s' % __version__)
        return STATUS_OK

    if args.conf and not os.path.exists(args.conf):
        print("RGW ERROR: ceph conf file '%s' doesn't exist" % args.conf)
        return STATUS_UNKNOWN

    # build command
    rgw_cmd = [rgw_exec]
    if args.conf:
        rgw_cmd.extend(['-c', args.conf])
    if args.id:
        rgw_cmd.extend(['--id', args.id])
    if args.name:
        rgw_cmd.extend(['-n', args.name])
    rgw_cmd.extend(['bucket', 'stats'])

    # exec command
    p = subprocess.Popen(rgw_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    output, err = p.communicate()
    output = _to_text(output)

    if p.returncode != 0 or not output:
        print("RGW ERROR: %s :: %s" % (output.strip(), _to_text(err).strip()))
        return STATUS_ERROR

    # An uncaught parse error would end in a traceback and an exit code that
    # Nagios interprets as WARNING; report UNKNOWN explicitly instead.
    try:
        bucket_stats = json.loads(output)
    except ValueError as e:
        print("RGW ERROR: could not parse '%s' output: %s" % (rgw_exec, e))
        return STATUS_UNKNOWN

    # collect (bucket name, usage in KB); non-dict entries are ignored
    buckets = []
    for entry in bucket_stats:
        if not isinstance(entry, dict):
            continue
        usage_dict = entry.get('usage')
        if usage_dict and 'rgw.main' in usage_dict:
            bucket_usage_kb = usage_dict['rgw.main']['size_kb_actual']
        else:
            # buckets without 'rgw.main' usage are counted as 0 KB
            bucket_usage_kb = 0
        buckets.append((entry['bucket'], bucket_usage_kb))
    buckets_total_kb = sum(usage_kb for _, usage_kb in buckets)

    if args.byte:
        status = "RGW OK: {} buckets, {} KB total | /={}B ".format(len(buckets), buckets_total_kb, buckets_total_kb*1024)
    else:
        status = "RGW OK: {} buckets, {} KB total | /={}KB ".format(len(buckets), buckets_total_kb, buckets_total_kb)

    if buckets and args.detail:
        if args.byte:
            status = status + " ".join(["{}={}B".format(name, usage_kb*1024) for name, usage_kb in buckets])
        else:
            status = status + " ".join(["{}={}KB".format(name, usage_kb) for name, usage_kb in buckets])

    print(status)
    return STATUS_OK


if __name__ == "__main__":
    sys.exit(main())
__version__ = '1.7.2'

# nagios exit code
STATUS_OK = 0
STATUS_WARNING = 1
STATUS_CRITICAL = 2
STATUS_UNKNOWN = 3


def main():
    """Report bucket count and usage from the radosgw admin API.

    Command line options are read from sys.argv. The bucket statistics are
    fetched from '<host>/<admin_entry>/bucket?format=json&stats=True' with
    S3 authentication. The status line carries the total usage as perfdata;
    with --detail one perfdata entry per bucket is appended.

    Returns the Nagios exit code: STATUS_UNKNOWN on connection errors or an
    unparsable response, STATUS_WARNING when the API answers with a non-OK
    HTTP status, STATUS_OK otherwise.
    """

    # parse args
    parser = argparse.ArgumentParser(description="'radosgw api bucket stats' nagios plugin.")
    parser.add_argument('-H', '--host', help="Server URL for the radosgw api (example: http://objects.dreamhost.com/)", required=True)
    # store_false: args.insecure is True by default and becomes False when
    # the flag is given, so it can be passed directly as requests' 'verify'.
    parser.add_argument('-k', '--insecure', help="Allow insecure server connections when using SSL", action="store_false")
    parser.add_argument('-e', '--admin_entry', help="The entry point for an admin request URL [default is '%(default)s']", default="admin")
    parser.add_argument('-a', '--access_key', help="S3 access key", required=True)
    parser.add_argument('-s', '--secret_key', help="S3 secret key", required=True)
    parser.add_argument('-d', '--detail', help="output perf data for all buckets", action="store_true")
    parser.add_argument('-b', '--byte', help="output perf data in Byte instead of KB", action="store_true")
    parser.add_argument('-v', '--version', help='show version and exit', action="store_true")
    args = parser.parse_args()

    if args.version:
        print("version {0}".format(__version__))
        return STATUS_OK

    # helpers for default schema
    if not args.host.startswith("http"):
        args.host = "http://{0}".format(args.host)
    # and for request_uri
    if not args.host.endswith("/"):
        args.host = "{0}/".format(args.host)

    url = "{0}{1}/bucket?format=json&stats=True".format(args.host,
                                                        args.admin_entry)

    try:
        # Inversion of condition, when '--insecure' is defined we disable
        # requests warning about certificate hostname mismatch.
        if not args.insecure:
            warnings.filterwarnings('ignore', message='Unverified HTTPS request')

        response = requests.get(url, verify=args.insecure,
                                auth=S3Auth(args.access_key, args.secret_key,
                                            args.host))
    # DNS, connection errors, etc
    except requests.exceptions.RequestException as e:
        print("RGW ERROR: {0}".format(e))
        return STATUS_UNKNOWN

    if response.status_code != requests.codes.ok:
        # no usage caps or wrong admin entry
        print("RGW ERROR [{0}]: {1}".format(response.status_code,
                                            response.content.decode('utf-8', 'replace')))
        return STATUS_WARNING

    # A body that is not valid JSON raises ValueError, which is not
    # guaranteed to be a RequestException; handle it explicitly.
    try:
        bucket_stats = response.json()
    except ValueError as e:
        print("RGW ERROR: could not parse response: {0}".format(e))
        return STATUS_UNKNOWN

    # collect (bucket name, usage in KB); non-dict entries are ignored
    buckets = []
    for entry in bucket_stats:
        if not isinstance(entry, dict):
            continue
        usage_dict = entry.get('usage')
        if usage_dict and 'rgw.main' in usage_dict:
            bucket_usage_kb = usage_dict['rgw.main']['size_kb_actual']
        else:
            # buckets without 'rgw.main' usage are counted as 0 KB
            bucket_usage_kb = 0
        buckets.append((entry['bucket'], bucket_usage_kb))
    buckets_total_kb = sum(usage_kb for _, usage_kb in buckets)

    status = "RGW OK: {0} buckets, {1} KB total | /={2}{3} "

    if args.byte:
        status = status.format(len(buckets), buckets_total_kb, buckets_total_kb*1024, "B")
    else:
        status = status.format(len(buckets), buckets_total_kb, buckets_total_kb, "KB")

    if buckets and args.detail:
        if args.byte:
            status = status + " ".join(["{}={}B".format(name, usage_kb*1024) for name, usage_kb in buckets])
        else:
            status = status + " ".join(["{}={}KB".format(name, usage_kb) for name, usage_kb in buckets])

    print(status)
    return STATUS_OK


if __name__ == "__main__":
    sys.exit(main())