diff --git a/CHANGELOG.md b/CHANGELOG.md
index 31b66e69..3007e593 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -34,6 +34,7 @@ The **patch** part changes is incremented if multiple releases happen the same m
 * fail2ban: add variable fail2ban_sshd_port to configure sshd port
 * kvm-host: release 23.08 for migrate-vm.sh
 * nagios-nrpe: add a NRPE check-local command with completion.
+* nagios-nrpe: add a proper monitoring plugin for glusterfs (on servers, not for clients)
 * php: add new variable to disable oveeriding settings of php-fpm default pool (www)
 * policy_pam: New role allowing to manage password policy with pam_pwquality & pam_pwhistory
 * userlogrotate: add a userlogpurge script disabled by default
diff --git a/evolinux-users/templates/sudoers.j2 b/evolinux-users/templates/sudoers.j2
index 29a22da7..60b5b782 100644
--- a/evolinux-users/templates/sudoers.j2
+++ b/evolinux-users/templates/sudoers.j2
@@ -23,7 +23,8 @@ nagios ALL = NOPASSWD: /sbin/megacli -LdInfo -Lall -aALL -NoLog
 nagios ALL = NOPASSWD: /sbin/megacli -AdpBbuCmd -GetBbuStatus -aALL -NoLog
 nagios ALL = NOPASSWD: /sbin/ssacli controller all show status
 nagios ALL = NOPASSWD: /sbin/ssacli controller slot=0 logicaldrive all show
-
+nagios ALL = NOPASSWD: /usr/local/lib/nagios/plugins/check_gluster.rb
+
 nagios ALL = (clamav) NOPASSWD: /usr/bin/clamscan /tmp/safe.txt
 
 %{{ evolinux_sudo_group }} ALL=(ALL:ALL) ALL
diff --git a/nagios-nrpe/files/plugins/check_gluster.rb b/nagios-nrpe/files/plugins/check_gluster.rb
new file mode 100755
index 00000000..35cb44ef
--- /dev/null
+++ b/nagios-nrpe/files/plugins/check_gluster.rb
@@ -0,0 +1,424 @@
+#!/usr/bin/env ruby
+
+# We use old ruby 1.8.7
+# rubocop:disable Style/HashSyntax
+
+# We match all exceptions except SystemExit to redefine exit status for nagios
+# rubocop:disable Lint/RescueException
+
+# Other disables
+# rubocop:disable Metrics/AbcSize, Metrics/LineLength, Metrics/MethodLength, Metrics/ClassLength
+
+# Nagios exit codes
+module NagiosStatus
+  OK       = 0
+  WARNING  = 1
+  CRITICAL = 2
+end
+
+# Top level module
+module CheckGluster
+  require 'optparse'
+
+  PARAMS = {
+    # gluster command
+    :gluster_command => '/usr/sbin/gluster',
+    :glusterd_pid_path => '/var/run/glusterd.pid',
+    # Daemons to check with gluster volume status
+    :check_running_daemons => {
+      'bitrot' => ['Bitrot Daemon', 'Scrubber Daemon'],
+      'heal' => ['Self-heal Daemon'],
+      'nfs' => ['NFS Server']
+    },
+    # Parameters to re-run failed command
+    :rerun => { :times => 10, :delay => 0.5 },
+    # persistent state file
+    :state_file => '/var/run/check_gluster_state.yaml',
+    :state_ttl => 1200,
+    :self_name => '127.0.0.1'
+  }.freeze
+
+  # Parse command line options
+  class OptionsParser
+    attr_reader :checks, :state_file, :state_ttl, :self_name
+
+    # Start from the defaults declared in PARAMS
+    def initialize
+      @checks = PARAMS[:check_running_daemons].keys
+      @state_file = PARAMS[:state_file]
+      @state_ttl = PARAMS[:state_ttl]
+      @self_name = PARAMS[:self_name]
+    end
+
+    # Parse ARGV and update the readers accordingly.
+    # On a usage error, print the help and exit with CRITICAL.
+    # Returns the list of checks to run.
+    def parse!
+      # Snapshot of the valid names: @checks is overwritten by --checks
+      available = @checks.dup
+      option_parser = OptionParser.new
+
+      option_parser.on(
+        '-c',
+        '--checks ' + available.join(','),
+        Array,
+        'Checks to run. Default is ' + available.join(',')
+      ) do |list|
+        list.each do |check|
+          raise OptionParser::InvalidOption, check.to_s unless available.include? check
+        end
+        @checks = list
+      end
+
+      option_parser.on(
+        '-s',
+        '--state-file /path/to/file',
+        String,
+        'State storage file. Default is ' + @state_file
+      ) do |x|
+        @state_file = x.to_s
+      end
+
+      option_parser.on(
+        '-n',
+        '--self-name address',
+        String,
+        'IP of self as far as glusterd know. Default is ' + @self_name
+      ) do |x|
+        @self_name = x.to_s
+      end
+
+      option_parser.on(
+        '-t',
+        '--ttl seconds',
+        Integer,
+        'State storage TTL. Default is ' + @state_ttl.to_s
+      ) do |x|
+        @state_ttl = x.to_i
+      end
+
+      begin
+        option_parser.parse!
+        @checks
+      rescue OptionParser::InvalidOption, OptionParser::MissingArgument, OptionParser::InvalidArgument => ex
+        puts "Error: #{ex.message}"
+        puts option_parser.help
+        exit(NagiosStatus::CRITICAL)
+      end
+    end
+  end
+
+  # Actual check class
+  class Check
+    begin
+      require 'open3'
+      require 'rubygems'
+      # Warnings are silenced only while crack is loaded; the previous
+      # verbosity is restored afterwards instead of forcing warnings on
+      verbosity = $VERBOSE
+      $VERBOSE = nil
+      begin
+        require 'crack'
+      ensure
+        $VERBOSE = verbosity
+      end
+      require 'socket'
+      require 'yaml'
+    rescue LoadError => ex
+      puts ex.message
+      exit(NagiosStatus::CRITICAL)
+    end
+
+    # var is an optional Hash; :state_file and :state_ttl default to PARAMS
+    # when the Hash or the key is missing
+    def initialize(*var)
+      options = var.shift || {}
+      @state_file = options[:state_file].nil? ? PARAMS[:state_file] : options[:state_file]
+      @state_ttl = options[:state_ttl].nil? ? PARAMS[:state_ttl] : options[:state_ttl]
+    end
+
+    # Raise unless the pid stored in the glusterd pid file is a live process
+    def check_glusterd
+      pid = File.open(PARAMS[:glusterd_pid_path], &:readline).to_i
+      # A garbage pid file yields 0, and kill(0, 0) would probe ourselves
+      raise 'Glusterd is not running' unless pid > 0
+      # Signal 0 delivers nothing, it only checks that the pid exists
+      Process.kill(0, pid)
+    rescue Errno::ENOENT, Errno::ESRCH, EOFError
+      raise 'Glusterd is not running'
+    end
+
+    # Flat list of the daemon names matching the requested checks
+    def get_services_from_checks(checks)
+      checks.map { |check| PARAMS[:check_running_daemons].fetch(check) }.flatten
+    end
+
+    # Run a gluster CLI command.
+    # Returns raw stdout when xml is false, else the parsed cliOutput Hash.
+    # Raises when the command fails or reports an error in its XML.
+    def gluster(command, xml = true)
+      command = '--xml ' + command if xml
+      # capture3 drains both pipes, unlike reading stdout then stderr in turn
+      stdout, stderr, status = Open3.capture3(PARAMS[:gluster_command] + ' ' + command)
+      unless status.success?
+        output = stderr.empty? ? stdout : stdout + ' ' + stderr
+        raise status.to_s + ' : ' + output
+      end
+      return stdout unless xml
+      cli_output = Crack::XML.parse(stdout)['cliOutput']
+      raise "Error on #{command} : unexpected output" unless cli_output.is_a?(Hash)
+      if cli_output['opRet'].to_i != 0 || cli_output['opErrno'].to_i != 0
+        raise "Error on #{command} : #{cli_output['opErrstr']}"
+      end
+      cli_output
+    end
+
+    # Raise if a peer of `gluster peer status` is disconnected or in a wrong state
+    def peer_status
+      check_peers(gluster('peer status'), false)
+    end
+
+    # Same as peer_status for `gluster pool list`; the state of the
+    # 'localhost' entry is not checked
+    def pool_list
+      check_peers(gluster('pool list'), true)
+    end
+
+    # Shared peer validation; state 3 is the only accepted peer state
+    def check_peers(cli_output, skip_local_state)
+      peers = any_to_array((cli_output['peerStatus'] || {})['peer'])
+      peers.each do |peer|
+        raise "#{peer['hostname']} is disconnected" unless peer['connected'].to_i == 1
+        next if skip_local_state && peer['hostname'] == 'localhost'
+        raise "#{peer['hostname']} has wrong state #{peer['state']}" unless peer['state'].to_i == 3
+      end
+    end
+
+    # Names of all volumes, always as an Array
+    def volume_list
+      any_to_array((gluster('volume list')['volList'] || {})['volume'])
+    end
+
+    # Raise if the volume is stopped.
+    # Returns its bricks as [host, path] pairs.
+    def volume_info(volume)
+      info = gluster("volume info #{volume}")['volInfo']['volumes']['volume']
+      raise "volume #{info['name']} is stopped" unless info['status'].to_i == 1
+      # volume info is not well formatted XML: keep only the leading
+      # "host:path" text of a brick and drop everything from the first '<' on.
+      # NOTE(review): assumes crack returns the brick as text followed by markup
+      any_to_array(info['bricks']['brick']).map { |brick| brick.to_s.sub(/<.*/m, '').strip.split(':') }
+    end
+
+    # Raise unless every brick and every daemon required by checks is running.
+    # self_name is the address under which this host is listed as a brick.
+    def volume_status(volume, bricks, checks, self_name)
+      nodes = gluster("volume status #{volume}")['volStatus']['volumes']['volume']['node']
+      nodes = any_to_array(nodes)
+      daemons = get_services_from_checks(checks)
+      bricks.each do |host, path|
+        # Check that all bricks are running
+        raise "volume #{host}:#{path} is not running" unless check_running(nodes, host, path)
+        # Daemons of this host are looked up under 'localhost'
+        daemon_host = host == self_name ? 'localhost' : host
+        # Check that processes are running
+        daemons.each do |daemon|
+          raise "Daemon #{host}:\"#{daemon}\" is not started" unless check_running(nodes, daemon, daemon_host)
+        end
+      end
+    end
+
+    # True when nodes has a started entry (status 1) for hostname and path
+    def check_running(nodes, hostname, path)
+      nodes.any? do |node|
+        node['hostname'] == hostname && node['path'] == path && node['status'].to_i == 1
+      end
+    end
+
+    # Raise if a brick is disconnected or has entries listed by heal info.
+    # With a positive TTL an entry is only reported once it has been seen
+    # for at least @state_ttl seconds (first-seen times live in the state file).
+    def volume_heal_info(volume)
+      info = parse_heal(gluster("volume heal #{volume} info", false).split("\n"))
+      if @state_ttl > 0
+        saved = load_state(@state_file)
+        info = merge_heal(saved, info)
+        # The state file is shared by all volumes: keep the other bricks
+        store_state(@state_file, saved.merge(info))
+      end
+      split_entries = []
+      now = Time.now.to_i
+      info.each do |brick, data|
+        raise "Brick #{brick} is disconnected" unless data['status']
+        data['content'].each do |line, saved_time|
+          if @state_ttl == 0
+            split_entries.push(brick + line)
+          elsif now - saved_time >= @state_ttl
+            split_entries.push(brick + line + '=' + (now - saved_time).to_s + 's')
+          end
+        end
+      end
+      raise 'Files are split-brained : ' + split_entries.join(' ') unless split_entries.empty?
+    end
+
+    # Turn the lines of `volume heal info` into
+    # { brick => { 'content' => { entry => time }, 'status' => bool, 'entries' => int } }
+    def parse_heal(info)
+      bricks = {}
+      brick = nil
+      time = Time.now.to_i
+      info.each do |line|
+        if line =~ /^Brick /
+          brick = line.gsub(/^Brick /, '')
+          bricks[brick] = { 'content' => {} }
+          next
+        end
+        # Lines printed before the first "Brick" header belong to no brick
+        next if brick.nil?
+        if line =~ /^Status: Connected/
+          bricks[brick]['status'] = true
+        elsif line =~ /^Status.*/
+          bricks[brick]['status'] = false
+        elsif line =~ /^Number of entries: /
+          bricks[brick]['entries'] = line.gsub(/^Number of entries: /, '').to_i
+        else
+          entry = line.gsub(/\s+$/, '')
+          bricks[brick]['content'][entry] = time unless entry.empty?
+        end
+      end
+      bricks
+    end
+
+    # Carry the saved first-seen time over to entries still present in new
+    def merge_heal(old, new)
+      old.each do |brick, data|
+        next unless new.key?(brick) && data.is_a?(Hash) && data['content'].is_a?(Hash)
+        data['content'].each do |line, time|
+          new[brick]['content'][line] = time if new[brick]['content'].key?(line)
+        end
+      end
+      new
+    end
+
+    # Write data as YAML to file under an exclusive lock
+    def store_state(file, data)
+      # No File::TRUNC: truncating before the lock is held could show an
+      # empty file to a concurrent reader
+      File.open(file, File::WRONLY | File::CREAT, 0600) do |fh|
+        fh.flock(File::LOCK_EX)
+        fh.truncate(0)
+        fh.write(data.to_yaml)
+      end
+    end
+
+    # Read the state Hash back; {} when the file is missing or empty.
+    # Raises when the content is not a YAML Hash.
+    def load_state(file)
+      return {} unless File.exist?(file)
+      raw = File.open(file, File::RDONLY) do |fh|
+        fh.flock(File::LOCK_EX)
+        fh.read
+      end
+      return {} if raw.strip.empty?
+      begin
+        # NOTE(review): YAML.load may build arbitrary objects; the file must
+        # stay writable by trusted users only (store_state creates it 0600)
+        state = YAML.load(raw)
+      rescue Psych::SyntaxError
+        state = nil
+      end
+      raise 'file ' + file.to_s + ' has incorrect format' unless state.is_a?(Hash)
+      state
+    end
+
+    # rubocop:disable Metrics/CyclomaticComplexity
+    # Raise if bitrot scrubbing is not active or a node reports scrub errors
+    def volume_bitrot_scrub_status(volume)
+      max_attempts = PARAMS[:rerun][:times]
+      info = []
+      attempt = 0
+      # Try to run bitrot scrub status not more than PARAMS[:rerun][:times] times
+      loop do
+        attempt += 1
+        begin
+          info = gluster("volume bitrot #{volume} scrub status", false).split("\n")
+        rescue ArgumentError => ex
+          raise ex if ex.message != 'invalid byte sequence in UTF-8' || attempt >= max_attempts
+        end
+        break unless info.empty?
+        # Empty output counts as a failed attempt too, so the loop always ends
+        raise "No bitrot scrub status for volume #{volume}" if attempt >= max_attempts
+        sleep PARAMS[:rerun][:delay]
+      end
+      status = parse_bitrot_status(info)
+      raise 'BitRot is not enabled' unless status['state']
+      status['nodes'].each do |node, data|
+        raise "BitRot error on #{node}" unless data['errors'] == 0
+      end
+    end
+    # rubocop:enable Metrics/CyclomaticComplexity
+
+    # Parse `bitrot scrub status` lines into
+    # { 'state' => bool, 'nodes' => { node => { 'errors' => int } } }
+    def parse_bitrot_status(info)
+      hostname = Socket.gethostname
+      status = { 'state' => false, 'nodes' => {} }
+      node = nil
+      info.each do |raw|
+        line = raw.gsub(/localhost/, hostname)
+        if line =~ /^State of scrub: Active/
+          status['state'] = true
+        elsif line =~ /^Node: /
+          node = line.gsub(/^Node: /, '')
+          status['nodes'][node] = {}
+        elsif line =~ /^Error count: / && !node.nil?
+          status['nodes'][node]['errors'] = line.gsub(/^Error count: /, '').to_i
+        end
+      end
+      status
+    end
+
+    # Wrap a single value in an Array; nil becomes [].
+    # Kernel#Array is not used because it would split a Hash into pairs.
+    def any_to_array(any)
+      return [] if any.nil?
+      any.is_a?(Array) ? any : [any]
+    end
+
+    private :get_services_from_checks, :gluster, :any_to_array
+    private :check_running, :parse_heal, :parse_bitrot_status
+    private :merge_heal, :store_state, :load_state, :check_peers
+  end
+end
+
+begin
+  # Parse parameters
+  option_parser = CheckGluster::OptionsParser.new
+  option_parser.parse!
+  checks = option_parser.checks
+  self_name = option_parser.self_name
+
+  # Run gluster checks
+  gluster_check = CheckGluster::Check.new(:state_file => option_parser.state_file, :state_ttl => option_parser.state_ttl)
+  gluster_check.check_glusterd
+  gluster_check.peer_status
+  gluster_check.pool_list
+  gluster_check.volume_list.each do |volume|
+    bricks = gluster_check.volume_info(volume)
+    gluster_check.volume_status(volume, bricks, checks, self_name)
+    gluster_check.volume_heal_info(volume) if checks.include? 'heal'
+    gluster_check.volume_bitrot_scrub_status(volume) if checks.include? 'bitrot'
+  end
+
+  puts 'OK: Gluster cluster is healthy'
+  exit(NagiosStatus::OK)
+# Normal exit
+rescue SystemExit
+  raise
+# Anything else goes to nagios error with critical status
+rescue Exception => ex
+  puts "Error: #{ex.message}"
+  exit(NagiosStatus::CRITICAL)
+end
+
+# rubocop:enable all