nagios-nrpe: Add proper plugin to monitor glusterfs health

Mathieu Trossevin 2023-09-05 15:21:08 +02:00
parent 7ad296e74f
commit cfca604670
Signed by: mtrossevin
GPG key ID: D1DBB7EA828374E9
3 changed files with 373 additions and 1 deletion


@@ -34,6 +34,7 @@ The **patch** part changes is incremented if multiple releases happen the same m
* fail2ban: add variable fail2ban_sshd_port to configure sshd port
* kvm-host: release 23.08 for migrate-vm.sh
* nagios-nrpe: add an NRPE check-local command with completion.
* nagios-nrpe: add a proper monitoring plugin for glusterfs (on servers, not for clients)
* php: add new variable to disable overriding settings of php-fpm default pool (www)
* policy_pam: New role allowing to manage password policy with pam_pwquality & pam_pwhistory
* userlogrotate: add a userlogpurge script disabled by default


@@ -23,7 +23,8 @@ nagios ALL = NOPASSWD: /sbin/megacli -LdInfo -Lall -aALL -NoLog
nagios ALL = NOPASSWD: /sbin/megacli -AdpBbuCmd -GetBbuStatus -aALL -NoLog
nagios ALL = NOPASSWD: /sbin/ssacli controller all show status
nagios ALL = NOPASSWD: /sbin/ssacli controller slot=0 logicaldrive all show
nagios ALL = NOPASSWD: /usr/local/lib/nagios/plugins/check_gluster.rb
nagios ALL = (clamav) NOPASSWD: /usr/bin/clamscan /tmp/safe.txt
%{{ evolinux_sudo_group }} ALL=(ALL:ALL) ALL
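
For context, the sudo rule added above is what lets the NRPE daemon run the new plugin as root. A matching NRPE command definition could look like the sketch below; the command name, the configuration file location and the exact invocation are assumptions for illustration, not part of this commit:

# hypothetical entry in an nrpe.d configuration file
command[check_gluster]=sudo /usr/local/lib/nagios/plugins/check_gluster.rb

The plugin's own options (--checks, --self-name, --state-file, --ttl) can be appended to adjust which daemons are verified and where the heal state is stored.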


@@ -0,0 +1,370 @@
#!/usr/bin/env ruby
# We use old ruby 1.8.7
# rubocop:disable Style/HashSyntax
# We match all exceptions except SystemExit to redefine exit status for nagios
# rubocop:disable Lint/RescueException
# Other disables
# rubocop:disable Metrics/AbcSize, Metrics/LineLength, Metrics/MethodLength, Metrics/ClassLength
# Nagios exit codes
module NagiosStatus
OK = 0
WARNING = 1
CRITICAL = 2
end
# Top level module
module CheckGluster
require 'optparse'
PARAMS = {
# gluster command
:gluster_command => '/usr/sbin/gluster',
:glusterd_pid_path => '/var/run/glusterd.pid',
# Daemons to check with gluster volume status
:check_running_daemons => {
'bitrot' => ['Bitrot Daemon', 'Scrubber Daemon'],
'heal' => ['Self-heal Daemon'],
'nfs' => ['NFS Server']
},
# Parameters to re-run failed command
:rerun => { :times => 10, :delay => 0.5 },
# persistent state file
:state_file => '/var/run/check_gluster_state.yaml',
:state_ttl => 1200,
:self_name => '127.0.0.1'
}.freeze
# Parse command line options
class OptionsParser
attr_reader :checks, :state_file, :state_ttl, :self_name
def initialize
@checks = PARAMS[:check_running_daemons].keys
@state_file = PARAMS[:state_file]
@state_ttl = PARAMS[:state_ttl]
@self_name = PARAMS[:self_name]
end
def parse!
option_parser = OptionParser.new
option_parser.on(
'-c',
'--checks ' + @checks.join(','),
Array,
'Checks to run. Default is ' + @checks.join(',')
) do |list|
list.each do |check|
raise OptionParser::InvalidOption, check.to_s unless @checks.include? check
end
@checks = list
end
option_parser.on(
'-s',
'--state-file /path/to/file',
String,
'State storage file. Default is ' + @state_file
) do |x|
@state_file = x.to_s
end
option_parser.on(
'-n',
'--self-name address',
String,
'IP of self as far as glusterd knows. Default is ' + @self_name
) do |x|
@self_name = x.to_s
end
option_parser.on(
'-t',
'--ttl seconds',
Integer,
'State storage TTL. Default is ' + @state_ttl.to_s
) do |x|
@state_ttl = x.to_i
end
begin
option_parser.parse!
@checks
rescue OptionParser::InvalidOption, OptionParser::MissingArgument, OptionParser::InvalidArgument => ex
puts "Error: #{ex.message}"
puts option_parser.help
exit(NagiosStatus::CRITICAL)
end
end
end
# Actual check class
class Check
begin
require 'open3'
require 'rubygems'
$VERBOSE = nil
require 'crack'
$VERBOSE = true
require 'socket'
require 'yaml'
rescue Exception => ex
puts ex.message
exit(NagiosStatus::CRITICAL)
end
def initialize(*var)
var = var.shift
# Fall back to PARAMS defaults when an option is not provided
@state_file = var.nil? || var[:state_file].nil? ? PARAMS[:state_file] : var[:state_file]
@state_ttl = var.nil? || var[:state_ttl].nil? ? PARAMS[:state_ttl] : var[:state_ttl]
end
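# Check that glusterd is alive: read its PID file and send a harmless SIGCHLD; ESRCH means the process is gone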
def check_glusterd
pid = File.open(PARAMS[:glusterd_pid_path], &:readline).to_i
Process.kill('CHLD', pid)
rescue Errno::ESRCH
raise 'Glusterd is not running'
end
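# Flatten the daemon names associated with the selected checks into a single list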
def get_services_from_checks(checks)
services = []
checks.each do |check|
PARAMS[:check_running_daemons][check].each do |service|
services.concat(any_to_array(service))
end
end
services
end
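# Run the gluster CLI; with xml enabled, parse the --xml output via Crack and fail on a non-zero opRet/opErrno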
def gluster(command, xml = true)
data = { 'stdout' => '', 'stderr' => '', 'status' => 1 }
command = '--xml ' + command if xml
Open3.popen3(PARAMS[:gluster_command] + ' ' + command) do |stdin, stdout, stderr, wait_thr|
stdin.close
data['stdout'] = stdout.read
data['stderr'] = stderr.read
data['status'] = wait_thr.value
end
raise data['status'].to_s + ' : ' + data['stdout'] unless data['status'].success?
return data['stdout'] unless xml
data = Crack::XML.parse(data['stdout'])
if data['cliOutput']['opRet'].to_i != 0 || data['cliOutput']['opErrno'].to_i != 0
raise "Error on #{command} : #{data['cliOutput']['opErrstr']}"
end
data['cliOutput']
end
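# 'gluster peer status': every peer must be connected and in state 3 (Peer in Cluster)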
def peer_status
peers = gluster('peer status')['peerStatus']['peer']
peers = any_to_array(peers)
peers.each do |peer|
raise "#{peer['hostname']} is disconnected" unless peer['connected'].to_i == 1
raise "#{peer['hostname']} has wrong state #{peer['state']}" unless peer['state'].to_i == 3
end
end
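# 'gluster pool list': same connectivity check, but the local node is listed as 'localhost' and its state is not enforced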
def pool_list
peers = gluster('pool list')['peerStatus']['peer']
peers = any_to_array(peers)
peers.each do |peer|
raise "#{peer['hostname']} is disconnected" unless peer['connected'].to_i == 1
raise "#{peer['hostname']} has wrong state #{peer['state']}" if peer['hostname'] != 'localhost' && peer['state'].to_i != 3
end
end
def volume_list
volumes = gluster('volume list')['volList']['volume']
any_to_array(volumes)
end
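# 'gluster volume info': ensure the volume is started and return its bricks as [host, path] pairs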
def volume_info(volume)
volume = gluster("volume info #{volume}")['volInfo']['volumes']['volume']
raise "volume #{volume['name']} is stopped" unless volume['status'].to_i == 1
# volume info is not well-formatted XML
volume['bricks']['brick'].map { |s| s.gsub(/<name>.*/, '').split(':') }
end
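# 'gluster volume status': every brick and every expected daemon must be reported as running on each node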
def volume_status(volume, bricks, checks, self_name)
nodes = gluster("volume status #{volume}")['volStatus']['volumes']['volume']['node']
nodes = any_to_array(nodes)
# Check that all bricks are running
bricks.each do |brick|
raise "volume #{brick[0]}:#{brick[1]} is not running" unless check_running(nodes, brick[0], brick[1])
# Check that processes are running
get_services_from_checks(checks).each do |daemon|
brick_to_check = brick[0]
if brick[0] == self_name
brick_to_check = 'localhost'
end
raise "Daemon #{brick[0]}:\"#{daemon}\" is not started" unless check_running(nodes, daemon, brick_to_check)
end
end
end
def check_running(nodes, hostname, path)
nodes.each do |node|
return true if node['hostname'] == hostname && node['path'] == path && node['status'].to_i == 1
end
false
end
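# 'gluster volume heal info': unhealed entries older than the TTL (tracked in the state file) are reported as split-brain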
def volume_heal_info(volume)
info = gluster("volume heal #{volume} info", false).split("\n")
info = parse_heal(info)
if @state_ttl > 0
info = merge_heal(load_state(@state_file), info)
store_state(@state_file, info)
end
split_entries = []
time = Time.now.to_i
info.each do |brick, data|
raise "Brick #{brick} is disconnected" unless data['status']
data['content'].each do |line, saved_time|
if @state_ttl == 0
split_entries.push(brick + line)
elsif time - saved_time >= @state_ttl
split_entries.push(brick + line + '=' + (time - saved_time).to_s + 's')
end
end
end
raise 'Files are split-brained : ' + split_entries.join(' ') unless split_entries.empty?
end
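# Parse the plain-text heal info output into a hash of bricks with status, entry count and entry list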
def parse_heal(info)
bricks = {}
brick = ''
time = Time.now.to_i
info.each do |line|
if line =~ /^Brick /
brick = line.gsub(/^Brick /, '')
bricks[brick] = { 'content' => {} }
elsif line =~ /^Status: Connected/
bricks[brick]['status'] = true
elsif line =~ /^Status.*/
bricks[brick]['status'] = false
elsif line =~ /^Number of entries: /
bricks[brick]['entries'] = line.gsub(/^Number of entries: /, '').to_i
else
line.gsub!(/\s+$/, '')
bricks[brick]['content'][line] = time if line != ''
end
end
bricks
end
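# Keep the first-seen timestamp of heal entries that are still present in the new run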
def merge_heal(old, new)
old.each do |brick, data|
data['content'].each do |line, time|
new[brick]['content'][line] = time if new.key?(brick) && new[brick]['content'].key?(line)
end
end
new
end
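# Persist the heal state to disk as YAML, under an exclusive lock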
def store_state(file, data)
fh = File.open(file, File::WRONLY | File::CREAT | File::TRUNC, 0600)
fh.flock(File::LOCK_EX)
fh.write(data.to_yaml)
fh.close
end
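# Load the previous heal state; a missing file yields an empty hash, malformed YAML raises an error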
def load_state(file)
return {} unless File.exist?(file)
fh = File.open(file, File::RDONLY)
fh.flock(File::LOCK_EX)
data = fh.read
fh.close
begin
data = YAML.load(data)
raise LoadError if data.class.to_s != 'Hash'
rescue Psych::SyntaxError, LoadError
raise 'file ' + file.to_s + ' has incorrect format'
end
data
end
# rubocop:disable Metrics/CyclomaticComplexity
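# 'gluster volume bitrot ... scrub status': BitRot must be enabled and report zero errors on every node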
def volume_bitrot_scrub_status(volume)
info = []
i = 0
# Retry 'bitrot scrub status' up to PARAMS[:rerun][:times] times: its output sometimes contains invalid UTF-8
while info.empty?
i += 1
begin
info = gluster("volume bitrot #{volume} scrub status", false).split("\n")
rescue ArgumentError => ex
raise ex if ex.message != 'invalid byte sequence in UTF-8' || i > PARAMS[:rerun][:times]
sleep PARAMS[:rerun][:delay]
end
end
status = parse_bitrot_status(info)
raise 'BitRot is not enabled' unless status['state']
status['nodes'].each do |node, _v|
raise "BitRot error on #{node}" unless status['nodes'][node]['errors'] == 0
end
end
# rubocop:enable Metrics/CyclomaticComplexity
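# Parse the scrub status output, replacing 'localhost' with the local hostname to get per-node error counts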
def parse_bitrot_status(info)
info.map! { |s| s.gsub(/localhost/, Socket.gethostname) }
status = { 'state' => false, 'nodes' => {} }
node = ''
info.each do |line|
if line =~ /^State of scrub: Active/
status['state'] = true
elsif line =~ /^Node: /
node = line.gsub(/^Node: /, '')
status['nodes'][node] = {}
elsif line =~ /^Error count: /
status['nodes'][node]['errors'] = line.gsub(/^Error count: /, '').to_i
end
end
status
end
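# Crack returns a Hash for a single XML element and an Array for several: normalize to an Array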
def any_to_array(any)
return [any] if any.class.to_s != 'Array'
any
end
private :get_services_from_checks, :gluster, :any_to_array
private :check_running, :parse_heal, :parse_bitrot_status
private :merge_heal, :store_state, :load_state
end
end
begin
# Parse parameters
option_parser = CheckGluster::OptionsParser.new
option_parser.parse!
checks = option_parser.checks
self_name = option_parser.self_name
# Run gluster checks
gluster_check = CheckGluster::Check.new(:state_file => option_parser.state_file, :state_ttl => option_parser.state_ttl)
gluster_check.check_glusterd
gluster_check.peer_status
gluster_check.pool_list
gluster_check.volume_list.each do |volume|
bricks = gluster_check.volume_info(volume)
gluster_check.volume_status(volume, bricks, checks, self_name)
gluster_check.volume_heal_info(volume) if checks.include? 'heal'
gluster_check.volume_bitrot_scrub_status(volume) if checks.include? 'bitrot'
end
puts 'OK: Gluster cluster is healthy'
exit(NagiosStatus::OK)
# Normal exit
rescue SystemExit => ex
raise ex
# Anything else goes to nagios error with critical status
rescue Exception => ex
puts "Error: #{ex.message}"
exit(NagiosStatus::CRITICAL)
end
# rubocop:enable all