nagios-nrpe: Add proper plugin to monitor glusterfs health
All checks were successful
Ansible Lint |Total|New|Outstanding|Fixed|Trend
|:-:|:-:|:-:|:-:|:-:
|2622|1|2621|3|:+1:
Reference build: <a href="https://jenkins.evolix.org/job/gitea/job/ansible-roles/job/unstable/339//ansiblelint">Evolix » ansible-roles » unstable #339</a>
gitea/ansible-roles/pipeline/head This commit looks good
All checks were successful
Ansible Lint |Total|New|Outstanding|Fixed|Trend
|:-:|:-:|:-:|:-:|:-:
|2622|1|2621|3|:+1:
Reference build: <a href="https://jenkins.evolix.org/job/gitea/job/ansible-roles/job/unstable/339//ansiblelint">Evolix » ansible-roles » unstable #339</a>
gitea/ansible-roles/pipeline/head This commit looks good
This commit is contained in:
parent
7ad296e74f
commit
cfca604670
3 changed files with 373 additions and 1 deletions
|
@ -34,6 +34,7 @@ The **patch** part changes is incremented if multiple releases happen the same m
|
|||
* fail2ban: add variable fail2ban_sshd_port to configure sshd port
|
||||
* kvm-host: release 23.08 for migrate-vm.sh
|
||||
* nagios-nrpe: add a NRPE check-local command with completion.
|
||||
* nagios-nrpe: add a proper monitoring plugin for glusterfs (on servers, not for clients)
|
||||
* php: add new variable to disable overriding settings of php-fpm default pool (www)
|
||||
* policy_pam: New role allowing to manage password policy with pam_pwquality & pam_pwhistory
|
||||
* userlogrotate: add a userlogpurge script disabled by default
|
||||
|
|
|
@ -23,7 +23,8 @@ nagios ALL = NOPASSWD: /sbin/megacli -LdInfo -Lall -aALL -NoLog
|
|||
nagios ALL = NOPASSWD: /sbin/megacli -AdpBbuCmd -GetBbuStatus -aALL -NoLog
|
||||
nagios ALL = NOPASSWD: /sbin/ssacli controller all show status
|
||||
nagios ALL = NOPASSWD: /sbin/ssacli controller slot=0 logicaldrive all show
|
||||
|
||||
nagios ALL = NOPASSWD: /usr/local/lib/nagios/plugins/check_gluster.rb
|
||||
|
||||
nagios ALL = (clamav) NOPASSWD: /usr/bin/clamscan /tmp/safe.txt
|
||||
|
||||
%{{ evolinux_sudo_group }} ALL=(ALL:ALL) ALL
|
||||
|
|
370
nagios-nrpe/files/plugins/check_gluster.rb
Executable file
370
nagios-nrpe/files/plugins/check_gluster.rb
Executable file
|
@ -0,0 +1,370 @@
|
|||
#!/usr/bin/env ruby
|
||||
|
||||
# We use old ruby 1.8.7
|
||||
# rubocop:disable Style/HashSyntax
|
||||
|
||||
# We match all exceptions except SystemExit to redefine exit status for nagios
|
||||
# rubocop:disable Lint/RescueException
|
||||
|
||||
# Other disables
|
||||
# rubocop:disable Metrics/AbcSize, Metrics/LineLength, Metrics/MethodLength, Metrics/ClassLength
|
||||
|
||||
# Nagios exit codes, used as the process exit status of this plugin.
module NagiosStatus
  OK = 0
  WARNING = 1
  CRITICAL = 2
  # Fourth standard plugin status; not used by this script's own code paths
  # but provided so the set of codes is complete.
  UNKNOWN = 3
end
|
||||
|
||||
# Top level module
|
||||
module CheckGluster
|
||||
require 'optparse'
|
||||
|
||||
# Built-in defaults for the check. The nested hashes and arrays are frozen
# as well, so no code path can alter the defaults by accident.
PARAMS = {
  # gluster command
  :gluster_command => '/usr/sbin/gluster',
  :glusterd_pid_path => '/var/run/glusterd.pid',
  # Daemons to check with gluster volume status, keyed by check name
  :check_running_daemons => {
    'bitrot' => ['Bitrot Daemon', 'Scrubber Daemon'].freeze,
    'heal' => ['Self-heal Daemon'].freeze,
    'nfs' => ['NFS Server'].freeze
  }.freeze,
  # Parameters to re-run failed command
  :rerun => { :times => 10, :delay => 0.5 }.freeze,
  # persistent state file
  :state_file => '/var/run/check_gluster_state.yaml',
  :state_ttl => 1200,
  :self_name => '127.0.0.1'
}.freeze
|
||||
|
||||
# Parse command line options. Every option falls back to the matching
# PARAMS default when it is not given.
class OptionsParser
  attr_reader :checks, :state_file, :state_ttl, :self_name

  def initialize
    # Full list of valid check names; kept separate from @checks so that
    # validation still works after @checks has been narrowed by -c.
    @known_checks = PARAMS[:check_running_daemons].keys
    @checks = @known_checks.dup
    @state_file = PARAMS[:state_file]
    @state_ttl = PARAMS[:state_ttl]
    @self_name = PARAMS[:self_name]
  end

  # Parses ARGV (consuming the recognised options) and returns the list of
  # selected checks. On any option parsing error it prints the error and the
  # usage text, then exits with NagiosStatus::CRITICAL.
  def parse!
    option_parser = OptionParser.new
    all_checks = @known_checks.join(',')

    option_parser.on(
      '-c',
      '--checks ' + all_checks,
      Array,
      'Checks to run. Default is ' + all_checks
    ) do |list|
      list.each do |check|
        raise OptionParser::InvalidOption, check.to_s unless @known_checks.include?(check)
      end
      @checks = list
    end

    option_parser.on(
      '-s',
      '--state-file /path/to/file',
      String,
      'State storage file. Default is ' + @state_file
    ) do |x|
      @state_file = x.to_s
    end

    option_parser.on(
      '-n',
      '--self-name address',
      String,
      'IP of self as far as glusterd know. Default is ' + @self_name
    ) do |x|
      @self_name = x.to_s
    end

    option_parser.on(
      '-t',
      '--ttl seconds',
      Integer,
      'State storage TTL. Default is ' + @state_ttl.to_s
    ) do |x|
      @state_ttl = x.to_i
    end

    begin
      option_parser.parse!
      @checks
    rescue OptionParser::ParseError => ex
      # ParseError is the common parent of InvalidOption, MissingArgument,
      # InvalidArgument and the other option parsing failures.
      puts "Error: #{ex.message}"
      puts option_parser.help
      exit(NagiosStatus::CRITICAL)
    end
  end
end
|
||||
|
||||
# Actual check class
|
||||
class Check
|
||||
begin
|
||||
require 'open3'
|
||||
require 'rubygems'
|
||||
$VERBOSE = nil
|
||||
require 'crack'
|
||||
$VERBOSE = true
|
||||
require 'socket'
|
||||
require 'yaml'
|
||||
rescue Exception => ex
|
||||
puts ex.message
|
||||
exit(NagiosStatus::CRITICAL)
|
||||
end
|
||||
|
||||
# Builds a checker.
#
# var - optional single Hash with :state_file and/or :state_ttl. A missing
#       argument or a missing/nil key falls back to the PARAMS default.
def initialize(*var)
  # The original test `var.nil? && var[:key].nil?` dereferenced nil when no
  # argument was passed and never applied the default for an absent key.
  options = var.shift || {}
  @state_file = options[:state_file].nil? ? PARAMS[:state_file] : options[:state_file]
  @state_ttl = options[:state_ttl].nil? ? PARAMS[:state_ttl] : options[:state_ttl]
end
|
||||
|
||||
# Verifies that the process recorded in the glusterd pid file exists.
#
# Raises RuntimeError 'Glusterd is not running' when the pid file is missing
# or empty, does not hold a positive pid, or names a process that is gone.
def check_glusterd
  pid = File.open(PARAMS[:glusterd_pid_path], &:readline).to_i
  # to_i turns unparsable content into 0, and signalling pid 0 targets the
  # caller's own process group, which would always "succeed".
  raise 'Glusterd is not running' unless pid > 0
  # Signal 0 delivers nothing; it only performs the existence check.
  Process.kill(0, pid)
rescue Errno::ENOENT, Errno::ESRCH, EOFError
  raise 'Glusterd is not running'
end
|
||||
|
||||
# Expands check names into the flat list of daemon names configured for
# them in PARAMS[:check_running_daemons].
#
# checks - Array of check names (keys of PARAMS[:check_running_daemons]).
#
# Returns an Array of daemon names, in the order of the given checks.
def get_services_from_checks(checks)
  configured = PARAMS[:check_running_daemons]
  checks.flat_map do |name|
    configured[name].flat_map { |service| any_to_array(service) }
  end
end
|
||||
|
||||
# Runs the gluster CLI and returns its output.
#
# command - String of gluster arguments, e.g. 'volume list'.
# xml     - when true (default) '--xml' is prepended and the parsed content
#           of the <cliOutput> element is returned; when false the raw
#           stdout String is returned.
#
# Raises RuntimeError when the command exits unsuccessfully (the message
# carries both stdout and stderr), when the XML has no cliOutput element,
# or when opRet/opErrno report an error.
def gluster(command, xml = true)
  command = '--xml ' + command if xml
  # capture3 drains stdout and stderr concurrently, so a chatty stderr cannot
  # block the child while we are still reading stdout.
  stdout, stderr, status = Open3.capture3(PARAMS[:gluster_command] + ' ' + command)
  unless status.success?
    details = [stdout, stderr].map { |text| text.to_s.strip }.reject { |text| text.empty? }
    raise status.to_s + ' : ' + details.join(' / ')
  end
  return stdout unless xml
  parsed = Crack::XML.parse(stdout)
  cli_output = parsed && parsed['cliOutput']
  raise "Error on #{command} : unexpected output" if cli_output.nil?
  if cli_output['opRet'].to_i != 0 || cli_output['opErrno'].to_i != 0
    raise "Error on #{command} : #{cli_output['opErrstr']}"
  end
  cli_output
end
|
||||
|
||||
# Checks every peer reported by `gluster peer status`.
#
# Raises RuntimeError when a peer is not connected or is not in state 3
# (the only state code this check accepts).
# Returns the Array of peer entries; empty when no peer is reported.
def peer_status
  status = gluster('peer status')['peerStatus']
  # No peerStatus content, or no peer element, means there is nothing to
  # check rather than a nil to dereference.
  peers = status.nil? ? [] : any_to_array(status['peer']).compact
  peers.each do |peer|
    raise "#{peer['hostname']} is disconnected" unless peer['connected'].to_i == 1
    raise "#{peer['hostname']} has wrong state #{peer['state']}" unless peer['state'].to_i == 3
  end
end
|
||||
|
||||
# Checks every member reported by `gluster pool list`.
#
# The state code is not checked for the entry whose hostname is
# 'localhost'; every other member must be in state 3.
#
# Raises RuntimeError when a member is not connected or has another state.
# Returns the Array of member entries; empty when none is reported.
def pool_list
  status = gluster('pool list')['peerStatus']
  # Same nil handling as peer_status: an absent list is an empty list.
  peers = status.nil? ? [] : any_to_array(status['peer']).compact
  peers.each do |peer|
    raise "#{peer['hostname']} is disconnected" unless peer['connected'].to_i == 1
    raise "#{peer['hostname']} has wrong state #{peer['state']}" if peer['hostname'] != 'localhost' && peer['state'].to_i != 3
  end
end
|
||||
|
||||
# Lists the volumes reported by `gluster volume list`.
#
# Returns an Array of volume names; empty when no volume is reported
# (previously a single nil entry was returned in that case).
def volume_list
  listing = gluster('volume list')['volList']
  return [] if listing.nil?
  any_to_array(listing['volume']).compact
end
|
||||
|
||||
# Fetches `gluster volume info` for one volume and lists its bricks.
#
# volume - name of the volume.
#
# Raises RuntimeError when the volume status is not 1.
# Returns an Array with one entry per brick: the brick text with anything
# from '<name>' onwards removed, split on ':'.
def volume_info(volume)
  info = gluster("volume info #{volume}")['volInfo']['volumes']['volume']
  raise "volume #{info['name']} is stopped" unless info['status'].to_i == 1
  # volume info is not well formatted XML, hence the '<name>' clean-up.
  # A volume with a single brick yields a scalar instead of a list, so it
  # is normalised to an Array first.
  any_to_array(info['bricks']['brick']).map { |brick| brick.gsub(/<name>.*/, '').split(':') }
end
|
||||
|
||||
# Checks, from `gluster volume status`, that every brick of a volume and
# the daemons selected by the checks are running.
#
# volume    - name of the volume.
# bricks    - Array of [host, path] pairs as returned by volume_info.
# checks    - Array of check names, expanded with get_services_from_checks.
# self_name - host name of this node as it appears in the brick list; its
#             daemons are looked up under 'localhost' instead.
#
# Raises RuntimeError naming the first brick or daemon that is not running.
def volume_status(volume, bricks, checks, self_name)
  nodes = any_to_array(gluster("volume status #{volume}")['volStatus']['volumes']['volume']['node'])
  # The daemon list does not depend on the brick, so build it once.
  daemons = get_services_from_checks(checks)
  bricks.each do |brick|
    host = brick[0]
    path = brick[1]
    # Check that the brick is running
    raise "Brick #{host}:#{path} is not running" unless check_running(nodes, host, path)
    # Daemon entries carry the daemon name where bricks carry the host name,
    # and the host where bricks carry the path.
    daemon_host = host == self_name ? 'localhost' : host
    daemons.each do |daemon|
      raise "Daemon #{host}:\"#{daemon}\" is not started" unless check_running(nodes, daemon, daemon_host)
    end
  end
end
|
||||
|
||||
# Tells whether one of the status entries matches the given hostname and
# path and has status 1.
#
# nodes    - Array of Hashes with 'hostname', 'path' and 'status' keys.
# hostname - value expected under 'hostname'.
# path     - value expected under 'path'.
#
# Returns true or false.
def check_running(nodes, hostname, path)
  nodes.any? do |entry|
    entry['hostname'] == hostname && entry['path'] == path && entry['status'].to_i == 1
  end
end
|
||||
|
||||
# Reports entries listed by `gluster volume heal <volume> info`.
#
# With @state_ttl > 0 the parsed listing is merged with the state stored in
# @state_file (and stored back), and only entries whose recorded time is at
# least @state_ttl seconds old are reported, suffixed with their age. With
# @state_ttl == 0 every listed entry is reported.
#
# Raises RuntimeError when a brick is not connected or when at least one
# entry is reported.
def volume_heal_info(volume)
  report = parse_heal(gluster("volume heal #{volume} info", false).split("\n"))
  if @state_ttl > 0
    report = merge_heal(load_state(@state_file), report)
    store_state(@state_file, report)
  end
  reported = []
  now = Time.now.to_i
  report.each do |brick, data|
    raise "Brick #{brick} is disconnected" unless data['status']
    data['content'].each do |line, saved_time|
      if @state_ttl == 0
        reported << brick + line
      else
        age = now - saved_time
        reported << brick + line + '=' + age.to_s + 's' if age >= @state_ttl
      end
    end
  end
  raise 'Files are split-brained : ' + reported.join(' ') unless reported.empty?
end
|
||||
|
||||
# Parses the text output of `gluster volume heal <volume> info`.
#
# info - Array of output lines; neither the Array nor its Strings are
#        modified.
#
# Returns a Hash keyed by brick name (the text after 'Brick '). Each value
# is a Hash with 'content' (entry line => time of this parse, as integer
# seconds), 'status' (true only for 'Status: Connected') and 'entries'
# (integer read from the 'Number of entries: ' line). Lines that appear
# before the first 'Brick ' header are ignored.
def parse_heal(info)
  bricks = {}
  current = nil
  now = Time.now.to_i
  info.each do |line|
    if line =~ /^Brick /
      current = { 'content' => {} }
      bricks[line.sub(/^Brick /, '')] = current
      next
    end
    # Any preamble printed before the first brick has nothing to attach to.
    next if current.nil?
    if line =~ /^Status: Connected/
      current['status'] = true
    elsif line =~ /^Status.*/
      current['status'] = false
    elsif line =~ /^Number of entries: /
      current['entries'] = line.sub(/^Number of entries: /, '').to_i
    else
      entry = line.sub(/\s+$/, '')
      current['content'][entry] = now if entry != ''
    end
  end
  bricks
end
|
||||
|
||||
# Carries recorded times over from a previous listing into a fresh one.
#
# For every brick/entry of `old` that is also present in `new`, the time in
# `new` is replaced by the one recorded in `old`. Entries only present in
# `new` keep their time; entries only present in `old` are dropped.
#
# old - Hash of brick => { 'content' => { entry => time } } (stored state).
# new - Hash of the same shape from the current run; updated in place.
#
# Returns `new`.
def merge_heal(old, new)
  old.each_pair do |brick, stored|
    stored['content'].each_pair do |entry, first_seen|
      next unless new.key?(brick)
      fresh = new[brick]['content']
      fresh[entry] = first_seen if fresh.key?(entry)
    end
  end
  new
end
|
||||
|
||||
# Writes the state to a file as YAML, under an exclusive lock.
#
# file - path of the state file; created with mode 0600 when missing.
# data - object serialised with to_yaml.
#
# Returns nil.
def store_state(file, data)
  # The file is opened without File::TRUNC and emptied only once the lock
  # is held, so a process holding the lock never sees it cut short. The
  # block form closes the handle (releasing the lock) even if the write
  # raises.
  File.open(file, File::WRONLY | File::CREAT, 0600) do |fh|
    fh.flock(File::LOCK_EX)
    fh.truncate(0)
    fh.write(data.to_yaml)
  end
  nil
end
|
||||
|
||||
# Reads the state previously written to a YAML file.
#
# file - path of the state file.
#
# Returns the stored Hash, or an empty Hash when the file does not exist or
# is empty.
# Raises RuntimeError 'file <path> has incorrect format' when the content
# cannot be parsed or is not a Hash.
def load_state(file)
  raw = begin
    # A shared lock is enough for reading; the block form closes the handle
    # (releasing the lock) even if the read raises.
    File.open(file, File::RDONLY) do |fh|
      fh.flock(File::LOCK_SH)
      fh.read
    end
  rescue Errno::ENOENT
    return {}
  end
  # An empty file carries no state; treating it as an error would keep the
  # check failing until the file is removed by hand.
  return {} if raw.strip.empty?
  state = begin
    # NOTE(review): YAML.load is kept as in the original. If this file could
    # ever be written by an untrusted user, switch to a restricted loader.
    YAML.load(raw)
  rescue StandardError
    nil
  end
  raise 'file ' + file.to_s + ' has incorrect format' unless state.is_a?(Hash)
  state
end
|
||||
|
||||
# rubocop:disable Metrics/CyclomaticComplexity
|
||||
# Checks `gluster volume bitrot <volume> scrub status`.
#
# The command is re-run, after PARAMS[:rerun][:delay] seconds, when it
# returns no output or fails with 'invalid byte sequence in UTF-8', for at
# most PARAMS[:rerun][:times] + 1 attempts in total.
#
# Raises the ArgumentError from the last attempt (or any other
# ArgumentError at once), RuntimeError when no output could be obtained,
# when the scrub state is not active, or when a node's error count is not 0.
# Returns the Hash of parsed nodes.
def volume_bitrot_scrub_status(volume)
  info = []
  attempts = 0
  max_attempts = PARAMS[:rerun][:times] + 1
  while info.empty?
    attempts += 1
    begin
      info = gluster("volume bitrot #{volume} scrub status", false).split("\n")
    rescue ArgumentError => ex
      raise ex if ex.message != 'invalid byte sequence in UTF-8' || attempts >= max_attempts
    end
    next unless info.empty?
    # Empty output used to loop forever: the bound was only checked when an
    # exception had been raised.
    raise "No output from bitrot scrub status on #{volume}" if attempts >= max_attempts
    sleep PARAMS[:rerun][:delay]
  end
  status = parse_bitrot_status(info)
  raise 'BitRot is not enabled' unless status['state']
  status['nodes'].each do |node, details|
    raise "BitRot error on #{node}" unless details['errors'] == 0
  end
end
|
||||
# rubocop:enable Metrics/CyclomaticComplexity
|
||||
|
||||
# Parses the text output of `gluster volume bitrot <volume> scrub status`.
#
# info - Array of output lines. It is rewritten in place: every
#        'localhost' is replaced by Socket.gethostname.
#
# Returns a Hash with 'state' (true when a 'State of scrub: Active' line is
# present) and 'nodes' (node name => { 'errors' => Integer }).
def parse_bitrot_status(info)
  info.map! { |text| text.gsub(/localhost/, Socket.gethostname) }
  report = { 'state' => false, 'nodes' => {} }
  current = ''
  info.each do |text|
    case text
    when /^State of scrub: Active/
      report['state'] = true
    when /^Node: /
      current = text.gsub(/^Node: /, '')
      report['nodes'][current] = {}
    when /^Error count: /
      report['nodes'][current]['errors'] = text.gsub(/^Error count: /, '').to_i
    end
  end
  report
end
|
||||
|
||||
# Normalises a value to an Array: an Array is returned as is, anything else
# (nil included) is wrapped in a one-element Array.
def any_to_array(any)
  any.instance_of?(Array) ? any : [any]
end
|
||||
|
||||
private :get_services_from_checks, :gluster, :any_to_array
|
||||
private :check_running, :parse_heal, :parse_bitrot_status
|
||||
private :merge_heal, :store_state, :load_state
|
||||
end
|
||||
end
|
||||
|
||||
# Script entry point: parse the options, run every check, and translate the
# outcome into a Nagios status line and exit code.
begin
  # Parse parameters
  cli = CheckGluster::OptionsParser.new
  cli.parse!
  checks = cli.checks
  self_name = cli.self_name

  # Run gluster checks
  checker = CheckGluster::Check.new(:state_file => cli.state_file, :state_ttl => cli.state_ttl)
  checker.check_glusterd
  checker.peer_status
  checker.pool_list
  checker.volume_list.each do |volume|
    bricks = checker.volume_info(volume)
    checker.volume_status(volume, bricks, checks, self_name)
    checker.volume_heal_info(volume) if checks.include? 'heal'
    checker.volume_bitrot_scrub_status(volume) if checks.include? 'bitrot'
  end

  puts 'OK: Gluster cluster is healthy'
  exit(NagiosStatus::OK)
# Normal exit: let the requested exit status through untouched
rescue SystemExit
  raise
# Anything else goes to nagios error with critical status
rescue Exception => ex
  puts "Error: #{ex.message}"
  exit(NagiosStatus::CRITICAL)
end
|
||||
|
||||
# rubocop:enable all
|
Loading…
Reference in a new issue