908669f073
The checks `cluster_config_has_changed` and `node_tl_has_changed` use a state file to store the previous value of the config hash and the timeline. Previously the check would fail if something changed, but the new value would be saved directly. This behaviour has changed: the new value is now saved only if `--save` is passed to the check. This mimics the way [check_pgactivity] manages this kind of check. [check_pgactivity]: https://github.com/OPMDG/check_pgactivity
572 lines
16 KiB
Python
572 lines
16 KiB
Python
import re
|
|
from configparser import ConfigParser
|
|
|
|
import click
|
|
import nagiosplugin
|
|
from typing import List
|
|
|
|
from . import __version__
|
|
from .cluster import (
|
|
ClusterConfigHasChanged,
|
|
ClusterConfigHasChangedSummary,
|
|
ClusterHasLeader,
|
|
ClusterHasLeaderSummary,
|
|
ClusterHasReplica,
|
|
ClusterNodeCount,
|
|
ClusterIsInMaintenance,
|
|
)
|
|
from .node import (
|
|
NodeIsAlive,
|
|
NodeIsAliveSummary,
|
|
NodeIsPendingRestart,
|
|
NodeIsPendingRestartSummary,
|
|
NodeIsPrimary,
|
|
NodeIsPrimarySummary,
|
|
NodeIsReplica,
|
|
NodeIsReplicaSummary,
|
|
NodePatroniVersion,
|
|
NodePatroniVersionSummary,
|
|
NodeTLHasChanged,
|
|
NodeTLHasChangedSummary,
|
|
)
|
|
from .types import ConnectionInfo, Parameters
|
|
from .convert import size_to_byte
|
|
|
|
|
|
def print_version(ctx: click.Context, param: str, value: str) -> None:
    """Eager callback of the ``--version`` flag.

    Prints the package version and terminates the click context. Nothing
    happens when the flag is unset or when click is in resilient-parsing
    mode.
    """
    version_requested = bool(value) and not ctx.resilient_parsing
    if version_requested:
        click.echo(f"Version {__version__}")
        ctx.exit()
|
|
|
|
|
|
# Default INI file that the `--config` option reads option defaults from.
DEFAULT_CFG = "config.ini"
|
|
|
|
|
|
def configure(ctx: click.Context, param: str, filename: str) -> None:
    """Load option defaults from an INI file into ``ctx.default_map``.

    Only sections named ``options`` or ``options.<command>[.<sub>...]`` are
    considered; the dotted suffix selects the nested default map of the
    matching (sub)command. An ``endpoints`` value is split on commas into a
    list of addresses.

    stolen from https://jwodder.github.io/kbits/posts/click-config/
    """
    # FIXME should use click-configfile / click-config-file ?
    parser = ConfigParser()
    parser.read(filename)

    option_defaults: dict = {}
    ctx.default_map = option_defaults

    for section_name in parser.sections():
        prefix, *subcommands = section_name.split(".")
        if prefix != "options":
            continue

        # Walk (and create on demand) the nested map of the target command.
        target = option_defaults
        for command in subcommands:
            target = target.setdefault(command, {})
        target.update(parser[section_name])

        # endpoints is an array of addresses separated by ,
        endpoints = target.get("endpoints")
        if isinstance(endpoints, str):
            target["endpoints"] = re.split(r"\s*,\s*", endpoints)
|
|
|
|
|
|
@click.group()
@click.option(
    "--config",
    callback=configure,
    default=DEFAULT_CFG,
    type=click.Path(dir_okay=False),
    is_eager=True,
    expose_value=False,
    show_default=True,
    help="Read option defaults from the specified INI file",
)
@click.option(
    "-e",
    "--endpoints",
    "endpoints",
    multiple=True,
    type=str,
    default=["http://127.0.0.1:8008"],
    show_default=True,
    help="API endpoint. Can be specified multiple times.",
)
@click.option(
    "--cert_file", "cert_file", type=str, help="File with the client certificate."
)
@click.option("--key_file", "key_file", type=str, help="File with the client key.")
@click.option("--ca_file", "ca_file", type=str, help="The CA certificate.")
@click.option(
    "-v",
    "--verbose",
    "verbose",
    count=True,
    default=0,
    show_default=False,
    help="Increase verbosity -v (info)/-vv (warning)/-vvv (debug)",
)
@click.option(
    "--version",
    is_flag=True,
    is_eager=True,
    expose_value=False,
    callback=print_version,
)
@click.option(
    "--timeout",
    "timeout",
    type=int,
    default=2,
    show_default=True,
    help="Timeout in seconds for the API queries (0 to disable)",
)
@click.pass_context
@nagiosplugin.guarded
def main(
    ctx: click.Context,
    endpoints: List[str],
    cert_file: str,
    key_file: str,
    ca_file: str,
    verbose: int,
    timeout: int,
) -> None:
    """Nagios plugin for patroni."""
    # FIXME Not all "is/has" services have the same return code for ok. Check if it's ok

    # The group-level options are handed to the sub-commands through ctx.obj
    # rather than ctx.parent.params: ctx.parent is typed Optional[Context],
    # so mypy reports
    #
    # error: Item "None" of "Optional[Context]" has an attribute "params" [union-attr]
    #
    # unless every command tests ctx.parent for None, which looked ugly.
    connection_info = ConnectionInfo(endpoints, cert_file, key_file, ca_file)
    ctx.obj = Parameters(connection_info, timeout, verbose)
|
|
|
|
|
|
@main.command(name="cluster_node_count")  # required otherwise _ are converted to -
@click.option(
    "-w",
    "--warning",
    "warning",
    type=str,
    help="Warning threshold for the number of nodes.",
)
@click.option(
    "-c",
    "--critical",
    "critical",
    type=str,
    help="Critical threshold for the number of nodes.",
)
@click.option(
    "--running-warning",
    "running_warning",
    type=str,
    help="Warning threshold for the number of running nodes.",
)
@click.option(
    "--running-critical",
    "running_critical",
    type=str,
    help="Critical threshold for the number of running nodes.",
)
@click.pass_context
@nagiosplugin.guarded
def cluster_node_count(
    ctx: click.Context,
    warning: str,
    critical: str,
    running_warning: str,
    running_critical: str,
) -> None:
    """Count the number of nodes in the cluster.

    \b
    Check:
    * Compares the number of nodes against the normal and running node warning and critical thresholds.
    * `OK`: If they are not provided.

    \b
    Perfdata:
    * `members`: the member count.
    * all the roles of the nodes in the cluster with their number.
    * all the statuses of the nodes in the cluster with their number.
    """
    params = ctx.obj
    resource = ClusterNodeCount(params.connection_info)
    # Only these two contexts carry thresholds; the role/status contexts
    # are registered without any.
    members_context = nagiosplugin.ScalarContext("members", warning, critical)
    running_context = nagiosplugin.ScalarContext(
        "state_running", running_warning, running_critical
    )
    probe = nagiosplugin.Check(
        resource,
        members_context,
        running_context,
        nagiosplugin.ScalarContext("member_roles"),
        nagiosplugin.ScalarContext("member_statuses"),
    )
    probe.main(verbose=params.verbose, timeout=params.timeout)
|
|
|
|
|
|
@main.command(name="cluster_has_leader")
@click.pass_context
@nagiosplugin.guarded
def cluster_has_leader(ctx: click.Context) -> None:
    """Check if the cluster has a leader.

    Note: there is no difference between a normal and standby leader.

    \b
    Check:
    * `OK`: if there is a leader node.
    * `CRITICAL`: otherwise

    Perfdata: `has_leader` is 1 if there is a leader node, 0 otherwise
    """
    params = ctx.obj
    resource = ClusterHasLeader(params.connection_info)
    # "@0:0" is an inverted range: the check is critical when has_leader is 0.
    leader_context = nagiosplugin.ScalarContext("has_leader", None, "@0:0")
    probe = nagiosplugin.Check(resource, leader_context, ClusterHasLeaderSummary())
    probe.main(verbose=params.verbose, timeout=params.timeout)
|
|
|
|
|
|
@main.command(name="cluster_has_replica")
@click.option(
    "-w",
    "--warning",
    "warning",
    type=str,
    help="Warning threshold for the number of healthy replica nodes.",
)
@click.option(
    "-c",
    "--critical",
    "critical",
    type=str,
    help="Critical threshold for the number of healthy replica nodes.",
)
@click.option("--max-lag", "max_lag", type=str, help="maximum allowed lag")
@click.pass_context
@nagiosplugin.guarded
def cluster_has_replica(
    ctx: click.Context, warning: str, critical: str, max_lag: str
) -> None:
    """Check if the cluster has healthy replicas.

    \b
    A healthy replica:
    * is in running state
    * has a replica role
    * has a lag lower or equal to max_lag

    \b
    Check:
    * `OK`: if the healthy_replica count and their lag are compatible with the replica count threshold.
    * `WARNING` / `CRITICAL`: otherwise

    \b
    Perfdata:
    * healthy_replica & unhealthy_replica count
    * the lag of each replica labelled with "member name"_lag
    """
    # The lag limit is optional; when given it is converted by size_to_byte
    # before being handed to the resource.
    lag_limit = None if max_lag is None else size_to_byte(max_lag)

    params = ctx.obj
    resource = ClusterHasReplica(params.connection_info, lag_limit)
    healthy_context = nagiosplugin.ScalarContext("healthy_replica", warning, critical)
    probe = nagiosplugin.Check(
        resource,
        healthy_context,
        nagiosplugin.ScalarContext("unhealthy_replica"),
        nagiosplugin.ScalarContext("replica_lag"),
    )
    probe.main(verbose=params.verbose, timeout=params.timeout)
|
|
|
|
|
|
@main.command(name="cluster_config_has_changed")
@click.option("--hash", "config_hash", type=str, help="A hash to compare with.")
@click.option(
    "-s",
    "--state-file",
    "state_file",
    type=str,
    help="A state file to store the hash of the configuration.",
)
@click.option(
    "--save",
    "save_config",
    is_flag=True,
    default=False,
    help="Set the current configuration hash as the reference for future calls.",
)
@click.pass_context
@nagiosplugin.guarded
def cluster_config_has_changed(
    ctx: click.Context, config_hash: str, state_file: str, save_config: bool
) -> None:
    """Check if the hash of the configuration has changed.

    Note: either a hash or a state file must be provided for this service to work.

    \b
    Check:
    * `OK`: The hash didn't change
    * `CRITICAL`: The hash of the configuration has changed compared to the input (`--hash`) or last time (`--state-file`)

    \b
    Perfdata:
    * `is_configuration_changed` is 1 if the configuration has changed
    """
    # Note: hash cannot be in the perf data = not a number

    # Exactly one reference source is accepted: giving both or neither of
    # --hash / --state-file is a usage error.
    if (config_hash is None) == (state_file is None):
        raise click.UsageError(
            "Either --hash or --state-file should be provided for this service", ctx
        )

    old_config_hash = config_hash
    if state_file is not None:
        # Read the reference hash from the state file. The cookie is only
        # read here, never committed; close it even if the read fails.
        cookie = nagiosplugin.Cookie(state_file)
        cookie.open()
        try:
            old_config_hash = cookie.get("hash")
        finally:
            cookie.close()

    check = nagiosplugin.Check()
    check.add(
        # state_file and save_config are forwarded to the resource as-is.
        ClusterConfigHasChanged(
            ctx.obj.connection_info, old_config_hash, state_file, save_config
        ),
        # "@1:1" is an inverted range: critical when the metric equals 1.
        nagiosplugin.ScalarContext("is_configuration_changed", None, "@1:1"),
        ClusterConfigHasChangedSummary(old_config_hash),
    )
    check.main(verbose=ctx.obj.verbose, timeout=ctx.obj.timeout)
|
|
|
|
|
|
@main.command(name="cluster_is_in_maintenance")
@click.pass_context
@nagiosplugin.guarded
def cluster_is_in_maintenance(ctx: click.Context) -> None:
    """Check if the cluster is in maintenance mode or paused.

    \b
    Check:
    * `OK`: If the cluster is not in maintenance mode.
    * `CRITICAL`: otherwise.

    \b
    Perfdata:
    * `is_in_maintenance` is 1 if the cluster is in maintenance mode, 0 otherwise
    """
    check = nagiosplugin.Check()
    check.add(
        ClusterIsInMaintenance(ctx.obj.connection_info),
        # Critical range "0:0" (not inverted): any value outside 0..0 is
        # critical, i.e. the check fails when is_in_maintenance is 1.
        nagiosplugin.ScalarContext("is_in_maintenance", None, "0:0"),
    )
    check.main(verbose=ctx.obj.verbose, timeout=ctx.obj.timeout)
|
|
|
|
|
|
@main.command(name="node_is_primary")
@click.pass_context
@nagiosplugin.guarded
def node_is_primary(ctx: click.Context) -> None:
    """Check if the node is the primary with the leader lock.

    \b
    Check:
    * `OK`: if the node is a primary with the leader lock.
    * `CRITICAL:` otherwise

    Perfdata: `is_primary` is 1 if the node is a primary with the leader lock, 0 otherwise.
    """
    params = ctx.obj
    resource = NodeIsPrimary(params.connection_info)
    # "@0:0" is an inverted range: the check is critical when is_primary is 0.
    primary_context = nagiosplugin.ScalarContext("is_primary", None, "@0:0")
    probe = nagiosplugin.Check(resource, primary_context, NodeIsPrimarySummary())
    probe.main(verbose=params.verbose, timeout=params.timeout)
|
|
|
|
|
|
@main.command(name="node_is_replica")
@click.option("--max-lag", "max_lag", type=str, help="maximum allowed lag")
@click.pass_context
@nagiosplugin.guarded
def node_is_replica(ctx: click.Context, max_lag: str) -> None:
    """Check if the node is a running replica with no noloadbalance tag.

    \b
    Check:
    * `OK`: if the node is a running replica with no noloadbalance tag and the lag is under the maximum threshold.
    * `CRITICAL`: otherwise

    Perfdata: `is_replica` is 1 if the node is a running replica with no noloadbalance tag and the lag is under the maximum threshold, 0 otherwise.
    """
    check = nagiosplugin.Check()
    check.add(
        # max_lag is forwarded untouched (None when --max-lag is not given);
        # how the lag is enforced is up to NodeIsReplica.
        NodeIsReplica(ctx.obj.connection_info, max_lag),
        # "@0:0" is an inverted range: critical when is_replica is 0.
        nagiosplugin.ScalarContext("is_replica", None, "@0:0"),
        NodeIsReplicaSummary(max_lag),
    )
    check.main(verbose=ctx.obj.verbose, timeout=ctx.obj.timeout)
|
|
|
|
|
|
@main.command(name="node_is_pending_restart")
@click.pass_context
@nagiosplugin.guarded
def node_is_pending_restart(ctx: click.Context) -> None:
    """Check if the node is in pending restart state.

    This situation can arise if the configuration has been modified but
    requires a restart of PostgreSQL to take effect.

    \b
    Check:
    * `OK`: if the node has no pending restart tag.
    * `CRITICAL`: otherwise

    Perfdata: `is_pending_restart` is 1 if the node has pending restart tag, 0 otherwise.
    """
    check = nagiosplugin.Check()
    check.add(
        NodeIsPendingRestart(ctx.obj.connection_info),
        # "@1:1" is an inverted range: the check is critical when
        # is_pending_restart equals 1, i.e. when a restart is pending.
        nagiosplugin.ScalarContext("is_pending_restart", None, "@1:1"),
        NodeIsPendingRestartSummary(),
    )
    check.main(verbose=ctx.obj.verbose, timeout=ctx.obj.timeout)
|
|
|
|
|
|
@main.command(name="node_tl_has_changed")
@click.option(
    "--timeline", "timeline", type=str, help="A timeline number to compare with."
)
@click.option(
    "-s",
    "--state-file",
    "state_file",
    type=str,
    help="A state file to store the last tl number into.",
)
@click.option(
    "--save",
    "save_tl",
    is_flag=True,
    default=False,
    help="Set the current timeline number as the reference for future calls.",
)
@click.pass_context
@nagiosplugin.guarded
def node_tl_has_changed(
    ctx: click.Context, timeline: str, state_file: str, save_tl: bool
) -> None:
    """Check if the timeline has changed.

    Note: either a timeline or a state file must be provided for this service to work.

    \b
    Check:
    * `OK`: The timeline is the same as last time (`--state-file`) or the given timeline (`--timeline`)
    * `CRITICAL`: The tl is not the same.

    \b
    Perfdata:
    * `is_timeline_changed` is 1 if the tl has changed, 0 otherwise
    * the timeline
    """
    # Exactly one reference source is accepted: giving both or neither of
    # --timeline / --state-file is a usage error.
    if (timeline is None) == (state_file is None):
        raise click.UsageError(
            "Either --timeline or --state-file should be provided for this service", ctx
        )

    old_timeline = timeline
    if state_file is not None:
        # Read the reference timeline from the state file. The cookie is
        # only read here, never committed; close it even if the read fails.
        cookie = nagiosplugin.Cookie(state_file)
        cookie.open()
        try:
            old_timeline = cookie.get("timeline")
        finally:
            cookie.close()

    check = nagiosplugin.Check()
    check.add(
        # state_file and save_tl are forwarded to the resource as-is.
        NodeTLHasChanged(ctx.obj.connection_info, old_timeline, state_file, save_tl),
        # "@1:1" is an inverted range: critical when the metric equals 1.
        nagiosplugin.ScalarContext("is_timeline_changed", None, "@1:1"),
        nagiosplugin.ScalarContext("timeline"),
        NodeTLHasChangedSummary(old_timeline),
    )
    check.main(verbose=ctx.obj.verbose, timeout=ctx.obj.timeout)
|
|
|
|
|
|
@main.command(name="node_patroni_version")
@click.option(
    "--patroni-version",
    "patroni_version",
    type=str,
    required=True,
    help="Patroni version to compare to",
)
@click.pass_context
@nagiosplugin.guarded
def node_patroni_version(ctx: click.Context, patroni_version: str) -> None:
    """Check if the version is equal to the input

    \b
    Check:
    * `OK`: The version is the same as the input `--patroni-version`
    * `CRITICAL`: otherwise.

    \b
    Perfdata:
    * `is_version_ok` is 1 if version is ok, 0 otherwise
    """
    # TODO the version cannot be written in perfdata find something else ?
    params = ctx.obj
    resource = NodePatroniVersion(params.connection_info, patroni_version)
    # "@0:0" is an inverted range: critical when is_version_ok is 0.
    version_ok_context = nagiosplugin.ScalarContext("is_version_ok", None, "@0:0")
    probe = nagiosplugin.Check(
        resource,
        version_ok_context,
        nagiosplugin.ScalarContext("patroni_version"),
        NodePatroniVersionSummary(patroni_version),
    )
    probe.main(verbose=params.verbose, timeout=params.timeout)
|
|
|
|
|
|
@main.command(name="node_is_alive")
@click.pass_context
@nagiosplugin.guarded
def node_is_alive(ctx: click.Context) -> None:
    """Check if the node is alive ie patroni is running.

    \b
    Check:
    * `OK`: If patroni is running.
    * `CRITICAL`: otherwise.

    \b
    Perfdata:
    * `is_running` is 1 if patroni is running, 0 otherwise
    """
    # NOTE(review): the help text above advertises `is_running` while the
    # context registered below is `is_alive` - confirm which metric name
    # NodeIsAlive actually emits and align the two.
    params = ctx.obj
    resource = NodeIsAlive(params.connection_info)
    # "@0:0" is an inverted range: critical when the metric is 0.
    alive_context = nagiosplugin.ScalarContext("is_alive", None, "@0:0")
    probe = nagiosplugin.Check(resource, alive_context, NodeIsAliveSummary())
    probe.main(verbose=params.verbose, timeout=params.timeout)
|