# check-patroni/check_patroni/cli.py

import logging
import re
from configparser import ConfigParser
from typing import List
import click
import nagiosplugin
from . import __version__, _log
from .cluster import (
ClusterConfigHasChanged,
ClusterConfigHasChangedSummary,
ClusterHasLeader,
ClusterHasLeaderSummary,
ClusterHasReplica,
ClusterHasScheduledAction,
ClusterIsInMaintenance,
ClusterNodeCount,
)
from .convert import size_to_byte
from .node import (
NodeIsAlive,
NodeIsAliveSummary,
NodeIsLeader,
NodeIsLeaderSummary,
NodeIsPendingRestart,
NodeIsPendingRestartSummary,
NodeIsPrimary,
NodeIsPrimarySummary,
NodeIsReplica,
NodeIsReplicaSummary,
NodePatroniVersion,
NodePatroniVersionSummary,
NodeTLHasChanged,
NodeTLHasChangedSummary,
)
from .types import ConnectionInfo, Parameters
DEFAULT_CFG = "config.ini"
handler = logging.StreamHandler()
handler.setFormatter(logging.Formatter("%(levelname)s - %(message)s"))
_log.addHandler(handler)
def print_version(ctx: click.Context, param: str, value: str) -> None:
if not value or ctx.resilient_parsing:
return
click.echo(f"Version {__version__}")
ctx.exit()
def configure(ctx: click.Context, param: str, filename: str) -> None:
"""Use a config file for the parameters
stolen from https://jwodder.github.io/kbits/posts/click-config/
"""
# FIXME should use click-configfile / click-config-file ?
cfg = ConfigParser()
cfg.read(filename)
ctx.default_map = {}
for sect in cfg.sections():
command_path = sect.split(".")
if command_path[0] != "options":
continue
defaults = ctx.default_map
for cmdname in command_path[1:]:
defaults = defaults.setdefault(cmdname, {})
defaults.update(cfg[sect])
try:
# endpoints is an array of addresses separated by ,
if isinstance(defaults["endpoints"], str):
defaults["endpoints"] = re.split(r"\s*,\s*", defaults["endpoints"])
except KeyError:
pass
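# A minimal sketch of an INI file that "configure" above can consume
# (hypothetical paths and addresses). Only sections whose name starts with
# "options" are read; a sub-section such as [options.cluster_has_replica]
# provides defaults for that sub-command, and "endpoints" may be written as a
# comma separated list:
#
#   [options]
#   endpoints = https://10.20.199.3:8008, https://10.20.199.4:8008
#   cert_file = ./ssl/my-cert.pem
#   key_file = ./ssl/my-key.pem
#   ca_file = ./ssl/CA-cert.pem
#   timeout = 10
#
#   [options.cluster_has_replica]
#   max_lag = 1MB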
@click.group()
@click.option(
"--config",
type=click.Path(dir_okay=False),
default=DEFAULT_CFG,
callback=configure,
is_eager=True,
expose_value=False,
help="Read option defaults from the specified INI file",
show_default=True,
)
@click.option(
"-e",
"--endpoints",
"endpoints",
type=str,
multiple=True,
default=["http://127.0.0.1:8008"],
help=(
"Patroni API endpoint. Can be specified multiple times or as a list "
"of comma separated addresses. "
"The node services checks the status of one node, therefore if "
"several addresses are specified they should point to different "
"interfaces on the same node. The cluster services check the "
"status of the cluster, therefore it's better to give a list of "
"all Patroni node addresses."
),
show_default=True,
)
@click.option(
"--cert_file",
"cert_file",
type=click.Path(exists=True),
default=None,
help="File with the client certificate.",
)
@click.option(
"--key_file",
"key_file",
type=click.Path(exists=True),
default=None,
help="File with the client key.",
)
@click.option(
"--ca_file",
"ca_file",
type=click.Path(exists=True),
default=None,
help="The CA certificate.",
)
@click.option(
"-v",
"--verbose",
"verbose",
count=True,
default=0,
help="Increase verbosity -v (info)/-vv (warning)/-vvv (debug)",
show_default=False,
)
@click.option(
"--version", is_flag=True, callback=print_version, expose_value=False, is_eager=True
)
@click.option(
"--timeout",
"timeout",
default=2,
type=int,
help="Timeout in seconds for the API queries (0 to disable)",
show_default=True,
)
@click.pass_context
@nagiosplugin.guarded
def main(
ctx: click.Context,
endpoints: List[str],
cert_file: str,
key_file: str,
ca_file: str,
verbose: int,
timeout: int,
) -> None:
"""Nagios plugin that uses Patroni's REST API to monitor a Patroni cluster."""
# FIXME Not all "is/has" services have the same return code for ok. Check if it's ok
# We use this to pass parameters instead of ctx.parent.params because the
# latter is typed as Optional[Context] and mypy complains with the following
# error unless we test if ctx.parent is None, which looked ugly.
#
# error: Item "None" of "Optional[Context]" has an attribute "params" [union-attr]
# The config file allows endpoints to be specified as a comma-separated list.
# To avoid confusion, we allow the same in the command line parameters.
tendpoints: List[str] = []
for e in endpoints:
tendpoints += re.split(r"\s*,\s*", e)
endpoints = tendpoints
if verbose == 3:
logging.getLogger("urllib3").addHandler(handler)
logging.getLogger("urllib3").setLevel(logging.DEBUG)
_log.setLevel(logging.DEBUG)
connection_info: ConnectionInfo
if cert_file is None and key_file is None:
connection_info = ConnectionInfo(endpoints, None, ca_file)
else:
connection_info = ConnectionInfo(endpoints, (cert_file, key_file), ca_file)
ctx.obj = Parameters(
connection_info,
timeout,
verbose,
)
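# Illustrative invocations (hypothetical addresses, assuming the console script
# is installed as "check_patroni"). Endpoints can be repeated or given as one
# comma separated list, and the TLS options are only needed for a protected API:
#
#   check_patroni -e https://10.20.199.3:8008 -e https://10.20.199.4:8008 cluster_has_leader
#   check_patroni -e "https://10.20.199.3:8008,https://10.20.199.4:8008" \
#       --cert_file ./ssl/my-cert.pem --key_file ./ssl/my-key.pem cluster_node_count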
@main.command(name="cluster_node_count") # required otherwise _ are converted to -
@click.option(
"-w",
"--warning",
"warning",
type=str,
help="Warning threshold for the number of nodes.",
)
@click.option(
"-c",
"--critical",
"critical",
type=str,
help="Critical threshold for the number of nodes.",
)
@click.option(
"--healthy-warning",
"healthy_warning",
type=str,
help="Warning threshold for the number of healthy nodes (running + streaming).",
)
@click.option(
"--healthy-critical",
"healthy_critical",
type=str,
help="Critical threshold for the number of healthy nodes (running + streaming).",
)
@click.pass_context
@nagiosplugin.guarded
def cluster_node_count(
ctx: click.Context,
warning: str,
critical: str,
healthy_warning: str,
healthy_critical: str,
) -> None:
"""Count the number of nodes in the cluster.
\b
The role refers to the role of the server in the cluster. Possible values
are:
* master or leader
* replica
* standby_leader
* sync_standby
* demoted
* promoted
* uninitialized
\b
The state refers to the state of PostgreSQL. Possible values are:
* initializing new cluster, initdb failed
* running custom bootstrap script, custom bootstrap failed
* starting, start failed
* restarting, restart failed
* running, streaming, in archive recovery
* stopping, stopped, stop failed
* creating replica
* crashed
\b
The "healthy" checks only ensures that:
* a leader has the running state
* a standby_leader has the running or streaming (V3.0.4) state
* a replica or sync-standby has the running or streaming (V3.0.4) state
Since we don't check the lag or timeline, "in archive recovery" is not considered a valid state
for this service. See cluster_has_leader and cluster_has_replica for specialized checks.
\b
Check:
* Compares the member count and the healthy member count against their respective warning and critical thresholds.
* `OK`: If the thresholds are not provided.
\b
Perfdata:
* `members`: the member count.
* `healthy_members`: the running and streaming member count.
* all the roles of the nodes in the cluster with their count (start with "role_").
* all the statuses of the nodes in the cluster with their count (start with "state_").
"""
check = nagiosplugin.Check()
check.add(
ClusterNodeCount(ctx.obj.connection_info),
nagiosplugin.ScalarContext(
"members",
warning,
critical,
),
nagiosplugin.ScalarContext(
"healthy_members",
healthy_warning,
healthy_critical,
),
nagiosplugin.ScalarContext("member_roles"),
nagiosplugin.ScalarContext("member_statuses"),
)
check.main(verbose=ctx.obj.verbose, timeout=ctx.obj.timeout)
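# A usage sketch with Nagios-style ranges (hypothetical values): "2:" raises an
# alert when the corresponding count drops below 2, "1:" when it drops below 1:
#
#   check_patroni -e https://10.20.199.3:8008 cluster_node_count \
#       --warning 2: --critical 1: --healthy-warning 2: --healthy-critical 1: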
@main.command(name="cluster_has_leader")
@click.pass_context
@nagiosplugin.guarded
def cluster_has_leader(ctx: click.Context) -> None:
"""Check if the cluster has a leader.
This check applies to any kind of leaders including standby leaders.
A leader is a node with the "leader" role and a "running" state.
A standby leader is a node with a "standby_leader" role and a "streaming"
or "in archive recovery" state. Please note that log shipping could be
stuck because the WALs are not available or applicable. Patroni doesn't
provide information about the origin cluster (timeline or lag), so we
cannot check if there is a problem in that particular case. That's why we
issue a warning when the node is "in archive recovery". We suggest using
other supervision tools to do this (e.g. check_pgactivity).
\b
Check:
* `OK`: if there is a leader node.
* `WARNING`: if there is a standby leader in archive recovery.
* `CRITICAL`: otherwise.
\b
Perfdata:
* `has_leader` is 1 if there is any kind of leader node, 0 otherwise
* `is_standby_leader_in_arc_rec` is 1 if the standby leader node is "in
archive recovery", 0 otherwise
* `is_standby_leader` is 1 if there is a standby leader node, 0 otherwise
* `is_leader` is 1 if there is a "classical" leader node, 0 otherwise
"""
check = nagiosplugin.Check()
check.add(
ClusterHasLeader(ctx.obj.connection_info),
nagiosplugin.ScalarContext("has_leader", None, "@0:0"),
nagiosplugin.ScalarContext("is_standby_leader_in_arc_rec", "@1:1", None),
nagiosplugin.ScalarContext("is_leader", None, None),
nagiosplugin.ScalarContext("is_standby_leader", None, None),
ClusterHasLeaderSummary(),
)
check.main(verbose=ctx.obj.verbose, timeout=ctx.obj.timeout)
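# Typical call (hypothetical endpoint); the exact output text is produced by
# nagiosplugin and ClusterHasLeaderSummary, so the second line is only indicative:
#
#   check_patroni -e https://10.20.199.3:8008 cluster_has_leader
#   CLUSTERHASLEADER OK - The cluster has a leader. | has_leader=1 is_leader=1 ...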
@main.command(name="cluster_has_replica")
@click.option(
"-w",
"--warning",
"warning",
type=str,
help="Warning threshold for the number of healthy replica nodes.",
)
@click.option(
"-c",
"--critical",
"critical",
type=str,
help="Critical threshold for the number of healthy replica nodes.",
)
@click.option(
"--sync-warning",
"sync_warning",
type=str,
help="Warning threshold for the number of sync replica.",
)
@click.option(
"--sync-critical",
"sync_critical",
type=str,
help="Critical threshold for the number of sync replica.",
)
@click.option("--max-lag", "max_lag", type=str, help="maximum allowed lag")
@click.pass_context
@nagiosplugin.guarded
def cluster_has_replica(
ctx: click.Context,
warning: str,
critical: str,
sync_warning: str,
sync_critical: str,
max_lag: str,
) -> None:
"""Check if the cluster has healthy replicas and/or if some are sync standbies
\b
For patroni (and this check):
* a replica is `streaming` if `pg_stat_wal_receiver` says so.
* a replica is `in archive recovery` if it's not `streaming` and has a `restore_command`.
\b
A healthy replica:
* has a `replica` or `sync_standby` role
* has the same timeline as the leader and
* is in `running` state (patroni < V3.0.4)
* is in `streaming` or `in archive recovery` state (patroni >= V3.0.4)
* has a lag lower than or equal to `max_lag`
Please note that a replica `in archive recovery` could be stuck because the WALs
are not available or applicable (the server's timeline has diverged from the
leader's). We already detect the latter but we will miss the former.
Therefore, it's preferable to check for the lag in addition to the healthy
state if you rely on log shipping to help lagging standbies catch up.
Since we require a healthy replica to have the same timeline as the
leader, it's possible that we raise alerts when the cluster is performing a
switchover or failover and the standbies are in the process of catching up with
the new leader. The alert shouldn't last long.
\b
Check:
* `OK`: if the healthy_replica count and their lag are compatible with the replica count thresholds,
and if the sync_replica count is compatible with the sync replica count thresholds.
* `WARNING` / `CRITICAL`: otherwise
\b
Perfdata:
* healthy_replica & unhealthy_replica count
* the number of sync_replica, they are included in the previous count
* the lag of each replica labelled with "member name"_lag
* the timeline of each replica labelled with "member name"_timeline
* a boolean to tell if the node is a sync standby labelled with "member name"_sync
"""
tmax_lag = size_to_byte(max_lag) if max_lag is not None else None
check = nagiosplugin.Check()
check.add(
ClusterHasReplica(ctx.obj.connection_info, tmax_lag),
nagiosplugin.ScalarContext(
"healthy_replica",
warning,
critical,
),
nagiosplugin.ScalarContext(
"sync_replica",
sync_warning,
sync_critical,
),
nagiosplugin.ScalarContext("unhealthy_replica"),
nagiosplugin.ScalarContext("replica_lag"),
nagiosplugin.ScalarContext("replica_timeline"),
nagiosplugin.ScalarContext("replica_sync"),
)
check.main(verbose=ctx.obj.verbose, timeout=ctx.obj.timeout)
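# A usage sketch (hypothetical values): warn below 2 healthy replicas, go
# critical below 1, and count replicas lagging by more than 1MB as unhealthy
# (size_to_byte is assumed here to accept unit suffixes such as kB or MB):
#
#   check_patroni -e https://10.20.199.3:8008 cluster_has_replica \
#       --warning 2: --critical 1: --max-lag 1MB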
@main.command(name="cluster_config_has_changed")
@click.option("--hash", "config_hash", type=str, help="A hash to compare with.")
@click.option(
"-s",
"--state-file",
"state_file",
type=str,
help="A state file to store the hash of the configuration.",
)
@click.option(
"--save",
"save_config",
is_flag=True,
default=False,
help="Set the current configuration hash as the reference for future calls.",
)
@click.pass_context
@nagiosplugin.guarded
def cluster_config_has_changed(
ctx: click.Context, config_hash: str, state_file: str, save_config: bool
) -> None:
"""Check if the hash of the configuration has changed.
Note: either a hash or a state file must be provided for this service to work.
\b
Check:
* `OK`: The hash didn't change
* `CRITICAL`: The hash of the configuration has changed compared to the input (`--hash`) or the last run (`--state-file`)
\b
Perfdata:
* `is_configuration_changed` is 1 if the configuration has changed
"""
# Note: the hash cannot be put in the perfdata because it is not a number
if (config_hash is None and state_file is None) or (
config_hash is not None and state_file is not None
):
raise click.UsageError(
"Either --hash or --state-file should be provided for this service", ctx
)
old_config_hash = config_hash
if state_file is not None:
cookie = nagiosplugin.Cookie(state_file)
cookie.open()
old_config_hash = cookie.get("hash")
cookie.close()
check = nagiosplugin.Check()
check.add(
ClusterConfigHasChanged(
ctx.obj.connection_info, old_config_hash, state_file, save_config
),
nagiosplugin.ScalarContext("is_configuration_changed", None, "@1:1"),
ClusterConfigHasChangedSummary(old_config_hash),
)
check.main(verbose=ctx.obj.verbose, timeout=ctx.obj.timeout)
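# A usage sketch with a state file (hypothetical path): --save records the hash
# of the running configuration for later comparisons, while --hash <value> is
# the stateless alternative:
#
#   check_patroni -e https://10.20.199.3:8008 cluster_config_has_changed \
#       -s /tmp/check_patroni_config.state --save
#   check_patroni -e https://10.20.199.3:8008 cluster_config_has_changed \
#       -s /tmp/check_patroni_config.state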
@main.command(name="cluster_is_in_maintenance")
@click.pass_context
@nagiosplugin.guarded
def cluster_is_in_maintenance(ctx: click.Context) -> None:
"""Check if the cluster is in maintenance mode or paused.
\b
Check:
* `OK`: If the cluster is not in maintenance mode.
* `CRITICAL`: otherwise (the cluster is paused / in maintenance mode).
\b
Perfdata:
* `is_in_maintenance` is 1 if the cluster is in maintenance mode, 0 otherwise
"""
check = nagiosplugin.Check()
check.add(
ClusterIsInMaintenance(ctx.obj.connection_info),
nagiosplugin.ScalarContext("is_in_maintenance", None, "0:0"),
)
check.main(verbose=ctx.obj.verbose, timeout=ctx.obj.timeout)
@main.command(name="cluster_has_scheduled_action")
@click.pass_context
@nagiosplugin.guarded
def cluster_has_scheduled_action(ctx: click.Context) -> None:
"""Check if the cluster has a scheduled action (switchover or restart)
\b
Check:
* `OK`: If the cluster has no scheduled action
* `CRITICAL`: otherwise.
\b
Perfdata:
* `has_scheduled_actions` is 1 if the cluster has scheduled actions.
* `scheduled_switchover` is 1 if the cluster has a scheduled switchover.
* `scheduled_restart` counts the number of scheduled restarts in the cluster.
"""
check = nagiosplugin.Check()
check.add(
ClusterHasScheduledAction(ctx.obj.connection_info),
nagiosplugin.ScalarContext("has_scheduled_actions", None, "0:0"),
nagiosplugin.ScalarContext("scheduled_switchover"),
nagiosplugin.ScalarContext("scheduled_restart"),
)
check.main(verbose=ctx.obj.verbose, timeout=ctx.obj.timeout)
@main.command(name="node_is_primary")
@click.pass_context
@nagiosplugin.guarded
def node_is_primary(ctx: click.Context) -> None:
"""Check if the node is the primary with the leader lock.
This service is not valid for a standby leader, because this kind of node is not a primary.
\b
Check:
* `OK`: if the node is a primary with the leader lock.
* `CRITICAL`: otherwise
Perfdata: `is_primary` is 1 if the node is a primary with the leader lock, 0 otherwise.
"""
check = nagiosplugin.Check()
check.add(
NodeIsPrimary(ctx.obj.connection_info),
nagiosplugin.ScalarContext("is_primary", None, "@0:0"),
NodeIsPrimarySummary(),
)
check.main(verbose=ctx.obj.verbose, timeout=ctx.obj.timeout)
@main.command(name="node_is_leader")
@click.option(
"--is-standby-leader",
"check_standby_leader",
is_flag=True,
default=False,
help="Check for a standby leader",
)
@click.pass_context
@nagiosplugin.guarded
def node_is_leader(ctx: click.Context, check_standby_leader: bool) -> None:
"""Check if the node is a leader node.
This check applies to any kind of leaders including standby leaders.
To check explicitly for a standby leader use the `--is-standby-leader` option.
\b
Check:
* `OK`: if the node is a leader.
* `CRITICAL`: otherwise
Perfdata: `is_leader` is 1 if the node is a leader node, 0 otherwise.
"""
check = nagiosplugin.Check()
check.add(
NodeIsLeader(ctx.obj.connection_info, check_standby_leader),
nagiosplugin.ScalarContext("is_leader", None, "@0:0"),
NodeIsLeaderSummary(check_standby_leader),
)
check.main(verbose=ctx.obj.verbose, timeout=ctx.obj.timeout)
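# Usage sketch (hypothetical endpoint): check for any kind of leader, or
# specifically for a standby leader with the flag documented above:
#
#   check_patroni -e https://10.20.199.3:8008 node_is_leader
#   check_patroni -e https://10.20.199.3:8008 node_is_leader --is-standby-leader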
@main.command(name="node_is_replica")
@click.option("--max-lag", "max_lag", type=str, help="maximum allowed lag")
@click.option(
"--is-sync",
"check_is_sync",
is_flag=True,
default=False,
help="check if the replica is synchronous",
)
@click.option(
"--is-async",
"check_is_async",
is_flag=True,
default=False,
help="check if the replica is asynchronous",
)
@click.pass_context
@nagiosplugin.guarded
def node_is_replica(
ctx: click.Context, max_lag: str, check_is_sync: bool, check_is_async: bool
) -> None:
"""Check if the node is a replica with no noloadbalance tag.
It is possible to check if the node is synchronous or asynchronous. If
nothing is specified, any kind of replica is accepted. When checking for a
synchronous replica, it's not possible to specify a lag.
This service uses the following Patroni endpoints: replica, asynchronous
and synchronous. The first two implement the `lag` tag. For these endpoints,
the state of a replica node doesn't reflect the replication state
(`streaming` or `in archive recovery`); we only know whether it's `running`.
The timeline is also not checked.
Therefore, if a cluster is using asynchronous replication, it is
recommended to check for the lag to detect a divergence as soon as possible.
\b
Check:
* `OK`: if the node is a running replica with no noloadbalance tag and the lag is under the maximum threshold.
* `CRITICAL`: otherwise
Perfdata: `is_replica` is 1 if the node is a running replica with no noloadbalance tag and the lag is under the maximum threshold, 0 otherwise.
"""
if check_is_sync and max_lag is not None:
raise click.UsageError(
"--is-sync and --max-lag cannot be provided at the same time for this service",
ctx,
)
if check_is_sync and check_is_async:
raise click.UsageError(
"--is-sync and --is-async cannot be provided at the same time for this service",
ctx,
)
check = nagiosplugin.Check()
check.add(
NodeIsReplica(ctx.obj.connection_info, max_lag, check_is_sync, check_is_async),
nagiosplugin.ScalarContext("is_replica", None, "@0:0"),
NodeIsReplicaSummary(max_lag, check_is_sync, check_is_async),
)
check.main(verbose=ctx.obj.verbose, timeout=ctx.obj.timeout)
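# Usage sketches (hypothetical endpoint and lag). As enforced above, --max-lag
# cannot be combined with --is-sync, and --is-sync cannot be combined with
# --is-async:
#
#   check_patroni -e https://10.20.199.3:8008 node_is_replica --max-lag 1MB
#   check_patroni -e https://10.20.199.3:8008 node_is_replica --is-sync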
@main.command(name="node_is_pending_restart")
@click.pass_context
@nagiosplugin.guarded
def node_is_pending_restart(ctx: click.Context) -> None:
"""Check if the node is in pending restart state.
This situation can arise if the configuration has been modified and
requires a restart of PostgreSQL to take effect.
\b
Check:
* `OK`: if the node has no pending restart tag.
* `CRITICAL`: otherwise
Perfdata: `is_pending_restart` is 1 if the node has the pending restart tag, 0 otherwise.
"""
check = nagiosplugin.Check()
check.add(
NodeIsPendingRestart(ctx.obj.connection_info),
nagiosplugin.ScalarContext("is_pending_restart", None, "0:0"),
NodeIsPendingRestartSummary(),
)
check.main(verbose=ctx.obj.verbose, timeout=ctx.obj.timeout)
@main.command(name="node_tl_has_changed")
@click.option(
"--timeline", "timeline", type=str, help="A timeline number to compare with."
)
@click.option(
"-s",
"--state-file",
"state_file",
type=str,
help="A state file to store the last tl number into.",
)
@click.option(
"--save",
"save_tl",
is_flag=True,
default=False,
help="Set the current timeline number as the reference for future calls.",
)
@click.pass_context
@nagiosplugin.guarded
def node_tl_has_changed(
ctx: click.Context, timeline: str, state_file: str, save_tl: bool
) -> None:
"""Check if the timeline has changed.
Note: either a timeline or a state file must be provided for this service to work.
\b
Check:
* `OK`: The timeline is the same as last time (`--state-file`) or the provided timeline (`--timeline`)
* `CRITICAL`: The timeline is not the same.
\b
Perfdata:
* `is_timeline_changed` is 1 if the timeline has changed, 0 otherwise
* the timeline
"""
if (timeline is None and state_file is None) or (
timeline is not None and state_file is not None
):
raise click.UsageError(
"Either --timeline or --state-file should be provided for this service", ctx
)
old_timeline = timeline
if state_file is not None:
cookie = nagiosplugin.Cookie(state_file)
cookie.open()
old_timeline = cookie.get("timeline")
cookie.close()
check = nagiosplugin.Check()
check.add(
NodeTLHasChanged(ctx.obj.connection_info, old_timeline, state_file, save_tl),
nagiosplugin.ScalarContext("is_timeline_changed", None, "@1:1"),
nagiosplugin.ScalarContext("timeline"),
NodeTLHasChangedSummary(old_timeline),
)
check.main(verbose=ctx.obj.verbose, timeout=ctx.obj.timeout)
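# A usage sketch (hypothetical path and timeline): either compare against a
# fixed timeline number, or keep the last seen timeline in a state file with
# --save and compare against it on later runs:
#
#   check_patroni -e https://10.20.199.3:8008 node_tl_has_changed --timeline 3
#   check_patroni -e https://10.20.199.3:8008 node_tl_has_changed \
#       -s /tmp/check_patroni_tl.state --save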
@main.command(name="node_patroni_version")
@click.option(
"--patroni-version",
"patroni_version",
type=str,
help="Patroni version to compare to",
required=True,
)
@click.pass_context
@nagiosplugin.guarded
def node_patroni_version(ctx: click.Context, patroni_version: str) -> None:
"""Check if the version is equal to the input
\b
Check:
* `OK`: The version is the same as the input `--patroni-version`
* `CRITICAL`: otherwise.
\b
Perfdata:
* `is_version_ok` is 1 if version is ok, 0 otherwise
"""
# TODO the version cannot be written in perfdata; find something else?
check = nagiosplugin.Check()
check.add(
NodePatroniVersion(ctx.obj.connection_info, patroni_version),
nagiosplugin.ScalarContext("is_version_ok", None, "@0:0"),
nagiosplugin.ScalarContext("patroni_version"),
NodePatroniVersionSummary(patroni_version),
)
check.main(verbose=ctx.obj.verbose, timeout=ctx.obj.timeout)
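# Usage sketch (hypothetical endpoint and version string):
#
#   check_patroni -e https://10.20.199.3:8008 node_patroni_version --patroni-version 3.0.4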
@main.command(name="node_is_alive")
@click.pass_context
@nagiosplugin.guarded
def node_is_alive(ctx: click.Context) -> None:
"""Check if the node is alive ie patroni is running. This is
a liveness check as defined in Patroni's documentation.
\b
Check:
* `OK`: If patroni is running.
* `CRITICAL`: otherwise.
\b
Perfdata:
* `is_running` is 1 if patroni is running, 0 otherwise
"""
check = nagiosplugin.Check()
check.add(
NodeIsAlive(ctx.obj.connection_info),
nagiosplugin.ScalarContext("is_alive", None, "@0:0"),
NodeIsAliveSummary(),
)
check.main(verbose=ctx.obj.verbose, timeout=ctx.obj.timeout)