import logging
import re
from configparser import ConfigParser
from typing import List

import click
import nagiosplugin

from . import __version__, _log
from .cluster import (
    ClusterConfigHasChanged,
    ClusterConfigHasChangedSummary,
    ClusterHasLeader,
    ClusterHasLeaderSummary,
    ClusterHasReplica,
    ClusterHasScheduledAction,
    ClusterIsInMaintenance,
    ClusterNodeCount,
)
from .convert import size_to_byte
from .node import (
    NodeIsAlive,
    NodeIsAliveSummary,
    NodeIsPendingRestart,
    NodeIsPendingRestartSummary,
    NodeIsPrimary,
    NodeIsPrimarySummary,
    NodeIsReplica,
    NodeIsReplicaSummary,
    NodePatroniVersion,
    NodePatroniVersionSummary,
    NodeTLHasChanged,
    NodeTLHasChangedSummary,
)
from .types import ConnectionInfo, Parameters

DEFAULT_CFG = "config.ini"
handler = logging.StreamHandler()
handler.setFormatter(logging.Formatter("%(levelname)s - %(message)s"))
_log.addHandler(handler)


def print_version(ctx: click.Context, param: str, value: str) -> None:
    if not value or ctx.resilient_parsing:
        return
    click.echo(f"Version {__version__}")
    ctx.exit()


def configure(ctx: click.Context, param: str, filename: str) -> None:
    """Use a config file for the parameters

    stolen from https://jwodder.github.io/kbits/posts/click-config/
    """
    # FIXME should use click-configfile / click-config-file ?
    cfg = ConfigParser()
    cfg.read(filename)
    ctx.default_map = {}
    for sect in cfg.sections():
        command_path = sect.split(".")
        if command_path[0] != "options":
            continue
        defaults = ctx.default_map
        for cmdname in command_path[1:]:
            defaults = defaults.setdefault(cmdname, {})
        defaults.update(cfg[sect])
        try:
            # endpoints is an array of addresses separated by ","
            if isinstance(defaults["endpoints"], str):
                defaults["endpoints"] = re.split(r"\s*,\s*", defaults["endpoints"])
        except KeyError:
            pass
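

# A minimal sketch of an INI file this callback would accept. The values below
# are illustrative assumptions, not taken from the project documentation: a
# comma separated "endpoints" string is split into a list, sections whose first
# path element is not "options" are ignored, and "options.<command>" sections
# provide per-command defaults.
#
#   [options]
#   endpoints = https://10.20.199.3:8008, https://10.20.199.4:8008
#   cert_file = ./ssl/client-cert.pem
#   key_file = ./ssl/client-key.pem
#   ca_file = ./ssl/ca-cert.pem
#   timeout = 10
#
#   [options.cluster_has_replica]
#   max_lag = 1MB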


@click.group()
@click.option(
    "--config",
    type=click.Path(dir_okay=False),
    default=DEFAULT_CFG,
    callback=configure,
    is_eager=True,
    expose_value=False,
    help="Read option defaults from the specified INI file",
    show_default=True,
)
@click.option(
    "-e",
    "--endpoints",
    "endpoints",
    type=str,
    multiple=True,
    default=["http://127.0.0.1:8008"],
    help=(
        "Patroni API endpoint. Can be specified multiple times or as a list "
        "of comma separated addresses. "
        "The node services check the status of one node, therefore if "
        "several addresses are specified they should point to different "
        "interfaces on the same node. The cluster services check the "
        "status of the cluster, therefore it's better to give a list of "
        "all Patroni node addresses."
    ),
    show_default=True,
)
@click.option(
    "--cert_file",
    "cert_file",
    type=click.Path(exists=True),
    default=None,
    help="File with the client certificate.",
)
@click.option(
    "--key_file",
    "key_file",
    type=click.Path(exists=True),
    default=None,
    help="File with the client key.",
)
@click.option(
    "--ca_file",
    "ca_file",
    type=click.Path(exists=True),
    default=None,
    help="The CA certificate.",
)
@click.option(
    "-v",
    "--verbose",
    "verbose",
    count=True,
    default=0,
    help="Increase verbosity -v (info)/-vv (warning)/-vvv (debug)",
    show_default=False,
)
@click.option(
    "--version",
    is_flag=True,
    callback=print_version,
    expose_value=False,
    is_eager=True,
)
@click.option(
    "--timeout",
    "timeout",
    default=2,
    type=int,
    help="Timeout in seconds for the API queries (0 to disable)",
    show_default=True,
)
@click.pass_context
@nagiosplugin.guarded
def main(
    ctx: click.Context,
    endpoints: List[str],
    cert_file: str,
    key_file: str,
    ca_file: str,
    verbose: int,
    timeout: int,
) -> None:
    """Nagios plugin that uses Patroni's REST API to monitor a Patroni cluster."""
    # FIXME Not all "is/has" services have the same return code for ok. Check if it's ok.
    #
    # We use ctx.obj to pass parameters instead of ctx.parent.params because the
    # latter is typed as Optional[Context] and mypy complains with the following
    # error unless we test whether ctx.parent is None, which looked ugly:
    #
    #   error: Item "None" of "Optional[Context]" has an attribute "params" [union-attr]
    #
    # The config file allows endpoints to be specified as a comma separated list
    # of endpoints. To avoid confusion, we allow the same in command line parameters.
    tendpoints: List[str] = []
    for e in endpoints:
        tendpoints += re.split(r"\s*,\s*", e)
    endpoints = tendpoints

    if verbose == 3:
        logging.getLogger("urllib3").addHandler(handler)
        logging.getLogger("urllib3").setLevel(logging.DEBUG)
        _log.setLevel(logging.DEBUG)

    connection_info: ConnectionInfo
    if cert_file is None and key_file is None:
        connection_info = ConnectionInfo(endpoints, None, ca_file)
    else:
        connection_info = ConnectionInfo(endpoints, (cert_file, key_file), ca_file)

    ctx.obj = Parameters(
        connection_info,
        timeout,
        verbose,
    )
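

# A couple of illustrative invocations; the "check_patroni" executable name is
# an assumption about how this module's entry point is installed:
#
#   check_patroni -e https://10.20.199.3:8008 node_is_primary
#   check_patroni -e "https://10.20.199.3:8008,https://10.20.199.4:8008" cluster_has_leader
#
# Each sub-command below builds a nagiosplugin.Check and exits through
# check.main() with the usual Nagios return codes (0 OK, 1 WARNING,
# 2 CRITICAL, 3 UNKNOWN).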


@main.command(name="cluster_node_count")  # required otherwise _ are converted to -
@click.option(
    "-w",
    "--warning",
    "warning",
    type=str,
    help="Warning threshold for the number of nodes.",
)
@click.option(
    "-c",
    "--critical",
    "critical",
    type=str,
    help="Critical threshold for the number of nodes.",
)
@click.option(
    "--healthy-warning",
    "healthy_warning",
    type=str,
    help="Warning threshold for the number of healthy nodes (running + streaming).",
)
@click.option(
    "--healthy-critical",
    "healthy_critical",
    type=str,
    help="Critical threshold for the number of healthy nodes (running + streaming).",
)
@click.pass_context
@nagiosplugin.guarded
def cluster_node_count(
    ctx: click.Context,
    warning: str,
    critical: str,
    healthy_warning: str,
    healthy_critical: str,
) -> None:
    """Count the number of nodes in the cluster.

    \b
    The state refers to the state of PostgreSQL. Possible values are:
    * initializing new cluster, initdb failed
    * running custom bootstrap script, custom bootstrap failed
    * starting, start failed
    * restarting, restart failed
    * running, streaming (for a replica V3.0.4)
    * stopping, stopped, stop failed
    * creating replica
    * crashed

    \b
    The role refers to the role of the server in the cluster. Possible values
    are:
    * master or leader (V3.0.0+)
    * replica
    * demoted
    * promoted
    * uninitialized

    \b
    Check:
    * Compares the number of nodes against the normal and healthy (running + streaming) node warning and critical thresholds.
    * `OK`: If they are not provided.

    \b
    Perfdata:
    * `members`: the member count.
    * `healthy_members`: the running and streaming member count.
    * all the roles of the nodes in the cluster with their count (labels start with "role_").
    * all the states of the nodes in the cluster with their count (labels start with "state_").
    """
    check = nagiosplugin.Check()
    check.add(
        ClusterNodeCount(ctx.obj.connection_info),
        nagiosplugin.ScalarContext(
            "members",
            warning,
            critical,
        ),
        nagiosplugin.ScalarContext(
            "healthy_members",
            healthy_warning,
            healthy_critical,
        ),
        nagiosplugin.ScalarContext("member_roles"),
        nagiosplugin.ScalarContext("member_statuses"),
    )
    check.main(verbose=ctx.obj.verbose, timeout=ctx.obj.timeout)
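

# The -w/-c and --healthy-* values use the standard Nagios threshold syntax
# understood by nagiosplugin.ScalarContext; for example "2:" raises an alert
# when the count drops below 2, and "@2:2" when it is exactly 2. The ranges
# shown here are only examples.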


@main.command(name="cluster_has_leader")
@click.pass_context
@nagiosplugin.guarded
def cluster_has_leader(ctx: click.Context) -> None:
    """Check if the cluster has a leader.

    This check applies to any kind of leaders including standby leaders.

    \b
    Check:
    * `OK`: if there is a leader node.
    * `CRITICAL`: otherwise.

    Perfdata: `has_leader` is 1 if there is a leader node, 0 otherwise.
    """
    check = nagiosplugin.Check()
    check.add(
        ClusterHasLeader(ctx.obj.connection_info),
        nagiosplugin.ScalarContext("has_leader", None, "@0:0"),
        ClusterHasLeaderSummary(),
    )
    check.main(verbose=ctx.obj.verbose, timeout=ctx.obj.timeout)
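

# In nagiosplugin range syntax the "@" prefix inverts the match: with "@0:0"
# the service is CRITICAL when has_leader falls inside 0:0, i.e. when it is 0
# (no leader), and OK otherwise.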


@main.command(name="cluster_has_replica")
@click.option(
    "-w",
    "--warning",
    "warning",
    type=str,
    help="Warning threshold for the number of healthy replica nodes.",
)
@click.option(
    "-c",
    "--critical",
    "critical",
    type=str,
    help="Critical threshold for the number of healthy replica nodes.",
)
@click.option("--max-lag", "max_lag", type=str, help="Maximum allowed lag.")
@click.pass_context
@nagiosplugin.guarded
def cluster_has_replica(
    ctx: click.Context, warning: str, critical: str, max_lag: str
) -> None:
    """Check if the cluster has healthy replicas.

    \b
    A healthy replica:
    * is in running or streaming state (V3.0.4)
    * has a replica or sync_standby role
    * has a lag lower than or equal to max_lag

    \b
    Check:
    * `OK`: if the healthy_replica count and their lag are compatible with the replica count thresholds.
    * `WARNING` / `CRITICAL`: otherwise.

    \b
    Perfdata:
    * healthy_replica & unhealthy_replica count
    * the lag of each replica labelled with "member name"_lag
    """
    tmax_lag = size_to_byte(max_lag) if max_lag is not None else None
    check = nagiosplugin.Check()
    check.add(
        ClusterHasReplica(ctx.obj.connection_info, tmax_lag),
        nagiosplugin.ScalarContext(
            "healthy_replica",
            warning,
            critical,
        ),
        nagiosplugin.ScalarContext("unhealthy_replica"),
        nagiosplugin.ScalarContext("replica_lag"),
    )
    check.main(verbose=ctx.obj.verbose, timeout=ctx.obj.timeout)
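

# --max-lag goes through size_to_byte() before being compared to each
# replica's reported lag, so (assuming the converter accepts such suffixes) a
# human readable value like "1MB" or a plain byte count can be given:
#
#   check_patroni -e https://10.20.199.3:8008 cluster_has_replica -w 2: -c 1: --max-lag 1MB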


@main.command(name="cluster_config_has_changed")
@click.option("--hash", "config_hash", type=str, help="A hash to compare with.")
@click.option(
    "-s",
    "--state-file",
    "state_file",
    type=str,
    help="A state file to store the hash of the configuration.",
)
@click.option(
    "--save",
    "save_config",
    is_flag=True,
    default=False,
    help="Set the current configuration hash as the reference for future calls.",
)
@click.pass_context
@nagiosplugin.guarded
def cluster_config_has_changed(
    ctx: click.Context, config_hash: str, state_file: str, save_config: bool
) -> None:
    """Check if the hash of the configuration has changed.

    Note: either a hash or a state file must be provided for this service to work.

    \b
    Check:
    * `OK`: The hash didn't change.
    * `CRITICAL`: The hash of the configuration has changed compared to the input (`--hash`) or last time (`--state-file`).

    \b
    Perfdata:
    * `is_configuration_changed` is 1 if the configuration has changed.
    """
    # Note: the hash cannot go in the perfdata because it's not a number.
    if (config_hash is None and state_file is None) or (
        config_hash is not None and state_file is not None
    ):
        raise click.UsageError(
            "Either --hash or --state-file should be provided for this service", ctx
        )

    old_config_hash = config_hash
    if state_file is not None:
        cookie = nagiosplugin.Cookie(state_file)
        cookie.open()
        old_config_hash = cookie.get("hash")
        cookie.close()

    check = nagiosplugin.Check()
    check.add(
        ClusterConfigHasChanged(
            ctx.obj.connection_info, old_config_hash, state_file, save_config
        ),
        nagiosplugin.ScalarContext("is_configuration_changed", None, "@1:1"),
        ClusterConfigHasChangedSummary(old_config_hash),
    )
    check.main(verbose=ctx.obj.verbose, timeout=ctx.obj.timeout)
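

# Two mutually exclusive ways to call this service (illustrative commands):
#
#   # compare the running configuration against a known hash
#   check_patroni cluster_config_has_changed --hash 96b12d82571473d13e890b89
#
#   # compare against the hash stored during a previous run, then update it
#   check_patroni cluster_config_has_changed -s /tmp/config_hash.state --save
#
# With --state-file and no previous --save, cookie.get("hash") returns None,
# so there is no reference hash yet; running once with --save seeds the file.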


@main.command(name="cluster_is_in_maintenance")
@click.pass_context
@nagiosplugin.guarded
def cluster_is_in_maintenance(ctx: click.Context) -> None:
    """Check if the cluster is in maintenance mode or paused.

    \b
    Check:
    * `OK`: If the cluster is not in maintenance mode.
    * `CRITICAL`: otherwise.

    \b
    Perfdata:
    * `is_in_maintenance` is 1 if the cluster is in maintenance mode, 0 otherwise.
    """
    check = nagiosplugin.Check()
    check.add(
        ClusterIsInMaintenance(ctx.obj.connection_info),
        nagiosplugin.ScalarContext("is_in_maintenance", None, "0:0"),
    )
    check.main(verbose=ctx.obj.verbose, timeout=ctx.obj.timeout)


@main.command(name="cluster_has_scheduled_action")
@click.pass_context
@nagiosplugin.guarded
def cluster_has_scheduled_action(ctx: click.Context) -> None:
    """Check if the cluster has a scheduled action (switchover or restart).

    \b
    Check:
    * `OK`: If the cluster has no scheduled action.
    * `CRITICAL`: otherwise.

    \b
    Perfdata:
    * `has_scheduled_actions` is 1 if the cluster has scheduled actions.
    * `scheduled_switchover` is 1 if the cluster has a scheduled switchover.
    * `scheduled_restart` counts the number of scheduled restarts in the cluster.
    """
    check = nagiosplugin.Check()
    check.add(
        ClusterHasScheduledAction(ctx.obj.connection_info),
        nagiosplugin.ScalarContext("has_scheduled_actions", None, "0:0"),
        nagiosplugin.ScalarContext("scheduled_switchover"),
        nagiosplugin.ScalarContext("scheduled_restart"),
    )
    check.main(verbose=ctx.obj.verbose, timeout=ctx.obj.timeout)


@main.command(name="node_is_primary")
@click.pass_context
@nagiosplugin.guarded
def node_is_primary(ctx: click.Context) -> None:
    """Check if the node is the primary with the leader lock.

    \b
    Check:
    * `OK`: if the node is a primary with the leader lock.
    * `CRITICAL`: otherwise.

    Perfdata: `is_primary` is 1 if the node is a primary with the leader lock, 0 otherwise.
    """
    check = nagiosplugin.Check()
    check.add(
        NodeIsPrimary(ctx.obj.connection_info),
        nagiosplugin.ScalarContext("is_primary", None, "@0:0"),
        NodeIsPrimarySummary(),
    )
    check.main(verbose=ctx.obj.verbose, timeout=ctx.obj.timeout)


@main.command(name="node_is_replica")
@click.option("--max-lag", "max_lag", type=str, help="Maximum allowed lag.")
@click.option(
    "--is-sync",
    "check_is_sync",
    is_flag=True,
    default=False,
    help="Check if the replica is synchronous.",
)
@click.option(
    "--is-async",
    "check_is_async",
    is_flag=True,
    default=False,
    help="Check if the replica is asynchronous.",
)
@click.pass_context
@nagiosplugin.guarded
def node_is_replica(
    ctx: click.Context, max_lag: str, check_is_sync: bool, check_is_async: bool
) -> None:
    """Check if the node is a running replica with no noloadbalance tag.

    It is possible to check if the node is synchronous or asynchronous. If
    nothing is specified, any kind of replica is accepted.

    When checking for a synchronous replica, it's not possible to specify a lag.

    \b
    Check:
    * `OK`: if the node is a running replica with no noloadbalance tag and the lag is under the maximum threshold.
    * `CRITICAL`: otherwise.

    Perfdata: `is_replica` is 1 if the node is a running replica with no noloadbalance tag and the lag is under the maximum threshold, 0 otherwise.
    """
    if check_is_sync and max_lag is not None:
        raise click.UsageError(
            "--is-sync and --max-lag cannot be provided at the same time for this service",
            ctx,
        )

    if check_is_sync and check_is_async:
        raise click.UsageError(
            "--is-sync and --is-async cannot be provided at the same time for this service",
            ctx,
        )

    check = nagiosplugin.Check()
    check.add(
        NodeIsReplica(ctx.obj.connection_info, max_lag, check_is_sync, check_is_async),
        nagiosplugin.ScalarContext("is_replica", None, "@0:0"),
        NodeIsReplicaSummary(max_lag, check_is_sync, check_is_async),
    )
    check.main(verbose=ctx.obj.verbose, timeout=ctx.obj.timeout)
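

# --is-sync and --is-async narrow which kind of replica is accepted; they are
# mutually exclusive, and --is-sync also excludes --max-lag. Illustrative
# invocations:
#
#   check_patroni -e https://10.20.199.4:8008 node_is_replica --max-lag 1048576
#   check_patroni -e https://10.20.199.4:8008 node_is_replica --is-sync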


@main.command(name="node_is_pending_restart")
@click.pass_context
@nagiosplugin.guarded
def node_is_pending_restart(ctx: click.Context) -> None:
    """Check if the node is in pending restart state.

    This situation can arise if the configuration has been modified but
    requires a restart of PostgreSQL to take effect.

    \b
    Check:
    * `OK`: if the node has no pending restart tag.
    * `CRITICAL`: otherwise.

    Perfdata: `is_pending_restart` is 1 if the node has the pending restart tag, 0 otherwise.
    """
    check = nagiosplugin.Check()
    check.add(
        NodeIsPendingRestart(ctx.obj.connection_info),
        nagiosplugin.ScalarContext("is_pending_restart", None, "0:0"),
        NodeIsPendingRestartSummary(),
    )
    check.main(verbose=ctx.obj.verbose, timeout=ctx.obj.timeout)


@main.command(name="node_tl_has_changed")
@click.option(
    "--timeline", "timeline", type=str, help="A timeline number to compare with."
)
@click.option(
    "-s",
    "--state-file",
    "state_file",
    type=str,
    help="A state file to store the last timeline number.",
)
@click.option(
    "--save",
    "save_tl",
    is_flag=True,
    default=False,
    help="Set the current timeline number as the reference for future calls.",
)
@click.pass_context
@nagiosplugin.guarded
def node_tl_has_changed(
    ctx: click.Context, timeline: str, state_file: str, save_tl: bool
) -> None:
    """Check if the timeline has changed.

    Note: either a timeline or a state file must be provided for this service to work.

    \b
    Check:
    * `OK`: The timeline is the same as last time (`--state-file`) or the inputted timeline (`--timeline`).
    * `CRITICAL`: The timeline is not the same.

    \b
    Perfdata:
    * `is_timeline_changed` is 1 if the timeline has changed, 0 otherwise
    * the timeline
    """
    if (timeline is None and state_file is None) or (
        timeline is not None and state_file is not None
    ):
        raise click.UsageError(
            "Either --timeline or --state-file should be provided for this service", ctx
        )

    old_timeline = timeline
    if state_file is not None:
        cookie = nagiosplugin.Cookie(state_file)
        cookie.open()
        old_timeline = cookie.get("timeline")
        cookie.close()

    check = nagiosplugin.Check()
    check.add(
        NodeTLHasChanged(ctx.obj.connection_info, old_timeline, state_file, save_tl),
        nagiosplugin.ScalarContext("is_timeline_changed", None, "@1:1"),
        nagiosplugin.ScalarContext("timeline"),
        NodeTLHasChangedSummary(old_timeline),
    )
    check.main(verbose=ctx.obj.verbose, timeout=ctx.obj.timeout)
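

# Typical workflow with a state file (illustrative): run once with --save to
# record the current timeline, then monitor without --save; after a planned
# switchover or failover, run with --save again so later checks compare
# against the new timeline instead of staying CRITICAL:
#
#   check_patroni -e https://10.20.199.3:8008 node_tl_has_changed -s /tmp/tl.state --save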


@main.command(name="node_patroni_version")
@click.option(
    "--patroni-version",
    "patroni_version",
    type=str,
    help="Patroni version to compare to.",
    required=True,
)
@click.pass_context
@nagiosplugin.guarded
def node_patroni_version(ctx: click.Context, patroni_version: str) -> None:
    """Check if the version is equal to the input.

    \b
    Check:
    * `OK`: The version is the same as the input `--patroni-version`.
    * `CRITICAL`: otherwise.

    \b
    Perfdata:
    * `is_version_ok` is 1 if the version is ok, 0 otherwise.
    """
    # TODO the version cannot be written in perfdata, find something else?
    check = nagiosplugin.Check()
    check.add(
        NodePatroniVersion(ctx.obj.connection_info, patroni_version),
        nagiosplugin.ScalarContext("is_version_ok", None, "@0:0"),
        nagiosplugin.ScalarContext("patroni_version"),
        NodePatroniVersionSummary(patroni_version),
    )
    check.main(verbose=ctx.obj.verbose, timeout=ctx.obj.timeout)


@main.command(name="node_is_alive")
@click.pass_context
@nagiosplugin.guarded
def node_is_alive(ctx: click.Context) -> None:
    """Check if the node is alive, i.e. Patroni is running.

    This is a liveness check as defined in Patroni's documentation.

    \b
    Check:
    * `OK`: If Patroni is running.
    * `CRITICAL`: otherwise.

    \b
    Perfdata:
    * `is_alive` is 1 if Patroni is running, 0 otherwise.
    """
    check = nagiosplugin.Check()
    check.add(
        NodeIsAlive(ctx.obj.connection_info),
        nagiosplugin.ScalarContext("is_alive", None, "@0:0"),
        NodeIsAliveSummary(),
    )
    check.main(verbose=ctx.obj.verbose, timeout=ctx.obj.timeout)