# check-patroni/check_patroni/cli.py

import logging
import re
from configparser import ConfigParser
import click
import nagiosplugin
from typing import List
from . import __version__
from .cluster import (
ClusterConfigHasChanged,
ClusterConfigHasChangedSummary,
ClusterHasLeader,
ClusterHasLeaderSummary,
ClusterHasReplica,
ClusterNodeCount,
ClusterIsInMaintenance,
)
from .node import (
NodeIsAlive,
NodeIsAliveSummary,
NodeIsPendingRestart,
NodeIsPendingRestartSummary,
NodeIsPrimary,
NodeIsPrimarySummary,
NodeIsReplica,
NodeIsReplicaSummary,
NodePatroniVersion,
NodePatroniVersionSummary,
NodeTLHasChanged,
NodeTLHasChangedSummary,
)
from .types import ConnectionInfo, Parameters
from .convert import size_to_byte
_log = logging.getLogger(__name__)
DEFAULT_CFG = "config.ini"
def print_version(ctx: click.Context, param: click.Parameter, value: bool) -> None:
if not value or ctx.resilient_parsing:
return
click.echo(f"Version {__version__}")
ctx.exit()
def configure(ctx: click.Context, param: click.Parameter, filename: str) -> None:
"""Use a config file for the parameters
stolen from https://jwodder.github.io/kbits/posts/click-config/
"""
# FIXME should use click-configfile / click-config-file ?
cfg = ConfigParser()
cfg.read(filename)
ctx.default_map = {}
for sect in cfg.sections():
command_path = sect.split(".")
if command_path[0] != "options":
continue
defaults = ctx.default_map
for cmdname in command_path[1:]:
defaults = defaults.setdefault(cmdname, {})
defaults.update(cfg[sect])
try:
# endpoints is an array of addresses separated by ,
if isinstance(defaults["endpoints"], str):
defaults["endpoints"] = re.split(r"\s*,\s*", defaults["endpoints"])
except KeyError:
pass
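# A minimal sketch of the INI layout this callback understands (hypothetical
# file content, shown for illustration only):
#
#   [options]
#   endpoints = https://10.20.199.3:8008, https://10.20.199.4:8008
#   timeout = 5
#
#   [options.node_is_replica]
#   max_lag = 1MB
#
# The "[options]" section provides the group-level defaults; an
# "[options.<command>]" section provides the defaults of one sub-command.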
@click.group()
@click.option(
"--config",
type=click.Path(dir_okay=False),
default=DEFAULT_CFG,
callback=configure,
is_eager=True,
expose_value=False,
help="Read option defaults from the specified INI file",
show_default=True,
)
@click.option(
"-e",
"--endpoints",
"endpoints",
type=str,
multiple=True,
default=["http://127.0.0.1:8008"],
help=(
"Patroni API endpoint. Can be specified multiple times or as a list "
"of comma separated addresses. "
"The node services checks the status of one node, therefore if "
"several addresses are specified they should point to different "
"interfaces on the same node. The cluster services check the "
"status of the cluster, therefore it's better to give a list of "
"all Patroni node addresses."
),
show_default=True,
)
@click.option(
"--cert_file",
"cert_file",
type=click.Path(exists=True),
default=None,
help="File with the client certificate.",
)
@click.option(
"--key_file",
"key_file",
type=click.Path(exists=True),
default=None,
help="File with the client key.",
)
@click.option(
"--ca_file",
"ca_file",
type=click.Path(exists=True),
default=None,
help="The CA certificate.",
)
@click.option(
"-v",
"--verbose",
"verbose",
count=True,
default=0,
help="Increase verbosity -v (info)/-vv (warning)/-vvv (debug)",
show_default=False,
)
@click.option(
"--version", is_flag=True, callback=print_version, expose_value=False, is_eager=True
)
@click.option(
"--timeout",
"timeout",
default=2,
type=int,
help="Timeout in seconds for the API queries (0 to disable)",
show_default=True,
)
@click.pass_context
@nagiosplugin.guarded
def main(
ctx: click.Context,
endpoints: List[str],
cert_file: str,
key_file: str,
ca_file: str,
verbose: int,
timeout: int,
) -> None:
"""Nagios plugin that uses Patroni's REST API to monitor a Patroni cluster."""
    # FIXME Not all "is/has" services have the same return code for ok. Check if it's ok.
    # We use ctx.obj to pass parameters instead of ctx.parent.params because the
    # latter is typed as Optional[Context] and mypy complains with the following
    # error unless we test whether ctx.parent is None, which looked ugly:
    #
    # error: Item "None" of "Optional[Context]" has an attribute "params" [union-attr]
    #
    # The config file allows endpoints to be specified as a comma-separated list.
    # To avoid confusion, we allow the same in the command-line parameters.
tendpoints: List[str] = []
for e in endpoints:
tendpoints += re.split(r"\s*,\s*", e)
endpoints = tendpoints
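    # With multiple=True and the comma splitting above, the following calls are
    # equivalent (illustrative addresses):
    #   -e http://srv1:8008 -e http://srv2:8008
    #   -e http://srv1:8008,http://srv2:8008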
    if verbose >= 3:
logging.basicConfig(format="%(levelname)s - %(message)s", level=logging.DEBUG)
logging.getLogger("urllib3").setLevel(logging.DEBUG)
connection_info: ConnectionInfo
if cert_file is None and key_file is None:
connection_info = ConnectionInfo(endpoints, None, ca_file)
else:
connection_info = ConnectionInfo(endpoints, (cert_file, key_file), ca_file)
ctx.obj = Parameters(
connection_info,
timeout,
verbose,
)
@main.command(name="cluster_node_count") # required otherwise _ are converted to -
@click.option(
"-w",
"--warning",
"warning",
type=str,
help="Warning threshold for the number of nodes.",
)
@click.option(
"-c",
"--critical",
"critical",
type=str,
help="Critical threshold for the number of nodes.",
)
@click.option(
"--running-warning",
"running_warning",
type=str,
help="Warning threshold for the number of running nodes.",
)
@click.option(
"--running-critical",
"running_critical",
type=str,
help="Critical threshold for the number of running nodes.",
)
@click.pass_context
@nagiosplugin.guarded
def cluster_node_count(
ctx: click.Context,
warning: str,
critical: str,
running_warning: str,
running_critical: str,
) -> None:
"""Count the number of nodes in the cluster.
\b
The state refers to the state of PostgreSQL. Possible values are:
* initializing new cluster, initdb failed
* running custom bootstrap script, custom bootstrap failed
* starting, start failed
* restarting, restart failed
* running
* stopping, stopped, stop failed
* creating replica
* crashed
\b
The role refers to the role of the server in the cluster. Possible values
are:
* master or leader (V3.0.0+)
* replica
* demoted
* promoted
* uninitialized
\b
Check:
    * Compares the total node count and the running node count against their
      respective warning and critical thresholds.
    * `OK`: if no thresholds are provided.
    \b
    Perfdata:
    * `members`: the member count.
    * one counter per node role in the cluster (labels prefixed with "role_").
    * one counter per node state in the cluster (labels prefixed with "state_").
"""
check = nagiosplugin.Check()
check.add(
ClusterNodeCount(ctx.obj.connection_info),
nagiosplugin.ScalarContext(
"members",
warning,
critical,
),
nagiosplugin.ScalarContext(
"state_running",
running_warning,
running_critical,
),
nagiosplugin.ScalarContext("member_roles"),
nagiosplugin.ScalarContext("member_statuses"),
)
check.main(verbose=ctx.obj.verbose, timeout=ctx.obj.timeout)
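# Example invocation (hypothetical endpoint and thresholds), using standard
# Nagios range syntax ("3:" raises an alert when the value drops below 3):
#   check_patroni -e https://10.20.199.3:8008 cluster_node_count \
#       --warning 3: --critical 2: --running-warning 3: --running-critical 2: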
@main.command(name="cluster_has_leader")
@click.pass_context
@nagiosplugin.guarded
def cluster_has_leader(ctx: click.Context) -> None:
"""Check if the cluster has a leader.
Note: there is no difference between a normal and standby leader.
\b
Check:
* `OK`: if there is a leader node.
* `CRITICAL`: otherwise
Perfdata: `has_leader` is 1 if there is a leader node, 0 otherwise
"""
check = nagiosplugin.Check()
check.add(
ClusterHasLeader(ctx.obj.connection_info),
nagiosplugin.ScalarContext("has_leader", None, "@0:0"),
ClusterHasLeaderSummary(),
)
check.main(verbose=ctx.obj.verbose, timeout=ctx.obj.timeout)
@main.command(name="cluster_has_replica")
@click.option(
"-w",
"--warning",
"warning",
type=str,
help="Warning threshold for the number of healthy replica nodes.",
)
@click.option(
"-c",
"--critical",
"critical",
type=str,
help="Critical threshold for the number of healthy replica nodes.",
)
@click.option("--max-lag", "max_lag", type=str, help="maximum allowed lag")
@click.pass_context
@nagiosplugin.guarded
def cluster_has_replica(
ctx: click.Context, warning: str, critical: str, max_lag: str
) -> None:
"""Check if the cluster has healthy replicas.
\b
A healthy replica:
* is in running state
* has a replica role
* has a lag lower or equal to max_lag
\b
Check:
    * `OK`: if the healthy replica count is within the warning and critical thresholds.
* `WARNING` / `CRITICAL`: otherwise
\b
Perfdata:
* healthy_replica & unhealthy_replica count
* the lag of each replica labelled with "member name"_lag
"""
tmax_lag = size_to_byte(max_lag) if max_lag is not None else None
check = nagiosplugin.Check()
check.add(
ClusterHasReplica(ctx.obj.connection_info, tmax_lag),
nagiosplugin.ScalarContext(
"healthy_replica",
warning,
critical,
),
nagiosplugin.ScalarContext("unhealthy_replica"),
nagiosplugin.ScalarContext("replica_lag"),
)
check.main(verbose=ctx.obj.verbose, timeout=ctx.obj.timeout)
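# Example invocation (hypothetical values): warn when there are fewer than two
# healthy replicas, go critical when there are none; --max-lag is converted to
# bytes by size_to_byte:
#   check_patroni cluster_has_replica --warning 2: --critical 1: --max-lag 1MB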
@main.command(name="cluster_config_has_changed")
@click.option("--hash", "config_hash", type=str, help="A hash to compare with.")
@click.option(
"-s",
"--state-file",
"state_file",
type=str,
help="A state file to store the hash of the configuration.",
)
@click.option(
"--save",
"save_config",
is_flag=True,
default=False,
help="Set the current configuration hash as the reference for future calls.",
)
@click.pass_context
@nagiosplugin.guarded
def cluster_config_has_changed(
ctx: click.Context, config_hash: str, state_file: str, save_config: bool
) -> None:
"""Check if the hash of the configuration has changed.
Note: either a hash or a state file must be provided for this service to work.
\b
Check:
* `OK`: The hash didn't change
    * `CRITICAL`: The hash of the configuration has changed compared to the input (`--hash`) or to the last run (`--state-file`)
\b
Perfdata:
* `is_configuration_changed` is 1 if the configuration has changed
"""
    # Note: the hash cannot go in the perfdata because it's not a number
if (config_hash is None and state_file is None) or (
config_hash is not None and state_file is not None
):
raise click.UsageError(
"Either --hash or --state-file should be provided for this service", ctx
)
old_config_hash = config_hash
if state_file is not None:
cookie = nagiosplugin.Cookie(state_file)
cookie.open()
old_config_hash = cookie.get("hash")
cookie.close()
check = nagiosplugin.Check()
check.add(
ClusterConfigHasChanged(
ctx.obj.connection_info, old_config_hash, state_file, save_config
),
nagiosplugin.ScalarContext("is_configuration_changed", None, "@1:1"),
ClusterConfigHasChangedSummary(old_config_hash),
)
check.main(verbose=ctx.obj.verbose, timeout=ctx.obj.timeout)
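# Typical state-file workflow (illustrative path): the first call with --save
# records the current hash, later calls alert on any change:
#   check_patroni cluster_config_has_changed -s /tmp/config_hash.state --save
#   check_patroni cluster_config_has_changed -s /tmp/config_hash.state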
@main.command(name="cluster_is_in_maintenance")
@click.pass_context
@nagiosplugin.guarded
def cluster_is_in_maintenance(ctx: click.Context) -> None:
"""Check if the cluster is in maintenance mode or paused.
\b
Check:
    * `OK`: if the cluster is not in maintenance mode.
    * `CRITICAL`: if the cluster is in maintenance mode.
\b
Perfdata:
    * `is_in_maintenance` is 1 if the cluster is in maintenance mode, 0 otherwise
"""
check = nagiosplugin.Check()
check.add(
ClusterIsInMaintenance(ctx.obj.connection_info),
nagiosplugin.ScalarContext("is_in_maintenance", None, "0:0"),
)
check.main(verbose=ctx.obj.verbose, timeout=ctx.obj.timeout)
@main.command(name="node_is_primary")
@click.pass_context
@nagiosplugin.guarded
def node_is_primary(ctx: click.Context) -> None:
"""Check if the node is the primary with the leader lock.
\b
Check:
* `OK`: if the node is a primary with the leader lock.
    * `CRITICAL`: otherwise
Perfdata: `is_primary` is 1 if the node is a primary with the leader lock, 0 otherwise.
"""
check = nagiosplugin.Check()
check.add(
NodeIsPrimary(ctx.obj.connection_info),
nagiosplugin.ScalarContext("is_primary", None, "@0:0"),
NodeIsPrimarySummary(),
)
check.main(verbose=ctx.obj.verbose, timeout=ctx.obj.timeout)
@main.command(name="node_is_replica")
@click.option("--max-lag", "max_lag", type=str, help="maximum allowed lag")
@click.pass_context
@nagiosplugin.guarded
def node_is_replica(ctx: click.Context, max_lag: str) -> None:
"""Check if the node is a running replica with no noloadbalance tag.
\b
Check:
    * `OK`: if the node is a running replica with no noloadbalance tag and the lag is under the maximum threshold.
    * `CRITICAL`: otherwise
    Perfdata: `is_replica` is 1 if the node is a running replica with no noloadbalance tag and the lag is under the maximum threshold, 0 otherwise.
"""
# FIXME add a lag check ??
check = nagiosplugin.Check()
check.add(
NodeIsReplica(ctx.obj.connection_info, max_lag),
nagiosplugin.ScalarContext("is_replica", None, "@0:0"),
NodeIsReplicaSummary(max_lag),
)
check.main(verbose=ctx.obj.verbose, timeout=ctx.obj.timeout)
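# Example invocation (hypothetical lag value); max_lag is handed to the
# NodeIsReplica probe as-is:
#   check_patroni -e http://10.20.199.3:8008 node_is_replica --max-lag 1MB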
@main.command(name="node_is_pending_restart")
@click.pass_context
@nagiosplugin.guarded
def node_is_pending_restart(ctx: click.Context) -> None:
"""Check if the node is in pending restart state.
    This situation can arise if the configuration has been modified and
    requires a restart of PostgreSQL to take effect.
\b
Check:
* `OK`: if the node has no pending restart tag.
* `CRITICAL`: otherwise
    Perfdata: `is_pending_restart` is 1 if the node has the pending restart tag, 0 otherwise.
"""
check = nagiosplugin.Check()
check.add(
NodeIsPendingRestart(ctx.obj.connection_info),
nagiosplugin.ScalarContext("is_pending_restart", None, "0:0"),
NodeIsPendingRestartSummary(),
)
check.main(verbose=ctx.obj.verbose, timeout=ctx.obj.timeout)
@main.command(name="node_tl_has_changed")
@click.option(
"--timeline", "timeline", type=str, help="A timeline number to compare with."
)
@click.option(
"-s",
"--state-file",
"state_file",
type=str,
help="A state file to store the last tl number into.",
)
@click.option(
"--save",
"save_tl",
is_flag=True,
default=False,
help="Set the current timeline number as the reference for future calls.",
)
@click.pass_context
@nagiosplugin.guarded
def node_tl_has_changed(
ctx: click.Context, timeline: str, state_file: str, save_tl: bool
) -> None:
"""Check if the timeline has changed.
Note: either a timeline or a state file must be provided for this service to work.
\b
Check:
    * `OK`: The timeline is the same as last time (`--state-file`) or as the provided timeline (`--timeline`)
    * `CRITICAL`: otherwise.
\b
Perfdata:
    * `is_timeline_changed` is 1 if the timeline has changed, 0 otherwise
    * `timeline`: the current timeline number
"""
if (timeline is None and state_file is None) or (
timeline is not None and state_file is not None
):
raise click.UsageError(
"Either --timeline or --state-file should be provided for this service", ctx
)
old_timeline = timeline
if state_file is not None:
cookie = nagiosplugin.Cookie(state_file)
cookie.open()
old_timeline = cookie.get("timeline")
cookie.close()
check = nagiosplugin.Check()
check.add(
NodeTLHasChanged(ctx.obj.connection_info, old_timeline, state_file, save_tl),
nagiosplugin.ScalarContext("is_timeline_changed", None, "@1:1"),
nagiosplugin.ScalarContext("timeline"),
NodeTLHasChangedSummary(old_timeline),
)
check.main(verbose=ctx.obj.verbose, timeout=ctx.obj.timeout)
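# Example invocation (hypothetical timeline number), comparing against a known
# timeline directly instead of a state file:
#   check_patroni node_tl_has_changed --timeline 3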
@main.command(name="node_patroni_version")
@click.option(
"--patroni-version",
"patroni_version",
type=str,
help="Patroni version to compare to",
required=True,
)
@click.pass_context
@nagiosplugin.guarded
def node_patroni_version(ctx: click.Context, patroni_version: str) -> None:
"""Check if the version is equal to the input
\b
Check:
* `OK`: The version is the same as the input `--patroni-version`
* `CRITICAL`: otherwise.
\b
Perfdata:
* `is_version_ok` is 1 if version is ok, 0 otherwise
"""
    # TODO the version cannot go in the perfdata; find something else?
check = nagiosplugin.Check()
check.add(
NodePatroniVersion(ctx.obj.connection_info, patroni_version),
nagiosplugin.ScalarContext("is_version_ok", None, "@0:0"),
nagiosplugin.ScalarContext("patroni_version"),
NodePatroniVersionSummary(patroni_version),
)
check.main(verbose=ctx.obj.verbose, timeout=ctx.obj.timeout)
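# Example invocation (hypothetical version string):
#   check_patroni node_patroni_version --patroni-version 3.0.2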
@main.command(name="node_is_alive")
@click.pass_context
@nagiosplugin.guarded
def node_is_alive(ctx: click.Context) -> None:
"""Check if the node is alive ie patroni is running. This is
a liveness check as defined in Patroni's documentation.
\b
Check:
* `OK`: If patroni is running.
* `CRITICAL`: otherwise.
\b
Perfdata:
    * `is_alive` is 1 if Patroni is running, 0 otherwise
"""
check = nagiosplugin.Check()
check.add(
NodeIsAlive(ctx.obj.connection_info),
nagiosplugin.ScalarContext("is_alive", None, "@0:0"),
NodeIsAliveSummary(),
)
check.main(verbose=ctx.obj.verbose, timeout=ctx.obj.timeout)