# check-patroni/check_patroni/cli.py

import logging
import re
from configparser import ConfigParser
import click
import nagiosplugin
from typing import List
from . import __version__
from .cluster import (
ClusterConfigHasChanged,
ClusterConfigHasChangedSummary,
ClusterHasLeader,
ClusterHasLeaderSummary,
ClusterHasReplica,
ClusterNodeCount,
ClusterIsInMaintenance,
)
from .node import (
NodeIsAlive,
NodeIsAliveSummary,
NodeIsPendingRestart,
NodeIsPendingRestartSummary,
NodeIsPrimary,
NodeIsPrimarySummary,
NodeIsReplica,
NodeIsReplicaSummary,
NodePatroniVersion,
NodePatroniVersionSummary,
NodeTLHasChanged,
NodeTLHasChangedSummary,
)
from .types import ConnectionInfo, Parameters
from .convert import size_to_byte
_log = logging.getLogger(__name__)
DEFAULT_CFG = "config.ini"
def print_version(ctx: click.Context, param: click.Parameter, value: bool) -> None:
if not value or ctx.resilient_parsing:
return
click.echo(f"Version {__version__}")
ctx.exit()
def configure(ctx: click.Context, param: click.Parameter, filename: str) -> None:
"""Use a config file for the parameters
stolen from https://jwodder.github.io/kbits/posts/click-config/
"""
# FIXME should use click-configfile / click-config-file ?
cfg = ConfigParser()
cfg.read(filename)
ctx.default_map = {}
for sect in cfg.sections():
command_path = sect.split(".")
if command_path[0] != "options":
continue
defaults = ctx.default_map
for cmdname in command_path[1:]:
defaults = defaults.setdefault(cmdname, {})
defaults.update(cfg[sect])
try:
# endpoints is an array of addresses separated by ,
if isinstance(defaults["endpoints"], str):
defaults["endpoints"] = re.split(r"\s*,\s*", defaults["endpoints"])
except KeyError:
pass
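# A minimal sketch of the INI layout this callback understands (hypothetical
# file content, shown for illustration only):
#
#   [options]
#   endpoints = https://10.20.199.3:8008, https://10.20.199.4:8008
#   timeout = 5
#
#   [options.node_is_replica]
#   max_lag = 1MB
#
# The "[options]" section provides the group-level defaults; an
# "[options.<command>]" section provides the defaults of one sub-command.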
@click.group()
@click.option(
"--config",
type=click.Path(dir_okay=False),
default=DEFAULT_CFG,
callback=configure,
is_eager=True,
expose_value=False,
help="Read option defaults from the specified INI file",
show_default=True,
)
@click.option(
"-e",
"--endpoints",
"endpoints",
type=str,
multiple=True,
default=["http://127.0.0.1:8008"],
help=(
"Patroni API endpoint. Can be specified multiple times or as a list "
"of comma separated addresses. "
"The node services checks the status of one node, therefore if "
"several addresses are specified they should point to different "
"interfaces on the same node. The cluster services check the "
"status of the cluster, therefore it's better to give a list of "
"all Patroni node addresses."
),
show_default=True,
)
@click.option(
"--cert_file",
"cert_file",
type=click.Path(exists=True),
default=None,
help="File with the client certificate.",
)
@click.option(
"--key_file",
"key_file",
type=click.Path(exists=True),
default=None,
help="File with the client key.",
)
@click.option(
"--ca_file",
"ca_file",
type=click.Path(exists=True),
default=None,
help="The CA certificate.",
)
@click.option(
"-v",
"--verbose",
"verbose",
count=True,
default=0,
help="Increase verbosity -v (info)/-vv (warning)/-vvv (debug)",
show_default=False,
)
@click.option(
"--version", is_flag=True, callback=print_version, expose_value=False, is_eager=True
)
@click.option(
"--timeout",
"timeout",
default=2,
type=int,
help="Timeout in seconds for the API queries (0 to disable)",
show_default=True,
)
@click.pass_context
@nagiosplugin.guarded
def main(
ctx: click.Context,
endpoints: List[str],
cert_file: str,
key_file: str,
ca_file: str,
verbose: int,
timeout: int,
) -> None:
"""Nagios plugin that uses Patroni's REST API to monitor a Patroni cluster."""
    # FIXME Not all "is/has" services have the same return code for ok. Check if it's ok.
    # We use ctx.obj to pass parameters instead of ctx.parent.params because the
    # latter is typed as Optional[Context] and mypy complains with the following
    # error unless we test whether ctx.parent is None, which looked ugly:
    #
    # error: Item "None" of "Optional[Context]" has an attribute "params" [union-attr]
    #
    # The config file allows endpoints to be specified as a comma-separated list.
    # To avoid confusion, we allow the same in the command-line parameters.
tendpoints: List[str] = []
for e in endpoints:
tendpoints += re.split(r"\s*,\s*", e)
endpoints = tendpoints
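    # With multiple=True and the comma splitting above, the following calls are
    # equivalent (illustrative addresses):
    #   -e http://srv1:8008 -e http://srv2:8008
    #   -e http://srv1:8008,http://srv2:8008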
    if verbose >= 3:
logging.basicConfig(format="%(levelname)s - %(message)s", level=logging.DEBUG)
logging.getLogger("urllib3").setLevel(logging.DEBUG)
connection_info: ConnectionInfo
if cert_file is None and key_file is None:
connection_info = ConnectionInfo(endpoints, None, ca_file)
else:
connection_info = ConnectionInfo(endpoints, (cert_file, key_file), ca_file)
ctx.obj = Parameters(
connection_info,
timeout,
verbose,
)
@main.command(name="cluster_node_count") # required otherwise _ are converted to -
@click.option(
"-w",
"--warning",
"warning",
type=str,
help="Warning threshold for the number of nodes.",
)
@click.option(
"-c",
"--critical",
"critical",
type=str,
help="Critical threshold for the number of nodes.",
)
@click.option(
"--running-warning",
"running_warning",
type=str,
help="Warning threshold for the number of running nodes.",
)
@click.option(
"--running-critical",
"running_critical",
type=str,
help="Critical threshold for the number of running nodes.",
)
@click.pass_context
@nagiosplugin.guarded
def cluster_node_count(
ctx: click.Context,
warning: str,
critical: str,
running_warning: str,
running_critical: str,
) -> None:
"""Count the number of nodes in the cluster.
\b
The state refers to the state of PostgreSQL. Possible values are:
* initializing new cluster, initdb failed
* running custom bootstrap script, custom bootstrap failed
* starting, start failed
* restarting, restart failed
* running
* stopping, stopped, stop failed
* creating replica
* crashed
\b
The role refers to the role of the server in the cluster. Possible values
are:
* master or leader (V3.0.0+)
* replica
* demoted
* promoted
* uninitialized
\b
Check:
    * Compares the total node count and the running node count against their
      respective warning and critical thresholds.
    * `OK`: if no thresholds are provided.
    \b
    Perfdata:
    * `members`: the member count.
    * one counter per node role in the cluster (labels prefixed with "role_").
    * one counter per node state in the cluster (labels prefixed with "state_").
"""
check = nagiosplugin.Check()
check.add(
ClusterNodeCount(ctx.obj.connection_info),
nagiosplugin.ScalarContext(
"members",
warning,
critical,
),
nagiosplugin.ScalarContext(
"state_running",
running_warning,
running_critical,
),
nagiosplugin.ScalarContext("member_roles"),
nagiosplugin.ScalarContext("member_statuses"),
)
check.main(verbose=ctx.obj.verbose, timeout=ctx.obj.timeout)
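# Example invocation (hypothetical endpoint and thresholds), using standard
# Nagios range syntax ("3:" raises an alert when the value drops below 3):
#   check_patroni -e https://10.20.199.3:8008 cluster_node_count \
#       --warning 3: --critical 2: --running-warning 3: --running-critical 2: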
@main.command(name="cluster_has_leader")
@click.pass_context
@nagiosplugin.guarded
def cluster_has_leader(ctx: click.Context) -> None:
"""Check if the cluster has a leader.
Note: there is no difference between a normal and standby leader.
\b
Check:
* `OK`: if there is a leader node.
* `CRITICAL`: otherwise
Perfdata: `has_leader` is 1 if there is a leader node, 0 otherwise
"""
check = nagiosplugin.Check()
check.add(
ClusterHasLeader(ctx.obj.connection_info),
nagiosplugin.ScalarContext("has_leader", None, "@0:0"),
ClusterHasLeaderSummary(),
)
check.main(verbose=ctx.obj.verbose, timeout=ctx.obj.timeout)
@main.command(name="cluster_has_replica")
@click.option(
"-w",
"--warning",
"warning",
type=str,
help="Warning threshold for the number of healthy replica nodes.",
)
@click.option(
"-c",
"--critical",
"critical",
type=str,
help="Critical threshold for the number of healthy replica nodes.",
)
@click.option("--max-lag", "max_lag", type=str, help="maximum allowed lag")
@click.pass_context
@nagiosplugin.guarded
def cluster_has_replica(
ctx: click.Context, warning: str, critical: str, max_lag: str
) -> None:
"""Check if the cluster has healthy replicas.
\b
A healthy replica:
* is in running state
* has a replica role
* has a lag lower or equal to max_lag
\b
Check:
    * `OK`: if the healthy replica count is within the warning and critical thresholds.
* `WARNING` / `CRITICAL`: otherwise
\b
Perfdata:
* healthy_replica & unhealthy_replica count
* the lag of each replica labelled with "member name"_lag
"""
tmax_lag = size_to_byte(max_lag) if max_lag is not None else None
check = nagiosplugin.Check()
check.add(
ClusterHasReplica(ctx.obj.connection_info, tmax_lag),
nagiosplugin.ScalarContext(
"healthy_replica",
warning,
critical,
),
nagiosplugin.ScalarContext("unhealthy_replica"),
nagiosplugin.ScalarContext("replica_lag"),
)
check.main(verbose=ctx.obj.verbose, timeout=ctx.obj.timeout)
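# Example invocation (hypothetical values): warn when there are fewer than two
# healthy replicas, go critical when there are none; --max-lag is converted to
# bytes by size_to_byte:
#   check_patroni cluster_has_replica --warning 2: --critical 1: --max-lag 1MB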
@main.command(name="cluster_config_has_changed")
@click.option("--hash", "config_hash", type=str, help="A hash to compare with.")
@click.option(
"-s",
"--state-file",
"state_file",
type=str,
help="A state file to store the hash of the configuration.",
)
@click.option(
"--save",
"save_config",
is_flag=True,
default=False,
help="Set the current configuration hash as the reference for future calls.",
)
@click.pass_context
@nagiosplugin.guarded
def cluster_config_has_changed(
ctx: click.Context, config_hash: str, state_file: str, save_config: bool
) -> None:
"""Check if the hash of the configuration has changed.
Note: either a hash or a state file must be provided for this service to work.
\b
Check:
* `OK`: The hash didn't change
    * `CRITICAL`: The hash of the configuration has changed compared to the input (`--hash`) or to the last run (`--state-file`)
\b
Perfdata:
* `is_configuration_changed` is 1 if the configuration has changed
"""
    # Note: the hash cannot go in the perfdata because it's not a number
if (config_hash is None and state_file is None) or (
config_hash is not None and state_file is not None
):
raise click.UsageError(
"Either --hash or --state-file should be provided for this service", ctx
)
old_config_hash = config_hash
if state_file is not None:
cookie = nagiosplugin.Cookie(state_file)
cookie.open()
old_config_hash = cookie.get("hash")
cookie.close()
check = nagiosplugin.Check()
check.add(
ClusterConfigHasChanged(
ctx.obj.connection_info, old_config_hash, state_file, save_config
),
nagiosplugin.ScalarContext("is_configuration_changed", None, "@1:1"),
ClusterConfigHasChangedSummary(old_config_hash),
)
check.main(verbose=ctx.obj.verbose, timeout=ctx.obj.timeout)
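# Typical state-file workflow (illustrative path): the first call with --save
# records the current hash, later calls alert on any change:
#   check_patroni cluster_config_has_changed -s /tmp/config_hash.state --save
#   check_patroni cluster_config_has_changed -s /tmp/config_hash.state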
@main.command(name="cluster_is_in_maintenance")
@click.pass_context
@nagiosplugin.guarded
def cluster_is_in_maintenance(ctx: click.Context) -> None:
"""Check if the cluster is in maintenance mode or paused.
\b
Check:
    * `OK`: if the cluster is not in maintenance mode.
    * `CRITICAL`: if the cluster is in maintenance mode.
\b
Perfdata:
    * `is_in_maintenance` is 1 if the cluster is in maintenance mode, 0 otherwise
"""
check = nagiosplugin.Check()
check.add(
ClusterIsInMaintenance(ctx.obj.connection_info),
nagiosplugin.ScalarContext("is_in_maintenance", None, "0:0"),
)
check.main(verbose=ctx.obj.verbose, timeout=ctx.obj.timeout)
@main.command(name="node_is_primary")
@click.pass_context
@nagiosplugin.guarded
def node_is_primary(ctx: click.Context) -> None:
"""Check if the node is the primary with the leader lock.
\b
Check:
* `OK`: if the node is a primary with the leader lock.
    * `CRITICAL`: otherwise
Perfdata: `is_primary` is 1 if the node is a primary with the leader lock, 0 otherwise.
"""
check = nagiosplugin.Check()
check.add(
NodeIsPrimary(ctx.obj.connection_info),
nagiosplugin.ScalarContext("is_primary", None, "@0:0"),
NodeIsPrimarySummary(),
)
check.main(verbose=ctx.obj.verbose, timeout=ctx.obj.timeout)
@main.command(name="node_is_replica")
@click.option("--max-lag", "max_lag", type=str, help="maximum allowed lag")
@click.pass_context
@nagiosplugin.guarded
def node_is_replica(ctx: click.Context, max_lag: str) -> None:
"""Check if the node is a running replica with no noloadbalance tag.
\b
Check:
    * `OK`: if the node is a running replica with no noloadbalance tag and the lag is under the maximum threshold.
    * `CRITICAL`: otherwise
    Perfdata: `is_replica` is 1 if the node is a running replica with no noloadbalance tag and the lag is under the maximum threshold, 0 otherwise.
"""
# FIXME add a lag check ??
check = nagiosplugin.Check()
check.add(
NodeIsReplica(ctx.obj.connection_info, max_lag),
nagiosplugin.ScalarContext("is_replica", None, "@0:0"),
NodeIsReplicaSummary(max_lag),
)
check.main(verbose=ctx.obj.verbose, timeout=ctx.obj.timeout)
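# Example invocation (hypothetical lag value); max_lag is handed to the
# NodeIsReplica probe as-is:
#   check_patroni -e http://10.20.199.3:8008 node_is_replica --max-lag 1MB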
@main.command(name="node_is_pending_restart")
@click.pass_context
@nagiosplugin.guarded
def node_is_pending_restart(ctx: click.Context) -> None:
"""Check if the node is in pending restart state.
    This situation can arise if the configuration has been modified and
    requires a restart of PostgreSQL to take effect.
\b
Check:
* `OK`: if the node has no pending restart tag.
* `CRITICAL`: otherwise
    Perfdata: `is_pending_restart` is 1 if the node has the pending restart tag, 0 otherwise.
"""
check = nagiosplugin.Check()
check.add(
NodeIsPendingRestart(ctx.obj.connection_info),
nagiosplugin.ScalarContext("is_pending_restart", None, "0:0"),
NodeIsPendingRestartSummary(),
)
check.main(verbose=ctx.obj.verbose, timeout=ctx.obj.timeout)
@main.command(name="node_tl_has_changed")
@click.option(
"--timeline", "timeline", type=str, help="A timeline number to compare with."
)
@click.option(
"-s",
"--state-file",
"state_file",
type=str,
help="A state file to store the last tl number into.",
)
@click.option(
"--save",
"save_tl",
is_flag=True,
default=False,
help="Set the current timeline number as the reference for future calls.",
)
@click.pass_context
@nagiosplugin.guarded
def node_tl_has_changed(
ctx: click.Context, timeline: str, state_file: str, save_tl: bool
) -> None:
"""Check if the timeline has changed.
Note: either a timeline or a state file must be provided for this service to work.
\b
Check:
    * `OK`: The timeline is the same as last time (`--state-file`) or as the provided timeline (`--timeline`)
    * `CRITICAL`: otherwise.
\b
Perfdata:
    * `is_timeline_changed` is 1 if the timeline has changed, 0 otherwise
    * `timeline`: the current timeline number
"""
if (timeline is None and state_file is None) or (
timeline is not None and state_file is not None
):
raise click.UsageError(
"Either --timeline or --state-file should be provided for this service", ctx
)
old_timeline = timeline
if state_file is not None:
cookie = nagiosplugin.Cookie(state_file)
cookie.open()
old_timeline = cookie.get("timeline")
cookie.close()
check = nagiosplugin.Check()
check.add(
NodeTLHasChanged(ctx.obj.connection_info, old_timeline, state_file, save_tl),
nagiosplugin.ScalarContext("is_timeline_changed", None, "@1:1"),
nagiosplugin.ScalarContext("timeline"),
NodeTLHasChangedSummary(old_timeline),
)
check.main(verbose=ctx.obj.verbose, timeout=ctx.obj.timeout)
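# Example invocation (hypothetical timeline number), comparing against a known
# timeline directly instead of a state file:
#   check_patroni node_tl_has_changed --timeline 3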
@main.command(name="node_patroni_version")
@click.option(
"--patroni-version",
"patroni_version",
type=str,
help="Patroni version to compare to",
required=True,
)
@click.pass_context
@nagiosplugin.guarded
def node_patroni_version(ctx: click.Context, patroni_version: str) -> None:
"""Check if the version is equal to the input
\b
Check:
* `OK`: The version is the same as the input `--patroni-version`
* `CRITICAL`: otherwise.
\b
Perfdata:
* `is_version_ok` is 1 if version is ok, 0 otherwise
"""
    # TODO the version cannot go in the perfdata; find something else?
check = nagiosplugin.Check()
check.add(
NodePatroniVersion(ctx.obj.connection_info, patroni_version),
nagiosplugin.ScalarContext("is_version_ok", None, "@0:0"),
nagiosplugin.ScalarContext("patroni_version"),
NodePatroniVersionSummary(patroni_version),
)
check.main(verbose=ctx.obj.verbose, timeout=ctx.obj.timeout)
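# Example invocation (hypothetical version string):
#   check_patroni node_patroni_version --patroni-version 3.0.2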
@main.command(name="node_is_alive")
@click.pass_context
@nagiosplugin.guarded
def node_is_alive(ctx: click.Context) -> None:
"""Check if the node is alive ie patroni is running. This is
a liveness check as defined in Patroni's documentation.
\b
Check:
* `OK`: If patroni is running.
* `CRITICAL`: otherwise.
\b
Perfdata:
    * `is_alive` is 1 if Patroni is running, 0 otherwise
"""
check = nagiosplugin.Check()
check.add(
NodeIsAlive(ctx.obj.connection_info),
nagiosplugin.ScalarContext("is_alive", None, "@0:0"),
NodeIsAliveSummary(),
)
check.main(verbose=ctx.obj.verbose, timeout=ctx.obj.timeout)