def configure(ctx: click.Context, param: click.Parameter, filename: str) -> None:
    """Load option defaults from an INI file into ``ctx.default_map``.

    Used as the eager callback of the ``--config`` option.  Only sections
    named ``options`` or ``options.<command>[.<subcommand>...]`` are read;
    every other section is ignored.  Keys of ``[options]`` become defaults of
    the top-level group, keys of ``[options.foo]`` become defaults of the
    ``foo`` command, and so on.

    Adapted from https://jwodder.github.io/kbits/posts/click-config/

    Args:
        ctx: the click context whose ``default_map`` is (re)built.
        param: the option that triggered the callback (unused).
        filename: path of the INI file; ``ConfigParser.read`` silently
            skips a file that does not exist.
    """
    # FIXME should use click-configfile / click-config-file ?
    cfg = ConfigParser()
    cfg.read(filename)
    ctx.default_map = {}
    for sect in cfg.sections():
        command_path = sect.split(".")
        if command_path[0] != "options":
            continue

        # Walk (and create on the way) the nested dict matching the command path.
        defaults = ctx.default_map
        for cmdname in command_path[1:]:
            defaults = defaults.setdefault(cmdname, {})
        defaults.update(cfg[sect])

        # endpoints is a list of addresses separated by ","; empty items
        # (trailing or doubled commas) are dropped so that no request is
        # later attempted against an empty endpoint.
        endpoints = defaults.get("endpoints")
        if isinstance(endpoints, str):
            defaults["endpoints"] = [
                endpoint
                for endpoint in re.split(r"\s*,\s*", endpoints.strip())
                if endpoint
            ]
@main.command(name="cluster_node_count")  # required otherwise _ are converted to -
@click.option(
    "-w",
    "--warning",
    "warning",
    type=str,
    help="Warning threshold for the number of nodes.",
)
@click.option(
    "-c",
    "--critical",
    "critical",
    type=str,
    help="Critical threshold for the number of nodes.",
)
@click.option(
    "--running-warning",
    "running_warning",
    type=str,
    help="Warning threshold for the number of running nodes.",
)
@click.option(
    "--running-critical",
    "running_critical",
    type=str,
    help="Critical threshold for the number of running nodes.",
)
@click.pass_context
@nagiosplugin.guarded
def cluster_node_count(
    ctx: click.Context,
    warning: str,
    critical: str,
    running_warning: str,
    running_critical: str,
) -> None:
    """Count the number of nodes in the cluster.

    \b
    Check:
    * Compares the number of nodes against the normal and running node warning and critical thresholds.
    * `OK`: if they are not provided.

    \b
    Perfdata:
    * `members`: the member count.
    * all the roles of the nodes in the cluster with their number.
    """
    check = nagiosplugin.Check()
    check.add(
        ClusterNodeCount(ctx.obj),
        # Thresholds apply to the total member count ...
        nagiosplugin.ScalarContext(
            "members",
            warning,
            critical,
        ),
        # ... and to the number of members in the "running" state.
        nagiosplugin.ScalarContext(
            "state_running",
            running_warning,
            running_critical,
        ),
        # Threshold-less contexts: these metrics are perfdata only.
        nagiosplugin.ScalarContext("members_roles"),
        nagiosplugin.ScalarContext("members_statuses"),
    )
    # verbose and timeout are options of the parent group, not of this command.
    check.main(
        verbose=ctx.parent.params["verbose"], timeout=ctx.parent.params["timeout"]
    )
@main.command(name="cluster_config_has_changed")
@click.option("--hash", "config_hash", type=str, help="A hash to compare with.")
@click.option(
    "-s",
    "--state-file",
    "state_file",
    type=str,
    help="A state file to store the hash of the configuration.",
)
@click.pass_context
@nagiosplugin.guarded
def cluster_config_has_changed(
    ctx: click.Context, config_hash: str, state_file: str
) -> None:
    """Check if the hash of the configuration has changed.

    Note: either a hash or a state file must be provided for this service to work.

    \b
    Check:
    * `OK`: The hash didn't change
    * `CRITICAL`: The hash of the configuration has changed compared to the input (`--hash`) or last time (`--state-file`)

    \b
    Perfdata :
    * `is_configuration_changed` is 1 if the configuration has changed
    """
    # FIXME hash in perfdata ?
    # Exactly one of --hash / --state-file must be given: reject both the
    # "neither" and the "both" cases.
    if (config_hash is None) == (state_file is None):
        raise click.UsageError(
            "Either --hash or --state-file should be provided for this service", ctx
        )

    check = nagiosplugin.Check()
    check.add(
        ClusterConfigHasChanged(ctx.obj, config_hash, state_file),
        # "@1:1" is an inside-range threshold: critical when the metric equals 1.
        nagiosplugin.ScalarContext("is_configuration_changed", None, "@1:1"),
        ClusterConfigHasChangedSummary(),
    )
    # verbose and timeout are options of the parent group, not of this command.
    check.main(
        verbose=ctx.parent.params["verbose"], timeout=ctx.parent.params["timeout"]
    )
@main.command(name="node_is_pending_restart")
@click.pass_context
@nagiosplugin.guarded
def node_is_pending_restart(ctx: click.Context) -> None:
    """Check if the node is in pending restart state.

    This situation can arise if the configuration has been modified but
    requires a restart of PostgreSQL to take effect.

    \b
    Check:
    * `OK`: if the node has no pending restart tag.
    * `CRITICAL`: otherwise

    Perfdata: `is_pending_restart` is 1 if the node has pending restart tag, 0 otherwise.
    """
    check = nagiosplugin.Check()
    check.add(
        NodeIsPendingRestart(ctx.obj),
        # "@1:1" is an inside-range threshold: critical when the metric
        # equals 1, i.e. when a restart IS pending.
        nagiosplugin.ScalarContext("is_pending_restart", None, "@1:1"),
        NodeIsPendingRestartSummary(),
    )
    # verbose and timeout are options of the parent group, not of this command.
    check.main(
        verbose=ctx.parent.params["verbose"], timeout=ctx.parent.params["timeout"]
    )
@main.command(name="node_patroni_version")
@click.option(
    "--patroni-version",
    "patroni_version",
    type=str,
    help="Patroni version to compare to",
    required=True,
)
@click.pass_context
@nagiosplugin.guarded
def node_patroni_version(ctx: click.Context, patroni_version: str) -> None:
    """Check if the version is equal to the input

    \b
    Check:
    * `OK`: The version is the same as the input `--patroni-version`
    * `CRITICAL`: otherwise.

    \b
    Perfdata :
    * `is_version_ok` is 1 if version is ok, 0 otherwise
    """
    # TODO the version cannot be written in perfdata find something else ?
    # Options declared on the parent group are read from its params.
    group_params = ctx.parent.params

    resource = NodePatroniVersion(ctx.obj, patroni_version)
    # Critical when is_version_ok equals 0 (inside-range threshold "@0:0").
    version_ok_context = nagiosplugin.ScalarContext("is_version_ok", None, "@0:0")
    version_context = nagiosplugin.ScalarContext("patroni_version")
    summary = NodePatroniVersionSummary(patroni_version)

    check = nagiosplugin.Check()
    check.add(resource, version_ok_context, version_context, summary)
    check.main(verbose=group_params["verbose"], timeout=group_params["timeout"])
def replace_chars(text: str) -> str:
    """Return *text* with apostrophes removed and spaces replaced by underscores."""
    # Single pass: None deletes the character, a string substitutes it.
    return text.translate({ord("'"): None, ord(" "): "_"})
class ClusterHasReplica(PatroniResource):
    """Resource counting the running replicas of the cluster and their lag."""

    def probe(self: "ClusterHasReplica") -> nagiosplugin.Metric:
        """Query the ``cluster`` endpoint and yield the replica metrics.

        This is a generator.  It yields first ``replica_count`` (number of
        members whose role is ``replica`` and state is ``running``), then one
        ``<member name>_lag`` metric per such member in the ``replica_lag``
        context.

        Every matching member is taken into account; the scan does not stop
        at the first replica found.
        """
        r = self.rest_api("cluster")
        # FIXME RC <> 200 ?
        _log.debug(f"api call status: {r.status}")
        _log.debug(f"api call data: {r.data}")

        item_dict = json.loads(r.data)
        # FIXME are there other acceptable states
        # FIXME which lag ?
        replicas = [
            {"name": member["name"], "lag": member["lag"]}
            for member in item_dict["members"]
            if member["role"] == "replica" and member["state"] == "running"
        ]

        # The actual check
        yield nagiosplugin.Metric("replica_count", len(replicas))

        # The performance data : replicas lag
        for replica in replicas:
            yield nagiosplugin.Metric(
                f"{replica['name']}_lag", replica["lag"], context="replica_lag"
            )
class ClusterConfigHasChanged(PatroniResource):
    """Resource detecting a change of the configuration returned by the API.

    The MD5 hex digest of the raw body of the ``config`` endpoint is compared
    either with a hash given by the caller or with the hash stored in a state
    file by the previous run.
    """

    def __init__(
        self: "ClusterConfigHasChanged",
        connection_info: ConnectionInfo,
        config_hash: str,
        state_file: str,
    ) -> None:
        """Store the reference hash and the state file path.

        Args:
            connection_info: how to reach the API.
            config_hash: reference hash, or None when a state file is used.
            state_file: path of the state file, or None when a hash is given.
        """
        super().__init__(connection_info)
        self.state_file = state_file
        self.config_hash = config_hash

    def probe(self: "ClusterConfigHasChanged") -> nagiosplugin.Metric:
        """Return ``is_configuration_changed`` (1 if the hash differs, else 0).

        When a state file is used, the new hash is always written back to it.
        """
        r = self.rest_api("config")
        # FIXME RC <> 200 ?
        _log.debug(f"api call status: {r.status}")
        _log.debug(f"api call data: {r.data}")

        # The digest is only a change-detection fingerprint, not a security feature.
        new_hash = hashlib.md5(r.data).hexdigest()

        if self.state_file is not None:
            _log.debug(f"Using state file / cookie {self.state_file}")
            # The context manager opens the cookie, commits it on a clean
            # exit and always closes the underlying file.
            with nagiosplugin.Cookie(self.state_file) as cookie:
                old_hash = cookie.get("hash")
                cookie["hash"] = new_hash
            if old_hash is None:
                # First run with this state file: there is no baseline yet,
                # so do not report a change.
                old_hash = new_hash
        else:
            _log.debug(f"Using input value {self.config_hash}")
            old_hash = self.config_hash

        _log.debug(f"hash info: old hash {old_hash}, new hash {new_hash}")

        return [
            nagiosplugin.Metric(
                "is_configuration_changed",
                1 if new_hash != old_hash else 0,
            )
        ]
class NodeIsReplica(PatroniResource):
    """Resource telling whether the ``replica`` endpoint answers 200."""

    def __init__(
        self: "NodeIsReplica", connection_info: ConnectionInfo, lag: str
    ) -> None:
        """Remember the optional maximum lag passed to the API."""
        super().__init__(connection_info)
        self.lag = lag

    def probe(self: "NodeIsReplica") -> nagiosplugin.Metric:
        """Return ``is_replica``: 1 when the API answers 200, 0 otherwise."""
        # The lag filter is only appended to the query when one was requested.
        service = "replica" if self.lag is None else f"replica?lag={self.lag}"
        r = self.rest_api(service)
        _log.debug(f"api call status: {r.status}")
        _log.debug(f"api call data: {r.data}")

        is_replica = int(r.status == 200)
        return [nagiosplugin.Metric("is_replica", is_replica)]
class NodeIsPendingRestart(PatroniResource):
    """Resource reading the ``pending_restart`` flag of the ``patroni`` endpoint."""

    def probe(self: "NodeIsPendingRestart") -> nagiosplugin.Metric:
        """Return ``is_pending_restart``: 1 when the flag is truthy, 0 otherwise.

        A missing ``pending_restart`` key counts as "not pending".
        """
        r = self.rest_api("patroni")
        # FIXME RC <> 200 ?
        _log.debug(f"api call status: {r.status}")
        _log.debug(f"api call data: {r.data}")

        payload = json.loads(r.data)
        flag = int(bool(payload.get("pending_restart", False)))
        return [nagiosplugin.Metric("is_pending_restart", flag)]
class NodeTLHasChangedSummary(nagiosplugin.Summary):
    """Status line for the timeline check.

    ``timeline`` is the value given on the command line.  It is None when the
    check compares against a state file, in which case the messages rely on
    the ``timeline`` metric of the results instead of printing "None".
    """

    def __init__(self: "NodeTLHasChangedSummary", timeline: str) -> None:
        self.timeline = timeline

    def ok(self: "NodeTLHasChangedSummary", results: nagiosplugin.Result) -> str:
        """Report the current timeline, taken from the ``timeline`` metric."""
        # On the ok path the current timeline equals the expected one, so the
        # metric is also correct when --timeline was given.
        return f"The timeline is still {results['timeline'].metric}."

    @handle_unknown
    def problem(self: "NodeTLHasChangedSummary", results: nagiosplugin.Result) -> str:
        """Report the timeline mismatch."""
        current = results["timeline"].metric
        if self.timeline is None:
            # State-file mode: the previous value is not known here.
            return f"The timeline has changed since the last check, it is now {current}."
        return f"The expected timeline was {self.timeline} got {current}."
class NodeIsAlive(PatroniResource):
    """Resource telling whether the ``liveness`` endpoint answers 200."""

    def probe(self: "NodeIsAlive") -> nagiosplugin.Metric:
        """Return ``is_alive``: 1 when the API answers 200, 0 otherwise."""
        response = self.rest_api("liveness")
        _log.debug(f"api call status: {response.status}")
        _log.debug(f"api call data: {response.data}")

        alive = int(response.status == 200)
        return [nagiosplugin.Metric("is_alive", alive)]
def handle_unknown(action: HandleUnknown) -> HandleUnknown:
    """Decorator handling the unknown state in ``Summary.problem``.

    When the first of the most significant results has state code 3, the
    wrapped method is not called and the hint of that result is returned
    instead.  Otherwise the call is forwarded unchanged.

    Args:
        action: the ``problem`` method to wrap.

    Returns:
        The wrapping function, carrying the name and docstring of *action*.
    """
    # Local import so the block does not depend on the module's import list.
    import functools

    @functools.wraps(action)
    def wrapper(summary: nagiosplugin.Summary, results: nagiosplugin.Result) -> Any:
        worst = results.most_significant[0]
        if worst.state.code == 3:
            # get the appropriate message for all unknown error
            return worst.hint
        return action(summary, results)

    return wrapper
b/config.ini new file mode 100644 index 0000000..a3066ad --- /dev/null +++ b/config.ini @@ -0,0 +1,9 @@ +[options] +endpoints = https://10.20.199.3:8008, https://10.20.199.4:8008,https://10.20.199.5:8008 +cert_file = ./ssl/benoit-dalibo-cert.pem +key_file = ./ssl/benoit-dalibo-key.pem +ca_file = ./ssl/CA-cert.pem +timeout = 0 + +[options.node_is_replica] +lag=100 diff --git a/mypy.ini b/mypy.ini new file mode 100644 index 0000000..07c0723 --- /dev/null +++ b/mypy.ini @@ -0,0 +1,5 @@ +[mypy] +# nagiosplugin => Skipping analyzing "nagiosplugin": found module but no type hints or library stubs [import] +ignore_missing_imports = true +show_error_codes = true +strict = true diff --git a/setup.py b/setup.py index 2c2f93a..556aef8 100644 --- a/setup.py +++ b/setup.py @@ -19,24 +19,29 @@ def get_version() -> str: setup( name="check_patroni", version=get_version(), -# author="Dalibo", -# author_email="contact@dalibo.com", + # author="Dalibo", + # author_email="contact@dalibo.com", packages=find_packages("."), include_package_data=True, -# url="https://github.com/dalibo/pg_activity", + # url="https://github.com/dalibo/pg_activity", license="PostgreSQL", description="Nagios plugin to check on patroni", long_description=long_description, long_description_content_type="text/markdown", -# classifiers=[ -# "Development Status :: 5 - Production/Stable", -# "Environment :: Console", -# "License :: OSI Approved :: PostgreSQL License", -# "Programming Language :: Python :: 3", -# "Topic :: Database", -# ], - keywords="patroni nagios cehck", + # classifiers=[ + # "Development Status :: 5 - Production/Stable", + # "Environment :: Console", + # "License :: OSI Approved :: PostgreSQL License", + # "Programming Language :: Python :: 3", + # "Topic :: Database", + # ], + keywords="patroni nagios check", python_requires=">=3.6", + install_requires=[ + "urllib3 >= 1.26.6", + "nagiosplugin >= 1.3.2", + "click >= 8.0.1", + ], extras_require={ "dev": [ "black", @@ -44,6 +49,10 @@ setup( 
"flake8", "mypy", ], + "test": [ + "pytest", + "pytest-mock", + ], }, entry_points={ "console_scripts": [ diff --git a/test/__pycache__/test.cpython-39-pytest-6.2.4.pyc b/test/__pycache__/test.cpython-39-pytest-6.2.4.pyc new file mode 100644 index 0000000..b819608 Binary files /dev/null and b/test/__pycache__/test.cpython-39-pytest-6.2.4.pyc differ diff --git a/test/__pycache__/test_cluster_config_has_changed.cpython-39-pytest-6.2.4.pyc b/test/__pycache__/test_cluster_config_has_changed.cpython-39-pytest-6.2.4.pyc new file mode 100644 index 0000000..05e68cd Binary files /dev/null and b/test/__pycache__/test_cluster_config_has_changed.cpython-39-pytest-6.2.4.pyc differ diff --git a/test/__pycache__/test_cluster_has_leader.cpython-39-pytest-6.2.4.pyc b/test/__pycache__/test_cluster_has_leader.cpython-39-pytest-6.2.4.pyc new file mode 100644 index 0000000..982c106 Binary files /dev/null and b/test/__pycache__/test_cluster_has_leader.cpython-39-pytest-6.2.4.pyc differ diff --git a/test/__pycache__/test_cluster_has_replica.cpython-39-pytest-6.2.4.pyc b/test/__pycache__/test_cluster_has_replica.cpython-39-pytest-6.2.4.pyc new file mode 100644 index 0000000..c8f6dc6 Binary files /dev/null and b/test/__pycache__/test_cluster_has_replica.cpython-39-pytest-6.2.4.pyc differ diff --git a/test/__pycache__/test_cluster_node_count.cpython-39-pytest-6.2.4.pyc b/test/__pycache__/test_cluster_node_count.cpython-39-pytest-6.2.4.pyc new file mode 100644 index 0000000..62fac79 Binary files /dev/null and b/test/__pycache__/test_cluster_node_count.cpython-39-pytest-6.2.4.pyc differ diff --git a/test/__pycache__/test_node_is_alive.cpython-39-pytest-6.2.4.pyc b/test/__pycache__/test_node_is_alive.cpython-39-pytest-6.2.4.pyc new file mode 100644 index 0000000..3862da3 Binary files /dev/null and b/test/__pycache__/test_node_is_alive.cpython-39-pytest-6.2.4.pyc differ diff --git a/test/__pycache__/test_node_is_pending_restart.cpython-39-pytest-6.2.4.pyc 
b/test/__pycache__/test_node_is_pending_restart.cpython-39-pytest-6.2.4.pyc new file mode 100644 index 0000000..093fd51 Binary files /dev/null and b/test/__pycache__/test_node_is_pending_restart.cpython-39-pytest-6.2.4.pyc differ diff --git a/test/__pycache__/test_node_is_primary.cpython-39-pytest-6.2.4.pyc b/test/__pycache__/test_node_is_primary.cpython-39-pytest-6.2.4.pyc new file mode 100644 index 0000000..494adb0 Binary files /dev/null and b/test/__pycache__/test_node_is_primary.cpython-39-pytest-6.2.4.pyc differ diff --git a/test/__pycache__/test_node_is_replica.cpython-39-pytest-6.2.4.pyc b/test/__pycache__/test_node_is_replica.cpython-39-pytest-6.2.4.pyc new file mode 100644 index 0000000..7b100c3 Binary files /dev/null and b/test/__pycache__/test_node_is_replica.cpython-39-pytest-6.2.4.pyc differ diff --git a/test/__pycache__/test_node_patroni_version.cpython-39-pytest-6.2.4.pyc b/test/__pycache__/test_node_patroni_version.cpython-39-pytest-6.2.4.pyc new file mode 100644 index 0000000..10dcd15 Binary files /dev/null and b/test/__pycache__/test_node_patroni_version.cpython-39-pytest-6.2.4.pyc differ diff --git a/test/__pycache__/test_node_tl_has_changed.cpython-39-pytest-6.2.4.pyc b/test/__pycache__/test_node_tl_has_changed.cpython-39-pytest-6.2.4.pyc new file mode 100644 index 0000000..4e4d51b Binary files /dev/null and b/test/__pycache__/test_node_tl_has_changed.cpython-39-pytest-6.2.4.pyc differ diff --git a/test/__pycache__/tools.cpython-39.pyc b/test/__pycache__/tools.cpython-39.pyc new file mode 100644 index 0000000..f949e20 Binary files /dev/null and b/test/__pycache__/tools.cpython-39.pyc differ diff --git a/test/json/cluster_config_has_changed.json b/test/json/cluster_config_has_changed.json new file mode 100644 index 0000000..b6c0015 --- /dev/null +++ b/test/json/cluster_config_has_changed.json @@ -0,0 +1,16 @@ +{ + "loop_wait": 10, + "master_start_timeout": 300, + "postgresql": { + "parameters": { + "archive_command": "pgbackrest --stanza=main 
archive-push %p", + "archive_mode": "on", + "max_connections": 300, + "restore_command": "pgbackrest --stanza=main archive-get %f \"%p\"" + }, + "use_pg_rewind": false, + "use_slot": true + }, + "retry_timeout": 10, + "ttl": 30 +} diff --git a/test/json/cluster_has_leader_ko.json b/test/json/cluster_has_leader_ko.json new file mode 100644 index 0000000..198fe14 --- /dev/null +++ b/test/json/cluster_has_leader_ko.json @@ -0,0 +1,33 @@ +{ + "members": [ + { + "name": "srv1", + "role": "replica", + "state": "running", + "api_url": "https://10.20.199.3:8008/patroni", + "host": "10.20.199.3", + "port": 5432, + "timeline": 51 + }, + { + "name": "srv2", + "role": "replica", + "state": "running", + "api_url": "https://10.20.199.4:8008/patroni", + "host": "10.20.199.4", + "port": 5432, + "timeline": 51, + "lag": 0 + }, + { + "name": "srv3", + "role": "replica", + "state": "running", + "api_url": "https://10.20.199.5:8008/patroni", + "host": "10.20.199.5", + "port": 5432, + "timeline": 51, + "lag": 0 + } + ] +} diff --git a/test/json/cluster_has_leader_ok.json b/test/json/cluster_has_leader_ok.json new file mode 100644 index 0000000..547d6c8 --- /dev/null +++ b/test/json/cluster_has_leader_ok.json @@ -0,0 +1,33 @@ +{ + "members": [ + { + "name": "srv1", + "role": "leader", + "state": "running", + "api_url": "https://10.20.199.3:8008/patroni", + "host": "10.20.199.3", + "port": 5432, + "timeline": 51 + }, + { + "name": "srv2", + "role": "replica", + "state": "running", + "api_url": "https://10.20.199.4:8008/patroni", + "host": "10.20.199.4", + "port": 5432, + "timeline": 51, + "lag": 0 + }, + { + "name": "srv3", + "role": "replica", + "state": "running", + "api_url": "https://10.20.199.5:8008/patroni", + "host": "10.20.199.5", + "port": 5432, + "timeline": 51, + "lag": 0 + } + ] +} diff --git a/test/json/cluster_has_replica_ok.json b/test/json/cluster_has_replica_ok.json new file mode 100644 index 0000000..547d6c8 --- /dev/null +++ b/test/json/cluster_has_replica_ok.json @@ 
-0,0 +1,33 @@ +{ + "members": [ + { + "name": "srv1", + "role": "leader", + "state": "running", + "api_url": "https://10.20.199.3:8008/patroni", + "host": "10.20.199.3", + "port": 5432, + "timeline": 51 + }, + { + "name": "srv2", + "role": "replica", + "state": "running", + "api_url": "https://10.20.199.4:8008/patroni", + "host": "10.20.199.4", + "port": 5432, + "timeline": 51, + "lag": 0 + }, + { + "name": "srv3", + "role": "replica", + "state": "running", + "api_url": "https://10.20.199.5:8008/patroni", + "host": "10.20.199.5", + "port": 5432, + "timeline": 51, + "lag": 0 + } + ] +} diff --git a/test/json/cluster_node_count.json b/test/json/cluster_node_count.json new file mode 100644 index 0000000..7c7b2e0 --- /dev/null +++ b/test/json/cluster_node_count.json @@ -0,0 +1,32 @@ +{ + "members": [ + { + "name": "srv1", + "role": "leader", + "state": "running", + "api_url": "https://10.20.199.3:8008/patroni", + "host": "10.20.199.3", + "port": 5432, + "timeline": 51 + }, + { + "name": "srv2", + "role": "replica", + "state": "start failed", + "api_url": "https://10.20.199.4:8008/patroni", + "host": "10.20.199.4", + "port": 5432, + "lag": "unknown" + }, + { + "name": "srv3", + "role": "replica", + "state": "running", + "api_url": "https://10.20.199.5:8008/patroni", + "host": "10.20.199.5", + "port": 5432, + "timeline": 51, + "lag": 0 + } + ] +} diff --git a/test/json/cluster_node_count_critical.json b/test/json/cluster_node_count_critical.json new file mode 100644 index 0000000..f35ccbd --- /dev/null +++ b/test/json/cluster_node_count_critical.json @@ -0,0 +1,13 @@ +{ + "members": [ + { + "name": "srv1", + "role": "leader", + "state": "running", + "api_url": "https://10.20.199.3:8008/patroni", + "host": "10.20.199.3", + "port": 5432, + "timeline": 51 + } + ] +} diff --git a/test/json/cluster_node_count_ok.json b/test/json/cluster_node_count_ok.json new file mode 100644 index 0000000..547d6c8 --- /dev/null +++ b/test/json/cluster_node_count_ok.json @@ -0,0 +1,33 @@ +{ + 
"members": [ + { + "name": "srv1", + "role": "leader", + "state": "running", + "api_url": "https://10.20.199.3:8008/patroni", + "host": "10.20.199.3", + "port": 5432, + "timeline": 51 + }, + { + "name": "srv2", + "role": "replica", + "state": "running", + "api_url": "https://10.20.199.4:8008/patroni", + "host": "10.20.199.4", + "port": 5432, + "timeline": 51, + "lag": 0 + }, + { + "name": "srv3", + "role": "replica", + "state": "running", + "api_url": "https://10.20.199.5:8008/patroni", + "host": "10.20.199.5", + "port": 5432, + "timeline": 51, + "lag": 0 + } + ] +} diff --git a/test/json/cluster_node_count_running_critical.json b/test/json/cluster_node_count_running_critical.json new file mode 100644 index 0000000..e6016fc --- /dev/null +++ b/test/json/cluster_node_count_running_critical.json @@ -0,0 +1,31 @@ +{ + "members": [ + { + "name": "srv1", + "role": "leader", + "state": "running", + "api_url": "https://10.20.199.3:8008/patroni", + "host": "10.20.199.3", + "port": 5432, + "timeline": 51 + }, + { + "name": "srv2", + "role": "replica", + "state": "start failed", + "api_url": "https://10.20.199.4:8008/patroni", + "host": "10.20.199.4", + "port": 5432, + "lag": "unknown" + }, + { + "name": "srv3", + "role": "replica", + "state": "start failed", + "api_url": "https://10.20.199.5:8008/patroni", + "host": "10.20.199.5", + "port": 5432, + "lag": "unknown" + } + ] +} diff --git a/test/json/cluster_node_count_running_warning.json b/test/json/cluster_node_count_running_warning.json new file mode 100644 index 0000000..a53124e --- /dev/null +++ b/test/json/cluster_node_count_running_warning.json @@ -0,0 +1,23 @@ +{ + "members": [ + { + "name": "srv1", + "role": "leader", + "state": "running", + "api_url": "https://10.20.199.3:8008/patroni", + "host": "10.20.199.3", + "port": 5432, + "timeline": 51 + }, + { + "name": "srv3", + "role": "replica", + "state": "running", + "api_url": "https://10.20.199.5:8008/patroni", + "host": "10.20.199.5", + "port": 5432, + "timeline": 
51, + "lag": 0 + } + ] +} diff --git a/test/json/cluster_node_count_warning.json b/test/json/cluster_node_count_warning.json new file mode 100644 index 0000000..11b7383 --- /dev/null +++ b/test/json/cluster_node_count_warning.json @@ -0,0 +1,23 @@ +{ + "members": [ + { + "name": "srv1", + "role": "leader", + "state": "running", + "api_url": "https://10.20.199.3:8008/patroni", + "host": "10.20.199.3", + "port": 5432, + "timeline": 51 + }, + { + "name": "srv2", + "role": "replica", + "state": "running", + "api_url": "https://10.20.199.4:8008/patroni", + "host": "10.20.199.4", + "port": 5432, + "timeline": 51, + "lag": 0 + } + ] +} diff --git a/test/json/node_is_alive.json b/test/json/node_is_alive.json new file mode 100644 index 0000000..b697269 --- /dev/null +++ b/test/json/node_is_alive.json @@ -0,0 +1,19 @@ +{ + "state": "running", + "postmaster_start_time": "2021-08-11 07:57:51.693 UTC", + "role": "replica", + "server_version": 110012, + "cluster_unlocked": false, + "xlog": { + "received_location": 1174407088, + "replayed_location": 1174407088, + "replayed_timestamp": null, + "paused": false + }, + "timeline": 58, + "database_system_identifier": "6965971025273547206", + "patroni": { + "version": "2.0.2", + "scope": "patroni-demo" + } +} diff --git a/test/json/node_is_pending_restart_ko.json b/test/json/node_is_pending_restart_ko.json new file mode 100644 index 0000000..ea4d396 --- /dev/null +++ b/test/json/node_is_pending_restart_ko.json @@ -0,0 +1,27 @@ +{ + "state": "running", + "postmaster_start_time": "2021-08-11 07:02:20.732 UTC", + "role": "master", + "server_version": 110012, + "cluster_unlocked": false, + "xlog": { + "location": 1174407088 + }, + "timeline": 58, + "replication": [ + { + "usename": "replicator", + "application_name": "srv1", + "client_addr": "10.20.199.3", + "state": "streaming", + "sync_state": "async", + "sync_priority": 0 + } + ], + "pending_restart": true, + "database_system_identifier": "6965971025273547206", + "patroni": { + 
"version": "2.0.2", + "scope": "patroni-demo" + } +} diff --git a/test/json/node_is_pending_restart_ok.json b/test/json/node_is_pending_restart_ok.json new file mode 100644 index 0000000..d47b18b --- /dev/null +++ b/test/json/node_is_pending_restart_ok.json @@ -0,0 +1,26 @@ +{ + "state": "running", + "postmaster_start_time": "2021-08-11 07:02:20.732 UTC", + "role": "master", + "server_version": 110012, + "cluster_unlocked": false, + "xlog": { + "location": 1174407088 + }, + "timeline": 58, + "replication": [ + { + "usename": "replicator", + "application_name": "srv1", + "client_addr": "10.20.199.3", + "state": "streaming", + "sync_state": "async", + "sync_priority": 0 + } + ], + "database_system_identifier": "6965971025273547206", + "patroni": { + "version": "2.0.2", + "scope": "patroni-demo" + } +} diff --git a/test/json/node_is_primary_ko.json b/test/json/node_is_primary_ko.json new file mode 100644 index 0000000..b697269 --- /dev/null +++ b/test/json/node_is_primary_ko.json @@ -0,0 +1,19 @@ +{ + "state": "running", + "postmaster_start_time": "2021-08-11 07:57:51.693 UTC", + "role": "replica", + "server_version": 110012, + "cluster_unlocked": false, + "xlog": { + "received_location": 1174407088, + "replayed_location": 1174407088, + "replayed_timestamp": null, + "paused": false + }, + "timeline": 58, + "database_system_identifier": "6965971025273547206", + "patroni": { + "version": "2.0.2", + "scope": "patroni-demo" + } +} diff --git a/test/json/node_is_primary_ok.json b/test/json/node_is_primary_ok.json new file mode 100644 index 0000000..d47b18b --- /dev/null +++ b/test/json/node_is_primary_ok.json @@ -0,0 +1,26 @@ +{ + "state": "running", + "postmaster_start_time": "2021-08-11 07:02:20.732 UTC", + "role": "master", + "server_version": 110012, + "cluster_unlocked": false, + "xlog": { + "location": 1174407088 + }, + "timeline": 58, + "replication": [ + { + "usename": "replicator", + "application_name": "srv1", + "client_addr": "10.20.199.3", + "state": 
"streaming", + "sync_state": "async", + "sync_priority": 0 + } + ], + "database_system_identifier": "6965971025273547206", + "patroni": { + "version": "2.0.2", + "scope": "patroni-demo" + } +} diff --git a/test/json/node_is_replica_ko.json b/test/json/node_is_replica_ko.json new file mode 100644 index 0000000..d47b18b --- /dev/null +++ b/test/json/node_is_replica_ko.json @@ -0,0 +1,26 @@ +{ + "state": "running", + "postmaster_start_time": "2021-08-11 07:02:20.732 UTC", + "role": "master", + "server_version": 110012, + "cluster_unlocked": false, + "xlog": { + "location": 1174407088 + }, + "timeline": 58, + "replication": [ + { + "usename": "replicator", + "application_name": "srv1", + "client_addr": "10.20.199.3", + "state": "streaming", + "sync_state": "async", + "sync_priority": 0 + } + ], + "database_system_identifier": "6965971025273547206", + "patroni": { + "version": "2.0.2", + "scope": "patroni-demo" + } +} diff --git a/test/json/node_is_replica_ok.json b/test/json/node_is_replica_ok.json new file mode 100644 index 0000000..b697269 --- /dev/null +++ b/test/json/node_is_replica_ok.json @@ -0,0 +1,19 @@ +{ + "state": "running", + "postmaster_start_time": "2021-08-11 07:57:51.693 UTC", + "role": "replica", + "server_version": 110012, + "cluster_unlocked": false, + "xlog": { + "received_location": 1174407088, + "replayed_location": 1174407088, + "replayed_timestamp": null, + "paused": false + }, + "timeline": 58, + "database_system_identifier": "6965971025273547206", + "patroni": { + "version": "2.0.2", + "scope": "patroni-demo" + } +} diff --git a/test/json/node_patroni_version.json b/test/json/node_patroni_version.json new file mode 100644 index 0000000..d47b18b --- /dev/null +++ b/test/json/node_patroni_version.json @@ -0,0 +1,26 @@ +{ + "state": "running", + "postmaster_start_time": "2021-08-11 07:02:20.732 UTC", + "role": "master", + "server_version": 110012, + "cluster_unlocked": false, + "xlog": { + "location": 1174407088 + }, + "timeline": 58, + 
"replication": [ + { + "usename": "replicator", + "application_name": "srv1", + "client_addr": "10.20.199.3", + "state": "streaming", + "sync_state": "async", + "sync_priority": 0 + } + ], + "database_system_identifier": "6965971025273547206", + "patroni": { + "version": "2.0.2", + "scope": "patroni-demo" + } +} diff --git a/test/json/node_tl_has_changed.json b/test/json/node_tl_has_changed.json new file mode 100644 index 0000000..d47b18b --- /dev/null +++ b/test/json/node_tl_has_changed.json @@ -0,0 +1,26 @@ +{ + "state": "running", + "postmaster_start_time": "2021-08-11 07:02:20.732 UTC", + "role": "master", + "server_version": 110012, + "cluster_unlocked": false, + "xlog": { + "location": 1174407088 + }, + "timeline": 58, + "replication": [ + { + "usename": "replicator", + "application_name": "srv1", + "client_addr": "10.20.199.3", + "state": "streaming", + "sync_state": "async", + "sync_priority": 0 + } + ], + "database_system_identifier": "6965971025273547206", + "patroni": { + "version": "2.0.2", + "scope": "patroni-demo" + } +} diff --git a/test/test_cluster_config_has_changed.py b/test/test_cluster_config_has_changed.py new file mode 100644 index 0000000..c0c62e2 --- /dev/null +++ b/test/test_cluster_config_has_changed.py @@ -0,0 +1,103 @@ +from click.testing import CliRunner +from pytest_mock import MockerFixture + +from check_patroni.cli import main +from tools import my_mock, here + + +def test_cluster_config_has_changed_params(mocker: MockerFixture) -> None: + runner = CliRunner() + + my_mock(mocker, "cluster_config_has_changed", 200) + result = runner.invoke( + main, + [ + "-e", + "https://10.20.199.3:8008", + "cluster_config_has_changed", + "--hash", + "640df9f0211c791723f18fc3ed9dbb95", + "--state-file", + str(here / "fake_file_name.state_file"), + ], + ) + assert result.exit_code == 3 + + result = runner.invoke( + main, ["-e", "https://10.20.199.3:8008", "cluster_config_has_changed"] + ) + assert result.exit_code == 3 + + +def 
test_cluster_config_has_changed_ok_with_hash(mocker: MockerFixture) -> None: + runner = CliRunner() + + my_mock(mocker, "cluster_config_has_changed", 200) + result = runner.invoke( + main, + [ + "-e", + "https://10.20.199.3:8008", + "cluster_config_has_changed", + "--hash", + "640df9f0211c791723f18fc3ed9dbb95", + ], + ) + assert result.exit_code == 0 + + +def test_cluster_config_has_changed_ok_with_state_file(mocker: MockerFixture) -> None: + runner = CliRunner() + + with open(here / "cluster_config_has_changed.state_file", "w") as f: + f.write('{"hash": "640df9f0211c791723f18fc3ed9dbb95"}') + + my_mock(mocker, "cluster_config_has_changed", 200) + result = runner.invoke( + main, + [ + "-e", + "https://10.20.199.3:8008", + "cluster_config_has_changed", + "--state-file", + str(here / "cluster_config_has_changed.state_file"), + ], + ) + assert result.exit_code == 0 + + +def test_cluster_config_has_changed_ko_with_hash(mocker: MockerFixture) -> None: + runner = CliRunner() + + my_mock(mocker, "cluster_config_has_changed", 200) + result = runner.invoke( + main, + [ + "-e", + "https://10.20.199.3:8008", + "cluster_config_has_changed", + "--hash", + "640df9f0211c791723f18fc3edffffff", + ], + ) + assert result.exit_code == 2 + + +def test_cluster_config_has_changed_ko_with_state_file(mocker: MockerFixture) -> None: + runner = CliRunner() + + with open(here / "cluster_config_has_changed.state_file", "w") as f: + f.write('{"hash": "640df9f0211c791723f18fc3edffffff"}') + + my_mock(mocker, "cluster_config_has_changed", 200) + result = runner.invoke( + main, + [ + "-e", + "https://10.20.199.3:8008", + "cluster_config_has_changed", + "--state-file", + str(here / "cluster_config_has_changed.state_file"), + ], + ) + assert result.exit_code == 2 diff --git a/test/test_cluster_has_leader.py b/test/test_cluster_has_leader.py new file mode 100644 index 0000000..cf6aa13 --- /dev/null +++ b/test/test_cluster_has_leader.py @@ -0,0 +1,29 @@ +from click.testing import CliRunner +from 
pytest_mock import MockerFixture + +from check_patroni.cli import main + +from tools import my_mock + + +def test_cluster_has_leader_ok(mocker: MockerFixture) -> None: + runner = CliRunner() + + my_mock(mocker, "cluster_has_leader_ok", 200) + result = runner.invoke( + main, ["-e", "https://10.20.199.3:8008", "cluster_has_leader"] + ) + assert result.exit_code == 0 + # FIXME Not captured ??? + # assert "CLUSTERHASLEADER OK - has_leader is 1 | has_leader=1;;@0" in result.output + + +def test_cluster_has_leader_ko(mocker: MockerFixture) -> None: + runner = CliRunner() + + my_mock(mocker, "cluster_has_leader_ko", 200) + result = runner.invoke( + main, ["-e", "https://10.20.199.3:8008", "cluster_has_leader"] + ) + assert result.exit_code == 2 + # assert "CLUSTERHASLEADER CRITICAL - has_leader is 0 (outside range @0:0) | has_leader=0;;@0" in result.output diff --git a/test/test_cluster_has_replica.py b/test/test_cluster_has_replica.py new file mode 100644 index 0000000..7d414a7 --- /dev/null +++ b/test/test_cluster_has_replica.py @@ -0,0 +1,36 @@ +from click.testing import CliRunner +from pytest_mock import MockerFixture + +from check_patroni.cli import main + +from tools import my_mock + + +# TODO Lag threshold tests +def test_cluster_has_relica_ok(mocker: MockerFixture) -> None: + runner = CliRunner() + + my_mock(mocker, "cluster_has_replica_ok", 200) + result = runner.invoke( + main, ["-e", "https://10.20.199.3:8008", "cluster_has_replica"] + ) + assert result.exit_code == 0 + + +def test_cluster_has_replica_ok_with_count_thresholds(mocker: MockerFixture) -> None: + runner = CliRunner() + + my_mock(mocker, "cluster_has_replica_ko", 200) + result = runner.invoke( + main, + [ + "-e", + "https://10.20.199.3:8008", + "cluster_has_replica", + "--warninng", + "@2", + "--critical", + "@0:1", + ], + ) + assert result.exit_code == 2 diff --git a/test/test_cluster_node_count.py b/test/test_cluster_node_count.py new file mode 100644 index 0000000..015e2f4 --- /dev/null +++ 
b/test/test_cluster_node_count.py @@ -0,0 +1,115 @@ +from click.testing import CliRunner +from pytest_mock import MockerFixture + +from check_patroni.cli import main + +from tools import my_mock + + +def test_cluster_node_count_ok(mocker: MockerFixture) -> None: + runner = CliRunner() + + my_mock(mocker, "cluster_node_count_ok", 200) + result = runner.invoke( + main, ["-e", "https://10.20.199.3:8008", "cluster_node_count"] + ) + assert result.exit_code == 0 + + +def test_cluster_node_count_ok_with_thresholds(mocker: MockerFixture) -> None: + runner = CliRunner() + + my_mock(mocker, "cluster_node_count_ok", 200) + result = runner.invoke( + main, + [ + "-e", + "https://10.20.199.3:8008", + "cluster_node_count", + "--warning", + "@0:1", + "--critical", + "@2", + "--running-warning", + "@2", + "--running-critical", + "@0:1", + ], + ) + assert result.exit_code == 0 + + +def test_cluster_node_count_running_warning(mocker: MockerFixture) -> None: + runner = CliRunner() + + my_mock(mocker, "cluster_node_count_running_warning", 200) + result = runner.invoke( + main, + [ + "-e", + "https://10.20.199.3:8008", + "cluster_node_count", + "--running-warning", + "@2", + "--running-critical", + "@0:1", + ], + ) + assert result.exit_code == 1 + + +def test_cluster_node_count_running_critical(mocker: MockerFixture) -> None: + runner = CliRunner() + + my_mock(mocker, "cluster_node_count_running_critical", 200) + result = runner.invoke( + main, + [ + "-e", + "https://10.20.199.3:8008", + "cluster_node_count", + "--running-warning", + "@2", + "--running-critical", + "@0:1", + ], + ) + assert result.exit_code == 2 + + +def test_cluster_node_count_warning(mocker: MockerFixture) -> None: + runner = CliRunner() + + my_mock(mocker, "cluster_node_count_warning", 200) + result = runner.invoke( + main, + [ + "-e", + "https://10.20.199.3:8008", + "cluster_node_count", + "--warning", + "@2", + "--critical", + "@0:1", + ], + ) + assert result.exit_code == 1 + + +def 
test_cluster_node_count_critical(mocker: MockerFixture) -> None: + runner = CliRunner() + + my_mock(mocker, "cluster_node_count_critical", 200) + result = runner.invoke( + main, + [ + "-e", + "https://10.20.199.3:8008", + "cluster_node_count", + "--warning", + "@2", + "--critical", + "@0:1", + ], + ) + assert result.exit_code == 2 diff --git a/test/test_node_is_alive.py b/test/test_node_is_alive.py new file mode 100644 index 0000000..6c74562 --- /dev/null +++ b/test/test_node_is_alive.py @@ -0,0 +1,22 @@ +from click.testing import CliRunner +from pytest_mock import MockerFixture + +from check_patroni.cli import main + +from tools import my_mock + + +def test_node_is_alive_ok(mocker: MockerFixture) -> None: + runner = CliRunner() + + my_mock(mocker, "node_is_alive", 200) + result = runner.invoke(main, ["-e", "https://10.20.199.3:8008", "node_is_alive"]) + assert result.exit_code == 0 + + +def test_node_is_alive_ko(mocker: MockerFixture) -> None: + runner = CliRunner() + + my_mock(mocker, "node_is_alive", 404) + result = runner.invoke(main, ["-e", "https://10.20.199.3:8008", "node_is_alive"]) + assert result.exit_code == 2 diff --git a/test/test_node_is_pending_restart.py b/test/test_node_is_pending_restart.py new file mode 100644 index 0000000..bb47a7a --- /dev/null +++ b/test/test_node_is_pending_restart.py @@ -0,0 +1,26 @@ +from click.testing import CliRunner +from pytest_mock import MockerFixture + +from check_patroni.cli import main + +from tools import my_mock + + +def test_node_is_pending_restart_ok(mocker: MockerFixture) -> None: + runner = CliRunner() + + my_mock(mocker, "node_is_pending_restart_ok", 200) + result = runner.invoke( + main, ["-e", "https://10.20.199.3:8008", "node_is_pending_restart"] + ) + assert result.exit_code == 0 + + +def test_node_is_pending_restart_ko(mocker: MockerFixture) -> None: + runner = CliRunner() + + my_mock(mocker, "node_is_pending_restart_ko", 404) + result = runner.invoke( + main, ["-e", "https://10.20.199.3:8008", 
"node_is_pending_restart"] + ) + assert result.exit_code == 2 diff --git a/test/test_node_is_primary.py b/test/test_node_is_primary.py new file mode 100644 index 0000000..c81fc29 --- /dev/null +++ b/test/test_node_is_primary.py @@ -0,0 +1,22 @@ +from click.testing import CliRunner +from pytest_mock import MockerFixture + +from check_patroni.cli import main + +from tools import my_mock + + +def test_node_is_primary_ok(mocker: MockerFixture) -> None: + runner = CliRunner() + + my_mock(mocker, "node_is_primary_ok", 200) + result = runner.invoke(main, ["-e", "https://10.20.199.3:8008", "node_is_primary"]) + assert result.exit_code == 0 + + +def test_node_is_primary_ko(mocker: MockerFixture) -> None: + runner = CliRunner() + + my_mock(mocker, "node_is_primary_ko", 404) + result = runner.invoke(main, ["-e", "https://10.20.199.3:8008", "node_is_primary"]) + assert result.exit_code == 2 diff --git a/test/test_node_is_replica.py b/test/test_node_is_replica.py new file mode 100644 index 0000000..e5f7254 --- /dev/null +++ b/test/test_node_is_replica.py @@ -0,0 +1,33 @@ +from click.testing import CliRunner +from pytest_mock import MockerFixture + +from check_patroni.cli import main + +from tools import my_mock + + +def test_node_is_replica_ok(mocker: MockerFixture) -> None: + runner = CliRunner() + + my_mock(mocker, "node_is_replica_ok", 200) + result = runner.invoke(main, ["-e", "https://10.20.199.3:8008", "node_is_replica"]) + assert result.exit_code == 0 + + +def test_node_is_replica_ko(mocker: MockerFixture) -> None: + runner = CliRunner() + + my_mock(mocker, "node_is_replica_ko", 404) + result = runner.invoke(main, ["-e", "https://10.20.199.3:8008", "node_is_replica"]) + assert result.exit_code == 2 + + +def test_node_is_replica_ko_lag(mocker: MockerFixture) -> None: + runner = CliRunner() + + # We don't do the check ourselves, patroni does it and changes the return code + my_mock(mocker, "node_is_replica_ok", 404) + result = runner.invoke( + main, ["-e", 
"https://10.20.199.3:8008", "node_is_replica", "--lag", "100"] + ) + assert result.exit_code == 2 diff --git a/test/test_node_patroni_version.py b/test/test_node_patroni_version.py new file mode 100644 index 0000000..7e62dc9 --- /dev/null +++ b/test/test_node_patroni_version.py @@ -0,0 +1,40 @@ +from click.testing import CliRunner +from pytest_mock import MockerFixture + +from check_patroni.cli import main + +from tools import my_mock + + +def test_node_patroni_version_ok(mocker: MockerFixture) -> None: + runner = CliRunner() + + my_mock(mocker, "node_patroni_version", 200) + result = runner.invoke( + main, + [ + "-e", + "https://10.20.199.3:8008", + "node_patroni_version", + "--patroni-version", + "2.0.2", + ], + ) + assert result.exit_code == 0 + + +def test_node_patroni_version_ko(mocker: MockerFixture) -> None: + runner = CliRunner() + + my_mock(mocker, "node_patroni_version", 200) + result = runner.invoke( + main, + [ + "-e", + "https://10.20.199.3:8008", + "node_patroni_version", + "--patroni-version", + "1.0.0", + ], + ) + assert result.exit_code == 2 diff --git a/test/test_node_tl_has_changed.py b/test/test_node_tl_has_changed.py new file mode 100644 index 0000000..e85fc68 --- /dev/null +++ b/test/test_node_tl_has_changed.py @@ -0,0 +1,104 @@ +from click.testing import CliRunner +from pytest_mock import MockerFixture + +from check_patroni.cli import main + +from tools import my_mock, here + + +def test_node_tl_has_changed_params(mocker: MockerFixture) -> None: + runner = CliRunner() + + my_mock(mocker, "node_tl_has_changed", 200) + result = runner.invoke( + main, + [ + "-e", + "https://10.20.199.3:8008", + "node_tl_has_changed", + "--timeline", + "58", + "--state-file", + str(here / "fake_file_name.state_file"), + ], + ) + assert result.exit_code == 3 + + result = runner.invoke( + main, ["-e", "https://10.20.199.3:8008", "node_tl_has_changed"] + ) + assert result.exit_code == 3 + + +def test_node_tl_has_changed_ok_with_timeline(mocker: MockerFixture) -> 
None: + runner = CliRunner() + + my_mock(mocker, "node_tl_has_changed", 200) + result = runner.invoke( + main, + [ + "-e", + "https://10.20.199.3:8008", + "node_tl_has_changed", + "--timeline", + "58", + ], + ) + assert result.exit_code == 0 + + +def test_node_tl_has_changed_ok_with_state_file(mocker: MockerFixture) -> None: + runner = CliRunner() + + with open(here / "node_tl_has_changed.state_file", "w") as f: + f.write('{"timeline": 58}') + + my_mock(mocker, "node_tl_has_changed", 200) + result = runner.invoke( + main, + [ + "-e", + "https://10.20.199.3:8008", + "node_tl_has_changed", + "--state-file", + str(here / "node_tl_has_changed.state_file"), + ], + ) + assert result.exit_code == 0 + + +def test_node_tl_has_changed_ko_with_timeline(mocker: MockerFixture) -> None: + runner = CliRunner() + + my_mock(mocker, "node_tl_has_changed", 200) + result = runner.invoke( + main, + [ + "-e", + "https://10.20.199.3:8008", + "node_tl_has_changed", + "--timeline", + "700", + ], + ) + assert result.exit_code == 2 + + +def test_node_tl_has_changed_ko_with_state_file(mocker: MockerFixture) -> None: + runner = CliRunner() + + with open(here / "node_tl_has_changed.state_file", "w") as f: + f.write('{"timeline": 700}') + + my_mock(mocker, "node_tl_has_changed", 200) + result = runner.invoke( + main, + [ + "-e", + "https://10.20.199.3:8008", + "node_tl_has_changed", + "--state-file", + str(here / "node_tl_has_changed.state_file"), + ], + ) + assert result.exit_code == 2 diff --git a/test/tools.py b/test/tools.py new file mode 100644 index 0000000..8bef76b --- /dev/null +++ b/test/tools.py @@ -0,0 +1,26 @@ +import attr +import pathlib +from pytest_mock import MockerFixture + +from check_patroni.types import PatroniResource + +here = pathlib.Path(__file__).parent + + +def getjson(name: str) -> bytes: + path = here / "json" / f"{name}.json" + with path.open() as f: + return f.read().encode("utf-8") + + +@attr.s(auto_attribs=True, frozen=True, slots=True) +class MockApiReturnCode: + 
data: bytes + status: int + + +def my_mock(mocker: MockerFixture, json_file: str, status: int) -> None: + def mock_rest_api(self: PatroniResource, service: str) -> MockApiReturnCode: + return MockApiReturnCode(getjson(json_file), status) + + mocker.patch("check_patroni.types.PatroniResource.rest_api", mock_rest_api)