From d4e974da5127b28402d4c34d8290ee35074ec472 Mon Sep 17 00:00:00 2001 From: benoit Date: Fri, 13 Aug 2021 11:00:43 +0200 Subject: [PATCH] --max-lag additions and fixups * add --max-lag to cluster_has_replica * change --lag to --max-lag in node_is_replica * update tests * update README.md --- README.md | 18 ++++---- check_patroni/cli.py | 45 ++++++++++---------- check_patroni/cluster.py | 28 +++++++++---- check_patroni/node.py | 8 ++-- test/test_cluster_has_replica.py | 71 ++++++++++++++++++++++++++++++-- test/test_node_is_replica.py | 2 +- 6 files changed, 125 insertions(+), 47 deletions(-) diff --git a/README.md b/README.md index 6a7eba9..f9b32b1 100644 --- a/README.md +++ b/README.md @@ -21,7 +21,7 @@ Options: Commands: cluster_config_has_changed Check if the hash of the configuration has... cluster_has_leader Check if the cluster has a leader. - cluster_has_replica Check if the cluster has replicas and their... + cluster_has_replica Check if the cluster has healthy replicates. cluster_is_in_maintenance Check if the cluster is in maintenance mode... cluster_node_count Count the number of nodes in the cluster. node_is_alive Check if the node is alive ie patroni is... @@ -78,21 +78,23 @@ Options: ``` Usage: check_patroni cluster_has_replica [OPTIONS] - Check if the cluster has replicas and their lag. + Check if the cluster has healthy replicates. + + A healthy replicate : * is in running state * has a replica role * has a lag + lower or equal to max_lag Check: - * `OK`: if the replica count and their lag are compatible with the replica count and lag thresholds. + * `OK`: if the healthy_replica count and their lag are compatible with the replica count threshold. * `WARNING` / `CRITICAL`: otherwise Perfdata : - * replica count + * healthy_replica & unhealthy_replica count * the lag of each replica labelled with "member name"_lag Options: -w, --warning TEXT Warning threshold for the number of nodes. -c, --critical TEXT Critical threshold for the number of replica nodes. - --lag-warning TEXT Warning threshold for the lag. - --lag-critical TEXT Critical threshold for the lag. + --max-lag TEXT maximum allowed lag --help Show this message and exit. ``` @@ -211,8 +213,8 @@ Usage: check_patroni node_is_replica [OPTIONS] noloadbalance tag and the lag is under the maximum threshold, 0 otherwise. Options: - --lag TEXT maximum allowed lag - --help Show this message and exit. + --max-lag TEXT maximum allowed lag + --help Show this message and exit. ``` ### node_patroni_version diff --git a/check_patroni/cli.py b/check_patroni/cli.py index 8b0807a..33fd13c 100644 --- a/check_patroni/cli.py +++ b/check_patroni/cli.py @@ -29,6 +29,7 @@ from .node import ( NodeTLHasChangedSummary, ) from .types import ConnectionInfo +from .convert import size_to_byte def print_version(ctx: click.Context, param: str, value: str) -> None: @@ -247,45 +248,41 @@ def cluster_has_leader(ctx: click.Context) -> None: type=str, help="Critical threshold for the number of replica nodes.", ) -@click.option( - "--lag-warning", "lag_warning", type=str, help="Warning threshold for the lag." -) -# FIWME how do we manage maximum_lag_on_failover without doing many api calls -@click.option( - "--lag-critical", "lag_critical", type=str, help="Critical threshold for the lag." -) +@click.option("--max-lag", "max_lag", type=str, help="maximum allowed lag") @click.pass_context @nagiosplugin.guarded def cluster_has_replica( - ctx: click.Context, warning: str, critical: str, lag_warning: str, lag_critical: str + ctx: click.Context, warning: str, critical: str, max_lag: str ) -> None: - """Check if the cluster has replicas and their lag. + """Check if the cluster has healthy replicates. + + A healthy replicate : + * is in running state + * has a replica role + * has a lag lower or equal to max_lag \b Check: - * `OK`: if the replica count and their lag are compatible with the replica count and lag thresholds. + * `OK`: if the healthy_replica count and their lag are compatible with the replica count threshold. * `WARNING` / `CRITICAL`: otherwise \b Perfdata : - * replica count + * healthy_replica & unhealthy_replica count * the lag of each replica labelled with "member name"_lag """ - # FIXME the idea here would be to make sur we have a replica. - # lag should be check to prune invalid replicas + + tmax_lag = size_to_byte(max_lag) if max_lag is not None else None check = nagiosplugin.Check() check.add( - ClusterHasReplica(ctx.obj), + ClusterHasReplica(ctx.obj, tmax_lag), nagiosplugin.ScalarContext( - "replica_count", + "healthy_replica", warning, critical, ), - nagiosplugin.ScalarContext( - "replica_lag", - lag_warning, - lag_critical, - ), + nagiosplugin.ScalarContext("unhealthy_replica"), + nagiosplugin.ScalarContext("replica_lag"), ) check.main( verbose=ctx.parent.params["verbose"], timeout=ctx.parent.params["timeout"] @@ -388,10 +385,10 @@ def node_is_primary(ctx: click.Context) -> None: @main.command(name="node_is_replica") -@click.option("--lag", "lag", type=str, help="maximum allowed lag") +@click.option("--max-lag", "max_lag", type=str, help="maximum allowed lag") @click.pass_context @nagiosplugin.guarded -def node_is_replica(ctx: click.Context, lag: str) -> None: +def node_is_replica(ctx: click.Context, max_lag: str) -> None: """Check if the node is a running replica with no noloadbalance tag. \b @@ -404,9 +401,9 @@ def node_is_replica(ctx: click.Context, lag: str) -> None: # FIXME add a lag check ?? check = nagiosplugin.Check() check.add( - NodeIsReplica(ctx.obj, lag), + NodeIsReplica(ctx.obj, max_lag), nagiosplugin.ScalarContext("is_replica", None, "@0:0"), - NodeIsReplicaSummary(lag), + NodeIsReplicaSummary(max_lag), ) check.main( verbose=ctx.parent.params["verbose"], timeout=ctx.parent.params["timeout"] diff --git a/check_patroni/cluster.py b/check_patroni/cluster.py index 54e29fb..a1ddf37 100644 --- a/check_patroni/cluster.py +++ b/check_patroni/cluster.py @@ -3,7 +3,7 @@ import hashlib import json import logging import nagiosplugin -from typing import Iterable +from typing import Iterable, Union from .types import PatroniResource, ConnectionInfo, handle_unknown @@ -81,6 +81,14 @@ class ClusterHasLeaderSummary(nagiosplugin.Summary): class ClusterHasReplica(PatroniResource): + def __init__( + self: "ClusterHasReplica", + connection_info: ConnectionInfo, + max_lag: Union[int, None], + ): + super().__init__(connection_info) + self.max_lag = max_lag + def probe(self: "ClusterHasReplica") -> Iterable[nagiosplugin.Metric]: r = self.rest_api("cluster") _log.debug(f"api call status: {r.status}") @@ -88,17 +96,23 @@ class ClusterHasReplica(PatroniResource): item_dict = json.loads(r.data) replicas = [] + healthy_replica = 0 + unhealthy_replica = 0 for member in item_dict["members"]: # FIXME are there other acceptable states - if member["role"] == "replica" and member["state"] == "running": - # FIXME which lag ? - replicas.append({"name": member["name"], "lag": member["lag"]}) - break + if member["role"] == "replica": + if member["state"] == "running" and member["lag"] != "unknown": + replicas.append({"name": member["name"], "lag": member["lag"]}) + if self.max_lag is None or self.max_lag >= int(member["lag"]): + healthy_replica += 1 + continue + unhealthy_replica += 1 # The actual check - yield nagiosplugin.Metric("replica_count", len(replicas)) + yield nagiosplugin.Metric("healthy_replica", healthy_replica) - # The performance data : replicas lag + # The performance data : unheakthy replica count, replicas lag + yield nagiosplugin.Metric("unhealthy_replica", unhealthy_replica) for replica in replicas: yield nagiosplugin.Metric( f"{replica['name']}_lag", replica["lag"], context="replica_lag" diff --git a/check_patroni/node.py b/check_patroni/node.py index c1d4942..de51612 100644 --- a/check_patroni/node.py +++ b/check_patroni/node.py @@ -29,16 +29,16 @@ class NodeIsPrimarySummary(nagiosplugin.Summary): class NodeIsReplica(PatroniResource): def __init__( - self: "NodeIsReplica", connection_info: ConnectionInfo, lag: str + self: "NodeIsReplica", connection_info: ConnectionInfo, max_lag: str ) -> None: super().__init__(connection_info) - self.lag = lag + self.max_lag = max_lag def probe(self: "NodeIsReplica") -> Iterable[nagiosplugin.Metric]: - if self.lag is None: + if self.max_lag is None: r = self.rest_api("replica") else: - r = self.rest_api(f"replica?lag={self.lag}") + r = self.rest_api(f"replica?lag={self.max_lag}") _log.debug(f"api call status: {r.status}") _log.debug(f"api call data: {r.data}") diff --git a/test/test_cluster_has_replica.py b/test/test_cluster_has_replica.py index 7d414a7..58afc56 100644 --- a/test/test_cluster_has_replica.py +++ b/test/test_cluster_has_replica.py @@ -20,6 +20,48 @@ def test_cluster_has_relica_ok(mocker: MockerFixture) -> None: def test_cluster_has_replica_ok_with_count_thresholds(mocker: MockerFixture) -> None: runner = CliRunner() + my_mock(mocker, "cluster_has_replica_ok", 200) + result = runner.invoke( + main, + [ + "-e", + "https://10.20.199.3:8008", + "cluster_has_replica", + "--warning", + "@1", + "--critical", + "@0", + ], + ) + assert result.exit_code == 0 + + +def test_cluster_has_replica_ok_with_count_thresholds_lag( + mocker: MockerFixture, +) -> None: + runner = CliRunner() + + my_mock(mocker, "cluster_has_replica_ok_lag", 200) + result = runner.invoke( + main, + [ + "-e", + "https://10.20.199.3:8008", + "cluster_has_replica", + "--warning", + "@1", + "--critical", + "@0", + "--max-lag", + "1MB", + ], + ) + assert result.exit_code == 0 + + +def test_cluster_has_replica_ko_with_count_thresholds(mocker: MockerFixture) -> None: + runner = CliRunner() + my_mock(mocker, "cluster_has_replica_ko", 200) result = runner.invoke( main, @@ -27,10 +69,33 @@ def test_cluster_has_replica_ok_with_count_thresholds(mocker: MockerFixture) -> "-e", "https://10.20.199.3:8008", "cluster_has_replica", - "--warninng", - "@2", + "--warning", + "@1", "--critical", - "@0:1", + "@0", + ], + ) + assert result.exit_code == 1 + + +def test_cluster_has_replica_ko_with_count_thresholds_and_lag( + mocker: MockerFixture, +) -> None: + runner = CliRunner() + + my_mock(mocker, "cluster_has_replica_ko_lag", 200) + result = runner.invoke( + main, + [ + "-e", + "https://10.20.199.3:8008", + "cluster_has_replica", + "--warning", + "@1", + "--critical", + "@0", + "--max-lag", + "1MB", ], ) assert result.exit_code == 2 diff --git a/test/test_node_is_replica.py b/test/test_node_is_replica.py index e5f7254..52758b9 100644 --- a/test/test_node_is_replica.py +++ b/test/test_node_is_replica.py @@ -28,6 +28,6 @@ def test_node_is_replica_ko_lag(mocker: MockerFixture) -> None: # We don't do the check ourselves, patroni does it and changes the return code my_mock(mocker, "node_is_replica_ok", 404) result = runner.invoke( - main, ["-e", "https://10.20.199.3:8008", "node_is_replica", "--lag", "100"] + main, ["-e", "https://10.20.199.3:8008", "node_is_replica", "--max-lag", "100"] ) assert result.exit_code == 2