From ee3837fab11218f9da30645ee1b58e41e3e46a38 Mon Sep 17 00:00:00 2001 From: benoit Date: Thu, 24 Aug 2023 15:43:35 +0200 Subject: [PATCH] Add info and options about sync standby to cluster_has_replica * Add `--sync-warning` and `--sync-critical` * Add `sync_replica` to track the number of sync replica in the perf data * Add `MEMBER-sync` to track if a member is a sync replica in the perf data --- CHANGELOG.md | 1 + README.md | 21 ++++++----- check_patroni/cli.py | 30 ++++++++++++++-- check_patroni/cluster.py | 17 ++++++++- tests/test_cluster_has_replica.py | 58 ++++++++++++++++++++++++++++--- 5 files changed, 111 insertions(+), 16 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 168ab98..4d6fdb1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,7 @@ ### Added * Add `sync_standby` as a valid replica type for `cluster_has_replica`. (contributed by @mattpoel) +* Add info and options (`--sync-warning` and `--sync-critical`) about sync replica to `cluster_has_replica`. * Add a new service `cluster_has_scheduled_action` to warn of any scheduled switchover or restart. * Add options to `node_is_replica` to check specifically for a synchronous (`--is-sync`) or asynchronous node (`--is-async`). * Add `standby-leader` as a valid leader type for `cluster_has_leader`. diff --git a/README.md b/README.md index a3fe9bf..2b42c31 100644 --- a/README.md +++ b/README.md @@ -37,7 +37,7 @@ Options: Commands: cluster_config_has_changed Check if the hash of the configuration... cluster_has_leader Check if the cluster has a leader. - cluster_has_replica Check if the cluster has healthy replicas. + cluster_has_replica Check if the cluster has healthy replicas... cluster_has_scheduled_action Check if the cluster has a scheduled... cluster_is_in_maintenance Check if the cluster is in maintenance... cluster_node_count Count the number of nodes in the cluster. 
@@ -223,7 +223,7 @@ Options: ``` Usage: check_patroni cluster_has_replica [OPTIONS] - Check if the cluster has healthy replicas. + Check if the cluster has healthy replicas and/or if some are sync standbies A healthy replica: * is in running or streaming state (V3.0.4) @@ -232,19 +232,24 @@ Usage: check_patroni cluster_has_replica [OPTIONS] Check: * `OK`: if the healthy_replica count and their lag are compatible with the replica count threshold. + and if the sync_replica count is compatible with the sync replica count threshold. * `WARNING` / `CRITICAL`: otherwise Perfdata: * healthy_replica & unhealthy_replica count + * the number of sync_replica, they are included in the previous count * the lag of each replica labelled with "member name"_lag + * a boolean to tell if the node is a sync standby labelled with "member name"_sync Options: - -w, --warning TEXT Warning threshold for the number of healthy replica - nodes. - -c, --critical TEXT Critical threshold for the number of healthy replica - nodes. - --max-lag TEXT maximum allowed lag - --help Show this message and exit. + -w, --warning TEXT Warning threshold for the number of healthy replica + nodes. + -c, --critical TEXT Critical threshold for the number of healthy replica + nodes. + --sync-warning TEXT Warning threshold for the number of sync replica. + --sync-critical TEXT Critical threshold for the number of sync replica. + --max-lag TEXT maximum allowed lag + --help Show this message and exit. 
``` ### cluster_has_scheduled_action diff --git a/check_patroni/cli.py b/check_patroni/cli.py index 227e654..d249219 100644 --- a/check_patroni/cli.py +++ b/check_patroni/cli.py @@ -316,13 +316,30 @@ def cluster_has_leader(ctx: click.Context) -> None: type=str, help="Critical threshold for the number of healthy replica nodes.", ) +@click.option( + "--sync-warning", + "sync_warning", + type=str, + help="Warning threshold for the number of sync replica.", +) +@click.option( + "--sync-critical", + "sync_critical", + type=str, + help="Critical threshold for the number of sync replica.", +) @click.option("--max-lag", "max_lag", type=str, help="maximum allowed lag") @click.pass_context @nagiosplugin.guarded def cluster_has_replica( - ctx: click.Context, warning: str, critical: str, max_lag: str + ctx: click.Context, + warning: str, + critical: str, + sync_warning: str, + sync_critical: str, + max_lag: str, ) -> None: - """Check if the cluster has healthy replicas. + """Check if the cluster has healthy replicas and/or if some are sync standbies \b A healthy replica: @@ -333,12 +350,15 @@ def cluster_has_replica( \b Check: * `OK`: if the healthy_replica count and their lag are compatible with the replica count threshold. + and if the sync_replica count is compatible with the sync replica count threshold. 
* `WARNING` / `CRITICAL`: otherwise \b Perfdata: * healthy_replica & unhealthy_replica count + * the number of sync_replica, they are included in the previous count * the lag of each replica labelled with "member name"_lag + * a boolean to tell if the node is a sync standby labelled with "member name"_sync """ tmax_lag = size_to_byte(max_lag) if max_lag is not None else None @@ -350,8 +370,14 @@ def cluster_has_replica( warning, critical, ), + nagiosplugin.ScalarContext( + "sync_replica", + sync_warning, + sync_critical, + ), nagiosplugin.ScalarContext("unhealthy_replica"), nagiosplugin.ScalarContext("replica_lag"), + nagiosplugin.ScalarContext("replica_sync"), ) check.main(verbose=ctx.obj.verbose, timeout=ctx.obj.timeout) diff --git a/check_patroni/cluster.py b/check_patroni/cluster.py index 7d66157..eed5325 100644 --- a/check_patroni/cluster.py +++ b/check_patroni/cluster.py @@ -92,6 +92,7 @@ class ClusterHasReplica(PatroniResource): replicas = [] healthy_replica = 0 unhealthy_replica = 0 + sync_replica = 0 for member in item_dict["members"]: # FIXME are there other acceptable states if member["role"] in ["replica", "sync_standby"]: @@ -100,7 +101,17 @@ class ClusterHasReplica(PatroniResource): member["state"] in ["running", "streaming"] and member["lag"] != "unknown" ): - replicas.append({"name": member["name"], "lag": member["lag"]}) + replicas.append( + { + "name": member["name"], + "lag": member["lag"], + "sync": 1 if member["role"] == "sync_standby" else 0, + } + ) + + if member["role"] == "sync_standby": + sync_replica += 1 + if self.max_lag is None or self.max_lag >= int(member["lag"]): healthy_replica += 1 continue @@ -108,6 +119,7 @@ class ClusterHasReplica(PatroniResource): # The actual check yield nagiosplugin.Metric("healthy_replica", healthy_replica) + yield nagiosplugin.Metric("sync_replica", sync_replica) # The performance data : unhealthy replica count, replicas lag yield nagiosplugin.Metric("unhealthy_replica", unhealthy_replica) @@ -115,6 +127,9 
@@ class ClusterHasReplica(PatroniResource): yield nagiosplugin.Metric( f"{replica['name']}_lag", replica["lag"], context="replica_lag" ) + yield nagiosplugin.Metric( + f"{replica['name']}_sync", replica["sync"], context="replica_sync" + ) # FIXME is this needed ?? diff --git a/tests/test_cluster_has_replica.py b/tests/test_cluster_has_replica.py index 7cd3ea1..d2a4e7b 100644 --- a/tests/test_cluster_has_replica.py +++ b/tests/test_cluster_has_replica.py @@ -19,7 +19,7 @@ def test_cluster_has_relica_ok( assert result.exit_code == 0 assert ( result.stdout - == "CLUSTERHASREPLICA OK - healthy_replica is 2 | healthy_replica=2 srv2_lag=0 srv3_lag=0 unhealthy_replica=0\n" + == "CLUSTERHASREPLICA OK - healthy_replica is 2 | healthy_replica=2 srv2_lag=0 srv2_sync=0 srv3_lag=0 srv3_sync=1 sync_replica=1 unhealthy_replica=0\n" ) @@ -44,7 +44,30 @@ def test_cluster_has_replica_ok_with_count_thresholds( assert result.exit_code == 0 assert ( result.stdout - == "CLUSTERHASREPLICA OK - healthy_replica is 2 | healthy_replica=2;@1;@0 srv2_lag=0 srv3_lag=0 unhealthy_replica=0\n" + == "CLUSTERHASREPLICA OK - healthy_replica is 2 | healthy_replica=2;@1;@0 srv2_lag=0 srv2_sync=0 srv3_lag=0 srv3_sync=1 sync_replica=1 unhealthy_replica=0\n" + ) + + +def test_cluster_has_replica_ok_with_sync_count_thresholds( + mocker: MockerFixture, use_old_replica_state: bool +) -> None: + runner = CliRunner() + + my_mock(mocker, "cluster_has_replica_ok", 200, use_old_replica_state) + result = runner.invoke( + main, + [ + "-e", + "https://10.20.199.3:8008", + "cluster_has_replica", + "--sync-warning", + "1:", + ], + ) + assert result.exit_code == 0 + assert ( + result.stdout + == "CLUSTERHASREPLICA OK - healthy_replica is 2 | healthy_replica=2 srv2_lag=0 srv2_sync=0 srv3_lag=0 srv3_sync=1 sync_replica=1;1: unhealthy_replica=0\n" ) @@ -72,7 +95,7 @@ def test_cluster_has_replica_ok_with_count_thresholds_lag( assert result.exit_code == 0 assert ( result.stdout - == "CLUSTERHASREPLICA OK - healthy_replica 
is 2 | healthy_replica=2;@1;@0 srv2_lag=1024 srv3_lag=0 unhealthy_replica=0\n" + == "CLUSTERHASREPLICA OK - healthy_replica is 2 | healthy_replica=2;@1;@0 srv2_lag=1024 srv2_sync=0 srv3_lag=0 srv3_sync=0 sync_replica=0 unhealthy_replica=0\n" ) @@ -97,7 +120,32 @@ def test_cluster_has_replica_ko_with_count_thresholds( assert result.exit_code == 1 assert ( result.stdout - == "CLUSTERHASREPLICA WARNING - healthy_replica is 1 (outside range @0:1) | healthy_replica=1;@1;@0 srv3_lag=0 unhealthy_replica=1\n" + == "CLUSTERHASREPLICA WARNING - healthy_replica is 1 (outside range @0:1) | healthy_replica=1;@1;@0 srv3_lag=0 srv3_sync=0 sync_replica=0 unhealthy_replica=1\n" + ) + + +def test_cluster_has_replica_ko_with_sync_count_thresholds( + mocker: MockerFixture, use_old_replica_state: bool +) -> None: + runner = CliRunner() + + my_mock(mocker, "cluster_has_replica_ko", 200, use_old_replica_state) + result = runner.invoke( + main, + [ + "-e", + "https://10.20.199.3:8008", + "cluster_has_replica", + "--sync-warning", + "2:", + "--sync-critical", + "1:", + ], + ) + assert result.exit_code == 2 + assert ( + result.stdout + == "CLUSTERHASREPLICA CRITICAL - sync_replica is 0 (outside range 1:) | healthy_replica=1 srv3_lag=0 srv3_sync=0 sync_replica=0;2:;1: unhealthy_replica=1\n" ) @@ -125,5 +173,5 @@ def test_cluster_has_replica_ko_with_count_thresholds_and_lag( assert result.exit_code == 2 assert ( result.stdout - == "CLUSTERHASREPLICA CRITICAL - healthy_replica is 0 (outside range @0:0) | healthy_replica=0;@1;@0 srv2_lag=10241024 srv3_lag=20000000 unhealthy_replica=2\n" + == "CLUSTERHASREPLICA CRITICAL - healthy_replica is 0 (outside range @0:0) | healthy_replica=0;@1;@0 srv2_lag=10241024 srv2_sync=0 srv3_lag=20000000 srv3_sync=0 sync_replica=0 unhealthy_replica=2\n" )