Add info and options about sync standby to cluster_has_replica

* Add `--sync-warning` and `--sync-critical`
* Add `sync_replica` to track the number of sync replicas in the perf data
* Add `MEMBER_sync` to track if a member is a sync replica in the perf data
2023-08-24 15:43:35 +02:00 · 2023-08-24 15:43:35 +02:00 · ee3837fab1
parent b9fbdfdefd
commit ee3837fab1
5 changed files with 111 additions and 16 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -5,6 +5,7 @@
 ### Added

 * Add `sync_standby` as a valid replica type for `cluster_has_replica`. (contributed by @mattpoel)
+* Add info and options (`--sync-warning` and `--sync-critical`) about sync replica to `cluster_has_replica`.
 * Add a new service `cluster_has_scheduled_action` to warn of any scheduled switchover or restart.
 * Add options to `node_is_replica` to check specifically for a synchronous (`--is-sync`) or asynchronous node (`--is-async`).
 * Add `standby-leader` as a valid leader type for `cluster_has_leader`.
--- a/README.md
+++ b/README.md
@ -37,7 +37,7 @@ Options:
 Commands:
  cluster_config_has_changed    Check if the hash of the configuration...
  cluster_has_leader            Check if the cluster has a leader.
-  cluster_has_replica           Check if the cluster has healthy replicas.
+  cluster_has_replica           Check if the cluster has healthy replicas...
  cluster_has_scheduled_action  Check if the cluster has a scheduled...
  cluster_is_in_maintenance     Check if the cluster is in maintenance...
  cluster_node_count            Count the number of nodes in the cluster.
@ -223,7 +223,7 @@ Options:
 ```
 Usage: check_patroni cluster_has_replica [OPTIONS]

-  Check if the cluster has healthy replicas.
+  Check if the cluster has healthy replicas and/or if some are sync standbys

  A healthy replica:
  * is in running or streaming state (V3.0.4)
@ -232,19 +232,24 @@ Usage: check_patroni cluster_has_replica [OPTIONS]

  Check:
  * `OK`: if the healthy_replica count and their lag are compatible with the replica count threshold.
+          and if the sync_replica count is compatible with the sync replica count threshold.
  * `WARNING` / `CRITICAL`: otherwise

  Perfdata:
  * healthy_replica & unhealthy_replica count
+  * the number of sync_replica, they are included in the previous count
  * the lag of each replica labelled with  "member name"_lag
+  * a boolean to tell if the node is a sync standby labelled with  "member name"_sync

 Options:
-  -w, --warning TEXT   Warning threshold for the number of healthy replica
-                       nodes.
-  -c, --critical TEXT  Critical threshold for the number of healthy replica
-                       nodes.
-  --max-lag TEXT       maximum allowed lag
-  --help               Show this message and exit.
+  -w, --warning TEXT    Warning threshold for the number of healthy replica
+                        nodes.
+  -c, --critical TEXT   Critical threshold for the number of healthy replica
+                        nodes.
+  --sync-warning TEXT   Warning threshold for the number of sync replica.
+  --sync-critical TEXT  Critical threshold for the number of sync replica.
+  --max-lag TEXT        maximum allowed lag
+  --help                Show this message and exit.
 ```

 ### cluster_has_scheduled_action
--- a/check_patroni/cli.py
+++ b/check_patroni/cli.py
@ -316,13 +316,30 @@ def cluster_has_leader(ctx: click.Context) -> None:
    type=str,
    help="Critical threshold for the number of healthy replica nodes.",
 )
+@click.option(
+    "--sync-warning",
+    "sync_warning",
+    type=str,
+    help="Warning threshold for the number of sync replica.",
+)
+@click.option(
+    "--sync-critical",
+    "sync_critical",
+    type=str,
+    help="Critical threshold for the number of sync replica.",
+)
@click.option("--max-lag", "max_lag", type=str, help="maximum allowed lag")
@click.pass_context
@nagiosplugin.guarded
 def cluster_has_replica(
-    ctx: click.Context, warning: str, critical: str, max_lag: str
+    ctx: click.Context,
+    warning: str,
+    critical: str,
+    sync_warning: str,
+    sync_critical: str,
+    max_lag: str,
 ) -> None:
-    """Check if the cluster has healthy replicas.
+    """Check if the cluster has healthy replicas and/or if some are sync standbys

    \b
    A healthy replica:
@ -333,12 +350,15 @@ def cluster_has_replica(
    \b
    Check:
    * `OK`: if the healthy_replica count and their lag are compatible with the replica count threshold.
+            and if the sync_replica count is compatible with the sync replica count threshold.
    * `WARNING` / `CRITICAL`: otherwise

    \b
    Perfdata:
    * healthy_replica & unhealthy_replica count
+    * the number of sync_replica, they are included in the previous count
    * the lag of each replica labelled with  "member name"_lag
+    * a boolean to tell if the node is a sync standby labelled with  "member name"_sync
    """

    tmax_lag = size_to_byte(max_lag) if max_lag is not None else None
@ -350,8 +370,14 @@ def cluster_has_replica(
            warning,
            critical,
        ),
+        nagiosplugin.ScalarContext(
+            "sync_replica",
+            sync_warning,
+            sync_critical,
+        ),
        nagiosplugin.ScalarContext("unhealthy_replica"),
        nagiosplugin.ScalarContext("replica_lag"),
+        nagiosplugin.ScalarContext("replica_sync"),
    )
    check.main(verbose=ctx.obj.verbose, timeout=ctx.obj.timeout)

--- a/check_patroni/cluster.py
+++ b/check_patroni/cluster.py
@ -92,6 +92,7 @@ class ClusterHasReplica(PatroniResource):
        replicas = []
        healthy_replica = 0
        unhealthy_replica = 0
+        sync_replica = 0
        for member in item_dict["members"]:
            # FIXME are there other acceptable states
            if member["role"] in ["replica", "sync_standby"]:
@ -100,7 +101,17 @@ class ClusterHasReplica(PatroniResource):
                    member["state"] in ["running", "streaming"]
                    and member["lag"] != "unknown"
                ):
-                    replicas.append({"name": member["name"], "lag": member["lag"]})
+                    replicas.append(
+                        {
+                            "name": member["name"],
+                            "lag": member["lag"],
+                            "sync": 1 if member["role"] == "sync_standby" else 0,
+                        }
+                    )
+
+                    if member["role"] == "sync_standby":
+                        sync_replica += 1
+
                    if self.max_lag is None or self.max_lag >= int(member["lag"]):
                        healthy_replica += 1
                        continue
@ -108,6 +119,7 @@ class ClusterHasReplica(PatroniResource):

        # The actual check
        yield nagiosplugin.Metric("healthy_replica", healthy_replica)
+        yield nagiosplugin.Metric("sync_replica", sync_replica)

        # The performance data : unhealthy replica count, replicas lag
        yield nagiosplugin.Metric("unhealthy_replica", unhealthy_replica)
@ -115,6 +127,9 @@ class ClusterHasReplica(PatroniResource):
            yield nagiosplugin.Metric(
                f"{replica['name']}_lag", replica["lag"], context="replica_lag"
            )
+            yield nagiosplugin.Metric(
+                f"{replica['name']}_sync", replica["sync"], context="replica_sync"
+            )


 # FIXME is this needed ??
--- a/tests/test_cluster_has_replica.py
+++ b/tests/test_cluster_has_replica.py
@ -19,7 +19,7 @@ def test_cluster_has_relica_ok(
    assert result.exit_code == 0
    assert (
        result.stdout
-        == "CLUSTERHASREPLICA OK - healthy_replica is 2 | healthy_replica=2 srv2_lag=0 srv3_lag=0 unhealthy_replica=0\n"
+        == "CLUSTERHASREPLICA OK - healthy_replica is 2 | healthy_replica=2 srv2_lag=0 srv2_sync=0 srv3_lag=0 srv3_sync=1 sync_replica=1 unhealthy_replica=0\n"
    )


@ -44,7 +44,30 @@ def test_cluster_has_replica_ok_with_count_thresholds(
    assert result.exit_code == 0
    assert (
        result.stdout
-        == "CLUSTERHASREPLICA OK - healthy_replica is 2 | healthy_replica=2;@1;@0 srv2_lag=0 srv3_lag=0 unhealthy_replica=0\n"
+        == "CLUSTERHASREPLICA OK - healthy_replica is 2 | healthy_replica=2;@1;@0 srv2_lag=0 srv2_sync=0 srv3_lag=0 srv3_sync=1 sync_replica=1 unhealthy_replica=0\n"
+    )
+
+
+def test_cluster_has_replica_ok_with_sync_count_thresholds(
+    mocker: MockerFixture, use_old_replica_state: bool
+) -> None:
+    runner = CliRunner()
+
+    my_mock(mocker, "cluster_has_replica_ok", 200, use_old_replica_state)
+    result = runner.invoke(
+        main,
+        [
+            "-e",
+            "https://10.20.199.3:8008",
+            "cluster_has_replica",
+            "--sync-warning",
+            "1:",
+        ],
+    )
+    assert result.exit_code == 0
+    assert (
+        result.stdout
+        == "CLUSTERHASREPLICA OK - healthy_replica is 2 | healthy_replica=2 srv2_lag=0 srv2_sync=0 srv3_lag=0 srv3_sync=1 sync_replica=1;1: unhealthy_replica=0\n"
    )


@ -72,7 +95,7 @@ def test_cluster_has_replica_ok_with_count_thresholds_lag(
    assert result.exit_code == 0
    assert (
        result.stdout
-        == "CLUSTERHASREPLICA OK - healthy_replica is 2 | healthy_replica=2;@1;@0 srv2_lag=1024 srv3_lag=0 unhealthy_replica=0\n"
+        == "CLUSTERHASREPLICA OK - healthy_replica is 2 | healthy_replica=2;@1;@0 srv2_lag=1024 srv2_sync=0 srv3_lag=0 srv3_sync=0 sync_replica=0 unhealthy_replica=0\n"
    )


@ -97,7 +120,32 @@ def test_cluster_has_replica_ko_with_count_thresholds(
    assert result.exit_code == 1
    assert (
        result.stdout
-        == "CLUSTERHASREPLICA WARNING - healthy_replica is 1 (outside range @0:1) | healthy_replica=1;@1;@0 srv3_lag=0 unhealthy_replica=1\n"
+        == "CLUSTERHASREPLICA WARNING - healthy_replica is 1 (outside range @0:1) | healthy_replica=1;@1;@0 srv3_lag=0 srv3_sync=0 sync_replica=0 unhealthy_replica=1\n"
+    )
+
+
+def test_cluster_has_replica_ko_with_sync_count_thresholds(
+    mocker: MockerFixture, use_old_replica_state: bool
+) -> None:
+    runner = CliRunner()
+
+    my_mock(mocker, "cluster_has_replica_ko", 200, use_old_replica_state)
+    result = runner.invoke(
+        main,
+        [
+            "-e",
+            "https://10.20.199.3:8008",
+            "cluster_has_replica",
+            "--sync-warning",
+            "2:",
+            "--sync-critical",
+            "1:",
+        ],
+    )
+    assert result.exit_code == 2
+    assert (
+        result.stdout
+        == "CLUSTERHASREPLICA CRITICAL - sync_replica is 0 (outside range 1:) | healthy_replica=1 srv3_lag=0 srv3_sync=0 sync_replica=0;2:;1: unhealthy_replica=1\n"
    )


@ -125,5 +173,5 @@ def test_cluster_has_replica_ko_with_count_thresholds_and_lag(
    assert result.exit_code == 2
    assert (
        result.stdout
-        == "CLUSTERHASREPLICA CRITICAL - healthy_replica is 0 (outside range @0:0) | healthy_replica=0;@1;@0 srv2_lag=10241024 srv3_lag=20000000 unhealthy_replica=2\n"
+        == "CLUSTERHASREPLICA CRITICAL - healthy_replica is 0 (outside range @0:0) | healthy_replica=0;@1;@0 srv2_lag=10241024 srv2_sync=0 srv3_lag=20000000 srv3_sync=0 sync_replica=0 unhealthy_replica=2\n"
    )