Redefining cluster_node_count using Patroni 3.0.4's new status indicators

Previously, replica nodes were labeled with a `running` state. As a result, our checks were based on nodes marked as `running` through the `--running-[warning|critical]` options. However, with the recent changes in Patroni 3.0.4, replica nodes now carry a `streaming` state. This shift in terminology calls for an adjustment in our approach. A new metric, `healthy_members`, has been introduced to count both `running` and `streaming` nodes.

Key Modifications:

* The existing `--running-[warning|critical]` options have been renamed to `--healthy-[warning|critical]`.
* Introduction of the `healthy_members` perfdata, which serves as the reference point for the aforementioned options.
* Updates to documentation, help messages, and tests.
2023-08-21 10:45:02 +02:00 · 2023-08-21 10:45:02 +02:00 · 021b572e53
parent 2f250e846e
commit 021b572e53
4 changed files with 37 additions and 31 deletions
--- a/README.md
+++ b/README.md
@ -230,19 +230,22 @@ Usage: check_patroni cluster_node_count [OPTIONS]
  * uninitialized

  Check:
-  * Compares the number of nodes against the normal and running node warning and critical thresholds.
+  * Compares the number of nodes against the normal and healthy (running + streaming) nodes warning and critical thresholds.
  * `OK`:  If they are not provided.

  Perfdata:
  * `members`: the member count.
-  * all the roles of the nodes in the cluster with their number (start with "role_").
-  * all the statuses of the nodes in the cluster with their number (start with "state_").
+  * `healthy_members`: the running and streaming member count.
+  * all the roles of the nodes in the cluster with their count (start with "role_").
+  * all the statuses of the nodes in the cluster with their count (start with "state_").

 Options:
  -w, --warning TEXT       Warning threshold for the number of nodes.
  -c, --critical TEXT      Critical threshold for the number of nodes.
-  --running-warning TEXT   Warning threshold for the number of running nodes.
-  --running-critical TEXT  Critical threshold for the number of running nodes.
+  --healthy-warning TEXT   Warning threshold for the number of healthy nodes
+                           (running + streaming).
+  --healthy-critical TEXT  Critical threshold for the number of healthy nodes
+                           (running + streaming).
  --help                   Show this message and exit.
 ```

--- a/check_patroni/cli.py
+++ b/check_patroni/cli.py
@ -201,16 +201,16 @@ def main(
    help="Critical threshold for the number of nodes.",
 )
@click.option(
-    "--running-warning",
-    "running_warning",
+    "--healthy-warning",
+    "healthy_warning",
    type=str,
-    help="Warning threshold for the number of running nodes.",
+    help="Warning threshold for the number of healthy nodes (running + streaming).",
 )
@click.option(
-    "--running-critical",
-    "running_critical",
+    "--healthy-critical",
+    "healthy_critical",
    type=str,
-    help="Critical threshold for the number of running nodes.",
+    help="Critical threshold for the number of healthy nodes (running + streaming).",
 )
@click.pass_context
@nagiosplugin.guarded
@ -218,8 +218,8 @@ def cluster_node_count(
    ctx: click.Context,
    warning: str,
    critical: str,
-    running_warning: str,
-    running_critical: str,
+    healthy_warning: str,
+    healthy_critical: str,
 ) -> None:
    """Count the number of nodes in the cluster.

@ -245,14 +245,15 @@ def cluster_node_count(

    \b
    Check:
-    * Compares the number of nodes against the normal and running node warning and critical thresholds.
+    * Compares the number of nodes against the normal and healthy (running + streaming) nodes warning and critical thresholds.
    * `OK`:  If they are not provided.

    \b
    Perfdata:
    * `members`: the member count.
-    * all the roles of the nodes in the cluster with their number (start with "role_").
-    * all the statuses of the nodes in the cluster with their number (start with "state_").
+    * `healthy_members`: the running and streaming member count.
+    * all the roles of the nodes in the cluster with their count (start with "role_").
+    * all the statuses of the nodes in the cluster with their count (start with "state_").
    """
    check = nagiosplugin.Check()
    check.add(
@ -263,9 +264,9 @@ def cluster_node_count(
            critical,
        ),
        nagiosplugin.ScalarContext(
-            "state_running",
-            running_warning,
-            running_critical,
+            "healthy_members",
+            healthy_warning,
+            healthy_critical,
        ),
        nagiosplugin.ScalarContext("member_roles"),
        nagiosplugin.ScalarContext("member_statuses"),
--- a/check_patroni/cluster.py
+++ b/check_patroni/cluster.py
@ -27,9 +27,12 @@ class ClusterNodeCount(PatroniResource):
        role_counters.update(roles)
        status_counters.update(statuses)

-        # The actual check: members, running state
+        # The actual check: members, healthy_members
        yield nagiosplugin.Metric("members", len(item_dict["members"]))
-        yield nagiosplugin.Metric("state_running", status_counters["running"])
+        yield nagiosplugin.Metric(
+            "healthy_members",
+            status_counters["running"] + status_counters.get("streaming", 0),
+        )

        # The performance data : role
        for role in role_counters:
@ -39,10 +42,9 @@ class ClusterNodeCount(PatroniResource):

        # The performance data : statuses (except running)
        for state in status_counters:
-            if state != "running":
-                yield nagiosplugin.Metric(
-                    f"state_{state}", status_counters[state], context="member_statuses"
-                )
+            yield nagiosplugin.Metric(
+                f"state_{state}", status_counters[state], context="member_statuses"
+            )


 class ClusterHasLeader(PatroniResource):
--- a/tests/test_cluster_node_count.py
+++ b/tests/test_cluster_node_count.py
@ -30,9 +30,9 @@ def test_cluster_node_count_ok_with_thresholds(mocker: MockerFixture) -> None:
            "@0:1",
            "--critical",
            "@2",
-            "--running-warning",
+            "--healthy-warning",
            "@2",
-            "--running-critical",
+            "--healthy-critical",
            "@0:1",
        ],
    )
@ -49,9 +49,9 @@ def test_cluster_node_count_running_warning(mocker: MockerFixture) -> None:
            "-e",
            "https://10.20.199.3:8008",
            "cluster_node_count",
-            "--running-warning",
+            "--healthy-warning",
            "@2",
-            "--running-critical",
+            "--healthy-critical",
            "@0:1",
        ],
    )
@ -68,9 +68,9 @@ def test_cluster_node_count_running_critical(mocker: MockerFixture) -> None:
            "-e",
            "https://10.20.199.3:8008",
            "cluster_node_count",
-            "--running-warning",
+            "--healthy-warning",
            "@2",
-            "--running-critical",
+            "--healthy-critical",
            "@0:1",
        ],
    )