From 021b572e533da9def786b1ce5ee3a6c0335e3618 Mon Sep 17 00:00:00 2001 From: benoit Date: Mon, 21 Aug 2023 10:45:02 +0200 Subject: [PATCH] Redefining `cluster_node_count` using Patroni 3.0.4's new status indicators Previously, replica nodes were labeled with a `running` state. As a result, our checks were based on nodes marked as `running` through the `--running-[warning|critical]` options. However, with the recent changes in Patroni 3.0.4, replica nodes now carry a `streaming` state. This shift in terminology calls for an adjustment in our approach. A new metric, `healthy_members`, has been introduced to encompass both `running` and `streaming` nodes. Key Modifications: * The existing `--running-[warning|critical]` option is now designated as `--healthy-[warning|critical]`. * Introduction of the `healthy_members` perfdata, which serves as the reference point for the aforementioned options. * Updates to documentation, help messages, and tests. --- README.md | 13 ++++++++----- check_patroni/cli.py | 29 +++++++++++++++-------------- check_patroni/cluster.py | 14 ++++++++------ tests/test_cluster_node_count.py | 12 ++++++------ 4 files changed, 37 insertions(+), 31 deletions(-) diff --git a/README.md b/README.md index de6d94c..17aaf00 100644 --- a/README.md +++ b/README.md @@ -230,19 +230,22 @@ Usage: check_patroni cluster_node_count [OPTIONS] * uninitialized Check: - * Compares the number of nodes against the normal and running node warning and critical thresholds. + * Compares the number of nodes against the normal and healthy (running + streaming) nodes warning and critical thresholds. * `OK`: If they are not provided. Perfdata: * `members`: the member count. - * all the roles of the nodes in the cluster with their number (start with "role_"). - * all the statuses of the nodes in the cluster with their number (start with "state_"). + * `healthy_members`: the running and streaming member count. 
+ * all the roles of the nodes in the cluster with their count (start with "role_"). + * all the statuses of the nodes in the cluster with their count (start with "state_"). Options: -w, --warning TEXT Warning threshold for the number of nodes. -c, --critical TEXT Critical threshold for the number of nodes. - --running-warning TEXT Warning threshold for the number of running nodes. - --running-critical TEXT Critical threshold for the number of running nodes. + --healthy-warning TEXT Warning threshold for the number of healthy nodes + (running + streaming). + --healthy-critical TEXT Critical threshold for the number of healthy nodes + (running + streaming). --help Show this message and exit. ``` diff --git a/check_patroni/cli.py b/check_patroni/cli.py index 52b4989..f7e0cae 100644 --- a/check_patroni/cli.py +++ b/check_patroni/cli.py @@ -201,16 +201,16 @@ def main( help="Critical threshold for the number of nodes.", ) @click.option( - "--running-warning", - "running_warning", + "--healthy-warning", + "healthy_warning", type=str, - help="Warning threshold for the number of running nodes.", + help="Warning threshold for the number of healthy nodes (running + streaming).", ) @click.option( - "--running-critical", - "running_critical", + "--healthy-critical", + "healthy_critical", type=str, - help="Critical threshold for the number of running nodes.", + help="Critical threshold for the number of healthy nodes (running + streaming).", ) @click.pass_context @nagiosplugin.guarded @@ -218,8 +218,8 @@ def cluster_node_count( ctx: click.Context, warning: str, critical: str, - running_warning: str, - running_critical: str, + healthy_warning: str, + healthy_critical: str, ) -> None: """Count the number of nodes in the cluster. @@ -245,14 +245,15 @@ def cluster_node_count( \b Check: - * Compares the number of nodes against the normal and running node warning and critical thresholds. 
+ * Compares the number of nodes against the normal and healthy (running + streaming) nodes warning and critical thresholds. * `OK`: If they are not provided. \b Perfdata: * `members`: the member count. - * all the roles of the nodes in the cluster with their number (start with "role_"). - * all the statuses of the nodes in the cluster with their number (start with "state_"). + * `healthy_members`: the running and streaming member count. + * all the roles of the nodes in the cluster with their count (start with "role_"). + * all the statuses of the nodes in the cluster with their count (start with "state_"). """ check = nagiosplugin.Check() check.add( @@ -263,9 +264,9 @@ def cluster_node_count( critical, ), nagiosplugin.ScalarContext( - "state_running", - running_warning, - running_critical, + "healthy_members", + healthy_warning, + healthy_critical, ), nagiosplugin.ScalarContext("member_roles"), nagiosplugin.ScalarContext("member_statuses"), diff --git a/check_patroni/cluster.py b/check_patroni/cluster.py index b5c1d63..55e9bdc 100644 --- a/check_patroni/cluster.py +++ b/check_patroni/cluster.py @@ -27,9 +27,12 @@ class ClusterNodeCount(PatroniResource): role_counters.update(roles) status_counters.update(statuses) - # The actual check: members, running state + # The actual check: members, healthy_members yield nagiosplugin.Metric("members", len(item_dict["members"])) - yield nagiosplugin.Metric("state_running", status_counters["running"]) + yield nagiosplugin.Metric( + "healthy_members", + status_counters["running"] + status_counters.get("streaming", 0), + ) # The performance data : role for role in role_counters: @@ -39,10 +42,9 @@ class ClusterNodeCount(PatroniResource): # The performance data : statuses (except running) for state in status_counters: - if state != "running": - yield nagiosplugin.Metric( - f"state_{state}", status_counters[state], context="member_statuses" - ) + yield nagiosplugin.Metric( + f"state_{state}", status_counters[state], 
context="member_statuses" + ) class ClusterHasLeader(PatroniResource): diff --git a/tests/test_cluster_node_count.py b/tests/test_cluster_node_count.py index be3bbc0..a03e292 100644 --- a/tests/test_cluster_node_count.py +++ b/tests/test_cluster_node_count.py @@ -30,9 +30,9 @@ def test_cluster_node_count_ok_with_thresholds(mocker: MockerFixture) -> None: "@0:1", "--critical", "@2", - "--running-warning", + "--healthy-warning", "@2", - "--running-critical", + "--healthy-critical", "@0:1", ], ) @@ -49,9 +49,9 @@ def test_cluster_node_count_running_warning(mocker: MockerFixture) -> None: "-e", "https://10.20.199.3:8008", "cluster_node_count", - "--running-warning", + "--healthy-warning", "@2", - "--running-critical", + "--healthy-critical", "@0:1", ], ) @@ -68,9 +68,9 @@ def test_cluster_node_count_running_critical(mocker: MockerFixture) -> None: "-e", "https://10.20.199.3:8008", "cluster_node_count", - "--running-warning", + "--healthy-warning", "@2", - "--running-critical", + "--healthy-critical", "@0:1", ], )