Redefining cluster_node_count using Patroni 3.0.4's new status indicators

Previously, replica nodes were labeled with a `running` state. As a
result, our checks were based on nodes marked as `running` through
the `--running-[warning|critical]` options.

However, with the recent changes in Patroni 3.0.4, replica nodes now
carry a `streaming` state. This shift in terminology calls for an
adjustment in our approach. A new metric, `healthy_members`, has been
introduced to count both `running` and `streaming` nodes.

Key Modifications:

* The existing `--running-[warning|critical]` options are renamed to
  `--healthy-[warning|critical]`.
* Introduction of the `healthy_members` perfdata, which serves as the
  reference point for the aforementioned options.
* Updates to documentation, help messages, and tests.
This commit is contained in:
benoit 2023-08-21 10:45:02 +02:00 committed by Benoit
parent 2f250e846e
commit 021b572e53
4 changed files with 37 additions and 31 deletions

View file

@ -230,19 +230,22 @@ Usage: check_patroni cluster_node_count [OPTIONS]
* uninitialized * uninitialized
Check: Check:
* Compares the number of nodes against the normal and running node warning and critical thresholds. * Compares the number of nodes against the normal and healthy (running + streaming) nodes warning and critical thresholds.
* `OK`: If they are not provided. * `OK`: If they are not provided.
Perfdata: Perfdata:
* `members`: the member count. * `members`: the member count.
* all the roles of the nodes in the cluster with their number (start with "role_"). * `healthy_members`: the running and streaming member count.
* all the statuses of the nodes in the cluster with their number (start with "state_"). * all the roles of the nodes in the cluster with their count (start with "role_").
* all the statuses of the nodes in the cluster with their count (start with "state_").
Options: Options:
-w, --warning TEXT Warning threshold for the number of nodes. -w, --warning TEXT Warning threshold for the number of nodes.
-c, --critical TEXT Critical threshold for the number of nodes. -c, --critical TEXT Critical threshold for the number of nodes.
--running-warning TEXT Warning threshold for the number of running nodes. --healthy-warning TEXT Warning threshold for the number of healthy nodes
--running-critical TEXT Critical threshold for the number of running nodes. (running + streaming).
--healthy-critical TEXT Critical threshold for the number of healthy nodes
(running + streaming).
--help Show this message and exit. --help Show this message and exit.
``` ```

View file

@ -201,16 +201,16 @@ def main(
help="Critical threshold for the number of nodes.", help="Critical threshold for the number of nodes.",
) )
@click.option( @click.option(
"--running-warning", "--healthy-warning",
"running_warning", "healthy_warning",
type=str, type=str,
help="Warning threshold for the number of running nodes.", help="Warning threshold for the number of healthy nodes (running + streaming).",
) )
@click.option( @click.option(
"--running-critical", "--healthy-critical",
"running_critical", "healthy_critical",
type=str, type=str,
help="Critical threshold for the number of running nodes.", help="Critical threshold for the number of healthy nodes (running + streaming).",
) )
@click.pass_context @click.pass_context
@nagiosplugin.guarded @nagiosplugin.guarded
@ -218,8 +218,8 @@ def cluster_node_count(
ctx: click.Context, ctx: click.Context,
warning: str, warning: str,
critical: str, critical: str,
running_warning: str, healthy_warning: str,
running_critical: str, healthy_critical: str,
) -> None: ) -> None:
"""Count the number of nodes in the cluster. """Count the number of nodes in the cluster.
@ -245,14 +245,15 @@ def cluster_node_count(
\b \b
Check: Check:
* Compares the number of nodes against the normal and running node warning and critical thresholds. * Compares the number of nodes against the normal and healthy (running + streaming) nodes warning and critical thresholds.
* `OK`: If they are not provided. * `OK`: If they are not provided.
\b \b
Perfdata: Perfdata:
* `members`: the member count. * `members`: the member count.
* all the roles of the nodes in the cluster with their number (start with "role_"). * `healthy_members`: the running and streaming member count.
* all the statuses of the nodes in the cluster with their number (start with "state_"). * all the roles of the nodes in the cluster with their count (start with "role_").
* all the statuses of the nodes in the cluster with their count (start with "state_").
""" """
check = nagiosplugin.Check() check = nagiosplugin.Check()
check.add( check.add(
@ -263,9 +264,9 @@ def cluster_node_count(
critical, critical,
), ),
nagiosplugin.ScalarContext( nagiosplugin.ScalarContext(
"state_running", "healthy_members",
running_warning, healthy_warning,
running_critical, healthy_critical,
), ),
nagiosplugin.ScalarContext("member_roles"), nagiosplugin.ScalarContext("member_roles"),
nagiosplugin.ScalarContext("member_statuses"), nagiosplugin.ScalarContext("member_statuses"),

View file

@ -27,9 +27,12 @@ class ClusterNodeCount(PatroniResource):
role_counters.update(roles) role_counters.update(roles)
status_counters.update(statuses) status_counters.update(statuses)
# The actual check: members, running state # The actual check: members, healthy_members
yield nagiosplugin.Metric("members", len(item_dict["members"])) yield nagiosplugin.Metric("members", len(item_dict["members"]))
yield nagiosplugin.Metric("state_running", status_counters["running"]) yield nagiosplugin.Metric(
"healthy_members",
status_counters["running"] + status_counters.get("streaming", 0),
)
# The performance data : role # The performance data : role
for role in role_counters: for role in role_counters:
@ -39,10 +42,9 @@ class ClusterNodeCount(PatroniResource):
# The performance data : statuses (except running) # The performance data : statuses (except running)
for state in status_counters: for state in status_counters:
if state != "running": yield nagiosplugin.Metric(
yield nagiosplugin.Metric( f"state_{state}", status_counters[state], context="member_statuses"
f"state_{state}", status_counters[state], context="member_statuses" )
)
class ClusterHasLeader(PatroniResource): class ClusterHasLeader(PatroniResource):

View file

@ -30,9 +30,9 @@ def test_cluster_node_count_ok_with_thresholds(mocker: MockerFixture) -> None:
"@0:1", "@0:1",
"--critical", "--critical",
"@2", "@2",
"--running-warning", "--healthy-warning",
"@2", "@2",
"--running-critical", "--healthy-critical",
"@0:1", "@0:1",
], ],
) )
@ -49,9 +49,9 @@ def test_cluster_node_count_running_warning(mocker: MockerFixture) -> None:
"-e", "-e",
"https://10.20.199.3:8008", "https://10.20.199.3:8008",
"cluster_node_count", "cluster_node_count",
"--running-warning", "--healthy-warning",
"@2", "@2",
"--running-critical", "--healthy-critical",
"@0:1", "@0:1",
], ],
) )
@ -68,9 +68,9 @@ def test_cluster_node_count_running_critical(mocker: MockerFixture) -> None:
"-e", "-e",
"https://10.20.199.3:8008", "https://10.20.199.3:8008",
"cluster_node_count", "cluster_node_count",
"--running-warning", "--healthy-warning",
"@2", "@2",
"--running-critical", "--healthy-critical",
"@0:1", "@0:1",
], ],
) )