Redefining cluster_node_count
using Patroni 3.0.4's new status indicators
Previously, replica nodes were labeled with a `running` state. As a result, our checks were based on nodes marked as `running` through the `--running-[warning|critical]` options. However, with the recent changes in Patroni 3.0.4, replica nodes now carry a `streaming` state. This shift in terminology calls for an adjustment in our approach. A new metric, `healthy_members`, has been introduced to count both `running` and `streaming` nodes. Key Modifications: * The existing `--running-[warning|critical]` option has been renamed to `--healthy-[warning|critical]`. * Introduction of the `healthy_members` perfdata, which serves as the reference point for the aforementioned options. * Updates to documentation, help messages, and tests.
This commit is contained in:
parent
2f250e846e
commit
021b572e53
13
README.md
13
README.md
|
@ -230,19 +230,22 @@ Usage: check_patroni cluster_node_count [OPTIONS]
|
||||||
* uninitialized
|
* uninitialized
|
||||||
|
|
||||||
Check:
|
Check:
|
||||||
* Compares the number of nodes against the normal and running node warning and critical thresholds.
|
* Compares the number of nodes against the normal and healthy (running + streaming) nodes warning and critical thresholds.
|
||||||
* `OK`: If they are not provided.
|
* `OK`: If they are not provided.
|
||||||
|
|
||||||
Perfdata:
|
Perfdata:
|
||||||
* `members`: the member count.
|
* `members`: the member count.
|
||||||
* all the roles of the nodes in the cluster with their number (start with "role_").
|
* `healthy_members`: the running and streaming member count.
|
||||||
* all the statuses of the nodes in the cluster with their number (start with "state_").
|
* all the roles of the nodes in the cluster with their count (start with "role_").
|
||||||
|
* all the statuses of the nodes in the cluster with their count (start with "state_").
|
||||||
|
|
||||||
Options:
|
Options:
|
||||||
-w, --warning TEXT Warning threshold for the number of nodes.
|
-w, --warning TEXT Warning threshold for the number of nodes.
|
||||||
-c, --critical TEXT Critical threshold for the number of nodes.
|
-c, --critical TEXT Critical threshold for the number of nodes.
|
||||||
--running-warning TEXT Warning threshold for the number of running nodes.
|
--healthy-warning TEXT Warning threshold for the number of healthy nodes
|
||||||
--running-critical TEXT Critical threshold for the number of running nodes.
|
(running + streaming).
|
||||||
|
--healthy-critical TEXT Critical threshold for the number of healthy nodes
|
||||||
|
(running + streaming).
|
||||||
--help Show this message and exit.
|
--help Show this message and exit.
|
||||||
```
|
```
|
||||||
|
|
||||||
|
|
|
@ -201,16 +201,16 @@ def main(
|
||||||
help="Critical threshold for the number of nodes.",
|
help="Critical threshold for the number of nodes.",
|
||||||
)
|
)
|
||||||
@click.option(
|
@click.option(
|
||||||
"--running-warning",
|
"--healthy-warning",
|
||||||
"running_warning",
|
"healthy_warning",
|
||||||
type=str,
|
type=str,
|
||||||
help="Warning threshold for the number of running nodes.",
|
help="Warning threshold for the number of healthy nodes (running + streaming).",
|
||||||
)
|
)
|
||||||
@click.option(
|
@click.option(
|
||||||
"--running-critical",
|
"--healthy-critical",
|
||||||
"running_critical",
|
"healthy_critical",
|
||||||
type=str,
|
type=str,
|
||||||
help="Critical threshold for the number of running nodes.",
|
help="Critical threshold for the number of healthy nodes (running + streaming).",
|
||||||
)
|
)
|
||||||
@click.pass_context
|
@click.pass_context
|
||||||
@nagiosplugin.guarded
|
@nagiosplugin.guarded
|
||||||
|
@ -218,8 +218,8 @@ def cluster_node_count(
|
||||||
ctx: click.Context,
|
ctx: click.Context,
|
||||||
warning: str,
|
warning: str,
|
||||||
critical: str,
|
critical: str,
|
||||||
running_warning: str,
|
healthy_warning: str,
|
||||||
running_critical: str,
|
healthy_critical: str,
|
||||||
) -> None:
|
) -> None:
|
||||||
"""Count the number of nodes in the cluster.
|
"""Count the number of nodes in the cluster.
|
||||||
|
|
||||||
|
@ -245,14 +245,15 @@ def cluster_node_count(
|
||||||
|
|
||||||
\b
|
\b
|
||||||
Check:
|
Check:
|
||||||
* Compares the number of nodes against the normal and running node warning and critical thresholds.
|
* Compares the number of nodes against the normal and healthy (running + streaming) nodes warning and critical thresholds.
|
||||||
* `OK`: If they are not provided.
|
* `OK`: If they are not provided.
|
||||||
|
|
||||||
\b
|
\b
|
||||||
Perfdata:
|
Perfdata:
|
||||||
* `members`: the member count.
|
* `members`: the member count.
|
||||||
* all the roles of the nodes in the cluster with their number (start with "role_").
|
* `healthy_members`: the running and streaming member count.
|
||||||
* all the statuses of the nodes in the cluster with their number (start with "state_").
|
* all the roles of the nodes in the cluster with their count (start with "role_").
|
||||||
|
* all the statuses of the nodes in the cluster with their count (start with "state_").
|
||||||
"""
|
"""
|
||||||
check = nagiosplugin.Check()
|
check = nagiosplugin.Check()
|
||||||
check.add(
|
check.add(
|
||||||
|
@ -263,9 +264,9 @@ def cluster_node_count(
|
||||||
critical,
|
critical,
|
||||||
),
|
),
|
||||||
nagiosplugin.ScalarContext(
|
nagiosplugin.ScalarContext(
|
||||||
"state_running",
|
"healthy_members",
|
||||||
running_warning,
|
healthy_warning,
|
||||||
running_critical,
|
healthy_critical,
|
||||||
),
|
),
|
||||||
nagiosplugin.ScalarContext("member_roles"),
|
nagiosplugin.ScalarContext("member_roles"),
|
||||||
nagiosplugin.ScalarContext("member_statuses"),
|
nagiosplugin.ScalarContext("member_statuses"),
|
||||||
|
|
|
@ -27,9 +27,12 @@ class ClusterNodeCount(PatroniResource):
|
||||||
role_counters.update(roles)
|
role_counters.update(roles)
|
||||||
status_counters.update(statuses)
|
status_counters.update(statuses)
|
||||||
|
|
||||||
# The actual check: members, running state
|
# The actual check: members, healthy_members
|
||||||
yield nagiosplugin.Metric("members", len(item_dict["members"]))
|
yield nagiosplugin.Metric("members", len(item_dict["members"]))
|
||||||
yield nagiosplugin.Metric("state_running", status_counters["running"])
|
yield nagiosplugin.Metric(
|
||||||
|
"healthy_members",
|
||||||
|
status_counters["running"] + status_counters.get("streaming", 0),
|
||||||
|
)
|
||||||
|
|
||||||
# The performance data : role
|
# The performance data : role
|
||||||
for role in role_counters:
|
for role in role_counters:
|
||||||
|
@ -39,7 +42,6 @@ class ClusterNodeCount(PatroniResource):
|
||||||
|
|
||||||
# The performance data : statuses (except running)
|
# The performance data : statuses (except running)
|
||||||
for state in status_counters:
|
for state in status_counters:
|
||||||
if state != "running":
|
|
||||||
yield nagiosplugin.Metric(
|
yield nagiosplugin.Metric(
|
||||||
f"state_{state}", status_counters[state], context="member_statuses"
|
f"state_{state}", status_counters[state], context="member_statuses"
|
||||||
)
|
)
|
||||||
|
|
|
@ -30,9 +30,9 @@ def test_cluster_node_count_ok_with_thresholds(mocker: MockerFixture) -> None:
|
||||||
"@0:1",
|
"@0:1",
|
||||||
"--critical",
|
"--critical",
|
||||||
"@2",
|
"@2",
|
||||||
"--running-warning",
|
"--healthy-warning",
|
||||||
"@2",
|
"@2",
|
||||||
"--running-critical",
|
"--healthy-critical",
|
||||||
"@0:1",
|
"@0:1",
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
|
@ -49,9 +49,9 @@ def test_cluster_node_count_running_warning(mocker: MockerFixture) -> None:
|
||||||
"-e",
|
"-e",
|
||||||
"https://10.20.199.3:8008",
|
"https://10.20.199.3:8008",
|
||||||
"cluster_node_count",
|
"cluster_node_count",
|
||||||
"--running-warning",
|
"--healthy-warning",
|
||||||
"@2",
|
"@2",
|
||||||
"--running-critical",
|
"--healthy-critical",
|
||||||
"@0:1",
|
"@0:1",
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
|
@ -68,9 +68,9 @@ def test_cluster_node_count_running_critical(mocker: MockerFixture) -> None:
|
||||||
"-e",
|
"-e",
|
||||||
"https://10.20.199.3:8008",
|
"https://10.20.199.3:8008",
|
||||||
"cluster_node_count",
|
"cluster_node_count",
|
||||||
"--running-warning",
|
"--healthy-warning",
|
||||||
"@2",
|
"@2",
|
||||||
"--running-critical",
|
"--healthy-critical",
|
||||||
"@0:1",
|
"@0:1",
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
|
|
Loading…
Reference in a new issue