Node and Cluster services reviews

This commit is contained in:
benoit 2022-02-07 14:18:14 +01:00
parent ec3b8f6806
commit 4de20fefdc
3 changed files with 28 additions and 22 deletions

View file

@ -23,7 +23,7 @@ Options:
Commands:
cluster_config_has_changed Check if the hash of the configuration has...
cluster_has_leader Check if the cluster has a leader.
cluster_has_replica Check if the cluster has healthy replicates.
cluster_has_replica Check if the cluster has healthy replicas.
cluster_is_in_maintenance Check if the cluster is in maintenance mode...
cluster_node_count Count the number of nodes in the cluster.
node_is_alive Check if the node is alive ie patroni is...
@ -133,6 +133,8 @@ Usage: check_patroni cluster_has_leader [OPTIONS]
Check if the cluster has a leader.
Note: there is no difference between a normal and standby leader.
Check:
* `OK`: if there is a leader node.
* `CRITICAL`: otherwise
@ -148,9 +150,9 @@ Options:
```
Usage: check_patroni cluster_has_replica [OPTIONS]
Check if the cluster has healthy replicates.
Check if the cluster has healthy replicas.
A healthy replicate:
A healthy replica:
* is in running state
* has a replica role
* has a lag lower or equal to max_lag
@ -164,8 +166,10 @@ Usage: check_patroni cluster_has_replica [OPTIONS]
* the lag of each replica labelled with "member name"_lag
Options:
-w, --warning TEXT Warning threshold for the number of nodes.
-c, --critical TEXT Critical threshold for the number of replica nodes.
-w, --warning TEXT Warning threshold for the number of healthy replica
nodes.
-c, --critical TEXT Critical threshold for the number of healthy replica
nodes.
--max-lag TEXT maximum allowed lag
--help Show this message and exit.
```
@ -197,7 +201,7 @@ Usage: check_patroni cluster_node_count [OPTIONS]
Check:
* Compares the number of nodes against the normal and running node warning and critical thresholds.
* `OK`! If they are not provided.
* `OK`: If they are not provided.
Perfdata:
* `members`: the member count.
@ -206,9 +210,9 @@ Usage: check_patroni cluster_node_count [OPTIONS]
Options:
-w, --warning TEXT Warning threshold for the number of nodes.
-c, --critical TEXT Critical threshold for the nimber of nodes.
-c, --critical TEXT Critical threshold for the number of nodes.
--running-warning TEXT Warning threshold for the number of running nodes.
--running-critical TEXT Critical threshold for the nimber of running nodes.
--running-critical TEXT Critical threshold for the number of running nodes.
--help Show this message and exit.
```

View file

@ -164,7 +164,7 @@ def main(
"--critical",
"critical",
type=str,
help="Critical threshold for the nimber of nodes.",
help="Critical threshold for the number of nodes.",
)
@click.option(
"--running-warning",
@ -176,7 +176,7 @@ def main(
"--running-critical",
"running_critical",
type=str,
help="Critical threshold for the nimber of running nodes.",
help="Critical threshold for the number of running nodes.",
)
@click.pass_context
@nagiosplugin.guarded
@ -192,7 +192,7 @@ def cluster_node_count(
\b
Check:
* Compares the number of nodes against the normal and running node warning and critical thresholds.
* `OK`! If they are not provided.
* `OK`: If they are not provided.
\b
Perfdata:
@ -213,8 +213,8 @@ def cluster_node_count(
running_warning,
running_critical,
),
nagiosplugin.ScalarContext("members_roles"),
nagiosplugin.ScalarContext("members_statuses"),
nagiosplugin.ScalarContext("member_roles"),
nagiosplugin.ScalarContext("member_statuses"),
)
check.main(verbose=ctx.obj.verbose, timeout=ctx.obj.timeout)
@ -225,6 +225,8 @@ def cluster_node_count(
def cluster_has_leader(ctx: click.Context) -> None:
"""Check if the cluster has a leader.
Note: there is no difference between a normal and standby leader.
\b
Check:
* `OK`: if there is a leader node.
@ -232,7 +234,6 @@ def cluster_has_leader(ctx: click.Context) -> None:
Perfdata: `has_leader` is 1 if there is a leader node, 0 otherwise
"""
# FIXME: Manage primary or standby leader in the same place ?
check = nagiosplugin.Check()
check.add(
ClusterHasLeader(ctx.obj.connection_info),
@ -248,14 +249,14 @@ def cluster_has_leader(ctx: click.Context) -> None:
"--warning",
"warning",
type=str,
help="Warning threshold for the number of nodes.",
help="Warning threshold for the number of healthy replica nodes.",
)
@click.option(
"-c",
"--critical",
"critical",
type=str,
help="Critical threshold for the number of replica nodes.",
help="Critical threshold for the number of healthy replica nodes.",
)
@click.option("--max-lag", "max_lag", type=str, help="maximum allowed lag")
@click.pass_context
@ -263,10 +264,10 @@ def cluster_has_leader(ctx: click.Context) -> None:
def cluster_has_replica(
ctx: click.Context, warning: str, critical: str, max_lag: str
) -> None:
"""Check if the cluster has healthy replicates.
"""Check if the cluster has healthy replicas.
\b
A healthy replicate:
A healthy replica:
* is in running state
* has a replica role
* has a lag lower or equal to max_lag
@ -324,7 +325,7 @@ def cluster_config_has_changed(
Perfdata:
* `is_configuration_changed` is 1 if the configuration has changed
"""
# FIXME hash in perfdata ?
# Note: hash cannot be in the perf data = not a number
if (config_hash is None and state_file is None) or (
config_hash is not None and state_file is not None
):

View file

@ -39,14 +39,14 @@ class ClusterNodeCount(PatroniResource):
# The performance data : role
for role in role_counters:
yield nagiosplugin.Metric(
f"role_{role}", role_counters[role], context="members_roles"
f"role_{role}", role_counters[role], context="member_roles"
)
# The performance data : statuses (except running)
for state in status_counters:
if state != "running":
yield nagiosplugin.Metric(
f"state_{state}", status_counters[state], context="members_statuses"
f"state_{state}", status_counters[state], context="member_statuses"
)
@ -111,7 +111,7 @@ class ClusterHasReplica(PatroniResource):
# The actual check
yield nagiosplugin.Metric("healthy_replica", healthy_replica)
# The performance data : unheakthy replica count, replicas lag
# The performance data : unhealthy replica count, replicas lag
yield nagiosplugin.Metric("unhealthy_replica", unhealthy_replica)
for replica in replicas:
yield nagiosplugin.Metric(
@ -165,6 +165,7 @@ class ClusterConfigHasChanged(PatroniResource):
class ClusterConfigHasChangedSummary(nagiosplugin.Summary):
# TODO: It would be helpful to display the old / new hash here, but it's not a metric.
def ok(self: "ClusterConfigHasChangedSummary", results: nagiosplugin.Result) -> str:
return "The hash of patroni's dynamic configuration has not changed."