Node and Cluster services reviews

This commit is contained in:
benoit 2022-02-07 14:18:14 +01:00
parent ec3b8f6806
commit 4de20fefdc
3 changed files with 28 additions and 22 deletions

View file

@@ -23,7 +23,7 @@ Options:
Commands: Commands:
cluster_config_has_changed Check if the hash of the configuration has... cluster_config_has_changed Check if the hash of the configuration has...
cluster_has_leader Check if the cluster has a leader. cluster_has_leader Check if the cluster has a leader.
cluster_has_replica Check if the cluster has healthy replicates. cluster_has_replica Check if the cluster has healthy replicas.
cluster_is_in_maintenance Check if the cluster is in maintenance mode... cluster_is_in_maintenance Check if the cluster is in maintenance mode...
cluster_node_count Count the number of nodes in the cluster. cluster_node_count Count the number of nodes in the cluster.
node_is_alive Check if the node is alive ie patroni is... node_is_alive Check if the node is alive ie patroni is...
@@ -133,6 +133,8 @@ Usage: check_patroni cluster_has_leader [OPTIONS]
Check if the cluster has a leader. Check if the cluster has a leader.
Note: there is no difference between a normal and standby leader.
Check: Check:
* `OK`: if there is a leader node. * `OK`: if there is a leader node.
* `CRITICAL`: otherwise * `CRITICAL`: otherwise
@@ -148,9 +150,9 @@ Options:
``` ```
Usage: check_patroni cluster_has_replica [OPTIONS] Usage: check_patroni cluster_has_replica [OPTIONS]
Check if the cluster has healthy replicates. Check if the cluster has healthy replicas.
A healthy replicate: A healthy replica:
* is in running state * is in running state
* has a replica role * has a replica role
* has a lag lower or equal to max_lag * has a lag lower or equal to max_lag
@@ -164,8 +166,10 @@ Usage: check_patroni cluster_has_replica [OPTIONS]
* the lag of each replica labelled with "member name"_lag * the lag of each replica labelled with "member name"_lag
Options: Options:
-w, --warning TEXT Warning threshold for the number of nodes. -w, --warning TEXT Warning threshold for the number of healthy replica
-c, --critical TEXT Critical threshold for the number of replica nodes. nodes.
-c, --critical TEXT Critical threshold for the number of healthy replica
nodes.
--max-lag TEXT maximum allowed lag --max-lag TEXT maximum allowed lag
--help Show this message and exit. --help Show this message and exit.
``` ```
@@ -197,7 +201,7 @@ Usage: check_patroni cluster_node_count [OPTIONS]
Check: Check:
* Compares the number of nodes against the normal and running node warning and critical thresholds. * Compares the number of nodes against the normal and running node warning and critical thresholds.
* `OK`! If they are not provided. * `OK`: If they are not provided.
Perfdata: Perfdata:
* `members`: the member count. * `members`: the member count.
@@ -206,9 +210,9 @@ Usage: check_patroni cluster_node_count [OPTIONS]
Options: Options:
-w, --warning TEXT Warning threshold for the number of nodes. -w, --warning TEXT Warning threshold for the number of nodes.
-c, --critical TEXT Critical threshold for the nimber of nodes. -c, --critical TEXT Critical threshold for the number of nodes.
--running-warning TEXT Warning threshold for the number of running nodes. --running-warning TEXT Warning threshold for the number of running nodes.
--running-critical TEXT Critical threshold for the nimber of running nodes. --running-critical TEXT Critical threshold for the number of running nodes.
--help Show this message and exit. --help Show this message and exit.
``` ```

View file

@@ -164,7 +164,7 @@ def main(
"--critical", "--critical",
"critical", "critical",
type=str, type=str,
help="Critical threshold for the nimber of nodes.", help="Critical threshold for the number of nodes.",
) )
@click.option( @click.option(
"--running-warning", "--running-warning",
@@ -176,7 +176,7 @@ def main(
"--running-critical", "--running-critical",
"running_critical", "running_critical",
type=str, type=str,
help="Critical threshold for the nimber of running nodes.", help="Critical threshold for the number of running nodes.",
) )
@click.pass_context @click.pass_context
@nagiosplugin.guarded @nagiosplugin.guarded
@@ -192,7 +192,7 @@ def cluster_node_count(
\b \b
Check: Check:
* Compares the number of nodes against the normal and running node warning and critical thresholds. * Compares the number of nodes against the normal and running node warning and critical thresholds.
* `OK`! If they are not provided. * `OK`: If they are not provided.
\b \b
Perfdata: Perfdata:
@@ -213,8 +213,8 @@ def cluster_node_count(
running_warning, running_warning,
running_critical, running_critical,
), ),
nagiosplugin.ScalarContext("members_roles"), nagiosplugin.ScalarContext("member_roles"),
nagiosplugin.ScalarContext("members_statuses"), nagiosplugin.ScalarContext("member_statuses"),
) )
check.main(verbose=ctx.obj.verbose, timeout=ctx.obj.timeout) check.main(verbose=ctx.obj.verbose, timeout=ctx.obj.timeout)
@@ -225,6 +225,8 @@ def cluster_node_count(
def cluster_has_leader(ctx: click.Context) -> None: def cluster_has_leader(ctx: click.Context) -> None:
"""Check if the cluster has a leader. """Check if the cluster has a leader.
Note: there is no difference between a normal and standby leader.
\b \b
Check: Check:
* `OK`: if there is a leader node. * `OK`: if there is a leader node.
@@ -232,7 +234,6 @@ def cluster_has_leader(ctx: click.Context) -> None:
Perfdata: `has_leader` is 1 if there is a leader node, 0 otherwise Perfdata: `has_leader` is 1 if there is a leader node, 0 otherwise
""" """
# FIXME: Manage primary or standby leader in the same place ?
check = nagiosplugin.Check() check = nagiosplugin.Check()
check.add( check.add(
ClusterHasLeader(ctx.obj.connection_info), ClusterHasLeader(ctx.obj.connection_info),
@@ -248,14 +249,14 @@ def cluster_has_leader(ctx: click.Context) -> None:
"--warning", "--warning",
"warning", "warning",
type=str, type=str,
help="Warning threshold for the number of nodes.", help="Warning threshold for the number of healthy replica nodes.",
) )
@click.option( @click.option(
"-c", "-c",
"--critical", "--critical",
"critical", "critical",
type=str, type=str,
help="Critical threshold for the number of replica nodes.", help="Critical threshold for the number of healthy replica nodes.",
) )
@click.option("--max-lag", "max_lag", type=str, help="maximum allowed lag") @click.option("--max-lag", "max_lag", type=str, help="maximum allowed lag")
@click.pass_context @click.pass_context
@@ -263,10 +264,10 @@ def cluster_has_leader(ctx: click.Context) -> None:
def cluster_has_replica( def cluster_has_replica(
ctx: click.Context, warning: str, critical: str, max_lag: str ctx: click.Context, warning: str, critical: str, max_lag: str
) -> None: ) -> None:
"""Check if the cluster has healthy replicates. """Check if the cluster has healthy replicas.
\b \b
A healthy replicate: A healthy replica:
* is in running state * is in running state
* has a replica role * has a replica role
* has a lag lower or equal to max_lag * has a lag lower or equal to max_lag
@@ -324,7 +325,7 @@ def cluster_config_has_changed(
Perfdata: Perfdata:
* `is_configuration_changed` is 1 if the configuration has changed * `is_configuration_changed` is 1 if the configuration has changed
""" """
# FIXME hash in perfdata ? # Note: hash cannot be in the perf data = not a number
if (config_hash is None and state_file is None) or ( if (config_hash is None and state_file is None) or (
config_hash is not None and state_file is not None config_hash is not None and state_file is not None
): ):

View file

@@ -39,14 +39,14 @@ class ClusterNodeCount(PatroniResource):
# The performance data : role # The performance data : role
for role in role_counters: for role in role_counters:
yield nagiosplugin.Metric( yield nagiosplugin.Metric(
f"role_{role}", role_counters[role], context="members_roles" f"role_{role}", role_counters[role], context="member_roles"
) )
# The performance data : statuses (except running) # The performance data : statuses (except running)
for state in status_counters: for state in status_counters:
if state != "running": if state != "running":
yield nagiosplugin.Metric( yield nagiosplugin.Metric(
f"state_{state}", status_counters[state], context="members_statuses" f"state_{state}", status_counters[state], context="member_statuses"
) )
@@ -111,7 +111,7 @@ class ClusterHasReplica(PatroniResource):
# The actual check # The actual check
yield nagiosplugin.Metric("healthy_replica", healthy_replica) yield nagiosplugin.Metric("healthy_replica", healthy_replica)
# The performance data : unheakthy replica count, replicas lag # The performance data : unhealthy replica count, replicas lag
yield nagiosplugin.Metric("unhealthy_replica", unhealthy_replica) yield nagiosplugin.Metric("unhealthy_replica", unhealthy_replica)
for replica in replicas: for replica in replicas:
yield nagiosplugin.Metric( yield nagiosplugin.Metric(
@@ -165,6 +165,7 @@ class ClusterConfigHasChanged(PatroniResource):
class ClusterConfigHasChangedSummary(nagiosplugin.Summary): class ClusterConfigHasChangedSummary(nagiosplugin.Summary):
# TODO: It would be helpful to display the old / new hash here, but it's not a metric.
def ok(self: "ClusterConfigHasChangedSummary", results: nagiosplugin.Result) -> str: def ok(self: "ClusterConfigHasChangedSummary", results: nagiosplugin.Result) -> str:
return "The hash of patroni's dynamic configuration has not changed." return "The hash of patroni's dynamic configuration has not changed."