Node and Cluster services reviews
This commit is contained in:
parent
ec3b8f6806
commit
4de20fefdc
20
README.md
20
README.md
|
@ -23,7 +23,7 @@ Options:
|
|||
Commands:
|
||||
cluster_config_has_changed Check if the hash of the configuration has...
|
||||
cluster_has_leader Check if the cluster has a leader.
|
||||
cluster_has_replica Check if the cluster has healthy replicates.
|
||||
cluster_has_replica Check if the cluster has healthy replicas.
|
||||
cluster_is_in_maintenance Check if the cluster is in maintenance mode...
|
||||
cluster_node_count Count the number of nodes in the cluster.
|
||||
node_is_alive Check if the node is alive ie patroni is...
|
||||
|
@ -133,6 +133,8 @@ Usage: check_patroni cluster_has_leader [OPTIONS]
|
|||
|
||||
Check if the cluster has a leader.
|
||||
|
||||
Note: there is no difference between a normal and standby leader.
|
||||
|
||||
Check:
|
||||
* `OK`: if there is a leader node.
|
||||
* `CRITICAL`: otherwise
|
||||
|
@ -148,9 +150,9 @@ Options:
|
|||
```
|
||||
Usage: check_patroni cluster_has_replica [OPTIONS]
|
||||
|
||||
Check if the cluster has healthy replicates.
|
||||
Check if the cluster has healthy replicas.
|
||||
|
||||
A healthy replicate:
|
||||
A healthy replica:
|
||||
* is in running state
|
||||
* has a replica role
|
||||
* has a lag lower than or equal to max_lag
|
||||
|
@ -164,8 +166,10 @@ Usage: check_patroni cluster_has_replica [OPTIONS]
|
|||
* the lag of each replica labelled with "member name"_lag
|
||||
|
||||
Options:
|
||||
-w, --warning TEXT Warning threshold for the number of nodes.
|
||||
-c, --critical TEXT Critical threshold for the number of replica nodes.
|
||||
-w, --warning TEXT Warning threshold for the number of healthy replica
|
||||
nodes.
|
||||
-c, --critical TEXT Critical threshold for the number of healthy replica
|
||||
nodes.
|
||||
--max-lag TEXT maximum allowed lag
|
||||
--help Show this message and exit.
|
||||
```
|
||||
|
@ -197,7 +201,7 @@ Usage: check_patroni cluster_node_count [OPTIONS]
|
|||
|
||||
Check:
|
||||
* Compares the number of nodes against the normal and running node warning and critical thresholds.
|
||||
* `OK`! If they are not provided.
|
||||
* `OK`: If they are not provided.
|
||||
|
||||
Perfdata:
|
||||
* `members`: the member count.
|
||||
|
@ -206,9 +210,9 @@ Usage: check_patroni cluster_node_count [OPTIONS]
|
|||
|
||||
Options:
|
||||
-w, --warning TEXT Warning threshold for the number of nodes.
|
||||
-c, --critical TEXT Critical threshold for the nimber of nodes.
|
||||
-c, --critical TEXT Critical threshold for the number of nodes.
|
||||
--running-warning TEXT Warning threshold for the number of running nodes.
|
||||
--running-critical TEXT Critical threshold for the nimber of running nodes.
|
||||
--running-critical TEXT Critical threshold for the number of running nodes.
|
||||
--help Show this message and exit.
|
||||
```
|
||||
|
||||
|
|
|
@ -164,7 +164,7 @@ def main(
|
|||
"--critical",
|
||||
"critical",
|
||||
type=str,
|
||||
help="Critical threshold for the nimber of nodes.",
|
||||
help="Critical threshold for the number of nodes.",
|
||||
)
|
||||
@click.option(
|
||||
"--running-warning",
|
||||
|
@ -176,7 +176,7 @@ def main(
|
|||
"--running-critical",
|
||||
"running_critical",
|
||||
type=str,
|
||||
help="Critical threshold for the nimber of running nodes.",
|
||||
help="Critical threshold for the number of running nodes.",
|
||||
)
|
||||
@click.pass_context
|
||||
@nagiosplugin.guarded
|
||||
|
@ -192,7 +192,7 @@ def cluster_node_count(
|
|||
\b
|
||||
Check:
|
||||
* Compares the number of nodes against the normal and running node warning and critical thresholds.
|
||||
* `OK`! If they are not provided.
|
||||
* `OK`: If they are not provided.
|
||||
|
||||
\b
|
||||
Perfdata:
|
||||
|
@ -213,8 +213,8 @@ def cluster_node_count(
|
|||
running_warning,
|
||||
running_critical,
|
||||
),
|
||||
nagiosplugin.ScalarContext("members_roles"),
|
||||
nagiosplugin.ScalarContext("members_statuses"),
|
||||
nagiosplugin.ScalarContext("member_roles"),
|
||||
nagiosplugin.ScalarContext("member_statuses"),
|
||||
)
|
||||
check.main(verbose=ctx.obj.verbose, timeout=ctx.obj.timeout)
|
||||
|
||||
|
@ -225,6 +225,8 @@ def cluster_node_count(
|
|||
def cluster_has_leader(ctx: click.Context) -> None:
|
||||
"""Check if the cluster has a leader.
|
||||
|
||||
Note: there is no difference between a normal and standby leader.
|
||||
|
||||
\b
|
||||
Check:
|
||||
* `OK`: if there is a leader node.
|
||||
|
@ -232,7 +234,6 @@ def cluster_has_leader(ctx: click.Context) -> None:
|
|||
|
||||
Perfdata: `has_leader` is 1 if there is a leader node, 0 otherwise
|
||||
"""
|
||||
# FIXME: Manage primary or standby leader in the same place ?
|
||||
check = nagiosplugin.Check()
|
||||
check.add(
|
||||
ClusterHasLeader(ctx.obj.connection_info),
|
||||
|
@ -248,14 +249,14 @@ def cluster_has_leader(ctx: click.Context) -> None:
|
|||
"--warning",
|
||||
"warning",
|
||||
type=str,
|
||||
help="Warning threshold for the number of nodes.",
|
||||
help="Warning threshold for the number of healthy replica nodes.",
|
||||
)
|
||||
@click.option(
|
||||
"-c",
|
||||
"--critical",
|
||||
"critical",
|
||||
type=str,
|
||||
help="Critical threshold for the number of replica nodes.",
|
||||
help="Critical threshold for the number of healthy replica nodes.",
|
||||
)
|
||||
@click.option("--max-lag", "max_lag", type=str, help="maximum allowed lag")
|
||||
@click.pass_context
|
||||
|
@ -263,10 +264,10 @@ def cluster_has_leader(ctx: click.Context) -> None:
|
|||
def cluster_has_replica(
|
||||
ctx: click.Context, warning: str, critical: str, max_lag: str
|
||||
) -> None:
|
||||
"""Check if the cluster has healthy replicates.
|
||||
"""Check if the cluster has healthy replicas.
|
||||
|
||||
\b
|
||||
A healthy replicate:
|
||||
A healthy replica:
|
||||
* is in running state
|
||||
* has a replica role
|
||||
* has a lag lower than or equal to max_lag
|
||||
|
@ -324,7 +325,7 @@ def cluster_config_has_changed(
|
|||
Perfdata:
|
||||
* `is_configuration_changed` is 1 if the configuration has changed
|
||||
"""
|
||||
# FIXME hash in perfdata ?
|
||||
# Note: the hash cannot be included in the perfdata because it is not a number
|
||||
if (config_hash is None and state_file is None) or (
|
||||
config_hash is not None and state_file is not None
|
||||
):
|
||||
|
|
|
@ -39,14 +39,14 @@ class ClusterNodeCount(PatroniResource):
|
|||
# The performance data : role
|
||||
for role in role_counters:
|
||||
yield nagiosplugin.Metric(
|
||||
f"role_{role}", role_counters[role], context="members_roles"
|
||||
f"role_{role}", role_counters[role], context="member_roles"
|
||||
)
|
||||
|
||||
# The performance data : statuses (except running)
|
||||
for state in status_counters:
|
||||
if state != "running":
|
||||
yield nagiosplugin.Metric(
|
||||
f"state_{state}", status_counters[state], context="members_statuses"
|
||||
f"state_{state}", status_counters[state], context="member_statuses"
|
||||
)
|
||||
|
||||
|
||||
|
@ -111,7 +111,7 @@ class ClusterHasReplica(PatroniResource):
|
|||
# The actual check
|
||||
yield nagiosplugin.Metric("healthy_replica", healthy_replica)
|
||||
|
||||
# The performance data : unheakthy replica count, replicas lag
|
||||
# The performance data : unhealthy replica count, replicas lag
|
||||
yield nagiosplugin.Metric("unhealthy_replica", unhealthy_replica)
|
||||
for replica in replicas:
|
||||
yield nagiosplugin.Metric(
|
||||
|
@ -165,6 +165,7 @@ class ClusterConfigHasChanged(PatroniResource):
|
|||
|
||||
|
||||
class ClusterConfigHasChangedSummary(nagiosplugin.Summary):
|
||||
# TODO: It would be helpful to display the old / new hash here, but it's not a metric.
|
||||
def ok(self: "ClusterConfigHasChangedSummary", results: nagiosplugin.Result) -> str:
|
||||
return "The hash of patroni's dynamic configuration has not changed."
|
||||
|
||||
|
|
Loading…
Reference in a new issue