Node and Cluster services reviews
This commit is contained in:
parent
ec3b8f6806
commit
4de20fefdc
20
README.md
20
README.md
|
@ -23,7 +23,7 @@ Options:
|
||||||
Commands:
|
Commands:
|
||||||
cluster_config_has_changed Check if the hash of the configuration has...
|
cluster_config_has_changed Check if the hash of the configuration has...
|
||||||
cluster_has_leader Check if the cluster has a leader.
|
cluster_has_leader Check if the cluster has a leader.
|
||||||
cluster_has_replica Check if the cluster has healthy replicates.
|
cluster_has_replica Check if the cluster has healthy replicas.
|
||||||
cluster_is_in_maintenance Check if the cluster is in maintenance mode...
|
cluster_is_in_maintenance Check if the cluster is in maintenance mode...
|
||||||
cluster_node_count Count the number of nodes in the cluster.
|
cluster_node_count Count the number of nodes in the cluster.
|
||||||
node_is_alive Check if the node is alive ie patroni is...
|
node_is_alive Check if the node is alive ie patroni is...
|
||||||
|
@ -133,6 +133,8 @@ Usage: check_patroni cluster_has_leader [OPTIONS]
|
||||||
|
|
||||||
Check if the cluster has a leader.
|
Check if the cluster has a leader.
|
||||||
|
|
||||||
|
Note: there is no difference between a normal and standby leader.
|
||||||
|
|
||||||
Check:
|
Check:
|
||||||
* `OK`: if there is a leader node.
|
* `OK`: if there is a leader node.
|
||||||
* `CRITICAL`: otherwise
|
* `CRITICAL`: otherwise
|
||||||
|
@ -148,9 +150,9 @@ Options:
|
||||||
```
|
```
|
||||||
Usage: check_patroni cluster_has_replica [OPTIONS]
|
Usage: check_patroni cluster_has_replica [OPTIONS]
|
||||||
|
|
||||||
Check if the cluster has healthy replicates.
|
Check if the cluster has healthy replicas.
|
||||||
|
|
||||||
A healthy replicate:
|
A healthy replica:
|
||||||
* is in running state
|
* is in running state
|
||||||
* has a replica role
|
* has a replica role
|
||||||
* has a lag lower than or equal to max_lag
|
* has a lag lower than or equal to max_lag
|
||||||
|
@ -164,8 +166,10 @@ Usage: check_patroni cluster_has_replica [OPTIONS]
|
||||||
* the lag of each replica labelled with "member name"_lag
|
* the lag of each replica labelled with "member name"_lag
|
||||||
|
|
||||||
Options:
|
Options:
|
||||||
-w, --warning TEXT Warning threshold for the number of nodes.
|
-w, --warning TEXT Warning threshold for the number of healthy replica
|
||||||
-c, --critical TEXT Critical threshold for the number of replica nodes.
|
nodes.
|
||||||
|
-c, --critical TEXT Critical threshold for the number of healthy replica
|
||||||
|
nodes.
|
||||||
--max-lag TEXT maximum allowed lag
|
--max-lag TEXT maximum allowed lag
|
||||||
--help Show this message and exit.
|
--help Show this message and exit.
|
||||||
```
|
```
|
||||||
|
@ -197,7 +201,7 @@ Usage: check_patroni cluster_node_count [OPTIONS]
|
||||||
|
|
||||||
Check:
|
Check:
|
||||||
* Compares the number of nodes against the normal and running node warning and critical thresholds.
|
* Compares the number of nodes against the normal and running node warning and critical thresholds.
|
||||||
* `OK`! If they are not provided.
|
* `OK`: If they are not provided.
|
||||||
|
|
||||||
Perfdata:
|
Perfdata:
|
||||||
* `members`: the member count.
|
* `members`: the member count.
|
||||||
|
@ -206,9 +210,9 @@ Usage: check_patroni cluster_node_count [OPTIONS]
|
||||||
|
|
||||||
Options:
|
Options:
|
||||||
-w, --warning TEXT Warning threshold for the number of nodes.
|
-w, --warning TEXT Warning threshold for the number of nodes.
|
||||||
-c, --critical TEXT Critical threshold for the nimber of nodes.
|
-c, --critical TEXT Critical threshold for the number of nodes.
|
||||||
--running-warning TEXT Warning threshold for the number of running nodes.
|
--running-warning TEXT Warning threshold for the number of running nodes.
|
||||||
--running-critical TEXT Critical threshold for the nimber of running nodes.
|
--running-critical TEXT Critical threshold for the number of running nodes.
|
||||||
--help Show this message and exit.
|
--help Show this message and exit.
|
||||||
```
|
```
|
||||||
|
|
||||||
|
|
|
@ -164,7 +164,7 @@ def main(
|
||||||
"--critical",
|
"--critical",
|
||||||
"critical",
|
"critical",
|
||||||
type=str,
|
type=str,
|
||||||
help="Critical threshold for the nimber of nodes.",
|
help="Critical threshold for the number of nodes.",
|
||||||
)
|
)
|
||||||
@click.option(
|
@click.option(
|
||||||
"--running-warning",
|
"--running-warning",
|
||||||
|
@ -176,7 +176,7 @@ def main(
|
||||||
"--running-critical",
|
"--running-critical",
|
||||||
"running_critical",
|
"running_critical",
|
||||||
type=str,
|
type=str,
|
||||||
help="Critical threshold for the nimber of running nodes.",
|
help="Critical threshold for the number of running nodes.",
|
||||||
)
|
)
|
||||||
@click.pass_context
|
@click.pass_context
|
||||||
@nagiosplugin.guarded
|
@nagiosplugin.guarded
|
||||||
|
@ -192,7 +192,7 @@ def cluster_node_count(
|
||||||
\b
|
\b
|
||||||
Check:
|
Check:
|
||||||
* Compares the number of nodes against the normal and running node warning and critical thresholds.
|
* Compares the number of nodes against the normal and running node warning and critical thresholds.
|
||||||
* `OK`! If they are not provided.
|
* `OK`: If they are not provided.
|
||||||
|
|
||||||
\b
|
\b
|
||||||
Perfdata:
|
Perfdata:
|
||||||
|
@ -213,8 +213,8 @@ def cluster_node_count(
|
||||||
running_warning,
|
running_warning,
|
||||||
running_critical,
|
running_critical,
|
||||||
),
|
),
|
||||||
nagiosplugin.ScalarContext("members_roles"),
|
nagiosplugin.ScalarContext("member_roles"),
|
||||||
nagiosplugin.ScalarContext("members_statuses"),
|
nagiosplugin.ScalarContext("member_statuses"),
|
||||||
)
|
)
|
||||||
check.main(verbose=ctx.obj.verbose, timeout=ctx.obj.timeout)
|
check.main(verbose=ctx.obj.verbose, timeout=ctx.obj.timeout)
|
||||||
|
|
||||||
|
@ -225,6 +225,8 @@ def cluster_node_count(
|
||||||
def cluster_has_leader(ctx: click.Context) -> None:
|
def cluster_has_leader(ctx: click.Context) -> None:
|
||||||
"""Check if the cluster has a leader.
|
"""Check if the cluster has a leader.
|
||||||
|
|
||||||
|
Note: there is no difference between a normal and standby leader.
|
||||||
|
|
||||||
\b
|
\b
|
||||||
Check:
|
Check:
|
||||||
* `OK`: if there is a leader node.
|
* `OK`: if there is a leader node.
|
||||||
|
@ -232,7 +234,6 @@ def cluster_has_leader(ctx: click.Context) -> None:
|
||||||
|
|
||||||
Perfdata: `has_leader` is 1 if there is a leader node, 0 otherwise
|
Perfdata: `has_leader` is 1 if there is a leader node, 0 otherwise
|
||||||
"""
|
"""
|
||||||
# FIXME: Manage primary or standby leader in the same place ?
|
|
||||||
check = nagiosplugin.Check()
|
check = nagiosplugin.Check()
|
||||||
check.add(
|
check.add(
|
||||||
ClusterHasLeader(ctx.obj.connection_info),
|
ClusterHasLeader(ctx.obj.connection_info),
|
||||||
|
@ -248,14 +249,14 @@ def cluster_has_leader(ctx: click.Context) -> None:
|
||||||
"--warning",
|
"--warning",
|
||||||
"warning",
|
"warning",
|
||||||
type=str,
|
type=str,
|
||||||
help="Warning threshold for the number of nodes.",
|
help="Warning threshold for the number of healthy replica nodes.",
|
||||||
)
|
)
|
||||||
@click.option(
|
@click.option(
|
||||||
"-c",
|
"-c",
|
||||||
"--critical",
|
"--critical",
|
||||||
"critical",
|
"critical",
|
||||||
type=str,
|
type=str,
|
||||||
help="Critical threshold for the number of replica nodes.",
|
help="Critical threshold for the number of healthy replica nodes.",
|
||||||
)
|
)
|
||||||
@click.option("--max-lag", "max_lag", type=str, help="maximum allowed lag")
|
@click.option("--max-lag", "max_lag", type=str, help="maximum allowed lag")
|
||||||
@click.pass_context
|
@click.pass_context
|
||||||
|
@ -263,10 +264,10 @@ def cluster_has_leader(ctx: click.Context) -> None:
|
||||||
def cluster_has_replica(
|
def cluster_has_replica(
|
||||||
ctx: click.Context, warning: str, critical: str, max_lag: str
|
ctx: click.Context, warning: str, critical: str, max_lag: str
|
||||||
) -> None:
|
) -> None:
|
||||||
"""Check if the cluster has healthy replicates.
|
"""Check if the cluster has healthy replicas.
|
||||||
|
|
||||||
\b
|
\b
|
||||||
A healthy replicate:
|
A healthy replica:
|
||||||
* is in running state
|
* is in running state
|
||||||
* has a replica role
|
* has a replica role
|
||||||
* has a lag lower than or equal to max_lag
|
* has a lag lower than or equal to max_lag
|
||||||
|
@ -324,7 +325,7 @@ def cluster_config_has_changed(
|
||||||
Perfdata:
|
Perfdata:
|
||||||
* `is_configuration_changed` is 1 if the configuration has changed
|
* `is_configuration_changed` is 1 if the configuration has changed
|
||||||
"""
|
"""
|
||||||
# FIXME hash in perfdata ?
|
# Note: the hash cannot be exposed as perfdata because it is not a number.
|
||||||
if (config_hash is None and state_file is None) or (
|
if (config_hash is None and state_file is None) or (
|
||||||
config_hash is not None and state_file is not None
|
config_hash is not None and state_file is not None
|
||||||
):
|
):
|
||||||
|
|
|
@ -39,14 +39,14 @@ class ClusterNodeCount(PatroniResource):
|
||||||
# The performance data : role
|
# The performance data : role
|
||||||
for role in role_counters:
|
for role in role_counters:
|
||||||
yield nagiosplugin.Metric(
|
yield nagiosplugin.Metric(
|
||||||
f"role_{role}", role_counters[role], context="members_roles"
|
f"role_{role}", role_counters[role], context="member_roles"
|
||||||
)
|
)
|
||||||
|
|
||||||
# The performance data : statuses (except running)
|
# The performance data : statuses (except running)
|
||||||
for state in status_counters:
|
for state in status_counters:
|
||||||
if state != "running":
|
if state != "running":
|
||||||
yield nagiosplugin.Metric(
|
yield nagiosplugin.Metric(
|
||||||
f"state_{state}", status_counters[state], context="members_statuses"
|
f"state_{state}", status_counters[state], context="member_statuses"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@ -111,7 +111,7 @@ class ClusterHasReplica(PatroniResource):
|
||||||
# The actual check
|
# The actual check
|
||||||
yield nagiosplugin.Metric("healthy_replica", healthy_replica)
|
yield nagiosplugin.Metric("healthy_replica", healthy_replica)
|
||||||
|
|
||||||
# The performance data : unheakthy replica count, replicas lag
|
# The performance data : unhealthy replica count, replicas lag
|
||||||
yield nagiosplugin.Metric("unhealthy_replica", unhealthy_replica)
|
yield nagiosplugin.Metric("unhealthy_replica", unhealthy_replica)
|
||||||
for replica in replicas:
|
for replica in replicas:
|
||||||
yield nagiosplugin.Metric(
|
yield nagiosplugin.Metric(
|
||||||
|
@ -165,6 +165,7 @@ class ClusterConfigHasChanged(PatroniResource):
|
||||||
|
|
||||||
|
|
||||||
class ClusterConfigHasChangedSummary(nagiosplugin.Summary):
|
class ClusterConfigHasChangedSummary(nagiosplugin.Summary):
|
||||||
|
# TODO: It would be helpful to display the old / new hash here, but it's not a metric.
|
||||||
def ok(self: "ClusterConfigHasChangedSummary", results: nagiosplugin.Result) -> str:
|
def ok(self: "ClusterConfigHasChangedSummary", results: nagiosplugin.Result) -> str:
|
||||||
return "The hash of patroni's dynamic configuration has not changed."
|
return "The hash of patroni's dynamic configuration has not changed."
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue