Add info and options about sync standby to cluster_has_replica

* Add `--sync-warning` and `--sync-critical`
* Add `sync_replica` to track the number of sync replica in the perf data
* Add `MEMBER-sync` to track if a member is a sync replica in the perf data
This commit is contained in:
benoit 2023-08-24 15:43:35 +02:00 committed by Benoit
parent b9fbdfdefd
commit ee3837fab1
5 changed files with 111 additions and 16 deletions

View file

@ -5,6 +5,7 @@
### Added
* Add `sync_standby` as a valid replica type for `cluster_has_replica`. (contributed by @mattpoel)
* Add info and options (`--sync-warning` and `--sync-critical`) about sync replica to `cluster_has_replica`.
* Add a new service `cluster_has_scheduled_action` to warn of any scheduled switchover or restart.
* Add options to `node_is_replica` to check specifically for a synchronous (`--is-sync`) or asynchronous node (`--is-async`).
* Add `standby-leader` as a valid leader type for `cluster_has_leader`.

View file

@ -37,7 +37,7 @@ Options:
Commands:
cluster_config_has_changed Check if the hash of the configuration...
cluster_has_leader Check if the cluster has a leader.
cluster_has_replica Check if the cluster has healthy replicas.
cluster_has_replica Check if the cluster has healthy replicas...
cluster_has_scheduled_action Check if the cluster has a scheduled...
cluster_is_in_maintenance Check if the cluster is in maintenance...
cluster_node_count Count the number of nodes in the cluster.
@ -223,7 +223,7 @@ Options:
```
Usage: check_patroni cluster_has_replica [OPTIONS]
Check if the cluster has healthy replicas.
Check if the cluster has healthy replicas and/or if some are sync standbies
A healthy replica:
* is in running or streaming state (V3.0.4)
@ -232,19 +232,24 @@ Usage: check_patroni cluster_has_replica [OPTIONS]
Check:
* `OK`: if the healthy_replica count and their lag are compatible with the replica count threshold.
and if the sync_replica count is compatible with the sync replica count threshold.
* `WARNING` / `CRITICAL`: otherwise
Perfdata:
* healthy_replica & unhealthy_replica count
* the number of sync_replica, they are included in the previous count
* the lag of each replica labelled with "member name"_lag
* a boolean to tell if the node is a sync standby labelled with "member name"_sync
Options:
-w, --warning TEXT Warning threshold for the number of healthy replica
nodes.
-c, --critical TEXT Critical threshold for the number of healthy replica
nodes.
--max-lag TEXT maximum allowed lag
--help Show this message and exit.
-w, --warning TEXT Warning threshold for the number of healthy replica
nodes.
-c, --critical TEXT Critical threshold for the number of healthy replica
nodes.
--sync-warning TEXT Warning threshold for the number of sync replica.
--sync-critical TEXT Critical threshold for the number of sync replica.
--max-lag TEXT maximum allowed lag
--help Show this message and exit.
```
### cluster_has_scheduled_action

View file

@ -316,13 +316,30 @@ def cluster_has_leader(ctx: click.Context) -> None:
type=str,
help="Critical threshold for the number of healthy replica nodes.",
)
@click.option(
"--sync-warning",
"sync_warning",
type=str,
help="Warning threshold for the number of sync replica.",
)
@click.option(
"--sync-critical",
"sync_critical",
type=str,
help="Critical threshold for the number of sync replica.",
)
@click.option("--max-lag", "max_lag", type=str, help="maximum allowed lag")
@click.pass_context
@nagiosplugin.guarded
def cluster_has_replica(
ctx: click.Context, warning: str, critical: str, max_lag: str
ctx: click.Context,
warning: str,
critical: str,
sync_warning: str,
sync_critical: str,
max_lag: str,
) -> None:
"""Check if the cluster has healthy replicas.
"""Check if the cluster has healthy replicas and/or if some are sync standbies
\b
A healthy replica:
@ -333,12 +350,15 @@ def cluster_has_replica(
\b
Check:
* `OK`: if the healthy_replica count and their lag are compatible with the replica count threshold.
and if the sync_replica count is compatible with the sync replica count threshold.
* `WARNING` / `CRITICAL`: otherwise
\b
Perfdata:
* healthy_replica & unhealthy_replica count
* the number of sync_replica, they are included in the previous count
* the lag of each replica labelled with "member name"_lag
* a boolean to tell if the node is a sync standby labelled with "member name"_sync
"""
tmax_lag = size_to_byte(max_lag) if max_lag is not None else None
@ -350,8 +370,14 @@ def cluster_has_replica(
warning,
critical,
),
nagiosplugin.ScalarContext(
"sync_replica",
sync_warning,
sync_critical,
),
nagiosplugin.ScalarContext("unhealthy_replica"),
nagiosplugin.ScalarContext("replica_lag"),
nagiosplugin.ScalarContext("replica_sync"),
)
check.main(verbose=ctx.obj.verbose, timeout=ctx.obj.timeout)

View file

@ -92,6 +92,7 @@ class ClusterHasReplica(PatroniResource):
replicas = []
healthy_replica = 0
unhealthy_replica = 0
sync_replica = 0
for member in item_dict["members"]:
# FIXME are there other acceptable states
if member["role"] in ["replica", "sync_standby"]:
@ -100,7 +101,17 @@ class ClusterHasReplica(PatroniResource):
member["state"] in ["running", "streaming"]
and member["lag"] != "unknown"
):
replicas.append({"name": member["name"], "lag": member["lag"]})
replicas.append(
{
"name": member["name"],
"lag": member["lag"],
"sync": 1 if member["role"] == "sync_standby" else 0,
}
)
if member["role"] == "sync_standby":
sync_replica += 1
if self.max_lag is None or self.max_lag >= int(member["lag"]):
healthy_replica += 1
continue
@ -108,6 +119,7 @@ class ClusterHasReplica(PatroniResource):
# The actual check
yield nagiosplugin.Metric("healthy_replica", healthy_replica)
yield nagiosplugin.Metric("sync_replica", sync_replica)
# The performance data : unhealthy replica count, replicas lag
yield nagiosplugin.Metric("unhealthy_replica", unhealthy_replica)
@ -115,6 +127,9 @@ class ClusterHasReplica(PatroniResource):
yield nagiosplugin.Metric(
f"{replica['name']}_lag", replica["lag"], context="replica_lag"
)
yield nagiosplugin.Metric(
f"{replica['name']}_sync", replica["sync"], context="replica_sync"
)
# FIXME is this needed ??

View file

@ -19,7 +19,7 @@ def test_cluster_has_relica_ok(
assert result.exit_code == 0
assert (
result.stdout
== "CLUSTERHASREPLICA OK - healthy_replica is 2 | healthy_replica=2 srv2_lag=0 srv3_lag=0 unhealthy_replica=0\n"
== "CLUSTERHASREPLICA OK - healthy_replica is 2 | healthy_replica=2 srv2_lag=0 srv2_sync=0 srv3_lag=0 srv3_sync=1 sync_replica=1 unhealthy_replica=0\n"
)
@ -44,7 +44,30 @@ def test_cluster_has_replica_ok_with_count_thresholds(
assert result.exit_code == 0
assert (
result.stdout
== "CLUSTERHASREPLICA OK - healthy_replica is 2 | healthy_replica=2;@1;@0 srv2_lag=0 srv3_lag=0 unhealthy_replica=0\n"
== "CLUSTERHASREPLICA OK - healthy_replica is 2 | healthy_replica=2;@1;@0 srv2_lag=0 srv2_sync=0 srv3_lag=0 srv3_sync=1 sync_replica=1 unhealthy_replica=0\n"
)
def test_cluster_has_replica_ok_with_sync_count_thresholds(
    mocker: MockerFixture, use_old_replica_state: bool
) -> None:
    """Check that `--sync-warning 1:` passes when one sync replica is reported.

    Uses the `cluster_has_replica_ok` fixture, for which the expected output
    shows `sync_replica=1`; the check must therefore exit with status 0 (OK)
    and echo the `1:` warning range in the `sync_replica` perfdata.
    """
    cli = CliRunner()
    my_mock(mocker, "cluster_has_replica_ok", 200, use_old_replica_state)

    cli_args = [
        "-e",
        "https://10.20.199.3:8008",
        "cluster_has_replica",
        "--sync-warning",
        "1:",
    ]
    expected_stdout = "CLUSTERHASREPLICA OK - healthy_replica is 2 | healthy_replica=2 srv2_lag=0 srv2_sync=0 srv3_lag=0 srv3_sync=1 sync_replica=1;1: unhealthy_replica=0\n"

    outcome = cli.invoke(main, cli_args)

    assert outcome.exit_code == 0
    assert outcome.stdout == expected_stdout
@ -72,7 +95,7 @@ def test_cluster_has_replica_ok_with_count_thresholds_lag(
assert result.exit_code == 0
assert (
result.stdout
== "CLUSTERHASREPLICA OK - healthy_replica is 2 | healthy_replica=2;@1;@0 srv2_lag=1024 srv3_lag=0 unhealthy_replica=0\n"
== "CLUSTERHASREPLICA OK - healthy_replica is 2 | healthy_replica=2;@1;@0 srv2_lag=1024 srv2_sync=0 srv3_lag=0 srv3_sync=0 sync_replica=0 unhealthy_replica=0\n"
)
@ -97,7 +120,32 @@ def test_cluster_has_replica_ko_with_count_thresholds(
assert result.exit_code == 1
assert (
result.stdout
== "CLUSTERHASREPLICA WARNING - healthy_replica is 1 (outside range @0:1) | healthy_replica=1;@1;@0 srv3_lag=0 unhealthy_replica=1\n"
== "CLUSTERHASREPLICA WARNING - healthy_replica is 1 (outside range @0:1) | healthy_replica=1;@1;@0 srv3_lag=0 srv3_sync=0 sync_replica=0 unhealthy_replica=1\n"
)
def test_cluster_has_replica_ko_with_sync_count_thresholds(
    mocker: MockerFixture, use_old_replica_state: bool
) -> None:
    """Check that the sync thresholds raise CRITICAL when no sync replica exists.

    Uses the `cluster_has_replica_ko` fixture, for which the expected output
    shows `sync_replica=0`. With `--sync-warning 2:` and `--sync-critical 1:`
    the critical range is violated, so the check must exit with status 2 and
    report `sync_replica` as the failing metric.
    """
    cli = CliRunner()
    my_mock(mocker, "cluster_has_replica_ko", 200, use_old_replica_state)

    cli_args = [
        "-e",
        "https://10.20.199.3:8008",
        "cluster_has_replica",
        "--sync-warning",
        "2:",
        "--sync-critical",
        "1:",
    ]
    expected_stdout = "CLUSTERHASREPLICA CRITICAL - sync_replica is 0 (outside range 1:) | healthy_replica=1 srv3_lag=0 srv3_sync=0 sync_replica=0;2:;1: unhealthy_replica=1\n"

    outcome = cli.invoke(main, cli_args)

    assert outcome.exit_code == 2
    assert outcome.stdout == expected_stdout
@ -125,5 +173,5 @@ def test_cluster_has_replica_ko_with_count_thresholds_and_lag(
assert result.exit_code == 2
assert (
result.stdout
== "CLUSTERHASREPLICA CRITICAL - healthy_replica is 0 (outside range @0:0) | healthy_replica=0;@1;@0 srv2_lag=10241024 srv3_lag=20000000 unhealthy_replica=2\n"
== "CLUSTERHASREPLICA CRITICAL - healthy_replica is 0 (outside range @0:0) | healthy_replica=0;@1;@0 srv2_lag=10241024 srv2_sync=0 srv3_lag=20000000 srv3_sync=0 sync_replica=0 unhealthy_replica=2\n"
)