Add info and options about sync standby to cluster_has_replica
* Add `--sync-warning` and `--sync-critical`
* Add `sync_replica` to track the number of sync replicas in the perf data
* Add `MEMBER_sync` to track if a member is a sync replica in the perf data
parent b9fbdfdefd
commit ee3837fab1
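The diff below touches the changelog, the README help text, the click command, the Patroni resource, and the tests. As a quick orientation, here is a minimal sketch of how the new options are driven through the click entry point, the same way the new tests further down do it (the import path is an assumption, and the endpoint is the placeholder URL used by the test suite, not a real host):

```
# Minimal sketch, not part of this commit: drive the new options through
# the click entry point the way the tests below do. The import path is an
# assumption; the endpoint is the placeholder URL from the test fixtures.
from click.testing import CliRunner

from check_patroni.cli import main  # assumed module path

runner = CliRunner()
result = runner.invoke(
    main,
    [
        "-e", "https://10.20.199.3:8008",
        "cluster_has_replica",
        "--sync-warning", "2:",   # WARNING when fewer than 2 sync replicas
        "--sync-critical", "1:",  # CRITICAL when fewer than 1 sync replica
    ],
)
print(result.stdout)  # perfdata would include e.g. "sync_replica=1;2:;1:"
```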
@@ -5,6 +5,7 @@
 ### Added

 * Add `sync_standby` as a valid replica type for `cluster_has_replica`. (contributed by @mattpoel)
+* Add info and options (`--sync-warning` and `--sync-critical`) about sync replicas to `cluster_has_replica`.
 * Add a new service `cluster_has_scheduled_action` to warn of any scheduled switchover or restart.
 * Add options to `node_is_replica` to check specifically for a synchronous (`--is-sync`) or asynchronous node (`--is-async`).
 * Add `standby-leader` as a valid leader type for `cluster_has_leader`.
README.md (21 changed lines)
@@ -37,7 +37,7 @@ Options:
 Commands:
   cluster_config_has_changed    Check if the hash of the configuration...
   cluster_has_leader            Check if the cluster has a leader.
-  cluster_has_replica           Check if the cluster has healthy replicas.
+  cluster_has_replica           Check if the cluster has healthy replicas...
   cluster_has_scheduled_action  Check if the cluster has a scheduled...
   cluster_is_in_maintenance     Check if the cluster is in maintenance...
   cluster_node_count            Count the number of nodes in the cluster.
@@ -223,7 +223,7 @@ Options:
 ```
 Usage: check_patroni cluster_has_replica [OPTIONS]

-  Check if the cluster has healthy replicas.
+  Check if the cluster has healthy replicas and/or if some are sync standbys.

   A healthy replica:
   * is in running or streaming state (V3.0.4)
@@ -232,19 +232,24 @@ Usage: check_patroni cluster_has_replica [OPTIONS]
   Check:
   * `OK`: if the healthy_replica count and their lag are compatible with the replica count threshold.
+    and if the sync_replica count is compatible with the sync replica count threshold.
   * `WARNING` / `CRITICAL`: otherwise

   Perfdata:
   * healthy_replica & unhealthy_replica count
+  * the number of sync_replica; they are included in the previous count
   * the lag of each replica labelled with "member name"_lag
+  * a boolean to tell if the node is a sync standby, labelled with "member name"_sync

 Options:
-  -w, --warning TEXT    Warning threshold for the number of healthy replica
-                        nodes.
-  -c, --critical TEXT   Critical threshold for the number of healthy replica
-                        nodes.
-  --max-lag TEXT        maximum allowed lag
-  --help                Show this message and exit.
+  -w, --warning TEXT     Warning threshold for the number of healthy replica
+                         nodes.
+  -c, --critical TEXT    Critical threshold for the number of healthy replica
+                         nodes.
+  --sync-warning TEXT    Warning threshold for the number of sync replicas.
+  --sync-critical TEXT   Critical threshold for the number of sync replicas.
+  --max-lag TEXT         maximum allowed lag
+  --help                 Show this message and exit.
 ```

 ### cluster_has_scheduled_action
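A note on the `TEXT` thresholds above: they are standard Nagios range strings, which nagiosplugin parses. A hedged illustration of the two forms used throughout this commit and its tests (assuming nagiosplugin's `Range.match()` returns `True` when the value raises no alert):

```
# Hedged illustration of Nagios range strings, as used by --sync-warning,
# --sync-critical and the existing -w/-c options. Assumes nagiosplugin's
# Range.match() returns True when the value is acceptable (no alert).
import nagiosplugin

at_least_one = nagiosplugin.Range("1:")  # "1:" alerts when the value is below 1
print(at_least_one.match(0))  # False -> 0 sync replicas would alert
print(at_least_one.match(2))  # True  -> 2 sync replicas are fine

inverted = nagiosplugin.Range("@0:1")    # "@" inverts: alert *inside* 0..1
print(inverted.match(1))      # False -> 1 is inside the range, so it alerts
print(inverted.match(2))      # True  -> 2 is outside, no alert
```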
@@ -316,13 +316,30 @@ def cluster_has_leader(ctx: click.Context) -> None:
     type=str,
     help="Critical threshold for the number of healthy replica nodes.",
 )
+@click.option(
+    "--sync-warning",
+    "sync_warning",
+    type=str,
+    help="Warning threshold for the number of sync replicas.",
+)
+@click.option(
+    "--sync-critical",
+    "sync_critical",
+    type=str,
+    help="Critical threshold for the number of sync replicas.",
+)
 @click.option("--max-lag", "max_lag", type=str, help="maximum allowed lag")
 @click.pass_context
 @nagiosplugin.guarded
 def cluster_has_replica(
-    ctx: click.Context, warning: str, critical: str, max_lag: str
+    ctx: click.Context,
+    warning: str,
+    critical: str,
+    sync_warning: str,
+    sync_critical: str,
+    max_lag: str,
 ) -> None:
-    """Check if the cluster has healthy replicas.
+    """Check if the cluster has healthy replicas and/or if some are sync standbys.

     \b
     A healthy replica:
@@ -333,12 +350,15 @@ def cluster_has_replica(
     \b
     Check:
     * `OK`: if the healthy_replica count and their lag are compatible with the replica count threshold.
+      and if the sync_replica count is compatible with the sync replica count threshold.
     * `WARNING` / `CRITICAL`: otherwise

     \b
     Perfdata:
     * healthy_replica & unhealthy_replica count
+    * the number of sync_replica; they are included in the previous count
     * the lag of each replica labelled with "member name"_lag
+    * a boolean to tell if the node is a sync standby, labelled with "member name"_sync
     """

     tmax_lag = size_to_byte(max_lag) if max_lag is not None else None
@@ -350,8 +370,14 @@ def cluster_has_replica(
             warning,
             critical,
         ),
+        nagiosplugin.ScalarContext(
+            "sync_replica",
+            sync_warning,
+            sync_critical,
+        ),
         nagiosplugin.ScalarContext("unhealthy_replica"),
         nagiosplugin.ScalarContext("replica_lag"),
+        nagiosplugin.ScalarContext("replica_sync"),
     )
     check.main(verbose=ctx.obj.verbose, timeout=ctx.obj.timeout)
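The wiring above relies on a nagiosplugin convention: a `ScalarContext` is matched to a `Metric` by name, which is how `sync_warning`/`sync_critical` end up evaluated against the `sync_replica` metric yielded by the resource. A stripped-down sketch of that pairing (toy resource, not the real Patroni check):

```
# Stripped-down sketch of the Metric/ScalarContext pairing used above.
# Toy resource only; the real check probes the Patroni API instead.
import nagiosplugin

class ToySyncCount(nagiosplugin.Resource):
    def probe(self):
        # Pretend the cluster currently has no sync standby.
        yield nagiosplugin.Metric("sync_replica", 0)

check = nagiosplugin.Check(
    ToySyncCount(),
    # The context name must equal the metric name to bind the thresholds.
    nagiosplugin.ScalarContext("sync_replica", "2:", "1:"),
)
check.main()  # reports CRITICAL here: 0 is outside the critical range "1:"
```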
@@ -92,6 +92,7 @@ class ClusterHasReplica(PatroniResource):
         replicas = []
         healthy_replica = 0
         unhealthy_replica = 0
+        sync_replica = 0
         for member in item_dict["members"]:
             # FIXME are there other acceptable states
             if member["role"] in ["replica", "sync_standby"]:
@@ -100,7 +101,17 @@ class ClusterHasReplica(PatroniResource):
                     member["state"] in ["running", "streaming"]
                     and member["lag"] != "unknown"
                 ):
-                    replicas.append({"name": member["name"], "lag": member["lag"]})
+                    replicas.append(
+                        {
+                            "name": member["name"],
+                            "lag": member["lag"],
+                            "sync": 1 if member["role"] == "sync_standby" else 0,
+                        }
+                    )
+
+                    if member["role"] == "sync_standby":
+                        sync_replica += 1
+
                     if self.max_lag is None or self.max_lag >= int(member["lag"]):
                         healthy_replica += 1
                         continue
@@ -108,6 +119,7 @@ class ClusterHasReplica(PatroniResource):

         # The actual check
         yield nagiosplugin.Metric("healthy_replica", healthy_replica)
+        yield nagiosplugin.Metric("sync_replica", sync_replica)

         # The performance data : unhealthy replica count, replicas lag
         yield nagiosplugin.Metric("unhealthy_replica", unhealthy_replica)
@@ -115,6 +127,9 @@ class ClusterHasReplica(PatroniResource):
             yield nagiosplugin.Metric(
                 f"{replica['name']}_lag", replica["lag"], context="replica_lag"
             )
+            yield nagiosplugin.Metric(
+                f"{replica['name']}_sync", replica["sync"], context="replica_sync"
+            )


 # FIXME is this needed ??
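To see what the new bookkeeping computes, here is the same counting applied to a toy members list (a sketch only; field names follow the Patroni cluster API as used above, and srv2/srv3 mirror the test fixtures below):

```
# Toy walk-through of the counting added above, not the real resource code.
members = [
    {"name": "srv2", "role": "replica", "state": "streaming", "lag": 0},
    {"name": "srv3", "role": "sync_standby", "state": "streaming", "lag": 0},
]

# One counter for the sync_replica metric ...
sync_replica = sum(1 for m in members if m["role"] == "sync_standby")
# ... and one 0/1 flag per member for the MEMBER_sync perfdata.
sync_flags = {f"{m['name']}_sync": int(m["role"] == "sync_standby") for m in members}

print(sync_replica)  # 1
print(sync_flags)    # {'srv2_sync': 0, 'srv3_sync': 1}
```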
@@ -19,7 +19,7 @@ def test_cluster_has_relica_ok(
     assert result.exit_code == 0
     assert (
         result.stdout
-        == "CLUSTERHASREPLICA OK - healthy_replica is 2 | healthy_replica=2 srv2_lag=0 srv3_lag=0 unhealthy_replica=0\n"
+        == "CLUSTERHASREPLICA OK - healthy_replica is 2 | healthy_replica=2 srv2_lag=0 srv2_sync=0 srv3_lag=0 srv3_sync=1 sync_replica=1 unhealthy_replica=0\n"
     )


@@ -44,7 +44,30 @@ def test_cluster_has_replica_ok_with_count_thresholds(
     assert result.exit_code == 0
     assert (
         result.stdout
-        == "CLUSTERHASREPLICA OK - healthy_replica is 2 | healthy_replica=2;@1;@0 srv2_lag=0 srv3_lag=0 unhealthy_replica=0\n"
+        == "CLUSTERHASREPLICA OK - healthy_replica is 2 | healthy_replica=2;@1;@0 srv2_lag=0 srv2_sync=0 srv3_lag=0 srv3_sync=1 sync_replica=1 unhealthy_replica=0\n"
     )


+def test_cluster_has_replica_ok_with_sync_count_thresholds(
+    mocker: MockerFixture, use_old_replica_state: bool
+) -> None:
+    runner = CliRunner()
+
+    my_mock(mocker, "cluster_has_replica_ok", 200, use_old_replica_state)
+    result = runner.invoke(
+        main,
+        [
+            "-e",
+            "https://10.20.199.3:8008",
+            "cluster_has_replica",
+            "--sync-warning",
+            "1:",
+        ],
+    )
+    assert result.exit_code == 0
+    assert (
+        result.stdout
+        == "CLUSTERHASREPLICA OK - healthy_replica is 2 | healthy_replica=2 srv2_lag=0 srv2_sync=0 srv3_lag=0 srv3_sync=1 sync_replica=1;1: unhealthy_replica=0\n"
+    )
+
+
@@ -72,7 +95,7 @@ def test_cluster_has_replica_ok_with_count_thresholds_lag(
     assert result.exit_code == 0
     assert (
         result.stdout
-        == "CLUSTERHASREPLICA OK - healthy_replica is 2 | healthy_replica=2;@1;@0 srv2_lag=1024 srv3_lag=0 unhealthy_replica=0\n"
+        == "CLUSTERHASREPLICA OK - healthy_replica is 2 | healthy_replica=2;@1;@0 srv2_lag=1024 srv2_sync=0 srv3_lag=0 srv3_sync=0 sync_replica=0 unhealthy_replica=0\n"
     )


@@ -97,7 +120,32 @@ def test_cluster_has_replica_ko_with_count_thresholds(
     assert result.exit_code == 1
     assert (
         result.stdout
-        == "CLUSTERHASREPLICA WARNING - healthy_replica is 1 (outside range @0:1) | healthy_replica=1;@1;@0 srv3_lag=0 unhealthy_replica=1\n"
+        == "CLUSTERHASREPLICA WARNING - healthy_replica is 1 (outside range @0:1) | healthy_replica=1;@1;@0 srv3_lag=0 srv3_sync=0 sync_replica=0 unhealthy_replica=1\n"
     )


+def test_cluster_has_replica_ko_with_sync_count_thresholds(
+    mocker: MockerFixture, use_old_replica_state: bool
+) -> None:
+    runner = CliRunner()
+
+    my_mock(mocker, "cluster_has_replica_ko", 200, use_old_replica_state)
+    result = runner.invoke(
+        main,
+        [
+            "-e",
+            "https://10.20.199.3:8008",
+            "cluster_has_replica",
+            "--sync-warning",
+            "2:",
+            "--sync-critical",
+            "1:",
+        ],
+    )
+    assert result.exit_code == 2
+    assert (
+        result.stdout
+        == "CLUSTERHASREPLICA CRITICAL - sync_replica is 0 (outside range 1:) | healthy_replica=1 srv3_lag=0 srv3_sync=0 sync_replica=0;2:;1: unhealthy_replica=1\n"
+    )
+
+
@@ -125,5 +173,5 @@ def test_cluster_has_replica_ko_with_count_thresholds_and_lag(
     assert result.exit_code == 2
     assert (
         result.stdout
-        == "CLUSTERHASREPLICA CRITICAL - healthy_replica is 0 (outside range @0:0) | healthy_replica=0;@1;@0 srv2_lag=10241024 srv3_lag=20000000 unhealthy_replica=2\n"
+        == "CLUSTERHASREPLICA CRITICAL - healthy_replica is 0 (outside range @0:0) | healthy_replica=0;@1;@0 srv2_lag=10241024 srv2_sync=0 srv3_lag=20000000 srv3_sync=0 sync_replica=0 unhealthy_replica=2\n"
     )
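The expected strings in these tests follow the standard Nagios plugin output format: a human-readable status, a `|` separator, then space-separated `label=value[;warn[;crit]]` tokens, which is why the configured ranges reappear in the perfdata (`sync_replica=0;2:;1:`). A hypothetical snippet, just to decode one such token:

```
# Hypothetical decoding of one perfdata token from the tests above;
# not part of the commit.
token = "sync_replica=0;2:;1:"
label, _, rest = token.partition("=")
value, *thresholds = rest.split(";")
print(label, value, thresholds)  # sync_replica 0 ['2:', '1:']
```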