Add info and options about sync standby to cluster_has_replica
* Add `--sync-warning` and `--sync-critical` * Add `sync_replica` to track the number of sync replica in the perf data * Add `MEMBER-sync` to track if a member is a sync replica in the perf data
This commit is contained in:
parent
b9fbdfdefd
commit
ee3837fab1
|
@ -5,6 +5,7 @@
|
||||||
### Added
|
### Added
|
||||||
|
|
||||||
* Add `sync_standby` as a valid replica type for `cluster_has_replica`. (contributed by @mattpoel)
|
* Add `sync_standby` as a valid replica type for `cluster_has_replica`. (contributed by @mattpoel)
|
||||||
|
* Add info and options (`--sync-warning` and `--sync-critical`) about sync replica to `cluster_has_replica`.
|
||||||
* Add a new service `cluster_has_scheduled_action` to warn of any scheduled switchover or restart.
|
* Add a new service `cluster_has_scheduled_action` to warn of any scheduled switchover or restart.
|
||||||
* Add options to `node_is_replica` to check specifically for a synchronous (`--is-sync`) or asynchronous node (`--is-async`).
|
* Add options to `node_is_replica` to check specifically for a synchronous (`--is-sync`) or asynchronous node (`--is-async`).
|
||||||
* Add `standby-leader` as a valid leader type for `cluster_has_leader`.
|
* Add `standby-leader` as a valid leader type for `cluster_has_leader`.
|
||||||
|
|
21
README.md
21
README.md
|
@ -37,7 +37,7 @@ Options:
|
||||||
Commands:
|
Commands:
|
||||||
cluster_config_has_changed Check if the hash of the configuration...
|
cluster_config_has_changed Check if the hash of the configuration...
|
||||||
cluster_has_leader Check if the cluster has a leader.
|
cluster_has_leader Check if the cluster has a leader.
|
||||||
cluster_has_replica Check if the cluster has healthy replicas.
|
cluster_has_replica Check if the cluster has healthy replicas...
|
||||||
cluster_has_scheduled_action Check if the cluster has a scheduled...
|
cluster_has_scheduled_action Check if the cluster has a scheduled...
|
||||||
cluster_is_in_maintenance Check if the cluster is in maintenance...
|
cluster_is_in_maintenance Check if the cluster is in maintenance...
|
||||||
cluster_node_count Count the number of nodes in the cluster.
|
cluster_node_count Count the number of nodes in the cluster.
|
||||||
|
@ -223,7 +223,7 @@ Options:
|
||||||
```
|
```
|
||||||
Usage: check_patroni cluster_has_replica [OPTIONS]
|
Usage: check_patroni cluster_has_replica [OPTIONS]
|
||||||
|
|
||||||
Check if the cluster has healthy replicas.
|
Check if the cluster has healthy replicas and/or if some are sync standbies
|
||||||
|
|
||||||
A healthy replica:
|
A healthy replica:
|
||||||
* is in running or streaming state (V3.0.4)
|
* is in running or streaming state (V3.0.4)
|
||||||
|
@ -232,19 +232,24 @@ Usage: check_patroni cluster_has_replica [OPTIONS]
|
||||||
|
|
||||||
Check:
|
Check:
|
||||||
* `OK`: if the healthy_replica count and their lag are compatible with the replica count threshold.
|
* `OK`: if the healthy_replica count and their lag are compatible with the replica count threshold.
|
||||||
|
and if the sync_replica count is compatible with the sync replica count threshold.
|
||||||
* `WARNING` / `CRITICAL`: otherwise
|
* `WARNING` / `CRITICAL`: otherwise
|
||||||
|
|
||||||
Perfdata:
|
Perfdata:
|
||||||
* healthy_replica & unhealthy_replica count
|
* healthy_replica & unhealthy_replica count
|
||||||
|
* the number of sync_replica, they are included in the previous count
|
||||||
* the lag of each replica labelled with "member name"_lag
|
* the lag of each replica labelled with "member name"_lag
|
||||||
|
* a boolean to tell if the node is a sync standby labelled with "member name"_sync
|
||||||
|
|
||||||
Options:
|
Options:
|
||||||
-w, --warning TEXT Warning threshold for the number of healthy replica
|
-w, --warning TEXT Warning threshold for the number of healthy replica
|
||||||
nodes.
|
nodes.
|
||||||
-c, --critical TEXT Critical threshold for the number of healthy replica
|
-c, --critical TEXT Critical threshold for the number of healthy replica
|
||||||
nodes.
|
nodes.
|
||||||
--max-lag TEXT maximum allowed lag
|
--sync-warning TEXT Warning threshold for the number of sync replica.
|
||||||
--help Show this message and exit.
|
--sync-critical TEXT Critical threshold for the number of sync replica.
|
||||||
|
--max-lag TEXT maximum allowed lag
|
||||||
|
--help Show this message and exit.
|
||||||
```
|
```
|
||||||
|
|
||||||
### cluster_has_scheduled_action
|
### cluster_has_scheduled_action
|
||||||
|
|
|
@ -316,13 +316,30 @@ def cluster_has_leader(ctx: click.Context) -> None:
|
||||||
type=str,
|
type=str,
|
||||||
help="Critical threshold for the number of healthy replica nodes.",
|
help="Critical threshold for the number of healthy replica nodes.",
|
||||||
)
|
)
|
||||||
|
@click.option(
|
||||||
|
"--sync-warning",
|
||||||
|
"sync_warning",
|
||||||
|
type=str,
|
||||||
|
help="Warning threshold for the number of sync replica.",
|
||||||
|
)
|
||||||
|
@click.option(
|
||||||
|
"--sync-critical",
|
||||||
|
"sync_critical",
|
||||||
|
type=str,
|
||||||
|
help="Critical threshold for the number of sync replica.",
|
||||||
|
)
|
||||||
@click.option("--max-lag", "max_lag", type=str, help="maximum allowed lag")
|
@click.option("--max-lag", "max_lag", type=str, help="maximum allowed lag")
|
||||||
@click.pass_context
|
@click.pass_context
|
||||||
@nagiosplugin.guarded
|
@nagiosplugin.guarded
|
||||||
def cluster_has_replica(
|
def cluster_has_replica(
|
||||||
ctx: click.Context, warning: str, critical: str, max_lag: str
|
ctx: click.Context,
|
||||||
|
warning: str,
|
||||||
|
critical: str,
|
||||||
|
sync_warning: str,
|
||||||
|
sync_critical: str,
|
||||||
|
max_lag: str,
|
||||||
) -> None:
|
) -> None:
|
||||||
"""Check if the cluster has healthy replicas.
|
"""Check if the cluster has healthy replicas and/or if some are sync standbies
|
||||||
|
|
||||||
\b
|
\b
|
||||||
A healthy replica:
|
A healthy replica:
|
||||||
|
@ -333,12 +350,15 @@ def cluster_has_replica(
|
||||||
\b
|
\b
|
||||||
Check:
|
Check:
|
||||||
* `OK`: if the healthy_replica count and their lag are compatible with the replica count threshold.
|
* `OK`: if the healthy_replica count and their lag are compatible with the replica count threshold.
|
||||||
|
and if the sync_replica count is compatible with the sync replica count threshold.
|
||||||
* `WARNING` / `CRITICAL`: otherwise
|
* `WARNING` / `CRITICAL`: otherwise
|
||||||
|
|
||||||
\b
|
\b
|
||||||
Perfdata:
|
Perfdata:
|
||||||
* healthy_replica & unhealthy_replica count
|
* healthy_replica & unhealthy_replica count
|
||||||
|
* the number of sync_replica, they are included in the previous count
|
||||||
* the lag of each replica labelled with "member name"_lag
|
* the lag of each replica labelled with "member name"_lag
|
||||||
|
* a boolean to tell if the node is a sync standby labelled with "member name"_sync
|
||||||
"""
|
"""
|
||||||
|
|
||||||
tmax_lag = size_to_byte(max_lag) if max_lag is not None else None
|
tmax_lag = size_to_byte(max_lag) if max_lag is not None else None
|
||||||
|
@ -350,8 +370,14 @@ def cluster_has_replica(
|
||||||
warning,
|
warning,
|
||||||
critical,
|
critical,
|
||||||
),
|
),
|
||||||
|
nagiosplugin.ScalarContext(
|
||||||
|
"sync_replica",
|
||||||
|
sync_warning,
|
||||||
|
sync_critical,
|
||||||
|
),
|
||||||
nagiosplugin.ScalarContext("unhealthy_replica"),
|
nagiosplugin.ScalarContext("unhealthy_replica"),
|
||||||
nagiosplugin.ScalarContext("replica_lag"),
|
nagiosplugin.ScalarContext("replica_lag"),
|
||||||
|
nagiosplugin.ScalarContext("replica_sync"),
|
||||||
)
|
)
|
||||||
check.main(verbose=ctx.obj.verbose, timeout=ctx.obj.timeout)
|
check.main(verbose=ctx.obj.verbose, timeout=ctx.obj.timeout)
|
||||||
|
|
||||||
|
|
|
@ -92,6 +92,7 @@ class ClusterHasReplica(PatroniResource):
|
||||||
replicas = []
|
replicas = []
|
||||||
healthy_replica = 0
|
healthy_replica = 0
|
||||||
unhealthy_replica = 0
|
unhealthy_replica = 0
|
||||||
|
sync_replica = 0
|
||||||
for member in item_dict["members"]:
|
for member in item_dict["members"]:
|
||||||
# FIXME are there other acceptable states
|
# FIXME are there other acceptable states
|
||||||
if member["role"] in ["replica", "sync_standby"]:
|
if member["role"] in ["replica", "sync_standby"]:
|
||||||
|
@ -100,7 +101,17 @@ class ClusterHasReplica(PatroniResource):
|
||||||
member["state"] in ["running", "streaming"]
|
member["state"] in ["running", "streaming"]
|
||||||
and member["lag"] != "unknown"
|
and member["lag"] != "unknown"
|
||||||
):
|
):
|
||||||
replicas.append({"name": member["name"], "lag": member["lag"]})
|
replicas.append(
|
||||||
|
{
|
||||||
|
"name": member["name"],
|
||||||
|
"lag": member["lag"],
|
||||||
|
"sync": 1 if member["role"] == "sync_standby" else 0,
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
if member["role"] == "sync_standby":
|
||||||
|
sync_replica += 1
|
||||||
|
|
||||||
if self.max_lag is None or self.max_lag >= int(member["lag"]):
|
if self.max_lag is None or self.max_lag >= int(member["lag"]):
|
||||||
healthy_replica += 1
|
healthy_replica += 1
|
||||||
continue
|
continue
|
||||||
|
@ -108,6 +119,7 @@ class ClusterHasReplica(PatroniResource):
|
||||||
|
|
||||||
# The actual check
|
# The actual check
|
||||||
yield nagiosplugin.Metric("healthy_replica", healthy_replica)
|
yield nagiosplugin.Metric("healthy_replica", healthy_replica)
|
||||||
|
yield nagiosplugin.Metric("sync_replica", sync_replica)
|
||||||
|
|
||||||
# The performance data : unhealthy replica count, replicas lag
|
# The performance data : unhealthy replica count, replicas lag
|
||||||
yield nagiosplugin.Metric("unhealthy_replica", unhealthy_replica)
|
yield nagiosplugin.Metric("unhealthy_replica", unhealthy_replica)
|
||||||
|
@ -115,6 +127,9 @@ class ClusterHasReplica(PatroniResource):
|
||||||
yield nagiosplugin.Metric(
|
yield nagiosplugin.Metric(
|
||||||
f"{replica['name']}_lag", replica["lag"], context="replica_lag"
|
f"{replica['name']}_lag", replica["lag"], context="replica_lag"
|
||||||
)
|
)
|
||||||
|
yield nagiosplugin.Metric(
|
||||||
|
f"{replica['name']}_sync", replica["sync"], context="replica_sync"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
# FIXME is this needed ??
|
# FIXME is this needed ??
|
||||||
|
|
|
@ -19,7 +19,7 @@ def test_cluster_has_relica_ok(
|
||||||
assert result.exit_code == 0
|
assert result.exit_code == 0
|
||||||
assert (
|
assert (
|
||||||
result.stdout
|
result.stdout
|
||||||
== "CLUSTERHASREPLICA OK - healthy_replica is 2 | healthy_replica=2 srv2_lag=0 srv3_lag=0 unhealthy_replica=0\n"
|
== "CLUSTERHASREPLICA OK - healthy_replica is 2 | healthy_replica=2 srv2_lag=0 srv2_sync=0 srv3_lag=0 srv3_sync=1 sync_replica=1 unhealthy_replica=0\n"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@ -44,7 +44,30 @@ def test_cluster_has_replica_ok_with_count_thresholds(
|
||||||
assert result.exit_code == 0
|
assert result.exit_code == 0
|
||||||
assert (
|
assert (
|
||||||
result.stdout
|
result.stdout
|
||||||
== "CLUSTERHASREPLICA OK - healthy_replica is 2 | healthy_replica=2;@1;@0 srv2_lag=0 srv3_lag=0 unhealthy_replica=0\n"
|
== "CLUSTERHASREPLICA OK - healthy_replica is 2 | healthy_replica=2;@1;@0 srv2_lag=0 srv2_sync=0 srv3_lag=0 srv3_sync=1 sync_replica=1 unhealthy_replica=0\n"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def test_cluster_has_replica_ok_with_sync_count_thresholds(
|
||||||
|
mocker: MockerFixture, use_old_replica_state: bool
|
||||||
|
) -> None:
|
||||||
|
runner = CliRunner()
|
||||||
|
|
||||||
|
my_mock(mocker, "cluster_has_replica_ok", 200, use_old_replica_state)
|
||||||
|
result = runner.invoke(
|
||||||
|
main,
|
||||||
|
[
|
||||||
|
"-e",
|
||||||
|
"https://10.20.199.3:8008",
|
||||||
|
"cluster_has_replica",
|
||||||
|
"--sync-warning",
|
||||||
|
"1:",
|
||||||
|
],
|
||||||
|
)
|
||||||
|
assert result.exit_code == 0
|
||||||
|
assert (
|
||||||
|
result.stdout
|
||||||
|
== "CLUSTERHASREPLICA OK - healthy_replica is 2 | healthy_replica=2 srv2_lag=0 srv2_sync=0 srv3_lag=0 srv3_sync=1 sync_replica=1;1: unhealthy_replica=0\n"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@ -72,7 +95,7 @@ def test_cluster_has_replica_ok_with_count_thresholds_lag(
|
||||||
assert result.exit_code == 0
|
assert result.exit_code == 0
|
||||||
assert (
|
assert (
|
||||||
result.stdout
|
result.stdout
|
||||||
== "CLUSTERHASREPLICA OK - healthy_replica is 2 | healthy_replica=2;@1;@0 srv2_lag=1024 srv3_lag=0 unhealthy_replica=0\n"
|
== "CLUSTERHASREPLICA OK - healthy_replica is 2 | healthy_replica=2;@1;@0 srv2_lag=1024 srv2_sync=0 srv3_lag=0 srv3_sync=0 sync_replica=0 unhealthy_replica=0\n"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@ -97,7 +120,32 @@ def test_cluster_has_replica_ko_with_count_thresholds(
|
||||||
assert result.exit_code == 1
|
assert result.exit_code == 1
|
||||||
assert (
|
assert (
|
||||||
result.stdout
|
result.stdout
|
||||||
== "CLUSTERHASREPLICA WARNING - healthy_replica is 1 (outside range @0:1) | healthy_replica=1;@1;@0 srv3_lag=0 unhealthy_replica=1\n"
|
== "CLUSTERHASREPLICA WARNING - healthy_replica is 1 (outside range @0:1) | healthy_replica=1;@1;@0 srv3_lag=0 srv3_sync=0 sync_replica=0 unhealthy_replica=1\n"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def test_cluster_has_replica_ko_with_sync_count_thresholds(
|
||||||
|
mocker: MockerFixture, use_old_replica_state: bool
|
||||||
|
) -> None:
|
||||||
|
runner = CliRunner()
|
||||||
|
|
||||||
|
my_mock(mocker, "cluster_has_replica_ko", 200, use_old_replica_state)
|
||||||
|
result = runner.invoke(
|
||||||
|
main,
|
||||||
|
[
|
||||||
|
"-e",
|
||||||
|
"https://10.20.199.3:8008",
|
||||||
|
"cluster_has_replica",
|
||||||
|
"--sync-warning",
|
||||||
|
"2:",
|
||||||
|
"--sync-critical",
|
||||||
|
"1:",
|
||||||
|
],
|
||||||
|
)
|
||||||
|
assert result.exit_code == 2
|
||||||
|
assert (
|
||||||
|
result.stdout
|
||||||
|
== "CLUSTERHASREPLICA CRITICAL - sync_replica is 0 (outside range 1:) | healthy_replica=1 srv3_lag=0 srv3_sync=0 sync_replica=0;2:;1: unhealthy_replica=1\n"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@ -125,5 +173,5 @@ def test_cluster_has_replica_ko_with_count_thresholds_and_lag(
|
||||||
assert result.exit_code == 2
|
assert result.exit_code == 2
|
||||||
assert (
|
assert (
|
||||||
result.stdout
|
result.stdout
|
||||||
== "CLUSTERHASREPLICA CRITICAL - healthy_replica is 0 (outside range @0:0) | healthy_replica=0;@1;@0 srv2_lag=10241024 srv3_lag=20000000 unhealthy_replica=2\n"
|
== "CLUSTERHASREPLICA CRITICAL - healthy_replica is 0 (outside range @0:0) | healthy_replica=0;@1;@0 srv2_lag=10241024 srv2_sync=0 srv3_lag=20000000 srv3_sync=0 sync_replica=0 unhealthy_replica=2\n"
|
||||||
)
|
)
|
||||||
|
|
Loading…
Reference in a new issue