--max-lag additions and fixups
* add --max-lag to cluster_has_replica * change --lag to --max-lag in node_is_replica * update tests * update README.md
This commit is contained in:
parent
2d37ed2d94
commit
d4e974da51
16
README.md
16
README.md
|
@ -21,7 +21,7 @@ Options:
|
|||
Commands:
|
||||
cluster_config_has_changed Check if the hash of the configuration has...
|
||||
cluster_has_leader Check if the cluster has a leader.
|
||||
cluster_has_replica Check if the cluster has replicas and their...
|
||||
cluster_has_replica Check if the cluster has healthy replicas.
|
||||
cluster_is_in_maintenance Check if the cluster is in maintenance mode...
|
||||
cluster_node_count Count the number of nodes in the cluster.
|
||||
node_is_alive Check if the node is alive ie patroni is...
|
||||
|
@ -78,21 +78,23 @@ Options:
|
|||
```
|
||||
Usage: check_patroni cluster_has_replica [OPTIONS]
|
||||
|
||||
Check if the cluster has replicas and their lag.
|
||||
Check if the cluster has healthy replicas.
|
||||
|
||||
A healthy replica: * is in running state * has a replica role * has a lag
|
||||
lower or equal to max_lag
|
||||
|
||||
Check:
|
||||
* `OK`: if the replica count and their lag are compatible with the replica count and lag thresholds.
|
||||
* `OK`: if the healthy_replica count and their lag are compatible with the replica count threshold.
|
||||
* `WARNING` / `CRITICAL`: otherwise
|
||||
|
||||
Perfdata :
|
||||
* replica count
|
||||
* healthy_replica & unhealthy_replica count
|
||||
* the lag of each replica labelled with "member name"_lag
|
||||
|
||||
Options:
|
||||
-w, --warning TEXT Warning threshold for the number of nodes.
|
||||
-c, --critical TEXT Critical threshold for the number of replica nodes.
|
||||
--lag-warning TEXT Warning threshold for the lag.
|
||||
--lag-critical TEXT Critical threshold for the lag.
|
||||
--max-lag TEXT maximum allowed lag
|
||||
--help Show this message and exit.
|
||||
```
|
||||
|
||||
|
@ -211,7 +213,7 @@ Usage: check_patroni node_is_replica [OPTIONS]
|
|||
noloadbalance tag and the lag is under the maximum threshold, 0 otherwise.
|
||||
|
||||
Options:
|
||||
--lag TEXT maximum allowed lag
|
||||
--max-lag TEXT maximum allowed lag
|
||||
--help Show this message and exit.
|
||||
```
|
||||
|
||||
|
|
|
@ -29,6 +29,7 @@ from .node import (
|
|||
NodeTLHasChangedSummary,
|
||||
)
|
||||
from .types import ConnectionInfo
|
||||
from .convert import size_to_byte
|
||||
|
||||
|
||||
def print_version(ctx: click.Context, param: str, value: str) -> None:
|
||||
|
@ -247,45 +248,41 @@ def cluster_has_leader(ctx: click.Context) -> None:
|
|||
type=str,
|
||||
help="Critical threshold for the number of replica nodes.",
|
||||
)
|
||||
@click.option(
|
||||
"--lag-warning", "lag_warning", type=str, help="Warning threshold for the lag."
|
||||
)
|
||||
# FIXME: how do we manage maximum_lag_on_failover without making many API calls?
|
||||
@click.option(
|
||||
"--lag-critical", "lag_critical", type=str, help="Critical threshold for the lag."
|
||||
)
|
||||
@click.option("--max-lag", "max_lag", type=str, help="maximum allowed lag")
|
||||
@click.pass_context
|
||||
@nagiosplugin.guarded
|
||||
def cluster_has_replica(
|
||||
ctx: click.Context, warning: str, critical: str, lag_warning: str, lag_critical: str
|
||||
ctx: click.Context, warning: str, critical: str, max_lag: str
|
||||
) -> None:
|
||||
"""Check if the cluster has replicas and their lag.
|
||||
"""Check if the cluster has healthy replicas.
|
||||
|
||||
A healthy replica:
|
||||
* is in running state
|
||||
* has a replica role
|
||||
* has a lag lower or equal to max_lag
|
||||
|
||||
\b
|
||||
Check:
|
||||
* `OK`: if the replica count and their lag are compatible with the replica count and lag thresholds.
|
||||
* `OK`: if the healthy_replica count and their lag are compatible with the replica count threshold.
|
||||
* `WARNING` / `CRITICAL`: otherwise
|
||||
|
||||
\b
|
||||
Perfdata :
|
||||
* replica count
|
||||
* healthy_replica & unhealthy_replica count
|
||||
* the lag of each replica labelled with "member name"_lag
|
||||
"""
|
||||
# FIXME: the idea here would be to make sure we have a replica.
|
||||
# lag should be checked to prune invalid replicas
|
||||
|
||||
tmax_lag = size_to_byte(max_lag) if max_lag is not None else None
|
||||
check = nagiosplugin.Check()
|
||||
check.add(
|
||||
ClusterHasReplica(ctx.obj),
|
||||
ClusterHasReplica(ctx.obj, tmax_lag),
|
||||
nagiosplugin.ScalarContext(
|
||||
"replica_count",
|
||||
"healthy_replica",
|
||||
warning,
|
||||
critical,
|
||||
),
|
||||
nagiosplugin.ScalarContext(
|
||||
"replica_lag",
|
||||
lag_warning,
|
||||
lag_critical,
|
||||
),
|
||||
nagiosplugin.ScalarContext("unhealthy_replica"),
|
||||
nagiosplugin.ScalarContext("replica_lag"),
|
||||
)
|
||||
check.main(
|
||||
verbose=ctx.parent.params["verbose"], timeout=ctx.parent.params["timeout"]
|
||||
|
@ -388,10 +385,10 @@ def node_is_primary(ctx: click.Context) -> None:
|
|||
|
||||
|
||||
@main.command(name="node_is_replica")
|
||||
@click.option("--lag", "lag", type=str, help="maximum allowed lag")
|
||||
@click.option("--max-lag", "max_lag", type=str, help="maximum allowed lag")
|
||||
@click.pass_context
|
||||
@nagiosplugin.guarded
|
||||
def node_is_replica(ctx: click.Context, lag: str) -> None:
|
||||
def node_is_replica(ctx: click.Context, max_lag: str) -> None:
|
||||
"""Check if the node is a running replica with no noloadbalance tag.
|
||||
|
||||
\b
|
||||
|
@ -404,9 +401,9 @@ def node_is_replica(ctx: click.Context, lag: str) -> None:
|
|||
# FIXME add a lag check ??
|
||||
check = nagiosplugin.Check()
|
||||
check.add(
|
||||
NodeIsReplica(ctx.obj, lag),
|
||||
NodeIsReplica(ctx.obj, max_lag),
|
||||
nagiosplugin.ScalarContext("is_replica", None, "@0:0"),
|
||||
NodeIsReplicaSummary(lag),
|
||||
NodeIsReplicaSummary(max_lag),
|
||||
)
|
||||
check.main(
|
||||
verbose=ctx.parent.params["verbose"], timeout=ctx.parent.params["timeout"]
|
||||
|
|
|
@ -3,7 +3,7 @@ import hashlib
|
|||
import json
|
||||
import logging
|
||||
import nagiosplugin
|
||||
from typing import Iterable
|
||||
from typing import Iterable, Union
|
||||
|
||||
from .types import PatroniResource, ConnectionInfo, handle_unknown
|
||||
|
||||
|
@ -81,6 +81,14 @@ class ClusterHasLeaderSummary(nagiosplugin.Summary):
|
|||
|
||||
|
||||
class ClusterHasReplica(PatroniResource):
|
||||
def __init__(
|
||||
self: "ClusterHasReplica",
|
||||
connection_info: ConnectionInfo,
|
||||
max_lag: Union[int, None],
|
||||
):
|
||||
super().__init__(connection_info)
|
||||
self.max_lag = max_lag
|
||||
|
||||
def probe(self: "ClusterHasReplica") -> Iterable[nagiosplugin.Metric]:
|
||||
r = self.rest_api("cluster")
|
||||
_log.debug(f"api call status: {r.status}")
|
||||
|
@ -88,17 +96,23 @@ class ClusterHasReplica(PatroniResource):
|
|||
|
||||
item_dict = json.loads(r.data)
|
||||
replicas = []
|
||||
healthy_replica = 0
|
||||
unhealthy_replica = 0
|
||||
for member in item_dict["members"]:
|
||||
# FIXME are there other acceptable states
|
||||
if member["role"] == "replica" and member["state"] == "running":
|
||||
# FIXME which lag ?
|
||||
if member["role"] == "replica":
|
||||
if member["state"] == "running" and member["lag"] != "unknown":
|
||||
replicas.append({"name": member["name"], "lag": member["lag"]})
|
||||
break
|
||||
if self.max_lag is None or self.max_lag >= int(member["lag"]):
|
||||
healthy_replica += 1
|
||||
continue
|
||||
unhealthy_replica += 1
|
||||
|
||||
# The actual check
|
||||
yield nagiosplugin.Metric("replica_count", len(replicas))
|
||||
yield nagiosplugin.Metric("healthy_replica", healthy_replica)
|
||||
|
||||
# The performance data : replicas lag
|
||||
# The performance data: unhealthy replica count, replicas lag
|
||||
yield nagiosplugin.Metric("unhealthy_replica", unhealthy_replica)
|
||||
for replica in replicas:
|
||||
yield nagiosplugin.Metric(
|
||||
f"{replica['name']}_lag", replica["lag"], context="replica_lag"
|
||||
|
|
|
@ -29,16 +29,16 @@ class NodeIsPrimarySummary(nagiosplugin.Summary):
|
|||
|
||||
class NodeIsReplica(PatroniResource):
|
||||
def __init__(
|
||||
self: "NodeIsReplica", connection_info: ConnectionInfo, lag: str
|
||||
self: "NodeIsReplica", connection_info: ConnectionInfo, max_lag: str
|
||||
) -> None:
|
||||
super().__init__(connection_info)
|
||||
self.lag = lag
|
||||
self.max_lag = max_lag
|
||||
|
||||
def probe(self: "NodeIsReplica") -> Iterable[nagiosplugin.Metric]:
|
||||
if self.lag is None:
|
||||
if self.max_lag is None:
|
||||
r = self.rest_api("replica")
|
||||
else:
|
||||
r = self.rest_api(f"replica?lag={self.lag}")
|
||||
r = self.rest_api(f"replica?lag={self.max_lag}")
|
||||
_log.debug(f"api call status: {r.status}")
|
||||
_log.debug(f"api call data: {r.data}")
|
||||
|
||||
|
|
|
@ -20,6 +20,48 @@ def test_cluster_has_relica_ok(mocker: MockerFixture) -> None:
|
|||
def test_cluster_has_replica_ok_with_count_thresholds(mocker: MockerFixture) -> None:
|
||||
runner = CliRunner()
|
||||
|
||||
my_mock(mocker, "cluster_has_replica_ok", 200)
|
||||
result = runner.invoke(
|
||||
main,
|
||||
[
|
||||
"-e",
|
||||
"https://10.20.199.3:8008",
|
||||
"cluster_has_replica",
|
||||
"--warning",
|
||||
"@1",
|
||||
"--critical",
|
||||
"@0",
|
||||
],
|
||||
)
|
||||
assert result.exit_code == 0
|
||||
|
||||
|
||||
def test_cluster_has_replica_ok_with_count_thresholds_lag(
|
||||
mocker: MockerFixture,
|
||||
) -> None:
|
||||
runner = CliRunner()
|
||||
|
||||
my_mock(mocker, "cluster_has_replica_ok_lag", 200)
|
||||
result = runner.invoke(
|
||||
main,
|
||||
[
|
||||
"-e",
|
||||
"https://10.20.199.3:8008",
|
||||
"cluster_has_replica",
|
||||
"--warning",
|
||||
"@1",
|
||||
"--critical",
|
||||
"@0",
|
||||
"--max-lag",
|
||||
"1MB",
|
||||
],
|
||||
)
|
||||
assert result.exit_code == 0
|
||||
|
||||
|
||||
def test_cluster_has_replica_ko_with_count_thresholds(mocker: MockerFixture) -> None:
|
||||
runner = CliRunner()
|
||||
|
||||
my_mock(mocker, "cluster_has_replica_ko", 200)
|
||||
result = runner.invoke(
|
||||
main,
|
||||
|
@ -27,10 +69,33 @@ def test_cluster_has_replica_ok_with_count_thresholds(mocker: MockerFixture) ->
|
|||
"-e",
|
||||
"https://10.20.199.3:8008",
|
||||
"cluster_has_replica",
|
||||
"--warninng",
|
||||
"@2",
|
||||
"--warning",
|
||||
"@1",
|
||||
"--critical",
|
||||
"@0:1",
|
||||
"@0",
|
||||
],
|
||||
)
|
||||
assert result.exit_code == 1
|
||||
|
||||
|
||||
def test_cluster_has_replica_ko_with_count_thresholds_and_lag(
|
||||
mocker: MockerFixture,
|
||||
) -> None:
|
||||
runner = CliRunner()
|
||||
|
||||
my_mock(mocker, "cluster_has_replica_ko_lag", 200)
|
||||
result = runner.invoke(
|
||||
main,
|
||||
[
|
||||
"-e",
|
||||
"https://10.20.199.3:8008",
|
||||
"cluster_has_replica",
|
||||
"--warning",
|
||||
"@1",
|
||||
"--critical",
|
||||
"@0",
|
||||
"--max-lag",
|
||||
"1MB",
|
||||
],
|
||||
)
|
||||
assert result.exit_code == 2
|
||||
|
|
|
@ -28,6 +28,6 @@ def test_node_is_replica_ko_lag(mocker: MockerFixture) -> None:
|
|||
# We don't do the check ourselves, patroni does it and changes the return code
|
||||
my_mock(mocker, "node_is_replica_ok", 404)
|
||||
result = runner.invoke(
|
||||
main, ["-e", "https://10.20.199.3:8008", "node_is_replica", "--lag", "100"]
|
||||
main, ["-e", "https://10.20.199.3:8008", "node_is_replica", "--max-lag", "100"]
|
||||
)
|
||||
assert result.exit_code == 2
|
||||
|
|
Loading…
Reference in a new issue