--max-lag additions and fixups

* add --max-lag to cluster_has_replica
* change --lag to --max-lag in node_is_replica
* update tests
* update README.md
This commit is contained in:
benoit 2021-08-13 11:00:43 +02:00
parent 2d37ed2d94
commit d4e974da51
6 changed files with 125 additions and 47 deletions

View file

@ -21,7 +21,7 @@ Options:
Commands:
cluster_config_has_changed Check if the hash of the configuration has...
cluster_has_leader Check if the cluster has a leader.
cluster_has_replica Check if the cluster has replicas and their...
cluster_has_replica Check if the cluster has healthy replicates.
cluster_is_in_maintenance Check if the cluster is in maintenance mode...
cluster_node_count Count the number of nodes in the cluster.
node_is_alive Check if the node is alive ie patroni is...
@ -78,21 +78,23 @@ Options:
```
Usage: check_patroni cluster_has_replica [OPTIONS]
Check if the cluster has replicas and their lag.
Check if the cluster has healthy replicates.
A healthy replicate : * is in running state * has a replica role * has a lag
lower or equal to max_lag
Check:
* `OK`: if the replica count and their lag are compatible with the replica count and lag thresholds.
* `OK`: if the healthy_replica count and their lag are compatible with the replica count threshold.
* `WARNING` / `CRITICAL`: otherwise
Perfdata :
* replica count
* healthy_replica & unhealthy_replica count
* the lag of each replica labelled with "member name"_lag
Options:
-w, --warning TEXT Warning threshold for the number of nodes.
-c, --critical TEXT Critical threshold for the number of replica nodes.
--lag-warning TEXT Warning threshold for the lag.
--lag-critical TEXT Critical threshold for the lag.
--max-lag TEXT maximum allowed lag
--help Show this message and exit.
```
@ -211,8 +213,8 @@ Usage: check_patroni node_is_replica [OPTIONS]
noloadbalance tag and the lag is under the maximum threshold, 0 otherwise.
Options:
--lag TEXT maximum allowed lag
--help Show this message and exit.
--max-lag TEXT maximum allowed lag
--help Show this message and exit.
```
### node_patroni_version

View file

@ -29,6 +29,7 @@ from .node import (
NodeTLHasChangedSummary,
)
from .types import ConnectionInfo
from .convert import size_to_byte
def print_version(ctx: click.Context, param: str, value: str) -> None:
@ -247,45 +248,41 @@ def cluster_has_leader(ctx: click.Context) -> None:
type=str,
help="Critical threshold for the number of replica nodes.",
)
@click.option(
"--lag-warning", "lag_warning", type=str, help="Warning threshold for the lag."
)
# FIWME how do we manage maximum_lag_on_failover without doing many api calls
@click.option(
"--lag-critical", "lag_critical", type=str, help="Critical threshold for the lag."
)
@click.option("--max-lag", "max_lag", type=str, help="maximum allowed lag")
@click.pass_context
@nagiosplugin.guarded
def cluster_has_replica(
ctx: click.Context, warning: str, critical: str, lag_warning: str, lag_critical: str
ctx: click.Context, warning: str, critical: str, max_lag: str
) -> None:
"""Check if the cluster has replicas and their lag.
"""Check if the cluster has healthy replicates.
A healthy replicate :
* is in running state
* has a replica role
* has a lag lower or equal to max_lag
\b
Check:
* `OK`: if the replica count and their lag are compatible with the replica count and lag thresholds.
* `OK`: if the healthy_replica count and their lag are compatible with the replica count threshold.
* `WARNING` / `CRITICAL`: otherwise
\b
Perfdata :
* replica count
* healthy_replica & unhealthy_replica count
* the lag of each replica labelled with "member name"_lag
"""
# FIXME the idea here would be to make sur we have a replica.
# lag should be check to prune invalid replicas
tmax_lag = size_to_byte(max_lag) if max_lag is not None else None
check = nagiosplugin.Check()
check.add(
ClusterHasReplica(ctx.obj),
ClusterHasReplica(ctx.obj, tmax_lag),
nagiosplugin.ScalarContext(
"replica_count",
"healthy_replica",
warning,
critical,
),
nagiosplugin.ScalarContext(
"replica_lag",
lag_warning,
lag_critical,
),
nagiosplugin.ScalarContext("unhealthy_replica"),
nagiosplugin.ScalarContext("replica_lag"),
)
check.main(
verbose=ctx.parent.params["verbose"], timeout=ctx.parent.params["timeout"]
@ -388,10 +385,10 @@ def node_is_primary(ctx: click.Context) -> None:
@main.command(name="node_is_replica")
@click.option("--lag", "lag", type=str, help="maximum allowed lag")
@click.option("--max-lag", "max_lag", type=str, help="maximum allowed lag")
@click.pass_context
@nagiosplugin.guarded
def node_is_replica(ctx: click.Context, lag: str) -> None:
def node_is_replica(ctx: click.Context, max_lag: str) -> None:
"""Check if the node is a running replica with no noloadbalance tag.
\b
@ -404,9 +401,9 @@ def node_is_replica(ctx: click.Context, lag: str) -> None:
# FIXME add a lag check ??
check = nagiosplugin.Check()
check.add(
NodeIsReplica(ctx.obj, lag),
NodeIsReplica(ctx.obj, max_lag),
nagiosplugin.ScalarContext("is_replica", None, "@0:0"),
NodeIsReplicaSummary(lag),
NodeIsReplicaSummary(max_lag),
)
check.main(
verbose=ctx.parent.params["verbose"], timeout=ctx.parent.params["timeout"]

View file

@ -3,7 +3,7 @@ import hashlib
import json
import logging
import nagiosplugin
from typing import Iterable
from typing import Iterable, Union
from .types import PatroniResource, ConnectionInfo, handle_unknown
@ -81,6 +81,14 @@ class ClusterHasLeaderSummary(nagiosplugin.Summary):
class ClusterHasReplica(PatroniResource):
def __init__(
self: "ClusterHasReplica",
connection_info: ConnectionInfo,
max_lag: Union[int, None],
):
super().__init__(connection_info)
self.max_lag = max_lag
def probe(self: "ClusterHasReplica") -> Iterable[nagiosplugin.Metric]:
r = self.rest_api("cluster")
_log.debug(f"api call status: {r.status}")
@ -88,17 +96,23 @@ class ClusterHasReplica(PatroniResource):
item_dict = json.loads(r.data)
replicas = []
healthy_replica = 0
unhealthy_replica = 0
for member in item_dict["members"]:
# FIXME are there other acceptable states
if member["role"] == "replica" and member["state"] == "running":
# FIXME which lag ?
replicas.append({"name": member["name"], "lag": member["lag"]})
break
if member["role"] == "replica":
if member["state"] == "running" and member["lag"] != "unknown":
replicas.append({"name": member["name"], "lag": member["lag"]})
if self.max_lag is None or self.max_lag >= int(member["lag"]):
healthy_replica += 1
continue
unhealthy_replica += 1
# The actual check
yield nagiosplugin.Metric("replica_count", len(replicas))
yield nagiosplugin.Metric("healthy_replica", healthy_replica)
# The performance data : replicas lag
# The performance data : unheakthy replica count, replicas lag
yield nagiosplugin.Metric("unhealthy_replica", unhealthy_replica)
for replica in replicas:
yield nagiosplugin.Metric(
f"{replica['name']}_lag", replica["lag"], context="replica_lag"

View file

@ -29,16 +29,16 @@ class NodeIsPrimarySummary(nagiosplugin.Summary):
class NodeIsReplica(PatroniResource):
def __init__(
self: "NodeIsReplica", connection_info: ConnectionInfo, lag: str
self: "NodeIsReplica", connection_info: ConnectionInfo, max_lag: str
) -> None:
super().__init__(connection_info)
self.lag = lag
self.max_lag = max_lag
def probe(self: "NodeIsReplica") -> Iterable[nagiosplugin.Metric]:
if self.lag is None:
if self.max_lag is None:
r = self.rest_api("replica")
else:
r = self.rest_api(f"replica?lag={self.lag}")
r = self.rest_api(f"replica?lag={self.max_lag}")
_log.debug(f"api call status: {r.status}")
_log.debug(f"api call data: {r.data}")

View file

@ -20,6 +20,48 @@ def test_cluster_has_relica_ok(mocker: MockerFixture) -> None:
def test_cluster_has_replica_ok_with_count_thresholds(mocker: MockerFixture) -> None:
runner = CliRunner()
my_mock(mocker, "cluster_has_replica_ok", 200)
result = runner.invoke(
main,
[
"-e",
"https://10.20.199.3:8008",
"cluster_has_replica",
"--warning",
"@1",
"--critical",
"@0",
],
)
assert result.exit_code == 0
def test_cluster_has_replica_ok_with_count_thresholds_lag(
mocker: MockerFixture,
) -> None:
runner = CliRunner()
my_mock(mocker, "cluster_has_replica_ok_lag", 200)
result = runner.invoke(
main,
[
"-e",
"https://10.20.199.3:8008",
"cluster_has_replica",
"--warning",
"@1",
"--critical",
"@0",
"--max-lag",
"1MB",
],
)
assert result.exit_code == 0
def test_cluster_has_replica_ko_with_count_thresholds(mocker: MockerFixture) -> None:
runner = CliRunner()
my_mock(mocker, "cluster_has_replica_ko", 200)
result = runner.invoke(
main,
@ -27,10 +69,33 @@ def test_cluster_has_replica_ok_with_count_thresholds(mocker: MockerFixture) ->
"-e",
"https://10.20.199.3:8008",
"cluster_has_replica",
"--warninng",
"@2",
"--warning",
"@1",
"--critical",
"@0:1",
"@0",
],
)
assert result.exit_code == 1
def test_cluster_has_replica_ko_with_count_thresholds_and_lag(
mocker: MockerFixture,
) -> None:
runner = CliRunner()
my_mock(mocker, "cluster_has_replica_ko_lag", 200)
result = runner.invoke(
main,
[
"-e",
"https://10.20.199.3:8008",
"cluster_has_replica",
"--warning",
"@1",
"--critical",
"@0",
"--max-lag",
"1MB",
],
)
assert result.exit_code == 2

View file

@ -28,6 +28,6 @@ def test_node_is_replica_ko_lag(mocker: MockerFixture) -> None:
# We don't do the check ourselves, patroni does it and changes the return code
my_mock(mocker, "node_is_replica_ok", 404)
result = runner.invoke(
main, ["-e", "https://10.20.199.3:8008", "node_is_replica", "--lag", "100"]
main, ["-e", "https://10.20.199.3:8008", "node_is_replica", "--max-lag", "100"]
)
assert result.exit_code == 2