--max-lag additions and fixups

* add --max-lag to cluster_has_replica
* change --lag to --max-lag in node_is_replica
* update tests
* update README.md
This commit is contained in:
benoit 2021-08-13 11:00:43 +02:00
parent 2d37ed2d94
commit d4e974da51
6 changed files with 125 additions and 47 deletions

View file

@ -21,7 +21,7 @@ Options:
Commands: Commands:
cluster_config_has_changed Check if the hash of the configuration has... cluster_config_has_changed Check if the hash of the configuration has...
cluster_has_leader Check if the cluster has a leader. cluster_has_leader Check if the cluster has a leader.
cluster_has_replica Check if the cluster has replicas and their... cluster_has_replica Check if the cluster has healthy replicates.
cluster_is_in_maintenance Check if the cluster is in maintenance mode... cluster_is_in_maintenance Check if the cluster is in maintenance mode...
cluster_node_count Count the number of nodes in the cluster. cluster_node_count Count the number of nodes in the cluster.
node_is_alive Check if the node is alive ie patroni is... node_is_alive Check if the node is alive ie patroni is...
@ -78,21 +78,23 @@ Options:
``` ```
Usage: check_patroni cluster_has_replica [OPTIONS] Usage: check_patroni cluster_has_replica [OPTIONS]
Check if the cluster has replicas and their lag. Check if the cluster has healthy replicates.
A healthy replicate : * is in running state * has a replica role * has a lag
lower or equal to max_lag
Check: Check:
* `OK`: if the replica count and their lag are compatible with the replica count and lag thresholds. * `OK`: if the healthy_replica count and their lag are compatible with the replica count threshold.
* `WARNING` / `CRITICAL`: otherwise * `WARNING` / `CRITICAL`: otherwise
Perfdata : Perfdata :
* replica count * healthy_replica & unhealthy_replica count
* the lag of each replica labelled with "member name"_lag * the lag of each replica labelled with "member name"_lag
Options: Options:
-w, --warning TEXT Warning threshold for the number of nodes. -w, --warning TEXT Warning threshold for the number of nodes.
-c, --critical TEXT Critical threshold for the number of replica nodes. -c, --critical TEXT Critical threshold for the number of replica nodes.
--lag-warning TEXT Warning threshold for the lag. --max-lag TEXT maximum allowed lag
--lag-critical TEXT Critical threshold for the lag.
--help Show this message and exit. --help Show this message and exit.
``` ```
@ -211,8 +213,8 @@ Usage: check_patroni node_is_replica [OPTIONS]
noloadbalance tag and the lag is under the maximum threshold, 0 otherwise. noloadbalance tag and the lag is under the maximum threshold, 0 otherwise.
Options: Options:
--lag TEXT maximum allowed lag --max-lag TEXT maximum allowed lag
--help Show this message and exit. --help Show this message and exit.
``` ```
### node_patroni_version ### node_patroni_version

View file

@ -29,6 +29,7 @@ from .node import (
NodeTLHasChangedSummary, NodeTLHasChangedSummary,
) )
from .types import ConnectionInfo from .types import ConnectionInfo
from .convert import size_to_byte
def print_version(ctx: click.Context, param: str, value: str) -> None: def print_version(ctx: click.Context, param: str, value: str) -> None:
@ -247,45 +248,41 @@ def cluster_has_leader(ctx: click.Context) -> None:
type=str, type=str,
help="Critical threshold for the number of replica nodes.", help="Critical threshold for the number of replica nodes.",
) )
@click.option( @click.option("--max-lag", "max_lag", type=str, help="maximum allowed lag")
"--lag-warning", "lag_warning", type=str, help="Warning threshold for the lag."
)
# FIXME how do we manage maximum_lag_on_failover without doing many api calls
@click.option(
"--lag-critical", "lag_critical", type=str, help="Critical threshold for the lag."
)
@click.pass_context @click.pass_context
@nagiosplugin.guarded @nagiosplugin.guarded
def cluster_has_replica( def cluster_has_replica(
ctx: click.Context, warning: str, critical: str, lag_warning: str, lag_critical: str ctx: click.Context, warning: str, critical: str, max_lag: str
) -> None: ) -> None:
"""Check if the cluster has replicas and their lag. """Check if the cluster has healthy replicates.
A healthy replicate :
* is in running state
* has a replica role
* has a lag lower or equal to max_lag
\b \b
Check: Check:
* `OK`: if the replica count and their lag are compatible with the replica count and lag thresholds. * `OK`: if the healthy_replica count and their lag are compatible with the replica count threshold.
* `WARNING` / `CRITICAL`: otherwise * `WARNING` / `CRITICAL`: otherwise
\b \b
Perfdata : Perfdata :
* replica count * healthy_replica & unhealthy_replica count
* the lag of each replica labelled with "member name"_lag * the lag of each replica labelled with "member name"_lag
""" """
# FIXME the idea here would be to make sure we have a replica.
# lag should be check to prune invalid replicas tmax_lag = size_to_byte(max_lag) if max_lag is not None else None
check = nagiosplugin.Check() check = nagiosplugin.Check()
check.add( check.add(
ClusterHasReplica(ctx.obj), ClusterHasReplica(ctx.obj, tmax_lag),
nagiosplugin.ScalarContext( nagiosplugin.ScalarContext(
"replica_count", "healthy_replica",
warning, warning,
critical, critical,
), ),
nagiosplugin.ScalarContext( nagiosplugin.ScalarContext("unhealthy_replica"),
"replica_lag", nagiosplugin.ScalarContext("replica_lag"),
lag_warning,
lag_critical,
),
) )
check.main( check.main(
verbose=ctx.parent.params["verbose"], timeout=ctx.parent.params["timeout"] verbose=ctx.parent.params["verbose"], timeout=ctx.parent.params["timeout"]
@ -388,10 +385,10 @@ def node_is_primary(ctx: click.Context) -> None:
@main.command(name="node_is_replica") @main.command(name="node_is_replica")
@click.option("--lag", "lag", type=str, help="maximum allowed lag") @click.option("--max-lag", "max_lag", type=str, help="maximum allowed lag")
@click.pass_context @click.pass_context
@nagiosplugin.guarded @nagiosplugin.guarded
def node_is_replica(ctx: click.Context, lag: str) -> None: def node_is_replica(ctx: click.Context, max_lag: str) -> None:
"""Check if the node is a running replica with no noloadbalance tag. """Check if the node is a running replica with no noloadbalance tag.
\b \b
@ -404,9 +401,9 @@ def node_is_replica(ctx: click.Context, lag: str) -> None:
# FIXME add a lag check ?? # FIXME add a lag check ??
check = nagiosplugin.Check() check = nagiosplugin.Check()
check.add( check.add(
NodeIsReplica(ctx.obj, lag), NodeIsReplica(ctx.obj, max_lag),
nagiosplugin.ScalarContext("is_replica", None, "@0:0"), nagiosplugin.ScalarContext("is_replica", None, "@0:0"),
NodeIsReplicaSummary(lag), NodeIsReplicaSummary(max_lag),
) )
check.main( check.main(
verbose=ctx.parent.params["verbose"], timeout=ctx.parent.params["timeout"] verbose=ctx.parent.params["verbose"], timeout=ctx.parent.params["timeout"]

View file

@ -3,7 +3,7 @@ import hashlib
import json import json
import logging import logging
import nagiosplugin import nagiosplugin
from typing import Iterable from typing import Iterable, Union
from .types import PatroniResource, ConnectionInfo, handle_unknown from .types import PatroniResource, ConnectionInfo, handle_unknown
@ -81,6 +81,14 @@ class ClusterHasLeaderSummary(nagiosplugin.Summary):
class ClusterHasReplica(PatroniResource): class ClusterHasReplica(PatroniResource):
def __init__(
self: "ClusterHasReplica",
connection_info: ConnectionInfo,
max_lag: Union[int, None],
):
super().__init__(connection_info)
self.max_lag = max_lag
def probe(self: "ClusterHasReplica") -> Iterable[nagiosplugin.Metric]: def probe(self: "ClusterHasReplica") -> Iterable[nagiosplugin.Metric]:
r = self.rest_api("cluster") r = self.rest_api("cluster")
_log.debug(f"api call status: {r.status}") _log.debug(f"api call status: {r.status}")
@ -88,17 +96,23 @@ class ClusterHasReplica(PatroniResource):
item_dict = json.loads(r.data) item_dict = json.loads(r.data)
replicas = [] replicas = []
healthy_replica = 0
unhealthy_replica = 0
for member in item_dict["members"]: for member in item_dict["members"]:
# FIXME are there other acceptable states # FIXME are there other acceptable states
if member["role"] == "replica" and member["state"] == "running": if member["role"] == "replica":
# FIXME which lag ? if member["state"] == "running" and member["lag"] != "unknown":
replicas.append({"name": member["name"], "lag": member["lag"]}) replicas.append({"name": member["name"], "lag": member["lag"]})
break if self.max_lag is None or self.max_lag >= int(member["lag"]):
healthy_replica += 1
continue
unhealthy_replica += 1
# The actual check # The actual check
yield nagiosplugin.Metric("replica_count", len(replicas)) yield nagiosplugin.Metric("healthy_replica", healthy_replica)
# The performance data : replicas lag # The performance data : unhealthy replica count, replicas lag
yield nagiosplugin.Metric("unhealthy_replica", unhealthy_replica)
for replica in replicas: for replica in replicas:
yield nagiosplugin.Metric( yield nagiosplugin.Metric(
f"{replica['name']}_lag", replica["lag"], context="replica_lag" f"{replica['name']}_lag", replica["lag"], context="replica_lag"

View file

@ -29,16 +29,16 @@ class NodeIsPrimarySummary(nagiosplugin.Summary):
class NodeIsReplica(PatroniResource): class NodeIsReplica(PatroniResource):
def __init__( def __init__(
self: "NodeIsReplica", connection_info: ConnectionInfo, lag: str self: "NodeIsReplica", connection_info: ConnectionInfo, max_lag: str
) -> None: ) -> None:
super().__init__(connection_info) super().__init__(connection_info)
self.lag = lag self.max_lag = max_lag
def probe(self: "NodeIsReplica") -> Iterable[nagiosplugin.Metric]: def probe(self: "NodeIsReplica") -> Iterable[nagiosplugin.Metric]:
if self.lag is None: if self.max_lag is None:
r = self.rest_api("replica") r = self.rest_api("replica")
else: else:
r = self.rest_api(f"replica?lag={self.lag}") r = self.rest_api(f"replica?lag={self.max_lag}")
_log.debug(f"api call status: {r.status}") _log.debug(f"api call status: {r.status}")
_log.debug(f"api call data: {r.data}") _log.debug(f"api call data: {r.data}")

View file

@ -20,6 +20,48 @@ def test_cluster_has_relica_ok(mocker: MockerFixture) -> None:
def test_cluster_has_replica_ok_with_count_thresholds(mocker: MockerFixture) -> None: def test_cluster_has_replica_ok_with_count_thresholds(mocker: MockerFixture) -> None:
runner = CliRunner() runner = CliRunner()
my_mock(mocker, "cluster_has_replica_ok", 200)
result = runner.invoke(
main,
[
"-e",
"https://10.20.199.3:8008",
"cluster_has_replica",
"--warning",
"@1",
"--critical",
"@0",
],
)
assert result.exit_code == 0
def test_cluster_has_replica_ok_with_count_thresholds_lag(
mocker: MockerFixture,
) -> None:
runner = CliRunner()
my_mock(mocker, "cluster_has_replica_ok_lag", 200)
result = runner.invoke(
main,
[
"-e",
"https://10.20.199.3:8008",
"cluster_has_replica",
"--warning",
"@1",
"--critical",
"@0",
"--max-lag",
"1MB",
],
)
assert result.exit_code == 0
def test_cluster_has_replica_ko_with_count_thresholds(mocker: MockerFixture) -> None:
runner = CliRunner()
my_mock(mocker, "cluster_has_replica_ko", 200) my_mock(mocker, "cluster_has_replica_ko", 200)
result = runner.invoke( result = runner.invoke(
main, main,
@ -27,10 +69,33 @@ def test_cluster_has_replica_ok_with_count_thresholds(mocker: MockerFixture) ->
"-e", "-e",
"https://10.20.199.3:8008", "https://10.20.199.3:8008",
"cluster_has_replica", "cluster_has_replica",
"--warninng", "--warning",
"@2", "@1",
"--critical", "--critical",
"@0:1", "@0",
],
)
assert result.exit_code == 1
def test_cluster_has_replica_ko_with_count_thresholds_and_lag(
mocker: MockerFixture,
) -> None:
runner = CliRunner()
my_mock(mocker, "cluster_has_replica_ko_lag", 200)
result = runner.invoke(
main,
[
"-e",
"https://10.20.199.3:8008",
"cluster_has_replica",
"--warning",
"@1",
"--critical",
"@0",
"--max-lag",
"1MB",
], ],
) )
assert result.exit_code == 2 assert result.exit_code == 2

View file

@ -28,6 +28,6 @@ def test_node_is_replica_ko_lag(mocker: MockerFixture) -> None:
# We don't do the check ourselves, patroni does it and changes the return code # We don't do the check ourselves, patroni does it and changes the return code
my_mock(mocker, "node_is_replica_ok", 404) my_mock(mocker, "node_is_replica_ok", 404)
result = runner.invoke( result = runner.invoke(
main, ["-e", "https://10.20.199.3:8008", "node_is_replica", "--lag", "100"] main, ["-e", "https://10.20.199.3:8008", "node_is_replica", "--max-lag", "100"]
) )
assert result.exit_code == 2 assert result.exit_code == 2