--max-lag additions and fixups
* add --max-lag to cluster_has_replica * change --lag to --max-lag in node_is_replica * update tests * update README.md
This commit is contained in:
parent
2d37ed2d94
commit
d4e974da51
18
README.md
18
README.md
|
@ -21,7 +21,7 @@ Options:
|
||||||
Commands:
|
Commands:
|
||||||
cluster_config_has_changed Check if the hash of the configuration has...
|
cluster_config_has_changed Check if the hash of the configuration has...
|
||||||
cluster_has_leader Check if the cluster has a leader.
|
cluster_has_leader Check if the cluster has a leader.
|
||||||
cluster_has_replica Check if the cluster has replicas and their...
|
cluster_has_replica Check if the cluster has healthy replicates.
|
||||||
cluster_is_in_maintenance Check if the cluster is in maintenance mode...
|
cluster_is_in_maintenance Check if the cluster is in maintenance mode...
|
||||||
cluster_node_count Count the number of nodes in the cluster.
|
cluster_node_count Count the number of nodes in the cluster.
|
||||||
node_is_alive Check if the node is alive ie patroni is...
|
node_is_alive Check if the node is alive ie patroni is...
|
||||||
|
@ -78,21 +78,23 @@ Options:
|
||||||
```
|
```
|
||||||
Usage: check_patroni cluster_has_replica [OPTIONS]
|
Usage: check_patroni cluster_has_replica [OPTIONS]
|
||||||
|
|
||||||
Check if the cluster has replicas and their lag.
|
Check if the cluster has healthy replicates.
|
||||||
|
|
||||||
|
A healthy replicate : * is in running state * has a replica role * has a lag
|
||||||
|
lower or equal to max_lag
|
||||||
|
|
||||||
Check:
|
Check:
|
||||||
* `OK`: if the replica count and their lag are compatible with the replica count and lag thresholds.
|
* `OK`: if the healthy_replica count and their lag are compatible with the replica count threshold.
|
||||||
* `WARNING` / `CRITICAL`: otherwise
|
* `WARNING` / `CRITICAL`: otherwise
|
||||||
|
|
||||||
Perfdata :
|
Perfdata :
|
||||||
* replica count
|
* healthy_replica & unhealthy_replica count
|
||||||
* the lag of each replica labelled with "member name"_lag
|
* the lag of each replica labelled with "member name"_lag
|
||||||
|
|
||||||
Options:
|
Options:
|
||||||
-w, --warning TEXT Warning threshold for the number of nodes.
|
-w, --warning TEXT Warning threshold for the number of nodes.
|
||||||
-c, --critical TEXT Critical threshold for the number of replica nodes.
|
-c, --critical TEXT Critical threshold for the number of replica nodes.
|
||||||
--lag-warning TEXT Warning threshold for the lag.
|
--max-lag TEXT maximum allowed lag
|
||||||
--lag-critical TEXT Critical threshold for the lag.
|
|
||||||
--help Show this message and exit.
|
--help Show this message and exit.
|
||||||
```
|
```
|
||||||
|
|
||||||
|
@ -211,8 +213,8 @@ Usage: check_patroni node_is_replica [OPTIONS]
|
||||||
noloadbalance tag and the lag is under the maximum threshold, 0 otherwise.
|
noloadbalance tag and the lag is under the maximum threshold, 0 otherwise.
|
||||||
|
|
||||||
Options:
|
Options:
|
||||||
--lag TEXT maximum allowed lag
|
--max-lag TEXT maximum allowed lag
|
||||||
--help Show this message and exit.
|
--help Show this message and exit.
|
||||||
```
|
```
|
||||||
|
|
||||||
### node_patroni_version
|
### node_patroni_version
|
||||||
|
|
|
@ -29,6 +29,7 @@ from .node import (
|
||||||
NodeTLHasChangedSummary,
|
NodeTLHasChangedSummary,
|
||||||
)
|
)
|
||||||
from .types import ConnectionInfo
|
from .types import ConnectionInfo
|
||||||
|
from .convert import size_to_byte
|
||||||
|
|
||||||
|
|
||||||
def print_version(ctx: click.Context, param: str, value: str) -> None:
|
def print_version(ctx: click.Context, param: str, value: str) -> None:
|
||||||
|
@ -247,45 +248,41 @@ def cluster_has_leader(ctx: click.Context) -> None:
|
||||||
type=str,
|
type=str,
|
||||||
help="Critical threshold for the number of replica nodes.",
|
help="Critical threshold for the number of replica nodes.",
|
||||||
)
|
)
|
||||||
@click.option(
|
@click.option("--max-lag", "max_lag", type=str, help="maximum allowed lag")
|
||||||
"--lag-warning", "lag_warning", type=str, help="Warning threshold for the lag."
|
|
||||||
)
|
|
||||||
# FIXME how do we manage maximum_lag_on_failover without doing many api calls
|
|
||||||
@click.option(
|
|
||||||
"--lag-critical", "lag_critical", type=str, help="Critical threshold for the lag."
|
|
||||||
)
|
|
||||||
@click.pass_context
|
@click.pass_context
|
||||||
@nagiosplugin.guarded
|
@nagiosplugin.guarded
|
||||||
def cluster_has_replica(
|
def cluster_has_replica(
|
||||||
ctx: click.Context, warning: str, critical: str, lag_warning: str, lag_critical: str
|
ctx: click.Context, warning: str, critical: str, max_lag: str
|
||||||
) -> None:
|
) -> None:
|
||||||
"""Check if the cluster has replicas and their lag.
|
"""Check if the cluster has healthy replicates.
|
||||||
|
|
||||||
|
A healthy replicate :
|
||||||
|
* is in running state
|
||||||
|
* has a replica role
|
||||||
|
* has a lag lower or equal to max_lag
|
||||||
|
|
||||||
\b
|
\b
|
||||||
Check:
|
Check:
|
||||||
* `OK`: if the replica count and their lag are compatible with the replica count and lag thresholds.
|
* `OK`: if the healthy_replica count and their lag are compatible with the replica count threshold.
|
||||||
* `WARNING` / `CRITICAL`: otherwise
|
* `WARNING` / `CRITICAL`: otherwise
|
||||||
|
|
||||||
\b
|
\b
|
||||||
Perfdata :
|
Perfdata :
|
||||||
* replica count
|
* healthy_replica & unhealthy_replica count
|
||||||
* the lag of each replica labelled with "member name"_lag
|
* the lag of each replica labelled with "member name"_lag
|
||||||
"""
|
"""
|
||||||
# FIXME the idea here would be to make sure we have a replica.
|
|
||||||
# lag should be checked to prune invalid replicas
|
tmax_lag = size_to_byte(max_lag) if max_lag is not None else None
|
||||||
check = nagiosplugin.Check()
|
check = nagiosplugin.Check()
|
||||||
check.add(
|
check.add(
|
||||||
ClusterHasReplica(ctx.obj),
|
ClusterHasReplica(ctx.obj, tmax_lag),
|
||||||
nagiosplugin.ScalarContext(
|
nagiosplugin.ScalarContext(
|
||||||
"replica_count",
|
"healthy_replica",
|
||||||
warning,
|
warning,
|
||||||
critical,
|
critical,
|
||||||
),
|
),
|
||||||
nagiosplugin.ScalarContext(
|
nagiosplugin.ScalarContext("unhealthy_replica"),
|
||||||
"replica_lag",
|
nagiosplugin.ScalarContext("replica_lag"),
|
||||||
lag_warning,
|
|
||||||
lag_critical,
|
|
||||||
),
|
|
||||||
)
|
)
|
||||||
check.main(
|
check.main(
|
||||||
verbose=ctx.parent.params["verbose"], timeout=ctx.parent.params["timeout"]
|
verbose=ctx.parent.params["verbose"], timeout=ctx.parent.params["timeout"]
|
||||||
|
@ -388,10 +385,10 @@ def node_is_primary(ctx: click.Context) -> None:
|
||||||
|
|
||||||
|
|
||||||
@main.command(name="node_is_replica")
|
@main.command(name="node_is_replica")
|
||||||
@click.option("--lag", "lag", type=str, help="maximum allowed lag")
|
@click.option("--max-lag", "max_lag", type=str, help="maximum allowed lag")
|
||||||
@click.pass_context
|
@click.pass_context
|
||||||
@nagiosplugin.guarded
|
@nagiosplugin.guarded
|
||||||
def node_is_replica(ctx: click.Context, lag: str) -> None:
|
def node_is_replica(ctx: click.Context, max_lag: str) -> None:
|
||||||
"""Check if the node is a running replica with no noloadbalance tag.
|
"""Check if the node is a running replica with no noloadbalance tag.
|
||||||
|
|
||||||
\b
|
\b
|
||||||
|
@ -404,9 +401,9 @@ def node_is_replica(ctx: click.Context, lag: str) -> None:
|
||||||
# FIXME add a lag check ??
|
# FIXME add a lag check ??
|
||||||
check = nagiosplugin.Check()
|
check = nagiosplugin.Check()
|
||||||
check.add(
|
check.add(
|
||||||
NodeIsReplica(ctx.obj, lag),
|
NodeIsReplica(ctx.obj, max_lag),
|
||||||
nagiosplugin.ScalarContext("is_replica", None, "@0:0"),
|
nagiosplugin.ScalarContext("is_replica", None, "@0:0"),
|
||||||
NodeIsReplicaSummary(lag),
|
NodeIsReplicaSummary(max_lag),
|
||||||
)
|
)
|
||||||
check.main(
|
check.main(
|
||||||
verbose=ctx.parent.params["verbose"], timeout=ctx.parent.params["timeout"]
|
verbose=ctx.parent.params["verbose"], timeout=ctx.parent.params["timeout"]
|
||||||
|
|
|
@ -3,7 +3,7 @@ import hashlib
|
||||||
import json
|
import json
|
||||||
import logging
|
import logging
|
||||||
import nagiosplugin
|
import nagiosplugin
|
||||||
from typing import Iterable
|
from typing import Iterable, Union
|
||||||
|
|
||||||
from .types import PatroniResource, ConnectionInfo, handle_unknown
|
from .types import PatroniResource, ConnectionInfo, handle_unknown
|
||||||
|
|
||||||
|
@ -81,6 +81,14 @@ class ClusterHasLeaderSummary(nagiosplugin.Summary):
|
||||||
|
|
||||||
|
|
||||||
class ClusterHasReplica(PatroniResource):
|
class ClusterHasReplica(PatroniResource):
|
||||||
|
def __init__(
|
||||||
|
self: "ClusterHasReplica",
|
||||||
|
connection_info: ConnectionInfo,
|
||||||
|
max_lag: Union[int, None],
|
||||||
|
):
|
||||||
|
super().__init__(connection_info)
|
||||||
|
self.max_lag = max_lag
|
||||||
|
|
||||||
def probe(self: "ClusterHasReplica") -> Iterable[nagiosplugin.Metric]:
|
def probe(self: "ClusterHasReplica") -> Iterable[nagiosplugin.Metric]:
|
||||||
r = self.rest_api("cluster")
|
r = self.rest_api("cluster")
|
||||||
_log.debug(f"api call status: {r.status}")
|
_log.debug(f"api call status: {r.status}")
|
||||||
|
@ -88,17 +96,23 @@ class ClusterHasReplica(PatroniResource):
|
||||||
|
|
||||||
item_dict = json.loads(r.data)
|
item_dict = json.loads(r.data)
|
||||||
replicas = []
|
replicas = []
|
||||||
|
healthy_replica = 0
|
||||||
|
unhealthy_replica = 0
|
||||||
for member in item_dict["members"]:
|
for member in item_dict["members"]:
|
||||||
# FIXME are there other acceptable states
|
# FIXME are there other acceptable states
|
||||||
if member["role"] == "replica" and member["state"] == "running":
|
if member["role"] == "replica":
|
||||||
# FIXME which lag ?
|
if member["state"] == "running" and member["lag"] != "unknown":
|
||||||
replicas.append({"name": member["name"], "lag": member["lag"]})
|
replicas.append({"name": member["name"], "lag": member["lag"]})
|
||||||
break
|
if self.max_lag is None or self.max_lag >= int(member["lag"]):
|
||||||
|
healthy_replica += 1
|
||||||
|
continue
|
||||||
|
unhealthy_replica += 1
|
||||||
|
|
||||||
# The actual check
|
# The actual check
|
||||||
yield nagiosplugin.Metric("replica_count", len(replicas))
|
yield nagiosplugin.Metric("healthy_replica", healthy_replica)
|
||||||
|
|
||||||
# The performance data : replicas lag
|
# The performance data : unhealthy replica count, replicas lag
|
||||||
|
yield nagiosplugin.Metric("unhealthy_replica", unhealthy_replica)
|
||||||
for replica in replicas:
|
for replica in replicas:
|
||||||
yield nagiosplugin.Metric(
|
yield nagiosplugin.Metric(
|
||||||
f"{replica['name']}_lag", replica["lag"], context="replica_lag"
|
f"{replica['name']}_lag", replica["lag"], context="replica_lag"
|
||||||
|
|
|
@ -29,16 +29,16 @@ class NodeIsPrimarySummary(nagiosplugin.Summary):
|
||||||
|
|
||||||
class NodeIsReplica(PatroniResource):
|
class NodeIsReplica(PatroniResource):
|
||||||
def __init__(
|
def __init__(
|
||||||
self: "NodeIsReplica", connection_info: ConnectionInfo, lag: str
|
self: "NodeIsReplica", connection_info: ConnectionInfo, max_lag: str
|
||||||
) -> None:
|
) -> None:
|
||||||
super().__init__(connection_info)
|
super().__init__(connection_info)
|
||||||
self.lag = lag
|
self.max_lag = max_lag
|
||||||
|
|
||||||
def probe(self: "NodeIsReplica") -> Iterable[nagiosplugin.Metric]:
|
def probe(self: "NodeIsReplica") -> Iterable[nagiosplugin.Metric]:
|
||||||
if self.lag is None:
|
if self.max_lag is None:
|
||||||
r = self.rest_api("replica")
|
r = self.rest_api("replica")
|
||||||
else:
|
else:
|
||||||
r = self.rest_api(f"replica?lag={self.lag}")
|
r = self.rest_api(f"replica?lag={self.max_lag}")
|
||||||
_log.debug(f"api call status: {r.status}")
|
_log.debug(f"api call status: {r.status}")
|
||||||
_log.debug(f"api call data: {r.data}")
|
_log.debug(f"api call data: {r.data}")
|
||||||
|
|
||||||
|
|
|
@ -20,6 +20,48 @@ def test_cluster_has_relica_ok(mocker: MockerFixture) -> None:
|
||||||
def test_cluster_has_replica_ok_with_count_thresholds(mocker: MockerFixture) -> None:
|
def test_cluster_has_replica_ok_with_count_thresholds(mocker: MockerFixture) -> None:
|
||||||
runner = CliRunner()
|
runner = CliRunner()
|
||||||
|
|
||||||
|
my_mock(mocker, "cluster_has_replica_ok", 200)
|
||||||
|
result = runner.invoke(
|
||||||
|
main,
|
||||||
|
[
|
||||||
|
"-e",
|
||||||
|
"https://10.20.199.3:8008",
|
||||||
|
"cluster_has_replica",
|
||||||
|
"--warning",
|
||||||
|
"@1",
|
||||||
|
"--critical",
|
||||||
|
"@0",
|
||||||
|
],
|
||||||
|
)
|
||||||
|
assert result.exit_code == 0
|
||||||
|
|
||||||
|
|
||||||
|
def test_cluster_has_replica_ok_with_count_thresholds_lag(
|
||||||
|
mocker: MockerFixture,
|
||||||
|
) -> None:
|
||||||
|
runner = CliRunner()
|
||||||
|
|
||||||
|
my_mock(mocker, "cluster_has_replica_ok_lag", 200)
|
||||||
|
result = runner.invoke(
|
||||||
|
main,
|
||||||
|
[
|
||||||
|
"-e",
|
||||||
|
"https://10.20.199.3:8008",
|
||||||
|
"cluster_has_replica",
|
||||||
|
"--warning",
|
||||||
|
"@1",
|
||||||
|
"--critical",
|
||||||
|
"@0",
|
||||||
|
"--max-lag",
|
||||||
|
"1MB",
|
||||||
|
],
|
||||||
|
)
|
||||||
|
assert result.exit_code == 0
|
||||||
|
|
||||||
|
|
||||||
|
def test_cluster_has_replica_ko_with_count_thresholds(mocker: MockerFixture) -> None:
|
||||||
|
runner = CliRunner()
|
||||||
|
|
||||||
my_mock(mocker, "cluster_has_replica_ko", 200)
|
my_mock(mocker, "cluster_has_replica_ko", 200)
|
||||||
result = runner.invoke(
|
result = runner.invoke(
|
||||||
main,
|
main,
|
||||||
|
@ -27,10 +69,33 @@ def test_cluster_has_replica_ok_with_count_thresholds(mocker: MockerFixture) ->
|
||||||
"-e",
|
"-e",
|
||||||
"https://10.20.199.3:8008",
|
"https://10.20.199.3:8008",
|
||||||
"cluster_has_replica",
|
"cluster_has_replica",
|
||||||
"--warninng",
|
"--warning",
|
||||||
"@2",
|
"@1",
|
||||||
"--critical",
|
"--critical",
|
||||||
"@0:1",
|
"@0",
|
||||||
|
],
|
||||||
|
)
|
||||||
|
assert result.exit_code == 1
|
||||||
|
|
||||||
|
|
||||||
|
def test_cluster_has_replica_ko_with_count_thresholds_and_lag(
|
||||||
|
mocker: MockerFixture,
|
||||||
|
) -> None:
|
||||||
|
runner = CliRunner()
|
||||||
|
|
||||||
|
my_mock(mocker, "cluster_has_replica_ko_lag", 200)
|
||||||
|
result = runner.invoke(
|
||||||
|
main,
|
||||||
|
[
|
||||||
|
"-e",
|
||||||
|
"https://10.20.199.3:8008",
|
||||||
|
"cluster_has_replica",
|
||||||
|
"--warning",
|
||||||
|
"@1",
|
||||||
|
"--critical",
|
||||||
|
"@0",
|
||||||
|
"--max-lag",
|
||||||
|
"1MB",
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
assert result.exit_code == 2
|
assert result.exit_code == 2
|
||||||
|
|
|
@ -28,6 +28,6 @@ def test_node_is_replica_ko_lag(mocker: MockerFixture) -> None:
|
||||||
# We don't do the check ourselves, patroni does it and changes the return code
|
# We don't do the check ourselves, patroni does it and changes the return code
|
||||||
my_mock(mocker, "node_is_replica_ok", 404)
|
my_mock(mocker, "node_is_replica_ok", 404)
|
||||||
result = runner.invoke(
|
result = runner.invoke(
|
||||||
main, ["-e", "https://10.20.199.3:8008", "node_is_replica", "--lag", "100"]
|
main, ["-e", "https://10.20.199.3:8008", "node_is_replica", "--max-lag", "100"]
|
||||||
)
|
)
|
||||||
assert result.exit_code == 2
|
assert result.exit_code == 2
|
||||||
|
|
Loading…
Reference in a new issue