Fix the cluster_has_leader service for standby clusters
Before this patch we checked the expected standby leader state was `running` for all versions of Patroni. With this patch, for: * Patroni < 3.0.4, standby leaders are in `running` state. * Patroni >= 3.0.4, standby leaders can be in `streaming` or `in archive recovery` state. We will raise a warning for the latter. The tests were modified to account for this. Co-authored-by: Denis Laxalde <denis@laxalde.org>
This commit is contained in:
parent
ffc330f96e
commit
46db3e2d15
|
@ -6,6 +6,7 @@
|
|||
|
||||
* Add the timeline in the `cluster_has_replica` perfstats. (#50)
|
||||
* Add a mention about shell completion support and shell versions in the doc. (#53)
|
||||
* Add the leader type and whether it's archiving to the `cluster_has_leader` perfstats. (#58)
|
||||
|
||||
### Fixed
|
||||
|
||||
|
@ -13,6 +14,7 @@
|
|||
version 2.25 and higher.
|
||||
* Fix what `cluster_has_replica` deems a healthy replica. (#50, reported by @mbanck)
|
||||
* Fix `cluster_has_replica` to display perfstats for replicas whenever it's possible (healthy or not). (#50)
|
||||
* Fix `cluster_has_leader` to correctly check for standby leaders. (#58, reported by @mbanck)
|
||||
|
||||
### Misc
|
||||
|
||||
|
|
20
README.md
20
README.md
|
@ -176,11 +176,27 @@ Usage: check_patroni cluster_has_leader [OPTIONS]
|
|||
|
||||
This check applies to any kind of leaders including standby leaders.
|
||||
|
||||
A leader is a node with the "leader" role and a "running" state.
|
||||
|
||||
A standby leader is a node with a "standby_leader" role and a "streaming" or
|
||||
"in archive recovery" state. Please note that log shipping could be stuck
|
||||
because the WAL are not available or applicable. Patroni doesn't provide
|
||||
information about the origin cluster (timeline or lag), so we cannot check
|
||||
if there is a problem in that particular case. That's why we issue a warning
|
||||
when the node is "in archive recovery". We suggest using other supervision
|
||||
tools to do this (eg. check_pgactivity).
|
||||
|
||||
Check:
|
||||
* `OK`: if there is a leader node.
|
||||
* `CRITICAL`: otherwise
|
||||
* 'WARNING': if there is a standby leader in archive mode.
|
||||
* `CRITICAL`: otherwise.
|
||||
|
||||
Perfdata: `has_leader` is 1 if there is a leader node, 0 otherwise
|
||||
Perfdata:
|
||||
* `has_leader` is 1 if there is any kind of leader node, 0 otherwise
|
||||
* `is_standby_leader_in_arc_rec` is 1 if the standby leader node is "in
|
||||
archive recovery", 0 otherwise
|
||||
* `is_standby_leader` is 1 if there is a standby leader node, 0 otherwise
|
||||
* `is_leader` is 1 if there is a "classical" leader node, 0 otherwise
|
||||
|
||||
Options:
|
||||
--help Show this message and exit.
|
||||
|
|
|
@ -285,17 +285,38 @@ def cluster_has_leader(ctx: click.Context) -> None:
|
|||
|
||||
This check applies to any kind of leaders including standby leaders.
|
||||
|
||||
A leader is a node with the "leader" role and a "running" state.
|
||||
|
||||
A standby leader is a node with a "standby_leader" role and a "streaming"
|
||||
or "in archive recovery" state. Please note that log shipping could be
|
||||
stuck because the WAL are not available or applicable. Patroni doesn't
|
||||
provide information about the origin cluster (timeline or lag), so we
|
||||
cannot check if there is a problem in that particular case. That's why we
|
||||
issue a warning when the node is "in archive recovery". We suggest using
|
||||
other supervision tools to do this (eg. check_pgactivity).
|
||||
|
||||
\b
|
||||
Check:
|
||||
* `OK`: if there is a leader node.
|
||||
* `CRITICAL`: otherwise
|
||||
* 'WARNING': if there is a standby leader in archive mode.
|
||||
* `CRITICAL`: otherwise.
|
||||
|
||||
\b
|
||||
Perfdata:
|
||||
* `has_leader` is 1 if there is any kind of leader node, 0 otherwise
|
||||
* `is_standby_leader_in_arc_rec` is 1 if the standby leader node is "in
|
||||
archive recovery", 0 otherwise
|
||||
* `is_standby_leader` is 1 if there is a standby leader node, 0 otherwise
|
||||
* `is_leader` is 1 if there is a "classical" leader node, 0 otherwise
|
||||
|
||||
Perfdata: `has_leader` is 1 if there is a leader node, 0 otherwise
|
||||
"""
|
||||
check = nagiosplugin.Check()
|
||||
check.add(
|
||||
ClusterHasLeader(ctx.obj.connection_info),
|
||||
nagiosplugin.ScalarContext("has_leader", None, "@0:0"),
|
||||
nagiosplugin.ScalarContext("is_standby_leader_in_arc_rec", "@1:1", None),
|
||||
nagiosplugin.ScalarContext("is_leader", None, None),
|
||||
nagiosplugin.ScalarContext("is_standby_leader", None, None),
|
||||
ClusterHasLeaderSummary(),
|
||||
)
|
||||
check.main(verbose=ctx.obj.verbose, timeout=ctx.obj.timeout)
|
||||
|
|
|
@ -52,19 +52,42 @@ class ClusterHasLeader(PatroniResource):
|
|||
item_dict = self.rest_api("cluster")
|
||||
|
||||
is_leader_found = False
|
||||
is_standby_leader_found = False
|
||||
is_standby_leader_in_arc_rec = False
|
||||
for member in item_dict["members"]:
|
||||
if (
|
||||
member["role"] in ("leader", "standby_leader")
|
||||
and member["state"] == "running"
|
||||
):
|
||||
if member["role"] == "leader" and member["state"] == "running":
|
||||
is_leader_found = True
|
||||
break
|
||||
|
||||
if member["role"] == "standby_leader":
|
||||
if member["state"] not in ["streaming", "in archive recovery"]:
|
||||
# for patroni >= 3.0.4 any state would be wrong
|
||||
# for patroni < 3.0.4 a state different from running would be wrong
|
||||
if self.has_detailed_states() or member["state"] != "running":
|
||||
continue
|
||||
|
||||
if member["state"] in ["in archive recovery"]:
|
||||
is_standby_leader_in_arc_rec = True
|
||||
|
||||
is_standby_leader_found = True
|
||||
break
|
||||
return [
|
||||
nagiosplugin.Metric(
|
||||
"has_leader",
|
||||
1 if is_leader_found or is_standby_leader_found else 0,
|
||||
),
|
||||
nagiosplugin.Metric(
|
||||
"is_standby_leader_in_arc_rec",
|
||||
1 if is_standby_leader_in_arc_rec else 0,
|
||||
),
|
||||
nagiosplugin.Metric(
|
||||
"is_standby_leader",
|
||||
1 if is_standby_leader_found else 0,
|
||||
),
|
||||
nagiosplugin.Metric(
|
||||
"is_leader",
|
||||
1 if is_leader_found else 0,
|
||||
)
|
||||
),
|
||||
]
|
||||
|
||||
|
||||
|
@ -74,7 +97,7 @@ class ClusterHasLeaderSummary(nagiosplugin.Summary):
|
|||
|
||||
@handle_unknown
|
||||
def problem(self, results: nagiosplugin.Result) -> str:
|
||||
return "The cluster has no running leader."
|
||||
return "The cluster has no running leader or the standby leader is in archive recovery."
|
||||
|
||||
|
||||
class ClusterHasReplica(PatroniResource):
|
||||
|
|
33
tests/json/cluster_has_leader_ko_standby_leader.json
Normal file
33
tests/json/cluster_has_leader_ko_standby_leader.json
Normal file
|
@ -0,0 +1,33 @@
|
|||
{
|
||||
"members": [
|
||||
{
|
||||
"name": "srv1",
|
||||
"role": "standby_leader",
|
||||
"state": "stopped",
|
||||
"api_url": "https://10.20.199.3:8008/patroni",
|
||||
"host": "10.20.199.3",
|
||||
"port": 5432,
|
||||
"timeline": 51
|
||||
},
|
||||
{
|
||||
"name": "srv2",
|
||||
"role": "replica",
|
||||
"state": "streaming",
|
||||
"api_url": "https://10.20.199.4:8008/patroni",
|
||||
"host": "10.20.199.4",
|
||||
"port": 5432,
|
||||
"timeline": 51,
|
||||
"lag": 0
|
||||
},
|
||||
{
|
||||
"name": "srv3",
|
||||
"role": "replica",
|
||||
"state": "streaming",
|
||||
"api_url": "https://10.20.199.5:8008/patroni",
|
||||
"host": "10.20.199.5",
|
||||
"port": 5432,
|
||||
"timeline": 51,
|
||||
"lag": 0
|
||||
}
|
||||
]
|
||||
}
|
|
@ -0,0 +1,33 @@
|
|||
{
|
||||
"members": [
|
||||
{
|
||||
"name": "srv1",
|
||||
"role": "standby_leader",
|
||||
"state": "in archive recovery",
|
||||
"api_url": "https://10.20.199.3:8008/patroni",
|
||||
"host": "10.20.199.3",
|
||||
"port": 5432,
|
||||
"timeline": 51
|
||||
},
|
||||
{
|
||||
"name": "srv2",
|
||||
"role": "replica",
|
||||
"state": "streaming",
|
||||
"api_url": "https://10.20.199.4:8008/patroni",
|
||||
"host": "10.20.199.4",
|
||||
"port": 5432,
|
||||
"timeline": 51,
|
||||
"lag": 0
|
||||
},
|
||||
{
|
||||
"name": "srv3",
|
||||
"role": "replica",
|
||||
"state": "streaming",
|
||||
"api_url": "https://10.20.199.5:8008/patroni",
|
||||
"host": "10.20.199.5",
|
||||
"port": 5432,
|
||||
"timeline": 51,
|
||||
"lag": 0
|
||||
}
|
||||
]
|
||||
}
|
|
@ -3,7 +3,7 @@
|
|||
{
|
||||
"name": "srv1",
|
||||
"role": "standby_leader",
|
||||
"state": "running",
|
||||
"state": "streaming",
|
||||
"api_url": "https://10.20.199.3:8008/patroni",
|
||||
"host": "10.20.199.3",
|
||||
"port": 5432,
|
||||
|
|
|
@ -1,37 +1,132 @@
|
|||
from pathlib import Path
|
||||
from typing import Iterator, Union
|
||||
|
||||
import pytest
|
||||
from click.testing import CliRunner
|
||||
|
||||
from check_patroni.cli import main
|
||||
|
||||
from . import PatroniAPI
|
||||
from . import PatroniAPI, cluster_api_set_replica_running
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def cluster_has_leader_ok(
|
||||
patroni_api: PatroniAPI, old_replica_state: bool, datadir: Path, tmp_path: Path
|
||||
) -> Iterator[None]:
|
||||
cluster_path: Union[str, Path] = "cluster_has_leader_ok.json"
|
||||
patroni_path = "cluster_has_replica_patroni_verion_3.1.0.json"
|
||||
if old_replica_state:
|
||||
cluster_path = cluster_api_set_replica_running(datadir / cluster_path, tmp_path)
|
||||
patroni_path = "cluster_has_replica_patroni_verion_3.0.0.json"
|
||||
with patroni_api.routes({"cluster": cluster_path, "patroni": patroni_path}):
|
||||
yield None
|
||||
|
||||
|
||||
@pytest.mark.usefixtures("cluster_has_leader_ok")
|
||||
def test_cluster_has_leader_ok(runner: CliRunner, patroni_api: PatroniAPI) -> None:
|
||||
with patroni_api.routes({"cluster": "cluster_has_leader_ok.json"}):
|
||||
result = runner.invoke(main, ["-e", patroni_api.endpoint, "cluster_has_leader"])
|
||||
assert result.exit_code == 0
|
||||
result = runner.invoke(main, ["-e", patroni_api.endpoint, "cluster_has_leader"])
|
||||
assert (
|
||||
result.stdout
|
||||
== "CLUSTERHASLEADER OK - The cluster has a running leader. | has_leader=1;;@0\n"
|
||||
== "CLUSTERHASLEADER OK - The cluster has a running leader. | has_leader=1;;@0 is_leader=1 is_standby_leader=0 is_standby_leader_in_arc_rec=0;@1:1\n"
|
||||
)
|
||||
assert result.exit_code == 0
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def cluster_has_leader_ok_standby_leader(
|
||||
patroni_api: PatroniAPI, old_replica_state: bool, datadir: Path, tmp_path: Path
|
||||
) -> Iterator[None]:
|
||||
cluster_path: Union[str, Path] = "cluster_has_leader_ok_standby_leader.json"
|
||||
patroni_path = "cluster_has_replica_patroni_verion_3.1.0.json"
|
||||
if old_replica_state:
|
||||
cluster_path = cluster_api_set_replica_running(datadir / cluster_path, tmp_path)
|
||||
patroni_path = "cluster_has_replica_patroni_verion_3.0.0.json"
|
||||
with patroni_api.routes({"cluster": cluster_path, "patroni": patroni_path}):
|
||||
yield None
|
||||
|
||||
|
||||
@pytest.mark.usefixtures("cluster_has_leader_ok_standby_leader")
|
||||
def test_cluster_has_leader_ok_standby_leader(
|
||||
runner: CliRunner, patroni_api: PatroniAPI
|
||||
) -> None:
|
||||
with patroni_api.routes({"cluster": "cluster_has_leader_ok_standby_leader.json"}):
|
||||
result = runner.invoke(main, ["-e", patroni_api.endpoint, "cluster_has_leader"])
|
||||
result = runner.invoke(main, ["-e", patroni_api.endpoint, "cluster_has_leader"])
|
||||
assert (
|
||||
result.stdout
|
||||
== "CLUSTERHASLEADER OK - The cluster has a running leader. | has_leader=1;;@0 is_leader=0 is_standby_leader=1 is_standby_leader_in_arc_rec=0;@1:1\n"
|
||||
)
|
||||
assert result.exit_code == 0
|
||||
assert (
|
||||
result.stdout
|
||||
== "CLUSTERHASLEADER OK - The cluster has a running leader. | has_leader=1;;@0\n"
|
||||
)
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def cluster_has_leader_ko(
|
||||
patroni_api: PatroniAPI, old_replica_state: bool, datadir: Path, tmp_path: Path
|
||||
) -> Iterator[None]:
|
||||
cluster_path: Union[str, Path] = "cluster_has_leader_ko.json"
|
||||
patroni_path = "cluster_has_replica_patroni_verion_3.1.0.json"
|
||||
if old_replica_state:
|
||||
cluster_path = cluster_api_set_replica_running(datadir / cluster_path, tmp_path)
|
||||
patroni_path = "cluster_has_replica_patroni_verion_3.0.0.json"
|
||||
with patroni_api.routes({"cluster": cluster_path, "patroni": patroni_path}):
|
||||
yield None
|
||||
|
||||
|
||||
@pytest.mark.usefixtures("cluster_has_leader_ko")
|
||||
def test_cluster_has_leader_ko(runner: CliRunner, patroni_api: PatroniAPI) -> None:
|
||||
with patroni_api.routes({"cluster": "cluster_has_leader_ko.json"}):
|
||||
result = runner.invoke(main, ["-e", patroni_api.endpoint, "cluster_has_leader"])
|
||||
assert result.exit_code == 2
|
||||
result = runner.invoke(main, ["-e", patroni_api.endpoint, "cluster_has_leader"])
|
||||
assert (
|
||||
result.stdout
|
||||
== "CLUSTERHASLEADER CRITICAL - The cluster has no running leader. | has_leader=0;;@0\n"
|
||||
== "CLUSTERHASLEADER CRITICAL - The cluster has no running leader or the standby leader is in archive recovery. | has_leader=0;;@0 is_leader=0 is_standby_leader=0 is_standby_leader_in_arc_rec=0;@1:1\n"
|
||||
)
|
||||
assert result.exit_code == 2
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def cluster_has_leader_ko_standby_leader(
|
||||
patroni_api: PatroniAPI, old_replica_state: bool, datadir: Path, tmp_path: Path
|
||||
) -> Iterator[None]:
|
||||
cluster_path: Union[str, Path] = "cluster_has_leader_ko_standby_leader.json"
|
||||
patroni_path = "cluster_has_replica_patroni_verion_3.1.0.json"
|
||||
if old_replica_state:
|
||||
cluster_path = cluster_api_set_replica_running(datadir / cluster_path, tmp_path)
|
||||
patroni_path = "cluster_has_replica_patroni_verion_3.0.0.json"
|
||||
with patroni_api.routes({"cluster": cluster_path, "patroni": patroni_path}):
|
||||
yield None
|
||||
|
||||
|
||||
@pytest.mark.usefixtures("cluster_has_leader_ko_standby_leader")
|
||||
def test_cluster_has_leader_ko_standby_leader(
|
||||
runner: CliRunner, patroni_api: PatroniAPI
|
||||
) -> None:
|
||||
result = runner.invoke(main, ["-e", patroni_api.endpoint, "cluster_has_leader"])
|
||||
assert (
|
||||
result.stdout
|
||||
== "CLUSTERHASLEADER CRITICAL - The cluster has no running leader or the standby leader is in archive recovery. | has_leader=0;;@0 is_leader=0 is_standby_leader=0 is_standby_leader_in_arc_rec=0;@1:1\n"
|
||||
)
|
||||
assert result.exit_code == 2
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def cluster_has_leader_ko_standby_leader_archiving(
|
||||
patroni_api: PatroniAPI, old_replica_state: bool, datadir: Path, tmp_path: Path
|
||||
) -> Iterator[None]:
|
||||
cluster_path: Union[
|
||||
str, Path
|
||||
] = "cluster_has_leader_ko_standby_leader_archiving.json"
|
||||
patroni_path = "cluster_has_replica_patroni_verion_3.1.0.json"
|
||||
if old_replica_state:
|
||||
cluster_path = cluster_api_set_replica_running(datadir / cluster_path, tmp_path)
|
||||
patroni_path = "cluster_has_replica_patroni_verion_3.0.0.json"
|
||||
with patroni_api.routes({"cluster": cluster_path, "patroni": patroni_path}):
|
||||
yield None
|
||||
|
||||
|
||||
@pytest.mark.usefixtures("cluster_has_leader_ko_standby_leader_archiving")
|
||||
def test_cluster_has_leader_ko_standby_leader_archiving(
|
||||
runner: CliRunner, patroni_api: PatroniAPI
|
||||
) -> None:
|
||||
result = runner.invoke(main, ["-e", patroni_api.endpoint, "cluster_has_leader"])
|
||||
assert (
|
||||
result.stdout
|
||||
== "CLUSTERHASLEADER WARNING - The cluster has no running leader or the standby leader is in archive recovery. | has_leader=1;;@0 is_leader=0 is_standby_leader=1 is_standby_leader_in_arc_rec=1;@1:1\n"
|
||||
)
|
||||
assert result.exit_code == 1
|
||||
|
|
Loading…
Reference in a new issue