Fix the cluster_has_leader service for standby clusters

Before this patch, we checked that the expected standby leader state
was `running` for all versions of Patroni.

With this patch:
* For Patroni < 3.0.4, standby leaders are expected to be in the `running` state.
* For Patroni >= 3.0.4, standby leaders can be in the `streaming` or `in
  archive recovery` state. We raise a warning for the latter.

The tests were modified to account for this.
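The new branch in `cluster.py` (see the diff below) relies on a
`has_detailed_states()` helper to tell the two cases apart. That helper is not
part of this diff; the following is a minimal sketch of such a version gate,
assuming the Patroni version is available as a plain string (for example from
the `version` field of the `/patroni` endpoint):

```python
# Hypothetical sketch, not the actual check_patroni implementation.
# Patroni >= 3.0.4 reports detailed standby-leader states ("streaming",
# "in archive recovery"); older versions only report "running".
from typing import List


def has_detailed_states(patroni_version: str) -> bool:
    """Return True when this Patroni version reports detailed states."""
    # Turn "3.0.4" into (3, 0, 4) and compare tuples lexicographically.
    parts = tuple(int(p) for p in patroni_version.split(".")[:3])
    return parts >= (3, 0, 4)


def valid_standby_leader_states(patroni_version: str) -> List[str]:
    if has_detailed_states(patroni_version):
        return ["streaming", "in archive recovery"]
    return ["running"]


assert valid_standby_leader_states("3.0.3") == ["running"]
assert valid_standby_leader_states("3.1.0") == ["streaming", "in archive recovery"]
```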

Co-authored-by: Denis Laxalde <denis@laxalde.org>
Authored by benoit on 2023-11-08 17:50:32 +01:00; committed by Benoit.
parent ffc330f96e
commit 46db3e2d15
8 changed files with 249 additions and 26 deletions

CHANGELOG.md

@@ -6,6 +6,7 @@
 * Add the timeline in the `cluster_has_replica` perfstats. (#50)
 * Add a mention about shell completion support and shell versions in the doc. (#53)
+* Add the leader type and whether it's archiving to the `cluster_has_leader` perfstats. (#58)
 ### Fixed
@@ -13,6 +14,7 @@
   version 2.25 and higher.
 * Fix what `cluster_has_replica` deems a healthy replica. (#50, reported by @mbanck)
 * Fix `cluster_has_replica` to display perfstats for replicas whenever it's possible (healthy or not). (#50)
+* Fix `cluster_has_leader` to correctly check for standby leaders. (#58, reported by @mbanck)
 ### Misc

README.md

@@ -176,11 +176,27 @@ Usage: check_patroni cluster_has_leader [OPTIONS]
   This check applies to any kind of leaders including standby leaders.
+  A leader is a node with the "leader" role and a "running" state.
+  A standby leader is a node with a "standby_leader" role and a "streaming" or
+  "in archive recovery" state. Please note that log shipping could be stuck
+  because the WAL files are not available or applicable. Patroni doesn't provide
+  information about the origin cluster (timeline or lag), so we cannot check
+  if there is a problem in that particular case. That's why we issue a warning
+  when the node is "in archive recovery". We suggest using other supervision
+  tools to do this (e.g. check_pgactivity).
   Check:
   * `OK`: if there is a leader node.
-  * `CRITICAL`: otherwise
+  * `WARNING`: if there is a standby leader in archive recovery.
+  * `CRITICAL`: otherwise.
-  Perfdata: `has_leader` is 1 if there is a leader node, 0 otherwise
+  Perfdata:
+  * `has_leader` is 1 if there is any kind of leader node, 0 otherwise
+  * `is_standby_leader_in_arc_rec` is 1 if the standby leader node is "in
+    archive recovery", 0 otherwise
+  * `is_standby_leader` is 1 if there is a standby leader node, 0 otherwise
+  * `is_leader` is 1 if there is a "classical" leader node, 0 otherwise
 Options:
   --help  Show this message and exit.
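For reference, the check can be driven the same way the test suite below drives
it, via click's `CliRunner`; the endpoint here is a hypothetical local Patroni
instance:

```python
# Usage sketch mirroring the tests further down; the endpoint is illustrative.
from click.testing import CliRunner

from check_patroni.cli import main

runner = CliRunner()
result = runner.invoke(main, ["-e", "http://127.0.0.1:8008", "cluster_has_leader"])
# Typical OK output (see the expected strings in the tests below):
# CLUSTERHASLEADER OK - The cluster has a running leader. | has_leader=1;;@0 ...
print(result.stdout, result.exit_code)
```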

check_patroni/cli.py

@@ -285,17 +285,38 @@ def cluster_has_leader(ctx: click.Context) -> None:
     This check applies to any kind of leaders including standby leaders.
+    A leader is a node with the "leader" role and a "running" state.
+    A standby leader is a node with a "standby_leader" role and a "streaming"
+    or "in archive recovery" state. Please note that log shipping could be
+    stuck because the WAL files are not available or applicable. Patroni
+    doesn't provide information about the origin cluster (timeline or lag),
+    so we cannot check if there is a problem in that particular case. That's
+    why we issue a warning when the node is "in archive recovery". We suggest
+    using other supervision tools to do this (e.g. check_pgactivity).
     \b
     Check:
     * `OK`: if there is a leader node.
-    * `CRITICAL`: otherwise
+    * `WARNING`: if there is a standby leader in archive recovery.
+    * `CRITICAL`: otherwise.
     \b
+    Perfdata:
+    * `has_leader` is 1 if there is any kind of leader node, 0 otherwise
+    * `is_standby_leader_in_arc_rec` is 1 if the standby leader node is "in
+      archive recovery", 0 otherwise
+    * `is_standby_leader` is 1 if there is a standby leader node, 0 otherwise
+    * `is_leader` is 1 if there is a "classical" leader node, 0 otherwise
-    Perfdata: `has_leader` is 1 if there is a leader node, 0 otherwise
     """
     check = nagiosplugin.Check()
     check.add(
         ClusterHasLeader(ctx.obj.connection_info),
         nagiosplugin.ScalarContext("has_leader", None, "@0:0"),
+        nagiosplugin.ScalarContext("is_standby_leader_in_arc_rec", "@1:1", None),
+        nagiosplugin.ScalarContext("is_leader", None, None),
+        nagiosplugin.ScalarContext("is_standby_leader", None, None),
         ClusterHasLeaderSummary(),
     )
     check.main(verbose=ctx.obj.verbose, timeout=ctx.obj.timeout)
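The `ScalarContext` thresholds above use standard Nagios range syntax: a
leading `@` inverts the range, so `"@0:0"` means "alert when the value is
inside 0..0" (i.e. exactly 0) and `"@1:1"` means "alert when the value is
exactly 1". A minimal sketch of how they classify values, assuming the
`nagiosplugin` package is installed:

```python
# Demonstrates how the threshold ranges above classify metric values.
import nagiosplugin

# critical when has_leader == 0
has_leader_ctx = nagiosplugin.ScalarContext("has_leader", None, "@0:0")
# warning when is_standby_leader_in_arc_rec == 1
arc_rec_ctx = nagiosplugin.ScalarContext("is_standby_leader_in_arc_rec", "@1:1", None)

print(has_leader_ctx.evaluate(nagiosplugin.Metric("has_leader", 0), None).state)  # critical
print(has_leader_ctx.evaluate(nagiosplugin.Metric("has_leader", 1), None).state)  # ok
print(arc_rec_ctx.evaluate(nagiosplugin.Metric("is_standby_leader_in_arc_rec", 1), None).state)  # warning
```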

check_patroni/cluster.py

@@ -52,19 +52,42 @@ class ClusterHasLeader(PatroniResource):
         item_dict = self.rest_api("cluster")
         is_leader_found = False
+        is_standby_leader_found = False
+        is_standby_leader_in_arc_rec = False
         for member in item_dict["members"]:
-            if (
-                member["role"] in ("leader", "standby_leader")
-                and member["state"] == "running"
-            ):
+            if member["role"] == "leader" and member["state"] == "running":
                 is_leader_found = True
                 break
+            if member["role"] == "standby_leader":
+                if member["state"] not in ["streaming", "in archive recovery"]:
+                    # for patroni >= 3.0.4 any state would be wrong
+                    # for patroni < 3.0.4 a state different from running would be wrong
+                    if self.has_detailed_states() or member["state"] != "running":
+                        continue
+                if member["state"] in ["in archive recovery"]:
+                    is_standby_leader_in_arc_rec = True
+                is_standby_leader_found = True
+                break
         return [
             nagiosplugin.Metric(
                 "has_leader",
+                1 if is_leader_found or is_standby_leader_found else 0,
+            ),
+            nagiosplugin.Metric(
+                "is_standby_leader_in_arc_rec",
+                1 if is_standby_leader_in_arc_rec else 0,
+            ),
+            nagiosplugin.Metric(
+                "is_standby_leader",
+                1 if is_standby_leader_found else 0,
+            ),
+            nagiosplugin.Metric(
+                "is_leader",
                 1 if is_leader_found else 0,
-            )
+            ),
         ]
@@ -74,7 +97,7 @@ class ClusterHasLeaderSummary(nagiosplugin.Summary):
     @handle_unknown
     def problem(self, results: nagiosplugin.Result) -> str:
-        return "The cluster has no running leader."
+        return "The cluster has no running leader or the standby leader is in archive recovery."
 class ClusterHasReplica(PatroniResource):
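Read linearly, the loop above accepts a classical leader only in the "running"
state, and accepts a standby leader in "streaming" or "in archive recovery"
(flagging the latter), falling back to "running" when the Patroni version
predates detailed states. Here is the same decision table extracted as a
self-contained function for illustration; `detailed_states` stands in for
`self.has_detailed_states()`, and this is a sketch, not the project's API:

```python
# Illustrative restatement of the classification loop above.
from typing import Dict, List, Tuple


def classify_leader(
    members: List[Dict[str, str]], detailed_states: bool
) -> Tuple[bool, bool, bool]:
    """Return (is_leader, is_standby_leader, standby_leader_in_arc_rec)."""
    is_leader = is_standby_leader = in_arc_rec = False
    for member in members:
        if member["role"] == "leader" and member["state"] == "running":
            is_leader = True
            break
        if member["role"] == "standby_leader":
            if member["state"] not in ("streaming", "in archive recovery"):
                # Patroni >= 3.0.4: only the two detailed states are healthy.
                # Patroni < 3.0.4: only "running" is healthy.
                if detailed_states or member["state"] != "running":
                    continue
            if member["state"] == "in archive recovery":
                in_arc_rec = True
            is_standby_leader = True
            break
    return is_leader, is_standby_leader, in_arc_rec


# A "stopped" standby leader (first JSON fixture below) is never healthy:
assert classify_leader([{"role": "standby_leader", "state": "stopped"}], True) == (False, False, False)
# "in archive recovery" is accepted but flagged, which drives the WARNING:
assert classify_leader([{"role": "standby_leader", "state": "in archive recovery"}], True) == (False, True, True)
# With Patroni < 3.0.4, "running" is the only healthy standby-leader state:
assert classify_leader([{"role": "standby_leader", "state": "running"}], False) == (False, True, False)
```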

cluster_has_leader_ko_standby_leader.json (new file)

@@ -0,0 +1,33 @@
+{
+  "members": [
+    {
+      "name": "srv1",
+      "role": "standby_leader",
+      "state": "stopped",
+      "api_url": "https://10.20.199.3:8008/patroni",
+      "host": "10.20.199.3",
+      "port": 5432,
+      "timeline": 51
+    },
+    {
+      "name": "srv2",
+      "role": "replica",
+      "state": "streaming",
+      "api_url": "https://10.20.199.4:8008/patroni",
+      "host": "10.20.199.4",
+      "port": 5432,
+      "timeline": 51,
+      "lag": 0
+    },
+    {
+      "name": "srv3",
+      "role": "replica",
+      "state": "streaming",
+      "api_url": "https://10.20.199.5:8008/patroni",
+      "host": "10.20.199.5",
+      "port": 5432,
+      "timeline": 51,
+      "lag": 0
+    }
+  ]
+}

cluster_has_leader_ko_standby_leader_archiving.json (new file)

@@ -0,0 +1,33 @@
+{
+  "members": [
+    {
+      "name": "srv1",
+      "role": "standby_leader",
+      "state": "in archive recovery",
+      "api_url": "https://10.20.199.3:8008/patroni",
+      "host": "10.20.199.3",
+      "port": 5432,
+      "timeline": 51
+    },
+    {
+      "name": "srv2",
+      "role": "replica",
+      "state": "streaming",
+      "api_url": "https://10.20.199.4:8008/patroni",
+      "host": "10.20.199.4",
+      "port": 5432,
+      "timeline": 51,
+      "lag": 0
+    },
+    {
+      "name": "srv3",
+      "role": "replica",
+      "state": "streaming",
+      "api_url": "https://10.20.199.5:8008/patroni",
+      "host": "10.20.199.5",
+      "port": 5432,
+      "timeline": 51,
+      "lag": 0
+    }
+  ]
+}

cluster_has_leader_ok_standby_leader.json

@@ -3,7 +3,7 @@
     {
       "name": "srv1",
       "role": "standby_leader",
-      "state": "running",
+      "state": "streaming",
       "api_url": "https://10.20.199.3:8008/patroni",
       "host": "10.20.199.3",
       "port": 5432,

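These JSON fixtures mimic the payload of Patroni's `/cluster` REST endpoint,
which is what `rest_api("cluster")` fetches in `cluster.py` above. A
stand-alone sketch of such a call, with a hypothetical host and port and
assuming the `requests` package:

```python
# Fetch the member list the check consumes; host and port are illustrative.
import requests

resp = requests.get("http://127.0.0.1:8008/cluster", timeout=5)
resp.raise_for_status()
for member in resp.json()["members"]:
    print(member["name"], member["role"], member["state"])
# e.g.: srv1 standby_leader streaming
```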
tests/test_cluster_has_leader.py

@@ -1,37 +1,132 @@
+from pathlib import Path
+from typing import Iterator, Union
+import pytest
 from click.testing import CliRunner
 from check_patroni.cli import main
-from . import PatroniAPI
+from . import PatroniAPI, cluster_api_set_replica_running
+@pytest.fixture
+def cluster_has_leader_ok(
+    patroni_api: PatroniAPI, old_replica_state: bool, datadir: Path, tmp_path: Path
+) -> Iterator[None]:
+    cluster_path: Union[str, Path] = "cluster_has_leader_ok.json"
+    patroni_path = "cluster_has_replica_patroni_verion_3.1.0.json"
+    if old_replica_state:
+        cluster_path = cluster_api_set_replica_running(datadir / cluster_path, tmp_path)
+        patroni_path = "cluster_has_replica_patroni_verion_3.0.0.json"
+    with patroni_api.routes({"cluster": cluster_path, "patroni": patroni_path}):
+        yield None
+@pytest.mark.usefixtures("cluster_has_leader_ok")
 def test_cluster_has_leader_ok(runner: CliRunner, patroni_api: PatroniAPI) -> None:
-    with patroni_api.routes({"cluster": "cluster_has_leader_ok.json"}):
-        result = runner.invoke(main, ["-e", patroni_api.endpoint, "cluster_has_leader"])
-    assert result.exit_code == 0
+    result = runner.invoke(main, ["-e", patroni_api.endpoint, "cluster_has_leader"])
     assert (
         result.stdout
-        == "CLUSTERHASLEADER OK - The cluster has a running leader. | has_leader=1;;@0\n"
+        == "CLUSTERHASLEADER OK - The cluster has a running leader. | has_leader=1;;@0 is_leader=1 is_standby_leader=0 is_standby_leader_in_arc_rec=0;@1:1\n"
     )
+    assert result.exit_code == 0
+@pytest.fixture
+def cluster_has_leader_ok_standby_leader(
+    patroni_api: PatroniAPI, old_replica_state: bool, datadir: Path, tmp_path: Path
+) -> Iterator[None]:
+    cluster_path: Union[str, Path] = "cluster_has_leader_ok_standby_leader.json"
+    patroni_path = "cluster_has_replica_patroni_verion_3.1.0.json"
+    if old_replica_state:
+        cluster_path = cluster_api_set_replica_running(datadir / cluster_path, tmp_path)
+        patroni_path = "cluster_has_replica_patroni_verion_3.0.0.json"
+    with patroni_api.routes({"cluster": cluster_path, "patroni": patroni_path}):
+        yield None
+@pytest.mark.usefixtures("cluster_has_leader_ok_standby_leader")
 def test_cluster_has_leader_ok_standby_leader(
     runner: CliRunner, patroni_api: PatroniAPI
 ) -> None:
-    with patroni_api.routes({"cluster": "cluster_has_leader_ok_standby_leader.json"}):
-        result = runner.invoke(main, ["-e", patroni_api.endpoint, "cluster_has_leader"])
+    result = runner.invoke(main, ["-e", patroni_api.endpoint, "cluster_has_leader"])
+    assert (
+        result.stdout
+        == "CLUSTERHASLEADER OK - The cluster has a running leader. | has_leader=1;;@0 is_leader=0 is_standby_leader=1 is_standby_leader_in_arc_rec=0;@1:1\n"
+    )
     assert result.exit_code == 0
-    assert (
-        result.stdout
-        == "CLUSTERHASLEADER OK - The cluster has a running leader. | has_leader=1;;@0\n"
-    )
+@pytest.fixture
+def cluster_has_leader_ko(
+    patroni_api: PatroniAPI, old_replica_state: bool, datadir: Path, tmp_path: Path
+) -> Iterator[None]:
+    cluster_path: Union[str, Path] = "cluster_has_leader_ko.json"
+    patroni_path = "cluster_has_replica_patroni_verion_3.1.0.json"
+    if old_replica_state:
+        cluster_path = cluster_api_set_replica_running(datadir / cluster_path, tmp_path)
+        patroni_path = "cluster_has_replica_patroni_verion_3.0.0.json"
+    with patroni_api.routes({"cluster": cluster_path, "patroni": patroni_path}):
+        yield None
+@pytest.mark.usefixtures("cluster_has_leader_ko")
 def test_cluster_has_leader_ko(runner: CliRunner, patroni_api: PatroniAPI) -> None:
-    with patroni_api.routes({"cluster": "cluster_has_leader_ko.json"}):
-        result = runner.invoke(main, ["-e", patroni_api.endpoint, "cluster_has_leader"])
-    assert result.exit_code == 2
+    result = runner.invoke(main, ["-e", patroni_api.endpoint, "cluster_has_leader"])
     assert (
         result.stdout
-        == "CLUSTERHASLEADER CRITICAL - The cluster has no running leader. | has_leader=0;;@0\n"
+        == "CLUSTERHASLEADER CRITICAL - The cluster has no running leader or the standby leader is in archive recovery. | has_leader=0;;@0 is_leader=0 is_standby_leader=0 is_standby_leader_in_arc_rec=0;@1:1\n"
     )
+    assert result.exit_code == 2
+@pytest.fixture
+def cluster_has_leader_ko_standby_leader(
+    patroni_api: PatroniAPI, old_replica_state: bool, datadir: Path, tmp_path: Path
+) -> Iterator[None]:
+    cluster_path: Union[str, Path] = "cluster_has_leader_ko_standby_leader.json"
+    patroni_path = "cluster_has_replica_patroni_verion_3.1.0.json"
+    if old_replica_state:
+        cluster_path = cluster_api_set_replica_running(datadir / cluster_path, tmp_path)
+        patroni_path = "cluster_has_replica_patroni_verion_3.0.0.json"
+    with patroni_api.routes({"cluster": cluster_path, "patroni": patroni_path}):
+        yield None
+@pytest.mark.usefixtures("cluster_has_leader_ko_standby_leader")
+def test_cluster_has_leader_ko_standby_leader(
+    runner: CliRunner, patroni_api: PatroniAPI
+) -> None:
+    result = runner.invoke(main, ["-e", patroni_api.endpoint, "cluster_has_leader"])
+    assert (
+        result.stdout
+        == "CLUSTERHASLEADER CRITICAL - The cluster has no running leader or the standby leader is in archive recovery. | has_leader=0;;@0 is_leader=0 is_standby_leader=0 is_standby_leader_in_arc_rec=0;@1:1\n"
+    )
+    assert result.exit_code == 2
+@pytest.fixture
+def cluster_has_leader_ko_standby_leader_archiving(
+    patroni_api: PatroniAPI, old_replica_state: bool, datadir: Path, tmp_path: Path
+) -> Iterator[None]:
+    cluster_path: Union[
+        str, Path
+    ] = "cluster_has_leader_ko_standby_leader_archiving.json"
+    patroni_path = "cluster_has_replica_patroni_verion_3.1.0.json"
+    if old_replica_state:
+        cluster_path = cluster_api_set_replica_running(datadir / cluster_path, tmp_path)
+        patroni_path = "cluster_has_replica_patroni_verion_3.0.0.json"
+    with patroni_api.routes({"cluster": cluster_path, "patroni": patroni_path}):
+        yield None
+@pytest.mark.usefixtures("cluster_has_leader_ko_standby_leader_archiving")
+def test_cluster_has_leader_ko_standby_leader_archiving(
+    runner: CliRunner, patroni_api: PatroniAPI
+) -> None:
+    result = runner.invoke(main, ["-e", patroni_api.endpoint, "cluster_has_leader"])
+    assert (
+        result.stdout
+        == "CLUSTERHASLEADER WARNING - The cluster has no running leader or the standby leader is in archive recovery. | has_leader=1;;@0 is_leader=0 is_standby_leader=1 is_standby_leader_in_arc_rec=1;@1:1\n"
+    )
+    assert result.exit_code == 1
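The fixtures above also depend on `cluster_api_set_replica_running`, imported
from the test package but not shown in this commit. From its call sites, it
apparently rewrites a fixture's detailed states back to "running" and returns
the path of the rewritten file, so each scenario can be replayed as an
old-style (pre-3.0.4) Patroni cluster. A plausible sketch; the real helper may
differ, for instance in which roles it touches:

```python
# Plausible reconstruction from the call sites above, not the actual helper.
import json
from pathlib import Path


def cluster_api_set_replica_running(in_json: Path, target_dir: Path) -> Path:
    """Rewrite a cluster fixture as Patroni < 3.0.4 would report it."""
    cluster = json.loads(in_json.read_text())
    for node in cluster["members"]:
        # Old Patroni has no detailed states: healthy nodes are just "running".
        if node["state"] in ("streaming", "in archive recovery"):
            node["state"] = "running"
    out_json = target_dir / in_json.name
    out_json.write_text(json.dumps(cluster))
    return out_json
```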