From 46db3e2d1593a225a770fc6cd3b286b6c661121d Mon Sep 17 00:00:00 2001
From: benoit
Date: Wed, 8 Nov 2023 17:50:32 +0100
Subject: [PATCH] Fix the cluster_has_leader service for standby clusters

Before this patch, we checked that the expected standby leader state was
`running` for all versions of Patroni. With this patch, for:

* Patroni < 3.0.4, standby leaders are in `running` state.
* Patroni >= 3.0.4, standby leaders can be in `streaming` or
  `in archive recovery` state. We raise a warning for the latter.

The tests were modified to account for this.

Co-authored-by: Denis Laxalde
---
 CHANGELOG.md                                  |   2 +
 README.md                                     |  20 ++-
 check_patroni/cli.py                          |  25 +++-
 check_patroni/cluster.py                      |  35 ++++-
 .../cluster_has_leader_ko_standby_leader.json |  33 +++++
 ...as_leader_ko_standby_leader_archiving.json |  33 +++++
 .../cluster_has_leader_ok_standby_leader.json |   2 +-
 tests/test_cluster_has_leader.py              | 125 +++++++++++++++---
 8 files changed, 249 insertions(+), 26 deletions(-)
 create mode 100644 tests/json/cluster_has_leader_ko_standby_leader.json
 create mode 100644 tests/json/cluster_has_leader_ko_standby_leader_archiving.json

diff --git a/CHANGELOG.md b/CHANGELOG.md
index c6428e0..37f2ea7 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -6,6 +6,7 @@
 
 * Add the timeline in the `cluster_has_replica` perfstats. (#50)
 * Add a mention about shell completion support and shell versions in the doc. (#53)
+* Add the leader type and whether it's archiving to the `cluster_has_leader` perfstats. (#58)
 
 ### Fixed
 
@@ -13,6 +14,7 @@
   version 2.25 and higher.
 * Fix what `cluster_has_replica` deems a healthy replica. (#50, reported by @mbanck)
 * Fix `cluster_has_replica` to display perfstats for replicas whenever it's possible (healthy or not). (#50)
+* Fix `cluster_has_leader` to correctly check for standby leaders. (#58, reported by @mbanck)
 
 ### Misc
 
diff --git a/README.md b/README.md
index 20268e9..37f3869 100644
--- a/README.md
+++ b/README.md
@@ -176,11 +176,27 @@ Usage: check_patroni cluster_has_leader [OPTIONS]
 
   This check applies to any kind of leaders including standby leaders.
 
+  A leader is a node with the "leader" role and a "running" state.
+
+  A standby leader is a node with a "standby_leader" role and a "streaming" or
+  "in archive recovery" state. Please note that log shipping could be stuck
+  because the WAL files are not available or applicable. Patroni doesn't
+  provide information about the origin cluster (timeline or lag), so we cannot
+  check if there is a problem in that particular case. That's why we issue a
+  warning when the node is "in archive recovery". We suggest using other
+  supervision tools to do this (e.g. check_pgactivity).
+
   Check:
   * `OK`: if there is a leader node.
-  * `CRITICAL`: otherwise
+  * `WARNING`: if there is a standby leader in archive recovery.
+  * `CRITICAL`: otherwise.
 
-  Perfdata: `has_leader` is 1 if there is a leader node, 0 otherwise
+  Perfdata:
+  * `has_leader` is 1 if there is any kind of leader node, 0 otherwise
+  * `is_standby_leader_in_arc_rec` is 1 if the standby leader node is "in
+    archive recovery", 0 otherwise
+  * `is_standby_leader` is 1 if there is a standby leader node, 0 otherwise
+  * `is_leader` is 1 if there is a "classical" leader node, 0 otherwise
 
 Options:
   --help  Show this message and exit.

diff --git a/check_patroni/cli.py b/check_patroni/cli.py
index d69569f..fafcb02 100644
--- a/check_patroni/cli.py
+++ b/check_patroni/cli.py
@@ -285,17 +285,38 @@ def cluster_has_leader(ctx: click.Context) -> None:
 
     This check applies to any kind of leaders including standby leaders.
 
+    A leader is a node with the "leader" role and a "running" state.
+
+    A standby leader is a node with a "standby_leader" role and a "streaming"
+    or "in archive recovery" state. Please note that log shipping could be
+    stuck because the WAL files are not available or applicable. Patroni
+    doesn't provide information about the origin cluster (timeline or lag),
+    so we cannot check if there is a problem in that particular case. That's
+    why we issue a warning when the node is "in archive recovery". We suggest
+    using other supervision tools to do this (e.g. check_pgactivity).
+
     \b
     Check:
     * `OK`: if there is a leader node.
-    * `CRITICAL`: otherwise
+    * `WARNING`: if there is a standby leader in archive recovery.
+    * `CRITICAL`: otherwise.
+
+    \b
+    Perfdata:
+    * `has_leader` is 1 if there is any kind of leader node, 0 otherwise
+    * `is_standby_leader_in_arc_rec` is 1 if the standby leader node is "in
+      archive recovery", 0 otherwise
+    * `is_standby_leader` is 1 if there is a standby leader node, 0 otherwise
+    * `is_leader` is 1 if there is a "classical" leader node, 0 otherwise
 
-    Perfdata: `has_leader` is 1 if there is a leader node, 0 otherwise
     """
     check = nagiosplugin.Check()
     check.add(
         ClusterHasLeader(ctx.obj.connection_info),
         nagiosplugin.ScalarContext("has_leader", None, "@0:0"),
+        nagiosplugin.ScalarContext("is_standby_leader_in_arc_rec", "@1:1", None),
+        nagiosplugin.ScalarContext("is_leader", None, None),
+        nagiosplugin.ScalarContext("is_standby_leader", None, None),
         ClusterHasLeaderSummary(),
     )
     check.main(verbose=ctx.obj.verbose, timeout=ctx.obj.timeout)

diff --git a/check_patroni/cluster.py b/check_patroni/cluster.py
index a7891b8..ff75f52 100644
--- a/check_patroni/cluster.py
+++ b/check_patroni/cluster.py
@@ -52,19 +52,42 @@ class ClusterHasLeader(PatroniResource):
         item_dict = self.rest_api("cluster")
 
         is_leader_found = False
+        is_standby_leader_found = False
+        is_standby_leader_in_arc_rec = False
         for member in item_dict["members"]:
-            if (
-                member["role"] in ("leader", "standby_leader")
-                and member["state"] == "running"
-            ):
+            if member["role"] == "leader" and member["state"] == "running":
                 is_leader_found = True
                 break
 
+            if member["role"] == "standby_leader":
+                if member["state"] not in ["streaming", "in archive recovery"]:
+                    # For Patroni >= 3.0.4, any other state is wrong.
+                    # For Patroni < 3.0.4, any state other than "running" is wrong.
+                    if self.has_detailed_states() or member["state"] != "running":
+                        continue
+
+                if member["state"] in ["in archive recovery"]:
+                    is_standby_leader_in_arc_rec = True
+
+                is_standby_leader_found = True
+                break
+
         return [
             nagiosplugin.Metric(
                 "has_leader",
+                1 if is_leader_found or is_standby_leader_found else 0,
+            ),
+            nagiosplugin.Metric(
+                "is_standby_leader_in_arc_rec",
+                1 if is_standby_leader_in_arc_rec else 0,
+            ),
+            nagiosplugin.Metric(
+                "is_standby_leader",
+                1 if is_standby_leader_found else 0,
+            ),
+            nagiosplugin.Metric(
+                "is_leader",
                 1 if is_leader_found else 0,
-            )
+            ),
         ]
 
 
@@ -74,7 +97,7 @@ class ClusterHasLeaderSummary(nagiosplugin.Summary):
 
     @handle_unknown
     def problem(self, results: nagiosplugin.Result) -> str:
-        return "The cluster has no running leader."
+        return "The cluster has no running leader or the standby leader is in archive recovery."
 
 
 class ClusterHasReplica(PatroniResource):
diff --git a/tests/json/cluster_has_leader_ko_standby_leader.json b/tests/json/cluster_has_leader_ko_standby_leader.json
new file mode 100644
index 0000000..3302838
--- /dev/null
+++ b/tests/json/cluster_has_leader_ko_standby_leader.json
@@ -0,0 +1,33 @@
+{
+  "members": [
+    {
+      "name": "srv1",
+      "role": "standby_leader",
+      "state": "stopped",
+      "api_url": "https://10.20.199.3:8008/patroni",
+      "host": "10.20.199.3",
+      "port": 5432,
+      "timeline": 51
+    },
+    {
+      "name": "srv2",
+      "role": "replica",
+      "state": "streaming",
+      "api_url": "https://10.20.199.4:8008/patroni",
+      "host": "10.20.199.4",
+      "port": 5432,
+      "timeline": 51,
+      "lag": 0
+    },
+    {
+      "name": "srv3",
+      "role": "replica",
+      "state": "streaming",
+      "api_url": "https://10.20.199.5:8008/patroni",
+      "host": "10.20.199.5",
+      "port": 5432,
+      "timeline": 51,
+      "lag": 0
+    }
+  ]
+}
diff --git a/tests/json/cluster_has_leader_ko_standby_leader_archiving.json b/tests/json/cluster_has_leader_ko_standby_leader_archiving.json
new file mode 100644
index 0000000..a91ea2d
--- /dev/null
+++ b/tests/json/cluster_has_leader_ko_standby_leader_archiving.json
@@ -0,0 +1,33 @@
+{
+  "members": [
+    {
+      "name": "srv1",
+      "role": "standby_leader",
+      "state": "in archive recovery",
+      "api_url": "https://10.20.199.3:8008/patroni",
+      "host": "10.20.199.3",
+      "port": 5432,
+      "timeline": 51
+    },
+    {
+      "name": "srv2",
+      "role": "replica",
+      "state": "streaming",
+      "api_url": "https://10.20.199.4:8008/patroni",
+      "host": "10.20.199.4",
+      "port": 5432,
+      "timeline": 51,
+      "lag": 0
+    },
+    {
+      "name": "srv3",
+      "role": "replica",
+      "state": "streaming",
+      "api_url": "https://10.20.199.5:8008/patroni",
+      "host": "10.20.199.5",
+      "port": 5432,
+      "timeline": 51,
+      "lag": 0
+    }
+  ]
+}
diff --git a/tests/json/cluster_has_leader_ok_standby_leader.json b/tests/json/cluster_has_leader_ok_standby_leader.json
index baca225..b5c4844 100644
--- a/tests/json/cluster_has_leader_ok_standby_leader.json
+++ b/tests/json/cluster_has_leader_ok_standby_leader.json
@@ -3,7 +3,7 @@
     {
       "name": "srv1",
       "role": "standby_leader",
-      "state": "running",
+      "state": "streaming",
       "api_url": "https://10.20.199.3:8008/patroni",
       "host": "10.20.199.3",
       "port": 5432,
diff --git a/tests/test_cluster_has_leader.py b/tests/test_cluster_has_leader.py
index 842399e..1421ec5 100644
--- a/tests/test_cluster_has_leader.py
+++ b/tests/test_cluster_has_leader.py
@@ -1,37 +1,132 @@
+from pathlib import Path
+from typing import Iterator, Union
+
+import pytest
 from click.testing import CliRunner
 
 from check_patroni.cli import main
 
-from . import PatroniAPI
+from . import PatroniAPI, cluster_api_set_replica_running
+
+
+@pytest.fixture
+def cluster_has_leader_ok(
+    patroni_api: PatroniAPI, old_replica_state: bool, datadir: Path, tmp_path: Path
+) -> Iterator[None]:
+    cluster_path: Union[str, Path] = "cluster_has_leader_ok.json"
+    patroni_path = "cluster_has_replica_patroni_verion_3.1.0.json"
+    if old_replica_state:
+        cluster_path = cluster_api_set_replica_running(datadir / cluster_path, tmp_path)
+        patroni_path = "cluster_has_replica_patroni_verion_3.0.0.json"
+    with patroni_api.routes({"cluster": cluster_path, "patroni": patroni_path}):
+        yield None
+
+
+@pytest.mark.usefixtures("cluster_has_leader_ok")
 def test_cluster_has_leader_ok(runner: CliRunner, patroni_api: PatroniAPI) -> None:
-    with patroni_api.routes({"cluster": "cluster_has_leader_ok.json"}):
-        result = runner.invoke(main, ["-e", patroni_api.endpoint, "cluster_has_leader"])
-    assert result.exit_code == 0
+    result = runner.invoke(main, ["-e", patroni_api.endpoint, "cluster_has_leader"])
     assert (
         result.stdout
-        == "CLUSTERHASLEADER OK - The cluster has a running leader. | has_leader=1;;@0\n"
+        == "CLUSTERHASLEADER OK - The cluster has a running leader. | has_leader=1;;@0 is_leader=1 is_standby_leader=0 is_standby_leader_in_arc_rec=0;@1:1\n"
     )
+    assert result.exit_code == 0
 
 
+@pytest.fixture
+def cluster_has_leader_ok_standby_leader(
+    patroni_api: PatroniAPI, old_replica_state: bool, datadir: Path, tmp_path: Path
+) -> Iterator[None]:
+    cluster_path: Union[str, Path] = "cluster_has_leader_ok_standby_leader.json"
+    patroni_path = "cluster_has_replica_patroni_verion_3.1.0.json"
+    if old_replica_state:
+        cluster_path = cluster_api_set_replica_running(datadir / cluster_path, tmp_path)
+        patroni_path = "cluster_has_replica_patroni_verion_3.0.0.json"
+    with patroni_api.routes({"cluster": cluster_path, "patroni": patroni_path}):
+        yield None
+
+
+@pytest.mark.usefixtures("cluster_has_leader_ok_standby_leader")
 def test_cluster_has_leader_ok_standby_leader(
     runner: CliRunner, patroni_api: PatroniAPI
 ) -> None:
-    with patroni_api.routes({"cluster": "cluster_has_leader_ok_standby_leader.json"}):
-        result = runner.invoke(main, ["-e", patroni_api.endpoint, "cluster_has_leader"])
+    result = runner.invoke(main, ["-e", patroni_api.endpoint, "cluster_has_leader"])
+    assert (
+        result.stdout
+        == "CLUSTERHASLEADER OK - The cluster has a running leader. | has_leader=1;;@0 is_leader=0 is_standby_leader=1 is_standby_leader_in_arc_rec=0;@1:1\n"
+    )
     assert result.exit_code == 0
-    assert (
-        result.stdout
-        == "CLUSTERHASLEADER OK - The cluster has a running leader. | has_leader=1;;@0\n"
-    )
 
 
+@pytest.fixture
+def cluster_has_leader_ko(
+    patroni_api: PatroniAPI, old_replica_state: bool, datadir: Path, tmp_path: Path
+) -> Iterator[None]:
+    cluster_path: Union[str, Path] = "cluster_has_leader_ko.json"
+    patroni_path = "cluster_has_replica_patroni_verion_3.1.0.json"
+    if old_replica_state:
+        cluster_path = cluster_api_set_replica_running(datadir / cluster_path, tmp_path)
+        patroni_path = "cluster_has_replica_patroni_verion_3.0.0.json"
+    with patroni_api.routes({"cluster": cluster_path, "patroni": patroni_path}):
+        yield None
+
+
+@pytest.mark.usefixtures("cluster_has_leader_ko")
 def test_cluster_has_leader_ko(runner: CliRunner, patroni_api: PatroniAPI) -> None:
-    with patroni_api.routes({"cluster": "cluster_has_leader_ko.json"}):
-        result = runner.invoke(main, ["-e", patroni_api.endpoint, "cluster_has_leader"])
-    assert result.exit_code == 2
+    result = runner.invoke(main, ["-e", patroni_api.endpoint, "cluster_has_leader"])
     assert (
         result.stdout
-        == "CLUSTERHASLEADER CRITICAL - The cluster has no running leader. | has_leader=0;;@0\n"
+        == "CLUSTERHASLEADER CRITICAL - The cluster has no running leader or the standby leader is in archive recovery. | has_leader=0;;@0 is_leader=0 is_standby_leader=0 is_standby_leader_in_arc_rec=0;@1:1\n"
     )
+    assert result.exit_code == 2
+
+
+@pytest.fixture
+def cluster_has_leader_ko_standby_leader(
+    patroni_api: PatroniAPI, old_replica_state: bool, datadir: Path, tmp_path: Path
+) -> Iterator[None]:
+    cluster_path: Union[str, Path] = "cluster_has_leader_ko_standby_leader.json"
+    patroni_path = "cluster_has_replica_patroni_verion_3.1.0.json"
+    if old_replica_state:
+        cluster_path = cluster_api_set_replica_running(datadir / cluster_path, tmp_path)
+        patroni_path = "cluster_has_replica_patroni_verion_3.0.0.json"
+    with patroni_api.routes({"cluster": cluster_path, "patroni": patroni_path}):
+        yield None
+
+
+@pytest.mark.usefixtures("cluster_has_leader_ko_standby_leader")
+def test_cluster_has_leader_ko_standby_leader(
+    runner: CliRunner, patroni_api: PatroniAPI
+) -> None:
+    result = runner.invoke(main, ["-e", patroni_api.endpoint, "cluster_has_leader"])
+    assert (
+        result.stdout
+        == "CLUSTERHASLEADER CRITICAL - The cluster has no running leader or the standby leader is in archive recovery. | has_leader=0;;@0 is_leader=0 is_standby_leader=0 is_standby_leader_in_arc_rec=0;@1:1\n"
+    )
+    assert result.exit_code == 2
+
+
+@pytest.fixture
+def cluster_has_leader_ko_standby_leader_archiving(
+    patroni_api: PatroniAPI, old_replica_state: bool, datadir: Path, tmp_path: Path
+) -> Iterator[None]:
+    cluster_path: Union[
+        str, Path
+    ] = "cluster_has_leader_ko_standby_leader_archiving.json"
+    patroni_path = "cluster_has_replica_patroni_verion_3.1.0.json"
+    if old_replica_state:
+        cluster_path = cluster_api_set_replica_running(datadir / cluster_path, tmp_path)
+        patroni_path = "cluster_has_replica_patroni_verion_3.0.0.json"
+    with patroni_api.routes({"cluster": cluster_path, "patroni": patroni_path}):
+        yield None
+
+
+@pytest.mark.usefixtures("cluster_has_leader_ko_standby_leader_archiving")
+def test_cluster_has_leader_ko_standby_leader_archiving(
+    runner: CliRunner, patroni_api: PatroniAPI
+) -> None:
+    result = runner.invoke(main, ["-e", patroni_api.endpoint, "cluster_has_leader"])
+    assert (
+        result.stdout
+        == "CLUSTERHASLEADER WARNING - The cluster has no running leader or the standby leader is in archive recovery. | has_leader=1;;@0 is_leader=0 is_standby_leader=1 is_standby_leader_in_arc_rec=1;@1:1\n"
+    )
+    assert result.exit_code == 1
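
A note on the version gate, for reviewers: has_detailed_states() comes from the
earlier #50 changes and is not defined in this patch. Below is a minimal sketch
of the idea, assuming the "patroni" REST endpoint reports the version as
{"patroni": {"version": "3.1.0"}} (as the test fixtures suggest); the function
names and parsing are illustrative, not the exact implementation:

    # Illustrative sketch only, not part of this patch.
    from typing import Any, Dict, Tuple

    def version_tuple(version: str) -> Tuple[int, ...]:
        # "3.0.4" -> (3, 0, 4); stop at the first non-numeric chunk so
        # pre-release strings such as "3.1.0.dev1" still parse.
        parts = []
        for chunk in version.split(".")[:3]:
            if not chunk.isdigit():
                break
            parts.append(int(chunk))
        return tuple(parts)

    def has_detailed_states(patroni_info: Dict[str, Any]) -> bool:
        # Patroni >= 3.0.4 reports the detailed standby leader states
        # ("streaming", "in archive recovery") instead of plain "running".
        return version_tuple(patroni_info["patroni"]["version"]) >= (3, 0, 4)

    # Example: has_detailed_states({"patroni": {"version": "3.1.0"}}) is True,
    # while has_detailed_states({"patroni": {"version": "3.0.3"}}) is False.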