From 78ef0f6ada96a45757a10427972caf30efd6caaa Mon Sep 17 00:00:00 2001 From: benoit Date: Tue, 14 Nov 2023 12:17:36 +0100 Subject: [PATCH] Fix cluster_node_count's management of replication states The service now supports the `streaming` state. Since we don't check for lag or timeline in this service, a healthy node is : * leader : in a running state * standby_leader : running (pre Patroni 3.0.4), streaming otherwise * standby & sync_standby : running (pre Patroni 3.0.4), streaming otherwise Updated the tests for this service. --- CHANGELOG.md | 6 ++ README.md | 29 ++++-- check_patroni/cli.py | 37 +++++--- check_patroni/cluster.py | 39 ++++++-- tests/__init__.py | 2 +- ...ter_node_count_ko_in_archive_recovery.json | 33 +++++++ tests/test_cluster_node_count.py | 95 +++++++++++++++---- 7 files changed, 191 insertions(+), 50 deletions(-) create mode 100644 tests/json/cluster_node_count_ko_in_archive_recovery.json diff --git a/CHANGELOG.md b/CHANGELOG.md index 37f2ea7..033ebb1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,11 @@ ## Unreleased +### Changed + +* In `cluster_node_count`, a healthy standby, sync replica or standby leader cannot be "in + archive recovery" because this service doesn't check for lag and timelines. + ### Added * Add the timeline in the `cluster_has_replica` perfstats. (#50) @@ -15,6 +20,7 @@ * Fix what `cluster_has_replica` deems a healthy replica. (#50, reported by @mbanck) * Fix `cluster_has_replica` to display perfstats for replicas whenever it's possible (healthy or not). (#50) * Fix `cluster_has_leader` to correctly check for standby leaders. (#58, reported by @mbanck) +* Fix `cluster_node_count` to correctly manage replication states. (#50, reported by @mbanck) ### Misc diff --git a/README.md b/README.md index 37f3869..52a195f 100644 --- a/README.md +++ b/README.md @@ -299,26 +299,37 @@ Usage: check_patroni cluster_node_count [OPTIONS] Count the number of nodes in the cluster.
+ The role refers to the role of the server in the cluster. Possible values + are: + * master or leader + * replica + * standby_leader + * sync_standby + * demoted + * promoted + * uninitialized + The state refers to the state of PostgreSQL. Possible values are: * initializing new cluster, initdb failed * running custom bootstrap script, custom bootstrap failed * starting, start failed * restarting, restart failed - * running, streaming (for a replica V3.0.4) + * running, streaming, in archive recovery * stopping, stopped, stop failed * creating replica * crashed - The role refers to the role of the server in the cluster. Possible values - are: - * master or leader (V3.0.0+) - * replica - * demoted - * promoted - * uninitialized + The "healthy" check only ensures that: + * a leader has the running state + * a standby_leader has the running or streaming (V3.0.4) state + * a replica or sync-standby has the running or streaming (V3.0.4) state + + Since we don't check the lag or timeline, "in archive recovery" is not + considered a valid state for this service. See cluster_has_leader and + cluster_has_replica for specialized checks. Check: - * Compares the number of nodes against the normal and healthy (running + streaming) nodes warning and critical thresholds. + * Compares the number of nodes against the normal and healthy nodes warning and critical thresholds. * `OK`: If they are not provided. Perfdata: diff --git a/check_patroni/cli.py b/check_patroni/cli.py index fafcb02..7b8f109 100644 --- a/check_patroni/cli.py +++ b/check_patroni/cli.py @@ -226,29 +226,40 @@ def cluster_node_count( ) -> None: """Count the number of nodes in the cluster. - \b - The state refers to the state of PostgreSQL.
Possible values are: - * initializing new cluster, initdb failed - * running custom bootstrap script, custom bootstrap failed - * starting, start failed - * restarting, restart failed - * running, streaming (for a replica V3.0.4) - * stopping, stopped, stop failed - * creating replica - * crashed - \b The role refers to the role of the server in the cluster. Possible values are: - * master or leader (V3.0.0+) + * master or leader * replica + * standby_leader + * sync_standby * demoted * promoted * uninitialized + \b + The state refers to the state of PostgreSQL. Possible values are: + * initializing new cluster, initdb failed + * running custom bootstrap script, custom bootstrap failed + * starting, start failed + * restarting, restart failed + * running, streaming, in archive recovery + * stopping, stopped, stop failed + * creating replica + * crashed + + \b + The "healthy" check only ensures that: + * a leader has the running state + * a standby_leader has the running or streaming (V3.0.4) state + * a replica or sync-standby has the running or streaming (V3.0.4) state + + Since we don't check the lag or timeline, "in archive recovery" is not considered a valid state + for this service. See cluster_has_leader and cluster_has_replica for specialized checks. + \b Check: - * Compares the number of nodes against the normal and healthy (running + streaming) nodes warning and critical thresholds. + * Compares the number of nodes against the normal and healthy nodes warning and critical thresholds. * `OK`: If they are not provided.
\b diff --git a/check_patroni/cluster.py b/check_patroni/cluster.py index ff75f52..4598300 100644 --- a/check_patroni/cluster.py +++ b/check_patroni/cluster.py @@ -15,24 +15,51 @@ def replace_chars(text: str) -> str: class ClusterNodeCount(PatroniResource): def probe(self) -> Iterable[nagiosplugin.Metric]: + def debug_member(member: Any, health: str) -> None: + _log.debug( + "Node %(node_name)s is %(health)s: role %(role)s state %(state)s.", + { + "node_name": member["name"], + "health": health, + "role": member["role"], + "state": member["state"], + }, + ) + + # get the cluster info item_dict = self.rest_api("cluster") + role_counters: Counter[str] = Counter() roles = [] status_counters: Counter[str] = Counter() statuses = [] + healthy_member = 0 for member in item_dict["members"]: - roles.append(replace_chars(member["role"])) - statuses.append(replace_chars(member["state"])) + state, role = member["state"], member["role"] + roles.append(replace_chars(role)) + statuses.append(replace_chars(state)) + + if role == "leader" and state == "running": + healthy_member += 1 + debug_member(member, "healthy") + continue + + if role in ["standby_leader", "replica", "sync_standby"] and ( + (self.has_detailed_states() and state == "streaming") + or (not self.has_detailed_states() and state == "running") + ): + healthy_member += 1 + debug_member(member, "healthy") + continue + + debug_member(member, "unhealthy") role_counters.update(roles) status_counters.update(statuses) # The actual check: members, healthy_members yield nagiosplugin.Metric("members", len(item_dict["members"])) - yield nagiosplugin.Metric( - "healthy_members", - status_counters["running"] + status_counters.get("streaming", 0), - ) + yield nagiosplugin.Metric("healthy_members", healthy_member) # The performance data : role for role in role_counters: diff --git a/tests/__init__.py b/tests/__init__.py index aaecf11..3b82360 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -55,7 +55,7 @@ def 
cluster_api_set_replica_running(in_json: Path, target_dir: Path) -> Path: with in_json.open() as f: js = json.load(f) for node in js["members"]: - if node["role"] in ["replica", "sync_standby"]: + if node["role"] in ["replica", "sync_standby", "standby_leader"]: if node["state"] in ["streaming", "in archive recovery"]: node["state"] = "running" assert target_dir.is_dir() diff --git a/tests/json/cluster_node_count_ko_in_archive_recovery.json b/tests/json/cluster_node_count_ko_in_archive_recovery.json new file mode 100644 index 0000000..14600a5 --- /dev/null +++ b/tests/json/cluster_node_count_ko_in_archive_recovery.json @@ -0,0 +1,33 @@ +{ + "members": [ + { + "name": "srv1", + "role": "standby_leader", + "state": "in archive recovery", + "api_url": "https://10.20.199.3:8008/patroni", + "host": "10.20.199.3", + "port": 5432, + "timeline": 51 + }, + { + "name": "srv2", + "role": "replica", + "state": "in archive recovery", + "api_url": "https://10.20.199.4:8008/patroni", + "host": "10.20.199.4", + "port": 5432, + "timeline": 51, + "lag": 0 + }, + { + "name": "srv3", + "role": "replica", + "state": "streaming", + "api_url": "https://10.20.199.5:8008/patroni", + "host": "10.20.199.5", + "port": 5432, + "timeline": 51, + "lag": 0 + } + ] +} diff --git a/tests/test_cluster_node_count.py b/tests/test_cluster_node_count.py index e5a8991..519f205 100644 --- a/tests/test_cluster_node_count.py +++ b/tests/test_cluster_node_count.py @@ -13,10 +13,12 @@ from . 
import PatroniAPI, cluster_api_set_replica_running def cluster_node_count_ok( patroni_api: PatroniAPI, old_replica_state: bool, datadir: Path, tmp_path: Path ) -> Iterator[None]: - path: Union[str, Path] = "cluster_node_count_ok.json" + cluster_path: Union[str, Path] = "cluster_node_count_ok.json" + patroni_path = "cluster_has_replica_patroni_verion_3.1.0.json" if old_replica_state: - path = cluster_api_set_replica_running(datadir / path, tmp_path) - with patroni_api.routes({"cluster": path}): + cluster_path = cluster_api_set_replica_running(datadir / cluster_path, tmp_path) + patroni_path = "cluster_has_replica_patroni_verion_3.0.0.json" + with patroni_api.routes({"cluster": cluster_path, "patroni": patroni_path}): yield None @@ -25,7 +27,6 @@ def test_cluster_node_count_ok( runner: CliRunner, patroni_api: PatroniAPI, old_replica_state: bool ) -> None: result = runner.invoke(main, ["-e", patroni_api.endpoint, "cluster_node_count"]) - assert result.exit_code == 0 if old_replica_state: assert ( result.output @@ -36,6 +37,7 @@ def test_cluster_node_count_ok( result.output == "CLUSTERNODECOUNT OK - members is 3 | healthy_members=3 members=3 role_leader=1 role_replica=2 state_running=1 state_streaming=2\n" ) + assert result.exit_code == 0 @pytest.mark.usefixtures("cluster_node_count_ok") @@ -58,7 +60,6 @@ def test_cluster_node_count_ok_with_thresholds( "@0:1", ], ) - assert result.exit_code == 0 if old_replica_state: assert ( result.output @@ -69,16 +70,19 @@ def test_cluster_node_count_ok_with_thresholds( result.output == "CLUSTERNODECOUNT OK - members is 3 | healthy_members=3;@2;@1 members=3;@1;@2 role_leader=1 role_replica=2 state_running=1 state_streaming=2\n" ) + assert result.exit_code == 0 @pytest.fixture def cluster_node_count_healthy_warning( patroni_api: PatroniAPI, old_replica_state: bool, datadir: Path, tmp_path: Path ) -> Iterator[None]: - path: Union[str, Path] = "cluster_node_count_healthy_warning.json" + cluster_path: Union[str, Path] = 
"cluster_node_count_healthy_warning.json" + patroni_path = "cluster_has_replica_patroni_verion_3.1.0.json" if old_replica_state: - path = cluster_api_set_replica_running(datadir / path, tmp_path) - with patroni_api.routes({"cluster": path}): + cluster_path = cluster_api_set_replica_running(datadir / cluster_path, tmp_path) + patroni_path = "cluster_has_replica_patroni_verion_3.0.0.json" + with patroni_api.routes({"cluster": cluster_path, "patroni": patroni_path}): yield None @@ -98,7 +102,6 @@ def test_cluster_node_count_healthy_warning( "@0:1", ], ) - assert result.exit_code == 1 if old_replica_state: assert ( result.output @@ -109,16 +112,19 @@ def test_cluster_node_count_healthy_warning( result.output == "CLUSTERNODECOUNT WARNING - healthy_members is 2 (outside range @0:2) | healthy_members=2;@2;@1 members=2 role_leader=1 role_replica=1 state_running=1 state_streaming=1\n" ) + assert result.exit_code == 1 @pytest.fixture def cluster_node_count_healthy_critical( patroni_api: PatroniAPI, old_replica_state: bool, datadir: Path, tmp_path: Path ) -> Iterator[None]: - path: Union[str, Path] = "cluster_node_count_healthy_critical.json" + cluster_path: Union[str, Path] = "cluster_node_count_healthy_critical.json" + patroni_path = "cluster_has_replica_patroni_verion_3.1.0.json" if old_replica_state: - path = cluster_api_set_replica_running(datadir / path, tmp_path) - with patroni_api.routes({"cluster": path}): + cluster_path = cluster_api_set_replica_running(datadir / cluster_path, tmp_path) + patroni_path = "cluster_has_replica_patroni_verion_3.0.0.json" + with patroni_api.routes({"cluster": cluster_path, "patroni": patroni_path}): yield None @@ -138,21 +144,23 @@ def test_cluster_node_count_healthy_critical( "@0:1", ], ) - assert result.exit_code == 2 assert ( result.output == "CLUSTERNODECOUNT CRITICAL - healthy_members is 1 (outside range @0:1) | healthy_members=1;@2;@1 members=3 role_leader=1 role_replica=2 state_running=1 state_start_failed=2\n" ) + assert 
result.exit_code == 2 @pytest.fixture def cluster_node_count_warning( patroni_api: PatroniAPI, old_replica_state: bool, datadir: Path, tmp_path: Path ) -> Iterator[None]: - path: Union[str, Path] = "cluster_node_count_warning.json" + cluster_path: Union[str, Path] = "cluster_node_count_warning.json" + patroni_path = "cluster_has_replica_patroni_verion_3.1.0.json" if old_replica_state: - path = cluster_api_set_replica_running(datadir / path, tmp_path) - with patroni_api.routes({"cluster": path}): + cluster_path = cluster_api_set_replica_running(datadir / cluster_path, tmp_path) + patroni_path = "cluster_has_replica_patroni_verion_3.0.0.json" + with patroni_api.routes({"cluster": cluster_path, "patroni": patroni_path}): yield None @@ -172,7 +180,6 @@ def test_cluster_node_count_warning( "@0:1", ], ) - assert result.exit_code == 1 if old_replica_state: assert ( result.stdout @@ -183,16 +190,19 @@ def test_cluster_node_count_warning( result.stdout == "CLUSTERNODECOUNT WARNING - members is 2 (outside range @0:2) | healthy_members=2 members=2;@2;@1 role_leader=1 role_replica=1 state_running=1 state_streaming=1\n" ) + assert result.exit_code == 1 @pytest.fixture def cluster_node_count_critical( patroni_api: PatroniAPI, old_replica_state: bool, datadir: Path, tmp_path: Path ) -> Iterator[None]: - path: Union[str, Path] = "cluster_node_count_critical.json" + cluster_path: Union[str, Path] = "cluster_node_count_critical.json" + patroni_path = "cluster_has_replica_patroni_verion_3.1.0.json" if old_replica_state: - path = cluster_api_set_replica_running(datadir / path, tmp_path) - with patroni_api.routes({"cluster": path}): + cluster_path = cluster_api_set_replica_running(datadir / cluster_path, tmp_path) + patroni_path = "cluster_has_replica_patroni_verion_3.0.0.json" + with patroni_api.routes({"cluster": cluster_path, "patroni": patroni_path}): yield None @@ -212,8 +222,51 @@ def test_cluster_node_count_critical( "@0:1", ], ) - assert result.exit_code == 2 assert ( 
result.stdout == "CLUSTERNODECOUNT CRITICAL - members is 1 (outside range @0:1) | healthy_members=1 members=1;@2;@1 role_leader=1 state_running=1\n" ) + assert result.exit_code == 2 + + +@pytest.fixture +def cluster_node_count_ko_in_archive_recovery( + patroni_api: PatroniAPI, old_replica_state: bool, datadir: Path, tmp_path: Path +) -> Iterator[None]: + cluster_path: Union[str, Path] = "cluster_node_count_ko_in_archive_recovery.json" + patroni_path = "cluster_has_replica_patroni_verion_3.1.0.json" + if old_replica_state: + cluster_path = cluster_api_set_replica_running(datadir / cluster_path, tmp_path) + patroni_path = "cluster_has_replica_patroni_verion_3.0.0.json" + with patroni_api.routes({"cluster": cluster_path, "patroni": patroni_path}): + yield None + + +@pytest.mark.usefixtures("cluster_node_count_ko_in_archive_recovery") +def test_cluster_node_count_ko_in_archive_recovery( + runner: CliRunner, patroni_api: PatroniAPI, old_replica_state: bool +) -> None: + result = runner.invoke( + main, + [ + "-e", + patroni_api.endpoint, + "cluster_node_count", + "--healthy-warning", + "@2", + "--healthy-critical", + "@0:1", + ], + ) + if old_replica_state: + assert ( + result.stdout + == "CLUSTERNODECOUNT OK - members is 3 | healthy_members=3;@2;@1 members=3 role_replica=2 role_standby_leader=1 state_running=3\n" + ) + assert result.exit_code == 0 + else: + assert ( + result.stdout + == "CLUSTERNODECOUNT CRITICAL - healthy_members is 1 (outside range @0:1) | healthy_members=1;@2;@1 members=3 role_replica=2 role_standby_leader=1 state_in_archive_recovery=2 state_streaming=1\n" + ) + assert result.exit_code == 2