From 8d6b8502b63817929be27dc15b0de79211aef403 Mon Sep 17 00:00:00 2001
From: benoit
Date: Wed, 27 Sep 2023 16:37:40 +0200
Subject: [PATCH] cluster_has_replica: fix the way a healthy replica is detected

For patroni >= version 3.0.4:
* the role is `replica` or `sync_standby`
* the state is `streaming` or `in archive recovery`
* the timeline is the same as the leader's
* the lag is lower than or equal to `max_lag`

For prior versions of patroni:
* the role is `replica` or `sync_standby`
* the state is `running`
* the timeline is the same as the leader's
* the lag is lower than or equal to `max_lag`

Additionally, we now display the timeline in the perfstats. We also try to
display the perf stats of unhealthy replicas as much as possible.

Update tests for cluster_has_replica:
* Fix the tests to make them work with the new algorithm
* Add a specific test for timeline divergences
---
 CHANGELOG.md                                  |   4 +
 README.md                                     |  28 +++-
 check_patroni/cli.py                          |  30 +++-
 check_patroni/cluster.py                      |  91 ++++++++++--
 check_patroni/types.py                        |  24 ++-
 tests/__init__.py                             |   5 +-
 .../cluster_has_replica_ko_all_replica.json   |  35 +++++
 .../json/cluster_has_replica_ko_wrong_tl.json |  33 +++++
 tests/json/cluster_has_replica_ok.json        |   2 +-
 ...ster_has_replica_patroni_verion_3.0.0.json |  26 ++++
 ...ster_has_replica_patroni_verion_3.1.0.json |  26 ++++
 tests/test_cluster_has_replica.py             | 138 ++++++++++++++----
 12 files changed, 386 insertions(+), 56 deletions(-)
 create mode 100644 tests/json/cluster_has_replica_ko_all_replica.json
 create mode 100644 tests/json/cluster_has_replica_ko_wrong_tl.json
 create mode 100644 tests/json/cluster_has_replica_patroni_verion_3.0.0.json
 create mode 100644 tests/json/cluster_has_replica_patroni_verion_3.1.0.json

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 9370ab3..def5df9 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -4,10 +4,14 @@
 
 ### Added
 
+* Add the timeline in the `cluster_has_replica` perfstats. (#50)
+
 ### Fixed
 
 * Add compatibility with [requests](https://requests.readthedocs.io) version 2.25 and higher.
+* Fix what `cluster_has_replica` deems a healthy replica. (#50, reported by @mbanck)
+* Fix `cluster_has_replica` to display perfstats for replicas whenever it's possible (healthy or not). (#50)
 
 ### Misc
 
diff --git a/README.md b/README.md
index 5dbbb24..5fab46e 100644
--- a/README.md
+++ b/README.md
@@ -190,10 +190,27 @@ Usage: check_patroni cluster_has_replica [OPTIONS]
 
   Check if the cluster has healthy replicas and/or if some are sync standbies
 
+  For patroni (and this check):
+  * a replica is `streaming` if the `pg_stat_wal_receiver` says so.
+  * a replica is `in archive recovery` if it's not `streaming` and has a `restore_command`.
+
   A healthy replica:
-  * is in running or streaming state (V3.0.4)
-  * has a replica or sync_standby role
-  * has a lag lower or equal to max_lag
+  * has a `replica` or `sync_standby` role
+  * has the same timeline as the leader and
+  * is in `running` state (patroni < V3.0.4)
+  * is in `streaming` or `in archive recovery` state (patroni >= V3.0.4)
+  * has a lag lower than or equal to `max_lag`
+
+  Please note that a replica `in archive recovery` could be stuck because the
+  WAL files are not available or applicable (the server's timeline has diverged
+  from the leader's). We already detect the latter but will miss the former.
+  Therefore, it's preferable to check the lag in addition to the healthy state
+  if you rely on log shipping to help lagging standbies catch up.
+
+  Since we require a healthy replica to have the same timeline as the leader,
+  it's possible that we raise alerts when the cluster is performing a
+  switchover or failover and the standbies are in the process of catching up
+  with the new leader. The alert shouldn't last long.
 
   Check:
   * `OK`: if the healthy_replica count and their lag are compatible with the replica count threshold.
@@ -203,8 +220,9 @@ Usage: check_patroni cluster_has_replica [OPTIONS]
   Perfdata:
   * healthy_replica & unhealthy_replica count
   * the number of sync_replica, they are included in the previous count
-  * the lag of each replica labelled with "member name"_lag
-  * a boolean to tell if the node is a sync stanbdy labelled with "member name"_sync
+  * the lag of each replica labelled with "member name"_lag
+  * the timeline of each replica labelled with "member name"_timeline
+  * a boolean to tell if the node is a sync standby labelled with "member name"_sync
 
 Options:
   -w, --warning TEXT    Warning threshold for the number of healthy replica
 
diff --git a/check_patroni/cli.py b/check_patroni/cli.py
index 5344b3a..d69569f 100644
--- a/check_patroni/cli.py
+++ b/check_patroni/cli.py
@@ -341,11 +341,29 @@ def cluster_has_replica(
 ) -> None:
     """Check if the cluster has healthy replicas and/or if some are sync standbies
 
+    \b
+    For patroni (and this check):
+    * a replica is `streaming` if the `pg_stat_wal_receiver` says so.
+    * a replica is `in archive recovery` if it's not `streaming` and has a `restore_command`.
+
     \b
     A healthy replica:
-    * is in running or streaming state (V3.0.4)
-    * has a replica or sync_standby role
-    * has a lag lower or equal to max_lag
+    * has a `replica` or `sync_standby` role
+    * has the same timeline as the leader and
+    * is in `running` state (patroni < V3.0.4)
+    * is in `streaming` or `in archive recovery` state (patroni >= V3.0.4)
+    * has a lag lower than or equal to `max_lag`
+
+    Please note that a replica `in archive recovery` could be stuck because the
+    WAL files are not available or applicable (the server's timeline has diverged
+    from the leader's). We already detect the latter but will miss the former.
+    Therefore, it's preferable to check the lag in addition to the healthy
+    state if you rely on log shipping to help lagging standbies catch up.
+
+    Since we require a healthy replica to have the same timeline as the leader,
+    it's possible that we raise alerts when the cluster is performing a
+    switchover or failover and the standbies are in the process of catching up
+    with the new leader. The alert shouldn't last long.
 
\b Check: @@ -357,8 +375,9 @@ def cluster_has_replica( Perfdata: * healthy_replica & unhealthy_replica count * the number of sync_replica, they are included in the previous count - * the lag of each replica labelled with "member name"_lag - * a boolean to tell if the node is a sync stanbdy labelled with "member name"_sync + * the lag of each replica labelled with "member name"_lag + * the timeline of each replica labelled with "member name"_timeline + * a boolean to tell if the node is a sync stanbdy labelled with "member name"_sync """ tmax_lag = size_to_byte(max_lag) if max_lag is not None else None @@ -377,6 +396,7 @@ def cluster_has_replica( ), nagiosplugin.ScalarContext("unhealthy_replica"), nagiosplugin.ScalarContext("replica_lag"), + nagiosplugin.ScalarContext("replica_timeline"), nagiosplugin.ScalarContext("replica_sync"), ) check.main(verbose=ctx.obj.verbose, timeout=ctx.obj.timeout) diff --git a/check_patroni/cluster.py b/check_patroni/cluster.py index 5a242d4..a7891b8 100644 --- a/check_patroni/cluster.py +++ b/check_patroni/cluster.py @@ -1,7 +1,7 @@ import hashlib import json from collections import Counter -from typing import Iterable, Union +from typing import Any, Iterable, Union import nagiosplugin @@ -83,35 +83,91 @@ class ClusterHasReplica(PatroniResource): self.max_lag = max_lag def probe(self) -> Iterable[nagiosplugin.Metric]: - item_dict = self.rest_api("cluster") + def debug_member(member: Any, health: str) -> None: + _log.debug( + "Node %(node_name)s is %(health)s: lag %(lag)s, state %(state)s, tl %(tl)s.", + { + "node_name": member["name"], + "health": health, + "lag": member["lag"], + "state": member["state"], + "tl": member["timeline"], + }, + ) + + # get the cluster info + cluster_item_dict = self.rest_api("cluster") replicas = [] healthy_replica = 0 unhealthy_replica = 0 sync_replica = 0 - for member in item_dict["members"]: - # FIXME are there other acceptable states + leader_tl = None + + # Look for replicas + for member in cluster_item_dict["members"]: if member["role"] in ["replica", "sync_standby"]: - # patroni 3.0.4 changed the standby state from running to streaming - if ( - member["state"] in ["running", "streaming"] - and member["lag"] != "unknown" - ): + if member["lag"] == "unknown": + # This could happen if the node is stopped + # nagiosplugin doesn't handle strings in perfstats + # so we have to ditch all the stats in that case + debug_member(member, "unhealthy") + unhealthy_replica += 1 + continue + else: replicas.append( { "name": member["name"], "lag": member["lag"], + "timeline": member["timeline"], "sync": 1 if member["role"] == "sync_standby" else 0, } ) - if member["role"] == "sync_standby": - sync_replica += 1 + # Get the leader tl if we haven't already + if leader_tl is None: + # If there are no leaders, we will loop here for all + # members because leader_tl will remain None. it's not + # a big deal since having no leader is rare. 
+ for tmember in cluster_item_dict["members"]: + if tmember["role"] == "leader": + leader_tl = int(tmember["timeline"]) + break - if self.max_lag is None or self.max_lag >= int(member["lag"]): - healthy_replica += 1 - continue - unhealthy_replica += 1 + _log.debug( + "Patroni's leader_timeline is %(leader_tl)s", + { + "leader_tl": leader_tl, + }, + ) + + # Test for an unhealthy replica + if ( + self.has_detailed_states() + and not ( + member["state"] in ["streaming", "in archive recovery"] + and int(member["timeline"]) == leader_tl + ) + ) or ( + not self.has_detailed_states() + and not ( + member["state"] == "running" + and int(member["timeline"]) == leader_tl + ) + ): + debug_member(member, "unhealthy") + unhealthy_replica += 1 + continue + + if member["role"] == "sync_standby": + sync_replica += 1 + + if self.max_lag is None or self.max_lag >= int(member["lag"]): + debug_member(member, "healthy") + healthy_replica += 1 + else: + debug_member(member, "unhealthy") + unhealthy_replica += 1 # The actual check yield nagiosplugin.Metric("healthy_replica", healthy_replica) @@ -123,6 +179,11 @@ class ClusterHasReplica(PatroniResource): yield nagiosplugin.Metric( f"{replica['name']}_lag", replica["lag"], context="replica_lag" ) + yield nagiosplugin.Metric( + f"{replica['name']}_timeline", + replica["timeline"], + context="replica_timeline", + ) yield nagiosplugin.Metric( f"{replica['name']}_sync", replica["sync"], context="replica_sync" ) diff --git a/check_patroni/types.py b/check_patroni/types.py index 3032547..5f08dd4 100644 --- a/check_patroni/types.py +++ b/check_patroni/types.py @@ -1,4 +1,5 @@ import json +from functools import lru_cache from typing import Any, Callable, List, Optional, Tuple, Union from urllib.parse import urlparse @@ -29,7 +30,7 @@ class Parameters: verbose: int -@attr.s(auto_attribs=True, slots=True) +@attr.s(auto_attribs=True, eq=False, slots=True) class PatroniResource(nagiosplugin.Resource): conn_info: ConnectionInfo @@ -76,6 +77,27 @@ class PatroniResource(nagiosplugin.Resource): return None raise nagiosplugin.CheckError("Connection failed for all provided endpoints") + @lru_cache(maxsize=None) + def has_detailed_states(self) -> bool: + # get patroni's version to find out if the "streaming" and "in archive recovery" states are available + patroni_item_dict = self.rest_api("patroni") + + if tuple( + int(v) for v in patroni_item_dict["patroni"]["version"].split(".", 2) + ) >= (3, 0, 4): + _log.debug( + "Patroni's version is %(version)s, more detailed states can be used to check for the health of replicas.", + {"version": patroni_item_dict["patroni"]["version"]}, + ) + + return True + + _log.debug( + "Patroni's version is %(version)s, the running state and the timelines must be used to check for the health of replicas.", + {"version": patroni_item_dict["patroni"]["version"]}, + ) + return False + HandleUnknown = Callable[[nagiosplugin.Summary, nagiosplugin.Results], Any] diff --git a/tests/__init__.py b/tests/__init__.py index e683599..aaecf11 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -50,12 +50,13 @@ class PatroniAPI(HTTPServer): def cluster_api_set_replica_running(in_json: Path, target_dir: Path) -> Path: - # starting from 3.0.4 the state of replicas is streaming instead of running + # starting from 3.0.4 the state of replicas is streaming or in archive recovery + # instead of running with in_json.open() as f: js = json.load(f) for node in js["members"]: if node["role"] in ["replica", "sync_standby"]: - if node["state"] == "streaming": + if 
node["state"] in ["streaming", "in archive recovery"]: node["state"] = "running" assert target_dir.is_dir() out_json = target_dir / in_json.name diff --git a/tests/json/cluster_has_replica_ko_all_replica.json b/tests/json/cluster_has_replica_ko_all_replica.json new file mode 100644 index 0000000..fe82d32 --- /dev/null +++ b/tests/json/cluster_has_replica_ko_all_replica.json @@ -0,0 +1,35 @@ +{ + "members": [ + { + "name": "srv1", + "role": "replica", + "state": "running", + "api_url": "https://10.20.199.3:8008/patroni", + "host": "10.20.199.3", + "port": 5432, + "timeline": 51, + "lag": 0 + }, + { + "name": "srv2", + "role": "replica", + "state": "running", + "api_url": "https://10.20.199.4:8008/patroni", + "host": "10.20.199.4", + "port": 5432, + "timeline": 51, + "lag": 0 + }, + { + "name": "srv3", + "role": "replica", + "state": "running", + "api_url": "https://10.20.199.5:8008/patroni", + "host": "10.20.199.5", + "port": 5432, + "timeline": 51, + "lag": 0 + + } + ] +} diff --git a/tests/json/cluster_has_replica_ko_wrong_tl.json b/tests/json/cluster_has_replica_ko_wrong_tl.json new file mode 100644 index 0000000..6889484 --- /dev/null +++ b/tests/json/cluster_has_replica_ko_wrong_tl.json @@ -0,0 +1,33 @@ +{ + "members": [ + { + "name": "srv1", + "role": "leader", + "state": "running", + "api_url": "https://10.20.199.3:8008/patroni", + "host": "10.20.199.3", + "port": 5432, + "timeline": 51 + }, + { + "name": "srv2", + "role": "replica", + "state": "running", + "api_url": "https://10.20.199.4:8008/patroni", + "host": "10.20.199.4", + "port": 5432, + "timeline": 50, + "lag": 1000000 + }, + { + "name": "srv3", + "role": "replica", + "state": "streaming", + "api_url": "https://10.20.199.5:8008/patroni", + "host": "10.20.199.5", + "port": 5432, + "timeline": 51, + "lag": 0 + } + ] +} diff --git a/tests/json/cluster_has_replica_ok.json b/tests/json/cluster_has_replica_ok.json index 44535e0..181ed4f 100644 --- a/tests/json/cluster_has_replica_ok.json +++ b/tests/json/cluster_has_replica_ok.json @@ -12,7 +12,7 @@ { "name": "srv2", "role": "replica", - "state": "streaming", + "state": "in archive recovery", "api_url": "https://10.20.199.4:8008/patroni", "host": "10.20.199.4", "port": 5432, diff --git a/tests/json/cluster_has_replica_patroni_verion_3.0.0.json b/tests/json/cluster_has_replica_patroni_verion_3.0.0.json new file mode 100644 index 0000000..9c922b8 --- /dev/null +++ b/tests/json/cluster_has_replica_patroni_verion_3.0.0.json @@ -0,0 +1,26 @@ +{ + "state": "running", + "postmaster_start_time": "2021-08-11 07:02:20.732 UTC", + "role": "master", + "server_version": 110012, + "cluster_unlocked": false, + "xlog": { + "location": 1174407088 + }, + "timeline": 51, + "replication": [ + { + "usename": "replicator", + "application_name": "srv1", + "client_addr": "10.20.199.3", + "state": "streaming", + "sync_state": "async", + "sync_priority": 0 + } + ], + "database_system_identifier": "6965971025273547206", + "patroni": { + "version": "3.0.0", + "scope": "patroni-demo" + } +} diff --git a/tests/json/cluster_has_replica_patroni_verion_3.1.0.json b/tests/json/cluster_has_replica_patroni_verion_3.1.0.json new file mode 100644 index 0000000..91e4348 --- /dev/null +++ b/tests/json/cluster_has_replica_patroni_verion_3.1.0.json @@ -0,0 +1,26 @@ +{ + "state": "running", + "postmaster_start_time": "2021-08-11 07:02:20.732 UTC", + "role": "master", + "server_version": 110012, + "cluster_unlocked": false, + "xlog": { + "location": 1174407088 + }, + "timeline": 51, + "replication": [ + { + "usename": 
"replicator", + "application_name": "srv1", + "client_addr": "10.20.199.3", + "state": "streaming", + "sync_state": "async", + "sync_priority": 0 + } + ], + "database_system_identifier": "6965971025273547206", + "patroni": { + "version": "3.1.0", + "scope": "patroni-demo" + } +} diff --git a/tests/test_cluster_has_replica.py b/tests/test_cluster_has_replica.py index ccbf6dd..a6a88c0 100644 --- a/tests/test_cluster_has_replica.py +++ b/tests/test_cluster_has_replica.py @@ -13,22 +13,23 @@ from . import PatroniAPI, cluster_api_set_replica_running def cluster_has_replica_ok( patroni_api: PatroniAPI, old_replica_state: bool, datadir: Path, tmp_path: Path ) -> Iterator[None]: - path: Union[str, Path] = "cluster_has_replica_ok.json" + cluster_path: Union[str, Path] = "cluster_has_replica_ok.json" + patroni_path = "cluster_has_replica_patroni_verion_3.1.0.json" if old_replica_state: - path = cluster_api_set_replica_running(datadir / path, tmp_path) - with patroni_api.routes({"cluster": path}): + cluster_path = cluster_api_set_replica_running(datadir / cluster_path, tmp_path) + patroni_path = "cluster_has_replica_patroni_verion_3.0.0.json" + with patroni_api.routes({"cluster": cluster_path, "patroni": patroni_path}): yield None -# TODO Lag threshold tests @pytest.mark.usefixtures("cluster_has_replica_ok") def test_cluster_has_relica_ok(runner: CliRunner, patroni_api: PatroniAPI) -> None: result = runner.invoke(main, ["-e", patroni_api.endpoint, "cluster_has_replica"]) - assert result.exit_code == 0 assert ( result.stdout - == "CLUSTERHASREPLICA OK - healthy_replica is 2 | healthy_replica=2 srv2_lag=0 srv2_sync=0 srv3_lag=0 srv3_sync=1 sync_replica=1 unhealthy_replica=0\n" + == "CLUSTERHASREPLICA OK - healthy_replica is 2 | healthy_replica=2 srv2_lag=0 srv2_sync=0 srv2_timeline=51 srv3_lag=0 srv3_sync=1 srv3_timeline=51 sync_replica=1 unhealthy_replica=0\n" ) + assert result.exit_code == 0 @pytest.mark.usefixtures("cluster_has_replica_ok") @@ -47,11 +48,11 @@ def test_cluster_has_replica_ok_with_count_thresholds( "@0", ], ) - assert result.exit_code == 0 assert ( result.stdout - == "CLUSTERHASREPLICA OK - healthy_replica is 2 | healthy_replica=2;@1;@0 srv2_lag=0 srv2_sync=0 srv3_lag=0 srv3_sync=1 sync_replica=1 unhealthy_replica=0\n" + == "CLUSTERHASREPLICA OK - healthy_replica is 2 | healthy_replica=2;@1;@0 srv2_lag=0 srv2_sync=0 srv2_timeline=51 srv3_lag=0 srv3_sync=1 srv3_timeline=51 sync_replica=1 unhealthy_replica=0\n" ) + assert result.exit_code == 0 @pytest.mark.usefixtures("cluster_has_replica_ok") @@ -68,21 +69,23 @@ def test_cluster_has_replica_ok_with_sync_count_thresholds( "1:", ], ) - assert result.exit_code == 0 assert ( result.stdout - == "CLUSTERHASREPLICA OK - healthy_replica is 2 | healthy_replica=2 srv2_lag=0 srv2_sync=0 srv3_lag=0 srv3_sync=1 sync_replica=1;1: unhealthy_replica=0\n" + == "CLUSTERHASREPLICA OK - healthy_replica is 2 | healthy_replica=2 srv2_lag=0 srv2_sync=0 srv2_timeline=51 srv3_lag=0 srv3_sync=1 srv3_timeline=51 sync_replica=1;1: unhealthy_replica=0\n" ) + assert result.exit_code == 0 @pytest.fixture def cluster_has_replica_ok_lag( patroni_api: PatroniAPI, datadir: Path, tmp_path: Path, old_replica_state: bool ) -> Iterator[None]: - path: Union[str, Path] = "cluster_has_replica_ok_lag.json" + cluster_path: Union[str, Path] = "cluster_has_replica_ok_lag.json" + patroni_path = "cluster_has_replica_patroni_verion_3.1.0.json" if old_replica_state: - path = cluster_api_set_replica_running(datadir / path, tmp_path) - with patroni_api.routes({"cluster": path}): + 
cluster_path = cluster_api_set_replica_running(datadir / cluster_path, tmp_path) + patroni_path = "cluster_has_replica_patroni_verion_3.0.0.json" + with patroni_api.routes({"cluster": cluster_path, "patroni": patroni_path}): yield None @@ -104,21 +107,23 @@ def test_cluster_has_replica_ok_with_count_thresholds_lag( "1MB", ], ) - assert result.exit_code == 0 assert ( result.stdout - == "CLUSTERHASREPLICA OK - healthy_replica is 2 | healthy_replica=2;@1;@0 srv2_lag=1024 srv2_sync=0 srv3_lag=0 srv3_sync=0 sync_replica=0 unhealthy_replica=0\n" + == "CLUSTERHASREPLICA OK - healthy_replica is 2 | healthy_replica=2;@1;@0 srv2_lag=1024 srv2_sync=0 srv2_timeline=51 srv3_lag=0 srv3_sync=0 srv3_timeline=51 sync_replica=0 unhealthy_replica=0\n" ) + assert result.exit_code == 0 @pytest.fixture def cluster_has_replica_ko( patroni_api: PatroniAPI, old_replica_state: bool, datadir: Path, tmp_path: Path ) -> Iterator[None]: - path: Union[str, Path] = "cluster_has_replica_ko.json" + cluster_path: Union[str, Path] = "cluster_has_replica_ko.json" + patroni_path: Union[str, Path] = "cluster_has_replica_patroni_verion_3.1.0.json" if old_replica_state: - path = cluster_api_set_replica_running(datadir / path, tmp_path) - with patroni_api.routes({"cluster": path}): + cluster_path = cluster_api_set_replica_running(datadir / cluster_path, tmp_path) + patroni_path = "cluster_has_replica_patroni_verion_3.0.0.json" + with patroni_api.routes({"cluster": cluster_path, "patroni": patroni_path}): yield None @@ -138,11 +143,11 @@ def test_cluster_has_replica_ko_with_count_thresholds( "@0", ], ) - assert result.exit_code == 1 assert ( result.stdout - == "CLUSTERHASREPLICA WARNING - healthy_replica is 1 (outside range @0:1) | healthy_replica=1;@1;@0 srv3_lag=0 srv3_sync=0 sync_replica=0 unhealthy_replica=1\n" + == "CLUSTERHASREPLICA WARNING - healthy_replica is 1 (outside range @0:1) | healthy_replica=1;@1;@0 srv3_lag=0 srv3_sync=0 srv3_timeline=51 sync_replica=0 unhealthy_replica=1\n" ) + assert result.exit_code == 1 @pytest.mark.usefixtures("cluster_has_replica_ko") @@ -161,21 +166,24 @@ def test_cluster_has_replica_ko_with_sync_count_thresholds( "1:", ], ) - assert result.exit_code == 2 + # The lag on srv2 is "unknown". 
We don't handle string in perfstats so we have to scratch all the second node stats assert ( result.stdout - == "CLUSTERHASREPLICA CRITICAL - sync_replica is 0 (outside range 1:) | healthy_replica=1 srv3_lag=0 srv3_sync=0 sync_replica=0;2:;1: unhealthy_replica=1\n" + == "CLUSTERHASREPLICA CRITICAL - sync_replica is 0 (outside range 1:) | healthy_replica=1 srv3_lag=0 srv3_sync=0 srv3_timeline=51 sync_replica=0;2:;1: unhealthy_replica=1\n" ) + assert result.exit_code == 2 @pytest.fixture def cluster_has_replica_ko_lag( patroni_api: PatroniAPI, old_replica_state: bool, datadir: Path, tmp_path: Path ) -> Iterator[None]: - path: Union[str, Path] = "cluster_has_replica_ko_lag.json" + cluster_path: Union[str, Path] = "cluster_has_replica_ko_lag.json" + patroni_path = "cluster_has_replica_patroni_verion_3.1.0.json" if old_replica_state: - path = cluster_api_set_replica_running(datadir / path, tmp_path) - with patroni_api.routes({"cluster": path}): + cluster_path = cluster_api_set_replica_running(datadir / cluster_path, tmp_path) + patroni_path = "cluster_has_replica_patroni_verion_3.0.0.json" + with patroni_api.routes({"cluster": cluster_path, "patroni": patroni_path}): yield None @@ -197,8 +205,84 @@ def test_cluster_has_replica_ko_with_count_thresholds_and_lag( "1MB", ], ) - assert result.exit_code == 2 assert ( result.stdout - == "CLUSTERHASREPLICA CRITICAL - healthy_replica is 0 (outside range @0:0) | healthy_replica=0;@1;@0 srv2_lag=10241024 srv2_sync=0 srv3_lag=20000000 srv3_sync=0 sync_replica=0 unhealthy_replica=2\n" + == "CLUSTERHASREPLICA CRITICAL - healthy_replica is 0 (outside range @0:0) | healthy_replica=0;@1;@0 srv2_lag=10241024 srv2_sync=0 srv2_timeline=51 srv3_lag=20000000 srv3_sync=0 srv3_timeline=51 sync_replica=0 unhealthy_replica=2\n" ) + assert result.exit_code == 2 + + +@pytest.fixture +def cluster_has_replica_ko_wrong_tl( + patroni_api: PatroniAPI, old_replica_state: bool, datadir: Path, tmp_path: Path +) -> Iterator[None]: + cluster_path: Union[str, Path] = "cluster_has_replica_ko_wrong_tl.json" + patroni_path = "cluster_has_replica_patroni_verion_3.1.0.json" + if old_replica_state: + cluster_path = cluster_api_set_replica_running(datadir / cluster_path, tmp_path) + patroni_path = "cluster_has_replica_patroni_verion_3.0.0.json" + with patroni_api.routes({"cluster": cluster_path, "patroni": patroni_path}): + yield None + + +@pytest.mark.usefixtures("cluster_has_replica_ko_wrong_tl") +def test_cluster_has_replica_ko_wrong_tl( + runner: CliRunner, patroni_api: PatroniAPI +) -> None: + result = runner.invoke( + main, + [ + "-e", + patroni_api.endpoint, + "cluster_has_replica", + "--warning", + "@1", + "--critical", + "@0", + "--max-lag", + "1MB", + ], + ) + assert ( + result.stdout + == "CLUSTERHASREPLICA WARNING - healthy_replica is 1 (outside range @0:1) | healthy_replica=1;@1;@0 srv2_lag=1000000 srv2_sync=0 srv2_timeline=50 srv3_lag=0 srv3_sync=0 srv3_timeline=51 sync_replica=0 unhealthy_replica=1\n" + ) + assert result.exit_code == 1 + + +@pytest.fixture +def cluster_has_replica_ko_all_replica( + patroni_api: PatroniAPI, old_replica_state: bool, datadir: Path, tmp_path: Path +) -> Iterator[None]: + cluster_path: Union[str, Path] = "cluster_has_replica_ko_all_replica.json" + patroni_path = "cluster_has_replica_patroni_verion_3.1.0.json" + if old_replica_state: + cluster_path = cluster_api_set_replica_running(datadir / cluster_path, tmp_path) + patroni_path = "cluster_has_replica_patroni_verion_3.0.0.json" + with patroni_api.routes({"cluster": cluster_path, "patroni": 
patroni_path}): + yield None + + +@pytest.mark.usefixtures("cluster_has_replica_ko_all_replica") +def test_cluster_has_replica_ko_all_replica( + runner: CliRunner, patroni_api: PatroniAPI +) -> None: + result = runner.invoke( + main, + [ + "-e", + patroni_api.endpoint, + "cluster_has_replica", + "--warning", + "@1", + "--critical", + "@0", + "--max-lag", + "1MB", + ], + ) + assert ( + result.stdout + == "CLUSTERHASREPLICA CRITICAL - healthy_replica is 0 (outside range @0:0) | healthy_replica=0;@1;@0 srv1_lag=0 srv1_sync=0 srv1_timeline=51 srv2_lag=0 srv2_sync=0 srv2_timeline=51 srv3_lag=0 srv3_sync=0 srv3_timeline=51 sync_replica=0 unhealthy_replica=3\n" + ) + assert result.exit_code == 2
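
Note for reviewers: the detection rule introduced by this patch can be summarised
with the following minimal, standalone Python sketch. It is illustrative only:
`classify_replicas`, the `members` list and the `detailed_states` flag are names
invented for this example (they assume a `members` list shaped like Patroni's
`/cluster` endpoint and a version flag derived from `/patroni`); the actual
implementation lives in `ClusterHasReplica.probe()` and
`PatroniResource.has_detailed_states()`.

# Reviewer sketch, not part of the patch.
from typing import Any, Dict, List, Optional, Tuple


def classify_replicas(
    members: List[Dict[str, Any]],
    detailed_states: bool,
    max_lag: Optional[int] = None,
) -> Tuple[int, int]:
    """Return (healthy, unhealthy) counts for the replicas of a cluster.

    `members` mimics the `members` list of Patroni's /cluster endpoint;
    `detailed_states` stands for "patroni >= 3.0.4".
    """
    healthy = unhealthy = 0

    # The leader's timeline is the reference every replica must match.
    leader_tl = next(
        (int(m["timeline"]) for m in members if m["role"] == "leader"), None
    )

    # Before 3.0.4 replicas only report "running"; afterwards the state
    # distinguishes "streaming" from "in archive recovery".
    good_states = (
        ("streaming", "in archive recovery") if detailed_states else ("running",)
    )

    for m in members:
        if m["role"] not in ("replica", "sync_standby"):
            continue
        if (
            m["lag"] == "unknown"  # stopped node, no usable stats
            or m["state"] not in good_states
            or int(m["timeline"]) != leader_tl
            or (max_lag is not None and int(m["lag"]) > max_lag)
        ):
            unhealthy += 1
        else:
            healthy += 1
    return healthy, unhealthy


# Mirrors cluster_has_replica_ko_wrong_tl.json: srv2 sits on timeline 50
# while the leader is on 51, so only srv3 counts as healthy -> (1, 1).
members = [
    {"name": "srv1", "role": "leader", "state": "running", "timeline": 51},
    {"name": "srv2", "role": "replica", "state": "running", "timeline": 50, "lag": 1000000},
    {"name": "srv3", "role": "replica", "state": "streaming", "timeline": 51, "lag": 0},
]
print(classify_replicas(members, detailed_states=True, max_lag=1024 * 1024))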