From 8d6b8502b63817929be27dc15b0de79211aef403 Mon Sep 17 00:00:00 2001
From: benoit
Date: Wed, 27 Sep 2023 16:37:40 +0200
Subject: [PATCH] cluster_has_replica: fix the way a healthy replica is detected

For patroni >= version 3.0.4:
* the role is `replica` or `sync_standby`
* the state is `streaming` or `in archive recovery`
* the timeline is the same as the leader's
* the lag is lower than or equal to `max_lag`

For prior versions of patroni:
* the role is `replica` or `sync_standby`
* the state is `running`
* the timeline is the same as the leader's
* the lag is lower than or equal to `max_lag`

Additionally, we now display the timeline in the perfstats. We also try to
display the perf stats of unhealthy replicas as much as possible.

Update tests for cluster_has_replica:
* Fix the tests to make them work with the new algorithm
* Add a specific test for timeline divergences
---
 CHANGELOG.md                                  |   4 +
 README.md                                     |  28 +++-
 check_patroni/cli.py                          |  30 +++-
 check_patroni/cluster.py                      |  91 ++++++++++--
 check_patroni/types.py                        |  24 ++-
 tests/__init__.py                             |   5 +-
 .../cluster_has_replica_ko_all_replica.json   |  35 +++++
 .../json/cluster_has_replica_ko_wrong_tl.json |  33 +++++
 tests/json/cluster_has_replica_ok.json        |   2 +-
 ...ster_has_replica_patroni_verion_3.0.0.json |  26 ++++
 ...ster_has_replica_patroni_verion_3.1.0.json |  26 ++++
 tests/test_cluster_has_replica.py             | 138 ++++++++++++++----
 12 files changed, 386 insertions(+), 56 deletions(-)
 create mode 100644 tests/json/cluster_has_replica_ko_all_replica.json
 create mode 100644 tests/json/cluster_has_replica_ko_wrong_tl.json
 create mode 100644 tests/json/cluster_has_replica_patroni_verion_3.0.0.json
 create mode 100644 tests/json/cluster_has_replica_patroni_verion_3.1.0.json

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 9370ab3..def5df9 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -4,10 +4,14 @@
 
 ### Added
 
+* Add the timeline in the `cluster_has_replica` perfstats. (#50)
+
 ### Fixed
 
 * Add compatibility with [requests](https://requests.readthedocs.io) version 2.25 and higher.
+* Fix what `cluster_has_replica` deems a healthy replica. (#50, reported by @mbanck)
+* Fix `cluster_has_replica` to display perfstats for replicas whenever it's possible (healthy or not). (#50)
 
 ### Misc
 
diff --git a/README.md b/README.md
index 5dbbb24..5fab46e 100644
--- a/README.md
+++ b/README.md
@@ -190,10 +190,27 @@ Usage: check_patroni cluster_has_replica [OPTIONS]
 
   Check if the cluster has healthy replicas and/or if some are sync standbies
 
+  For patroni (and this check):
+  * a replica is `streaming` if the `pg_stat_wal_receiver` says so.
+  * a replica is `in archive recovery` if it's not `streaming` and has a `restore_command`.
+
   A healthy replica:
-  * is in running or streaming state (V3.0.4)
-  * has a replica or sync_standby role
-  * has a lag lower or equal to max_lag
+  * has a `replica` or `sync_standby` role
+  * has the same timeline as the leader and
+  * is in `running` state (patroni < V3.0.4)
+  * is in `streaming` or `in archive recovery` state (patroni >= V3.0.4)
+  * has a lag lower than or equal to `max_lag`
+
+  Please note that a replica `in archive recovery` could be stuck because the
+  WAL files are not available or applicable (the server's timeline has diverged
+  from the leader's). We already detect the latter but will miss the former.
+  Therefore, it's preferable to check the lag in addition to the healthy state
+  if you rely on log shipping to help lagging standbies catch up.
+
+  Since we require a healthy replica to have the same timeline as the leader,
+  it's possible that we raise alerts when the cluster is performing a
+  switchover or failover and the standbies are in the process of catching up
+  with the new leader. The alert shouldn't last long.
 
   Check:
   * `OK`: if the healthy_replica count and their lag are compatible with the replica count threshold.
@@ -203,8 +220,9 @@ Usage: check_patroni cluster_has_replica [OPTIONS]
   Perfdata:
   * healthy_replica & unhealthy_replica count
   * the number of sync_replica, they are included in the previous count
-  * the lag of each replica labelled with "member name"_lag
-  * a boolean to tell if the node is a sync stanbdy labelled with "member name"_sync
+  * the lag of each replica labelled with "member name"_lag
+  * the timeline of each replica labelled with "member name"_timeline
+  * a boolean to tell if the node is a sync standby labelled with "member name"_sync
 
 Options:
   -w, --warning TEXT    Warning threshold for the number of healthy replica
 
diff --git a/check_patroni/cli.py b/check_patroni/cli.py
index 5344b3a..d69569f 100644
--- a/check_patroni/cli.py
+++ b/check_patroni/cli.py
@@ -341,11 +341,29 @@ def cluster_has_replica(
 ) -> None:
     """Check if the cluster has healthy replicas and/or if some are sync standbies
 
+    \b
+    For patroni (and this check):
+    * a replica is `streaming` if the `pg_stat_wal_receiver` says so.
+    * a replica is `in archive recovery` if it's not `streaming` and has a `restore_command`.
+
     \b
     A healthy replica:
-    * is in running or streaming state (V3.0.4)
-    * has a replica or sync_standby role
-    * has a lag lower or equal to max_lag
+    * has a `replica` or `sync_standby` role
+    * has the same timeline as the leader and
+    * is in `running` state (patroni < V3.0.4)
+    * is in `streaming` or `in archive recovery` state (patroni >= V3.0.4)
+    * has a lag lower than or equal to `max_lag`
+
+    Please note that a replica `in archive recovery` could be stuck because the
+    WAL files are not available or applicable (the server's timeline has diverged
+    from the leader's). We already detect the latter but will miss the former.
+    Therefore, it's preferable to check the lag in addition to the healthy
+    state if you rely on log shipping to help lagging standbies catch up.
+
+    Since we require a healthy replica to have the same timeline as the leader,
+    it's possible that we raise alerts when the cluster is performing a
+    switchover or failover and the standbies are in the process of catching up
+    with the new leader. The alert shouldn't last long.
 
\b Check: @@ -357,8 +375,9 @@ def cluster_has_replica( Perfdata: * healthy_replica & unhealthy_replica count * the number of sync_replica, they are included in the previous count - * the lag of each replica labelled with "member name"_lag - * a boolean to tell if the node is a sync stanbdy labelled with "member name"_sync + * the lag of each replica labelled with "member name"_lag + * the timeline of each replica labelled with "member name"_timeline + * a boolean to tell if the node is a sync stanbdy labelled with "member name"_sync """ tmax_lag = size_to_byte(max_lag) if max_lag is not None else None @@ -377,6 +396,7 @@ def cluster_has_replica( ), nagiosplugin.ScalarContext("unhealthy_replica"), nagiosplugin.ScalarContext("replica_lag"), + nagiosplugin.ScalarContext("replica_timeline"), nagiosplugin.ScalarContext("replica_sync"), ) check.main(verbose=ctx.obj.verbose, timeout=ctx.obj.timeout) diff --git a/check_patroni/cluster.py b/check_patroni/cluster.py index 5a242d4..a7891b8 100644 --- a/check_patroni/cluster.py +++ b/check_patroni/cluster.py @@ -1,7 +1,7 @@ import hashlib import json from collections import Counter -from typing import Iterable, Union +from typing import Any, Iterable, Union import nagiosplugin @@ -83,35 +83,91 @@ class ClusterHasReplica(PatroniResource): self.max_lag = max_lag def probe(self) -> Iterable[nagiosplugin.Metric]: - item_dict = self.rest_api("cluster") + def debug_member(member: Any, health: str) -> None: + _log.debug( + "Node %(node_name)s is %(health)s: lag %(lag)s, state %(state)s, tl %(tl)s.", + { + "node_name": member["name"], + "health": health, + "lag": member["lag"], + "state": member["state"], + "tl": member["timeline"], + }, + ) + + # get the cluster info + cluster_item_dict = self.rest_api("cluster") replicas = [] healthy_replica = 0 unhealthy_replica = 0 sync_replica = 0 - for member in item_dict["members"]: - # FIXME are there other acceptable states + leader_tl = None + + # Look for replicas + for member in cluster_item_dict["members"]: if member["role"] in ["replica", "sync_standby"]: - # patroni 3.0.4 changed the standby state from running to streaming - if ( - member["state"] in ["running", "streaming"] - and member["lag"] != "unknown" - ): + if member["lag"] == "unknown": + # This could happen if the node is stopped + # nagiosplugin doesn't handle strings in perfstats + # so we have to ditch all the stats in that case + debug_member(member, "unhealthy") + unhealthy_replica += 1 + continue + else: replicas.append( { "name": member["name"], "lag": member["lag"], + "timeline": member["timeline"], "sync": 1 if member["role"] == "sync_standby" else 0, } ) - if member["role"] == "sync_standby": - sync_replica += 1 + # Get the leader tl if we haven't already + if leader_tl is None: + # If there are no leaders, we will loop here for all + # members because leader_tl will remain None. it's not + # a big deal since having no leader is rare. 
+ for tmember in cluster_item_dict["members"]: + if tmember["role"] == "leader": + leader_tl = int(tmember["timeline"]) + break - if self.max_lag is None or self.max_lag >= int(member["lag"]): - healthy_replica += 1 - continue - unhealthy_replica += 1 + _log.debug( + "Patroni's leader_timeline is %(leader_tl)s", + { + "leader_tl": leader_tl, + }, + ) + + # Test for an unhealthy replica + if ( + self.has_detailed_states() + and not ( + member["state"] in ["streaming", "in archive recovery"] + and int(member["timeline"]) == leader_tl + ) + ) or ( + not self.has_detailed_states() + and not ( + member["state"] == "running" + and int(member["timeline"]) == leader_tl + ) + ): + debug_member(member, "unhealthy") + unhealthy_replica += 1 + continue + + if member["role"] == "sync_standby": + sync_replica += 1 + + if self.max_lag is None or self.max_lag >= int(member["lag"]): + debug_member(member, "healthy") + healthy_replica += 1 + else: + debug_member(member, "unhealthy") + unhealthy_replica += 1 # The actual check yield nagiosplugin.Metric("healthy_replica", healthy_replica) @@ -123,6 +179,11 @@ class ClusterHasReplica(PatroniResource): yield nagiosplugin.Metric( f"{replica['name']}_lag", replica["lag"], context="replica_lag" ) + yield nagiosplugin.Metric( + f"{replica['name']}_timeline", + replica["timeline"], + context="replica_timeline", + ) yield nagiosplugin.Metric( f"{replica['name']}_sync", replica["sync"], context="replica_sync" ) diff --git a/check_patroni/types.py b/check_patroni/types.py index 3032547..5f08dd4 100644 --- a/check_patroni/types.py +++ b/check_patroni/types.py @@ -1,4 +1,5 @@ import json +from functools import lru_cache from typing import Any, Callable, List, Optional, Tuple, Union from urllib.parse import urlparse @@ -29,7 +30,7 @@ class Parameters: verbose: int -@attr.s(auto_attribs=True, slots=True) +@attr.s(auto_attribs=True, eq=False, slots=True) class PatroniResource(nagiosplugin.Resource): conn_info: ConnectionInfo @@ -76,6 +77,27 @@ class PatroniResource(nagiosplugin.Resource): return None raise nagiosplugin.CheckError("Connection failed for all provided endpoints") + @lru_cache(maxsize=None) + def has_detailed_states(self) -> bool: + # get patroni's version to find out if the "streaming" and "in archive recovery" states are available + patroni_item_dict = self.rest_api("patroni") + + if tuple( + int(v) for v in patroni_item_dict["patroni"]["version"].split(".", 2) + ) >= (3, 0, 4): + _log.debug( + "Patroni's version is %(version)s, more detailed states can be used to check for the health of replicas.", + {"version": patroni_item_dict["patroni"]["version"]}, + ) + + return True + + _log.debug( + "Patroni's version is %(version)s, the running state and the timelines must be used to check for the health of replicas.", + {"version": patroni_item_dict["patroni"]["version"]}, + ) + return False + HandleUnknown = Callable[[nagiosplugin.Summary, nagiosplugin.Results], Any] diff --git a/tests/__init__.py b/tests/__init__.py index e683599..aaecf11 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -50,12 +50,13 @@ class PatroniAPI(HTTPServer): def cluster_api_set_replica_running(in_json: Path, target_dir: Path) -> Path: - # starting from 3.0.4 the state of replicas is streaming instead of running + # starting from 3.0.4 the state of replicas is streaming or in archive recovery + # instead of running with in_json.open() as f: js = json.load(f) for node in js["members"]: if node["role"] in ["replica", "sync_standby"]: - if node["state"] == "streaming": + if 
node["state"] in ["streaming", "in archive recovery"]: node["state"] = "running" assert target_dir.is_dir() out_json = target_dir / in_json.name diff --git a/tests/json/cluster_has_replica_ko_all_replica.json b/tests/json/cluster_has_replica_ko_all_replica.json new file mode 100644 index 0000000..fe82d32 --- /dev/null +++ b/tests/json/cluster_has_replica_ko_all_replica.json @@ -0,0 +1,35 @@ +{ + "members": [ + { + "name": "srv1", + "role": "replica", + "state": "running", + "api_url": "https://10.20.199.3:8008/patroni", + "host": "10.20.199.3", + "port": 5432, + "timeline": 51, + "lag": 0 + }, + { + "name": "srv2", + "role": "replica", + "state": "running", + "api_url": "https://10.20.199.4:8008/patroni", + "host": "10.20.199.4", + "port": 5432, + "timeline": 51, + "lag": 0 + }, + { + "name": "srv3", + "role": "replica", + "state": "running", + "api_url": "https://10.20.199.5:8008/patroni", + "host": "10.20.199.5", + "port": 5432, + "timeline": 51, + "lag": 0 + + } + ] +} diff --git a/tests/json/cluster_has_replica_ko_wrong_tl.json b/tests/json/cluster_has_replica_ko_wrong_tl.json new file mode 100644 index 0000000..6889484 --- /dev/null +++ b/tests/json/cluster_has_replica_ko_wrong_tl.json @@ -0,0 +1,33 @@ +{ + "members": [ + { + "name": "srv1", + "role": "leader", + "state": "running", + "api_url": "https://10.20.199.3:8008/patroni", + "host": "10.20.199.3", + "port": 5432, + "timeline": 51 + }, + { + "name": "srv2", + "role": "replica", + "state": "running", + "api_url": "https://10.20.199.4:8008/patroni", + "host": "10.20.199.4", + "port": 5432, + "timeline": 50, + "lag": 1000000 + }, + { + "name": "srv3", + "role": "replica", + "state": "streaming", + "api_url": "https://10.20.199.5:8008/patroni", + "host": "10.20.199.5", + "port": 5432, + "timeline": 51, + "lag": 0 + } + ] +} diff --git a/tests/json/cluster_has_replica_ok.json b/tests/json/cluster_has_replica_ok.json index 44535e0..181ed4f 100644 --- a/tests/json/cluster_has_replica_ok.json +++ b/tests/json/cluster_has_replica_ok.json @@ -12,7 +12,7 @@ { "name": "srv2", "role": "replica", - "state": "streaming", + "state": "in archive recovery", "api_url": "https://10.20.199.4:8008/patroni", "host": "10.20.199.4", "port": 5432, diff --git a/tests/json/cluster_has_replica_patroni_verion_3.0.0.json b/tests/json/cluster_has_replica_patroni_verion_3.0.0.json new file mode 100644 index 0000000..9c922b8 --- /dev/null +++ b/tests/json/cluster_has_replica_patroni_verion_3.0.0.json @@ -0,0 +1,26 @@ +{ + "state": "running", + "postmaster_start_time": "2021-08-11 07:02:20.732 UTC", + "role": "master", + "server_version": 110012, + "cluster_unlocked": false, + "xlog": { + "location": 1174407088 + }, + "timeline": 51, + "replication": [ + { + "usename": "replicator", + "application_name": "srv1", + "client_addr": "10.20.199.3", + "state": "streaming", + "sync_state": "async", + "sync_priority": 0 + } + ], + "database_system_identifier": "6965971025273547206", + "patroni": { + "version": "3.0.0", + "scope": "patroni-demo" + } +} diff --git a/tests/json/cluster_has_replica_patroni_verion_3.1.0.json b/tests/json/cluster_has_replica_patroni_verion_3.1.0.json new file mode 100644 index 0000000..91e4348 --- /dev/null +++ b/tests/json/cluster_has_replica_patroni_verion_3.1.0.json @@ -0,0 +1,26 @@ +{ + "state": "running", + "postmaster_start_time": "2021-08-11 07:02:20.732 UTC", + "role": "master", + "server_version": 110012, + "cluster_unlocked": false, + "xlog": { + "location": 1174407088 + }, + "timeline": 51, + "replication": [ + { + "usename": 
"replicator", + "application_name": "srv1", + "client_addr": "10.20.199.3", + "state": "streaming", + "sync_state": "async", + "sync_priority": 0 + } + ], + "database_system_identifier": "6965971025273547206", + "patroni": { + "version": "3.1.0", + "scope": "patroni-demo" + } +} diff --git a/tests/test_cluster_has_replica.py b/tests/test_cluster_has_replica.py index ccbf6dd..a6a88c0 100644 --- a/tests/test_cluster_has_replica.py +++ b/tests/test_cluster_has_replica.py @@ -13,22 +13,23 @@ from . import PatroniAPI, cluster_api_set_replica_running def cluster_has_replica_ok( patroni_api: PatroniAPI, old_replica_state: bool, datadir: Path, tmp_path: Path ) -> Iterator[None]: - path: Union[str, Path] = "cluster_has_replica_ok.json" + cluster_path: Union[str, Path] = "cluster_has_replica_ok.json" + patroni_path = "cluster_has_replica_patroni_verion_3.1.0.json" if old_replica_state: - path = cluster_api_set_replica_running(datadir / path, tmp_path) - with patroni_api.routes({"cluster": path}): + cluster_path = cluster_api_set_replica_running(datadir / cluster_path, tmp_path) + patroni_path = "cluster_has_replica_patroni_verion_3.0.0.json" + with patroni_api.routes({"cluster": cluster_path, "patroni": patroni_path}): yield None -# TODO Lag threshold tests @pytest.mark.usefixtures("cluster_has_replica_ok") def test_cluster_has_relica_ok(runner: CliRunner, patroni_api: PatroniAPI) -> None: result = runner.invoke(main, ["-e", patroni_api.endpoint, "cluster_has_replica"]) - assert result.exit_code == 0 assert ( result.stdout - == "CLUSTERHASREPLICA OK - healthy_replica is 2 | healthy_replica=2 srv2_lag=0 srv2_sync=0 srv3_lag=0 srv3_sync=1 sync_replica=1 unhealthy_replica=0\n" + == "CLUSTERHASREPLICA OK - healthy_replica is 2 | healthy_replica=2 srv2_lag=0 srv2_sync=0 srv2_timeline=51 srv3_lag=0 srv3_sync=1 srv3_timeline=51 sync_replica=1 unhealthy_replica=0\n" ) + assert result.exit_code == 0 @pytest.mark.usefixtures("cluster_has_replica_ok") @@ -47,11 +48,11 @@ def test_cluster_has_replica_ok_with_count_thresholds( "@0", ], ) - assert result.exit_code == 0 assert ( result.stdout - == "CLUSTERHASREPLICA OK - healthy_replica is 2 | healthy_replica=2;@1;@0 srv2_lag=0 srv2_sync=0 srv3_lag=0 srv3_sync=1 sync_replica=1 unhealthy_replica=0\n" + == "CLUSTERHASREPLICA OK - healthy_replica is 2 | healthy_replica=2;@1;@0 srv2_lag=0 srv2_sync=0 srv2_timeline=51 srv3_lag=0 srv3_sync=1 srv3_timeline=51 sync_replica=1 unhealthy_replica=0\n" ) + assert result.exit_code == 0 @pytest.mark.usefixtures("cluster_has_replica_ok") @@ -68,21 +69,23 @@ def test_cluster_has_replica_ok_with_sync_count_thresholds( "1:", ], ) - assert result.exit_code == 0 assert ( result.stdout - == "CLUSTERHASREPLICA OK - healthy_replica is 2 | healthy_replica=2 srv2_lag=0 srv2_sync=0 srv3_lag=0 srv3_sync=1 sync_replica=1;1: unhealthy_replica=0\n" + == "CLUSTERHASREPLICA OK - healthy_replica is 2 | healthy_replica=2 srv2_lag=0 srv2_sync=0 srv2_timeline=51 srv3_lag=0 srv3_sync=1 srv3_timeline=51 sync_replica=1;1: unhealthy_replica=0\n" ) + assert result.exit_code == 0 @pytest.fixture def cluster_has_replica_ok_lag( patroni_api: PatroniAPI, datadir: Path, tmp_path: Path, old_replica_state: bool ) -> Iterator[None]: - path: Union[str, Path] = "cluster_has_replica_ok_lag.json" + cluster_path: Union[str, Path] = "cluster_has_replica_ok_lag.json" + patroni_path = "cluster_has_replica_patroni_verion_3.1.0.json" if old_replica_state: - path = cluster_api_set_replica_running(datadir / path, tmp_path) - with patroni_api.routes({"cluster": path}): + 
cluster_path = cluster_api_set_replica_running(datadir / cluster_path, tmp_path) + patroni_path = "cluster_has_replica_patroni_verion_3.0.0.json" + with patroni_api.routes({"cluster": cluster_path, "patroni": patroni_path}): yield None @@ -104,21 +107,23 @@ def test_cluster_has_replica_ok_with_count_thresholds_lag( "1MB", ], ) - assert result.exit_code == 0 assert ( result.stdout - == "CLUSTERHASREPLICA OK - healthy_replica is 2 | healthy_replica=2;@1;@0 srv2_lag=1024 srv2_sync=0 srv3_lag=0 srv3_sync=0 sync_replica=0 unhealthy_replica=0\n" + == "CLUSTERHASREPLICA OK - healthy_replica is 2 | healthy_replica=2;@1;@0 srv2_lag=1024 srv2_sync=0 srv2_timeline=51 srv3_lag=0 srv3_sync=0 srv3_timeline=51 sync_replica=0 unhealthy_replica=0\n" ) + assert result.exit_code == 0 @pytest.fixture def cluster_has_replica_ko( patroni_api: PatroniAPI, old_replica_state: bool, datadir: Path, tmp_path: Path ) -> Iterator[None]: - path: Union[str, Path] = "cluster_has_replica_ko.json" + cluster_path: Union[str, Path] = "cluster_has_replica_ko.json" + patroni_path: Union[str, Path] = "cluster_has_replica_patroni_verion_3.1.0.json" if old_replica_state: - path = cluster_api_set_replica_running(datadir / path, tmp_path) - with patroni_api.routes({"cluster": path}): + cluster_path = cluster_api_set_replica_running(datadir / cluster_path, tmp_path) + patroni_path = "cluster_has_replica_patroni_verion_3.0.0.json" + with patroni_api.routes({"cluster": cluster_path, "patroni": patroni_path}): yield None @@ -138,11 +143,11 @@ def test_cluster_has_replica_ko_with_count_thresholds( "@0", ], ) - assert result.exit_code == 1 assert ( result.stdout - == "CLUSTERHASREPLICA WARNING - healthy_replica is 1 (outside range @0:1) | healthy_replica=1;@1;@0 srv3_lag=0 srv3_sync=0 sync_replica=0 unhealthy_replica=1\n" + == "CLUSTERHASREPLICA WARNING - healthy_replica is 1 (outside range @0:1) | healthy_replica=1;@1;@0 srv3_lag=0 srv3_sync=0 srv3_timeline=51 sync_replica=0 unhealthy_replica=1\n" ) + assert result.exit_code == 1 @pytest.mark.usefixtures("cluster_has_replica_ko") @@ -161,21 +166,24 @@ def test_cluster_has_replica_ko_with_sync_count_thresholds( "1:", ], ) - assert result.exit_code == 2 + # The lag on srv2 is "unknown". 
We don't handle string in perfstats so we have to scratch all the second node stats assert ( result.stdout - == "CLUSTERHASREPLICA CRITICAL - sync_replica is 0 (outside range 1:) | healthy_replica=1 srv3_lag=0 srv3_sync=0 sync_replica=0;2:;1: unhealthy_replica=1\n" + == "CLUSTERHASREPLICA CRITICAL - sync_replica is 0 (outside range 1:) | healthy_replica=1 srv3_lag=0 srv3_sync=0 srv3_timeline=51 sync_replica=0;2:;1: unhealthy_replica=1\n" ) + assert result.exit_code == 2 @pytest.fixture def cluster_has_replica_ko_lag( patroni_api: PatroniAPI, old_replica_state: bool, datadir: Path, tmp_path: Path ) -> Iterator[None]: - path: Union[str, Path] = "cluster_has_replica_ko_lag.json" + cluster_path: Union[str, Path] = "cluster_has_replica_ko_lag.json" + patroni_path = "cluster_has_replica_patroni_verion_3.1.0.json" if old_replica_state: - path = cluster_api_set_replica_running(datadir / path, tmp_path) - with patroni_api.routes({"cluster": path}): + cluster_path = cluster_api_set_replica_running(datadir / cluster_path, tmp_path) + patroni_path = "cluster_has_replica_patroni_verion_3.0.0.json" + with patroni_api.routes({"cluster": cluster_path, "patroni": patroni_path}): yield None @@ -197,8 +205,84 @@ def test_cluster_has_replica_ko_with_count_thresholds_and_lag( "1MB", ], ) - assert result.exit_code == 2 assert ( result.stdout - == "CLUSTERHASREPLICA CRITICAL - healthy_replica is 0 (outside range @0:0) | healthy_replica=0;@1;@0 srv2_lag=10241024 srv2_sync=0 srv3_lag=20000000 srv3_sync=0 sync_replica=0 unhealthy_replica=2\n" + == "CLUSTERHASREPLICA CRITICAL - healthy_replica is 0 (outside range @0:0) | healthy_replica=0;@1;@0 srv2_lag=10241024 srv2_sync=0 srv2_timeline=51 srv3_lag=20000000 srv3_sync=0 srv3_timeline=51 sync_replica=0 unhealthy_replica=2\n" ) + assert result.exit_code == 2 + + +@pytest.fixture +def cluster_has_replica_ko_wrong_tl( + patroni_api: PatroniAPI, old_replica_state: bool, datadir: Path, tmp_path: Path +) -> Iterator[None]: + cluster_path: Union[str, Path] = "cluster_has_replica_ko_wrong_tl.json" + patroni_path = "cluster_has_replica_patroni_verion_3.1.0.json" + if old_replica_state: + cluster_path = cluster_api_set_replica_running(datadir / cluster_path, tmp_path) + patroni_path = "cluster_has_replica_patroni_verion_3.0.0.json" + with patroni_api.routes({"cluster": cluster_path, "patroni": patroni_path}): + yield None + + +@pytest.mark.usefixtures("cluster_has_replica_ko_wrong_tl") +def test_cluster_has_replica_ko_wrong_tl( + runner: CliRunner, patroni_api: PatroniAPI +) -> None: + result = runner.invoke( + main, + [ + "-e", + patroni_api.endpoint, + "cluster_has_replica", + "--warning", + "@1", + "--critical", + "@0", + "--max-lag", + "1MB", + ], + ) + assert ( + result.stdout + == "CLUSTERHASREPLICA WARNING - healthy_replica is 1 (outside range @0:1) | healthy_replica=1;@1;@0 srv2_lag=1000000 srv2_sync=0 srv2_timeline=50 srv3_lag=0 srv3_sync=0 srv3_timeline=51 sync_replica=0 unhealthy_replica=1\n" + ) + assert result.exit_code == 1 + + +@pytest.fixture +def cluster_has_replica_ko_all_replica( + patroni_api: PatroniAPI, old_replica_state: bool, datadir: Path, tmp_path: Path +) -> Iterator[None]: + cluster_path: Union[str, Path] = "cluster_has_replica_ko_all_replica.json" + patroni_path = "cluster_has_replica_patroni_verion_3.1.0.json" + if old_replica_state: + cluster_path = cluster_api_set_replica_running(datadir / cluster_path, tmp_path) + patroni_path = "cluster_has_replica_patroni_verion_3.0.0.json" + with patroni_api.routes({"cluster": cluster_path, "patroni": 
patroni_path}): + yield None + + +@pytest.mark.usefixtures("cluster_has_replica_ko_all_replica") +def test_cluster_has_replica_ko_all_replica( + runner: CliRunner, patroni_api: PatroniAPI +) -> None: + result = runner.invoke( + main, + [ + "-e", + patroni_api.endpoint, + "cluster_has_replica", + "--warning", + "@1", + "--critical", + "@0", + "--max-lag", + "1MB", + ], + ) + assert ( + result.stdout + == "CLUSTERHASREPLICA CRITICAL - healthy_replica is 0 (outside range @0:0) | healthy_replica=0;@1;@0 srv1_lag=0 srv1_sync=0 srv1_timeline=51 srv2_lag=0 srv2_sync=0 srv2_timeline=51 srv3_lag=0 srv3_sync=0 srv3_timeline=51 sync_replica=0 unhealthy_replica=3\n" + ) + assert result.exit_code == 2
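
Note for reviewers: the detection rule introduced by this patch can be summarised
with the following minimal, standalone Python sketch. It is illustrative only:
`classify_replicas`, the `members` list and the `detailed_states` flag are names
invented for this example (they assume a `members` list shaped like Patroni's
`/cluster` endpoint and a version flag derived from `/patroni`); the actual
implementation lives in `ClusterHasReplica.probe()` and
`PatroniResource.has_detailed_states()`.

# Reviewer sketch, not part of the patch.
from typing import Any, Dict, List, Optional, Tuple


def classify_replicas(
    members: List[Dict[str, Any]],
    detailed_states: bool,
    max_lag: Optional[int] = None,
) -> Tuple[int, int]:
    """Return (healthy, unhealthy) counts for the replicas of a cluster.

    `members` mimics the `members` list of Patroni's /cluster endpoint;
    `detailed_states` stands for "patroni >= 3.0.4".
    """
    healthy = unhealthy = 0

    # The leader's timeline is the reference every replica must match.
    leader_tl = next(
        (int(m["timeline"]) for m in members if m["role"] == "leader"), None
    )

    # Before 3.0.4 replicas only report "running"; afterwards the state
    # distinguishes "streaming" from "in archive recovery".
    good_states = (
        ("streaming", "in archive recovery") if detailed_states else ("running",)
    )

    for m in members:
        if m["role"] not in ("replica", "sync_standby"):
            continue
        if (
            m["lag"] == "unknown"  # stopped node, no usable stats
            or m["state"] not in good_states
            or int(m["timeline"]) != leader_tl
            or (max_lag is not None and int(m["lag"]) > max_lag)
        ):
            unhealthy += 1
        else:
            healthy += 1
    return healthy, unhealthy


# Mirrors cluster_has_replica_ko_wrong_tl.json: srv2 sits on timeline 50
# while the leader is on 51, so only srv3 counts as healthy -> (1, 1).
members = [
    {"name": "srv1", "role": "leader", "state": "running", "timeline": 51},
    {"name": "srv2", "role": "replica", "state": "running", "timeline": 50, "lag": 1000000},
    {"name": "srv3", "role": "replica", "state": "streaming", "timeline": 51, "lag": 0},
]
print(classify_replicas(members, detailed_states=True, max_lag=1024 * 1024))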