check-patroni/tests/test_cluster_has_replica.py
benoit 8d6b8502b6 cluster_has_replica: fix the way a healthy replica is detected
For patroni >= version 3.0.4:
* the role is `replica` or `sync_standby`
* the state is `streaming` or `in archive recovery`
* the timeline is the same as the leader
* the lag is lower or equal to `max_lag`

For prio versions of patroni:
* the role is `replica` or `sync_standby`
* the state is `running`
* the timeline is the same as the leader
* the lag is lower or equal to `max_lag`

Additionnally, we now display the timeline in the perfstats. We also try
to display the perf stats of unhealthy replica as much as possible.

Update tests for cluster_has_replica:
* Fix the tests to make them work with the new algotithm
* Add a specific test for tl divergences
2023-11-11 10:50:35 +01:00

289 lines
9.8 KiB
Python

from pathlib import Path
from typing import Iterator, Union
import pytest
from click.testing import CliRunner
from check_patroni.cli import main
from . import PatroniAPI, cluster_api_set_replica_running
@pytest.fixture
def cluster_has_replica_ok(
patroni_api: PatroniAPI, old_replica_state: bool, datadir: Path, tmp_path: Path
) -> Iterator[None]:
cluster_path: Union[str, Path] = "cluster_has_replica_ok.json"
patroni_path = "cluster_has_replica_patroni_verion_3.1.0.json"
if old_replica_state:
cluster_path = cluster_api_set_replica_running(datadir / cluster_path, tmp_path)
patroni_path = "cluster_has_replica_patroni_verion_3.0.0.json"
with patroni_api.routes({"cluster": cluster_path, "patroni": patroni_path}):
yield None
@pytest.mark.usefixtures("cluster_has_replica_ok")
def test_cluster_has_relica_ok(runner: CliRunner, patroni_api: PatroniAPI) -> None:
result = runner.invoke(main, ["-e", patroni_api.endpoint, "cluster_has_replica"])
assert (
result.stdout
== "CLUSTERHASREPLICA OK - healthy_replica is 2 | healthy_replica=2 srv2_lag=0 srv2_sync=0 srv2_timeline=51 srv3_lag=0 srv3_sync=1 srv3_timeline=51 sync_replica=1 unhealthy_replica=0\n"
)
assert result.exit_code == 0
@pytest.mark.usefixtures("cluster_has_replica_ok")
def test_cluster_has_replica_ok_with_count_thresholds(
runner: CliRunner, patroni_api: PatroniAPI
) -> None:
result = runner.invoke(
main,
[
"-e",
patroni_api.endpoint,
"cluster_has_replica",
"--warning",
"@1",
"--critical",
"@0",
],
)
assert (
result.stdout
== "CLUSTERHASREPLICA OK - healthy_replica is 2 | healthy_replica=2;@1;@0 srv2_lag=0 srv2_sync=0 srv2_timeline=51 srv3_lag=0 srv3_sync=1 srv3_timeline=51 sync_replica=1 unhealthy_replica=0\n"
)
assert result.exit_code == 0
@pytest.mark.usefixtures("cluster_has_replica_ok")
def test_cluster_has_replica_ok_with_sync_count_thresholds(
runner: CliRunner, patroni_api: PatroniAPI
) -> None:
result = runner.invoke(
main,
[
"-e",
patroni_api.endpoint,
"cluster_has_replica",
"--sync-warning",
"1:",
],
)
assert (
result.stdout
== "CLUSTERHASREPLICA OK - healthy_replica is 2 | healthy_replica=2 srv2_lag=0 srv2_sync=0 srv2_timeline=51 srv3_lag=0 srv3_sync=1 srv3_timeline=51 sync_replica=1;1: unhealthy_replica=0\n"
)
assert result.exit_code == 0
@pytest.fixture
def cluster_has_replica_ok_lag(
patroni_api: PatroniAPI, datadir: Path, tmp_path: Path, old_replica_state: bool
) -> Iterator[None]:
cluster_path: Union[str, Path] = "cluster_has_replica_ok_lag.json"
patroni_path = "cluster_has_replica_patroni_verion_3.1.0.json"
if old_replica_state:
cluster_path = cluster_api_set_replica_running(datadir / cluster_path, tmp_path)
patroni_path = "cluster_has_replica_patroni_verion_3.0.0.json"
with patroni_api.routes({"cluster": cluster_path, "patroni": patroni_path}):
yield None
@pytest.mark.usefixtures("cluster_has_replica_ok_lag")
def test_cluster_has_replica_ok_with_count_thresholds_lag(
runner: CliRunner, patroni_api: PatroniAPI
) -> None:
result = runner.invoke(
main,
[
"-e",
patroni_api.endpoint,
"cluster_has_replica",
"--warning",
"@1",
"--critical",
"@0",
"--max-lag",
"1MB",
],
)
assert (
result.stdout
== "CLUSTERHASREPLICA OK - healthy_replica is 2 | healthy_replica=2;@1;@0 srv2_lag=1024 srv2_sync=0 srv2_timeline=51 srv3_lag=0 srv3_sync=0 srv3_timeline=51 sync_replica=0 unhealthy_replica=0\n"
)
assert result.exit_code == 0
@pytest.fixture
def cluster_has_replica_ko(
patroni_api: PatroniAPI, old_replica_state: bool, datadir: Path, tmp_path: Path
) -> Iterator[None]:
cluster_path: Union[str, Path] = "cluster_has_replica_ko.json"
patroni_path: Union[str, Path] = "cluster_has_replica_patroni_verion_3.1.0.json"
if old_replica_state:
cluster_path = cluster_api_set_replica_running(datadir / cluster_path, tmp_path)
patroni_path = "cluster_has_replica_patroni_verion_3.0.0.json"
with patroni_api.routes({"cluster": cluster_path, "patroni": patroni_path}):
yield None
@pytest.mark.usefixtures("cluster_has_replica_ko")
def test_cluster_has_replica_ko_with_count_thresholds(
runner: CliRunner, patroni_api: PatroniAPI
) -> None:
result = runner.invoke(
main,
[
"-e",
patroni_api.endpoint,
"cluster_has_replica",
"--warning",
"@1",
"--critical",
"@0",
],
)
assert (
result.stdout
== "CLUSTERHASREPLICA WARNING - healthy_replica is 1 (outside range @0:1) | healthy_replica=1;@1;@0 srv3_lag=0 srv3_sync=0 srv3_timeline=51 sync_replica=0 unhealthy_replica=1\n"
)
assert result.exit_code == 1
@pytest.mark.usefixtures("cluster_has_replica_ko")
def test_cluster_has_replica_ko_with_sync_count_thresholds(
runner: CliRunner, patroni_api: PatroniAPI
) -> None:
result = runner.invoke(
main,
[
"-e",
patroni_api.endpoint,
"cluster_has_replica",
"--sync-warning",
"2:",
"--sync-critical",
"1:",
],
)
# The lag on srv2 is "unknown". We don't handle string in perfstats so we have to scratch all the second node stats
assert (
result.stdout
== "CLUSTERHASREPLICA CRITICAL - sync_replica is 0 (outside range 1:) | healthy_replica=1 srv3_lag=0 srv3_sync=0 srv3_timeline=51 sync_replica=0;2:;1: unhealthy_replica=1\n"
)
assert result.exit_code == 2
@pytest.fixture
def cluster_has_replica_ko_lag(
patroni_api: PatroniAPI, old_replica_state: bool, datadir: Path, tmp_path: Path
) -> Iterator[None]:
cluster_path: Union[str, Path] = "cluster_has_replica_ko_lag.json"
patroni_path = "cluster_has_replica_patroni_verion_3.1.0.json"
if old_replica_state:
cluster_path = cluster_api_set_replica_running(datadir / cluster_path, tmp_path)
patroni_path = "cluster_has_replica_patroni_verion_3.0.0.json"
with patroni_api.routes({"cluster": cluster_path, "patroni": patroni_path}):
yield None
@pytest.mark.usefixtures("cluster_has_replica_ko_lag")
def test_cluster_has_replica_ko_with_count_thresholds_and_lag(
runner: CliRunner, patroni_api: PatroniAPI
) -> None:
result = runner.invoke(
main,
[
"-e",
patroni_api.endpoint,
"cluster_has_replica",
"--warning",
"@1",
"--critical",
"@0",
"--max-lag",
"1MB",
],
)
assert (
result.stdout
== "CLUSTERHASREPLICA CRITICAL - healthy_replica is 0 (outside range @0:0) | healthy_replica=0;@1;@0 srv2_lag=10241024 srv2_sync=0 srv2_timeline=51 srv3_lag=20000000 srv3_sync=0 srv3_timeline=51 sync_replica=0 unhealthy_replica=2\n"
)
assert result.exit_code == 2
@pytest.fixture
def cluster_has_replica_ko_wrong_tl(
patroni_api: PatroniAPI, old_replica_state: bool, datadir: Path, tmp_path: Path
) -> Iterator[None]:
cluster_path: Union[str, Path] = "cluster_has_replica_ko_wrong_tl.json"
patroni_path = "cluster_has_replica_patroni_verion_3.1.0.json"
if old_replica_state:
cluster_path = cluster_api_set_replica_running(datadir / cluster_path, tmp_path)
patroni_path = "cluster_has_replica_patroni_verion_3.0.0.json"
with patroni_api.routes({"cluster": cluster_path, "patroni": patroni_path}):
yield None
@pytest.mark.usefixtures("cluster_has_replica_ko_wrong_tl")
def test_cluster_has_replica_ko_wrong_tl(
runner: CliRunner, patroni_api: PatroniAPI
) -> None:
result = runner.invoke(
main,
[
"-e",
patroni_api.endpoint,
"cluster_has_replica",
"--warning",
"@1",
"--critical",
"@0",
"--max-lag",
"1MB",
],
)
assert (
result.stdout
== "CLUSTERHASREPLICA WARNING - healthy_replica is 1 (outside range @0:1) | healthy_replica=1;@1;@0 srv2_lag=1000000 srv2_sync=0 srv2_timeline=50 srv3_lag=0 srv3_sync=0 srv3_timeline=51 sync_replica=0 unhealthy_replica=1\n"
)
assert result.exit_code == 1
@pytest.fixture
def cluster_has_replica_ko_all_replica(
patroni_api: PatroniAPI, old_replica_state: bool, datadir: Path, tmp_path: Path
) -> Iterator[None]:
cluster_path: Union[str, Path] = "cluster_has_replica_ko_all_replica.json"
patroni_path = "cluster_has_replica_patroni_verion_3.1.0.json"
if old_replica_state:
cluster_path = cluster_api_set_replica_running(datadir / cluster_path, tmp_path)
patroni_path = "cluster_has_replica_patroni_verion_3.0.0.json"
with patroni_api.routes({"cluster": cluster_path, "patroni": patroni_path}):
yield None
@pytest.mark.usefixtures("cluster_has_replica_ko_all_replica")
def test_cluster_has_replica_ko_all_replica(
runner: CliRunner, patroni_api: PatroniAPI
) -> None:
result = runner.invoke(
main,
[
"-e",
patroni_api.endpoint,
"cluster_has_replica",
"--warning",
"@1",
"--critical",
"@0",
"--max-lag",
"1MB",
],
)
assert (
result.stdout
== "CLUSTERHASREPLICA CRITICAL - healthy_replica is 0 (outside range @0:0) | healthy_replica=0;@1;@0 srv1_lag=0 srv1_sync=0 srv1_timeline=51 srv2_lag=0 srv2_sync=0 srv2_timeline=51 srv3_lag=0 srv3_sync=0 srv3_timeline=51 sync_replica=0 unhealthy_replica=3\n"
)
assert result.exit_code == 2