Fix cluster_node_count's management of replication states
The service now supports the `streaming` state. Since we don't check for lag or timeline in this service, a healthy node is: * leader: in a running state * standby_leader: running (pre Patroni 3.0.4), streaming otherwise * standby & sync_standby: running (pre Patroni 3.0.4), streaming otherwise. Updated the tests for this service.
This commit is contained in:
parent
46db3e2d15
commit
78ef0f6ada
|
@ -2,6 +2,11 @@
|
|||
|
||||
## Unreleased
|
||||
|
||||
### Changed
|
||||
|
||||
* In `cluster_node_count`, a healthy standby, sync replica or standby leader cannot be "in
|
||||
archive recovery" because this service doesn't check for lag and timelines.
|
||||
|
||||
### Added
|
||||
|
||||
* Add the timeline in the `cluster_has_replica` perfstats. (#50)
|
||||
|
@ -15,6 +20,7 @@
|
|||
* Fix what `cluster_has_replica` deems a healthy replica. (#50, reported by @mbanck)
|
||||
* Fix `cluster_has_replica` to display perfstats for replicas whenever it's possible (healthy or not). (#50)
|
||||
* Fix `cluster_has_leader` to correctly check for standby leaders. (#58, reported by @mbanck)
|
||||
* Fix `cluster_node_count` to correctly manage replication states. (#50, reported by @mbanck)
|
||||
|
||||
### Misc
|
||||
|
||||
|
|
29
README.md
29
README.md
|
@ -299,26 +299,37 @@ Usage: check_patroni cluster_node_count [OPTIONS]
|
|||
|
||||
Count the number of nodes in the cluster.
|
||||
|
||||
The role refers to the role of the server in the cluster. Possible values
|
||||
are:
|
||||
* master or leader
|
||||
* replica
|
||||
* standby_leader
|
||||
* sync_standby
|
||||
* demoted
|
||||
* promoted
|
||||
* uninitialized
|
||||
|
||||
The state refers to the state of PostgreSQL. Possible values are:
|
||||
* initializing new cluster, initdb failed
|
||||
* running custom bootstrap script, custom bootstrap failed
|
||||
* starting, start failed
|
||||
* restarting, restart failed
|
||||
* running, streaming (for a replica V3.0.4)
|
||||
* running, streaming, in archive recovery
|
||||
* stopping, stopped, stop failed
|
||||
* creating replica
|
||||
* crashed
|
||||
|
||||
The role refers to the role of the server in the cluster. Possible values
|
||||
are:
|
||||
* master or leader (V3.0.0+)
|
||||
* replica
|
||||
* demoted
|
||||
* promoted
|
||||
* uninitialized
|
||||
The "healthy" checks only ensure that:
|
||||
* a leader has the running state
|
||||
* a standby_leader has the running or streaming (V3.0.4) state
|
||||
* a replica or sync-standby has the running or streaming (V3.0.4) state
|
||||
|
||||
Since we don't check the lag or timeline, "in archive recovery" is not
|
||||
considered a valid state for this service. See cluster_has_leader and
|
||||
cluster_has_replica for specialized checks.
|
||||
|
||||
Check:
|
||||
* Compares the number of nodes against the normal and healthy (running + streaming) nodes warning and critical thresholds.
|
||||
* Compares the number of nodes against the normal and healthy nodes warning and critical thresholds.
|
||||
* `OK`: If they are not provided.
|
||||
|
||||
Perfdata:
|
||||
|
|
|
@ -226,29 +226,40 @@ def cluster_node_count(
|
|||
) -> None:
|
||||
"""Count the number of nodes in the cluster.
|
||||
|
||||
\b
|
||||
The state refers to the state of PostgreSQL. Possible values are:
|
||||
* initializing new cluster, initdb failed
|
||||
* running custom bootstrap script, custom bootstrap failed
|
||||
* starting, start failed
|
||||
* restarting, restart failed
|
||||
* running, streaming (for a replica V3.0.4)
|
||||
* stopping, stopped, stop failed
|
||||
* creating replica
|
||||
* crashed
|
||||
|
||||
\b
|
||||
The role refers to the role of the server in the cluster. Possible values
|
||||
are:
|
||||
* master or leader (V3.0.0+)
|
||||
* master or leader
|
||||
* replica
|
||||
* standby_leader
|
||||
* sync_standby
|
||||
* demoted
|
||||
* promoted
|
||||
* uninitialized
|
||||
|
||||
\b
|
||||
The state refers to the state of PostgreSQL. Possible values are:
|
||||
* initializing new cluster, initdb failed
|
||||
* running custom bootstrap script, custom bootstrap failed
|
||||
* starting, start failed
|
||||
* restarting, restart failed
|
||||
* running, streaming, in archive recovery
|
||||
* stopping, stopped, stop failed
|
||||
* creating replica
|
||||
* crashed
|
||||
|
||||
\b
|
||||
The "healthy" checks only ensure that:
|
||||
* a leader has the running state
|
||||
* a standby_leader has the running or streaming (V3.0.4) state
|
||||
* a replica or sync-standby has the running or streaming (V3.0.4) state
|
||||
|
||||
Since we don't check the lag or timeline, "in archive recovery" is not considered a valid state
|
||||
for this service. See cluster_has_leader and cluster_has_replica for specialized checks.
|
||||
|
||||
\b
|
||||
Check:
|
||||
* Compares the number of nodes against the normal and healthy (running + streaming) nodes warning and critical thresholds.
|
||||
* Compares the number of nodes against the normal and healthy nodes warning and critical thresholds.
|
||||
* `OK`: If they are not provided.
|
||||
|
||||
\b
|
||||
|
|
|
@ -15,24 +15,51 @@ def replace_chars(text: str) -> str:
|
|||
|
||||
class ClusterNodeCount(PatroniResource):
|
||||
def probe(self) -> Iterable[nagiosplugin.Metric]:
|
||||
def debug_member(member: Any, health: str) -> None:
|
||||
_log.debug(
|
||||
"Node %(node_name)s is %(health)s: role %(role)s state %(state)s.",
|
||||
{
|
||||
"node_name": member["name"],
|
||||
"health": health,
|
||||
"role": member["role"],
|
||||
"state": member["state"],
|
||||
},
|
||||
)
|
||||
|
||||
# get the cluster info
|
||||
item_dict = self.rest_api("cluster")
|
||||
|
||||
role_counters: Counter[str] = Counter()
|
||||
roles = []
|
||||
status_counters: Counter[str] = Counter()
|
||||
statuses = []
|
||||
healthy_member = 0
|
||||
|
||||
for member in item_dict["members"]:
|
||||
roles.append(replace_chars(member["role"]))
|
||||
statuses.append(replace_chars(member["state"]))
|
||||
state, role = member["state"], member["role"]
|
||||
roles.append(replace_chars(role))
|
||||
statuses.append(replace_chars(state))
|
||||
|
||||
if role == "leader" and state == "running":
|
||||
healthy_member += 1
|
||||
debug_member(member, "healthy")
|
||||
continue
|
||||
|
||||
if role in ["standby_leader", "replica", "sync_standby"] and (
|
||||
(self.has_detailed_states() and state == "streaming")
|
||||
or (not self.has_detailed_states() and state == "running")
|
||||
):
|
||||
healthy_member += 1
|
||||
debug_member(member, "healthy")
|
||||
continue
|
||||
|
||||
debug_member(member, "unhealthy")
|
||||
role_counters.update(roles)
|
||||
status_counters.update(statuses)
|
||||
|
||||
# The actual check: members, healthy_members
|
||||
yield nagiosplugin.Metric("members", len(item_dict["members"]))
|
||||
yield nagiosplugin.Metric(
|
||||
"healthy_members",
|
||||
status_counters["running"] + status_counters.get("streaming", 0),
|
||||
)
|
||||
yield nagiosplugin.Metric("healthy_members", healthy_member)
|
||||
|
||||
# The performance data : role
|
||||
for role in role_counters:
|
||||
|
|
|
@ -55,7 +55,7 @@ def cluster_api_set_replica_running(in_json: Path, target_dir: Path) -> Path:
|
|||
with in_json.open() as f:
|
||||
js = json.load(f)
|
||||
for node in js["members"]:
|
||||
if node["role"] in ["replica", "sync_standby"]:
|
||||
if node["role"] in ["replica", "sync_standby", "standby_leader"]:
|
||||
if node["state"] in ["streaming", "in archive recovery"]:
|
||||
node["state"] = "running"
|
||||
assert target_dir.is_dir()
|
||||
|
|
33
tests/json/cluster_node_count_ko_in_archive_recovery.json
Normal file
33
tests/json/cluster_node_count_ko_in_archive_recovery.json
Normal file
|
@ -0,0 +1,33 @@
|
|||
{
|
||||
"members": [
|
||||
{
|
||||
"name": "srv1",
|
||||
"role": "standby_leader",
|
||||
"state": "in archive recovery",
|
||||
"api_url": "https://10.20.199.3:8008/patroni",
|
||||
"host": "10.20.199.3",
|
||||
"port": 5432,
|
||||
"timeline": 51
|
||||
},
|
||||
{
|
||||
"name": "srv2",
|
||||
"role": "replica",
|
||||
"state": "in archive recovery",
|
||||
"api_url": "https://10.20.199.4:8008/patroni",
|
||||
"host": "10.20.199.4",
|
||||
"port": 5432,
|
||||
"timeline": 51,
|
||||
"lag": 0
|
||||
},
|
||||
{
|
||||
"name": "srv3",
|
||||
"role": "replica",
|
||||
"state": "streaming",
|
||||
"api_url": "https://10.20.199.5:8008/patroni",
|
||||
"host": "10.20.199.5",
|
||||
"port": 5432,
|
||||
"timeline": 51,
|
||||
"lag": 0
|
||||
}
|
||||
]
|
||||
}
|
|
@ -13,10 +13,12 @@ from . import PatroniAPI, cluster_api_set_replica_running
|
|||
def cluster_node_count_ok(
|
||||
patroni_api: PatroniAPI, old_replica_state: bool, datadir: Path, tmp_path: Path
|
||||
) -> Iterator[None]:
|
||||
path: Union[str, Path] = "cluster_node_count_ok.json"
|
||||
cluster_path: Union[str, Path] = "cluster_node_count_ok.json"
|
||||
patroni_path = "cluster_has_replica_patroni_verion_3.1.0.json"
|
||||
if old_replica_state:
|
||||
path = cluster_api_set_replica_running(datadir / path, tmp_path)
|
||||
with patroni_api.routes({"cluster": path}):
|
||||
cluster_path = cluster_api_set_replica_running(datadir / cluster_path, tmp_path)
|
||||
patroni_path = "cluster_has_replica_patroni_verion_3.0.0.json"
|
||||
with patroni_api.routes({"cluster": cluster_path, "patroni": patroni_path}):
|
||||
yield None
|
||||
|
||||
|
||||
|
@ -25,7 +27,6 @@ def test_cluster_node_count_ok(
|
|||
runner: CliRunner, patroni_api: PatroniAPI, old_replica_state: bool
|
||||
) -> None:
|
||||
result = runner.invoke(main, ["-e", patroni_api.endpoint, "cluster_node_count"])
|
||||
assert result.exit_code == 0
|
||||
if old_replica_state:
|
||||
assert (
|
||||
result.output
|
||||
|
@ -36,6 +37,7 @@ def test_cluster_node_count_ok(
|
|||
result.output
|
||||
== "CLUSTERNODECOUNT OK - members is 3 | healthy_members=3 members=3 role_leader=1 role_replica=2 state_running=1 state_streaming=2\n"
|
||||
)
|
||||
assert result.exit_code == 0
|
||||
|
||||
|
||||
@pytest.mark.usefixtures("cluster_node_count_ok")
|
||||
|
@ -58,7 +60,6 @@ def test_cluster_node_count_ok_with_thresholds(
|
|||
"@0:1",
|
||||
],
|
||||
)
|
||||
assert result.exit_code == 0
|
||||
if old_replica_state:
|
||||
assert (
|
||||
result.output
|
||||
|
@ -69,16 +70,19 @@ def test_cluster_node_count_ok_with_thresholds(
|
|||
result.output
|
||||
== "CLUSTERNODECOUNT OK - members is 3 | healthy_members=3;@2;@1 members=3;@1;@2 role_leader=1 role_replica=2 state_running=1 state_streaming=2\n"
|
||||
)
|
||||
assert result.exit_code == 0
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def cluster_node_count_healthy_warning(
|
||||
patroni_api: PatroniAPI, old_replica_state: bool, datadir: Path, tmp_path: Path
|
||||
) -> Iterator[None]:
|
||||
path: Union[str, Path] = "cluster_node_count_healthy_warning.json"
|
||||
cluster_path: Union[str, Path] = "cluster_node_count_healthy_warning.json"
|
||||
patroni_path = "cluster_has_replica_patroni_verion_3.1.0.json"
|
||||
if old_replica_state:
|
||||
path = cluster_api_set_replica_running(datadir / path, tmp_path)
|
||||
with patroni_api.routes({"cluster": path}):
|
||||
cluster_path = cluster_api_set_replica_running(datadir / cluster_path, tmp_path)
|
||||
patroni_path = "cluster_has_replica_patroni_verion_3.0.0.json"
|
||||
with patroni_api.routes({"cluster": cluster_path, "patroni": patroni_path}):
|
||||
yield None
|
||||
|
||||
|
||||
|
@ -98,7 +102,6 @@ def test_cluster_node_count_healthy_warning(
|
|||
"@0:1",
|
||||
],
|
||||
)
|
||||
assert result.exit_code == 1
|
||||
if old_replica_state:
|
||||
assert (
|
||||
result.output
|
||||
|
@ -109,16 +112,19 @@ def test_cluster_node_count_healthy_warning(
|
|||
result.output
|
||||
== "CLUSTERNODECOUNT WARNING - healthy_members is 2 (outside range @0:2) | healthy_members=2;@2;@1 members=2 role_leader=1 role_replica=1 state_running=1 state_streaming=1\n"
|
||||
)
|
||||
assert result.exit_code == 1
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def cluster_node_count_healthy_critical(
|
||||
patroni_api: PatroniAPI, old_replica_state: bool, datadir: Path, tmp_path: Path
|
||||
) -> Iterator[None]:
|
||||
path: Union[str, Path] = "cluster_node_count_healthy_critical.json"
|
||||
cluster_path: Union[str, Path] = "cluster_node_count_healthy_critical.json"
|
||||
patroni_path = "cluster_has_replica_patroni_verion_3.1.0.json"
|
||||
if old_replica_state:
|
||||
path = cluster_api_set_replica_running(datadir / path, tmp_path)
|
||||
with patroni_api.routes({"cluster": path}):
|
||||
cluster_path = cluster_api_set_replica_running(datadir / cluster_path, tmp_path)
|
||||
patroni_path = "cluster_has_replica_patroni_verion_3.0.0.json"
|
||||
with patroni_api.routes({"cluster": cluster_path, "patroni": patroni_path}):
|
||||
yield None
|
||||
|
||||
|
||||
|
@ -138,21 +144,23 @@ def test_cluster_node_count_healthy_critical(
|
|||
"@0:1",
|
||||
],
|
||||
)
|
||||
assert result.exit_code == 2
|
||||
assert (
|
||||
result.output
|
||||
== "CLUSTERNODECOUNT CRITICAL - healthy_members is 1 (outside range @0:1) | healthy_members=1;@2;@1 members=3 role_leader=1 role_replica=2 state_running=1 state_start_failed=2\n"
|
||||
)
|
||||
assert result.exit_code == 2
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def cluster_node_count_warning(
|
||||
patroni_api: PatroniAPI, old_replica_state: bool, datadir: Path, tmp_path: Path
|
||||
) -> Iterator[None]:
|
||||
path: Union[str, Path] = "cluster_node_count_warning.json"
|
||||
cluster_path: Union[str, Path] = "cluster_node_count_warning.json"
|
||||
patroni_path = "cluster_has_replica_patroni_verion_3.1.0.json"
|
||||
if old_replica_state:
|
||||
path = cluster_api_set_replica_running(datadir / path, tmp_path)
|
||||
with patroni_api.routes({"cluster": path}):
|
||||
cluster_path = cluster_api_set_replica_running(datadir / cluster_path, tmp_path)
|
||||
patroni_path = "cluster_has_replica_patroni_verion_3.0.0.json"
|
||||
with patroni_api.routes({"cluster": cluster_path, "patroni": patroni_path}):
|
||||
yield None
|
||||
|
||||
|
||||
|
@ -172,7 +180,6 @@ def test_cluster_node_count_warning(
|
|||
"@0:1",
|
||||
],
|
||||
)
|
||||
assert result.exit_code == 1
|
||||
if old_replica_state:
|
||||
assert (
|
||||
result.stdout
|
||||
|
@ -183,16 +190,19 @@ def test_cluster_node_count_warning(
|
|||
result.stdout
|
||||
== "CLUSTERNODECOUNT WARNING - members is 2 (outside range @0:2) | healthy_members=2 members=2;@2;@1 role_leader=1 role_replica=1 state_running=1 state_streaming=1\n"
|
||||
)
|
||||
assert result.exit_code == 1
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def cluster_node_count_critical(
|
||||
patroni_api: PatroniAPI, old_replica_state: bool, datadir: Path, tmp_path: Path
|
||||
) -> Iterator[None]:
|
||||
path: Union[str, Path] = "cluster_node_count_critical.json"
|
||||
cluster_path: Union[str, Path] = "cluster_node_count_critical.json"
|
||||
patroni_path = "cluster_has_replica_patroni_verion_3.1.0.json"
|
||||
if old_replica_state:
|
||||
path = cluster_api_set_replica_running(datadir / path, tmp_path)
|
||||
with patroni_api.routes({"cluster": path}):
|
||||
cluster_path = cluster_api_set_replica_running(datadir / cluster_path, tmp_path)
|
||||
patroni_path = "cluster_has_replica_patroni_verion_3.0.0.json"
|
||||
with patroni_api.routes({"cluster": cluster_path, "patroni": patroni_path}):
|
||||
yield None
|
||||
|
||||
|
||||
|
@ -212,8 +222,51 @@ def test_cluster_node_count_critical(
|
|||
"@0:1",
|
||||
],
|
||||
)
|
||||
assert result.exit_code == 2
|
||||
assert (
|
||||
result.stdout
|
||||
== "CLUSTERNODECOUNT CRITICAL - members is 1 (outside range @0:1) | healthy_members=1 members=1;@2;@1 role_leader=1 state_running=1\n"
|
||||
)
|
||||
assert result.exit_code == 2
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def cluster_node_count_ko_in_archive_recovery(
|
||||
patroni_api: PatroniAPI, old_replica_state: bool, datadir: Path, tmp_path: Path
|
||||
) -> Iterator[None]:
|
||||
cluster_path: Union[str, Path] = "cluster_node_count_ko_in_archive_recovery.json"
|
||||
patroni_path = "cluster_has_replica_patroni_verion_3.1.0.json"
|
||||
if old_replica_state:
|
||||
cluster_path = cluster_api_set_replica_running(datadir / cluster_path, tmp_path)
|
||||
patroni_path = "cluster_has_replica_patroni_verion_3.0.0.json"
|
||||
with patroni_api.routes({"cluster": cluster_path, "patroni": patroni_path}):
|
||||
yield None
|
||||
|
||||
|
||||
@pytest.mark.usefixtures("cluster_node_count_ko_in_archive_recovery")
|
||||
def test_cluster_node_count_ko_in_archive_recovery(
|
||||
runner: CliRunner, patroni_api: PatroniAPI, old_replica_state: bool
|
||||
) -> None:
|
||||
result = runner.invoke(
|
||||
main,
|
||||
[
|
||||
"-e",
|
||||
patroni_api.endpoint,
|
||||
"cluster_node_count",
|
||||
"--healthy-warning",
|
||||
"@2",
|
||||
"--healthy-critical",
|
||||
"@0:1",
|
||||
],
|
||||
)
|
||||
if old_replica_state:
|
||||
assert (
|
||||
result.stdout
|
||||
== "CLUSTERNODECOUNT OK - members is 3 | healthy_members=3;@2;@1 members=3 role_replica=2 role_standby_leader=1 state_running=3\n"
|
||||
)
|
||||
assert result.exit_code == 0
|
||||
else:
|
||||
assert (
|
||||
result.stdout
|
||||
== "CLUSTERNODECOUNT CRITICAL - healthy_members is 1 (outside range @0:1) | healthy_members=1;@2;@1 members=3 role_replica=2 role_standby_leader=1 state_in_archive_recovery=2 state_streaming=1\n"
|
||||
)
|
||||
assert result.exit_code == 2
|
||||
|
|
Loading…
Reference in a new issue