Fix the cluster_has_leader service for standby clusters
Before this patch we checked the expected standby leader state was `running` for all versions of Patroni. With this patch, for: * Patroni < 3.0.4, standby leaders are in `running` state. * Patroni >= 3.0.4, standby leaders can be in `streaming` or `in archive recovery` state. We will raise a warning for the latter. The tests were modified to account for this. Co-authored-by: Denis Laxalde <denis@laxalde.org>
This commit is contained in:
parent
ffc330f96e
commit
46db3e2d15
|
@ -6,6 +6,7 @@
|
||||||
|
|
||||||
* Add the timeline in the `cluster_has_replica` perfstats. (#50)
|
* Add the timeline in the `cluster_has_replica` perfstats. (#50)
|
||||||
* Add a mention about shell completion support and shell versions in the doc. (#53)
|
* Add a mention about shell completion support and shell versions in the doc. (#53)
|
||||||
|
* Add the leader type and whether it's archiving to the `cluster_has_leader` perfstats. (#58)
|
||||||
|
|
||||||
### Fixed
|
### Fixed
|
||||||
|
|
||||||
|
@ -13,6 +14,7 @@
|
||||||
version 2.25 and higher.
|
version 2.25 and higher.
|
||||||
* Fix what `cluster_has_replica` deems a healthy replica. (#50, reported by @mbanck)
|
* Fix what `cluster_has_replica` deems a healthy replica. (#50, reported by @mbanck)
|
||||||
* Fix `cluster_has_replica` to display perfstats for replicas whenever it's possible (healthy or not). (#50)
|
* Fix `cluster_has_replica` to display perfstats for replicas whenever it's possible (healthy or not). (#50)
|
||||||
|
* Fix `cluster_has_leader` to correctly check for standby leaders. (#58, reported by @mbanck)
|
||||||
|
|
||||||
### Misc
|
### Misc
|
||||||
|
|
||||||
|
|
20
README.md
20
README.md
|
@ -176,11 +176,27 @@ Usage: check_patroni cluster_has_leader [OPTIONS]
|
||||||
|
|
||||||
This check applies to any kind of leaders including standby leaders.
|
This check applies to any kind of leaders including standby leaders.
|
||||||
|
|
||||||
|
A leader is a node with the "leader" role and a "running" state.
|
||||||
|
|
||||||
|
A standby leader is a node with a "standby_leader" role and a "streaming" or
|
||||||
|
"in archive recovery" state. Please note that log shipping could be stuck
|
||||||
|
because the WAL are not available or applicable. Patroni doesn't provide
|
||||||
|
information about the origin cluster (timeline or lag), so we cannot check
|
||||||
|
if there is a problem in that particular case. That's why we issue a warning
|
||||||
|
when the node is "in archive recovery". We suggest using other supervision
|
||||||
|
tools to do this (eg. check_pgactivity).
|
||||||
|
|
||||||
Check:
|
Check:
|
||||||
* `OK`: if there is a leader node.
|
* `OK`: if there is a leader node.
|
||||||
* `CRITICAL`: otherwise
|
* 'WARNING': if there is a standby leader in archive mode.
|
||||||
|
* `CRITICAL`: otherwise.
|
||||||
|
|
||||||
Perfdata: `has_leader` is 1 if there is a leader node, 0 otherwise
|
Perfdata:
|
||||||
|
* `has_leader` is 1 if there is any kind of leader node, 0 otherwise
|
||||||
|
* `is_standby_leader_in_arc_rec` is 1 if the standby leader node is "in
|
||||||
|
archive recovery", 0 otherwise
|
||||||
|
* `is_standby_leader` is 1 if there is a standby leader node, 0 otherwise
|
||||||
|
* `is_leader` is 1 if there is a "classical" leader node, 0 otherwise
|
||||||
|
|
||||||
Options:
|
Options:
|
||||||
--help Show this message and exit.
|
--help Show this message and exit.
|
||||||
|
|
|
@ -285,17 +285,38 @@ def cluster_has_leader(ctx: click.Context) -> None:
|
||||||
|
|
||||||
This check applies to any kind of leaders including standby leaders.
|
This check applies to any kind of leaders including standby leaders.
|
||||||
|
|
||||||
|
A leader is a node with the "leader" role and a "running" state.
|
||||||
|
|
||||||
|
A standby leader is a node with a "standby_leader" role and a "streaming"
|
||||||
|
or "in archive recovery" state. Please note that log shipping could be
|
||||||
|
stuck because the WAL are not available or applicable. Patroni doesn't
|
||||||
|
provide information about the origin cluster (timeline or lag), so we
|
||||||
|
cannot check if there is a problem in that particular case. That's why we
|
||||||
|
issue a warning when the node is "in archive recovery". We suggest using
|
||||||
|
other supervision tools to do this (eg. check_pgactivity).
|
||||||
|
|
||||||
\b
|
\b
|
||||||
Check:
|
Check:
|
||||||
* `OK`: if there is a leader node.
|
* `OK`: if there is a leader node.
|
||||||
* `CRITICAL`: otherwise
|
* 'WARNING': if there is a standby leader in archive mode.
|
||||||
|
* `CRITICAL`: otherwise.
|
||||||
|
|
||||||
|
\b
|
||||||
|
Perfdata:
|
||||||
|
* `has_leader` is 1 if there is any kind of leader node, 0 otherwise
|
||||||
|
* `is_standby_leader_in_arc_rec` is 1 if the standby leader node is "in
|
||||||
|
archive recovery", 0 otherwise
|
||||||
|
* `is_standby_leader` is 1 if there is a standby leader node, 0 otherwise
|
||||||
|
* `is_leader` is 1 if there is a "classical" leader node, 0 otherwise
|
||||||
|
|
||||||
Perfdata: `has_leader` is 1 if there is a leader node, 0 otherwise
|
|
||||||
"""
|
"""
|
||||||
check = nagiosplugin.Check()
|
check = nagiosplugin.Check()
|
||||||
check.add(
|
check.add(
|
||||||
ClusterHasLeader(ctx.obj.connection_info),
|
ClusterHasLeader(ctx.obj.connection_info),
|
||||||
nagiosplugin.ScalarContext("has_leader", None, "@0:0"),
|
nagiosplugin.ScalarContext("has_leader", None, "@0:0"),
|
||||||
|
nagiosplugin.ScalarContext("is_standby_leader_in_arc_rec", "@1:1", None),
|
||||||
|
nagiosplugin.ScalarContext("is_leader", None, None),
|
||||||
|
nagiosplugin.ScalarContext("is_standby_leader", None, None),
|
||||||
ClusterHasLeaderSummary(),
|
ClusterHasLeaderSummary(),
|
||||||
)
|
)
|
||||||
check.main(verbose=ctx.obj.verbose, timeout=ctx.obj.timeout)
|
check.main(verbose=ctx.obj.verbose, timeout=ctx.obj.timeout)
|
||||||
|
|
|
@ -52,19 +52,42 @@ class ClusterHasLeader(PatroniResource):
|
||||||
item_dict = self.rest_api("cluster")
|
item_dict = self.rest_api("cluster")
|
||||||
|
|
||||||
is_leader_found = False
|
is_leader_found = False
|
||||||
|
is_standby_leader_found = False
|
||||||
|
is_standby_leader_in_arc_rec = False
|
||||||
for member in item_dict["members"]:
|
for member in item_dict["members"]:
|
||||||
if (
|
if member["role"] == "leader" and member["state"] == "running":
|
||||||
member["role"] in ("leader", "standby_leader")
|
|
||||||
and member["state"] == "running"
|
|
||||||
):
|
|
||||||
is_leader_found = True
|
is_leader_found = True
|
||||||
break
|
break
|
||||||
|
|
||||||
|
if member["role"] == "standby_leader":
|
||||||
|
if member["state"] not in ["streaming", "in archive recovery"]:
|
||||||
|
# for patroni >= 3.0.4 any state would be wrong
|
||||||
|
# for patroni < 3.0.4 a state different from running would be wrong
|
||||||
|
if self.has_detailed_states() or member["state"] != "running":
|
||||||
|
continue
|
||||||
|
|
||||||
|
if member["state"] in ["in archive recovery"]:
|
||||||
|
is_standby_leader_in_arc_rec = True
|
||||||
|
|
||||||
|
is_standby_leader_found = True
|
||||||
|
break
|
||||||
return [
|
return [
|
||||||
nagiosplugin.Metric(
|
nagiosplugin.Metric(
|
||||||
"has_leader",
|
"has_leader",
|
||||||
|
1 if is_leader_found or is_standby_leader_found else 0,
|
||||||
|
),
|
||||||
|
nagiosplugin.Metric(
|
||||||
|
"is_standby_leader_in_arc_rec",
|
||||||
|
1 if is_standby_leader_in_arc_rec else 0,
|
||||||
|
),
|
||||||
|
nagiosplugin.Metric(
|
||||||
|
"is_standby_leader",
|
||||||
|
1 if is_standby_leader_found else 0,
|
||||||
|
),
|
||||||
|
nagiosplugin.Metric(
|
||||||
|
"is_leader",
|
||||||
1 if is_leader_found else 0,
|
1 if is_leader_found else 0,
|
||||||
)
|
),
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
|
@ -74,7 +97,7 @@ class ClusterHasLeaderSummary(nagiosplugin.Summary):
|
||||||
|
|
||||||
@handle_unknown
|
@handle_unknown
|
||||||
def problem(self, results: nagiosplugin.Result) -> str:
|
def problem(self, results: nagiosplugin.Result) -> str:
|
||||||
return "The cluster has no running leader."
|
return "The cluster has no running leader or the standby leader is in archive recovery."
|
||||||
|
|
||||||
|
|
||||||
class ClusterHasReplica(PatroniResource):
|
class ClusterHasReplica(PatroniResource):
|
||||||
|
|
33
tests/json/cluster_has_leader_ko_standby_leader.json
Normal file
33
tests/json/cluster_has_leader_ko_standby_leader.json
Normal file
|
@ -0,0 +1,33 @@
|
||||||
|
{
|
||||||
|
"members": [
|
||||||
|
{
|
||||||
|
"name": "srv1",
|
||||||
|
"role": "standby_leader",
|
||||||
|
"state": "stopped",
|
||||||
|
"api_url": "https://10.20.199.3:8008/patroni",
|
||||||
|
"host": "10.20.199.3",
|
||||||
|
"port": 5432,
|
||||||
|
"timeline": 51
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "srv2",
|
||||||
|
"role": "replica",
|
||||||
|
"state": "streaming",
|
||||||
|
"api_url": "https://10.20.199.4:8008/patroni",
|
||||||
|
"host": "10.20.199.4",
|
||||||
|
"port": 5432,
|
||||||
|
"timeline": 51,
|
||||||
|
"lag": 0
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "srv3",
|
||||||
|
"role": "replica",
|
||||||
|
"state": "streaming",
|
||||||
|
"api_url": "https://10.20.199.5:8008/patroni",
|
||||||
|
"host": "10.20.199.5",
|
||||||
|
"port": 5432,
|
||||||
|
"timeline": 51,
|
||||||
|
"lag": 0
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
|
@ -0,0 +1,33 @@
|
||||||
|
{
|
||||||
|
"members": [
|
||||||
|
{
|
||||||
|
"name": "srv1",
|
||||||
|
"role": "standby_leader",
|
||||||
|
"state": "in archive recovery",
|
||||||
|
"api_url": "https://10.20.199.3:8008/patroni",
|
||||||
|
"host": "10.20.199.3",
|
||||||
|
"port": 5432,
|
||||||
|
"timeline": 51
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "srv2",
|
||||||
|
"role": "replica",
|
||||||
|
"state": "streaming",
|
||||||
|
"api_url": "https://10.20.199.4:8008/patroni",
|
||||||
|
"host": "10.20.199.4",
|
||||||
|
"port": 5432,
|
||||||
|
"timeline": 51,
|
||||||
|
"lag": 0
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "srv3",
|
||||||
|
"role": "replica",
|
||||||
|
"state": "streaming",
|
||||||
|
"api_url": "https://10.20.199.5:8008/patroni",
|
||||||
|
"host": "10.20.199.5",
|
||||||
|
"port": 5432,
|
||||||
|
"timeline": 51,
|
||||||
|
"lag": 0
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
|
@ -3,7 +3,7 @@
|
||||||
{
|
{
|
||||||
"name": "srv1",
|
"name": "srv1",
|
||||||
"role": "standby_leader",
|
"role": "standby_leader",
|
||||||
"state": "running",
|
"state": "streaming",
|
||||||
"api_url": "https://10.20.199.3:8008/patroni",
|
"api_url": "https://10.20.199.3:8008/patroni",
|
||||||
"host": "10.20.199.3",
|
"host": "10.20.199.3",
|
||||||
"port": 5432,
|
"port": 5432,
|
||||||
|
|
|
@ -1,37 +1,132 @@
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Iterator, Union
|
||||||
|
|
||||||
|
import pytest
|
||||||
from click.testing import CliRunner
|
from click.testing import CliRunner
|
||||||
|
|
||||||
from check_patroni.cli import main
|
from check_patroni.cli import main
|
||||||
|
|
||||||
from . import PatroniAPI
|
from . import PatroniAPI, cluster_api_set_replica_running
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def cluster_has_leader_ok(
|
||||||
|
patroni_api: PatroniAPI, old_replica_state: bool, datadir: Path, tmp_path: Path
|
||||||
|
) -> Iterator[None]:
|
||||||
|
cluster_path: Union[str, Path] = "cluster_has_leader_ok.json"
|
||||||
|
patroni_path = "cluster_has_replica_patroni_verion_3.1.0.json"
|
||||||
|
if old_replica_state:
|
||||||
|
cluster_path = cluster_api_set_replica_running(datadir / cluster_path, tmp_path)
|
||||||
|
patroni_path = "cluster_has_replica_patroni_verion_3.0.0.json"
|
||||||
|
with patroni_api.routes({"cluster": cluster_path, "patroni": patroni_path}):
|
||||||
|
yield None
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.usefixtures("cluster_has_leader_ok")
|
||||||
def test_cluster_has_leader_ok(runner: CliRunner, patroni_api: PatroniAPI) -> None:
|
def test_cluster_has_leader_ok(runner: CliRunner, patroni_api: PatroniAPI) -> None:
|
||||||
with patroni_api.routes({"cluster": "cluster_has_leader_ok.json"}):
|
|
||||||
result = runner.invoke(main, ["-e", patroni_api.endpoint, "cluster_has_leader"])
|
result = runner.invoke(main, ["-e", patroni_api.endpoint, "cluster_has_leader"])
|
||||||
assert result.exit_code == 0
|
|
||||||
assert (
|
assert (
|
||||||
result.stdout
|
result.stdout
|
||||||
== "CLUSTERHASLEADER OK - The cluster has a running leader. | has_leader=1;;@0\n"
|
== "CLUSTERHASLEADER OK - The cluster has a running leader. | has_leader=1;;@0 is_leader=1 is_standby_leader=0 is_standby_leader_in_arc_rec=0;@1:1\n"
|
||||||
)
|
)
|
||||||
|
assert result.exit_code == 0
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def cluster_has_leader_ok_standby_leader(
|
||||||
|
patroni_api: PatroniAPI, old_replica_state: bool, datadir: Path, tmp_path: Path
|
||||||
|
) -> Iterator[None]:
|
||||||
|
cluster_path: Union[str, Path] = "cluster_has_leader_ok_standby_leader.json"
|
||||||
|
patroni_path = "cluster_has_replica_patroni_verion_3.1.0.json"
|
||||||
|
if old_replica_state:
|
||||||
|
cluster_path = cluster_api_set_replica_running(datadir / cluster_path, tmp_path)
|
||||||
|
patroni_path = "cluster_has_replica_patroni_verion_3.0.0.json"
|
||||||
|
with patroni_api.routes({"cluster": cluster_path, "patroni": patroni_path}):
|
||||||
|
yield None
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.usefixtures("cluster_has_leader_ok_standby_leader")
|
||||||
def test_cluster_has_leader_ok_standby_leader(
|
def test_cluster_has_leader_ok_standby_leader(
|
||||||
runner: CliRunner, patroni_api: PatroniAPI
|
runner: CliRunner, patroni_api: PatroniAPI
|
||||||
) -> None:
|
) -> None:
|
||||||
with patroni_api.routes({"cluster": "cluster_has_leader_ok_standby_leader.json"}):
|
|
||||||
result = runner.invoke(main, ["-e", patroni_api.endpoint, "cluster_has_leader"])
|
result = runner.invoke(main, ["-e", patroni_api.endpoint, "cluster_has_leader"])
|
||||||
|
assert (
|
||||||
|
result.stdout
|
||||||
|
== "CLUSTERHASLEADER OK - The cluster has a running leader. | has_leader=1;;@0 is_leader=0 is_standby_leader=1 is_standby_leader_in_arc_rec=0;@1:1\n"
|
||||||
|
)
|
||||||
assert result.exit_code == 0
|
assert result.exit_code == 0
|
||||||
assert (
|
|
||||||
result.stdout
|
|
||||||
== "CLUSTERHASLEADER OK - The cluster has a running leader. | has_leader=1;;@0\n"
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def cluster_has_leader_ko(
|
||||||
|
patroni_api: PatroniAPI, old_replica_state: bool, datadir: Path, tmp_path: Path
|
||||||
|
) -> Iterator[None]:
|
||||||
|
cluster_path: Union[str, Path] = "cluster_has_leader_ko.json"
|
||||||
|
patroni_path = "cluster_has_replica_patroni_verion_3.1.0.json"
|
||||||
|
if old_replica_state:
|
||||||
|
cluster_path = cluster_api_set_replica_running(datadir / cluster_path, tmp_path)
|
||||||
|
patroni_path = "cluster_has_replica_patroni_verion_3.0.0.json"
|
||||||
|
with patroni_api.routes({"cluster": cluster_path, "patroni": patroni_path}):
|
||||||
|
yield None
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.usefixtures("cluster_has_leader_ko")
|
||||||
def test_cluster_has_leader_ko(runner: CliRunner, patroni_api: PatroniAPI) -> None:
|
def test_cluster_has_leader_ko(runner: CliRunner, patroni_api: PatroniAPI) -> None:
|
||||||
with patroni_api.routes({"cluster": "cluster_has_leader_ko.json"}):
|
|
||||||
result = runner.invoke(main, ["-e", patroni_api.endpoint, "cluster_has_leader"])
|
result = runner.invoke(main, ["-e", patroni_api.endpoint, "cluster_has_leader"])
|
||||||
assert result.exit_code == 2
|
|
||||||
assert (
|
assert (
|
||||||
result.stdout
|
result.stdout
|
||||||
== "CLUSTERHASLEADER CRITICAL - The cluster has no running leader. | has_leader=0;;@0\n"
|
== "CLUSTERHASLEADER CRITICAL - The cluster has no running leader or the standby leader is in archive recovery. | has_leader=0;;@0 is_leader=0 is_standby_leader=0 is_standby_leader_in_arc_rec=0;@1:1\n"
|
||||||
)
|
)
|
||||||
|
assert result.exit_code == 2
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def cluster_has_leader_ko_standby_leader(
|
||||||
|
patroni_api: PatroniAPI, old_replica_state: bool, datadir: Path, tmp_path: Path
|
||||||
|
) -> Iterator[None]:
|
||||||
|
cluster_path: Union[str, Path] = "cluster_has_leader_ko_standby_leader.json"
|
||||||
|
patroni_path = "cluster_has_replica_patroni_verion_3.1.0.json"
|
||||||
|
if old_replica_state:
|
||||||
|
cluster_path = cluster_api_set_replica_running(datadir / cluster_path, tmp_path)
|
||||||
|
patroni_path = "cluster_has_replica_patroni_verion_3.0.0.json"
|
||||||
|
with patroni_api.routes({"cluster": cluster_path, "patroni": patroni_path}):
|
||||||
|
yield None
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.usefixtures("cluster_has_leader_ko_standby_leader")
|
||||||
|
def test_cluster_has_leader_ko_standby_leader(
|
||||||
|
runner: CliRunner, patroni_api: PatroniAPI
|
||||||
|
) -> None:
|
||||||
|
result = runner.invoke(main, ["-e", patroni_api.endpoint, "cluster_has_leader"])
|
||||||
|
assert (
|
||||||
|
result.stdout
|
||||||
|
== "CLUSTERHASLEADER CRITICAL - The cluster has no running leader or the standby leader is in archive recovery. | has_leader=0;;@0 is_leader=0 is_standby_leader=0 is_standby_leader_in_arc_rec=0;@1:1\n"
|
||||||
|
)
|
||||||
|
assert result.exit_code == 2
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def cluster_has_leader_ko_standby_leader_archiving(
|
||||||
|
patroni_api: PatroniAPI, old_replica_state: bool, datadir: Path, tmp_path: Path
|
||||||
|
) -> Iterator[None]:
|
||||||
|
cluster_path: Union[
|
||||||
|
str, Path
|
||||||
|
] = "cluster_has_leader_ko_standby_leader_archiving.json"
|
||||||
|
patroni_path = "cluster_has_replica_patroni_verion_3.1.0.json"
|
||||||
|
if old_replica_state:
|
||||||
|
cluster_path = cluster_api_set_replica_running(datadir / cluster_path, tmp_path)
|
||||||
|
patroni_path = "cluster_has_replica_patroni_verion_3.0.0.json"
|
||||||
|
with patroni_api.routes({"cluster": cluster_path, "patroni": patroni_path}):
|
||||||
|
yield None
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.usefixtures("cluster_has_leader_ko_standby_leader_archiving")
|
||||||
|
def test_cluster_has_leader_ko_standby_leader_archiving(
|
||||||
|
runner: CliRunner, patroni_api: PatroniAPI
|
||||||
|
) -> None:
|
||||||
|
result = runner.invoke(main, ["-e", patroni_api.endpoint, "cluster_has_leader"])
|
||||||
|
assert (
|
||||||
|
result.stdout
|
||||||
|
== "CLUSTERHASLEADER WARNING - The cluster has no running leader or the standby leader is in archive recovery. | has_leader=1;;@0 is_leader=0 is_standby_leader=1 is_standby_leader_in_arc_rec=1;@1:1\n"
|
||||||
|
)
|
||||||
|
assert result.exit_code == 1
|
||||||
|
|
Loading…
Reference in a new issue