Add a node_is_leader service to check for the leader states

It's possible to check for any kind of leader or specifically for a
standby leader.
This commit is contained in:
benoit 2023-08-23 17:40:46 +02:00 committed by Benoit
parent 8883d6bdc4
commit 259f04587b
11 changed files with 247 additions and 0 deletions

View file

@ -8,6 +8,7 @@
* Add a new service `cluster_has_scheduled_action` to warn of any scheduled switchover or restart.
* Add options to `node_is_replica` to check specifically for a synchronous (`--is-sync`) or asynchronous node (`--is-async`).
* Add `standby-leader` as a valid leader type for `cluster_has_leader`.
* Add a new service `node_is_leader` to check if a node is a leader (which includes standby leader nodes).
### Fixed

View file

@ -42,6 +42,7 @@ Commands:
cluster_is_in_maintenance Check if the cluster is in maintenance...
cluster_node_count Count the number of nodes in the cluster.
node_is_alive Check if the node is alive ie patroni is...
node_is_leader Check if the node is a leader node.
node_is_pending_restart Check if the node is in pending restart...
node_is_primary Check if the node is the primary with the...
node_is_replica Check if the node is a running replica...
@ -350,6 +351,27 @@ Options:
--help Show this message and exit.
```
### node_is_leader
```
Usage: check_patroni node_is_leader [OPTIONS]
Check if the node is a leader node.
This check applies to any kind of leaders including standby leaders. To
check explicitly for a standby leader use the `--is-standby-leader` option.
Check:
* `OK`: if the node is a leader.
* `CRITICAL:` otherwise
Perfdata: `is_leader` is 1 if the node is a leader node, 0 otherwise.
Options:
--is-standby-leader Check for a standby leader
--help Show this message and exit.
```
### node_is_primary
```
@ -357,6 +379,9 @@ Usage: check_patroni node_is_primary [OPTIONS]
Check if the node is the primary with the leader lock.
This service is not valid for a standby leader, because this kind of node is
not a primary.
Check:
* `OK`: if the node is a primary with the leader lock.
* `CRITICAL:` otherwise

View file

@ -21,6 +21,8 @@ from .convert import size_to_byte
from .node import (
NodeIsAlive,
NodeIsAliveSummary,
NodeIsLeader,
NodeIsLeaderSummary,
NodeIsPendingRestart,
NodeIsPendingRestartSummary,
NodeIsPrimary,
@ -470,6 +472,8 @@ def cluster_has_scheduled_action(ctx: click.Context) -> None:
def node_is_primary(ctx: click.Context) -> None:
"""Check if the node is the primary with the leader lock.
This service is not valid for a standby leader, because this kind of node is not a primary.
\b
Check:
* `OK`: if the node is a primary with the leader lock.
@ -486,6 +490,38 @@ def node_is_primary(ctx: click.Context) -> None:
check.main(verbose=ctx.obj.verbose, timeout=ctx.obj.timeout)
@main.command(name="node_is_leader")
@click.option(
    "--is-standby-leader",
    "check_standby_leader",
    is_flag=True,
    default=False,
    help="Check for a standby leader",
)
@click.pass_context
@nagiosplugin.guarded
def node_is_leader(ctx: click.Context, check_standby_leader: bool) -> None:
    """Check if the node is a leader node.

    This check applies to any kind of leaders including standby leaders.
    To check explicitly for a standby leader use the `--is-standby-leader` option.

    \b
    Check:
    * `OK`: if the node is a leader.
    * `CRITICAL:` otherwise

    Perfdata: `is_leader` is 1 if the node is a leader node, 0 otherwise.
    """
    # NOTE: the docstring above is rendered by click as the command's help
    # text, so it is part of the user-facing output.

    # Resource that produces the `is_leader` metric (1 or 0).
    resource = NodeIsLeader(ctx.obj.connection_info, check_standby_leader)
    # No warning range; the critical range "@0:0" raises CRITICAL when the
    # metric is 0, i.e. when the node is not the requested kind of leader.
    context = nagiosplugin.ScalarContext("is_leader", None, "@0:0")
    # Summary wording depends on whether a standby leader was requested.
    summary = NodeIsLeaderSummary(check_standby_leader)

    check = nagiosplugin.Check()
    check.add(resource, context, summary)
    check.main(verbose=ctx.obj.verbose, timeout=ctx.obj.timeout)
@main.command(name="node_is_replica")
@click.option("--max-lag", "max_lag", type=str, help="maximum allowed lag")
@click.option(

View file

@ -24,6 +24,45 @@ class NodeIsPrimarySummary(nagiosplugin.Summary):
return "This node is not the primary with the leader lock."
class NodeIsLeader(PatroniResource):
    """Resource reporting whether the node answers as a (standby) leader.

    The probe queries the `leader` API name, or `standby-leader` when the
    resource was created with `check_is_standby_leader` set, and turns the
    outcome into a single `is_leader` metric.
    """

    def __init__(
        self: "NodeIsLeader",
        connection_info: ConnectionInfo,
        check_is_standby_leader: bool,
    ) -> None:
        """Store the connection settings and the kind of leader to look for."""
        super().__init__(connection_info)
        self.check_is_standby_leader = check_is_standby_leader

    def probe(self: "NodeIsLeader") -> Iterable[nagiosplugin.Metric]:
        """Return `is_leader` = 1 if the API call succeeds, 0 on `APIError`."""
        endpoint = "standby-leader" if self.check_is_standby_leader else "leader"

        try:
            self.rest_api(endpoint)
        except APIError:
            # The API refused the request: the node is not that kind of leader.
            is_leader = 0
        else:
            is_leader = 1

        return [nagiosplugin.Metric("is_leader", is_leader)]
class NodeIsLeaderSummary(nagiosplugin.Summary):
    """Status-line wording for the `node_is_leader` check."""

    def __init__(
        self: "NodeIsLeaderSummary",
        check_is_standby_leader: bool,
    ) -> None:
        """Pick the label ("leader" or "standby leader") used in the messages."""
        self.leader_kind = "standby leader" if check_is_standby_leader else "leader"

    def ok(self: "NodeIsLeaderSummary", results: nagiosplugin.Result) -> str:
        """Message shown when the node is the expected kind of leader."""
        return f"This node is a {self.leader_kind} node."

    @handle_unknown
    def problem(self: "NodeIsLeaderSummary", results: nagiosplugin.Result) -> str:
        """Message shown when the node is not the expected kind of leader."""
        return f"This node is not a {self.leader_kind} node."
class NodeIsReplica(PatroniResource):
def __init__(
self: "NodeIsReplica",

View file

@ -172,6 +172,8 @@ readme "### node_is_alive"
helpme node_is_alive
readme "### node_is_pending_restart"
helpme node_is_pending_restart
readme "### node_is_leader"
helpme node_is_leader
readme "### node_is_primary"
helpme node_is_primary
readme "### node_is_replica"

View file

@ -0,0 +1,26 @@
{
"state": "running",
"postmaster_start_time": "2021-08-11 07:02:20.732 UTC",
"role": "master",
"server_version": 110012,
"cluster_unlocked": false,
"xlog": {
"location": 1174407088
},
"timeline": 58,
"replication": [
{
"usename": "replicator",
"application_name": "srv1",
"client_addr": "10.20.199.3",
"state": "streaming",
"sync_state": "async",
"sync_priority": 0
}
],
"database_system_identifier": "6965971025273547206",
"patroni": {
"version": "2.0.2",
"scope": "patroni-demo"
}
}

View file

@ -0,0 +1,19 @@
{
"state": "running",
"postmaster_start_time": "2023-08-23 14:30:50.201691+00:00",
"role": "standby_leader",
"server_version": 140009,
"xlog": {
"received_location": 889192448,
"replayed_location": 889192448,
"replayed_timestamp": null,
"paused": false
},
"timeline": 1,
"dcs_last_seen": 1692805971,
"database_system_identifier": "7270495803765492571",
"patroni": {
"version": "3.1.0",
"scope": "patroni-demo-sb"
}
}

View file

@ -0,0 +1,26 @@
{
"state": "running",
"postmaster_start_time": "2021-08-11 07:02:20.732 UTC",
"role": "master",
"server_version": 110012,
"cluster_unlocked": false,
"xlog": {
"location": 1174407088
},
"timeline": 58,
"replication": [
{
"usename": "replicator",
"application_name": "srv1",
"client_addr": "10.20.199.3",
"state": "streaming",
"sync_state": "async",
"sync_priority": 0
}
],
"database_system_identifier": "6965971025273547206",
"patroni": {
"version": "2.0.2",
"scope": "patroni-demo"
}
}

View file

@ -0,0 +1,19 @@
{
"state": "running",
"postmaster_start_time": "2023-08-23 14:30:50.201691+00:00",
"role": "standby_leader",
"server_version": 140009,
"xlog": {
"received_location": 889192448,
"replayed_location": 889192448,
"replayed_timestamp": null,
"paused": false
},
"timeline": 1,
"dcs_last_seen": 1692805971,
"database_system_identifier": "7270495803765492571",
"patroni": {
"version": "3.1.0",
"scope": "patroni-demo-sb"
}
}

View file

@ -0,0 +1,53 @@
from click.testing import CliRunner
from pytest_mock import MockerFixture
from check_patroni.cli import main
from .tools import my_mock
def test_node_is_leader_ok(mocker: MockerFixture, use_old_replica_state: bool) -> None:
    """`node_is_leader` exits OK for a leader and, with the flag, a standby leader."""
    runner = CliRunner()

    # Plain leader check: the mocked API answers 200.
    my_mock(mocker, "node_is_leader_ok", 200)
    result = runner.invoke(main, ["-e", "https://10.20.199.3:8008", "node_is_leader"])
    assert result.exit_code == 0
    assert (
        result.stdout
        == "NODEISLEADER OK - This node is a leader node. | is_leader=1;;@0\n"
    )

    # Standby leader check: same command with `--is-standby-leader`.
    my_mock(mocker, "node_is_leader_ok_standby_leader", 200)
    result = runner.invoke(
        main,
        ["-e", "https://10.20.199.3:8008", "node_is_leader", "--is-standby-leader"],
    )
    assert result.exit_code == 0
    assert (
        result.stdout
        == "NODEISLEADER OK - This node is a standby leader node. | is_leader=1;;@0\n"
    )
def test_node_is_leader_ko(mocker: MockerFixture, use_old_replica_state: bool) -> None:
    """`node_is_leader` exits CRITICAL when the mocked API answers 503."""
    cli = CliRunner()
    base_args = ["-e", "https://10.20.199.3:8008", "node_is_leader"]

    # Plain leader check against a node that is not a leader.
    my_mock(mocker, "node_is_leader_ko", 503)
    leader_result = cli.invoke(main, base_args)
    assert leader_result.exit_code == 2
    assert (
        leader_result.stdout
        == "NODEISLEADER CRITICAL - This node is not a leader node. | is_leader=0;;@0\n"
    )

    # Standby leader check against a node that is not a standby leader.
    my_mock(mocker, "node_is_leader_ko_standby_leader", 503)
    standby_result = cli.invoke(main, [*base_args, "--is-standby-leader"])
    assert standby_result.exit_code == 2
    assert (
        standby_result.stdout
        == "NODEISLEADER CRITICAL - This node is not a standby leader node. | is_leader=0;;@0\n"
    )

View file

@ -18,6 +18,7 @@ echo "-- Node checks"
check_patroni -e "$1" node_is_alive
check_patroni -e "$1" node_is_pending_restart
check_patroni -e "$1" node_is_primary
check_patroni -e "$1" node_is_leader --is-standby-leader
check_patroni -e "$1" node_is_replica
check_patroni -e "$1" node_is_replica --is-sync
check_patroni -e "$1" node_patroni_version --patroni-version 3.1.0