Add a node_is_leader service to check for the leader states
It's possible to check for any kind of leader or specifically for a standby leader.
This commit is contained in:
parent
8883d6bdc4
commit
259f04587b
|
@ -8,6 +8,7 @@
|
|||
* Add a new service `cluster_has_scheduled_action` to warn of any scheduled switchover or restart.
|
||||
* Add options to `node_is_replica` to check specifically for a synchronous (`--is-sync`) or asynchronous node (`--is-async`).
|
||||
* Add `standby-leader` as a valid leader type for `cluster_has_leader`.
|
||||
* Add a new service `node_is_leader` to check if a node is a leader (which includes standby leader nodes).
|
||||
|
||||
### Fixed
|
||||
|
||||
|
|
25
README.md
25
README.md
|
@ -42,6 +42,7 @@ Commands:
|
|||
cluster_is_in_maintenance Check if the cluster is in maintenance...
|
||||
cluster_node_count Count the number of nodes in the cluster.
|
||||
node_is_alive Check if the node is alive ie patroni is...
|
||||
node_is_leader Check if the node is a leader node.
|
||||
node_is_pending_restart Check if the node is in pending restart...
|
||||
node_is_primary Check if the node is the primary with the...
|
||||
node_is_replica Check if the node is a running replica...
|
||||
|
@ -350,6 +351,27 @@ Options:
|
|||
--help Show this message and exit.
|
||||
```
|
||||
|
||||
### node_is_leader
|
||||
|
||||
```
|
||||
Usage: check_patroni node_is_leader [OPTIONS]
|
||||
|
||||
Check if the node is a leader node.
|
||||
|
||||
This check applies to any kind of leaders including standby leaders. To
|
||||
check explicitly for a standby leader use the `--is-standby-leader` option.
|
||||
|
||||
Check:
|
||||
* `OK`: if the node is a leader.
|
||||
* `CRITICAL:` otherwise
|
||||
|
||||
Perfdata: `is_leader` is 1 if the node is a leader node, 0 otherwise.
|
||||
|
||||
Options:
|
||||
--is-standby-leader Check for a standby leader
|
||||
--help Show this message and exit.
|
||||
```
|
||||
|
||||
### node_is_primary
|
||||
|
||||
```
|
||||
|
@ -357,6 +379,9 @@ Usage: check_patroni node_is_primary [OPTIONS]
|
|||
|
||||
Check if the node is the primary with the leader lock.
|
||||
|
||||
This service is not valid for a standby leader, because this kind of node is
|
||||
not a primary.
|
||||
|
||||
Check:
|
||||
* `OK`: if the node is a primary with the leader lock.
|
||||
* `CRITICAL:` otherwise
|
||||
|
|
|
@ -21,6 +21,8 @@ from .convert import size_to_byte
|
|||
from .node import (
|
||||
NodeIsAlive,
|
||||
NodeIsAliveSummary,
|
||||
NodeIsLeader,
|
||||
NodeIsLeaderSummary,
|
||||
NodeIsPendingRestart,
|
||||
NodeIsPendingRestartSummary,
|
||||
NodeIsPrimary,
|
||||
|
@ -470,6 +472,8 @@ def cluster_has_scheduled_action(ctx: click.Context) -> None:
|
|||
def node_is_primary(ctx: click.Context) -> None:
|
||||
"""Check if the node is the primary with the leader lock.
|
||||
|
||||
This service is not valid for a standby leader, because this kind of node is not a primary.
|
||||
|
||||
\b
|
||||
Check:
|
||||
* `OK`: if the node is a primary with the leader lock.
|
||||
|
@ -486,6 +490,38 @@ def node_is_primary(ctx: click.Context) -> None:
|
|||
check.main(verbose=ctx.obj.verbose, timeout=ctx.obj.timeout)
|
||||
|
||||
|
||||
@main.command(name="node_is_leader")
@click.option(
    "--is-standby-leader",
    "check_standby_leader",
    is_flag=True,
    default=False,
    help="Check for a standby leader",
)
@click.pass_context
@nagiosplugin.guarded
def node_is_leader(ctx: click.Context, check_standby_leader: bool) -> None:
    """Check if the node is a leader node.

    This check applies to any kind of leaders including standby leaders.
    To check explicitly for a standby leader use the `--is-standby-leader` option.

    \b
    Check:
    * `OK`: if the node is a leader.
    * `CRITICAL:` otherwise

    Perfdata: `is_leader` is 1 if the node is a leader node, 0 otherwise.
    """
    # NOTE: the docstring above is the text click shows for `--help`; keep it
    # user-facing and put maintainer notes in comments like this one.
    check = nagiosplugin.Check()

    # Resource that produces the `is_leader` metric (1 or 0).
    resource = NodeIsLeader(ctx.obj.connection_info, check_standby_leader)
    # No warning range; the critical range "@0:0" alerts when the metric is
    # exactly 0, i.e. when the node is not a leader.
    context = nagiosplugin.ScalarContext("is_leader", None, "@0:0")
    # Summary wording follows the kind of leader that was requested.
    summary = NodeIsLeaderSummary(check_standby_leader)

    check.add(resource, context, summary)
    check.main(verbose=ctx.obj.verbose, timeout=ctx.obj.timeout)
|
||||
|
||||
|
||||
@main.command(name="node_is_replica")
|
||||
@click.option("--max-lag", "max_lag", type=str, help="maximum allowed lag")
|
||||
@click.option(
|
||||
|
|
|
@ -24,6 +24,45 @@ class NodeIsPrimarySummary(nagiosplugin.Summary):
|
|||
return "This node is not the primary with the leader lock."
|
||||
|
||||
|
||||
class NodeIsLeader(PatroniResource):
    """Resource reporting whether the node is a leader.

    Emits a single `is_leader` metric: 1 when the REST API call succeeds,
    0 when it raises `APIError`.
    """

    def __init__(
        self: "NodeIsLeader",
        connection_info: ConnectionInfo,
        check_is_standby_leader: bool,
    ) -> None:
        """Store the connection settings and the kind of leader to look for.

        :param connection_info: forwarded unchanged to `PatroniResource`.
        :param check_is_standby_leader: when true, query the
            ``standby-leader`` endpoint instead of ``leader``.
        """
        super().__init__(connection_info)
        self.check_is_standby_leader = check_is_standby_leader

    def probe(self: "NodeIsLeader") -> Iterable[nagiosplugin.Metric]:
        """Query the selected endpoint and return the `is_leader` metric."""
        endpoint = "standby-leader" if self.check_is_standby_leader else "leader"

        try:
            # Only success/failure matters here; the response body is unused.
            self.rest_api(endpoint)
        except APIError:
            # An API error is reported as "not a leader", not as a failure.
            is_leader = 0
        else:
            is_leader = 1

        return [nagiosplugin.Metric("is_leader", is_leader)]
|
||||
|
||||
|
||||
class NodeIsLeaderSummary(nagiosplugin.Summary):
    """Status-line text for the `node_is_leader` check."""

    def __init__(
        self: "NodeIsLeaderSummary",
        check_is_standby_leader: bool,
    ) -> None:
        """Pick the wording used in the messages.

        :param check_is_standby_leader: when true the messages say
            "standby leader", otherwise "leader".
        """
        self.leader_kind = "standby leader" if check_is_standby_leader else "leader"

    def ok(self: "NodeIsLeaderSummary", results: nagiosplugin.Result) -> str:
        """Return the message for a successful check."""
        return f"This node is a {self.leader_kind} node."

    # NOTE(review): `handle_unknown` is defined outside this view; presumably
    # it takes care of UNKNOWN results before this message is used - confirm.
    @handle_unknown
    def problem(self: "NodeIsLeaderSummary", results: nagiosplugin.Result) -> str:
        """Return the message for a failed check."""
        return f"This node is not a {self.leader_kind} node."
|
||||
|
||||
|
||||
class NodeIsReplica(PatroniResource):
|
||||
def __init__(
|
||||
self: "NodeIsReplica",
|
||||
|
|
|
@ -172,6 +172,8 @@ readme "### node_is_alive"
|
|||
helpme node_is_alive
|
||||
readme "### node_is_pending_restart"
|
||||
helpme node_is_pending_restart
|
||||
readme "### node_is_leader"
|
||||
helpme node_is_leader
|
||||
readme "### node_is_primary"
|
||||
helpme node_is_primary
|
||||
readme "### node_is_replica"
|
||||
|
|
26
tests/json/node_is_leader_ko.json
Normal file
26
tests/json/node_is_leader_ko.json
Normal file
|
@ -0,0 +1,26 @@
|
|||
{
|
||||
"state": "running",
|
||||
"postmaster_start_time": "2021-08-11 07:02:20.732 UTC",
|
||||
"role": "master",
|
||||
"server_version": 110012,
|
||||
"cluster_unlocked": false,
|
||||
"xlog": {
|
||||
"location": 1174407088
|
||||
},
|
||||
"timeline": 58,
|
||||
"replication": [
|
||||
{
|
||||
"usename": "replicator",
|
||||
"application_name": "srv1",
|
||||
"client_addr": "10.20.199.3",
|
||||
"state": "streaming",
|
||||
"sync_state": "async",
|
||||
"sync_priority": 0
|
||||
}
|
||||
],
|
||||
"database_system_identifier": "6965971025273547206",
|
||||
"patroni": {
|
||||
"version": "2.0.2",
|
||||
"scope": "patroni-demo"
|
||||
}
|
||||
}
|
19
tests/json/node_is_leader_ko_standby_leader.json
Normal file
19
tests/json/node_is_leader_ko_standby_leader.json
Normal file
|
@ -0,0 +1,19 @@
|
|||
{
|
||||
"state": "running",
|
||||
"postmaster_start_time": "2023-08-23 14:30:50.201691+00:00",
|
||||
"role": "standby_leader",
|
||||
"server_version": 140009,
|
||||
"xlog": {
|
||||
"received_location": 889192448,
|
||||
"replayed_location": 889192448,
|
||||
"replayed_timestamp": null,
|
||||
"paused": false
|
||||
},
|
||||
"timeline": 1,
|
||||
"dcs_last_seen": 1692805971,
|
||||
"database_system_identifier": "7270495803765492571",
|
||||
"patroni": {
|
||||
"version": "3.1.0",
|
||||
"scope": "patroni-demo-sb"
|
||||
}
|
||||
}
|
26
tests/json/node_is_leader_ok.json
Normal file
26
tests/json/node_is_leader_ok.json
Normal file
|
@ -0,0 +1,26 @@
|
|||
{
|
||||
"state": "running",
|
||||
"postmaster_start_time": "2021-08-11 07:02:20.732 UTC",
|
||||
"role": "master",
|
||||
"server_version": 110012,
|
||||
"cluster_unlocked": false,
|
||||
"xlog": {
|
||||
"location": 1174407088
|
||||
},
|
||||
"timeline": 58,
|
||||
"replication": [
|
||||
{
|
||||
"usename": "replicator",
|
||||
"application_name": "srv1",
|
||||
"client_addr": "10.20.199.3",
|
||||
"state": "streaming",
|
||||
"sync_state": "async",
|
||||
"sync_priority": 0
|
||||
}
|
||||
],
|
||||
"database_system_identifier": "6965971025273547206",
|
||||
"patroni": {
|
||||
"version": "2.0.2",
|
||||
"scope": "patroni-demo"
|
||||
}
|
||||
}
|
19
tests/json/node_is_leader_ok_standby_leader.json
Normal file
19
tests/json/node_is_leader_ok_standby_leader.json
Normal file
|
@ -0,0 +1,19 @@
|
|||
{
|
||||
"state": "running",
|
||||
"postmaster_start_time": "2023-08-23 14:30:50.201691+00:00",
|
||||
"role": "standby_leader",
|
||||
"server_version": 140009,
|
||||
"xlog": {
|
||||
"received_location": 889192448,
|
||||
"replayed_location": 889192448,
|
||||
"replayed_timestamp": null,
|
||||
"paused": false
|
||||
},
|
||||
"timeline": 1,
|
||||
"dcs_last_seen": 1692805971,
|
||||
"database_system_identifier": "7270495803765492571",
|
||||
"patroni": {
|
||||
"version": "3.1.0",
|
||||
"scope": "patroni-demo-sb"
|
||||
}
|
||||
}
|
53
tests/test_node_is_leader.py
Normal file
53
tests/test_node_is_leader.py
Normal file
|
@ -0,0 +1,53 @@
|
|||
from click.testing import CliRunner
|
||||
from pytest_mock import MockerFixture
|
||||
|
||||
from check_patroni.cli import main
|
||||
|
||||
from .tools import my_mock
|
||||
|
||||
|
||||
def test_node_is_leader_ok(mocker: MockerFixture, use_old_replica_state: bool) -> None:
    """`node_is_leader` exits 0 for a leader and for a standby leader.

    Both scenarios mock an HTTP 200 answer and expect `is_leader=1`.
    """
    # NOTE(review): `my_mock` comes from tests/tools; presumably it makes the
    # REST API return the named JSON fixture with the given status - confirm.
    runner = CliRunner()

    # Default mode: any kind of leader.
    my_mock(mocker, "node_is_leader_ok", 200)
    result = runner.invoke(main, ["-e", "https://10.20.199.3:8008", "node_is_leader"])
    assert result.exit_code == 0
    assert (
        result.stdout
        == "NODEISLEADER OK - This node is a leader node. | is_leader=1;;@0\n"
    )

    # Explicit standby-leader mode. The stray debug print of the output was
    # removed; the assertions already show stdout when they fail.
    my_mock(mocker, "node_is_leader_ok_standby_leader", 200)
    result = runner.invoke(
        main,
        ["-e", "https://10.20.199.3:8008", "node_is_leader", "--is-standby-leader"],
    )
    assert result.exit_code == 0
    assert (
        result.stdout
        == "NODEISLEADER OK - This node is a standby leader node. | is_leader=1;;@0\n"
    )
|
||||
|
||||
|
||||
def test_node_is_leader_ko(mocker: MockerFixture, use_old_replica_state: bool) -> None:
    """`node_is_leader` exits 2 (CRITICAL) when the API answers 503.

    Covers both the default mode and `--is-standby-leader`; each expects
    `is_leader=0`.
    """
    # NOTE(review): `my_mock` comes from tests/tools; presumably it makes the
    # REST API return the named JSON fixture with the given status - confirm.
    runner = CliRunner()
    base_args = ["-e", "https://10.20.199.3:8008", "node_is_leader"]

    # Default mode: the node is not a leader of any kind.
    my_mock(mocker, "node_is_leader_ko", 503)
    result = runner.invoke(main, base_args)
    expected = (
        "NODEISLEADER CRITICAL - This node is not a leader node. | is_leader=0;;@0\n"
    )
    assert result.exit_code == 2
    assert result.stdout == expected

    # Standby-leader mode: the node is not a standby leader.
    my_mock(mocker, "node_is_leader_ko_standby_leader", 503)
    result = runner.invoke(main, [*base_args, "--is-standby-leader"])
    expected = "NODEISLEADER CRITICAL - This node is not a standby leader node. | is_leader=0;;@0\n"
    assert result.exit_code == 2
    assert result.stdout == expected
|
|
@ -18,6 +18,7 @@ echo "-- Node checks"
|
|||
check_patroni -e "$1" node_is_alive
|
||||
check_patroni -e "$1" node_is_pending_restart
|
||||
check_patroni -e "$1" node_is_primary
|
||||
check_patroni -e "$1" node_is_leader --is-standby-leader
|
||||
check_patroni -e "$1" node_is_replica
|
||||
check_patroni -e "$1" node_is_replica --is-sync
|
||||
check_patroni -e "$1" node_patroni_version --patroni-version 3.1.0
|
||||
|
|
Loading…
Reference in a new issue