Add a node_is_leader service to check for the leader states
It's possible to check for any kind of leader or specifically for a standby leader.
This commit is contained in:
parent
8883d6bdc4
commit
259f04587b
|
@ -8,6 +8,7 @@
|
||||||
* Add a new service `cluster_has_scheduled_action` to warn of any scheduled switchover or restart.
|
* Add a new service `cluster_has_scheduled_action` to warn of any scheduled switchover or restart.
|
||||||
* Add options to `node_is_replica` to check specifically for a synchronous (`--is-sync`) or asynchronous node (`--is-async`).
|
* Add options to `node_is_replica` to check specifically for a synchronous (`--is-sync`) or asynchronous node (`--is-async`).
|
||||||
* Add `standby-leader` as a valid leader type for `cluster_has_leader`.
|
* Add `standby-leader` as a valid leader type for `cluster_has_leader`.
|
||||||
|
* Add a new service `node_is_leader` to check if a node is a leader (which includes standby leader nodes).
|
||||||
|
|
||||||
### Fixed
|
### Fixed
|
||||||
|
|
||||||
|
|
25
README.md
25
README.md
|
@ -42,6 +42,7 @@ Commands:
|
||||||
cluster_is_in_maintenance Check if the cluster is in maintenance...
|
cluster_is_in_maintenance Check if the cluster is in maintenance...
|
||||||
cluster_node_count Count the number of nodes in the cluster.
|
cluster_node_count Count the number of nodes in the cluster.
|
||||||
node_is_alive Check if the node is alive ie patroni is...
|
node_is_alive Check if the node is alive ie patroni is...
|
||||||
|
node_is_leader Check if the node is a leader node.
|
||||||
node_is_pending_restart Check if the node is in pending restart...
|
node_is_pending_restart Check if the node is in pending restart...
|
||||||
node_is_primary Check if the node is the primary with the...
|
node_is_primary Check if the node is the primary with the...
|
||||||
node_is_replica Check if the node is a running replica...
|
node_is_replica Check if the node is a running replica...
|
||||||
|
@ -350,6 +351,27 @@ Options:
|
||||||
--help Show this message and exit.
|
--help Show this message and exit.
|
||||||
```
|
```
|
||||||
|
|
||||||
|
### node_is_leader
|
||||||
|
|
||||||
|
```
|
||||||
|
Usage: check_patroni node_is_leader [OPTIONS]
|
||||||
|
|
||||||
|
Check if the node is a leader node.
|
||||||
|
|
||||||
|
This check applies to any kind of leaders including standby leaders. To
|
||||||
|
check explicitly for a standby leader use the `--is-standby-leader` option.
|
||||||
|
|
||||||
|
Check:
|
||||||
|
* `OK`: if the node is a leader.
|
||||||
|
* `CRITICAL:` otherwise
|
||||||
|
|
||||||
|
Perfdata: `is_leader` is 1 if the node is a leader node, 0 otherwise.
|
||||||
|
|
||||||
|
Options:
|
||||||
|
--is-standby-leader Check for a standby leader
|
||||||
|
--help Show this message and exit.
|
||||||
|
```
|
||||||
|
|
||||||
### node_is_primary
|
### node_is_primary
|
||||||
|
|
||||||
```
|
```
|
||||||
|
@ -357,6 +379,9 @@ Usage: check_patroni node_is_primary [OPTIONS]
|
||||||
|
|
||||||
Check if the node is the primary with the leader lock.
|
Check if the node is the primary with the leader lock.
|
||||||
|
|
||||||
|
This service is not valid for a standby leader, because this kind of node is
|
||||||
|
not a primary.
|
||||||
|
|
||||||
Check:
|
Check:
|
||||||
* `OK`: if the node is a primary with the leader lock.
|
* `OK`: if the node is a primary with the leader lock.
|
||||||
* `CRITICAL:` otherwise
|
* `CRITICAL:` otherwise
|
||||||
|
|
|
@ -21,6 +21,8 @@ from .convert import size_to_byte
|
||||||
from .node import (
|
from .node import (
|
||||||
NodeIsAlive,
|
NodeIsAlive,
|
||||||
NodeIsAliveSummary,
|
NodeIsAliveSummary,
|
||||||
|
NodeIsLeader,
|
||||||
|
NodeIsLeaderSummary,
|
||||||
NodeIsPendingRestart,
|
NodeIsPendingRestart,
|
||||||
NodeIsPendingRestartSummary,
|
NodeIsPendingRestartSummary,
|
||||||
NodeIsPrimary,
|
NodeIsPrimary,
|
||||||
|
@ -470,6 +472,8 @@ def cluster_has_scheduled_action(ctx: click.Context) -> None:
|
||||||
def node_is_primary(ctx: click.Context) -> None:
|
def node_is_primary(ctx: click.Context) -> None:
|
||||||
"""Check if the node is the primary with the leader lock.
|
"""Check if the node is the primary with the leader lock.
|
||||||
|
|
||||||
|
This service is not valid for a standby leader, because this kind of node is not a primary.
|
||||||
|
|
||||||
\b
|
\b
|
||||||
Check:
|
Check:
|
||||||
* `OK`: if the node is a primary with the leader lock.
|
* `OK`: if the node is a primary with the leader lock.
|
||||||
|
@ -486,6 +490,38 @@ def node_is_primary(ctx: click.Context) -> None:
|
||||||
check.main(verbose=ctx.obj.verbose, timeout=ctx.obj.timeout)
|
check.main(verbose=ctx.obj.verbose, timeout=ctx.obj.timeout)
|
||||||
|
|
||||||
|
|
||||||
|
@main.command(name="node_is_leader")
|
||||||
|
@click.option(
|
||||||
|
"--is-standby-leader",
|
||||||
|
"check_standby_leader",
|
||||||
|
is_flag=True,
|
||||||
|
default=False,
|
||||||
|
help="Check for a standby leader",
|
||||||
|
)
|
||||||
|
@click.pass_context
|
||||||
|
@nagiosplugin.guarded
|
||||||
|
def node_is_leader(ctx: click.Context, check_standby_leader: bool) -> None:
|
||||||
|
"""Check if the node is a leader node.
|
||||||
|
|
||||||
|
This check applies to any kind of leaders including standby leaders.
|
||||||
|
To check explicitly for a standby leader use the `--is-standby-leader` option.
|
||||||
|
|
||||||
|
\b
|
||||||
|
Check:
|
||||||
|
* `OK`: if the node is a leader.
|
||||||
|
* `CRITICAL:` otherwise
|
||||||
|
|
||||||
|
Perfdata: `is_leader` is 1 if the node is a leader node, 0 otherwise.
|
||||||
|
"""
|
||||||
|
check = nagiosplugin.Check()
|
||||||
|
check.add(
|
||||||
|
NodeIsLeader(ctx.obj.connection_info, check_standby_leader),
|
||||||
|
nagiosplugin.ScalarContext("is_leader", None, "@0:0"),
|
||||||
|
NodeIsLeaderSummary(check_standby_leader),
|
||||||
|
)
|
||||||
|
check.main(verbose=ctx.obj.verbose, timeout=ctx.obj.timeout)
|
||||||
|
|
||||||
|
|
||||||
@main.command(name="node_is_replica")
|
@main.command(name="node_is_replica")
|
||||||
@click.option("--max-lag", "max_lag", type=str, help="maximum allowed lag")
|
@click.option("--max-lag", "max_lag", type=str, help="maximum allowed lag")
|
||||||
@click.option(
|
@click.option(
|
||||||
|
|
|
@ -24,6 +24,45 @@ class NodeIsPrimarySummary(nagiosplugin.Summary):
|
||||||
return "This node is not the primary with the leader lock."
|
return "This node is not the primary with the leader lock."
|
||||||
|
|
||||||
|
|
||||||
|
class NodeIsLeader(PatroniResource):
|
||||||
|
def __init__(
|
||||||
|
self: "NodeIsLeader",
|
||||||
|
connection_info: ConnectionInfo,
|
||||||
|
check_is_standby_leader: bool,
|
||||||
|
) -> None:
|
||||||
|
super().__init__(connection_info)
|
||||||
|
self.check_is_standby_leader = check_is_standby_leader
|
||||||
|
|
||||||
|
def probe(self: "NodeIsLeader") -> Iterable[nagiosplugin.Metric]:
|
||||||
|
apiname = "leader"
|
||||||
|
if self.check_is_standby_leader:
|
||||||
|
apiname = "standby-leader"
|
||||||
|
|
||||||
|
try:
|
||||||
|
self.rest_api(apiname)
|
||||||
|
except APIError:
|
||||||
|
return [nagiosplugin.Metric("is_leader", 0)]
|
||||||
|
return [nagiosplugin.Metric("is_leader", 1)]
|
||||||
|
|
||||||
|
|
||||||
|
class NodeIsLeaderSummary(nagiosplugin.Summary):
|
||||||
|
def __init__(
|
||||||
|
self: "NodeIsLeaderSummary",
|
||||||
|
check_is_standby_leader: bool,
|
||||||
|
) -> None:
|
||||||
|
if check_is_standby_leader:
|
||||||
|
self.leader_kind = "standby leader"
|
||||||
|
else:
|
||||||
|
self.leader_kind = "leader"
|
||||||
|
|
||||||
|
def ok(self: "NodeIsLeaderSummary", results: nagiosplugin.Result) -> str:
|
||||||
|
return f"This node is a {self.leader_kind} node."
|
||||||
|
|
||||||
|
@handle_unknown
|
||||||
|
def problem(self: "NodeIsLeaderSummary", results: nagiosplugin.Result) -> str:
|
||||||
|
return f"This node is not a {self.leader_kind} node."
|
||||||
|
|
||||||
|
|
||||||
class NodeIsReplica(PatroniResource):
|
class NodeIsReplica(PatroniResource):
|
||||||
def __init__(
|
def __init__(
|
||||||
self: "NodeIsReplica",
|
self: "NodeIsReplica",
|
||||||
|
|
|
@ -172,6 +172,8 @@ readme "### node_is_alive"
|
||||||
helpme node_is_alive
|
helpme node_is_alive
|
||||||
readme "### node_is_pending_restart"
|
readme "### node_is_pending_restart"
|
||||||
helpme node_is_pending_restart
|
helpme node_is_pending_restart
|
||||||
|
readme "### node_is_leader"
|
||||||
|
helpme node_is_leader
|
||||||
readme "### node_is_primary"
|
readme "### node_is_primary"
|
||||||
helpme node_is_primary
|
helpme node_is_primary
|
||||||
readme "### node_is_replica"
|
readme "### node_is_replica"
|
||||||
|
|
26
tests/json/node_is_leader_ko.json
Normal file
26
tests/json/node_is_leader_ko.json
Normal file
|
@ -0,0 +1,26 @@
|
||||||
|
{
|
||||||
|
"state": "running",
|
||||||
|
"postmaster_start_time": "2021-08-11 07:02:20.732 UTC",
|
||||||
|
"role": "master",
|
||||||
|
"server_version": 110012,
|
||||||
|
"cluster_unlocked": false,
|
||||||
|
"xlog": {
|
||||||
|
"location": 1174407088
|
||||||
|
},
|
||||||
|
"timeline": 58,
|
||||||
|
"replication": [
|
||||||
|
{
|
||||||
|
"usename": "replicator",
|
||||||
|
"application_name": "srv1",
|
||||||
|
"client_addr": "10.20.199.3",
|
||||||
|
"state": "streaming",
|
||||||
|
"sync_state": "async",
|
||||||
|
"sync_priority": 0
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"database_system_identifier": "6965971025273547206",
|
||||||
|
"patroni": {
|
||||||
|
"version": "2.0.2",
|
||||||
|
"scope": "patroni-demo"
|
||||||
|
}
|
||||||
|
}
|
19
tests/json/node_is_leader_ko_standby_leader.json
Normal file
19
tests/json/node_is_leader_ko_standby_leader.json
Normal file
|
@ -0,0 +1,19 @@
|
||||||
|
{
|
||||||
|
"state": "running",
|
||||||
|
"postmaster_start_time": "2023-08-23 14:30:50.201691+00:00",
|
||||||
|
"role": "standby_leader",
|
||||||
|
"server_version": 140009,
|
||||||
|
"xlog": {
|
||||||
|
"received_location": 889192448,
|
||||||
|
"replayed_location": 889192448,
|
||||||
|
"replayed_timestamp": null,
|
||||||
|
"paused": false
|
||||||
|
},
|
||||||
|
"timeline": 1,
|
||||||
|
"dcs_last_seen": 1692805971,
|
||||||
|
"database_system_identifier": "7270495803765492571",
|
||||||
|
"patroni": {
|
||||||
|
"version": "3.1.0",
|
||||||
|
"scope": "patroni-demo-sb"
|
||||||
|
}
|
||||||
|
}
|
26
tests/json/node_is_leader_ok.json
Normal file
26
tests/json/node_is_leader_ok.json
Normal file
|
@ -0,0 +1,26 @@
|
||||||
|
{
|
||||||
|
"state": "running",
|
||||||
|
"postmaster_start_time": "2021-08-11 07:02:20.732 UTC",
|
||||||
|
"role": "master",
|
||||||
|
"server_version": 110012,
|
||||||
|
"cluster_unlocked": false,
|
||||||
|
"xlog": {
|
||||||
|
"location": 1174407088
|
||||||
|
},
|
||||||
|
"timeline": 58,
|
||||||
|
"replication": [
|
||||||
|
{
|
||||||
|
"usename": "replicator",
|
||||||
|
"application_name": "srv1",
|
||||||
|
"client_addr": "10.20.199.3",
|
||||||
|
"state": "streaming",
|
||||||
|
"sync_state": "async",
|
||||||
|
"sync_priority": 0
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"database_system_identifier": "6965971025273547206",
|
||||||
|
"patroni": {
|
||||||
|
"version": "2.0.2",
|
||||||
|
"scope": "patroni-demo"
|
||||||
|
}
|
||||||
|
}
|
19
tests/json/node_is_leader_ok_standby_leader.json
Normal file
19
tests/json/node_is_leader_ok_standby_leader.json
Normal file
|
@ -0,0 +1,19 @@
|
||||||
|
{
|
||||||
|
"state": "running",
|
||||||
|
"postmaster_start_time": "2023-08-23 14:30:50.201691+00:00",
|
||||||
|
"role": "standby_leader",
|
||||||
|
"server_version": 140009,
|
||||||
|
"xlog": {
|
||||||
|
"received_location": 889192448,
|
||||||
|
"replayed_location": 889192448,
|
||||||
|
"replayed_timestamp": null,
|
||||||
|
"paused": false
|
||||||
|
},
|
||||||
|
"timeline": 1,
|
||||||
|
"dcs_last_seen": 1692805971,
|
||||||
|
"database_system_identifier": "7270495803765492571",
|
||||||
|
"patroni": {
|
||||||
|
"version": "3.1.0",
|
||||||
|
"scope": "patroni-demo-sb"
|
||||||
|
}
|
||||||
|
}
|
53
tests/test_node_is_leader.py
Normal file
53
tests/test_node_is_leader.py
Normal file
|
@ -0,0 +1,53 @@
|
||||||
|
from click.testing import CliRunner
|
||||||
|
from pytest_mock import MockerFixture
|
||||||
|
|
||||||
|
from check_patroni.cli import main
|
||||||
|
|
||||||
|
from .tools import my_mock
|
||||||
|
|
||||||
|
|
||||||
|
def test_node_is_leader_ok(mocker: MockerFixture, use_old_replica_state: bool) -> None:
|
||||||
|
runner = CliRunner()
|
||||||
|
|
||||||
|
my_mock(mocker, "node_is_leader_ok", 200)
|
||||||
|
result = runner.invoke(main, ["-e", "https://10.20.199.3:8008", "node_is_leader"])
|
||||||
|
assert result.exit_code == 0
|
||||||
|
assert (
|
||||||
|
result.stdout
|
||||||
|
== "NODEISLEADER OK - This node is a leader node. | is_leader=1;;@0\n"
|
||||||
|
)
|
||||||
|
|
||||||
|
my_mock(mocker, "node_is_leader_ok_standby_leader", 200)
|
||||||
|
result = runner.invoke(
|
||||||
|
main,
|
||||||
|
["-e", "https://10.20.199.3:8008", "node_is_leader", "--is-standby-leader"],
|
||||||
|
)
|
||||||
|
print(result.stdout)
|
||||||
|
assert result.exit_code == 0
|
||||||
|
assert (
|
||||||
|
result.stdout
|
||||||
|
== "NODEISLEADER OK - This node is a standby leader node. | is_leader=1;;@0\n"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def test_node_is_leader_ko(mocker: MockerFixture, use_old_replica_state: bool) -> None:
|
||||||
|
runner = CliRunner()
|
||||||
|
|
||||||
|
my_mock(mocker, "node_is_leader_ko", 503)
|
||||||
|
result = runner.invoke(main, ["-e", "https://10.20.199.3:8008", "node_is_leader"])
|
||||||
|
assert result.exit_code == 2
|
||||||
|
assert (
|
||||||
|
result.stdout
|
||||||
|
== "NODEISLEADER CRITICAL - This node is not a leader node. | is_leader=0;;@0\n"
|
||||||
|
)
|
||||||
|
|
||||||
|
my_mock(mocker, "node_is_leader_ko_standby_leader", 503)
|
||||||
|
result = runner.invoke(
|
||||||
|
main,
|
||||||
|
["-e", "https://10.20.199.3:8008", "node_is_leader", "--is-standby-leader"],
|
||||||
|
)
|
||||||
|
assert result.exit_code == 2
|
||||||
|
assert (
|
||||||
|
result.stdout
|
||||||
|
== "NODEISLEADER CRITICAL - This node is not a standby leader node. | is_leader=0;;@0\n"
|
||||||
|
)
|
|
@ -18,6 +18,7 @@ echo "-- Node checks"
|
||||||
check_patroni -e "$1" node_is_alive
|
check_patroni -e "$1" node_is_alive
|
||||||
check_patroni -e "$1" node_is_pending_restart
|
check_patroni -e "$1" node_is_pending_restart
|
||||||
check_patroni -e "$1" node_is_primary
|
check_patroni -e "$1" node_is_primary
|
||||||
|
check_patroni -e "$1" node_is_leader --is-standby-leader
|
||||||
check_patroni -e "$1" node_is_replica
|
check_patroni -e "$1" node_is_replica
|
||||||
check_patroni -e "$1" node_is_replica --is-sync
|
check_patroni -e "$1" node_is_replica --is-sync
|
||||||
check_patroni -e "$1" node_patroni_version --patroni-version 3.1.0
|
check_patroni -e "$1" node_patroni_version --patroni-version 3.1.0
|
||||||
|
|
Loading…
Reference in a new issue