From 259f04587b7a04e1bb4e656061ddebfcf36da6e6 Mon Sep 17 00:00:00 2001 From: benoit Date: Wed, 23 Aug 2023 17:40:46 +0200 Subject: [PATCH] Add a node_is_leader service to check for the leader states It's possible to check for any kind of leader or specifically for a standby leader. --- CHANGELOG.md | 1 + README.md | 25 +++++++++ check_patroni/cli.py | 36 +++++++++++++ check_patroni/node.py | 39 ++++++++++++++ docs/make_readme.sh | 2 + tests/json/node_is_leader_ko.json | 26 +++++++++ .../node_is_leader_ko_standby_leader.json | 19 +++++++ tests/json/node_is_leader_ok.json | 26 +++++++++ .../node_is_leader_ok_standby_leader.json | 19 +++++++ tests/test_node_is_leader.py | 53 +++++++++++++++++++ vagrant/check_patroni.sh | 1 + 11 files changed, 247 insertions(+) create mode 100644 tests/json/node_is_leader_ko.json create mode 100644 tests/json/node_is_leader_ko_standby_leader.json create mode 100644 tests/json/node_is_leader_ok.json create mode 100644 tests/json/node_is_leader_ok_standby_leader.json create mode 100644 tests/test_node_is_leader.py diff --git a/CHANGELOG.md b/CHANGELOG.md index f3c76cc..92843d1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,7 @@ * Add a new service `cluster_has_scheduled_action` to warn of any scheduled switchover or restart. * Add options to `node_is_replica` to check specifically for a synchronous (`--is-sync`) or asynchronous node (`--is-async`). * Add `standby-leader` as a valid leader type for `cluster_has_leader`. +* Add a new service `node_is_leader` to check if a node is a leader (which includes standby leader nodes). ### Fixed diff --git a/README.md b/README.md index b493262..3fb6729 100644 --- a/README.md +++ b/README.md @@ -42,6 +42,7 @@ Commands: cluster_is_in_maintenance Check if the cluster is in maintenance... cluster_node_count Count the number of nodes in the cluster. node_is_alive Check if the node is alive ie patroni is... + node_is_leader Check if the node is a leader node. 
node_is_pending_restart Check if the node is in pending restart... node_is_primary Check if the node is the primary with the... node_is_replica Check if the node is a running replica... @@ -350,6 +351,27 @@ Options: --help Show this message and exit. ``` +### node_is_leader + +``` +Usage: check_patroni node_is_leader [OPTIONS] + + Check if the node is a leader node. + + This check applies to any kind of leader, including standby leaders. To + check explicitly for a standby leader use the `--is-standby-leader` option. + + Check: + * `OK`: if the node is a leader. + * `CRITICAL:` otherwise + + Perfdata: `is_leader` is 1 if the node is a leader node, 0 otherwise. + +Options: + --is-standby-leader Check for a standby leader + --help Show this message and exit. +``` + ### node_is_primary ``` @@ -357,6 +379,9 @@ Usage: check_patroni node_is_primary [OPTIONS] Check if the node is the primary with the leader lock. + This service is not valid for a standby leader, because this kind of node is + not a primary. + Check: * `OK`: if the node is a primary with the leader lock. * `CRITICAL:` otherwise diff --git a/check_patroni/cli.py b/check_patroni/cli.py index 19d5058..227e654 100644 --- a/check_patroni/cli.py +++ b/check_patroni/cli.py @@ -21,6 +21,8 @@ from .convert import size_to_byte from .node import ( NodeIsAlive, NodeIsAliveSummary, + NodeIsLeader, + NodeIsLeaderSummary, NodeIsPendingRestart, NodeIsPendingRestartSummary, NodeIsPrimary, @@ -470,6 +472,8 @@ def cluster_has_scheduled_action(ctx: click.Context) -> None: def node_is_primary(ctx: click.Context) -> None: """Check if the node is the primary with the leader lock. + This service is not valid for a standby leader, because this kind of node is not a primary. + \b Check: * `OK`: if the node is a primary with the leader lock. 
@@ -486,6 +490,38 @@ def node_is_primary(ctx: click.Context) -> None: check.main(verbose=ctx.obj.verbose, timeout=ctx.obj.timeout) +@main.command(name="node_is_leader") +@click.option( + "--is-standby-leader", + "check_standby_leader", + is_flag=True, + default=False, + help="Check for a standby leader", +) +@click.pass_context +@nagiosplugin.guarded +def node_is_leader(ctx: click.Context, check_standby_leader: bool) -> None: + """Check if the node is a leader node. + + This check applies to any kind of leader, including standby leaders. + To check explicitly for a standby leader use the `--is-standby-leader` option. + + \b + Check: + * `OK`: if the node is a leader. + * `CRITICAL:` otherwise + + Perfdata: `is_leader` is 1 if the node is a leader node, 0 otherwise. + """ + check = nagiosplugin.Check() + check.add( + NodeIsLeader(ctx.obj.connection_info, check_standby_leader), + nagiosplugin.ScalarContext("is_leader", None, "@0:0"), + NodeIsLeaderSummary(check_standby_leader), + ) + check.main(verbose=ctx.obj.verbose, timeout=ctx.obj.timeout) + + @main.command(name="node_is_replica") @click.option("--max-lag", "max_lag", type=str, help="maximum allowed lag") @click.option( diff --git a/check_patroni/node.py b/check_patroni/node.py index 15f8b94..df50cff 100644 --- a/check_patroni/node.py +++ b/check_patroni/node.py @@ -24,6 +24,45 @@ class NodeIsPrimarySummary(nagiosplugin.Summary): return "This node is not the primary with the leader lock." 
+class NodeIsLeader(PatroniResource): + def __init__( + self: "NodeIsLeader", + connection_info: ConnectionInfo, + check_is_standby_leader: bool, + ) -> None: + super().__init__(connection_info) + self.check_is_standby_leader = check_is_standby_leader + + def probe(self: "NodeIsLeader") -> Iterable[nagiosplugin.Metric]: + apiname = "leader" + if self.check_is_standby_leader: + apiname = "standby-leader" + + try: + self.rest_api(apiname) + except APIError: + return [nagiosplugin.Metric("is_leader", 0)] + return [nagiosplugin.Metric("is_leader", 1)] + + +class NodeIsLeaderSummary(nagiosplugin.Summary): + def __init__( + self: "NodeIsLeaderSummary", + check_is_standby_leader: bool, + ) -> None: + if check_is_standby_leader: + self.leader_kind = "standby leader" + else: + self.leader_kind = "leader" + + def ok(self: "NodeIsLeaderSummary", results: nagiosplugin.Result) -> str: + return f"This node is a {self.leader_kind} node." + + @handle_unknown + def problem(self: "NodeIsLeaderSummary", results: nagiosplugin.Result) -> str: + return f"This node is not a {self.leader_kind} node." 
+ + class NodeIsReplica(PatroniResource): def __init__( self: "NodeIsReplica", diff --git a/docs/make_readme.sh b/docs/make_readme.sh index c29406f..6b1703b 100755 --- a/docs/make_readme.sh +++ b/docs/make_readme.sh @@ -172,6 +172,8 @@ readme "### node_is_alive" helpme node_is_alive readme "### node_is_pending_restart" helpme node_is_pending_restart +readme "### node_is_leader" +helpme node_is_leader readme "### node_is_primary" helpme node_is_primary readme "### node_is_replica" diff --git a/tests/json/node_is_leader_ko.json b/tests/json/node_is_leader_ko.json new file mode 100644 index 0000000..d47b18b --- /dev/null +++ b/tests/json/node_is_leader_ko.json @@ -0,0 +1,26 @@ +{ + "state": "running", + "postmaster_start_time": "2021-08-11 07:02:20.732 UTC", + "role": "master", + "server_version": 110012, + "cluster_unlocked": false, + "xlog": { + "location": 1174407088 + }, + "timeline": 58, + "replication": [ + { + "usename": "replicator", + "application_name": "srv1", + "client_addr": "10.20.199.3", + "state": "streaming", + "sync_state": "async", + "sync_priority": 0 + } + ], + "database_system_identifier": "6965971025273547206", + "patroni": { + "version": "2.0.2", + "scope": "patroni-demo" + } +} diff --git a/tests/json/node_is_leader_ko_standby_leader.json b/tests/json/node_is_leader_ko_standby_leader.json new file mode 100644 index 0000000..7f9ffd3 --- /dev/null +++ b/tests/json/node_is_leader_ko_standby_leader.json @@ -0,0 +1,19 @@ +{ + "state": "running", + "postmaster_start_time": "2023-08-23 14:30:50.201691+00:00", + "role": "standby_leader", + "server_version": 140009, + "xlog": { + "received_location": 889192448, + "replayed_location": 889192448, + "replayed_timestamp": null, + "paused": false + }, + "timeline": 1, + "dcs_last_seen": 1692805971, + "database_system_identifier": "7270495803765492571", + "patroni": { + "version": "3.1.0", + "scope": "patroni-demo-sb" + } +} diff --git a/tests/json/node_is_leader_ok.json b/tests/json/node_is_leader_ok.json 
new file mode 100644 index 0000000..d47b18b --- /dev/null +++ b/tests/json/node_is_leader_ok.json @@ -0,0 +1,26 @@ +{ + "state": "running", + "postmaster_start_time": "2021-08-11 07:02:20.732 UTC", + "role": "master", + "server_version": 110012, + "cluster_unlocked": false, + "xlog": { + "location": 1174407088 + }, + "timeline": 58, + "replication": [ + { + "usename": "replicator", + "application_name": "srv1", + "client_addr": "10.20.199.3", + "state": "streaming", + "sync_state": "async", + "sync_priority": 0 + } + ], + "database_system_identifier": "6965971025273547206", + "patroni": { + "version": "2.0.2", + "scope": "patroni-demo" + } +} diff --git a/tests/json/node_is_leader_ok_standby_leader.json b/tests/json/node_is_leader_ok_standby_leader.json new file mode 100644 index 0000000..7f9ffd3 --- /dev/null +++ b/tests/json/node_is_leader_ok_standby_leader.json @@ -0,0 +1,19 @@ +{ + "state": "running", + "postmaster_start_time": "2023-08-23 14:30:50.201691+00:00", + "role": "standby_leader", + "server_version": 140009, + "xlog": { + "received_location": 889192448, + "replayed_location": 889192448, + "replayed_timestamp": null, + "paused": false + }, + "timeline": 1, + "dcs_last_seen": 1692805971, + "database_system_identifier": "7270495803765492571", + "patroni": { + "version": "3.1.0", + "scope": "patroni-demo-sb" + } +} diff --git a/tests/test_node_is_leader.py b/tests/test_node_is_leader.py new file mode 100644 index 0000000..2822307 --- /dev/null +++ b/tests/test_node_is_leader.py @@ -0,0 +1,53 @@ +from click.testing import CliRunner +from pytest_mock import MockerFixture + +from check_patroni.cli import main + +from .tools import my_mock + + +def test_node_is_leader_ok(mocker: MockerFixture, use_old_replica_state: bool) -> None: + runner = CliRunner() + + my_mock(mocker, "node_is_leader_ok", 200) + result = runner.invoke(main, ["-e", "https://10.20.199.3:8008", "node_is_leader"]) + assert result.exit_code == 0 + assert ( + result.stdout + == "NODEISLEADER 
OK - This node is a leader node. | is_leader=1;;@0\n" + ) + + my_mock(mocker, "node_is_leader_ok_standby_leader", 200) + result = runner.invoke( + main, + ["-e", "https://10.20.199.3:8008", "node_is_leader", "--is-standby-leader"], + ) + print(result.stdout) + assert result.exit_code == 0 + assert ( + result.stdout + == "NODEISLEADER OK - This node is a standby leader node. | is_leader=1;;@0\n" + ) + + +def test_node_is_leader_ko(mocker: MockerFixture, use_old_replica_state: bool) -> None: + runner = CliRunner() + + my_mock(mocker, "node_is_leader_ko", 503) + result = runner.invoke(main, ["-e", "https://10.20.199.3:8008", "node_is_leader"]) + assert result.exit_code == 2 + assert ( + result.stdout + == "NODEISLEADER CRITICAL - This node is not a leader node. | is_leader=0;;@0\n" + ) + + my_mock(mocker, "node_is_leader_ko_standby_leader", 503) + result = runner.invoke( + main, + ["-e", "https://10.20.199.3:8008", "node_is_leader", "--is-standby-leader"], + ) + assert result.exit_code == 2 + assert ( + result.stdout + == "NODEISLEADER CRITICAL - This node is not a standby leader node. | is_leader=0;;@0\n" + ) diff --git a/vagrant/check_patroni.sh b/vagrant/check_patroni.sh index 56b8391..71e0182 100755 --- a/vagrant/check_patroni.sh +++ b/vagrant/check_patroni.sh @@ -18,6 +18,7 @@ echo "-- Node checks" check_patroni -e "$1" node_is_alive check_patroni -e "$1" node_is_pending_restart check_patroni -e "$1" node_is_primary +check_patroni -e "$1" node_is_leader --is-standby-leader check_patroni -e "$1" node_is_replica check_patroni -e "$1" node_is_replica --is-sync check_patroni -e "$1" node_patroni_version --patroni-version 3.1.0