From dd8130a459e9961004d9a0f2031ad8f5c24c96c4 Mon Sep 17 00:00:00 2001
From: benoit
Date: Thu, 12 Aug 2021 12:48:55 +0200
Subject: [PATCH] New service cluster_is_in_maintenance

---
 check_patroni/cli.py                          | 31 ++++++++++++++--
 check_patroni/cluster.py                      | 22 ++++++++++---
 test/json/cluster_is_in_maintenance_ko.json   | 34 ++++++++++++++++++
 test/json/cluster_is_in_maintenance_ok.json   | 33 +++++++++++++++++
 ...ster_is_in_maintenance_ok_pause_false.json | 34 ++++++++++++++++++
 test/test_cluster_is_in_maintenance.py        | 36 +++++++++++++++++++
 6 files changed, 186 insertions(+), 4 deletions(-)
 create mode 100644 test/json/cluster_is_in_maintenance_ko.json
 create mode 100644 test/json/cluster_is_in_maintenance_ok.json
 create mode 100644 test/json/cluster_is_in_maintenance_ok_pause_false.json
 create mode 100644 test/test_cluster_is_in_maintenance.py

diff --git a/check_patroni/cli.py b/check_patroni/cli.py
index 52c2538..8b0807a 100644
--- a/check_patroni/cli.py
+++ b/check_patroni/cli.py
@@ -12,6 +12,7 @@ from .cluster import (
     ClusterHasLeaderSummary,
     ClusterHasReplica,
+    ClusterIsInMaintenance,
     ClusterNodeCount,
 )
 from .node import (
     NodeIsAlive,
@@ -183,6 +184,7 @@ def cluster_node_count(
     Perfdata:
     * `members`: the member count.
     * all the roles of the nodes in the cluster with their number.
+    * all the statuses of the nodes in the cluster with their number.
     """
     check = nagiosplugin.Check()
     check.add(
@@ -336,6 +338,31 @@ def cluster_config_has_changed(
     )
 
 
+@main.command(name="cluster_is_in_maintenance")
+@click.pass_context
+@nagiosplugin.guarded
+def cluster_is_in_maintenance(ctx: click.Context) -> None:
+    """Check if the cluster is in maintenance mode, i.e. paused.
+
+    \b
+    Check:
+    * `OK`: if the cluster is not in maintenance mode.
+    * `CRITICAL`: otherwise.
+
+    \b
+    Perfdata:
+    * `is_in_maintenance` is 1 if the cluster is in maintenance mode, 0 otherwise.
+    """
+    check = nagiosplugin.Check()
+    check.add(
+        ClusterIsInMaintenance(ctx.obj),
+        nagiosplugin.ScalarContext("is_in_maintenance", None, "0:0"),
+    )
+    check.main(
+        verbose=ctx.parent.params["verbose"], timeout=ctx.parent.params["timeout"]
+    )
+
+
 @main.command(name="node_is_primary")
 @click.pass_context
 @nagiosplugin.guarded
@@ -393,7 +420,7 @@ def node_is_pending_restart(ctx: click.Context) -> None:
     """Check if the node is in pending restart state.
 
     This situation can arise if the configuration has been modified but
-    requiers arestart of PostgreSQL.
+    requires a restart of PostgreSQL to take effect.
 
     \b
     Check:
@@ -427,7 +454,7 @@
 @click.pass_context
 @nagiosplugin.guarded
 def node_tl_has_changed(ctx: click.Context, timeline: str, state_file: str) -> None:
-    """Check if the timeline hash changed.
+    """Check if the timeline has changed.
 
     Note: either a timeline or a state file must be provided for this service
     to work.
diff --git a/check_patroni/cluster.py b/check_patroni/cluster.py
index a7ec089..1c0e73a 100644
--- a/check_patroni/cluster.py
+++ b/check_patroni/cluster.py
@@ -21,9 +21,9 @@ class ClusterNodeCount(PatroniResource):
         _log.debug(f"api call data: {r.data}")
         item_dict = json.loads(r.data)
 
-        role_counters = Counter()
+        role_counters: Counter[str] = Counter()
         roles = []
-        status_counters = Counter()
+        status_counters: Counter[str] = Counter()
         statuses = []
 
         for member in item_dict["members"]:
@@ -159,3 +159,21 @@ class ClusterConfigHasChangedSummary(nagiosplugin.Summary):
         self: "ClusterConfigHasChangedSummary", results: nagiosplugin.Result
     ) -> str:
         return "The hash of patroni's dynamic configuration has changed."
+
+
+class ClusterIsInMaintenance(PatroniResource):
+    def probe(self: "ClusterIsInMaintenance") -> Iterable[nagiosplugin.Metric]:
+        r = self.rest_api("cluster")
+        _log.debug(f"api call status: {r.status}")
+        _log.debug(f"api call data: {r.data}")
+
+        item_dict = json.loads(r.data)
+
+        # "pause" may be absent from the API response when the cluster is
+        # not in maintenance mode, so default to False.
+        return [
+            nagiosplugin.Metric(
+                "is_in_maintenance",
+                1 if item_dict.get("pause", False) else 0,
+            )
+        ]
diff --git a/test/json/cluster_is_in_maintenance_ko.json b/test/json/cluster_is_in_maintenance_ko.json
new file mode 100644
index 0000000..be962c0
--- /dev/null
+++ b/test/json/cluster_is_in_maintenance_ko.json
@@ -0,0 +1,34 @@
+{
+  "members": [
+    {
+      "name": "srv1",
+      "role": "leader",
+      "state": "running",
+      "api_url": "https://10.20.199.3:8008/patroni",
+      "host": "10.20.199.3",
+      "port": 5432,
+      "timeline": 51
+    },
+    {
+      "name": "srv2",
+      "role": "replica",
+      "state": "running",
+      "api_url": "https://10.20.199.4:8008/patroni",
+      "host": "10.20.199.4",
+      "port": 5432,
+      "timeline": 51,
+      "lag": 0
+    },
+    {
+      "name": "srv3",
+      "role": "replica",
+      "state": "running",
+      "api_url": "https://10.20.199.5:8008/patroni",
+      "host": "10.20.199.5",
+      "port": 5432,
+      "timeline": 51,
+      "lag": 0
+    }
+  ],
+  "pause": true
+}
diff --git a/test/json/cluster_is_in_maintenance_ok.json b/test/json/cluster_is_in_maintenance_ok.json
new file mode 100644
index 0000000..547d6c8
--- /dev/null
+++ b/test/json/cluster_is_in_maintenance_ok.json
@@ -0,0 +1,33 @@
+{
+  "members": [
+    {
+      "name": "srv1",
+      "role": "leader",
+      "state": "running",
+      "api_url": "https://10.20.199.3:8008/patroni",
+      "host": "10.20.199.3",
+      "port": 5432,
+      "timeline": 51
+    },
+    {
+      "name": "srv2",
+      "role": "replica",
+      "state": "running",
+      "api_url": "https://10.20.199.4:8008/patroni",
+      "host": "10.20.199.4",
+      "port": 5432,
+      "timeline": 51,
+      "lag": 0
+    },
+    {
+      "name": "srv3",
+      "role": "replica",
+      "state": "running",
+      "api_url": "https://10.20.199.5:8008/patroni",
"10.20.199.5", + "port": 5432, + "timeline": 51, + "lag": 0 + } + ] +} diff --git a/test/json/cluster_is_in_maintenance_ok_pause_false.json b/test/json/cluster_is_in_maintenance_ok_pause_false.json new file mode 100644 index 0000000..d10c75c --- /dev/null +++ b/test/json/cluster_is_in_maintenance_ok_pause_false.json @@ -0,0 +1,34 @@ +{ + "members": [ + { + "name": "srv1", + "role": "leader", + "state": "running", + "api_url": "https://10.20.199.3:8008/patroni", + "host": "10.20.199.3", + "port": 5432, + "timeline": 51 + }, + { + "name": "srv2", + "role": "replica", + "state": "running", + "api_url": "https://10.20.199.4:8008/patroni", + "host": "10.20.199.4", + "port": 5432, + "timeline": 51, + "lag": 0 + }, + { + "name": "srv3", + "role": "replica", + "state": "running", + "api_url": "https://10.20.199.5:8008/patroni", + "host": "10.20.199.5", + "port": 5432, + "timeline": 51, + "lag": 0 + } + ], + "pause": false +} diff --git a/test/test_cluster_is_in_maintenance.py b/test/test_cluster_is_in_maintenance.py new file mode 100644 index 0000000..6b908f9 --- /dev/null +++ b/test/test_cluster_is_in_maintenance.py @@ -0,0 +1,35 @@ +from click.testing import CliRunner +from pytest_mock import MockerFixture + +from check_patroni.cli import main + +from tools import my_mock + + +def test_cluster_is_in_maintenance_ok(mocker: MockerFixture) -> None: + runner = CliRunner() + + my_mock(mocker, "cluster_is_in_maintenance_ok", 200) + result = runner.invoke( + main, ["-e", "https://10.20.199.3:8008", "cluster_is_in_maintenance"] + ) + assert result.exit_code == 0 + + +def test_cluster_is_in_maintenance_ko(mocker: MockerFixture) -> None: + runner = CliRunner() + + my_mock(mocker, "cluster_is_in_maintenance_ko", 200) + result = runner.invoke( + main, ["-e", "https://10.20.199.3:8008", "cluster_is_in_maintenance"] + ) + assert result.exit_code == 2 + +def test_cluster_is_in_maintenance_ok_pause_false(mocker: MockerFixture) -> None: + runner = CliRunner() + + my_mock(mocker, "cluster_is_in_maintenance_ok_pause_false", 200) + result = runner.invoke( + main, ["-e", "https://10.20.199.3:8008", "cluster_is_in_maintenance"] + ) + assert result.exit_code == 0