New service cluster_is_in_maintenance

benoit 2021-08-12 12:48:55 +02:00
parent 4169766a2f
commit dd8130a459
7 changed files with 213 additions and 4 deletions


@@ -12,6 +12,7 @@ from .cluster import (
ClusterHasLeaderSummary,
ClusterHasReplica,
ClusterNodeCount,
ClusterIsInMaintenance,
)
from .node import (
NodeIsAlive,
@@ -183,6 +184,7 @@ def cluster_node_count(
Perfdata:
* `members`: the member count.
* all the roles of the nodes in the cluster with their number.
* all the statuses of the nodes in the cluster with their number.
"""
check = nagiosplugin.Check()
check.add(
@@ -336,6 +338,31 @@ def cluster_config_has_changed(
)
@main.command(name="cluster_is_in_maintenance")
@click.pass_context
@nagiosplugin.guarded
def cluster_is_in_maintenance(ctx: click.Context) -> None:
"""Check if the cluster is in maintenance mode ie paused.
\b
Check:
* `OK`: If the cluster is in maintenance mode.
* `CRITICAL`: otherwise.
\b
Perfdata :
* `is_in_maintenance` is 1 the cluster is in maintenance mode, 0 otherwise
"""
check = nagiosplugin.Check()
check.add(
ClusterIsInMaintenance(ctx.obj),
nagiosplugin.ScalarContext("is_in_maintenance", None, "0:0"),
)
check.main(
verbose=ctx.parent.params["verbose"], timeout=ctx.parent.params["timeout"]
)
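The `0:0` critical range given to `ScalarContext` above is what produces the statuses listed in the docstring: only a metric value of 0 is accepted, so a paused cluster (value 1) raises CRITICAL. A minimal illustration of that range logic, not part of this commit:
import nagiosplugin

# Illustration only: the "0:0" critical range accepts 0 and nothing else,
# so is_in_maintenance == 1 (cluster paused) violates it and turns CRITICAL.
critical_range = nagiosplugin.Range("0:0")
assert critical_range.match(0)      # 0 -> not in maintenance -> OK
assert not critical_range.match(1)  # 1 -> in maintenance -> CRITICAL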
@main.command(name="node_is_primary")
@click.pass_context
@nagiosplugin.guarded
@@ -393,7 +420,7 @@ def node_is_pending_restart(ctx: click.Context) -> None:
"""Check if the node is in pending restart state.
This situation can arise if the configuration has been modified but
requiers arestart of PostgreSQL.
requires a restart of PostgreSQL to take effect.
\b
Check:
@@ -427,7 +454,7 @@ def node_is_pending_restart(ctx: click.Context) -> None:
@click.pass_context
@nagiosplugin.guarded
def node_tl_has_changed(ctx: click.Context, timeline: str, state_file: str) -> None:
"""Check if the timeline hash changed.
"""Check if the timeline has changed.
Note: either a timeline or a state file must be provided for this service to work.


@@ -21,9 +21,9 @@ class ClusterNodeCount(PatroniResource):
_log.debug(f"api call data: {r.data}")
item_dict = json.loads(r.data)
role_counters = Counter()
role_counters: Counter[str] = Counter()
roles = []
status_counters = Counter()
status_counters: Counter[str] = Counter()
statuses = []
for member in item_dict["members"]:
@@ -159,3 +159,15 @@ class ClusterConfigHasChangedSummary(nagiosplugin.Summary):
self: "ClusterConfigHasChangedSummary", results: nagiosplugin.Result
) -> str:
return "The hash of patroni's dynamic configuration has changed."
class ClusterIsInMaintenance(PatroniResource):
def probe(self: "ClusterIsInMaintenance") -> Iterable[nagiosplugin.Metric]:
r = self.rest_api("cluster")
_log.debug(f"api call status: {r.status}")
_log.debug(f"api call data: {r.data}")
item_dict = json.loads(r.data)
# The actual check: 1 if the cluster is paused (maintenance mode), 0 otherwise
return [nagiosplugin.Metric("is_in_maintenance", 1 if "pause" in item_dict and item_dict["pause"] else 0)]
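The probe above reduces to reading the `pause` key of the `/cluster` payload. A standalone sketch of that decision, not part of the commit, checked against inline payloads shaped like the fixtures below (`maintenance_metric` is a hypothetical helper, not the plugin's API):
def maintenance_metric(cluster_payload: dict) -> int:
    # Mirrors the probe's expression: 1 when "pause" is present and truthy.
    return 1 if "pause" in cluster_payload and cluster_payload["pause"] else 0

assert maintenance_metric({"members": [], "pause": True}) == 1   # paused -> CRITICAL
assert maintenance_metric({"members": [], "pause": False}) == 0  # not paused -> OK
assert maintenance_metric({"members": []}) == 0                  # key absent -> OK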


@@ -0,0 +1,34 @@
{
"members": [
{
"name": "srv1",
"role": "leader",
"state": "running",
"api_url": "https://10.20.199.3:8008/patroni",
"host": "10.20.199.3",
"port": 5432,
"timeline": 51
},
{
"name": "srv2",
"role": "replica",
"state": "running",
"api_url": "https://10.20.199.4:8008/patroni",
"host": "10.20.199.4",
"port": 5432,
"timeline": 51,
"lag": 0
},
{
"name": "srv3",
"role": "replica",
"state": "running",
"api_url": "https://10.20.199.5:8008/patroni",
"host": "10.20.199.5",
"port": 5432,
"timeline": 51,
"lag": 0
}
],
"pause": true
}


@@ -0,0 +1,34 @@
{
"members": [
{
"name": "srv1",
"role": "leader",
"state": "running",
"api_url": "https://10.20.199.3:8008/patroni",
"host": "10.20.199.3",
"port": 5432,
"timeline": 51
},
{
"name": "srv2",
"role": "replica",
"state": "running",
"api_url": "https://10.20.199.4:8008/patroni",
"host": "10.20.199.4",
"port": 5432,
"timeline": 51,
"lag": 0
},
{
"name": "srv3",
"role": "replica",
"state": "running",
"api_url": "https://10.20.199.5:8008/patroni",
"host": "10.20.199.5",
"port": 5432,
"timeline": 51,
"lag": 0
}
],
"pause": false
}


@@ -0,0 +1,33 @@
{
"members": [
{
"name": "srv1",
"role": "leader",
"state": "running",
"api_url": "https://10.20.199.3:8008/patroni",
"host": "10.20.199.3",
"port": 5432,
"timeline": 51
},
{
"name": "srv2",
"role": "replica",
"state": "running",
"api_url": "https://10.20.199.4:8008/patroni",
"host": "10.20.199.4",
"port": 5432,
"timeline": 51,
"lag": 0
},
{
"name": "srv3",
"role": "replica",
"state": "running",
"api_url": "https://10.20.199.5:8008/patroni",
"host": "10.20.199.5",
"port": 5432,
"timeline": 51,
"lag": 0
}
]
}


@@ -0,0 +1,34 @@
{
"members": [
{
"name": "srv1",
"role": "leader",
"state": "running",
"api_url": "https://10.20.199.3:8008/patroni",
"host": "10.20.199.3",
"port": 5432,
"timeline": 51
},
{
"name": "srv2",
"role": "replica",
"state": "running",
"api_url": "https://10.20.199.4:8008/patroni",
"host": "10.20.199.4",
"port": 5432,
"timeline": 51,
"lag": 0
},
{
"name": "srv3",
"role": "replica",
"state": "running",
"api_url": "https://10.20.199.5:8008/patroni",
"host": "10.20.199.5",
"port": 5432,
"timeline": 51,
"lag": 0
}
],
"pause": false
}


@@ -0,0 +1,35 @@
from click.testing import CliRunner
from pytest_mock import MockerFixture
from check_patroni.cli import main
from tools import my_mock
def test_cluster_is_in_maintenance_ok(mocker: MockerFixture) -> None:
runner = CliRunner()
my_mock(mocker, "cluster_is_in_maintenance_ok", 200)
result = runner.invoke(
main, ["-e", "https://10.20.199.3:8008", "cluster_is_in_maintenance"]
)
assert result.exit_code == 0
def test_cluster_is_in_maintenance_ko(mocker: MockerFixture) -> None:
runner = CliRunner()
my_mock(mocker, "cluster_is_in_maintenance_ko", 200)
result = runner.invoke(
main, ["-e", "https://10.20.199.3:8008", "cluster_is_in_maintenance"]
)
assert result.exit_code == 2
def test_cluster_is_in_maintenance_ok_pause_false(mocker: MockerFixture) -> None:
runner = CliRunner()
my_mock(mocker, "cluster_is_in_maintenance_ok_pause_false", 200)
result = runner.invoke(
main, ["-e", "https://10.20.199.3:8008", "cluster_is_in_maintenance"]
)
assert result.exit_code == 0
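The fixture without a `pause` key should also come back OK; if that payload is not already the one behind `cluster_is_in_maintenance_ok`, a test along these lines would cover it (a sketch only, the fixture name `cluster_is_in_maintenance_ok_no_pause` is hypothetical):
def test_cluster_is_in_maintenance_ok_no_pause(mocker: MockerFixture) -> None:
    # Sketch only: assumes a hypothetical fixture holding the payload shown
    # above with no "pause" key; its absence must read as "not paused".
    runner = CliRunner()
    my_mock(mocker, "cluster_is_in_maintenance_ok_no_pause", 200)
    result = runner.invoke(
        main, ["-e", "https://10.20.199.3:8008", "cluster_is_in_maintenance"]
    )
    assert result.exit_code == 0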