New service cluster_is_in_maintenance

benoit 2021-08-12 12:48:55 +02:00
parent 4169766a2f
commit dd8130a459
7 changed files with 213 additions and 4 deletions

View file

@ -12,6 +12,7 @@ from .cluster import (
    ClusterHasLeaderSummary,
    ClusterHasReplica,
    ClusterNodeCount,
    ClusterIsInMaintenance,
)
from .node import (
    NodeIsAlive,
@ -183,6 +184,7 @@ def cluster_node_count(
    Perfdata:
    * `members`: the member count.
    * all the roles of the nodes in the cluster with their number.
    * all the statuses of the nodes in the cluster with their number.
    """
    check = nagiosplugin.Check()
    check.add(
@ -336,6 +338,31 @@ def cluster_config_has_changed(
    )
@main.command(name="cluster_is_in_maintenance")
@click.pass_context
@nagiosplugin.guarded
def cluster_is_in_maintenance(ctx: click.Context) -> None:
"""Check if the cluster is in maintenance mode ie paused.
\b
Check:
* `OK`: If the cluster is in maintenance mode.
* `CRITICAL`: otherwise.

    \b
    Perfdata:
    * `is_in_maintenance` is 1 if the cluster is in maintenance mode, 0 otherwise.
    """
    check = nagiosplugin.Check()
    check.add(
        ClusterIsInMaintenance(ctx.obj),
        nagiosplugin.ScalarContext("is_in_maintenance", None, "0:0"),
    )
    check.main(
        verbose=ctx.parent.params["verbose"], timeout=ctx.parent.params["timeout"]
    )
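The `ScalarContext` registered above has no warning range and a critical range of `0:0`, so any metric value outside 0, i.e. a paused cluster, is reported as CRITICAL. A minimal sketch of that mapping (illustrative code, not part of the commit):

```python
# Minimal sketch of how a "0:0" critical range is expected to classify the
# is_in_maintenance metric; names here are illustrative only.
def expected_state(is_in_maintenance: int) -> str:
    # A "0:0" range accepts only values between 0 and 0; anything else
    # violates the critical threshold.
    return "OK" if 0 <= is_in_maintenance <= 0 else "CRITICAL"


assert expected_state(0) == "OK"        # not paused -> exit code 0
assert expected_state(1) == "CRITICAL"  # paused     -> exit code 2
```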
@main.command(name="node_is_primary") @main.command(name="node_is_primary")
@click.pass_context @click.pass_context
@nagiosplugin.guarded @nagiosplugin.guarded
@ -393,7 +420,7 @@ def node_is_pending_restart(ctx: click.Context) -> None:
    """Check if the node is in pending restart state.

    This situation can arise if the configuration has been modified but
-   requiers arestart of PostgreSQL.
+   requires a restart of PostgreSQL to take effect.

    \b
    Check:
@ -427,7 +454,7 @@ def node_is_pending_restart(ctx: click.Context) -> None:
@click.pass_context
@nagiosplugin.guarded
def node_tl_has_changed(ctx: click.Context, timeline: str, state_file: str) -> None:
-   """Check if the timeline hash changed.
+   """Check if the timeline has changed.

    Note: either a timeline or a state file must be provided for this service to work.

View file

@ -21,9 +21,9 @@ class ClusterNodeCount(PatroniResource):
        _log.debug(f"api call data: {r.data}")
        item_dict = json.loads(r.data)

-       role_counters = Counter()
+       role_counters: Counter[str] = Counter()
        roles = []
-       status_counters = Counter()
+       status_counters: Counter[str] = Counter()
        statuses = []

        for member in item_dict["members"]:
@ -159,3 +159,15 @@ class ClusterConfigHasChangedSummary(nagiosplugin.Summary):
        self: "ClusterConfigHasChangedSummary", results: nagiosplugin.Result
    ) -> str:
        return "The hash of patroni's dynamic configuration has changed."

class ClusterIsInMaintenance(PatroniResource):
    def probe(self: "ClusterIsInMaintenance") -> Iterable[nagiosplugin.Metric]:
        r = self.rest_api("cluster")
        _log.debug(f"api call status: {r.status}")
        _log.debug(f"api call data: {r.data}")
        item_dict = json.loads(r.data)

        # A truthy "pause" key in the /cluster payload means the cluster is
        # in maintenance mode.
        return [
            nagiosplugin.Metric(
                "is_in_maintenance",
                1 if "pause" in item_dict and item_dict["pause"] else 0,
            )
        ]
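For clarity, a small self-contained sketch (not part of the commit) of what the probe's `pause` detection evaluates to for payloads shaped like the fixtures added below; only the relevant key is shown:

```python
# Illustrative only: reduce a /cluster payload to the 0/1 metric, using the
# same expression as the probe above.
def is_in_maintenance(item_dict: dict) -> int:
    return 1 if "pause" in item_dict and item_dict["pause"] else 0


assert is_in_maintenance({"members": [], "pause": True}) == 1   # maintenance
assert is_in_maintenance({"members": [], "pause": False}) == 0  # normal
assert is_in_maintenance({"members": []}) == 0                  # key absent
```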

View file

@ -0,0 +1,34 @@
{
"members": [
{
"name": "srv1",
"role": "leader",
"state": "running",
"api_url": "https://10.20.199.3:8008/patroni",
"host": "10.20.199.3",
"port": 5432,
"timeline": 51
},
{
"name": "srv2",
"role": "replica",
"state": "running",
"api_url": "https://10.20.199.4:8008/patroni",
"host": "10.20.199.4",
"port": 5432,
"timeline": 51,
"lag": 0
},
{
"name": "srv3",
"role": "replica",
"state": "running",
"api_url": "https://10.20.199.5:8008/patroni",
"host": "10.20.199.5",
"port": 5432,
"timeline": 51,
"lag": 0
}
],
"pause": true
}

View file

@ -0,0 +1,34 @@
{
"members": [
{
"name": "srv1",
"role": "leader",
"state": "running",
"api_url": "https://10.20.199.3:8008/patroni",
"host": "10.20.199.3",
"port": 5432,
"timeline": 51
},
{
"name": "srv2",
"role": "replica",
"state": "running",
"api_url": "https://10.20.199.4:8008/patroni",
"host": "10.20.199.4",
"port": 5432,
"timeline": 51,
"lag": 0
},
{
"name": "srv3",
"role": "replica",
"state": "running",
"api_url": "https://10.20.199.5:8008/patroni",
"host": "10.20.199.5",
"port": 5432,
"timeline": 51,
"lag": 0
}
],
"pause": false
}

View file

@ -0,0 +1,33 @@
{
"members": [
{
"name": "srv1",
"role": "leader",
"state": "running",
"api_url": "https://10.20.199.3:8008/patroni",
"host": "10.20.199.3",
"port": 5432,
"timeline": 51
},
{
"name": "srv2",
"role": "replica",
"state": "running",
"api_url": "https://10.20.199.4:8008/patroni",
"host": "10.20.199.4",
"port": 5432,
"timeline": 51,
"lag": 0
},
{
"name": "srv3",
"role": "replica",
"state": "running",
"api_url": "https://10.20.199.5:8008/patroni",
"host": "10.20.199.5",
"port": 5432,
"timeline": 51,
"lag": 0
}
]
}

View file

@ -0,0 +1,34 @@
{
"members": [
{
"name": "srv1",
"role": "leader",
"state": "running",
"api_url": "https://10.20.199.3:8008/patroni",
"host": "10.20.199.3",
"port": 5432,
"timeline": 51
},
{
"name": "srv2",
"role": "replica",
"state": "running",
"api_url": "https://10.20.199.4:8008/patroni",
"host": "10.20.199.4",
"port": 5432,
"timeline": 51,
"lag": 0
},
{
"name": "srv3",
"role": "replica",
"state": "running",
"api_url": "https://10.20.199.5:8008/patroni",
"host": "10.20.199.5",
"port": 5432,
"timeline": 51,
"lag": 0
}
],
"pause": false
}

View file

@ -0,0 +1,35 @@
from click.testing import CliRunner
from pytest_mock import MockerFixture

from check_patroni.cli import main
from tools import my_mock


def test_cluster_is_in_maintenance_ok(mocker: MockerFixture) -> None:
    runner = CliRunner()
    my_mock(mocker, "cluster_is_in_maintenance_ok", 200)
    result = runner.invoke(
        main, ["-e", "https://10.20.199.3:8008", "cluster_is_in_maintenance"]
    )
    assert result.exit_code == 0


def test_cluster_is_in_maintenance_ko(mocker: MockerFixture) -> None:
    runner = CliRunner()
    my_mock(mocker, "cluster_is_in_maintenance_ko", 200)
    result = runner.invoke(
        main, ["-e", "https://10.20.199.3:8008", "cluster_is_in_maintenance"]
    )
    assert result.exit_code == 2


def test_cluster_is_in_maintenance_ok_pause_false(mocker: MockerFixture) -> None:
    runner = CliRunner()
    my_mock(mocker, "cluster_is_in_maintenance_ok_pause_false", 200)
    result = runner.invoke(
        main, ["-e", "https://10.20.199.3:8008", "cluster_is_in_maintenance"]
    )
    assert result.exit_code == 0
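`my_mock` comes from the project's test helpers and is not shown in this diff; presumably it patches the Patroni REST call so that the named JSON fixture above is returned with the given HTTP status. A rough, hypothetical sketch of that idea — the patch target, fixture path, and response shape are assumptions, not the project's actual helper:

```python
# Hypothetical sketch of a fixture-serving mock, NOT the project's my_mock.
import pathlib
from unittest.mock import MagicMock

from pytest_mock import MockerFixture


def fake_rest_api(mocker: MockerFixture, fixture_name: str, status: int) -> None:
    response = MagicMock(status=status)
    # Assumed fixture location; the real helper may resolve paths differently.
    response.data = pathlib.Path(f"tests/{fixture_name}.json").read_bytes()
    # Assumed patch target; the probes call PatroniResource.rest_api().
    mocker.patch(
        "check_patroni.types.PatroniResource.rest_api", return_value=response
    )
```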