Add new service cluster_has_scheduled_action

This commit is contained in:
benoit 2023-08-22 17:29:13 +02:00 committed by Benoit
parent 7f6a03a3cc
commit 99bf1c5bb5
8 changed files with 196 additions and 0 deletions

View file

@ -5,6 +5,7 @@
### Added
* Add `sync_standby` as a valid replica type for `cluster_has_replica`. (contributed by @mattpoel)
* Add a new service `cluster_has_scheduled_action` to warn of any scheduled switchover or restart.
### Fixed

View file

@ -13,6 +13,7 @@ from .cluster import (
ClusterHasLeader,
ClusterHasLeaderSummary,
ClusterHasReplica,
ClusterHasScheduledAction,
ClusterIsInMaintenance,
ClusterNodeCount,
)
@ -436,6 +437,33 @@ def cluster_is_in_maintenance(ctx: click.Context) -> None:
check.main(verbose=ctx.obj.verbose, timeout=ctx.obj.timeout)
@main.command(name="cluster_has_scheduled_action")
@click.pass_context
@nagiosplugin.guarded
def cluster_has_scheduled_action(ctx: click.Context) -> None:
    """Check if the cluster has a scheduled action (switchover or restart)

    \b
    Check:
    * `OK`: If the cluster has no scheduled action
    * `CRITICAL`: otherwise.

    \b
    Perfdata:
    * `has_scheduled_actions` is 1 if the cluster has scheduled actions.
    * `scheduled_switchover` is 1 if the cluster has a scheduled switchover.
    * `scheduled_restart` counts the number of scheduled restarts in the cluster.
    """
    check = nagiosplugin.Check()
    check.add(
        ClusterHasScheduledAction(ctx.obj.connection_info),
        # No warning range; the critical range "0:0" makes any value other
        # than 0 (i.e. at least one scheduled action) a CRITICAL result.
        nagiosplugin.ScalarContext("has_scheduled_actions", None, "0:0"),
        # The two contexts below carry no thresholds: they only expose perfdata.
        nagiosplugin.ScalarContext("scheduled_switchover"),
        nagiosplugin.ScalarContext("scheduled_restart"),
    )
    check.main(verbose=ctx.obj.verbose, timeout=ctx.obj.timeout)
@main.command(name="node_is_primary")
@click.pass_context
@nagiosplugin.guarded

View file

@ -191,3 +191,27 @@ class ClusterIsInMaintenance(PatroniResource):
1 if "pause" in item_dict and item_dict["pause"] else 0,
)
]
class ClusterHasScheduledAction(PatroniResource):
    """Resource reporting the scheduled actions listed by the `cluster` endpoint.

    Two kinds of scheduled actions are looked for: a `scheduled_switchover`
    key at the top level of the response, and a `scheduled_restart` key on
    each entry of the `members` list.
    """

    def probe(self: "ClusterHasScheduledAction") -> Iterable[nagiosplugin.Metric]:
        """Yield the check metric followed by the two perfdata metrics.

        Yields:
            `has_scheduled_actions`: 1 if any switchover or restart is
            scheduled, 0 otherwise.
            `scheduled_switchover`: 1 if the response has a
            `scheduled_switchover` key, 0 otherwise.
            `scheduled_restart`: number of members having a
            `scheduled_restart` key.

        Raises:
            KeyError: if the response has no `members` key. This is left to
                propagate on purpose so that an unexpected response is not
                silently reported as "no scheduled action".
        """
        item_dict = self.rest_api("cluster")

        # Only the presence of the key is tested, not its content.
        scheduled_switchover = 1 if "scheduled_switchover" in item_dict else 0
        # Restarts are scheduled per member, so they are counted.
        scheduled_restart = sum(
            1 for member in item_dict["members"] if "scheduled_restart" in member
        )

        # The actual check
        yield nagiosplugin.Metric(
            "has_scheduled_actions",
            1 if (scheduled_switchover + scheduled_restart) > 0 else 0,
        )

        # The performance data: scheduled switchover flag, scheduled restart count
        yield nagiosplugin.Metric("scheduled_switchover", scheduled_switchover)
        yield nagiosplugin.Metric("scheduled_restart", scheduled_restart)

View file

@ -0,0 +1,27 @@
{
"members": [
{
"name": "p1",
"role": "sync_standby",
"state": "streaming",
"api_url": "http://10.20.30.51:8008/patroni",
"host": "10.20.30.51",
"port": 5432,
"timeline": 3,
"scheduled_restart": {
"schedule": "2023-10-08T11:30:00+00:00",
"postmaster_start_time": "2023-08-21 08:08:33.415237+00:00"
},
"lag": 0
},
{
"name": "p2",
"role": "leader",
"state": "running",
"api_url": "http://10.20.30.52:8008/patroni",
"host": "10.20.30.52",
"port": 5432,
"timeline": 3
}
]
}

View file

@ -0,0 +1,28 @@
{
"members": [
{
"name": "p1",
"role": "sync_standby",
"state": "streaming",
"api_url": "http://10.20.30.51:8008/patroni",
"host": "10.20.30.51",
"port": 5432,
"timeline": 3,
"lag": 0
},
{
"name": "p2",
"role": "leader",
"state": "running",
"api_url": "http://10.20.30.52:8008/patroni",
"host": "10.20.30.52",
"port": 5432,
"timeline": 3
}
],
"scheduled_switchover": {
"at": "2023-10-08T11:30:00+00:00",
"from": "p1",
"to": "p2"
}
}

View file

@ -0,0 +1,33 @@
{
"members": [
{
"name": "srv1",
"role": "leader",
"state": "running",
"api_url": "https://10.20.199.3:8008/patroni",
"host": "10.20.199.3",
"port": 5432,
"timeline": 51
},
{
"name": "srv2",
"role": "replica",
"state": "streaming",
"api_url": "https://10.20.199.4:8008/patroni",
"host": "10.20.199.4",
"port": 5432,
"timeline": 51,
"lag": 0
},
{
"name": "srv3",
"role": "sync_standby",
"state": "streaming",
"api_url": "https://10.20.199.5:8008/patroni",
"host": "10.20.199.5",
"port": 5432,
"timeline": 51,
"lag": 0
}
]
}

View file

@ -0,0 +1,54 @@
from click.testing import CliRunner
from pytest_mock import MockerFixture
from check_patroni.cli import main
from .tools import my_mock
def test_cluster_has_scheduled_action_ok(
    mocker: MockerFixture, use_old_replica_state: bool
) -> None:
    """The service exits 0 (OK) with the `cluster_has_scheduled_action_ok` mock."""
    cli = CliRunner()
    my_mock(mocker, "cluster_has_scheduled_action_ok", 200)
    cli_args = ["-e", "https://10.20.199.3:8008", "cluster_has_scheduled_action"]
    expected_stdout = "CLUSTERHASSCHEDULEDACTION OK - has_scheduled_actions is 0 | has_scheduled_actions=0;;0 scheduled_restart=0 scheduled_switchover=0\n"

    outcome = cli.invoke(main, cli_args)

    # Exit code 0 goes with the OK status printed on stdout.
    assert outcome.exit_code == 0
    assert outcome.stdout == expected_stdout
def test_cluster_has_scheduled_action_ko_switchover(
    mocker: MockerFixture, use_old_replica_state: bool
) -> None:
    """The service exits 2 (CRITICAL) with the `..._ko_switchover` mock."""
    cli = CliRunner()
    my_mock(mocker, "cluster_has_scheduled_action_ko_switchover", 200)
    cli_args = ["-e", "https://10.20.199.3:8008", "cluster_has_scheduled_action"]
    expected_stdout = "CLUSTERHASSCHEDULEDACTION CRITICAL - has_scheduled_actions is 1 (outside range 0:0) | has_scheduled_actions=1;;0 scheduled_restart=0 scheduled_switchover=1\n"

    outcome = cli.invoke(main, cli_args)

    # Exit code 2 goes with the CRITICAL status printed on stdout.
    assert outcome.exit_code == 2
    assert outcome.stdout == expected_stdout
def test_cluster_has_scheduled_action_ko_restart(
    mocker: MockerFixture, use_old_replica_state: bool
) -> None:
    """The service exits 2 (CRITICAL) with the `..._ko_restart` mock."""
    cli = CliRunner()
    my_mock(mocker, "cluster_has_scheduled_action_ko_restart", 200)
    cli_args = ["-e", "https://10.20.199.3:8008", "cluster_has_scheduled_action"]
    expected_stdout = "CLUSTERHASSCHEDULEDACTION CRITICAL - has_scheduled_actions is 1 (outside range 0:0) | has_scheduled_actions=1;;0 scheduled_restart=1 scheduled_switchover=0\n"

    outcome = cli.invoke(main, cli_args)

    # Exit code 2 goes with the CRITICAL status printed on stdout.
    assert outcome.exit_code == 2
    assert outcome.stdout == expected_stdout

View file

@ -12,6 +12,7 @@ check_patroni -e "$1" cluster_config_has_changed --state-file cluster.sate_file
check_patroni -e "$1" cluster_has_leader
check_patroni -e "$1" cluster_has_replica
check_patroni -e "$1" cluster_is_in_maintenance
check_patroni -e "$1" cluster_has_scheduled_action
check_patroni -e "$1" cluster_node_count
echo "-- Node checks"
check_patroni -e "$1" node_is_alive