Add new service cluster_has_scheduled_action
This commit is contained in:
parent
7f6a03a3cc
commit
99bf1c5bb5
|
@ -5,6 +5,7 @@
|
||||||
### Added
|
### Added
|
||||||
|
|
||||||
* Add `sync_standby` as a valid replica type for `cluster_has_replica`. (contributed by @mattpoel)
|
* Add `sync_standby` as a valid replica type for `cluster_has_replica`. (contributed by @mattpoel)
|
||||||
|
* Add a new service `cluster_has_scheduled_action` that raises a `CRITICAL` alert when a switchover or a restart is scheduled in the cluster.
|
||||||
|
|
||||||
### Fixed
|
### Fixed
|
||||||
|
|
||||||
|
|
|
@ -13,6 +13,7 @@ from .cluster import (
|
||||||
ClusterHasLeader,
|
ClusterHasLeader,
|
||||||
ClusterHasLeaderSummary,
|
ClusterHasLeaderSummary,
|
||||||
ClusterHasReplica,
|
ClusterHasReplica,
|
||||||
|
ClusterHasScheduledAction,
|
||||||
ClusterIsInMaintenance,
|
ClusterIsInMaintenance,
|
||||||
ClusterNodeCount,
|
ClusterNodeCount,
|
||||||
)
|
)
|
||||||
|
@ -436,6 +437,33 @@ def cluster_is_in_maintenance(ctx: click.Context) -> None:
|
||||||
check.main(verbose=ctx.obj.verbose, timeout=ctx.obj.timeout)
|
check.main(verbose=ctx.obj.verbose, timeout=ctx.obj.timeout)
|
||||||
|
|
||||||
|
|
||||||
|
@main.command(name="cluster_has_scheduled_action")
@click.pass_context
@nagiosplugin.guarded
def cluster_has_scheduled_action(ctx: click.Context) -> None:
    """Check if the cluster has a scheduled action (switchover or restart)

    \b
    Check:
    * `OK`: If the cluster has no scheduled action
    * `CRITICAL`: otherwise.

    \b
    Perfdata:
    * `has_scheduled_actions` is 1 if the cluster has scheduled actions.
    * `scheduled_switchover` is 1 if the cluster has a scheduled switchover.
    * `scheduled_restart` counts the number of scheduled restarts in the cluster.
    """
    check = nagiosplugin.Check()
    check.add(
        ClusterHasScheduledAction(ctx.obj.connection_info),
        # No warning range; critical range "0:0" means any value other than 0
        # (i.e. at least one scheduled action) is reported as CRITICAL.
        nagiosplugin.ScalarContext("has_scheduled_actions", None, "0:0"),
        # The two contexts below carry no thresholds: they only expose the
        # metrics as performance data.
        nagiosplugin.ScalarContext("scheduled_switchover"),
        nagiosplugin.ScalarContext("scheduled_restart"),
    )
    check.main(verbose=ctx.obj.verbose, timeout=ctx.obj.timeout)
|
||||||
|
|
||||||
|
|
||||||
@main.command(name="node_is_primary")
|
@main.command(name="node_is_primary")
|
||||||
@click.pass_context
|
@click.pass_context
|
||||||
@nagiosplugin.guarded
|
@nagiosplugin.guarded
|
||||||
|
|
|
@ -191,3 +191,27 @@ class ClusterIsInMaintenance(PatroniResource):
|
||||||
1 if "pause" in item_dict and item_dict["pause"] else 0,
|
1 if "pause" in item_dict and item_dict["pause"] else 0,
|
||||||
)
|
)
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
|
class ClusterHasScheduledAction(PatroniResource):
    """Resource reporting whether a switchover or a restart is scheduled."""

    def probe(self: "ClusterHasScheduledAction") -> Iterable[nagiosplugin.Metric]:
        """Yield the metrics describing the scheduled actions of the cluster.

        The payload returned by ``self.rest_api("cluster")`` is expected to
        hold a ``members`` list and, when a switchover is planned, a top-level
        ``scheduled_switchover`` key; a member with a planned restart carries
        a ``scheduled_restart`` key.

        Yields:
            * ``has_scheduled_actions``: 1 if any action is scheduled, else 0.
            * ``scheduled_switchover``: 1 if a switchover is scheduled, else 0.
            * ``scheduled_restart``: number of members with a scheduled restart.

        Raises:
            KeyError: if the payload has no ``members`` key.
        """
        item_dict = self.rest_api("cluster")

        # Only the presence of the keys matters, not their content.
        scheduled_switchover = int("scheduled_switchover" in item_dict)
        scheduled_restart = sum(
            1 for member in item_dict["members"] if "scheduled_restart" in member
        )

        # The actual check
        yield nagiosplugin.Metric(
            "has_scheduled_actions",
            1 if (scheduled_switchover + scheduled_restart) > 0 else 0,
        )

        # The performance data: scheduled switchover flag, scheduled restart count
        yield nagiosplugin.Metric("scheduled_switchover", scheduled_switchover)
        yield nagiosplugin.Metric("scheduled_restart", scheduled_restart)
|
||||||
|
|
27
tests/json/cluster_has_scheduled_action_ko_restart.json
Normal file
27
tests/json/cluster_has_scheduled_action_ko_restart.json
Normal file
|
@ -0,0 +1,27 @@
|
||||||
|
{
|
||||||
|
"members": [
|
||||||
|
{
|
||||||
|
"name": "p1",
|
||||||
|
"role": "sync_standby",
|
||||||
|
"state": "streaming",
|
||||||
|
"api_url": "http://10.20.30.51:8008/patroni",
|
||||||
|
"host": "10.20.30.51",
|
||||||
|
"port": 5432,
|
||||||
|
"timeline": 3,
|
||||||
|
"scheduled_restart": {
|
||||||
|
"schedule": "2023-10-08T11:30:00+00:00",
|
||||||
|
"postmaster_start_time": "2023-08-21 08:08:33.415237+00:00"
|
||||||
|
},
|
||||||
|
"lag": 0
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "p2",
|
||||||
|
"role": "leader",
|
||||||
|
"state": "running",
|
||||||
|
"api_url": "http://10.20.30.52:8008/patroni",
|
||||||
|
"host": "10.20.30.52",
|
||||||
|
"port": 5432,
|
||||||
|
"timeline": 3
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
28
tests/json/cluster_has_scheduled_action_ko_switchover.json
Normal file
28
tests/json/cluster_has_scheduled_action_ko_switchover.json
Normal file
|
@ -0,0 +1,28 @@
|
||||||
|
{
|
||||||
|
"members": [
|
||||||
|
{
|
||||||
|
"name": "p1",
|
||||||
|
"role": "sync_standby",
|
||||||
|
"state": "streaming",
|
||||||
|
"api_url": "http://10.20.30.51:8008/patroni",
|
||||||
|
"host": "10.20.30.51",
|
||||||
|
"port": 5432,
|
||||||
|
"timeline": 3,
|
||||||
|
"lag": 0
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "p2",
|
||||||
|
"role": "leader",
|
||||||
|
"state": "running",
|
||||||
|
"api_url": "http://10.20.30.52:8008/patroni",
|
||||||
|
"host": "10.20.30.52",
|
||||||
|
"port": 5432,
|
||||||
|
"timeline": 3
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"scheduled_switchover": {
|
||||||
|
"at": "2023-10-08T11:30:00+00:00",
|
||||||
|
"from": "p1",
|
||||||
|
"to": "p2"
|
||||||
|
}
|
||||||
|
}
|
33
tests/json/cluster_has_scheduled_action_ok.json
Normal file
33
tests/json/cluster_has_scheduled_action_ok.json
Normal file
|
@ -0,0 +1,33 @@
|
||||||
|
{
|
||||||
|
"members": [
|
||||||
|
{
|
||||||
|
"name": "srv1",
|
||||||
|
"role": "leader",
|
||||||
|
"state": "running",
|
||||||
|
"api_url": "https://10.20.199.3:8008/patroni",
|
||||||
|
"host": "10.20.199.3",
|
||||||
|
"port": 5432,
|
||||||
|
"timeline": 51
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "srv2",
|
||||||
|
"role": "replica",
|
||||||
|
"state": "streaming",
|
||||||
|
"api_url": "https://10.20.199.4:8008/patroni",
|
||||||
|
"host": "10.20.199.4",
|
||||||
|
"port": 5432,
|
||||||
|
"timeline": 51,
|
||||||
|
"lag": 0
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "srv3",
|
||||||
|
"role": "sync_standby",
|
||||||
|
"state": "streaming",
|
||||||
|
"api_url": "https://10.20.199.5:8008/patroni",
|
||||||
|
"host": "10.20.199.5",
|
||||||
|
"port": 5432,
|
||||||
|
"timeline": 51,
|
||||||
|
"lag": 0
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
54
tests/test_cluster_has_scheduled_action.py
Normal file
54
tests/test_cluster_has_scheduled_action.py
Normal file
|
@ -0,0 +1,54 @@
|
||||||
|
from click.testing import CliRunner
|
||||||
|
from pytest_mock import MockerFixture
|
||||||
|
|
||||||
|
from check_patroni.cli import main
|
||||||
|
|
||||||
|
from .tools import my_mock
|
||||||
|
|
||||||
|
|
||||||
|
def test_cluster_has_scheduled_action_ok(
    mocker: MockerFixture, use_old_replica_state: bool
) -> None:
    """The service exits with OK (0) when no action is scheduled."""
    # NOTE(review): `use_old_replica_state` is not read in this body; it is
    # presumably requested for its fixture side effects -- confirm in conftest.
    cli = CliRunner()
    my_mock(mocker, "cluster_has_scheduled_action_ok", 200)

    argv = ["-e", "https://10.20.199.3:8008", "cluster_has_scheduled_action"]
    outcome = cli.invoke(main, argv)

    expected_output = "CLUSTERHASSCHEDULEDACTION OK - has_scheduled_actions is 0 | has_scheduled_actions=0;;0 scheduled_restart=0 scheduled_switchover=0\n"
    assert outcome.exit_code == 0
    assert outcome.stdout == expected_output
|
||||||
|
|
||||||
|
|
||||||
|
def test_cluster_has_scheduled_action_ko_switchover(
    mocker: MockerFixture, use_old_replica_state: bool
) -> None:
    """The service exits with CRITICAL (2) when a switchover is scheduled."""
    # NOTE(review): `use_old_replica_state` is not read in this body; it is
    # presumably requested for its fixture side effects -- confirm in conftest.
    cli = CliRunner()
    my_mock(mocker, "cluster_has_scheduled_action_ko_switchover", 200)

    argv = ["-e", "https://10.20.199.3:8008", "cluster_has_scheduled_action"]
    outcome = cli.invoke(main, argv)

    expected_output = "CLUSTERHASSCHEDULEDACTION CRITICAL - has_scheduled_actions is 1 (outside range 0:0) | has_scheduled_actions=1;;0 scheduled_restart=0 scheduled_switchover=1\n"
    assert outcome.exit_code == 2
    assert outcome.stdout == expected_output
|
||||||
|
|
||||||
|
|
||||||
|
def test_cluster_has_scheduled_action_ko_restart(
    mocker: MockerFixture, use_old_replica_state: bool
) -> None:
    """The service exits with CRITICAL (2) when a member restart is scheduled."""
    # NOTE(review): `use_old_replica_state` is not read in this body; it is
    # presumably requested for its fixture side effects -- confirm in conftest.
    cli = CliRunner()
    my_mock(mocker, "cluster_has_scheduled_action_ko_restart", 200)

    argv = ["-e", "https://10.20.199.3:8008", "cluster_has_scheduled_action"]
    outcome = cli.invoke(main, argv)

    expected_output = "CLUSTERHASSCHEDULEDACTION CRITICAL - has_scheduled_actions is 1 (outside range 0:0) | has_scheduled_actions=1;;0 scheduled_restart=1 scheduled_switchover=0\n"
    assert outcome.exit_code == 2
    assert outcome.stdout == expected_output
|
|
@ -12,6 +12,7 @@ check_patroni -e "$1" cluster_config_has_changed --state-file cluster.sate_file
|
||||||
check_patroni -e "$1" cluster_has_leader
|
check_patroni -e "$1" cluster_has_leader
|
||||||
check_patroni -e "$1" cluster_has_replica
|
check_patroni -e "$1" cluster_has_replica
|
||||||
check_patroni -e "$1" cluster_is_in_maintenance
|
check_patroni -e "$1" cluster_is_in_maintenance
|
||||||
|
check_patroni -e "$1" cluster_has_scheduled_action
|
||||||
check_patroni -e "$1" cluster_node_count
|
check_patroni -e "$1" cluster_node_count
|
||||||
echo "-- Node checks"
|
echo "-- Node checks"
|
||||||
check_patroni -e "$1" node_is_alive
|
check_patroni -e "$1" node_is_alive
|
||||||
|
|
Loading…
Reference in a new issue