2021-08-11 19:09:14 +02:00
|
|
|
import hashlib
|
|
|
|
import json
|
2022-02-07 15:11:05 +01:00
|
|
|
from collections import Counter
|
2023-09-27 16:37:40 +02:00
|
|
|
from typing import Any, Iterable, Union
|
2022-02-07 15:11:05 +01:00
|
|
|
|
2021-08-11 19:09:14 +02:00
|
|
|
import nagiosplugin
|
|
|
|
|
2023-03-16 13:11:40 +01:00
|
|
|
from . import _log
|
2023-03-20 11:44:19 +01:00
|
|
|
from .types import ConnectionInfo, PatroniResource, handle_unknown
|
2021-08-11 19:09:14 +02:00
|
|
|
|
|
|
|
|
|
|
|
def replace_chars(text: str) -> str:
    """Sanitize *text* for use in a metric name.

    Single quotes are dropped and spaces become underscores, in one
    C-level pass over the string.
    """
    return text.translate(str.maketrans({"'": None, " ": "_"}))
|
|
|
|
|
|
|
|
|
|
|
|
class ClusterNodeCount(PatroniResource):
    """Count the cluster's members and how many of them are healthy.

    Yields the ``members`` and ``healthy_members`` metrics for the actual
    check, plus per-role and per-state counters as performance data.
    """

    def probe(self) -> Iterable[nagiosplugin.Metric]:
        def debug_member(member: Any, health: str) -> None:
            # Trace why each member was classified healthy / unhealthy.
            _log.debug(
                "Node %(node_name)s is %(health)s: role %(role)s state %(state)s.",
                {
                    "node_name": member["name"],
                    "health": health,
                    "role": member["role"],
                    "state": member["state"],
                },
            )

        # get the cluster info
        item_dict = self.rest_api("cluster")

        role_counters: Counter[str] = Counter()
        status_counters: Counter[str] = Counter()
        healthy_member = 0

        # has_detailed_states() is loop-invariant: hoist it so it is
        # evaluated once instead of up to twice per member. For
        # patroni >= 3.0.4 a healthy standby reports "streaming";
        # older versions only report "running".
        healthy_standby_state = (
            "streaming" if self.has_detailed_states() else "running"
        )

        for member in item_dict["members"]:
            state, role = member["state"], member["role"]
            # Count roles / states directly instead of buffering them in
            # intermediate lists and updating the Counters afterwards.
            role_counters[replace_chars(role)] += 1
            status_counters[replace_chars(state)] += 1

            if role == "leader" and state == "running":
                healthy_member += 1
                debug_member(member, "healthy")
                continue

            if (
                role in ["standby_leader", "replica", "sync_standby"]
                and state == healthy_standby_state
            ):
                healthy_member += 1
                debug_member(member, "healthy")
                continue

            debug_member(member, "unhealthy")

        # The actual check: members, healthy_members
        yield nagiosplugin.Metric("members", len(item_dict["members"]))
        yield nagiosplugin.Metric("healthy_members", healthy_member)

        # The performance data : role
        for role in role_counters:
            yield nagiosplugin.Metric(
                f"role_{role}", role_counters[role], context="member_roles"
            )

        # The performance data : statuses (except running)
        for state in status_counters:
            yield nagiosplugin.Metric(
                f"state_{state}", status_counters[state], context="member_statuses"
            )
|
2021-08-11 19:09:14 +02:00
|
|
|
|
|
|
|
|
|
|
|
class ClusterHasLeader(PatroniResource):
    """Check that the cluster has a leader (regular or standby leader)."""

    def probe(self) -> Iterable[nagiosplugin.Metric]:
        cluster = self.rest_api("cluster")

        is_leader_found = False
        is_standby_leader_found = False
        is_standby_leader_in_arc_rec = False

        for member in cluster["members"]:
            role, state = member["role"], member["state"]

            # A regular running primary: nothing more to look for.
            if role == "leader" and state == "running":
                is_leader_found = True
                break

            if role != "standby_leader":
                continue

            if state not in ["streaming", "in archive recovery"]:
                # for patroni >= 3.0.4 any other state would be wrong
                # for patroni < 3.0.4 a state different from running would be wrong
                if self.has_detailed_states() or state != "running":
                    continue

            if state == "in archive recovery":
                # Surfaced as its own metric.
                is_standby_leader_in_arc_rec = True

            is_standby_leader_found = True
            break

        return [
            nagiosplugin.Metric(
                "has_leader",
                1 if is_leader_found or is_standby_leader_found else 0,
            ),
            nagiosplugin.Metric(
                "is_standby_leader_in_arc_rec",
                1 if is_standby_leader_in_arc_rec else 0,
            ),
            nagiosplugin.Metric(
                "is_standby_leader",
                1 if is_standby_leader_found else 0,
            ),
            nagiosplugin.Metric(
                "is_leader",
                1 if is_leader_found else 0,
            ),
        ]
|
|
|
|
|
|
|
|
|
|
|
|
class ClusterHasLeaderSummary(nagiosplugin.Summary):
    """Render the human-readable outcome of the ClusterHasLeader check."""

    def ok(self, results: nagiosplugin.Result) -> str:
        """Message shown when a (standby) leader was found."""
        return "The cluster has a running leader."

    @handle_unknown
    def problem(self, results: nagiosplugin.Result) -> str:
        """Message shown when no healthy leader was found."""
        return "The cluster has no running leader or the standby leader is in archive recovery."
|
2021-08-11 19:09:14 +02:00
|
|
|
|
|
|
|
|
|
|
|
class ClusterHasReplica(PatroniResource):
    """Check the cluster's replicas.

    A replica is considered healthy when it is replicating (state
    "streaming" / "in archive recovery" for patroni >= 3.0.4, "running"
    before that), is on the leader's timeline and, when ``max_lag`` is
    given, lags by at most that amount.
    """

    def __init__(self, connection_info: ConnectionInfo, max_lag: Union[int, None]):
        super().__init__(connection_info)
        # Maximum tolerated replication lag; None disables the lag check.
        self.max_lag = max_lag

    def probe(self) -> Iterable[nagiosplugin.Metric]:
        def debug_member(member: Any, health: str) -> None:
            # Trace why each replica was classified healthy / unhealthy.
            _log.debug(
                "Node %(node_name)s is %(health)s: lag %(lag)s, state %(state)s, tl %(tl)s.",
                {
                    "node_name": member["name"],
                    "health": health,
                    "lag": member["lag"],
                    "state": member["state"],
                    "tl": member["timeline"],
                },
            )

        # get the cluster info
        cluster_item_dict = self.rest_api("cluster")

        replicas = []
        healthy_replica = 0
        unhealthy_replica = 0
        sync_replica = 0
        leader_tl = None
        # The set of healthy replication states depends on the patroni
        # version; resolved lazily (and only once) so we don't query it
        # when there is no replica to examine.
        healthy_states = None

        # Look for replicas
        for member in cluster_item_dict["members"]:
            if member["role"] in ["replica", "sync_standby"]:
                if member["lag"] == "unknown":
                    # This could happen if the node is stopped
                    # nagiosplugin doesn't handle strings in perfstats
                    # so we have to ditch all the stats in that case
                    debug_member(member, "unhealthy")
                    unhealthy_replica += 1
                    continue
                else:
                    replicas.append(
                        {
                            "name": member["name"],
                            "lag": member["lag"],
                            "timeline": member["timeline"],
                            "sync": 1 if member["role"] == "sync_standby" else 0,
                        }
                    )

                # Get the leader tl if we haven't already
                if leader_tl is None:
                    # If there are no leaders, we will loop here for all
                    # members because leader_tl will remain None. it's not
                    # a big deal since having no leader is rare.
                    for tmember in cluster_item_dict["members"]:
                        if tmember["role"] == "leader":
                            leader_tl = int(tmember["timeline"])
                            break

                    _log.debug(
                        "Patroni's leader_timeline is %(leader_tl)s",
                        {
                            "leader_tl": leader_tl,
                        },
                    )

                if healthy_states is None:
                    # Resolved once: the previous condition evaluated
                    # has_detailed_states() up to twice per replica.
                    healthy_states = (
                        ["streaming", "in archive recovery"]
                        if self.has_detailed_states()
                        else ["running"]
                    )

                # Test for an unhealthy replica: wrong state, or not on
                # the leader's timeline.
                if not (
                    member["state"] in healthy_states
                    and int(member["timeline"]) == leader_tl
                ):
                    debug_member(member, "unhealthy")
                    unhealthy_replica += 1
                    continue

                if member["role"] == "sync_standby":
                    sync_replica += 1

                if self.max_lag is None or self.max_lag >= int(member["lag"]):
                    debug_member(member, "healthy")
                    healthy_replica += 1
                else:
                    debug_member(member, "unhealthy")
                    unhealthy_replica += 1

        # The actual check
        yield nagiosplugin.Metric("healthy_replica", healthy_replica)
        yield nagiosplugin.Metric("sync_replica", sync_replica)

        # The performance data : unhealthy replica count, replicas lag
        yield nagiosplugin.Metric("unhealthy_replica", unhealthy_replica)
        for replica in replicas:
            yield nagiosplugin.Metric(
                f"{replica['name']}_lag", replica["lag"], context="replica_lag"
            )
            yield nagiosplugin.Metric(
                f"{replica['name']}_timeline",
                replica["timeline"],
                context="replica_timeline",
            )
            yield nagiosplugin.Metric(
                f"{replica['name']}_sync", replica["sync"], context="replica_sync"
            )
|
2021-08-11 19:09:14 +02:00
|
|
|
|
|
|
|
|
|
|
|
# FIXME is this needed ??
|
|
|
|
# class ClusterHasReplicaSummary(nagiosplugin.Summary):
|
|
|
|
# def ok(self, results):
|
|
|
|
# def problem(self, results):
|
|
|
|
|
|
|
|
|
|
|
|
class ClusterConfigHasChanged(PatroniResource):
    """Detect a change of patroni's dynamic configuration via its hash."""

    def __init__(
        self,
        connection_info: ConnectionInfo,
        config_hash: str,  # Always contains the old hash
        state_file: str,  # Only used to update the hash in the state_file (when needed)
        save: bool = False,  # Save the configuration
    ):
        super().__init__(connection_info)
        self.state_file = state_file
        self.config_hash = config_hash
        self.save = save

    def probe(self) -> Iterable[nagiosplugin.Metric]:
        # Fingerprint the dynamic configuration returned by the REST API.
        # (md5 is used as a change detector here, not for security.)
        config = self.rest_api("config")
        new_hash = hashlib.md5(json.dumps(config).encode()).hexdigest()

        _log.debug("save result: %(issave)s", {"issave": self.save})
        old_hash = self.config_hash

        if self.save and self.state_file is not None:
            _log.debug(
                "saving new hash to state file / cookie %(state_file)s",
                {"state_file": self.state_file},
            )
            cookie = nagiosplugin.Cookie(self.state_file)
            cookie.open()
            cookie["hash"] = new_hash
            cookie.commit()
            cookie.close()

        _log.debug(
            "hash info: old hash %(old_hash)s, new hash %(new_hash)s",
            {"old_hash": old_hash, "new_hash": new_hash},
        )

        return [
            nagiosplugin.Metric(
                "is_configuration_changed",
                0 if new_hash == old_hash else 1,
            )
        ]
|
|
|
|
|
|
|
|
|
|
|
|
class ClusterConfigHasChangedSummary(nagiosplugin.Summary):
    """Render the human-readable outcome of the ClusterConfigHasChanged check."""

    def __init__(self, config_hash: str) -> None:
        # The expected ("old") hash supplied by the caller, displayed in
        # both the ok and problem messages.
        self.old_config_hash = config_hash

    # Note: It would be helpful to display the old / new hash here. Unfortunately, it's not a metric.
    # So we only have the old / expected one.
    def ok(self, results: nagiosplugin.Result) -> str:
        """Message shown when the configuration hash is unchanged."""
        return f"The hash of patroni's dynamic configuration has not changed ({self.old_config_hash})."

    @handle_unknown
    def problem(self, results: nagiosplugin.Result) -> str:
        """Message shown when the configuration hash differs."""
        return f"The hash of patroni's dynamic configuration has changed. The old hash was {self.old_config_hash}."
|
2021-08-12 12:48:55 +02:00
|
|
|
|
|
|
|
|
|
|
|
class ClusterIsInMaintenance(PatroniResource):
    """Report whether the cluster is paused (maintenance mode)."""

    def probe(self) -> Iterable[nagiosplugin.Metric]:
        cluster = self.rest_api("cluster")

        # The actual check: the API may omit "pause" entirely, and when
        # present its value may still be falsy.
        in_maintenance = bool(cluster.get("pause", False))
        return [
            nagiosplugin.Metric(
                "is_in_maintenance",
                1 if in_maintenance else 0,
            )
        ]
|
2023-08-22 17:29:13 +02:00
|
|
|
|
|
|
|
|
|
|
|
class ClusterHasScheduledAction(PatroniResource):
    """Report any scheduled switchover or scheduled member restarts."""

    def probe(self) -> Iterable[nagiosplugin.Metric]:
        cluster = self.rest_api("cluster")

        # A switchover is scheduled cluster-wide ...
        scheduled_switchover = 1 if "scheduled_switchover" in cluster else 0
        # ... while restarts are scheduled per member.
        scheduled_restart = sum(
            1 for member in cluster["members"] if "scheduled_restart" in member
        )

        # The actual check
        yield nagiosplugin.Metric(
            "has_scheduled_actions",
            1 if scheduled_switchover or scheduled_restart else 0,
        )

        # The performance data : scheduled_switchover, scheduled action count
        yield nagiosplugin.Metric("scheduled_switchover", scheduled_switchover)
        yield nagiosplugin.Metric("scheduled_restart", scheduled_restart)
|