check-patroni/check_patroni/cluster.py

194 lines
6.7 KiB
Python
Raw Normal View History

2021-08-11 19:09:14 +02:00
import hashlib
import json
from collections import Counter
from typing import Iterable, Union
2021-08-11 19:09:14 +02:00
import nagiosplugin
from . import _log
from .types import ConnectionInfo, PatroniResource, handle_unknown
2021-08-11 19:09:14 +02:00
def replace_chars(text: str) -> str:
    """Return *text* with single quotes removed and spaces replaced by underscores."""
    without_quotes = text.replace("'", "")
    return without_quotes.replace(" ", "_")
class ClusterNodeCount(PatroniResource):
    """Resource reporting how many members the cluster has, per role and state."""

    def probe(self: "ClusterNodeCount") -> Iterable[nagiosplugin.Metric]:
        """Yield member counts built from the ``cluster`` REST API answer.

        Yields, in order:

        * ``members``: the total number of entries in ``members``;
        * ``healthy_members``: members whose state is ``running`` or
          ``streaming``;
        * one ``role_<role>`` metric per distinct role (context
          ``member_roles``);
        * one ``state_<state>`` metric per distinct state, ``running``
          included (context ``member_statuses``).
        """
        cluster = self.rest_api("cluster")
        members = cluster["members"]

        # Role and state names are sanitized with replace_chars() before they
        # are used as part of a metric name.
        role_counters: Counter[str] = Counter()
        status_counters: Counter[str] = Counter()
        for member in members:
            role_counters[replace_chars(member["role"])] += 1
            status_counters[replace_chars(member["state"])] += 1

        # The actual check: members, healthy_members
        yield nagiosplugin.Metric("members", len(members))
        # A Counter answers 0 for a state that was never seen.
        healthy_count = status_counters["running"] + status_counters["streaming"]
        yield nagiosplugin.Metric("healthy_members", healthy_count)

        # The performance data: one metric per role
        for role, count in role_counters.items():
            yield nagiosplugin.Metric(f"role_{role}", count, context="member_roles")

        # The performance data: one metric per state
        for state, count in status_counters.items():
            yield nagiosplugin.Metric(
                f"state_{state}", count, context="member_statuses"
            )
2021-08-11 19:09:14 +02:00
class ClusterHasLeader(PatroniResource):
    """Resource telling whether the cluster currently has a running leader."""

    def probe(self: "ClusterHasLeader") -> Iterable[nagiosplugin.Metric]:
        """Return a single ``has_leader`` metric.

        The metric is 1 when at least one member of the ``cluster`` REST API
        answer has the role ``leader`` and the state ``running``, 0 otherwise.
        """
        cluster = self.rest_api("cluster")

        # any() stops at the first matching member.
        leader_is_running = any(
            member["role"] == "leader" and member["state"] == "running"
            for member in cluster["members"]
        )

        return [nagiosplugin.Metric("has_leader", int(leader_is_running))]
class ClusterHasLeaderSummary(nagiosplugin.Summary):
    """Status line texts for the leader check."""

    def ok(self: "ClusterHasLeaderSummary", results: nagiosplugin.Result) -> str:
        """Return the message shown when the check succeeds."""
        return "The cluster has a running leader."

    @handle_unknown
    def problem(self: "ClusterHasLeaderSummary", results: nagiosplugin.Result) -> str:
        """Return the message shown when the check fails."""
        return "The cluster has no running leader."
class ClusterHasReplica(PatroniResource):
    """Resource counting healthy and unhealthy replicas of the cluster."""

    def __init__(
        self: "ClusterHasReplica",
        connection_info: ConnectionInfo,
        max_lag: Union[int, None],
    ):
        """Store the lag threshold.

        Args:
            connection_info: forwarded unchanged to ``PatroniResource``.
            max_lag: highest lag a replica may have and still count as
                healthy; ``None`` disables the lag test.
        """
        super().__init__(connection_info)
        self.max_lag = max_lag

    def probe(self: "ClusterHasReplica") -> Iterable[nagiosplugin.Metric]:
        """Yield replica health metrics built from the ``cluster`` REST API answer.

        Only members whose role is ``replica`` or ``sync_standby`` are looked
        at. Such a member is healthy when its state is ``running`` or
        ``streaming``, its lag is not ``"unknown"`` and the lag does not
        exceed ``max_lag`` (when one is set). Every other replica is
        unhealthy.

        Yields ``healthy_replica``, ``unhealthy_replica``, then one
        ``<name>_lag`` metric (context ``replica_lag``) for each replica
        whose state is acceptable and whose lag is known.
        """
        cluster = self.rest_api("cluster")

        lag_by_replica = []
        healthy_count = 0
        unhealthy_count = 0

        for member in cluster["members"]:
            # FIXME are there other acceptable states
            if member["role"] not in ("replica", "sync_standby"):
                continue

            # patroni 3.0.4 changed the standby state from running to streaming
            is_replicating = (
                member["state"] in ("running", "streaming")
                and member["lag"] != "unknown"
            )
            if is_replicating:
                # The lag is reported even when it is above max_lag.
                lag_by_replica.append({"name": member["name"], "lag": member["lag"]})
                if self.max_lag is None or self.max_lag >= int(member["lag"]):
                    healthy_count += 1
                else:
                    unhealthy_count += 1
            else:
                unhealthy_count += 1

        # The actual check
        yield nagiosplugin.Metric("healthy_replica", healthy_count)

        # The performance data : unhealthy replica count, replicas lag
        yield nagiosplugin.Metric("unhealthy_replica", unhealthy_count)
        for entry in lag_by_replica:
            yield nagiosplugin.Metric(
                f"{entry['name']}_lag", entry["lag"], context="replica_lag"
            )
# FIXME: is a dedicated summary class needed for ClusterHasReplica?
# class ClusterHasReplicaSummary(nagiosplugin.Summary):
#     def ok(self, results):
#     def problem(self, results):
class ClusterConfigHasChanged(PatroniResource):
    """Resource detecting a change of the configuration served by the REST API."""

    def __init__(
        self: "ClusterConfigHasChanged",
        connection_info: ConnectionInfo,
        config_hash: str,  # Always contains the old hash
        state_file: str,  # Only used to update the hash in the state_file (when needed)
        save: bool = False,  # Save the configuration
    ):
        """Store the reference hash and the state file settings.

        Args:
            connection_info: forwarded unchanged to ``PatroniResource``.
            config_hash: the old hash the current configuration is compared to.
            state_file: path of the cookie the new hash is written to when
                ``save`` is true.
            save: whether the new hash is written to ``state_file``.
        """
        super().__init__(connection_info)
        self.state_file = state_file
        self.config_hash = config_hash
        self.save = save

    def probe(self: "ClusterConfigHasChanged") -> Iterable[nagiosplugin.Metric]:
        """Return a single ``is_configuration_changed`` metric.

        The MD5 hex digest of the JSON dump of the ``config`` REST API answer
        is compared with ``config_hash``: the metric is 1 when they differ,
        0 otherwise. When ``save`` is true and a state file is set, the new
        hash is stored under the ``hash`` key of that cookie.
        """
        item_dict = self.rest_api("config")

        new_hash = hashlib.md5(json.dumps(item_dict).encode()).hexdigest()

        # The mapping key must match the %(save)s placeholder, otherwise
        # logging fails to format the record when debug output is enabled.
        _log.debug("save result: %(save)s", {"save": self.save})
        old_hash = self.config_hash
        if self.state_file is not None and self.save:
            _log.debug(
                "saving new hash to state file / cookie %(state_file)s",
                {"state_file": self.state_file},
            )
            # As a context manager the cookie is opened on entry, committed
            # when the block succeeds and closed in every case.
            with nagiosplugin.Cookie(self.state_file) as cookie:
                cookie["hash"] = new_hash

        _log.debug(
            "hash info: old hash %(old_hash)s, new hash %(new_hash)s",
            {"old_hash": old_hash, "new_hash": new_hash},
        )

        return [
            nagiosplugin.Metric(
                "is_configuration_changed",
                1 if new_hash != old_hash else 0,
            )
        ]
class ClusterConfigHasChangedSummary(nagiosplugin.Summary):
    """Status line texts for the configuration change check."""

    def __init__(self: "ClusterConfigHasChangedSummary", config_hash: str) -> None:
        """Keep the old (expected) hash so that it can be shown in the messages."""
        self.old_config_hash = config_hash

    # Note: It would be helpful to display both the old and the new hash here.
    # Unfortunately the new hash is not a metric, so only the old / expected
    # one is available.

    def ok(self: "ClusterConfigHasChangedSummary", results: nagiosplugin.Result) -> str:
        """Return the message shown when the check succeeds."""
        expected_hash = self.old_config_hash
        return f"The hash of patroni's dynamic configuration has not changed ({expected_hash})."

    @handle_unknown
    def problem(
        self: "ClusterConfigHasChangedSummary", results: nagiosplugin.Result
    ) -> str:
        """Return the message shown when the check fails."""
        expected_hash = self.old_config_hash
        return f"The hash of patroni's dynamic configuration has changed. The old hash was {expected_hash}."
2021-08-12 12:48:55 +02:00
class ClusterIsInMaintenance(PatroniResource):
    """Resource telling whether the ``cluster`` REST API answer has ``pause`` set."""

    def probe(self: "ClusterIsInMaintenance") -> Iterable[nagiosplugin.Metric]:
        """Return a single ``is_in_maintenance`` metric.

        The metric is 1 when the answer has a ``pause`` key holding a truthy
        value, 0 when the key is missing or falsy.
        """
        cluster = self.rest_api("cluster")

        # The actual check
        is_paused = bool("pause" in cluster and cluster["pause"])
        return [nagiosplugin.Metric("is_in_maintenance", int(is_paused))]