908669f073
The checks `cluster_config_has_changed` and `node_tl_has_changed` use a state file to store the previous value of the config hash and the timeline. Previously the check would fail if something changed, but the new value would be saved directly. This behaviour has changed: the new value is now saved only if `--save` is passed to the check. This mimics the way [check_pgactivity] manages this kind of check. [check_pgactivity]: https://github.com/OPMDG/check_pgactivity
200 lines
6.9 KiB
Python
200 lines
6.9 KiB
Python
import hashlib
|
|
import json
|
|
import logging
|
|
from collections import Counter
|
|
|
|
import nagiosplugin
|
|
from typing import Iterable, Union
|
|
|
|
from .types import PatroniResource, ConnectionInfo, handle_unknown
|
|
|
|
_log = logging.getLogger("nagiosplugin")
|
|
|
|
|
|
def replace_chars(text: str) -> str:
    """Return *text* with single quotes dropped and spaces turned into underscores."""
    # One translation pass: " " -> "_", and "'" is deleted.
    return text.translate(str.maketrans(" ", "_", "'"))
|
|
|
|
|
|
class ClusterNodeCount(PatroniResource):
    """Resource counting the cluster members, broken down by role and by state."""

    def probe(self: "ClusterNodeCount") -> Iterable[nagiosplugin.Metric]:
        """Query the ``cluster`` endpoint and yield member-count metrics.

        Yields ``members`` (number of entries in the ``members`` list) and
        ``state_running`` first, then one ``role_<role>`` metric per role seen
        and one ``state_<state>`` metric per state seen other than ``running``.
        Role and state names are passed through ``replace_chars`` before use.
        """
        response = self.rest_api("cluster")
        _log.debug(f"api call status: {response.status}")
        _log.debug(f"api call data: {response.data}")

        members = json.loads(response.data)["members"]
        per_role: Counter[str] = Counter(
            replace_chars(member["role"]) for member in members
        )
        per_state: Counter[str] = Counter(
            replace_chars(member["state"]) for member in members
        )

        # The actual check: members, running state
        yield nagiosplugin.Metric("members", len(members))
        yield nagiosplugin.Metric("state_running", per_state["running"])

        # The performance data : role
        for role, count in per_role.items():
            yield nagiosplugin.Metric(f"role_{role}", count, context="member_roles")

        # The performance data : statuses (except running)
        for state, count in per_state.items():
            if state == "running":
                continue
            yield nagiosplugin.Metric(
                f"state_{state}", count, context="member_statuses"
            )
|
|
|
|
|
|
class ClusterHasLeader(PatroniResource):
    """Resource telling whether the cluster has a member that is a running leader."""

    def probe(self: "ClusterHasLeader") -> Iterable[nagiosplugin.Metric]:
        """Query the ``cluster`` endpoint and return the ``has_leader`` metric.

        The metric is 1 when at least one member has role ``leader`` and state
        ``running``, and 0 otherwise.
        """
        response = self.rest_api("cluster")
        _log.debug(f"api call status: {response.status}")
        _log.debug(f"api call data: {response.data}")

        members = json.loads(response.data)["members"]
        # any() stops at the first match, like the explicit loop-and-break form.
        leader_is_running = any(
            member["role"] == "leader" and member["state"] == "running"
            for member in members
        )

        return [nagiosplugin.Metric("has_leader", int(leader_is_running))]
|
|
|
|
|
|
class ClusterHasLeaderSummary(nagiosplugin.Summary):
    """Status-line texts for the leader check."""

    def ok(self: "ClusterHasLeaderSummary", results: nagiosplugin.Result) -> str:
        """Return the fixed message used when a running leader exists."""
        message = "The cluster has a running leader."
        return message

    @handle_unknown
    def problem(self: "ClusterHasLeaderSummary", results: nagiosplugin.Result) -> str:
        """Return the fixed message used when no running leader exists.

        The method is wrapped by ``handle_unknown`` (imported from ``.types``).
        """
        message = "The cluster has no running leader."
        return message
|
|
|
|
|
|
class ClusterHasReplica(PatroniResource):
    """Resource counting healthy and unhealthy replicas of the cluster."""

    def __init__(
        self: "ClusterHasReplica",
        connection_info: ConnectionInfo,
        max_lag: Union[int, None],
    ):
        """Store the connection settings and the optional lag threshold.

        ``max_lag`` is the highest lag a replica may report and still be
        counted as healthy; ``None`` disables the lag comparison.
        """
        super().__init__(connection_info)
        self.max_lag = max_lag

    def probe(self: "ClusterHasReplica") -> Iterable[nagiosplugin.Metric]:
        """Query the ``cluster`` endpoint and yield the replica metrics.

        A replica is healthy when its state is ``running``, its lag is not
        ``"unknown"`` and, if ``max_lag`` is set, its lag does not exceed it.
        Every other replica is counted as unhealthy. Yields
        ``healthy_replica``, ``unhealthy_replica`` and one ``<name>_lag``
        metric for each running replica whose lag is known.
        """
        response = self.rest_api("cluster")
        _log.debug(f"api call status: {response.status}")
        _log.debug(f"api call data: {response.data}")

        cluster = json.loads(response.data)
        lag_by_replica = []
        healthy_count = 0
        unhealthy_count = 0
        for member in cluster["members"]:
            # FIXME are there other acceptable states
            if member["role"] != "replica":
                continue

            # "lag" is only looked at for running replicas (short-circuit).
            lag_is_known = member["state"] == "running" and member["lag"] != "unknown"
            if lag_is_known:
                lag_by_replica.append({"name": member["name"], "lag": member["lag"]})

            if lag_is_known and (
                self.max_lag is None or int(member["lag"]) <= self.max_lag
            ):
                healthy_count += 1
            else:
                unhealthy_count += 1

        # The actual check
        yield nagiosplugin.Metric("healthy_replica", healthy_count)

        # The performance data : unhealthy replica count, replicas lag
        yield nagiosplugin.Metric("unhealthy_replica", unhealthy_count)
        for entry in lag_by_replica:
            yield nagiosplugin.Metric(
                f"{entry['name']}_lag", entry["lag"], context="replica_lag"
            )
|
|
|
|
|
|
# FIXME is this needed ??
|
|
# class ClusterHasReplicaSummary(nagiosplugin.Summary):
|
|
# def ok(self, results):
|
|
# def problem(self, results):
|
|
|
|
|
|
class ClusterConfigHasChanged(PatroniResource):
    """Resource comparing the hash of the ``config`` endpoint with a known hash."""

    def __init__(
        self: "ClusterConfigHasChanged",
        connection_info: ConnectionInfo,
        config_hash: str,  # Always contains the old hash
        state_file: str,  # Only used to update the hash in the state_file (when needed)
        save: bool = False,  # Save the configuration
    ):
        """Store the connection settings, the reference hash and the save options."""
        super().__init__(connection_info)
        self.state_file = state_file
        self.config_hash = config_hash
        self.save = save

    def probe(self: "ClusterConfigHasChanged") -> Iterable[nagiosplugin.Metric]:
        """Query the ``config`` endpoint and return ``is_configuration_changed``.

        The metric is 1 when the MD5 hex digest of the raw response body
        differs from ``config_hash`` and 0 otherwise. When a state file is set
        and ``save`` is true, the new digest is stored under the ``hash`` key
        of the state file.
        """
        r = self.rest_api("config")
        _log.debug(f"api call status: {r.status}")
        _log.debug(f"api call data: {r.data}")

        new_hash = hashlib.md5(r.data).hexdigest()

        _log.debug(f"save result: {self.save}")
        old_hash = self.config_hash
        if self.state_file is not None and self.save:
            _log.debug(f"saving new hash to state file / cookie {self.state_file}")
            # The context manager opens the cookie, commits on a clean exit and
            # always closes the file, even if storing the hash raises.
            with nagiosplugin.Cookie(self.state_file) as cookie:
                cookie["hash"] = new_hash

        _log.debug(f"hash info: old hash {old_hash}, new hash {new_hash}")

        return [
            nagiosplugin.Metric(
                "is_configuration_changed",
                1 if new_hash != old_hash else 0,
            )
        ]
|
|
|
|
|
|
class ClusterConfigHasChangedSummary(nagiosplugin.Summary):
    """Status-line texts for the configuration-change check."""

    def __init__(self: "ClusterConfigHasChangedSummary", config_hash: str) -> None:
        """Remember the old (expected) configuration hash for the messages."""
        self.old_config_hash = config_hash

    # Note: It would be helpful to display the old / new hash here. Unfortunately, it's not a metric.
    # So we only have the old / expected one.
    def ok(self: "ClusterConfigHasChangedSummary", results: nagiosplugin.Result) -> str:
        """Return the message used when the hash is unchanged, including the hash."""
        return f"The hash of patroni's dynamic configuration has not changed ({self.old_config_hash})."

    @handle_unknown
    def problem(
        self: "ClusterConfigHasChangedSummary", results: nagiosplugin.Result
    ) -> str:
        """Return the message used when the hash has changed, including the old hash.

        The method is wrapped by ``handle_unknown`` (imported from ``.types``).
        """
        # The f prefix is required so the old hash is interpolated into the
        # message instead of the literal placeholder text being printed.
        return f"The hash of patroni's dynamic configuration has changed. The old hash was {self.old_config_hash}."
|
|
|
|
|
|
class ClusterIsInMaintenance(PatroniResource):
    """Resource telling whether the ``cluster`` endpoint reports a truthy ``pause``."""

    def probe(self: "ClusterIsInMaintenance") -> Iterable[nagiosplugin.Metric]:
        """Query the ``cluster`` endpoint and return the ``is_in_maintenance`` metric.

        The metric is 1 when the response contains a ``pause`` key with a
        truthy value, and 0 when the key is missing or falsy.
        """
        response = self.rest_api("cluster")
        _log.debug(f"api call status: {response.status}")
        _log.debug(f"api call data: {response.data}")

        cluster = json.loads(response.data)

        # The actual check
        paused = "pause" in cluster and cluster["pause"]
        maintenance_flag = 1 if paused else 0
        return [nagiosplugin.Metric("is_in_maintenance", maintenance_flag)]
|