check-patroni/check_patroni/cluster.py
benoit 908669f073 Add a --save option when state files are used
The checks `cluster_config_has_changed` and `node_tl_has_changed` use a
state file to store the previous value of the config hash and the
timeline.

Previously the check would fail if something changed, but the new value
would be saved directly. This behaviour has changed. The new value
is saved only if `--save` is passed to the check.

This mimics the way [check_pgactivity] manages this kind of check.

[check_pgactivity]: https://github.com/OPMDG/check_pgactivity
2023-03-02 17:32:18 +01:00

200 lines
6.9 KiB
Python

import hashlib
import json
import logging
from collections import Counter
import nagiosplugin
from typing import Iterable, Union
from .types import PatroniResource, ConnectionInfo, handle_unknown
_log = logging.getLogger("nagiosplugin")
def replace_chars(text: str) -> str:
    """Return *text* with single quotes removed and spaces turned into underscores."""
    # One translation pass: drop "'" entirely, map " " to "_".
    substitutions = str.maketrans({"'": None, " ": "_"})
    return text.translate(substitutions)
class ClusterNodeCount(PatroniResource):
    """Count the members of the cluster, broken down by role and by state."""

    def probe(self: "ClusterNodeCount") -> Iterable[nagiosplugin.Metric]:
        """Query the ``cluster`` endpoint and yield member count metrics.

        Yields, in order:

        * ``members``: the total number of members;
        * ``state_running``: the number of members whose state is ``running``;
        * one ``role_<role>`` metric per role seen (context ``member_roles``);
        * one ``state_<state>`` metric per state seen other than ``running``
          (context ``member_statuses``).

        Role and state names go through ``replace_chars`` before being
        used in metric names.
        """
        response = self.rest_api("cluster")
        _log.debug(f"api call status: {response.status}")
        _log.debug(f"api call data: {response.data}")

        cluster = json.loads(response.data)
        per_role: Counter[str] = Counter()
        per_state: Counter[str] = Counter()

        # Tally roles and states in a single pass over the members.
        for node in cluster["members"]:
            per_role[replace_chars(node["role"])] += 1
            per_state[replace_chars(node["state"])] += 1

        # The actual check: members, running state
        yield nagiosplugin.Metric("members", len(cluster["members"]))
        yield nagiosplugin.Metric("state_running", per_state["running"])

        # The performance data: roles
        for role_name, role_total in per_role.items():
            yield nagiosplugin.Metric(
                f"role_{role_name}", role_total, context="member_roles"
            )

        # The performance data: states ("running" was already reported above)
        for state_name, state_total in per_state.items():
            if state_name == "running":
                continue
            yield nagiosplugin.Metric(
                f"state_{state_name}", state_total, context="member_statuses"
            )
class ClusterHasLeader(PatroniResource):
    """Check whether the cluster currently has a running leader."""

    def probe(self: "ClusterHasLeader") -> Iterable[nagiosplugin.Metric]:
        """Query the ``cluster`` endpoint and report the leader presence.

        Returns a single ``has_leader`` metric: 1 when at least one member
        has the role ``leader`` and the state ``running``, 0 otherwise.
        """
        response = self.rest_api("cluster")
        _log.debug(f"api call status: {response.status}")
        _log.debug(f"api call data: {response.data}")

        cluster = json.loads(response.data)
        # any() stops at the first running leader, like an explicit break.
        leader_is_running = any(
            node["role"] == "leader" and node["state"] == "running"
            for node in cluster["members"]
        )

        return [nagiosplugin.Metric("has_leader", int(leader_is_running))]
class ClusterHasLeaderSummary(nagiosplugin.Summary):
    """Status line texts for the leader presence check."""

    def ok(
        self: "ClusterHasLeaderSummary", results: nagiosplugin.Result
    ) -> str:
        """Return the message used when the check succeeds."""
        return "The cluster has a running leader."

    # NOTE(review): handle_unknown is imported from .types; see its
    # definition for how it alters the message.
    @handle_unknown
    def problem(
        self: "ClusterHasLeaderSummary", results: nagiosplugin.Result
    ) -> str:
        """Return the message used when the check does not succeed."""
        return "The cluster has no running leader."
class ClusterHasReplica(PatroniResource):
    """Count the healthy and unhealthy replicas of the cluster.

    A replica is counted as healthy when its state is ``running``, its lag
    is not ``"unknown"`` and, if ``max_lag`` is set, its lag does not
    exceed ``max_lag``. Every other replica is counted as unhealthy.
    """

    def __init__(
        self: "ClusterHasReplica",
        connection_info: ConnectionInfo,
        max_lag: Union[int, None],
    ):
        """Store the lag threshold; ``None`` disables the lag comparison."""
        super().__init__(connection_info)
        self.max_lag = max_lag

    def probe(self: "ClusterHasReplica") -> Iterable[nagiosplugin.Metric]:
        """Query the ``cluster`` endpoint and yield the replica metrics.

        Yields ``healthy_replica``, then ``unhealthy_replica``, then one
        ``<name>_lag`` metric (context ``replica_lag``) for every running
        replica whose lag is known, whether or not it is within ``max_lag``.
        """
        response = self.rest_api("cluster")
        _log.debug(f"api call status: {response.status}")
        _log.debug(f"api call data: {response.data}")

        cluster = json.loads(response.data)
        lag_by_replica = []  # (name, lag) pairs, in member order
        healthy_total = 0
        unhealthy_total = 0

        for node in cluster["members"]:
            if node["role"] != "replica":
                continue

            # FIXME are there other acceptable states
            lag_is_usable = node["state"] == "running" and node["lag"] != "unknown"
            if lag_is_usable:
                lag_by_replica.append((node["name"], node["lag"]))

            # int() is only reached when the lag is usable and a limit is set.
            if lag_is_usable and (
                self.max_lag is None or self.max_lag >= int(node["lag"])
            ):
                healthy_total += 1
            else:
                unhealthy_total += 1

        # The actual check
        yield nagiosplugin.Metric("healthy_replica", healthy_total)

        # The performance data: unhealthy replica count, replicas lag
        yield nagiosplugin.Metric("unhealthy_replica", unhealthy_total)
        for replica_name, replica_lag in lag_by_replica:
            yield nagiosplugin.Metric(
                f"{replica_name}_lag", replica_lag, context="replica_lag"
            )
# FIXME is this needed ??
# class ClusterHasReplicaSummary(nagiosplugin.Summary):
# def ok(self, results):
# def problem(self, results):
class ClusterConfigHasChanged(PatroniResource):
    """Detect a change of Patroni's dynamic configuration.

    The MD5 hash of the raw ``config`` endpoint response is compared with
    a previously known hash. The new hash is written to the state file
    only when ``save`` is true.
    """

    def __init__(
        self: "ClusterConfigHasChanged",
        connection_info: ConnectionInfo,
        config_hash: str,  # Always contains the old hash
        state_file: str,  # Only used to update the hash in the state_file (when needed)
        save: bool = False,  # Save the configuration
    ):
        """Store the reference hash, the state file path and the save flag.

        ``state_file`` is compared against ``None`` in ``probe``, so a
        ``None`` value is tolerated and simply disables saving.
        """
        super().__init__(connection_info)
        self.state_file = state_file
        self.config_hash = config_hash
        self.save = save

    def probe(self: "ClusterConfigHasChanged") -> Iterable[nagiosplugin.Metric]:
        """Query the ``config`` endpoint and compare its hash to the old one.

        Returns a single ``is_configuration_changed`` metric: 1 when the
        new hash differs from ``config_hash``, 0 otherwise. The comparison
        always uses the hash given at construction time, even when the
        new hash has just been saved.
        """
        r = self.rest_api("config")
        _log.debug(f"api call status: {r.status}")
        _log.debug(f"api call data: {r.data}")

        new_hash = hashlib.md5(r.data).hexdigest()

        _log.debug(f"save result: {self.save}")
        old_hash = self.config_hash
        if self.state_file is not None and self.save:
            _log.debug(f"saving new hash to state file / cookie {self.state_file}")
            # As a context manager the cookie is opened on entry, committed
            # when the body succeeds, and closed in every case, so a failure
            # while writing no longer leaves the state file open.
            with nagiosplugin.Cookie(self.state_file) as cookie:
                cookie["hash"] = new_hash

        _log.debug(f"hash info: old hash {old_hash}, new hash {new_hash}")

        return [
            nagiosplugin.Metric(
                "is_configuration_changed",
                1 if new_hash != old_hash else 0,
            )
        ]
class ClusterConfigHasChangedSummary(nagiosplugin.Summary):
    """Status line texts for the dynamic configuration change check."""

    def __init__(self: "ClusterConfigHasChangedSummary", config_hash: str) -> None:
        """Remember the old (expected) configuration hash for the messages."""
        self.old_config_hash = config_hash

    # Note: It would be helpful to display the old / new hash here. Unfortunately, it's not a metric.
    # So we only have the old / expected one.
    def ok(self: "ClusterConfigHasChangedSummary", results: nagiosplugin.Result) -> str:
        """Return the message used when the hash has not changed."""
        return f"The hash of patroni's dynamic configuration has not changed ({self.old_config_hash})."

    # NOTE(review): handle_unknown is imported from .types; see its
    # definition for how it alters the message.
    @handle_unknown
    def problem(
        self: "ClusterConfigHasChangedSummary", results: nagiosplugin.Result
    ) -> str:
        """Return the message used when the hash has changed."""
        # This must be an f-string: without the prefix the placeholder text
        # "{self.old_config_hash}" was printed verbatim instead of the hash.
        return f"The hash of patroni's dynamic configuration has changed. The old hash was {self.old_config_hash}."
class ClusterIsInMaintenance(PatroniResource):
    """Check whether the cluster reports a truthy ``pause`` flag."""

    def probe(self: "ClusterIsInMaintenance") -> Iterable[nagiosplugin.Metric]:
        """Query the ``cluster`` endpoint and report the pause flag.

        Returns a single ``is_in_maintenance`` metric: 1 when the response
        contains a truthy ``pause`` entry, 0 when the entry is absent or
        falsy.
        """
        response = self.rest_api("cluster")
        _log.debug(f"api call status: {response.status}")
        _log.debug(f"api call data: {response.data}")

        cluster = json.loads(response.data)

        # The actual check
        paused = "pause" in cluster and bool(cluster["pause"])
        return [nagiosplugin.Metric("is_in_maintenance", int(paused))]