import hashlib
import json
from collections import Counter
from typing import Any, Iterable, Union

import nagiosplugin

from . import _log
from .types import ConnectionInfo, PatroniResource, handle_unknown


def replace_chars(text: str) -> str:
    """Strip single quotes from *text* and turn its spaces into underscores."""
    unquoted = text.replace("'", "")
    return unquoted.replace(" ", "_")


class ClusterNodeCount(PatroniResource):
    """Count the cluster members, in total and per role / per state."""

    def probe(self) -> Iterable[nagiosplugin.Metric]:
        """Yield member counts computed from the ``cluster`` endpoint.

        Yields ``members`` and ``healthy_members`` first, then one
        ``role_<role>`` metric per distinct role and one ``state_<state>``
        metric per distinct state (role and state names are passed through
        ``replace_chars``).
        """

        def log_member(node: Any, verdict: str) -> None:
            _log.debug(
                "Node %(node_name)s is %(health)s: role %(role)s state %(state)s.",
                {
                    "node_name": node["name"],
                    "health": verdict,
                    "role": node["role"],
                    "state": node["state"],
                },
            )

        def is_healthy(role: str, state: str) -> bool:
            # A leader only has to be running.
            if role == "leader":
                return state == "running"
            if role not in ("standby_leader", "replica", "sync_standby"):
                return False
            # Other known roles must be "streaming" when detailed states are
            # available, and "running" otherwise.
            if self.has_detailed_states():
                return state == "streaming"
            return state == "running"

        cluster = self.rest_api("cluster")

        per_role: Counter[str] = Counter()
        per_state: Counter[str] = Counter()
        healthy_total = 0

        for node in cluster["members"]:
            state, role = node["state"], node["role"]
            per_role[replace_chars(role)] += 1
            per_state[replace_chars(state)] += 1

            if is_healthy(role, state):
                healthy_total += 1
                log_member(node, "healthy")
            else:
                log_member(node, "unhealthy")

        # The actual check: members, healthy_members
        yield nagiosplugin.Metric("members", len(cluster["members"]))
        yield nagiosplugin.Metric("healthy_members", healthy_total)

        # The performance data: one counter per role
        for role_name, count in per_role.items():
            yield nagiosplugin.Metric(
                f"role_{role_name}", count, context="member_roles"
            )

        # The performance data: one counter per state (every state seen)
        for state_name, count in per_state.items():
            yield nagiosplugin.Metric(
                f"state_{state_name}", count, context="member_statuses"
            )


class ClusterHasLeader(PatroniResource):
    """Report whether the cluster has a running leader or standby leader."""

    def probe(self) -> Iterable[nagiosplugin.Metric]:
        """Return the leader metrics computed from the ``cluster`` endpoint.

        The member list is scanned until the first running leader or the
        first acceptable standby leader is found.
        """
        cluster = self.rest_api("cluster")

        leader_found = False
        standby_leader_found = False
        standby_leader_in_arc_rec = False

        for node in cluster["members"]:
            role = node["role"]

            if role == "leader":
                if node["state"] == "running":
                    leader_found = True
                    break
                continue

            if role != "standby_leader":
                continue

            state = node["state"]
            if state == "in archive recovery":
                standby_leader_in_arc_rec = True
            elif state != "streaming":
                # for patroni >= 3.0.4 any other state would be wrong
                # for patroni < 3.0.4 a state different from running would be wrong
                if self.has_detailed_states() or state != "running":
                    continue

            standby_leader_found = True
            break

        has_leader = leader_found or standby_leader_found
        return [
            nagiosplugin.Metric("has_leader", 1 if has_leader else 0),
            nagiosplugin.Metric(
                "is_standby_leader_in_arc_rec",
                1 if standby_leader_in_arc_rec else 0,
            ),
            nagiosplugin.Metric(
                "is_standby_leader",
                1 if standby_leader_found else 0,
            ),
            nagiosplugin.Metric("is_leader", 1 if leader_found else 0),
        ]


class ClusterHasLeaderSummary(nagiosplugin.Summary):
    """Status line texts for the leader check."""

    def ok(self, results: nagiosplugin.Result) -> str:
        """Return the message shown when the check is OK."""
        return "The cluster has a running leader."

    @handle_unknown
    def problem(self, results: nagiosplugin.Result) -> str:
        """Return the message shown when the check is not OK."""
        return "The cluster has no running leader or the standby leader is in archive recovery."
class ClusterHasReplica(PatroniResource):
    """Count the healthy, unhealthy and synchronous replicas of the cluster.

    A replica is healthy when it is in an expected state, is on the leader's
    timeline and, if ``max_lag`` is set, does not lag by more than ``max_lag``.
    """

    def __init__(self, connection_info: ConnectionInfo, max_lag: Union[int, None]):
        """Store the connection settings and the optional lag threshold.

        Args:
            connection_info: forwarded unchanged to ``PatroniResource``.
            max_lag: highest lag accepted for a healthy replica, or ``None``
                to skip the lag test.
        """
        super().__init__(connection_info)
        self.max_lag = max_lag

    def probe(self) -> Iterable[nagiosplugin.Metric]:
        """Yield replica health counters and per-replica performance data.

        Yields ``healthy_replica``, ``sync_replica`` and ``unhealthy_replica``,
        then ``<name>_lag``, ``<name>_timeline`` and ``<name>_sync`` for every
        replica whose lag is known.
        """

        def debug_member(member: Any, health: str) -> None:
            # "lag" and "timeline" are read with .get(): this helper is also
            # called for members without usable stats (e.g. a stopped node),
            # and a debug message must never make the check itself fail.
            _log.debug(
                "Node %(node_name)s is %(health)s: lag %(lag)s, state %(state)s, tl %(tl)s.",
                {
                    "node_name": member["name"],
                    "health": health,
                    "lag": member.get("lag"),
                    "state": member["state"],
                    "tl": member.get("timeline"),
                },
            )

        # get the cluster info
        cluster_item_dict = self.rest_api("cluster")

        replicas = []
        healthy_replica = 0
        unhealthy_replica = 0
        sync_replica = 0
        leader_tl = None

        # Look for replicas
        for member in cluster_item_dict["members"]:
            if member["role"] not in ["replica", "sync_standby"]:
                continue

            if member["lag"] == "unknown":
                # This could happen if the node is stopped
                # nagiosplugin doesn't handle strings in perfstats
                # so we have to ditch all the stats in that case
                debug_member(member, "unhealthy")
                unhealthy_replica += 1
                continue

            replicas.append(
                {
                    "name": member["name"],
                    "lag": member["lag"],
                    "timeline": member["timeline"],
                    "sync": 1 if member["role"] == "sync_standby" else 0,
                }
            )

            # Get the leader tl if we haven't already
            if leader_tl is None:
                # If there are no leaders, we will loop here for all
                # members because leader_tl will remain None. it's not
                # a big deal since having no leader is rare.
                for tmember in cluster_item_dict["members"]:
                    if tmember["role"] == "leader":
                        leader_tl = int(tmember["timeline"])
                        break

                _log.debug(
                    "Patroni's leader_timeline is %(leader_tl)s",
                    {
                        "leader_tl": leader_tl,
                    },
                )

            # Test for an unhealthy replica: it must be in an expected state
            # and on the same timeline as the leader.
            if self.has_detailed_states():
                expected_states = ["streaming", "in archive recovery"]
            else:
                expected_states = ["running"]

            follows_leader = (
                member["state"] in expected_states
                and int(member["timeline"]) == leader_tl
            )
            if not follows_leader:
                debug_member(member, "unhealthy")
                unhealthy_replica += 1
                continue

            if member["role"] == "sync_standby":
                sync_replica += 1

            if self.max_lag is None or self.max_lag >= int(member["lag"]):
                debug_member(member, "healthy")
                healthy_replica += 1
            else:
                debug_member(member, "unhealthy")
                unhealthy_replica += 1

        # The actual check
        yield nagiosplugin.Metric("healthy_replica", healthy_replica)
        yield nagiosplugin.Metric("sync_replica", sync_replica)

        # The performance data : unhealthy replica count, replicas lag
        yield nagiosplugin.Metric("unhealthy_replica", unhealthy_replica)
        for replica in replicas:
            yield nagiosplugin.Metric(
                f"{replica['name']}_lag", replica["lag"], context="replica_lag"
            )
            yield nagiosplugin.Metric(
                f"{replica['name']}_timeline",
                replica["timeline"],
                context="replica_timeline",
            )
            yield nagiosplugin.Metric(
                f"{replica['name']}_sync", replica["sync"], context="replica_sync"
            )


# FIXME is this needed ??
# class ClusterHasReplicaSummary(nagiosplugin.Summary):
#     def ok(self, results):
#     def problem(self, results):


class ClusterConfigHasChanged(PatroniResource):
    """Detect a change of Patroni's dynamic configuration through its hash."""

    def __init__(
        self,
        connection_info: ConnectionInfo,
        config_hash: str,  # Always contains the old hash
        state_file: str,  # Only used to update the hash in the state_file (when needed)
        save: bool = False,  # Save the configuration
    ):
        """Store the reference hash and the state file settings.

        Args:
            connection_info: forwarded unchanged to ``PatroniResource``.
            config_hash: the hash the current configuration is compared to.
            state_file: path of the cookie file receiving the new hash.
            save: when true, write the freshly computed hash to ``state_file``.
        """
        super().__init__(connection_info)
        self.state_file = state_file
        self.config_hash = config_hash
        self.save = save

    def probe(self) -> Iterable[nagiosplugin.Metric]:
        """Return ``is_configuration_changed`` (1 when the hash differs, else 0).

        The MD5 digest of the JSON dump of the ``config`` endpoint is compared
        to ``config_hash``. If ``save`` is set and a state file is configured,
        the new digest is also stored under the ``hash`` key of the cookie.
        """
        item_dict = self.rest_api("config")

        # MD5 is only used as a change fingerprint here.
        new_hash = hashlib.md5(json.dumps(item_dict).encode()).hexdigest()

        _log.debug("save result: %(issave)s", {"issave": self.save})
        old_hash = self.config_hash
        if self.state_file is not None and self.save:
            _log.debug(
                "saving new hash to state file / cookie %(state_file)s",
                {"state_file": self.state_file},
            )
            cookie = nagiosplugin.Cookie(self.state_file)
            cookie.open()
            try:
                cookie["hash"] = new_hash
                cookie.commit()
            finally:
                # Always release the state file, even if the commit failed.
                cookie.close()

        _log.debug(
            "hash info: old hash %(old_hash)s, new hash %(new_hash)s",
            {"old_hash": old_hash, "new_hash": new_hash},
        )

        return [
            nagiosplugin.Metric(
                "is_configuration_changed",
                1 if new_hash != old_hash else 0,
            )
        ]


class ClusterConfigHasChangedSummary(nagiosplugin.Summary):
    """Status line texts for the configuration change check."""

    def __init__(self, config_hash: str) -> None:
        """Remember the old (expected) hash so it can be shown in messages."""
        self.old_config_hash = config_hash

    # Note: It would be helpful to display the old / new hash here. Unfortunately, it's not a metric.
    # So we only have the old / expected one.
    def ok(self, results: nagiosplugin.Result) -> str:
        """Return the message shown when the hash is unchanged."""
        return f"The hash of patroni's dynamic configuration has not changed ({self.old_config_hash})."

    @handle_unknown
    def problem(self, results: nagiosplugin.Result) -> str:
        """Return the message shown when the hash differs."""
        return f"The hash of patroni's dynamic configuration has changed. The old hash was {self.old_config_hash}."
class ClusterIsInMaintenance(PatroniResource):
    """Report whether the cluster is paused."""

    def probe(self) -> Iterable[nagiosplugin.Metric]:
        """Return ``is_in_maintenance``: 1 when ``pause`` is present and truthy."""
        cluster = self.rest_api("cluster")
        paused = bool(cluster.get("pause"))

        # The actual check
        return [nagiosplugin.Metric("is_in_maintenance", int(paused))]


class ClusterHasScheduledAction(PatroniResource):
    """Report scheduled switchovers and scheduled restarts."""

    def probe(self) -> Iterable[nagiosplugin.Metric]:
        """Yield ``has_scheduled_actions`` followed by the detailed counters.

        ``scheduled_switchover`` is 1 when the cluster document has that key;
        ``scheduled_restart`` counts the members carrying that key.
        """
        cluster = self.rest_api("cluster")

        switchover_count = int("scheduled_switchover" in cluster)
        restart_count = sum(
            1 for node in cluster["members"] if "scheduled_restart" in node
        )
        any_scheduled = (switchover_count + restart_count) > 0

        # The actual check
        yield nagiosplugin.Metric("has_scheduled_actions", int(any_scheduled))

        # The performance data : scheduled_switchover, scheduled action count
        yield nagiosplugin.Metric("scheduled_switchover", switchover_count)
        yield nagiosplugin.Metric("scheduled_restart", restart_count)