908669f073
The checks `cluster_config_has_changed` and `node_tl_has_changed` use a state file to store the previous value of the config hash and the timeline. Previously the check would fail if something changed, but the new value would be saved directly. This behaviour has changed: the new value is saved only if `--save` is passed to the check. This mimics the way [check_pgactivity] manages this kind of check. [check_pgactivity]: https://github.com/OPMDG/check_pgactivity
207 lines
7.3 KiB
Python
207 lines
7.3 KiB
Python
import json
|
|
import logging
|
|
|
|
import nagiosplugin
|
|
from typing import Iterable
|
|
|
|
from .types import ConnectionInfo, handle_unknown, PatroniResource
|
|
|
|
|
|
# Module-level logger used for the debug traces in the probes below; it is
# registered under the "nagiosplugin" logger name.
_log = logging.getLogger("nagiosplugin")
|
|
|
|
|
|
class NodeIsPrimary(PatroniResource):
    """Resource that asks the ``primary`` endpoint whether this node is primary."""

    def probe(self: "NodeIsPrimary") -> Iterable[nagiosplugin.Metric]:
        """Call ``rest_api("primary")`` and emit the ``is_primary`` metric.

        Returns:
            A one-element list holding the ``is_primary`` metric, whose value
            is 1 when the response status is 200 and 0 otherwise.
        """
        response = self.rest_api("primary")
        _log.debug(f"api call status: {response.status}")
        _log.debug(f"api call data: {response.data}")

        # Only an HTTP 200 answer counts as "primary".
        if response.status == 200:
            value = 1
        else:
            value = 0
        return [nagiosplugin.Metric("is_primary", value)]
|
|
|
|
|
|
class NodeIsPrimarySummary(nagiosplugin.Summary):
    """Status-line messages for the primary check."""

    def ok(self: "NodeIsPrimarySummary", results: nagiosplugin.Result) -> str:
        """Return the fixed message for the OK state (``results`` is unused)."""
        message = "This node is the primary with the leader lock."
        return message

    @handle_unknown
    def problem(self: "NodeIsPrimarySummary", results: nagiosplugin.Result) -> str:
        """Return the fixed message for a non-OK state (``results`` is unused).

        NOTE(review): wrapped by ``handle_unknown`` from ``.types``; its exact
        effect is not visible in this file.
        """
        message = "This node is not the primary with the leader lock."
        return message
|
|
|
|
|
|
class NodeIsReplica(PatroniResource):
    """Resource that asks the ``replica`` endpoint, optionally with a lag limit."""

    def __init__(
        self: "NodeIsReplica", connection_info: ConnectionInfo, max_lag: str
    ) -> None:
        """Remember the lag limit and forward ``connection_info`` to the parent.

        Args:
            connection_info: passed unchanged to ``PatroniResource.__init__``.
            max_lag: value appended as the ``lag`` query parameter; ``None``
                is accepted and means no ``lag`` parameter is sent.
        """
        super().__init__(connection_info)
        self.max_lag = max_lag

    def probe(self: "NodeIsReplica") -> Iterable[nagiosplugin.Metric]:
        """Call the ``replica`` endpoint and emit the ``is_replica`` metric.

        Returns:
            A one-element list holding the ``is_replica`` metric, whose value
            is 1 when the response status is 200 and 0 otherwise.
        """
        # Pick the endpoint first so the REST call is written only once.
        endpoint = "replica"
        if self.max_lag is not None:
            endpoint = f"replica?lag={self.max_lag}"

        response = self.rest_api(endpoint)
        _log.debug(f"api call status: {response.status}")
        _log.debug(f"api call data: {response.data}")

        if response.status == 200:
            value = 1
        else:
            value = 0
        return [nagiosplugin.Metric("is_replica", value)]
|
|
|
|
|
|
class NodeIsReplicaSummary(nagiosplugin.Summary):
    """Status-line messages for the replica check."""

    def __init__(self: "NodeIsReplicaSummary", lag: str) -> None:
        """Store the lag limit quoted in the messages (``None`` is accepted)."""
        self.lag = lag

    def ok(self: "NodeIsReplicaSummary", results: nagiosplugin.Result) -> str:
        """Return the OK message, mentioning the lag limit when one is set."""
        if self.lag is not None:
            return f"This node is a running replica with no noloadbalance tag and the lag is under {self.lag}."
        return "This node is a running replica with no noloadbalance tag."

    @handle_unknown
    def problem(self: "NodeIsReplicaSummary", results: nagiosplugin.Result) -> str:
        """Return the failure message, mentioning the lag limit when one is set.

        NOTE(review): wrapped by ``handle_unknown`` from ``.types``; its exact
        effect is not visible in this file.
        """
        if self.lag is not None:
            return f"This node is not a running replica with no noloadbalance tag and a lag under {self.lag}."
        return "This node is not a running replica with no noloadbalance tag."
|
|
|
|
|
|
class NodeIsPendingRestart(PatroniResource):
    """Resource that reads the ``pending_restart`` flag from the ``patroni`` endpoint."""

    def probe(self: "NodeIsPendingRestart") -> Iterable[nagiosplugin.Metric]:
        """Call ``rest_api("patroni")`` and emit the ``is_pending_restart`` metric.

        Returns:
            A one-element list holding the ``is_pending_restart`` metric: 1 when
            the decoded JSON body has a truthy ``pending_restart`` entry, 0 when
            the entry is falsy or absent.
        """
        response = self.rest_api("patroni")
        _log.debug(f"api call status: {response.status}")
        _log.debug(f"api call data: {response.data}")

        payload = json.loads(response.data)
        # A missing key is treated the same as an explicit False.
        flag = 1 if payload.get("pending_restart", False) else 0
        return [nagiosplugin.Metric("is_pending_restart", flag)]
|
|
|
|
|
|
class NodeIsPendingRestartSummary(nagiosplugin.Summary):
    """Status-line messages for the pending-restart check."""

    def ok(self: "NodeIsPendingRestartSummary", results: nagiosplugin.Result) -> str:
        """Return the fixed message for the OK state (``results`` is unused)."""
        message = "This node doesn't have the pending restart flag."
        return message

    @handle_unknown
    def problem(
        self: "NodeIsPendingRestartSummary", results: nagiosplugin.Result
    ) -> str:
        """Return the fixed message for a non-OK state (``results`` is unused).

        NOTE(review): wrapped by ``handle_unknown`` from ``.types``; its exact
        effect is not visible in this file.
        """
        message = "This node has the pending restart flag."
        return message
|
|
|
|
|
|
class NodeTLHasChanged(PatroniResource):
    """Resource that compares the node's current timeline with a reference one."""

    def __init__(
        self: "NodeTLHasChanged",
        connection_info: ConnectionInfo,
        timeline: str,  # Always contains the old timeline
        state_file: str,  # Only used to update the timeline in the state_file (when needed)
        save: bool,  # save timeline in state file
    ) -> None:
        """Store the reference timeline and the state-file settings.

        Args:
            connection_info: passed unchanged to ``PatroniResource.__init__``.
            timeline: the old (reference) timeline to compare against.
            state_file: path of the state file / cookie; ``None`` is accepted
                and disables saving.
            save: when true (and ``state_file`` is set), the timeline read
                from the API is written to the state file.
        """
        super().__init__(connection_info)
        self.state_file = state_file
        self.timeline = timeline
        self.save = save

    def probe(self: "NodeTLHasChanged") -> Iterable[nagiosplugin.Metric]:
        """Read the current timeline, optionally persist it, and emit metrics.

        Yields:
            ``is_timeline_changed``: 1 when the timeline from the API differs
            from the reference timeline (compared as strings), else 0.
            ``timeline``: the timeline value read from the API.

        Raises:
            KeyError: if the decoded API response has no ``timeline`` entry.
        """
        r = self.rest_api("patroni")
        _log.debug(f"api call status: {r.status}")
        _log.debug(f"api call data: {r.data}")

        item_dict = json.loads(r.data)
        new_tl = item_dict["timeline"]

        _log.debug(f"save result: {self.save}")
        old_tl = self.timeline
        if self.state_file is not None and self.save:
            _log.debug(f"saving new timeline to state file / cookie {self.state_file}")
            # The context manager opens the cookie, commits it when the body
            # finishes without an exception, and always closes it, so the
            # state file handle cannot leak if the write fails.
            with nagiosplugin.Cookie(self.state_file) as cookie:
                cookie["timeline"] = new_tl

        _log.debug(f"Tl data: old tl {old_tl}, new tl {new_tl}")

        # The actual check
        # Both sides are stringified so an int from JSON and a str reference
        # compare equal when they denote the same timeline.
        yield nagiosplugin.Metric(
            "is_timeline_changed",
            1 if str(new_tl) != str(old_tl) else 0,
        )

        # The performance data : the timeline number
        yield nagiosplugin.Metric("timeline", new_tl)
|
|
|
|
|
|
class NodeTLHasChangedSummary(nagiosplugin.Summary):
    """Status-line messages for the timeline-change check."""

    def __init__(self: "NodeTLHasChangedSummary", timeline: str) -> None:
        """Store the reference timeline quoted in the messages."""
        self.timeline = timeline

    def ok(self: "NodeTLHasChangedSummary", results: nagiosplugin.Result) -> str:
        """Return the OK message quoting the stored timeline."""
        message = f"The timeline is still {self.timeline}."
        return message

    @handle_unknown
    def problem(self: "NodeTLHasChangedSummary", results: nagiosplugin.Result) -> str:
        """Return the failure message with the expected and observed timelines.

        The observed value is read from the ``timeline`` entry of ``results``.

        NOTE(review): wrapped by ``handle_unknown`` from ``.types``; its exact
        effect is not visible in this file.
        """
        message = f"The expected timeline was {self.timeline} got {results['timeline'].metric}."
        return message
|
|
|
|
|
|
class NodePatroniVersion(PatroniResource):
    """Resource that compares the reported Patroni version with an expected one."""

    def __init__(
        self: "NodePatroniVersion",
        connection_info: ConnectionInfo,
        patroni_version: str,
    ) -> None:
        """Store the expected version and forward ``connection_info`` to the parent."""
        super().__init__(connection_info)
        self.patroni_version = patroni_version

    def probe(self: "NodePatroniVersion") -> Iterable[nagiosplugin.Metric]:
        """Call ``rest_api("patroni")`` and emit the ``is_version_ok`` metric.

        Returns:
            A one-element list holding the ``is_version_ok`` metric: 1 when
            ``["patroni"]["version"]`` of the decoded response equals the
            expected version, 0 otherwise.
        """
        response = self.rest_api("patroni")

        _log.debug(f"api call status: {response.status}")
        _log.debug(f"api call data: {response.data}")

        payload = json.loads(response.data)
        version = payload["patroni"]["version"]
        _log.debug(
            f"Version data: patroni version {version} input version {self.patroni_version}"
        )

        # The actual check
        if version == self.patroni_version:
            value = 1
        else:
            value = 0
        return [nagiosplugin.Metric("is_version_ok", value)]
|
|
|
|
|
|
class NodePatroniVersionSummary(nagiosplugin.Summary):
    """Status-line messages for the Patroni version check."""

    def __init__(self: "NodePatroniVersionSummary", patroni_version: str) -> None:
        """Store the expected version quoted in the messages."""
        self.patroni_version = patroni_version

    def ok(self: "NodePatroniVersionSummary", results: nagiosplugin.Result) -> str:
        """Return the OK message quoting the expected version."""
        message = f"Patroni's version is {self.patroni_version}."
        return message

    @handle_unknown
    def problem(self: "NodePatroniVersionSummary", results: nagiosplugin.Result) -> str:
        """Return the failure message; only the expected version is quoted.

        NOTE(review): wrapped by ``handle_unknown`` from ``.types``; its exact
        effect is not visible in this file.
        """
        # FIXME find a way to make the following work, check is perf data can be strings
        # return f"The expected patroni version was {self.patroni_version} got {results['patroni_version'].metric}."
        message = f"Patroni's version is not {self.patroni_version}."
        return message
|
|
|
|
|
|
class NodeIsAlive(PatroniResource):
    """Resource that asks the ``liveness`` endpoint whether the node is alive."""

    def probe(self: "NodeIsAlive") -> Iterable[nagiosplugin.Metric]:
        """Call ``rest_api("liveness")`` and emit the ``is_alive`` metric.

        Returns:
            A one-element list holding the ``is_alive`` metric, whose value
            is 1 when the response status is 200 and 0 otherwise.
        """
        response = self.rest_api("liveness")
        _log.debug(f"api call status: {response.status}")
        _log.debug(f"api call data: {response.data}")

        if response.status == 200:
            value = 1
        else:
            value = 0
        return [nagiosplugin.Metric("is_alive", value)]
|
|
|
|
|
|
class NodeIsAliveSummary(nagiosplugin.Summary):
    """Status-line messages for the liveness check."""

    def ok(self: "NodeIsAliveSummary", results: nagiosplugin.Result) -> str:
        """Return the fixed message for the OK state (``results`` is unused)."""
        message = "This node is alive (patroni is running)."
        return message

    @handle_unknown
    def problem(self: "NodeIsAliveSummary", results: nagiosplugin.Result) -> str:
        """Return the fixed message for a non-OK state (``results`` is unused).

        NOTE(review): wrapped by ``handle_unknown`` from ``.types``; its exact
        effect is not visible in this file.
        """
        message = "This node is not alive (patroni is not running)."
        return message
|