2021-08-12 11:38:55 +02:00
|
|
|
from typing import Iterable
|
2021-08-11 19:09:14 +02:00
|
|
|
|
2023-03-20 11:44:19 +01:00
|
|
|
import nagiosplugin
|
|
|
|
|
2023-03-16 13:11:40 +01:00
|
|
|
from . import _log
|
2023-03-20 11:44:19 +01:00
|
|
|
from .types import APIError, ConnectionInfo, PatroniResource, handle_unknown
|
2021-08-12 11:38:55 +02:00
|
|
|
|
2021-08-11 19:09:14 +02:00
|
|
|
|
|
|
|
class NodeIsPrimary(PatroniResource):
    """Resource reporting whether the ``primary`` API call succeeds."""

    def probe(self: "NodeIsPrimary") -> Iterable[nagiosplugin.Metric]:
        """Return a single ``is_primary`` metric.

        The metric is 1 when ``rest_api("primary")`` returns normally and 0
        when it raises ``APIError``.
        """
        is_primary = 1
        try:
            self.rest_api("primary")
        except APIError:
            is_primary = 0
        return [nagiosplugin.Metric("is_primary", is_primary)]
|
2021-08-11 19:09:14 +02:00
|
|
|
|
|
|
|
|
|
|
|
class NodeIsPrimarySummary(nagiosplugin.Summary):
    """Status-line texts for the "node is primary" check."""

    def ok(self: "NodeIsPrimarySummary", results: nagiosplugin.Result) -> str:
        """Return the text used for the OK state; ``results`` is not read."""
        message = "This node is the primary with the leader lock."
        return message

    @handle_unknown
    def problem(self: "NodeIsPrimarySummary", results: nagiosplugin.Result) -> str:
        """Return the text used for a non-OK state; ``results`` is not read here."""
        message = "This node is not the primary with the leader lock."
        return message
|
|
|
|
|
|
|
|
|
|
|
|
class NodeIsReplica(PatroniResource):
    """Resource reporting whether the ``replica`` API call succeeds."""

    def __init__(
        self: "NodeIsReplica", connection_info: ConnectionInfo, max_lag: str
    ) -> None:
        """Store the optional lag limit.

        ``max_lag`` is compared against ``None`` in ``probe``; when set it is
        appended to the request as ``?lag=<max_lag>``.
        """
        super().__init__(connection_info)
        self.max_lag = max_lag

    def probe(self: "NodeIsReplica") -> Iterable[nagiosplugin.Metric]:
        """Return a single ``is_replica`` metric.

        The metric is 1 when the API call returns normally and 0 when it
        raises ``APIError``.
        """
        # Pick the request first so the try block only wraps the API call.
        if self.max_lag is not None:
            endpoint = f"replica?lag={self.max_lag}"
        else:
            endpoint = "replica"

        try:
            self.rest_api(endpoint)
        except APIError:
            is_replica = 0
        else:
            is_replica = 1
        return [nagiosplugin.Metric("is_replica", is_replica)]
|
2021-08-11 19:09:14 +02:00
|
|
|
|
|
|
|
|
|
|
|
class NodeIsReplicaSummary(nagiosplugin.Summary):
    """Status-line texts for the "node is replica" check."""

    def __init__(self: "NodeIsReplicaSummary", lag: str) -> None:
        """Remember the lag limit (or ``None``) so the messages can mention it."""
        self.lag = lag

    def ok(self: "NodeIsReplicaSummary", results: nagiosplugin.Result) -> str:
        """Return the OK text, mentioning the lag limit when one was given."""
        if self.lag is not None:
            return f"This node is a running replica with no noloadbalance tag and the lag is under {self.lag}."
        return "This node is a running replica with no noloadbalance tag."

    @handle_unknown
    def problem(self: "NodeIsReplicaSummary", results: nagiosplugin.Result) -> str:
        """Return the non-OK text, mentioning the lag limit when one was given."""
        if self.lag is not None:
            return f"This node is not a running replica with no noloadbalance tag and a lag under {self.lag}."
        return "This node is not a running replica with no noloadbalance tag."
|
|
|
|
|
|
|
|
|
|
|
|
class NodeIsPendingRestart(PatroniResource):
    """Resource reporting the ``pending_restart`` entry of the ``patroni`` API reply."""

    def probe(self: "NodeIsPendingRestart") -> Iterable[nagiosplugin.Metric]:
        """Return a single ``is_pending_restart`` metric.

        The metric is 1 when the reply's ``pending_restart`` value is truthy,
        and 0 when it is falsy or the key is absent.
        """
        patroni_status = self.rest_api("patroni")

        if patroni_status.get("pending_restart", False):
            flag = 1
        else:
            flag = 0
        return [nagiosplugin.Metric("is_pending_restart", flag)]
|
|
|
|
|
|
|
|
|
|
|
|
class NodeIsPendingRestartSummary(nagiosplugin.Summary):
    """Status-line texts for the "pending restart" check."""

    def ok(self: "NodeIsPendingRestartSummary", results: nagiosplugin.Result) -> str:
        """Return the text used for the OK state; ``results`` is not read."""
        message = "This node doesn't have the pending restart flag."
        return message

    @handle_unknown
    def problem(
        self: "NodeIsPendingRestartSummary", results: nagiosplugin.Result
    ) -> str:
        """Return the text used for a non-OK state; ``results`` is not read here."""
        message = "This node has the pending restart flag."
        return message
|
|
|
|
|
|
|
|
|
|
|
|
class NodeTLHasChanged(PatroniResource):
    """Resource comparing the timeline in the ``patroni`` API reply with a reference one."""

    def __init__(
        self: "NodeTLHasChanged",
        connection_info: ConnectionInfo,
        timeline: str,  # Always contains the old timeline
        state_file: str,  # Only used to update the timeline in the state_file (when needed)
        save: bool,  # save timeline in state file
    ) -> None:
        """Store the reference timeline and the state-file settings.

        ``state_file`` may be ``None`` (``probe`` checks for it); the new
        timeline is only written when a state file is set and ``save`` is true.
        """
        super().__init__(connection_info)
        self.state_file = state_file
        self.timeline = timeline
        self.save = save

    def probe(self: "NodeTLHasChanged") -> Iterable[nagiosplugin.Metric]:
        """Yield the ``is_timeline_changed`` and ``timeline`` metrics.

        ``is_timeline_changed`` is 1 when the string forms of the reported and
        reference timelines differ, else 0. ``timeline`` carries the reported
        timeline. When requested, the reported timeline is stored in the
        state file before the metrics are produced.

        Raises:
            KeyError: if the API reply has no ``timeline`` key.
        """
        item_dict = self.rest_api("patroni")
        new_tl = item_dict["timeline"]

        _log.debug("save result: %(issave)s", {"issave": self.save})
        old_tl = self.timeline
        if self.state_file is not None and self.save:
            _log.debug(
                "saving new timeline to state file / cookie %(state_file)s",
                {"state_file": self.state_file},
            )
            # Use the cookie as a context manager: it is opened on entry,
            # committed when the body finishes without error, and closed in
            # every case, so a failure cannot leave the state file open.
            with nagiosplugin.Cookie(self.state_file) as cookie:
                cookie["timeline"] = new_tl

        _log.debug(
            "Tl data: old tl %(old_tl)s, new tl %(new_tl)s",
            {"old_tl": old_tl, "new_tl": new_tl},
        )

        # The actual check
        yield nagiosplugin.Metric(
            "is_timeline_changed",
            1 if str(new_tl) != str(old_tl) else 0,
        )

        # The performance data : the timeline number
        yield nagiosplugin.Metric("timeline", new_tl)
|
|
|
|
|
|
|
|
|
|
|
|
class NodeTLHasChangedSummary(nagiosplugin.Summary):
    """Status-line texts for the "timeline has changed" check."""

    def __init__(self: "NodeTLHasChangedSummary", timeline: str) -> None:
        """Remember the timeline value that both messages quote."""
        self.timeline = timeline

    def ok(self: "NodeTLHasChangedSummary", results: nagiosplugin.Result) -> str:
        """Return the OK text quoting the stored timeline."""
        expected = self.timeline
        return f"The timeline is still {expected}."

    @handle_unknown
    def problem(self: "NodeTLHasChangedSummary", results: nagiosplugin.Result) -> str:
        """Return the non-OK text with the stored timeline and the ``timeline`` result's metric."""
        expected = self.timeline
        observed = results["timeline"].metric
        return f"The expected timeline was {expected} got {observed}."
|
|
|
|
|
|
|
|
|
|
|
|
class NodePatroniVersion(PatroniResource):
    """Resource comparing the version in the ``patroni`` API reply with an expected one."""

    def __init__(
        self: "NodePatroniVersion",
        connection_info: ConnectionInfo,
        patroni_version: str,
    ) -> None:
        """Store the version string the reply is compared against."""
        super().__init__(connection_info)
        self.patroni_version = patroni_version

    def probe(self: "NodePatroniVersion") -> Iterable[nagiosplugin.Metric]:
        """Return a single ``is_version_ok`` metric.

        The metric is 1 when ``reply["patroni"]["version"]`` equals the
        expected version, else 0.
        """
        reply = self.rest_api("patroni")
        reported = reply["patroni"]["version"]

        log_values = {"version": reported, "patroni_version": self.patroni_version}
        _log.debug(
            "Version data: patroni version %(version)s input version %(patroni_version)s",
            log_values,
        )

        # The actual check
        if reported == self.patroni_version:
            is_version_ok = 1
        else:
            is_version_ok = 0
        return [nagiosplugin.Metric("is_version_ok", is_version_ok)]
|
|
|
|
|
|
|
|
|
|
|
|
class NodePatroniVersionSummary(nagiosplugin.Summary):
    """Status-line texts for the Patroni version check."""

    def __init__(self: "NodePatroniVersionSummary", patroni_version: str) -> None:
        """Remember the expected version that both messages quote."""
        self.patroni_version = patroni_version

    def ok(self: "NodePatroniVersionSummary", results: nagiosplugin.Result) -> str:
        """Return the OK text quoting the expected version."""
        expected = self.patroni_version
        return f"Patroni's version is {expected}."

    @handle_unknown
    def problem(self: "NodePatroniVersionSummary", results: nagiosplugin.Result) -> str:
        """Return the non-OK text quoting the expected version."""
        # FIXME find a way to also report the version actually found (e.g. from a
        # 'patroni_version' result); check whether perf data can be strings.
        expected = self.patroni_version
        return f"Patroni's version is not {expected}."
|
|
|
|
|
|
|
|
|
|
|
|
class NodeIsAlive(PatroniResource):
    """Resource reporting whether the ``liveness`` API call succeeds."""

    def probe(self: "NodeIsAlive") -> Iterable[nagiosplugin.Metric]:
        """Return a single ``is_alive`` metric.

        The metric is 1 when ``rest_api("liveness")`` returns normally and 0
        when it raises ``APIError``.
        """
        is_alive = 1
        try:
            self.rest_api("liveness")
        except APIError:
            is_alive = 0
        return [nagiosplugin.Metric("is_alive", is_alive)]
|
2021-08-11 19:09:14 +02:00
|
|
|
|
|
|
|
|
|
|
|
class NodeIsAliveSummary(nagiosplugin.Summary):
    """Status-line texts for the liveness check."""

    def ok(self: "NodeIsAliveSummary", results: nagiosplugin.Result) -> str:
        """Return the text used for the OK state; ``results`` is not read."""
        message = "This node is alive (patroni is running)."
        return message

    @handle_unknown
    def problem(self: "NodeIsAliveSummary", results: nagiosplugin.Result) -> str:
        """Return the text used for a non-OK state; ``results`` is not read here."""
        message = "This node is not alive (patroni is not running)."
        return message
|