check-patroni/check_patroni/node.py

import json
import logging

import nagiosplugin
from typing import Iterable

from .types import ConnectionInfo, handle_unknown, PatroniResource


# Module-wide logger. It is deliberately named "nagiosplugin" instead of
# __name__ -- presumably so these debug messages follow the logging setup of
# the nagiosplugin runtime (TODO confirm against the CLI entry point).
_log = logging.getLogger("nagiosplugin")


class NodeIsPrimary(PatroniResource):
    """Resource that reports whether the ``primary`` endpoint answers 200."""

    def probe(self: "NodeIsPrimary") -> Iterable[nagiosplugin.Metric]:
        """Call the ``primary`` endpoint and build the ``is_primary`` metric.

        Returns:
            A one-element list with the ``is_primary`` metric, valued 1 when
            the response status is 200 and 0 for any other status.
        """
        response = self.rest_api("primary")
        _log.debug(f"api call status: {response.status}")
        _log.debug(f"api call data: {response.data}")

        # Only the status code matters here; the body is logged but not parsed.
        if response.status == 200:
            is_primary = 1
        else:
            is_primary = 0
        return [nagiosplugin.Metric("is_primary", is_primary)]


class NodeIsPrimarySummary(nagiosplugin.Summary):
    """Status-line texts for the "node is primary" check."""

    def ok(self: "NodeIsPrimarySummary", results: nagiosplugin.Result) -> str:
        """Return the status line for the OK state (``results`` is unused)."""
        message = "This node is the primary with the leader lock."
        return message

    @handle_unknown
    def problem(self: "NodeIsPrimarySummary", results: nagiosplugin.Result) -> str:
        """Return the status line for a non-OK state (``results`` is unused).

        The method is wrapped by ``handle_unknown`` from ``.types``.
        """
        message = "This node is not the primary with the leader lock."
        return message


class NodeIsReplica(PatroniResource):
    """Resource that reports whether the ``replica`` endpoint answers 200."""

    def __init__(
        self: "NodeIsReplica", connection_info: ConnectionInfo, max_lag: str
    ) -> None:
        """Store the optional lag limit on top of the base resource setup.

        Args:
            connection_info: Forwarded unchanged to ``PatroniResource``.
            max_lag: Value sent as the ``lag`` query parameter; ``None``
                means the endpoint is queried without that parameter.
        """
        super().__init__(connection_info)
        self.max_lag = max_lag

    def probe(self: "NodeIsReplica") -> Iterable[nagiosplugin.Metric]:
        """Call the ``replica`` endpoint and build the ``is_replica`` metric.

        Returns:
            A one-element list with the ``is_replica`` metric, valued 1 when
            the response status is 200 and 0 for any other status.
        """
        # The lag filter is only appended when a limit was configured.
        endpoint = (
            "replica" if self.max_lag is None else f"replica?lag={self.max_lag}"
        )
        response = self.rest_api(endpoint)
        _log.debug(f"api call status: {response.status}")
        _log.debug(f"api call data: {response.data}")

        if response.status == 200:
            is_replica = 1
        else:
            is_replica = 0
        return [nagiosplugin.Metric("is_replica", is_replica)]


class NodeIsReplicaSummary(nagiosplugin.Summary):
    """Status-line texts for the "node is replica" check."""

    def __init__(self: "NodeIsReplicaSummary", lag: str) -> None:
        """Keep the lag limit so the messages can mention it.

        Args:
            lag: Lag limit to quote in the messages, or ``None`` to leave the
                lag out of the messages.
        """
        self.lag = lag

    def ok(self: "NodeIsReplicaSummary", results: nagiosplugin.Result) -> str:
        """Return the status line for the OK state (``results`` is unused)."""
        if self.lag is not None:
            return f"This node is a running replica with no noloadbalance tag and the lag is under {self.lag}."
        return "This node is a running replica with no noloadbalance tag."

    @handle_unknown
    def problem(self: "NodeIsReplicaSummary", results: nagiosplugin.Result) -> str:
        """Return the status line for a non-OK state (``results`` is unused).

        The method is wrapped by ``handle_unknown`` from ``.types``.
        """
        if self.lag is not None:
            return f"This node is not a running replica with no noloadbalance tag and a lag under {self.lag}."
        return "This node is not a running replica with no noloadbalance tag."


class NodeIsPendingRestart(PatroniResource):
    """Resource that reports the ``pending_restart`` flag of the node."""

    def probe(self: "NodeIsPendingRestart") -> Iterable[nagiosplugin.Metric]:
        """Read ``pending_restart`` from the ``patroni`` endpoint payload.

        Returns:
            A one-element list with the ``is_pending_restart`` metric, valued
            1 when the JSON payload has a truthy ``pending_restart`` entry
            and 0 when the entry is falsy or missing.
        """
        response = self.rest_api("patroni")
        _log.debug(f"api call status: {response.status}")
        _log.debug(f"api call data: {response.data}")

        payload = json.loads(response.data)
        # A missing key is treated the same as an explicit false flag.
        if payload.get("pending_restart", False):
            flag_value = 1
        else:
            flag_value = 0
        metric = nagiosplugin.Metric("is_pending_restart", flag_value)
        return [metric]


class NodeIsPendingRestartSummary(nagiosplugin.Summary):
    """Status-line texts for the "pending restart" check."""

    def ok(self: "NodeIsPendingRestartSummary", results: nagiosplugin.Result) -> str:
        """Return the status line for the OK state (``results`` is unused)."""
        message = "This node doesn't have the pending restart flag."
        return message

    @handle_unknown
    def problem(
        self: "NodeIsPendingRestartSummary", results: nagiosplugin.Result
    ) -> str:
        """Return the status line for a non-OK state (``results`` is unused).

        The method is wrapped by ``handle_unknown`` from ``.types``.
        """
        message = "This node has the pending restart flag."
        return message


class NodeTLHasChanged(PatroniResource):
    """Resource that compares the node's current timeline to a known one."""

    def __init__(
        self: "NodeTLHasChanged",
        connection_info: ConnectionInfo,
        timeline: str,  # Always contains the old timeline
        state_file: str,  # Only used to update the timeline in the state_file (when needed)
        save: bool,  # save timeline in state file
    ) -> None:
        """Store the reference timeline and the state-file settings.

        Args:
            connection_info: Forwarded unchanged to ``PatroniResource``.
            timeline: The old timeline the current one is compared against.
            state_file: Path handed to ``nagiosplugin.Cookie``; may be
                ``None``, in which case nothing is ever written.
            save: Whether the freshly read timeline is written to
                ``state_file``.
        """
        super().__init__(connection_info)
        self.state_file = state_file
        self.timeline = timeline
        self.save = save

    def probe(self: "NodeTLHasChanged") -> Iterable[nagiosplugin.Metric]:
        """Read the timeline from the ``patroni`` endpoint and compare it.

        When ``save`` is set and a state file is configured, the new timeline
        is stored in the cookie before the metrics are produced.

        Yields:
            ``is_timeline_changed``: 1 when the string forms of the new and
            old timelines differ, otherwise 0.
            ``timeline``: the timeline value read from the API, as
            performance data.

        Raises:
            KeyError: If the JSON payload has no ``timeline`` entry.
        """
        r = self.rest_api("patroni")
        _log.debug(f"api call status: {r.status}")
        _log.debug(f"api call data: {r.data}")

        item_dict = json.loads(r.data)
        new_tl = item_dict["timeline"]

        _log.debug(f"save result: {self.save}")
        old_tl = self.timeline
        if self.state_file is not None and self.save:
            _log.debug(f"saving new timeline to state file / cookie {self.state_file}")
            # The context manager opens the cookie, commits on a clean exit
            # and always closes it, so the file handle is released even if
            # storing or committing the value fails.
            with nagiosplugin.Cookie(self.state_file) as cookie:
                cookie["timeline"] = new_tl

        _log.debug(f"Tl data: old tl {old_tl}, new tl {new_tl}")

        # The actual check. Both sides are compared as strings because the
        # configured timeline is a str while the API value may not be.
        yield nagiosplugin.Metric(
            "is_timeline_changed",
            1 if str(new_tl) != str(old_tl) else 0,
        )

        # The performance data : the timeline number
        yield nagiosplugin.Metric("timeline", new_tl)


class NodeTLHasChangedSummary(nagiosplugin.Summary):
    """Status-line texts for the "timeline has changed" check."""

    def __init__(self: "NodeTLHasChangedSummary", timeline: str) -> None:
        """Keep the expected timeline so the messages can quote it.

        Args:
            timeline: The timeline value the check expected to find.
        """
        self.timeline = timeline

    def ok(self: "NodeTLHasChangedSummary", results: nagiosplugin.Result) -> str:
        """Return the status line for the OK state (``results`` is unused)."""
        message = f"The timeline is still {self.timeline}."
        return message

    @handle_unknown
    def problem(self: "NodeTLHasChangedSummary", results: nagiosplugin.Result) -> str:
        """Return the status line for a non-OK state.

        The message quotes the expected timeline next to the metric stored
        under the ``timeline`` result. The method is wrapped by
        ``handle_unknown`` from ``.types``.
        """
        message = f"The expected timeline was {self.timeline} got {results['timeline'].metric}."
        return message


class NodePatroniVersion(PatroniResource):
    """Resource that compares the reported Patroni version to an expected one."""

    def __init__(
        self: "NodePatroniVersion",
        connection_info: ConnectionInfo,
        patroni_version: str,
    ) -> None:
        """Store the expected version on top of the base resource setup.

        Args:
            connection_info: Forwarded unchanged to ``PatroniResource``.
            patroni_version: Version string the node is expected to report.
        """
        super().__init__(connection_info)
        self.patroni_version = patroni_version

    def probe(self: "NodePatroniVersion") -> Iterable[nagiosplugin.Metric]:
        """Read the version from the ``patroni`` endpoint and compare it.

        Returns:
            A one-element list with the ``is_version_ok`` metric, valued 1
            when the reported version equals the expected one and 0
            otherwise.

        Raises:
            KeyError: If the payload lacks the ``patroni`` / ``version``
                entries.
        """
        response = self.rest_api("patroni")

        _log.debug(f"api call status: {response.status}")
        _log.debug(f"api call data: {response.data}")

        payload = json.loads(response.data)
        version = payload["patroni"]["version"]
        _log.debug(
            f"Version data: patroni version  {version} input version {self.patroni_version}"
        )

        # The actual check: plain equality, no version parsing or ordering.
        if version == self.patroni_version:
            is_version_ok = 1
        else:
            is_version_ok = 0
        return [nagiosplugin.Metric("is_version_ok", is_version_ok)]


class NodePatroniVersionSummary(nagiosplugin.Summary):
    """Status-line texts for the Patroni version check."""

    def __init__(self: "NodePatroniVersionSummary", patroni_version: str) -> None:
        """Keep the expected version so the messages can quote it.

        Args:
            patroni_version: The version string the check expected.
        """
        self.patroni_version = patroni_version

    def ok(self: "NodePatroniVersionSummary", results: nagiosplugin.Result) -> str:
        """Return the status line for the OK state (``results`` is unused)."""
        message = f"Patroni's version is {self.patroni_version}."
        return message

    @handle_unknown
    def problem(self: "NodePatroniVersionSummary", results: nagiosplugin.Result) -> str:
        """Return the status line for a non-OK state (``results`` is unused).

        Only the expected version is quoted; the version actually found is
        not part of the message (see the FIXME below). The method is wrapped
        by ``handle_unknown`` from ``.types``.
        """
        # FIXME find a way to make the following work, check is perf data can be strings
        # return f"The expected patroni version was {self.patroni_version} got {results['patroni_version'].metric}."
        message = f"Patroni's version is not {self.patroni_version}."
        return message


class NodeIsAlive(PatroniResource):
    """Resource that reports whether the ``liveness`` endpoint answers 200."""

    def probe(self: "NodeIsAlive") -> Iterable[nagiosplugin.Metric]:
        """Call the ``liveness`` endpoint and build the ``is_alive`` metric.

        Returns:
            A one-element list with the ``is_alive`` metric, valued 1 when
            the response status is 200 and 0 for any other status.
        """
        response = self.rest_api("liveness")
        _log.debug(f"api call status: {response.status}")
        _log.debug(f"api call data: {response.data}")

        if response.status == 200:
            is_alive = 1
        else:
            is_alive = 0
        return [nagiosplugin.Metric("is_alive", is_alive)]


class NodeIsAliveSummary(nagiosplugin.Summary):
    """Status-line texts for the liveness check."""

    def ok(self: "NodeIsAliveSummary", results: nagiosplugin.Result) -> str:
        """Return the status line for the OK state (``results`` is unused)."""
        message = "This node is alive (patroni is running)."
        return message

    @handle_unknown
    def problem(self: "NodeIsAliveSummary", results: nagiosplugin.Result) -> str:
        """Return the status line for a non-OK state (``results`` is unused).

        The method is wrapped by ``handle_unknown`` from ``.types``.
        """
        message = "This node is not alive (patroni is not running)."
        return message