First working version

This commit is contained in:
benoit 2021-08-11 19:09:14 +02:00
parent aa17162871
commit 1e6adc6a1a
52 changed files with 2033 additions and 22 deletions

10
.flake8 Normal file
View file

@ -0,0 +1,10 @@
[flake8]
doctests = True
ignore =
E501, # line too long
exclude =
.git,
.mypy_cache,
.tox,
.venv,
mypy_config = mypy.ini

2
.gitignore vendored Normal file
View file

@ -0,0 +1,2 @@
check_patroni/__pycache__/
test/*.state_file

0
README.md Normal file
View file

View file

@ -1,17 +1,521 @@
import requests
import click
from configparser import ConfigParser
import nagiosplugin
import re
from typing import List
from . import __version__
from .cluster import (
ClusterConfigHasChanged,
ClusterConfigHasChangedSummary,
ClusterHasLeader,
ClusterHasLeaderSummary,
ClusterHasReplica,
ClusterNodeCount,
)
from .node import (
NodeIsAlive,
NodeIsAliveSummary,
NodeIsPendingRestart,
NodeIsPendingRestartSummary,
NodeIsPrimary,
NodeIsPrimarySummary,
NodeIsReplica,
NodeIsReplicaSummary,
NodePatroniVersion,
NodePatroniVersionSummary,
NodeTLHasChanged,
NodeTLHasChangedSummary,
)
from .types import ConnectionInfo
def check_is_master(address: str = "127.0.0.1", port: int = 8008):
r = requests.get(f"{address}:{int(port)}/leader")
return r.status_code == 200
def print_version(ctx: click.Context, param: str, value: str) -> None:
if not value or ctx.resilient_parsing:
return
click.echo(f"Version {__version__}")
ctx.exit()
def check_is_replica(address: str = "127.0.0.1", port: int = 8008):
r = requests.get(f"{address}:{int(port)}/replica")
return r.status_code == 200
DEFAULT_CFG = "config.ini"
def main() -> None:
print(check_is_master())
print(check_is_replica())
print("allgood")
def configure(ctx: click.Context, param: str, filename: str) -> None:
"""Use a config file for the parameters
stolen from https://jwodder.github.io/kbits/posts/click-config/
"""
# FIXME should use click-configfile / click-config-file ?
cfg = ConfigParser()
cfg.read(filename)
ctx.default_map = {}
for sect in cfg.sections():
command_path = sect.split(".")
if command_path[0] != "options":
continue
defaults = ctx.default_map
for cmdname in command_path[1:]:
defaults = defaults.setdefault(cmdname, {})
defaults.update(cfg[sect])
try:
# endpoints is a comma-separated list of addresses
if isinstance(defaults["endpoints"], str):
defaults["endpoints"] = re.split(r"\s*,\s*", defaults["endpoints"])
except KeyError:
pass
@click.group()
@click.option(
"--config",
type=click.Path(dir_okay=False),
default=DEFAULT_CFG,
callback=configure,
is_eager=True,
expose_value=False,
help="Read option defaults from the specified INI file",
show_default=True,
)
@click.option(
"-e",
"--endpoints",
"endpoints",
type=str,
multiple=True,
default=["http://127.0.0.1:8008"],
help="API endpoint. Can be specified multiple times.",
)
@click.option(
"--cert_file",
"cert_file",
type=str,
help="File with the client certificate.",
)
@click.option(
"--key_file",
"key_file",
type=str,
help="File with the client key.",
)
@click.option(
"--ca_file",
"ca_file",
type=str,
help="The CA certificate.",
)
@click.option(
"-v",
"--verbose",
"verbose",
count=True,
help="Increase verbosity -v (info)/-vv (warning)/-vvv (debug)",
)
@click.option(
"--version", is_flag=True, callback=print_version, expose_value=False, is_eager=True
)
@click.option(
"--timeout",
"timeout",
default=2,
type=int,
help="Timeout in seconds for the API queries (0 to disable)",
)
@click.pass_context
@nagiosplugin.guarded
def main(
ctx: click.Context,
endpoints: List[str],
cert_file: str,
key_file: str,
ca_file: str,
verbose: bool,
timeout: int,
) -> None:
"""Nagios plugin for patroni."""
ctx.obj = ConnectionInfo(endpoints, cert_file, key_file, ca_file)
# TODO Not all "is/has" services have the same return code for ok. Check if it's ok
# Typing
@main.command(name="cluster_node_count") # required otherwise _ are converted to -
@click.option(
"-w",
"--warning",
"warning",
type=str,
help="Warning threshold for the number of nodes.",
)
@click.option(
"-c",
"--critical",
"critical",
type=str,
help="Critical threshold for the nimber of nodes.",
)
@click.option(
"--running-warning",
"running_warning",
type=str,
help="Warning threshold for the number of running nodes.",
)
@click.option(
"--running-critical",
"running_critical",
type=str,
help="Critical threshold for the nimber of running nodes.",
)
@click.pass_context
@nagiosplugin.guarded
def cluster_node_count(
ctx: click.Context,
warning: str,
critical: str,
running_warning: str,
running_critical: str,
) -> None:
"""Count the number of nodes in the cluster.
\b
Check:
* Compares the total number of nodes and the number of running nodes against their respective warning and critical thresholds.
* `OK` if no thresholds are provided.
\b
Perfdata:
* `members`: the member count.
* one counter per member role (`role_<role name>`) and one per non-running member state (`state_<state name>`).
"""
check = nagiosplugin.Check()
check.add(
ClusterNodeCount(ctx.obj),
nagiosplugin.ScalarContext(
"members",
warning,
critical,
),
nagiosplugin.ScalarContext(
"state_running",
running_warning,
running_critical,
),
nagiosplugin.ScalarContext("members_roles"),
nagiosplugin.ScalarContext("members_statuses"),
)
check.main(
verbose=ctx.parent.params["verbose"], timeout=ctx.parent.params["timeout"]
)
@main.command(name="cluster_has_leader")
@click.pass_context
@nagiosplugin.guarded
def cluster_has_leader(ctx: click.Context) -> None:
"""Check if the cluster has a leader.
\b
Check:
* `OK`: if there is a leader node.
* `CRITICAL`: otherwise.
Perfdata: `has_leader` is 1 if there is a leader node, 0 otherwise.
"""
# TODO: Manage primary or standby leader in the same place ?
check = nagiosplugin.Check()
check.add(
ClusterHasLeader(ctx.obj),
nagiosplugin.ScalarContext("has_leader", None, "@0:0"),
ClusterHasLeaderSummary(),
)
check.main(
verbose=ctx.parent.params["verbose"], timeout=ctx.parent.params["timeout"]
)
@main.command(name="cluster_has_replica")
@click.option(
"-w",
"--warning",
"warning",
type=str,
help="Warning threshold for the number of nodes.",
)
@click.option(
"-c",
"--critical",
"critical",
type=str,
help="Critical threshold for the number of replica nodes.",
)
@click.option(
"--lag-warning", "lag_warning", type=str, help="Warning threshold for the lag."
)
# FIXME how do we manage maximum_lag_on_failover without doing many api calls
@click.option(
"--lag-critical", "lag_critical", type=str, help="Critical threshold for the lag."
)
@click.pass_context
@nagiosplugin.guarded
def cluster_has_replica(
ctx: click.Context, warning: str, critical: str, lag_warning: str, lag_critical: str
) -> None:
"""Check if the cluster has replicas and their lag.
\b
Check:
* `OK`: if the replica count and the lag of each replica are within the provided thresholds.
* `WARNING` / `CRITICAL`: otherwise.
\b
Perfdata:
* the replica count
* the lag of each replica, labelled `<member name>_lag`
"""
# FIXME the idea here is to make sure we have a replica.
# lag should be checked to prune invalid replicas
check = nagiosplugin.Check()
check.add(
ClusterHasReplica(ctx.obj),
nagiosplugin.ScalarContext(
"replica_count",
warning,
critical,
),
nagiosplugin.ScalarContext(
"replica_lag",
lag_warning,
lag_critical,
),
)
check.main(
verbose=ctx.parent.params["verbose"], timeout=ctx.parent.params["timeout"]
)
@main.command(name="cluster_config_has_changed")
@click.option("--hash", "config_hash", type=str, help="A hash to compare with.")
@click.option(
"-s",
"--state-file",
"state_file",
type=str,
help="A state file to store the tl number into.",
)
@click.pass_context
@nagiosplugin.guarded
def cluster_config_has_changed(
ctx: click.Context, config_hash: str, state_file: str
) -> None:
"""Check if the hash of the configuration has changed.
Note: either a hash or a state file must be provided for this service to work.
\b
Check:
* `OK`: The hash didn't change
* `CRITICAL`: The hash of the configuration has changed compared to the input (`--hash`) or last time (`--state-file`).
\b
Perfdata:
* `is_configuration_changed` is 1 if the configuration has changed, 0 otherwise
"""
# FIXME hash in perfdata ?
if (config_hash is None and state_file is None) or (
config_hash is not None and state_file is not None
):
raise click.UsageError(
"Either --hash or --state-file should be provided for this service", ctx
)
check = nagiosplugin.Check()
check.add(
ClusterConfigHasChanged(ctx.obj, config_hash, state_file),
nagiosplugin.ScalarContext("is_configuration_changed", None, "@1:1"),
ClusterConfigHasChangedSummary(),
)
check.main(
verbose=ctx.parent.params["verbose"], timeout=ctx.parent.params["timeout"]
)
@main.command(name="node_is_primary")
@click.pass_context
@nagiosplugin.guarded
def node_is_primary(ctx: click.Context) -> None:
"""Check if the node is the primary with the leader lock.
\b
Check:
* `OK`: if the node is a primary with the leader lock.
* `CRITICAL`: otherwise.
Perfdata: `is_primary` is 1 if the node is a primary with the leader lock, 0 otherwise.
"""
check = nagiosplugin.Check()
check.add(
NodeIsPrimary(ctx.obj),
nagiosplugin.ScalarContext("is_primary", None, "@0:0"),
NodeIsPrimarySummary(),
)
check.main(
verbose=ctx.parent.params["verbose"], timeout=ctx.parent.params["timeout"]
)
@main.command(name="node_is_replica")
@click.option("--lag", "lag", type=str, help="maximum allowed lag")
@click.pass_context
@nagiosplugin.guarded
def node_is_replica(ctx: click.Context, lag: str) -> None:
"""Check if the node is a running replica with no noloadbalance tag.
\b
Check:
* `OK`: if the node is a running replica with no noloadbalance tag and the lag is under the maximum threshold.
* `CRITICAL`: otherwise.
Perfdata: `is_replica` is 1 if the node is a running replica with no noloadbalance tag and the lag is under the maximum threshold, 0 otherwise.
"""
# add a lag check ??
check = nagiosplugin.Check()
check.add(
NodeIsReplica(ctx.obj, lag),
nagiosplugin.ScalarContext("is_replica", None, "@0:0"),
NodeIsReplicaSummary(lag),
)
check.main(
verbose=ctx.parent.params["verbose"], timeout=ctx.parent.params["timeout"]
)
@main.command(name="node_is_pending_restart")
@click.pass_context
@nagiosplugin.guarded
def node_is_pending_restart(ctx: click.Context) -> None:
"""Check if the node is in pending restart state.
This situation can arise if the configuration has been modified but
requires a restart of PostgreSQL.
\b
Check:
* `OK`: if the node has no pending restart tag.
* `CRITICAL`: otherwise.
Perfdata: `is_pending_restart` is 1 if the node has pending restart tag, 0 otherwise.
"""
check = nagiosplugin.Check()
check.add(
NodeIsPendingRestart(ctx.obj),
nagiosplugin.ScalarContext("is_pending_restart", None, "@1:1"),
NodeIsPendingRestartSummary(),
)
check.main(
verbose=ctx.parent.params["verbose"], timeout=ctx.parent.params["timeout"]
)
@main.command(name="node_tl_has_changed")
@click.option(
"--timeline", "timeline", type=str, help="A timeline number to compare with."
)
@click.option(
"-s",
"--state-file",
"state_file",
type=str,
help="A state file to store the last tl number into.",
)
@click.pass_context
@nagiosplugin.guarded
def node_tl_has_changed(ctx: click.Context, timeline: str, state_file: str) -> None:
"""Check if the timeline hash changed.
Note: either a timeline or a state file must be provided for this service to work.
\b
Check:
* `OK`: The timeline is the same as last time (`--state-file`) or the provided timeline (`--timeline`).
* `CRITICAL`: The timeline is not the same.
\b
Perfdata:
* `is_timeline_changed` is 1 if the timeline has changed, 0 otherwise
* `timeline` is the current timeline number
"""
if (timeline is None and state_file is None) or (
timeline is not None and state_file is not None
):
raise click.UsageError(
"Either --timeline or --state-file should be provided for this service", ctx
)
check = nagiosplugin.Check()
check.add(
NodeTLHasChanged(ctx.obj, timeline, state_file),
nagiosplugin.ScalarContext("is_timeline_changed", None, "@1:1"),
nagiosplugin.ScalarContext("timeline"),
NodeTLHasChangedSummary(timeline),
)
check.main(
verbose=ctx.parent.params["verbose"], timeout=ctx.parent.params["timeout"]
)
@main.command(name="node_patroni_version")
@click.option(
"--patroni-version",
"patroni_version",
type=str,
help="Patroni version to compare to",
required=True,
)
@click.pass_context
@nagiosplugin.guarded
def node_patroni_version(ctx: click.Context, patroni_version: str) -> None:
"""Check if the version is equal to the input
\b
Check:
* `OK`: The version is the same as the input `--patroni-version`
* `CRITICAL`: otherwise.
\b
Perfdata:
* `is_version_ok` is 1 if version is ok, 0 otherwise
"""
# TODO the version cannot be written in perfdata find something else ?
check = nagiosplugin.Check()
check.add(
NodePatroniVersion(ctx.obj, patroni_version),
nagiosplugin.ScalarContext("is_version_ok", None, "@0:0"),
nagiosplugin.ScalarContext("patroni_version"),
NodePatroniVersionSummary(patroni_version),
)
check.main(
verbose=ctx.parent.params["verbose"], timeout=ctx.parent.params["timeout"]
)
@main.command(name="node_is_alive")
@click.pass_context
@nagiosplugin.guarded
def node_is_alive(ctx: click.Context) -> None:
"""Check if the node is alive ie patroni is running.
\b
Check:
* `OK`: If patroni is running.
* `CRITICAL`: otherwise.
\b
Perfdata:
* `is_alive` is 1 if patroni is running, 0 otherwise
"""
check = nagiosplugin.Check()
check.add(
NodeIsAlive(ctx.obj),
nagiosplugin.ScalarContext("is_alive", None, "@0:0"),
NodeIsAliveSummary(),
)
check.main(
verbose=ctx.parent.params["verbose"], timeout=ctx.parent.params["timeout"]
)
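
The `@start:end` threshold strings used throughout follow the Nagios plugin range convention: with a leading `@`, the alert is raised when the value falls inside the range, so `@0:0` turns critical when the metric is 0 and `@1:1` when it is 1. Below is a minimal sketch of driving the command group with Click's test runner, as the test suite further down does; the endpoint and thresholds are illustrative and, without the mocked rest_api used by the tests, the exit code reflects a live Patroni REST API:

    from click.testing import CliRunner

    from check_patroni.cli import main

    runner = CliRunner()
    result = runner.invoke(
        main,
        [
            "-e",
            "https://10.20.199.3:8008",
            "cluster_node_count",
            "--warning",
            "@0:1",
            "--critical",
            "@2",
        ],
    )
    print(result.exit_code)  # 0 OK, 1 WARNING, 2 CRITICAL, 3 UNKNOWN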

164
check_patroni/cluster.py Normal file
View file

@ -0,0 +1,164 @@
from collections import Counter
import hashlib
import json
import logging
import nagiosplugin
from .types import PatroniResource, ConnectionInfo, handle_unknown
_log = logging.getLogger("nagiosplugin")
def replace_chars(text: str) -> str:
return text.replace("'", "").replace(" ", "_")
class ClusterNodeCount(PatroniResource):
def probe(self: "ClusterNodeCount") -> nagiosplugin.Metric:
r = self.rest_api("cluster")
# FIXME RC <> 200 ?
_log.debug(f"api call status: {r.status}")
_log.debug(f"api call data: {r.data}")
item_dict = json.loads(r.data)
role_counters = Counter()
roles = []
status_counters = Counter()
statuses = []
for member in item_dict["members"]:
roles.append(replace_chars(member["role"]))
statuses.append(replace_chars(member["state"]))
role_counters.update(roles)
status_counters.update(statuses)
# The actual check: members, running state
yield nagiosplugin.Metric("members", len(item_dict["members"]))
yield nagiosplugin.Metric("state_running", status_counters["running"])
# The performance data : role
for role in role_counters:
yield nagiosplugin.Metric(
f"role_{role}", role_counters[role], context="members_roles"
)
# The performance data : statuses (except running)
for state in status_counters:
if state != "running":
yield nagiosplugin.Metric(
f"state_{state}", status_counters[state], context="members_statuses"
)
class ClusterHasLeader(PatroniResource):
def probe(self: "ClusterHasLeader") -> nagiosplugin.Metric:
r = self.rest_api("cluster")
# FIXME RC <> 200 ?
_log.debug(f"api call status: {r.status}")
_log.debug(f"api call data: {r.data}")
item_dict = json.loads(r.data)
is_leader_found = False
for member in item_dict["members"]:
if member["role"] == "leader" and member["state"] == "running":
is_leader_found = True
break
return [
nagiosplugin.Metric(
"has_leader",
1 if is_leader_found else 0,
)
]
class ClusterHasLeaderSummary(nagiosplugin.Summary):
def ok(self: "ClusterHasLeaderSummary", results: nagiosplugin.Result) -> str:
return "The cluster has a running leader."
@handle_unknown
def problem(self: "ClusterHasLeaderSummary", results: nagiosplugin.Result) -> str:
return "The cluster has no running leader."
class ClusterHasReplica(PatroniResource):
def probe(self: "ClusterHasReplica") -> nagiosplugin.Metric:
r = self.rest_api("cluster")
# FIXME RC <> 200 ?
_log.debug(f"api call status: {r.status}")
_log.debug(f"api call data: {r.data}")
item_dict = json.loads(r.data)
replicas = []
for member in item_dict["members"]:
# FIXME are there other acceptable states
if member["role"] == "replica" and member["state"] == "running":
# FIXME which lag ?
replicas.append({"name": member["name"], "lag": member["lag"]})
# The actual check
yield nagiosplugin.Metric("replica_count", len(replicas))
# The performance data : replicas lag
for replica in replicas:
yield nagiosplugin.Metric(
f"{replica['name']}_lag", replica["lag"], context="replica_lag"
)
# FIXME is this needed ??
# class ClusterHasReplicaSummary(nagiosplugin.Summary):
# def ok(self, results):
# def problem(self, results):
class ClusterConfigHasChanged(PatroniResource):
def __init__(
self: "ClusterConfigHasChanged",
connection_info: ConnectionInfo,
config_hash: str,
state_file: str,
):
super().__init__(connection_info)
self.state_file = state_file
self.config_hash = config_hash
def probe(self: "ClusterConfigHasChanged") -> nagiosplugin.Metric:
r = self.rest_api("config")
# FIXME RC <> 200 ?
_log.debug(f"api call status: {r.status}")
_log.debug(f"api call data: {r.data}")
new_hash = hashlib.md5(r.data).hexdigest()
if self.state_file is not None:
_log.debug(f"Using state file / cookie {self.state_file}")
cookie = nagiosplugin.Cookie(self.state_file)
cookie.open()
old_hash = cookie.get("hash")
cookie["hash"] = new_hash
cookie.commit()
else:
_log.debug(f"Using input value {self.config_hash}")
old_hash = self.config_hash
_log.debug(f"hash info: old hash {old_hash}, new hash {new_hash}")
return [
nagiosplugin.Metric(
"is_configuration_changed",
1 if new_hash != old_hash else 0,
)
]
class ClusterConfigHasChangedSummary(nagiosplugin.Summary):
def ok(self: "ClusterConfigHasChangedSummary", results: nagiosplugin.Result) -> str:
return "The hash of patroni's dynamic configuration has not changed."
@handle_unknown
def problem(
self: "ClusterConfigHasChangedSummary", results: nagiosplugin.Result
) -> str:
return "The hash of patroni's dynamic configuration has changed."

205
check_patroni/node.py Normal file
View file

@ -0,0 +1,205 @@
import json
import logging
import nagiosplugin
from .types import ConnectionInfo, handle_unknown, PatroniResource
_log = logging.getLogger("nagiosplugin")
class NodeIsPrimary(PatroniResource):
def probe(self: "NodeIsPrimary") -> nagiosplugin.Metric:
r = self.rest_api("primary")
_log.debug(f"api call status: {r.status}")
_log.debug(f"api call data: {r.data}")
return [nagiosplugin.Metric("is_primary", 1 if r.status == 200 else 0)]
class NodeIsPrimarySummary(nagiosplugin.Summary):
def ok(self: "NodeIsPrimarySummary", results: nagiosplugin.Result) -> str:
return "This node is the primary with the leader lock."
@handle_unknown
def problem(self: "NodeIsPrimarySummary", results: nagiosplugin.Result) -> str:
return "This node is not the primary with the leader lock."
class NodeIsReplica(PatroniResource):
def __init__(
self: "NodeIsReplica", connection_info: ConnectionInfo, lag: str
) -> None:
super().__init__(connection_info)
self.lag = lag
def probe(self: "NodeIsReplica") -> nagiosplugin.Metric:
if self.lag is None:
r = self.rest_api("replica")
else:
r = self.rest_api(f"replica?lag={self.lag}")
_log.debug(f"api call status: {r.status}")
_log.debug(f"api call data: {r.data}")
return [nagiosplugin.Metric("is_replica", 1 if r.status == 200 else 0)]
class NodeIsReplicaSummary(nagiosplugin.Summary):
def __init__(self: "NodeIsReplicaSummary", lag: str) -> None:
self.lag = lag
def ok(self: "NodeIsReplicaSummary", results: nagiosplugin.Result) -> str:
if self.lag is None:
return "This node is a running replica with no noloadbalance tag."
return f"This node is a running replica with no noloadbalance tag and the lag is under {self.lag}."
@handle_unknown
def problem(self: "NodeIsReplicaSummary", results: nagiosplugin.Result) -> str:
if self.lag is None:
return "This node is not a running replica with no noloadbalance tag."
return f"This node is not a running replica with no noloadbalance tag and a lag under {self.lag}."
class NodeIsPendingRestart(PatroniResource):
def probe(self: "NodeIsPendingRestart") -> nagiosplugin.Metric:
r = self.rest_api("patroni")
# FIXME RC <> 200 ?
_log.debug(f"api call status: {r.status}")
_log.debug(f"api call data: {r.data}")
item_dict = json.loads(r.data)
is_pending_restart = item_dict.get("pending_restart", False)
return [
nagiosplugin.Metric(
"is_pending_restart",
1 if is_pending_restart else 0,
)
]
class NodeIsPendingRestartSummary(nagiosplugin.Summary):
def ok(self: "NodeIsPendingRestartSummary", results: nagiosplugin.Result) -> str:
return "This node doesn't have the pending restart flag."
@handle_unknown
def problem(
self: "NodeIsPendingRestartSummary", results: nagiosplugin.Result
) -> str:
return "This node has the pending restart flag."
class NodeTLHasChanged(PatroniResource):
def __init__(
self: "NodeTLHasChanged",
connection_info: ConnectionInfo,
timeline: str,
state_file: str,
) -> None:
super().__init__(connection_info)
self.state_file = state_file
self.timeline = timeline
def probe(self: "NodeTLHasChanged") -> nagiosplugin.Metric:
r = self.rest_api("patroni")
# FIXME RC <> 200 ?
_log.debug(f"api call status: {r.status}")
_log.debug(f"api call data: {r.data}")
item_dict = json.loads(r.data)
new_tl = item_dict["timeline"]
if self.state_file is not None:
_log.debug(f"Using state file / cookie {self.state_file}")
cookie = nagiosplugin.Cookie(self.state_file)
cookie.open()
old_tl = cookie.get("timeline")
cookie["timeline"] = new_tl
cookie.commit()
else:
_log.debug(f"Using input value {self.timeline}")
old_tl = self.timeline
_log.debug(f"Tl data: old tl {old_tl}, new tl {new_tl}")
# The actual check
yield nagiosplugin.Metric(
"is_timeline_changed",
1 if str(new_tl) != str(old_tl) else 0,
)
# The performance data : the timeline number
yield nagiosplugin.Metric("timeline", new_tl)
class NodeTLHasChangedSummary(nagiosplugin.Summary):
def __init__(self: "NodeTLHasChangedSummary", timeline: str) -> None:
self.timeline = timeline
def ok(self: "NodeTLHasChangedSummary", results: nagiosplugin.Result) -> str:
return f"The timeline is still {self.timeline}."
@handle_unknown
def problem(self: "NodeTLHasChangedSummary", results: nagiosplugin.Result) -> str:
return f"The expected timeline was {self.timeline} got {results['timeline'].metric}."
class NodePatroniVersion(PatroniResource):
def __init__(
self: "NodePatroniVersion",
connection_info: ConnectionInfo,
patroni_version: str,
) -> None:
super().__init__(connection_info)
self.patroni_version = patroni_version
def probe(self: "NodePatroniVersion") -> nagiosplugin.Metric:
r = self.rest_api("patroni")
# FIXME RC <> 200 ?
_log.debug(f"api call status: {r.status}")
_log.debug(f"api call data: {r.data}")
item_dict = json.loads(r.data)
version = item_dict["patroni"]["version"]
_log.debug(
f"Version data: patroni version {version} input version {self.patroni_version}"
)
# The actual check
return [
nagiosplugin.Metric(
"is_version_ok",
1 if version == self.patroni_version else 0,
)
]
class NodePatroniVersionSummary(nagiosplugin.Summary):
def __init__(self: "NodePatroniVersionSummary", patroni_version: str) -> None:
self.patroni_version = patroni_version
def ok(self: "NodePatroniVersionSummary", results: nagiosplugin.Result) -> str:
return f"Patroni's version is {self.patroni_version}."
@handle_unknown
def problem(self: "NodePatroniVersionSummary", results: nagiosplugin.Result) -> str:
# FIXME find a way to make the following work, check if perf data can be strings
# return f"The expected patroni version was {self.patroni_version} got {results['patroni_version'].metric}."
return f"Patroni's version is not {self.patroni_version}."
class NodeIsAlive(PatroniResource):
def probe(self: "NodeIsAlive") -> nagiosplugin.Metric:
r = self.rest_api("liveness")
_log.debug(f"api call status: {r.status}")
_log.debug(f"api call data: {r.data}")
return [nagiosplugin.Metric("is_alive", 1 if r.status == 200 else 0)]
class NodeIsAliveSummary(nagiosplugin.Summary):
def ok(self: "NodeIsAliveSummary", results: nagiosplugin.Result) -> str:
return "This node is alive (patroni is running)."
@handle_unknown
def problem(self: "NodeIsAliveSummary", results: nagiosplugin.Result) -> str:
return "This node is not alive (patroni is not running)."

63
check_patroni/types.py Normal file
View file

@ -0,0 +1,63 @@
import attr
import logging
import nagiosplugin
import urllib3
from typing import Any, Callable, List
_log = logging.getLogger("nagiosplugin")
@attr.s(auto_attribs=True, frozen=True, slots=True)
class ConnectionInfo:
endpoints: List[str] = ["http://127.0.0.1:8008"]
cert_file: str = "./ssl/benoit-dalibo-cert.pem"
key_file: str = "./ssl/benoit-dalibo-key.pem"
ca_cert: str = "./ssl/CA-cert.pem"
@attr.s(auto_attribs=True, slots=True)
class PatroniResource(nagiosplugin.Resource):
conn_info: ConnectionInfo
def rest_api(
self: "PatroniResource", service: str
) -> urllib3.response.HTTPResponse:
"""Try to connect to all the provided endpoints for the requested service"""
for endpoint in self.conn_info.endpoints:
try:
if endpoint[:5] == "https":
pool = urllib3.PoolManager(
cert_reqs="CERT_REQUIRED",
cert_file=self.conn_info.cert_file,
key_file=self.conn_info.key_file,
ca_certs=self.conn_info.ca_cert,
)
else:
pool = urllib3.PoolManager()
_log.debug(f"Trying to connect to {endpoint}/{service}")
return pool.request(
"GET",
f"{endpoint}/{service}",
)
except nagiosplugin.Timeout as e:
raise e
except Exception as e:
_log.debug(e)
continue
raise nagiosplugin.CheckError("Connection failed for all provided endpoints")
HandleUnknown = Callable[[nagiosplugin.Summary, nagiosplugin.Result], Any]
def handle_unknown(action: HandleUnknown) -> HandleUnknown:
"""decorator to handle the unknown state in Summary.problem"""
def wrapper(summary: nagiosplugin.Summary, results: nagiosplugin.Result) -> Any:
if results.most_significant[0].state.code == 3:
"""get the appropriate message for all unknown error"""
return results.most_significant[0].hint
return action(summary, results)
return wrapper
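
The rest_api() method above tries each endpoint in turn and only falls through to the next one when a request fails, switching to a certificate-checking PoolManager for https URLs. A short sketch of that fallback (addresses and certificate paths taken from the sample config.ini below; a reachable endpoint is assumed):

    conn = ConnectionInfo(
        endpoints=["https://10.20.199.3:8008", "https://10.20.199.4:8008"],
        cert_file="./ssl/benoit-dalibo-cert.pem",
        key_file="./ssl/benoit-dalibo-key.pem",
        ca_cert="./ssl/CA-cert.pem",
    )
    r = PatroniResource(conn).rest_api("cluster")
    print(r.status, len(r.data))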

9
config.ini Normal file
View file

@ -0,0 +1,9 @@
[options]
endpoints = https://10.20.199.3:8008, https://10.20.199.4:8008,https://10.20.199.5:8008
cert_file = ./ssl/benoit-dalibo-cert.pem
key_file = ./ssl/benoit-dalibo-key.pem
ca_file = ./ssl/CA-cert.pem
timeout = 0
[options.node_is_replica]
lag=100
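
For reference, this is roughly the Click default_map that the configure() callback in cli.py builds from the file above (a sketch: ConfigParser keeps every value as a string, only endpoints is split into a list, and the node_is_replica section becomes defaults for that sub-command):

    default_map = {
        "endpoints": [
            "https://10.20.199.3:8008",
            "https://10.20.199.4:8008",
            "https://10.20.199.5:8008",
        ],
        "cert_file": "./ssl/benoit-dalibo-cert.pem",
        "key_file": "./ssl/benoit-dalibo-key.pem",
        "ca_file": "./ssl/CA-cert.pem",
        "timeout": "0",
        "node_is_replica": {"lag": "100"},
    }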

5
mypy.ini Normal file
View file

@ -0,0 +1,5 @@
[mypy]
# nagiosplugin => Skipping analyzing "nagiosplugin": found module but no type hints or library stubs [import]
ignore_missing_imports = true
show_error_codes = true
strict = true

View file

@ -19,24 +19,29 @@ def get_version() -> str:
setup(
name="check_patroni",
version=get_version(),
# author="Dalibo",
# author_email="contact@dalibo.com",
# author="Dalibo",
# author_email="contact@dalibo.com",
packages=find_packages("."),
include_package_data=True,
# url="https://github.com/dalibo/pg_activity",
# url="https://github.com/dalibo/pg_activity",
license="PostgreSQL",
description="Nagios plugin to check on patroni",
long_description=long_description,
long_description_content_type="text/markdown",
# classifiers=[
# "Development Status :: 5 - Production/Stable",
# "Environment :: Console",
# "License :: OSI Approved :: PostgreSQL License",
# "Programming Language :: Python :: 3",
# "Topic :: Database",
# ],
keywords="patroni nagios cehck",
# classifiers=[
# "Development Status :: 5 - Production/Stable",
# "Environment :: Console",
# "License :: OSI Approved :: PostgreSQL License",
# "Programming Language :: Python :: 3",
# "Topic :: Database",
# ],
keywords="patroni nagios check",
python_requires=">=3.6",
install_requires=[
"urllib3 >= 1.26.6",
"nagiosplugin >= 1.3.2",
"click >= 8.0.1",
],
extras_require={
"dev": [
"black",
@ -44,6 +49,10 @@ setup(
"flake8",
"mypy",
],
"test": [
"pytest",
"pytest-mock",
],
},
entry_points={
"console_scripts": [

Binary file not shown.

Binary file not shown.

View file

@ -0,0 +1,16 @@
{
"loop_wait": 10,
"master_start_timeout": 300,
"postgresql": {
"parameters": {
"archive_command": "pgbackrest --stanza=main archive-push %p",
"archive_mode": "on",
"max_connections": 300,
"restore_command": "pgbackrest --stanza=main archive-get %f \"%p\""
},
"use_pg_rewind": false,
"use_slot": true
},
"retry_timeout": 10,
"ttl": 30
}

View file

@ -0,0 +1,33 @@
{
"members": [
{
"name": "srv1",
"role": "replica",
"state": "running",
"api_url": "https://10.20.199.3:8008/patroni",
"host": "10.20.199.3",
"port": 5432,
"timeline": 51
},
{
"name": "srv2",
"role": "replica",
"state": "running",
"api_url": "https://10.20.199.4:8008/patroni",
"host": "10.20.199.4",
"port": 5432,
"timeline": 51,
"lag": 0
},
{
"name": "srv3",
"role": "replica",
"state": "running",
"api_url": "https://10.20.199.5:8008/patroni",
"host": "10.20.199.5",
"port": 5432,
"timeline": 51,
"lag": 0
}
]
}

View file

@ -0,0 +1,33 @@
{
"members": [
{
"name": "srv1",
"role": "leader",
"state": "running",
"api_url": "https://10.20.199.3:8008/patroni",
"host": "10.20.199.3",
"port": 5432,
"timeline": 51
},
{
"name": "srv2",
"role": "replica",
"state": "running",
"api_url": "https://10.20.199.4:8008/patroni",
"host": "10.20.199.4",
"port": 5432,
"timeline": 51,
"lag": 0
},
{
"name": "srv3",
"role": "replica",
"state": "running",
"api_url": "https://10.20.199.5:8008/patroni",
"host": "10.20.199.5",
"port": 5432,
"timeline": 51,
"lag": 0
}
]
}

View file

@ -0,0 +1,33 @@
{
"members": [
{
"name": "srv1",
"role": "leader",
"state": "running",
"api_url": "https://10.20.199.3:8008/patroni",
"host": "10.20.199.3",
"port": 5432,
"timeline": 51
},
{
"name": "srv2",
"role": "replica",
"state": "running",
"api_url": "https://10.20.199.4:8008/patroni",
"host": "10.20.199.4",
"port": 5432,
"timeline": 51,
"lag": 0
},
{
"name": "srv3",
"role": "replica",
"state": "running",
"api_url": "https://10.20.199.5:8008/patroni",
"host": "10.20.199.5",
"port": 5432,
"timeline": 51,
"lag": 0
}
]
}

View file

@ -0,0 +1,32 @@
{
"members": [
{
"name": "srv1",
"role": "leader",
"state": "running",
"api_url": "https://10.20.199.3:8008/patroni",
"host": "10.20.199.3",
"port": 5432,
"timeline": 51
},
{
"name": "srv2",
"role": "replica",
"state": "start failed",
"api_url": "https://10.20.199.4:8008/patroni",
"host": "10.20.199.4",
"port": 5432,
"lag": "unknown"
},
{
"name": "srv3",
"role": "replica",
"state": "running",
"api_url": "https://10.20.199.5:8008/patroni",
"host": "10.20.199.5",
"port": 5432,
"timeline": 51,
"lag": 0
}
]
}

View file

@ -0,0 +1,13 @@
{
"members": [
{
"name": "srv1",
"role": "leader",
"state": "running",
"api_url": "https://10.20.199.3:8008/patroni",
"host": "10.20.199.3",
"port": 5432,
"timeline": 51
}
]
}

View file

@ -0,0 +1,33 @@
{
"members": [
{
"name": "srv1",
"role": "leader",
"state": "running",
"api_url": "https://10.20.199.3:8008/patroni",
"host": "10.20.199.3",
"port": 5432,
"timeline": 51
},
{
"name": "srv2",
"role": "replica",
"state": "running",
"api_url": "https://10.20.199.4:8008/patroni",
"host": "10.20.199.4",
"port": 5432,
"timeline": 51,
"lag": 0
},
{
"name": "srv3",
"role": "replica",
"state": "running",
"api_url": "https://10.20.199.5:8008/patroni",
"host": "10.20.199.5",
"port": 5432,
"timeline": 51,
"lag": 0
}
]
}

View file

@ -0,0 +1,31 @@
{
"members": [
{
"name": "srv1",
"role": "leader",
"state": "running",
"api_url": "https://10.20.199.3:8008/patroni",
"host": "10.20.199.3",
"port": 5432,
"timeline": 51
},
{
"name": "srv2",
"role": "replica",
"state": "start failed",
"api_url": "https://10.20.199.4:8008/patroni",
"host": "10.20.199.4",
"port": 5432,
"lag": "unknown"
},
{
"name": "srv3",
"role": "replica",
"state": "start failed",
"api_url": "https://10.20.199.5:8008/patroni",
"host": "10.20.199.5",
"port": 5432,
"lag": "unknown"
}
]
}

View file

@ -0,0 +1,23 @@
{
"members": [
{
"name": "srv1",
"role": "leader",
"state": "running",
"api_url": "https://10.20.199.3:8008/patroni",
"host": "10.20.199.3",
"port": 5432,
"timeline": 51
},
{
"name": "srv3",
"role": "replica",
"state": "running",
"api_url": "https://10.20.199.5:8008/patroni",
"host": "10.20.199.5",
"port": 5432,
"timeline": 51,
"lag": 0
}
]
}

View file

@ -0,0 +1,23 @@
{
"members": [
{
"name": "srv1",
"role": "leader",
"state": "running",
"api_url": "https://10.20.199.3:8008/patroni",
"host": "10.20.199.3",
"port": 5432,
"timeline": 51
},
{
"name": "srv2",
"role": "replica",
"state": "running",
"api_url": "https://10.20.199.4:8008/patroni",
"host": "10.20.199.4",
"port": 5432,
"timeline": 51,
"lag": 0
}
]
}

View file

@ -0,0 +1,19 @@
{
"state": "running",
"postmaster_start_time": "2021-08-11 07:57:51.693 UTC",
"role": "replica",
"server_version": 110012,
"cluster_unlocked": false,
"xlog": {
"received_location": 1174407088,
"replayed_location": 1174407088,
"replayed_timestamp": null,
"paused": false
},
"timeline": 58,
"database_system_identifier": "6965971025273547206",
"patroni": {
"version": "2.0.2",
"scope": "patroni-demo"
}
}

View file

@ -0,0 +1,27 @@
{
"state": "running",
"postmaster_start_time": "2021-08-11 07:02:20.732 UTC",
"role": "master",
"server_version": 110012,
"cluster_unlocked": false,
"xlog": {
"location": 1174407088
},
"timeline": 58,
"replication": [
{
"usename": "replicator",
"application_name": "srv1",
"client_addr": "10.20.199.3",
"state": "streaming",
"sync_state": "async",
"sync_priority": 0
}
],
"pending_restart": true,
"database_system_identifier": "6965971025273547206",
"patroni": {
"version": "2.0.2",
"scope": "patroni-demo"
}
}

View file

@ -0,0 +1,26 @@
{
"state": "running",
"postmaster_start_time": "2021-08-11 07:02:20.732 UTC",
"role": "master",
"server_version": 110012,
"cluster_unlocked": false,
"xlog": {
"location": 1174407088
},
"timeline": 58,
"replication": [
{
"usename": "replicator",
"application_name": "srv1",
"client_addr": "10.20.199.3",
"state": "streaming",
"sync_state": "async",
"sync_priority": 0
}
],
"database_system_identifier": "6965971025273547206",
"patroni": {
"version": "2.0.2",
"scope": "patroni-demo"
}
}

View file

@ -0,0 +1,19 @@
{
"state": "running",
"postmaster_start_time": "2021-08-11 07:57:51.693 UTC",
"role": "replica",
"server_version": 110012,
"cluster_unlocked": false,
"xlog": {
"received_location": 1174407088,
"replayed_location": 1174407088,
"replayed_timestamp": null,
"paused": false
},
"timeline": 58,
"database_system_identifier": "6965971025273547206",
"patroni": {
"version": "2.0.2",
"scope": "patroni-demo"
}
}

View file

@ -0,0 +1,26 @@
{
"state": "running",
"postmaster_start_time": "2021-08-11 07:02:20.732 UTC",
"role": "master",
"server_version": 110012,
"cluster_unlocked": false,
"xlog": {
"location": 1174407088
},
"timeline": 58,
"replication": [
{
"usename": "replicator",
"application_name": "srv1",
"client_addr": "10.20.199.3",
"state": "streaming",
"sync_state": "async",
"sync_priority": 0
}
],
"database_system_identifier": "6965971025273547206",
"patroni": {
"version": "2.0.2",
"scope": "patroni-demo"
}
}

View file

@ -0,0 +1,26 @@
{
"state": "running",
"postmaster_start_time": "2021-08-11 07:02:20.732 UTC",
"role": "master",
"server_version": 110012,
"cluster_unlocked": false,
"xlog": {
"location": 1174407088
},
"timeline": 58,
"replication": [
{
"usename": "replicator",
"application_name": "srv1",
"client_addr": "10.20.199.3",
"state": "streaming",
"sync_state": "async",
"sync_priority": 0
}
],
"database_system_identifier": "6965971025273547206",
"patroni": {
"version": "2.0.2",
"scope": "patroni-demo"
}
}

View file

@ -0,0 +1,19 @@
{
"state": "running",
"postmaster_start_time": "2021-08-11 07:57:51.693 UTC",
"role": "replica",
"server_version": 110012,
"cluster_unlocked": false,
"xlog": {
"received_location": 1174407088,
"replayed_location": 1174407088,
"replayed_timestamp": null,
"paused": false
},
"timeline": 58,
"database_system_identifier": "6965971025273547206",
"patroni": {
"version": "2.0.2",
"scope": "patroni-demo"
}
}

View file

@ -0,0 +1,26 @@
{
"state": "running",
"postmaster_start_time": "2021-08-11 07:02:20.732 UTC",
"role": "master",
"server_version": 110012,
"cluster_unlocked": false,
"xlog": {
"location": 1174407088
},
"timeline": 58,
"replication": [
{
"usename": "replicator",
"application_name": "srv1",
"client_addr": "10.20.199.3",
"state": "streaming",
"sync_state": "async",
"sync_priority": 0
}
],
"database_system_identifier": "6965971025273547206",
"patroni": {
"version": "2.0.2",
"scope": "patroni-demo"
}
}

View file

@ -0,0 +1,26 @@
{
"state": "running",
"postmaster_start_time": "2021-08-11 07:02:20.732 UTC",
"role": "master",
"server_version": 110012,
"cluster_unlocked": false,
"xlog": {
"location": 1174407088
},
"timeline": 58,
"replication": [
{
"usename": "replicator",
"application_name": "srv1",
"client_addr": "10.20.199.3",
"state": "streaming",
"sync_state": "async",
"sync_priority": 0
}
],
"database_system_identifier": "6965971025273547206",
"patroni": {
"version": "2.0.2",
"scope": "patroni-demo"
}
}

View file

@ -0,0 +1,103 @@
from click.testing import CliRunner
from pytest_mock import MockerFixture
from check_patroni.cli import main
from tools import my_mock, here
def test_cluster_config_has_changed_params(mocker: MockerFixture) -> None:
runner = CliRunner()
my_mock(mocker, "cluster_config_has_changed", 200)
result = runner.invoke(
main,
[
"-e",
"https://10.20.199.3:8008",
"cluster_config_has_changed",
"--hash",
"640df9f0211c791723f18fc3ed9dbb95",
"--state-file",
str(here / "fake_file_name.state_file"),
],
)
assert result.exit_code == 3
result = runner.invoke(
main, ["-e", "https://10.20.199.3:8008", "cluster_config_has_changed"]
)
assert result.exit_code == 3
def test_cluster_config_has_changed_ok_with_hash(mocker: MockerFixture) -> None:
runner = CliRunner()
my_mock(mocker, "cluster_config_has_changed", 200)
result = runner.invoke(
main,
[
"-e",
"https://10.20.199.3:8008",
"cluster_config_has_changed",
"--hash",
"640df9f0211c791723f18fc3ed9dbb95",
],
)
assert result.exit_code == 0
def test_cluster_config_has_changed_ok_with_state_file(mocker: MockerFixture) -> None:
runner = CliRunner()
with open(here / "cluster_config_has_changed.state_file", "w") as f:
f.write('{"hash": "640df9f0211c791723f18fc3ed9dbb95"}')
my_mock(mocker, "cluster_config_has_changed", 200)
result = runner.invoke(
main,
[
"-e",
"https://10.20.199.3:8008",
"cluster_config_has_changed",
"--state-file",
str(here / "cluster_config_has_changed.state_file"),
],
)
assert result.exit_code == 0
def test_cluster_config_has_changed_ko_with_hash(mocker: MockerFixture) -> None:
runner = CliRunner()
my_mock(mocker, "cluster_config_has_changed", 200)
result = runner.invoke(
main,
[
"-e",
"https://10.20.199.3:8008",
"cluster_config_has_changed",
"--hash",
"640df9f0211c791723f18fc3edffffff",
],
)
assert result.exit_code == 2
def test_cluster_config_has_changed_ko_with_state_file(mocker: MockerFixture) -> None:
runner = CliRunner()
with open(here / "cluster_config_has_changed.state_file", "w") as f:
f.write('{"hash": "640df9f0211c791723f18fc3edffffff"}')
my_mock(mocker, "cluster_config_has_changed", 200)
result = runner.invoke(
main,
[
"-e",
"https://10.20.199.3:8008",
"cluster_config_has_changed",
"--state-file",
str(here / "cluster_config_has_changed.state_file"),
],
)
assert result.exit_code == 2

View file

@ -0,0 +1,29 @@
from click.testing import CliRunner
from pytest_mock import MockerFixture
from check_patroni.cli import main
from tools import my_mock
def test_cluster_has_leader_ok(mocker: MockerFixture) -> None:
runner = CliRunner()
my_mock(mocker, "cluster_has_leader_ok", 200)
result = runner.invoke(
main, ["-e", "https://10.20.199.3:8008", "cluster_has_leader"]
)
assert result.exit_code == 0
# FIXME Not captured ???
# assert "CLUSTERHASLEADER OK - has_leader is 1 | has_leader=1;;@0" in result.output
def test_cluster_has_leader_ko(mocker: MockerFixture) -> None:
runner = CliRunner()
my_mock(mocker, "cluster_has_leader_ko", 200)
result = runner.invoke(
main, ["-e", "https://10.20.199.3:8008", "cluster_has_leader"]
)
assert result.exit_code == 2
# assert "CLUSTERHASLEADER CRITICAL - has_leader is 0 (outside range @0:0) | has_leader=0;;@0" in result.output

View file

@ -0,0 +1,36 @@
from click.testing import CliRunner
from pytest_mock import MockerFixture
from check_patroni.cli import main
from tools import my_mock
# TODO Lag threshold tests
def test_cluster_has_replica_ok(mocker: MockerFixture) -> None:
runner = CliRunner()
my_mock(mocker, "cluster_has_replica_ok", 200)
result = runner.invoke(
main, ["-e", "https://10.20.199.3:8008", "cluster_has_replica"]
)
assert result.exit_code == 0
def test_cluster_has_replica_ko_with_count_thresholds(mocker: MockerFixture) -> None:
runner = CliRunner()
my_mock(mocker, "cluster_has_replica_ko", 200)
result = runner.invoke(
main,
[
"-e",
"https://10.20.199.3:8008",
"cluster_has_replica",
"--warninng",
"@2",
"--critical",
"@0:1",
],
)
assert result.exit_code == 2

View file

@ -0,0 +1,115 @@
from click.testing import CliRunner
from pytest_mock import MockerFixture
from check_patroni.cli import main
from tools import my_mock
def test_cluster_node_count_ok(mocker: MockerFixture) -> None:
runner = CliRunner()
my_mock(mocker, "cluster_node_count_ok", 200)
result = runner.invoke(
main, ["-e", "https://10.20.199.3:8008", "cluster_node_count"]
)
assert result.exit_code == 0
def test_cluster_node_count_ok_with_thresholds(mocker: MockerFixture) -> None:
runner = CliRunner()
my_mock(mocker, "cluster_node_count_ok", 200)
result = runner.invoke(
main,
[
"-e",
"https://10.20.199.3:8008",
"cluster_node_count",
"--warning",
"@0:1",
"--critical",
"@2",
"--running-warning",
"@2",
"--running-critical",
"@0:1",
],
)
assert result.exit_code == 0
def test_cluster_node_count_running_warning(mocker: MockerFixture) -> None:
runner = CliRunner()
my_mock(mocker, "cluster_node_count_running_warning", 200)
result = runner.invoke(
main,
[
"-e",
"https://10.20.199.3:8008",
"cluster_node_count",
"--running-warning",
"@2",
"--running-critical",
"@0:1",
],
)
assert result.exit_code == 1
def test_cluster_node_count_running_critical(mocker: MockerFixture) -> None:
runner = CliRunner()
my_mock(mocker, "cluster_node_count_running_critical", 200)
result = runner.invoke(
main,
[
"-e",
"https://10.20.199.3:8008",
"cluster_node_count",
"--running-warning",
"@2",
"--running-critical",
"@0:1",
],
)
assert result.exit_code == 2
def test_cluster_node_count_warning(mocker: MockerFixture) -> None:
runner = CliRunner()
my_mock(mocker, "cluster_node_count_warning", 200)
result = runner.invoke(
main,
[
"-e",
"https://10.20.199.3:8008",
"cluster_node_count",
"--warning",
"@2",
"--critical",
"@0:1",
],
)
assert result.exit_code == 1
def test_cluster_node_count_critical(mocker: MockerFixture) -> None:
runner = CliRunner()
my_mock(mocker, "cluster_node_count_critical", 200)
result = runner.invoke(
main,
[
"-e",
"https://10.20.199.3:8008",
"cluster_node_count",
"--warning",
"@2",
"--critical",
"@0:1",
],
)
assert result.exit_code == 2

View file

@ -0,0 +1,22 @@
from click.testing import CliRunner
from pytest_mock import MockerFixture
from check_patroni.cli import main
from tools import my_mock
def test_node_is_alive_ok(mocker: MockerFixture) -> None:
runner = CliRunner()
my_mock(mocker, "node_is_alive", 200)
result = runner.invoke(main, ["-e", "https://10.20.199.3:8008", "node_is_alive"])
assert result.exit_code == 0
def test_node_is_alive_ko(mocker: MockerFixture) -> None:
runner = CliRunner()
my_mock(mocker, "node_is_alive", 404)
result = runner.invoke(main, ["-e", "https://10.20.199.3:8008", "node_is_alive"])
assert result.exit_code == 2

View file

@ -0,0 +1,26 @@
from click.testing import CliRunner
from pytest_mock import MockerFixture
from check_patroni.cli import main
from tools import my_mock
def test_node_is_pending_restart_ok(mocker: MockerFixture) -> None:
runner = CliRunner()
my_mock(mocker, "node_is_pending_restart_ok", 200)
result = runner.invoke(
main, ["-e", "https://10.20.199.3:8008", "node_is_pending_restart"]
)
assert result.exit_code == 0
def test_node_is_pending_restart_ko(mocker: MockerFixture) -> None:
runner = CliRunner()
my_mock(mocker, "node_is_pending_restart_ko", 404)
result = runner.invoke(
main, ["-e", "https://10.20.199.3:8008", "node_is_pending_restart"]
)
assert result.exit_code == 2

View file

@ -0,0 +1,22 @@
from click.testing import CliRunner
from pytest_mock import MockerFixture
from check_patroni.cli import main
from tools import my_mock
def test_node_is_primary_ok(mocker: MockerFixture) -> None:
runner = CliRunner()
my_mock(mocker, "node_is_primary_ok", 200)
result = runner.invoke(main, ["-e", "https://10.20.199.3:8008", "node_is_primary"])
assert result.exit_code == 0
def test_node_is_primary_ko(mocker: MockerFixture) -> None:
runner = CliRunner()
my_mock(mocker, "node_is_primary_ko", 404)
result = runner.invoke(main, ["-e", "https://10.20.199.3:8008", "node_is_primary"])
assert result.exit_code == 2

View file

@ -0,0 +1,33 @@
from click.testing import CliRunner
from pytest_mock import MockerFixture
from check_patroni.cli import main
from tools import my_mock
def test_node_is_replica_ok(mocker: MockerFixture) -> None:
runner = CliRunner()
my_mock(mocker, "node_is_replica_ok", 200)
result = runner.invoke(main, ["-e", "https://10.20.199.3:8008", "node_is_replica"])
assert result.exit_code == 0
def test_node_is_replica_ko(mocker: MockerFixture) -> None:
runner = CliRunner()
my_mock(mocker, "node_is_replica_ko", 404)
result = runner.invoke(main, ["-e", "https://10.20.199.3:8008", "node_is_replica"])
assert result.exit_code == 2
def test_node_is_replica_ko_lag(mocker: MockerFixture) -> None:
runner = CliRunner()
# We don't do the check ourselves, patroni does it and changes the return code
my_mock(mocker, "node_is_replica_ok", 404)
result = runner.invoke(
main, ["-e", "https://10.20.199.3:8008", "node_is_replica", "--lag", "100"]
)
assert result.exit_code == 2

View file

@ -0,0 +1,40 @@
from click.testing import CliRunner
from pytest_mock import MockerFixture
from check_patroni.cli import main
from tools import my_mock
def test_node_patroni_version_ok(mocker: MockerFixture) -> None:
runner = CliRunner()
my_mock(mocker, "node_patroni_version", 200)
result = runner.invoke(
main,
[
"-e",
"https://10.20.199.3:8008",
"node_patroni_version",
"--patroni-version",
"2.0.2",
],
)
assert result.exit_code == 0
def test_node_patroni_version_ko(mocker: MockerFixture) -> None:
runner = CliRunner()
my_mock(mocker, "node_patroni_version", 200)
result = runner.invoke(
main,
[
"-e",
"https://10.20.199.3:8008",
"node_patroni_version",
"--patroni-version",
"1.0.0",
],
)
assert result.exit_code == 2

View file

@ -0,0 +1,104 @@
from click.testing import CliRunner
from pytest_mock import MockerFixture
from check_patroni.cli import main
from tools import my_mock, here
def test_node_tl_has_changed_params(mocker: MockerFixture) -> None:
runner = CliRunner()
my_mock(mocker, "node_tl_has_changed", 200)
result = runner.invoke(
main,
[
"-e",
"https://10.20.199.3:8008",
"node_tl_has_changed",
"--timeline",
"58",
"--state-file",
str(here / "fake_file_name.state_file"),
],
)
assert result.exit_code == 3
result = runner.invoke(
main, ["-e", "https://10.20.199.3:8008", "node_tl_has_changed"]
)
assert result.exit_code == 3
def test_node_tl_has_changed_ok_with_timeline(mocker: MockerFixture) -> None:
runner = CliRunner()
my_mock(mocker, "node_tl_has_changed", 200)
result = runner.invoke(
main,
[
"-e",
"https://10.20.199.3:8008",
"node_tl_has_changed",
"--timeline",
"58",
],
)
assert result.exit_code == 0
def test_node_tl_has_changed_ok_with_state_file(mocker: MockerFixture) -> None:
runner = CliRunner()
with open(here / "node_tl_has_changed.state_file", "w") as f:
f.write('{"timeline": 58}')
my_mock(mocker, "node_tl_has_changed", 200)
result = runner.invoke(
main,
[
"-e",
"https://10.20.199.3:8008",
"node_tl_has_changed",
"--state-file",
str(here / "node_tl_has_changed.state_file"),
],
)
assert result.exit_code == 0
def test_node_tl_has_changed_ko_with_timeline(mocker: MockerFixture) -> None:
runner = CliRunner()
my_mock(mocker, "node_tl_has_changed", 200)
result = runner.invoke(
main,
[
"-e",
"https://10.20.199.3:8008",
"node_tl_has_changed",
"--timeline",
"700",
],
)
assert result.exit_code == 2
def test_node_tl_has_changed_ko_with_state_file(mocker: MockerFixture) -> None:
runner = CliRunner()
with open(here / "node_tl_has_changed.state_file", "w") as f:
f.write('{"timeline": 700}')
my_mock(mocker, "node_tl_has_changed", 200)
result = runner.invoke(
main,
[
"-e",
"https://10.20.199.3:8008",
"node_tl_has_changed",
"--state-file",
str(here / "node_tl_has_changed.state_file"),
],
)
assert result.exit_code == 2

26
test/tools.py Normal file
View file

@ -0,0 +1,26 @@
import attr
import pathlib
from pytest_mock import MockerFixture
from check_patroni.types import PatroniResource
here = pathlib.Path(__file__).parent
def getjson(name: str) -> bytes:
path = here / "json" / f"{name}.json"
with path.open() as f:
return f.read().encode("utf-8")
@attr.s(auto_attribs=True, frozen=True, slots=True)
class MockApiReturnCode:
data: bytes
status: int
def my_mock(mocker: MockerFixture, json_file: str, status: int) -> None:
def mock_rest_api(self: PatroniResource, service: str) -> MockApiReturnCode:
return MockApiReturnCode(getjson(json_file), status)
mocker.patch("check_patroni.types.PatroniResource.rest_api", mock_rest_api)