diff --git a/README.md b/README.md index 5f9104d..ca015c5 100644 --- a/README.md +++ b/README.md @@ -101,6 +101,33 @@ For example, the following command will raise: ``` check_patroni -e https://10.20.199.3:8008 cluster_has_replica --warning 2: --critical 1: ``` +## SSL + +Several options are available: + +* you have a self-signed certificate: + * `--ca_cert`: your certification chain `cat CA-certificate server-certificate > cabundle` +* you have a valid root certificate: + * `--cert_file`: your certificate or the concatenation of your certificate and private key + * `--key_file`: your private key (optional) + * `--ca_cert`: if your CA certificate is not installed on the server you can provide it here (optional) +* unsafe access: do not provide any info, you will get a warning as described below. + +If your configuration is unsafe you might get a warning message such as: + +``` +$ check_patroni -e https://p1:8008 cluster_node_count +/home/vagrant/.local/lib/python3.9/site-packages/urllib3/connectionpool.py:1045: InsecureRequestWarning: Unverified HTTPS request is being made to host 'p1'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings + warnings.warn( +CLUSTERNODECOUNT OK - members is 2 | members=2 role_leader=1 role_replica=1 state_running=2 +``` + +After checking on the message, you can choose to ignore it by redirecting the +standard error to /dev/null: +``` +$ check_patroni -e https://p1:8008 cluster_node_count 2>/dev/null +CLUSTERNODECOUNT OK - members is 2 | members=2 role_leader=1 role_replica=1 state_running=2 +``` ## Cluster services @@ -269,7 +296,7 @@ Usage: check_patroni node_is_pending_restart [OPTIONS] a restart of PostgreSQL to take effect. Check: - * `OK`: if the node has pending restart tag. + * `OK`: if the node has no pending restart tag. 
* `CRITICAL`: otherwise Perfdata: `is_pending_restart` is 1 if the node has pending restart tag, 0 diff --git a/check_patroni/cli.py b/check_patroni/cli.py index 1ab53b5..748b427 100644 --- a/check_patroni/cli.py +++ b/check_patroni/cli.py @@ -1,3 +1,4 @@ +import logging import re from configparser import ConfigParser @@ -32,6 +33,9 @@ from .node import ( from .types import ConnectionInfo, Parameters from .convert import size_to_byte +_log = logging.getLogger("nagiosplugin") +DEFAULT_CFG = "config.ini" + def print_version(ctx: click.Context, param: str, value: str) -> None: if not value or ctx.resilient_parsing: @@ -40,9 +44,6 @@ def print_version(ctx: click.Context, param: str, value: str) -> None: ctx.exit() -DEFAULT_CFG = "config.ini" - - def configure(ctx: click.Context, param: str, filename: str) -> None: """Use a config file for the parameters stolen from https://jwodder.github.io/kbits/posts/click-config/ diff --git a/check_patroni/cluster.py b/check_patroni/cluster.py index 94412cc..73a9855 100644 --- a/check_patroni/cluster.py +++ b/check_patroni/cluster.py @@ -17,11 +17,7 @@ def replace_chars(text: str) -> str: class ClusterNodeCount(PatroniResource): def probe(self: "ClusterNodeCount") -> Iterable[nagiosplugin.Metric]: - r = self.rest_api("cluster") - _log.debug(f"api call status: {r.status}") - _log.debug(f"api call data: {r.data}") - - item_dict = json.loads(r.data) + item_dict = self.rest_api("cluster") role_counters: Counter[str] = Counter() roles = [] status_counters: Counter[str] = Counter() @@ -53,11 +49,8 @@ class ClusterNodeCount(PatroniResource): class ClusterHasLeader(PatroniResource): def probe(self: "ClusterHasLeader") -> Iterable[nagiosplugin.Metric]: - r = self.rest_api("cluster") - _log.debug(f"api call status: {r.status}") - _log.debug(f"api call data: {r.data}") + item_dict = self.rest_api("cluster") - item_dict = json.loads(r.data) is_leader_found = False for member in item_dict["members"]: if member["role"] == "leader" and 
member["state"] == "running": @@ -91,11 +84,8 @@ class ClusterHasReplica(PatroniResource): self.max_lag = max_lag def probe(self: "ClusterHasReplica") -> Iterable[nagiosplugin.Metric]: - r = self.rest_api("cluster") - _log.debug(f"api call status: {r.status}") - _log.debug(f"api call data: {r.data}") + item_dict = self.rest_api("cluster") - item_dict = json.loads(r.data) replicas = [] healthy_replica = 0 unhealthy_replica = 0 @@ -140,11 +130,9 @@ class ClusterConfigHasChanged(PatroniResource): self.save = save def probe(self: "ClusterConfigHasChanged") -> Iterable[nagiosplugin.Metric]: - r = self.rest_api("config") - _log.debug(f"api call status: {r.status}") - _log.debug(f"api call data: {r.data}") + item_dict = self.rest_api("config") - new_hash = hashlib.md5(r.data).hexdigest() + new_hash = hashlib.md5(json.dumps(item_dict).encode()).hexdigest() _log.debug(f"save result: {self.save}") old_hash = self.config_hash @@ -184,11 +172,7 @@ class ClusterConfigHasChangedSummary(nagiosplugin.Summary): class ClusterIsInMaintenance(PatroniResource): def probe(self: "ClusterIsInMaintenance") -> Iterable[nagiosplugin.Metric]: - r = self.rest_api("cluster") - _log.debug(f"api call status: {r.status}") - _log.debug(f"api call data: {r.data}") - - item_dict = json.loads(r.data) + item_dict = self.rest_api("cluster") # The actual check return [ diff --git a/check_patroni/node.py b/check_patroni/node.py index ae12569..8fa840a 100644 --- a/check_patroni/node.py +++ b/check_patroni/node.py @@ -1,22 +1,20 @@ -import json import logging import nagiosplugin from typing import Iterable -from .types import ConnectionInfo, handle_unknown, PatroniResource - +from .types import APIError, ConnectionInfo, handle_unknown, PatroniResource _log = logging.getLogger("nagiosplugin") class NodeIsPrimary(PatroniResource): def probe(self: "NodeIsPrimary") -> Iterable[nagiosplugin.Metric]: - r = self.rest_api("primary") - _log.debug(f"api call status: {r.status}") - _log.debug(f"api call data: 
{r.data}") - - return [nagiosplugin.Metric("is_primary", 1 if r.status == 200 else 0)] + try: + self.rest_api("primary") + except APIError: + return [nagiosplugin.Metric("is_primary", 0)] + return [nagiosplugin.Metric("is_primary", 1)] class NodeIsPrimarySummary(nagiosplugin.Summary): @@ -36,14 +34,14 @@ class NodeIsReplica(PatroniResource): self.max_lag = max_lag def probe(self: "NodeIsReplica") -> Iterable[nagiosplugin.Metric]: - if self.max_lag is None: - r = self.rest_api("replica") - else: - r = self.rest_api(f"replica?lag={self.max_lag}") - _log.debug(f"api call status: {r.status}") - _log.debug(f"api call data: {r.data}") - - return [nagiosplugin.Metric("is_replica", 1 if r.status == 200 else 0)] + try: + if self.max_lag is None: + self.rest_api("replica") + else: + self.rest_api(f"replica?lag={self.max_lag}") + except APIError: + return [nagiosplugin.Metric("is_replica", 0)] + return [nagiosplugin.Metric("is_replica", 1)] class NodeIsReplicaSummary(nagiosplugin.Summary): @@ -64,11 +62,8 @@ class NodeIsReplicaSummary(nagiosplugin.Summary): class NodeIsPendingRestart(PatroniResource): def probe(self: "NodeIsPendingRestart") -> Iterable[nagiosplugin.Metric]: - r = self.rest_api("patroni") - _log.debug(f"api call status: {r.status}") - _log.debug(f"api call data: {r.data}") + item_dict = self.rest_api("patroni") - item_dict = json.loads(r.data) is_pending_restart = item_dict.get("pending_restart", False) return [ nagiosplugin.Metric( @@ -103,11 +98,7 @@ class NodeTLHasChanged(PatroniResource): self.save = save def probe(self: "NodeTLHasChanged") -> Iterable[nagiosplugin.Metric]: - r = self.rest_api("patroni") - _log.debug(f"api call status: {r.status}") - _log.debug(f"api call data: {r.data}") - - item_dict = json.loads(r.data) + item_dict = self.rest_api("patroni") new_tl = item_dict["timeline"] _log.debug(f"save result: {self.save}") @@ -154,12 +145,8 @@ class NodePatroniVersion(PatroniResource): self.patroni_version = patroni_version def probe(self: 
"NodePatroniVersion") -> Iterable[nagiosplugin.Metric]: - r = self.rest_api("patroni") + item_dict = self.rest_api("patroni") - _log.debug(f"api call status: {r.status}") - _log.debug(f"api call data: {r.data}") - - item_dict = json.loads(r.data) version = item_dict["patroni"]["version"] _log.debug( f"Version data: patroni version {version} input version {self.patroni_version}" @@ -190,11 +177,11 @@ class NodePatroniVersionSummary(nagiosplugin.Summary): class NodeIsAlive(PatroniResource): def probe(self: "NodeIsAlive") -> Iterable[nagiosplugin.Metric]: - r = self.rest_api("liveness") - _log.debug(f"api call status: {r.status}") - _log.debug(f"api call data: {r.data}") - - return [nagiosplugin.Metric("is_alive", 1 if r.status == 200 else 0)] + try: + self.rest_api("liveness") + except APIError: + return [nagiosplugin.Metric("is_alive", 0)] + return [nagiosplugin.Metric("is_alive", 1)] class NodeIsAliveSummary(nagiosplugin.Summary): diff --git a/check_patroni/types.py b/check_patroni/types.py index f54f36e..09d7bbe 100644 --- a/check_patroni/types.py +++ b/check_patroni/types.py @@ -1,19 +1,26 @@ import logging -import urllib3 import attr import nagiosplugin -from typing import Any, Callable, List +import requests +import urllib3 +from typing import Any, Callable, List, Optional, Tuple, Union _log = logging.getLogger("nagiosplugin") +class APIError(requests.exceptions.RequestException): + """This exception is raised when the rest api couldn't + be reached and we got a http status code different from 200. 
+ """ + + @attr.s(auto_attribs=True, frozen=True, slots=True) class ConnectionInfo: endpoints: List[str] = ["http://127.0.0.1:8008"] - cert_file: str = "./ssl/benoit-dalibo-cert.pem" - key_file: str = "./ssl/benoit-dalibo-key.pem" - ca_cert: str = "./ssl/CA-cert.pem" + cert_file: Optional[str] = None + key_file: Optional[str] = None + ca_cert: Optional[str] = None @attr.s(auto_attribs=True, frozen=True, slots=True) @@ -27,27 +34,52 @@ class Parameters: class PatroniResource(nagiosplugin.Resource): conn_info: ConnectionInfo - def rest_api( - self: "PatroniResource", service: str - ) -> urllib3.response.HTTPResponse: + def rest_api(self: "PatroniResource", service: str) -> Any: """Try to connect to all the provided endpoints for the requested service""" for endpoint in self.conn_info.endpoints: try: + cert: Optional[Union[Tuple[str, str], str]] = None + verify: Optional[Union[str, bool]] = None if endpoint[:5] == "https": - pool = urllib3.PoolManager( - cert_reqs="CERT_REQUIRED", - cert_file=self.conn_info.cert_file, - key_file=self.conn_info.key_file, - ca_certs=self.conn_info.ca_cert, - ) - else: - pool = urllib3.PoolManager() + if ( + self.conn_info.cert_file is not None + and self.conn_info.key_file is not None # noqa W503 + ): + # we provide a certificate and a private key + cert = (self.conn_info.cert_file, self.conn_info.key_file) + elif ( + self.conn_info.cert_file is not None + and self.conn_info.key_file is None # noqa W503 + ): + # we provide a pem file with the private key and the certificate + cert = self.conn_info.cert_file - _log.debug(f"Trying to connect to {endpoint}/{service}") - return pool.request( - "GET", - f"{endpoint}/{service}", + if self.conn_info.ca_cert is not None: + # if cert is not None: this is the CA certificate + # otherwise this is a ca bundle with root certificate + # then some optional intermediate certificate and finally + # the server certificate to validate the certification chain + verify = self.conn_info.ca_cert + else: + if 
cert is None: + # if cert is None we want to bypass https verification, + # this is insecure and should be avoided for production use + verify = False + + _log.debug( + f"Trying to connect to {endpoint}/{service} with cert: {cert} verify: {verify}" ) + + r = requests.get(f"{endpoint}/{service}", verify=verify, cert=cert) + _log.debug(f"api call status: {r.status_code}") + _log.debug(f"api call data: {r.text}") + + if r.status_code != 200: + raise APIError( + f"Failed to connect to {endpoint}/{service} status code {r.status_code}" + ) + + return r.json() except nagiosplugin.Timeout as e: raise e except Exception as e: diff --git a/docs/make_readme.sh b/docs/make_readme.sh index cd1d393..5d12118 100755 --- a/docs/make_readme.sh +++ b/docs/make_readme.sh @@ -85,6 +85,33 @@ For example, the following command will raise: ``` check_patroni -e https://10.20.199.3:8008 cluster_has_replica --warning 2: --critical 1: ``` +## SSL + +Several options are available: + +* you have a self-signed certificate: + * `--ca_cert`: your certification chain `cat CA-certificate server-certificate > cabundle` +* you have a valid root certificate: + * `--cert_file`: your certificate or the concatenation of your certificate and private key + * `--key_file`: your private key (optional) + * `--ca_cert`: if your CA certificate is not installed on the server you can provide it here (optional) +* unsafe access: do not provide any info, you will get a warning as described below. + +If your configuration is unsafe you might get a warning message such as: + +``` +$ check_patroni -e https://p1:8008 cluster_node_count +/home/vagrant/.local/lib/python3.9/site-packages/urllib3/connectionpool.py:1045: InsecureRequestWarning: Unverified HTTPS request is being made to host 'p1'. Adding certificate verification is strongly advised. 
See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings + warnings.warn( +CLUSTERNODECOUNT OK - members is 2 | members=2 role_leader=1 role_replica=1 state_running=2 +``` + +After checking on the message, you can choose to ignore it by redirecting the +standard error to /dev/null: +``` +$ check_patroni -e https://p1:8008 cluster_node_count 2>/dev/null +CLUSTERNODECOUNT OK - members is 2 | members=2 role_leader=1 role_replica=1 state_running=2 +``` _EOF_ readme readme "## Cluster services" diff --git a/mypy.ini b/mypy.ini index 233fe5f..e72d338 100644 --- a/mypy.ini +++ b/mypy.ini @@ -6,9 +6,6 @@ exclude = build/ [mypy-setup] ignore_errors = True -[mypy-urllib3.*] -ignore_missing_imports = true - [mypy-nagiosplugin.*] ignore_missing_imports = true diff --git a/requirements-dev.txt b/requirements-dev.txt index 19d85e8..12ba4f0 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -4,6 +4,7 @@ flake8 mypy==0.961 pytest pytest-mock +types-requests setuptools tox twine diff --git a/setup.py b/setup.py index 60ed932..47704fc 100644 --- a/setup.py +++ b/setup.py @@ -39,7 +39,7 @@ setup( python_requires=">=3.6", install_requires=[ "attrs >= 17, !=21.1", - "urllib3 >= 1.26.6", + "requests", "nagiosplugin >= 1.3.2", "click >= 8.0.1", ], diff --git a/tox.ini b/tox.ini index d8d626c..321abdb 100644 --- a/tox.ini +++ b/tox.ini @@ -24,7 +24,8 @@ commands = deps = mypy == 0.961 commands = - mypy {toxinidir}/check_patroni + # we need to install types-requests + mypy --install-types --non-interactive {toxinidir}/check_patroni [testenv:build] deps =