check-patroni/check_patroni/types.py
benoit 8d6b8502b6 cluster_has_replica: fix the way a healthy replica is detected
For patroni >= version 3.0.4:
* the role is `replica` or `sync_standby`
* the state is `streaming` or `in archive recovery`
* the timeline is the same as the leader
* the lag is lower or equal to `max_lag`

For prior versions of patroni:
* the role is `replica` or `sync_standby`
* the state is `running`
* the timeline is the same as the leader
* the lag is lower or equal to `max_lag`

Additionally, we now display the timeline in the perfstats. We also try
to display the perf stats of unhealthy replicas as much as possible.

Update tests for cluster_has_replica:
* Fix the tests to make them work with the new algorithm
* Add a specific test for tl divergences
2023-11-11 10:50:35 +01:00

115 lines
3.9 KiB
Python

import json
from functools import lru_cache
from typing import Any, Callable, List, Optional, Tuple, Union
from urllib.parse import urlparse
import attr
import nagiosplugin
import requests
from . import _log
class APIError(requests.exceptions.RequestException):
    """Error raised when a REST API call does not succeed.

    It signals that the rest api couldn't be reached and that the HTTP
    status code we got back was different from 200.
    """
@attr.s(auto_attribs=True, frozen=True, slots=True)
class ConnectionInfo:
    """Settings describing how to reach the REST API.

    Instances are frozen: attributes cannot be rebound after creation.
    """

    # Base URLs of the API, tried in order by ``PatroniResource.rest_api``.
    # A factory is used so that every instance gets its own list: a plain
    # list literal would be shared (and mutable in place) across all the
    # instances created without an explicit value.
    endpoints: List[str] = attr.Factory(lambda: ["http://127.0.0.1:8008"])
    # Client certificate handed to ``requests`` for https endpoints: either a
    # single file holding both key and cert, or a pair of files.
    cert: Optional[Union[str, Tuple[str, str]]] = None
    # CA certificate passed as ``verify`` to ``requests`` for https endpoints.
    ca_cert: Optional[str] = None
@attr.s(auto_attribs=True, frozen=True, slots=True)
class Parameters:
    """Immutable bundle of the settings shared by the checks."""

    # How to reach the REST API (endpoints and TLS material).
    connection_info: ConnectionInfo
    # Timeout value -- presumably in seconds; confirm against the caller.
    timeout: int
    # Verbosity level -- presumably a count of -v flags; confirm against the caller.
    verbose: int
@attr.s(auto_attribs=True, eq=False, slots=True)
class PatroniResource(nagiosplugin.Resource):
    """Base resource giving access to the REST API described by ``conn_info``."""

    conn_info: ConnectionInfo

    def rest_api(self, service: str) -> Any:
        """Try to connect to all the provided endpoints for the requested service.

        Endpoints are tried in the configured order. An endpoint for which the
        HTTP request itself fails is logged at debug level and skipped; the
        first endpoint that answers decides the outcome.

        Args:
            service: path appended to each endpoint (``{endpoint}/{service}``).

        Returns:
            The decoded JSON body, or ``None`` when the body is not valid JSON.

        Raises:
            APIError: an endpoint answered with a status code other than 200.
            nagiosplugin.CheckError: none of the endpoints could be reached.
        """
        for endpoint in self.conn_info.endpoints:
            cert: Optional[Union[Tuple[str, str], str]] = None
            verify: Optional[Union[str, bool]] = None
            # TLS material is only forwarded for https endpoints.
            if urlparse(endpoint).scheme == "https":
                if self.conn_info.cert is not None:
                    # we can have: a key + a cert or a single file with key and cert.
                    cert = self.conn_info.cert
                if self.conn_info.ca_cert is not None:
                    verify = self.conn_info.ca_cert

            _log.debug(
                "Trying to connect to %(endpoint)s/%(service)s with cert: %(cert)s verify: %(verify)s",
                {
                    "endpoint": endpoint,
                    "service": service,
                    "cert": cert,
                    "verify": verify,
                },
            )

            try:
                r = requests.get(f"{endpoint}/{service}", verify=verify, cert=cert)
            except Exception as e:
                # Deliberately broad: any failure to talk to this endpoint
                # simply means the next one is tried.
                _log.debug(e)
                continue

            # The status code is already displayed by urllib3
            _log.debug(
                "api call data: %(data)s", {"data": r.text if r.text else "<Empty>"}
            )

            if r.status_code != 200:
                raise APIError(
                    f"Failed to connect to {endpoint}/{service} status code {r.status_code}"
                )

            try:
                return r.json()
            except (json.JSONDecodeError, ValueError):
                return None
        raise nagiosplugin.CheckError("Connection failed for all provided endpoints")

    @staticmethod
    def _version_tuple(version: str) -> Tuple[int, ...]:
        """Turn a dotted version string into a tuple of at most three integers.

        Only the leading ASCII digits of each component are used, so suffixes
        such as ``rc1`` or a fourth ``.post1`` component do not raise. Parsing
        stops at the first component that does not start with a digit.
        """
        numbers: List[int] = []
        for component in version.strip().split(".")[:3]:
            digits = ""
            for char in component:
                if char not in "0123456789":
                    break
                digits += char
            if not digits:
                break
            numbers.append(int(digits))
        return tuple(numbers)

    # NOTE: lru_cache keys on ``self``; the cache therefore keeps a reference
    # to every instance it has been called on for the life of the process.
    @lru_cache(maxsize=None)
    def has_detailed_states(self) -> bool:
        """Tell whether the version reported by the API is at least 3.0.4.

        The answer is cached per instance, so the ``patroni`` service is only
        queried once for a given resource.
        """
        # get patroni's version to find out if the "streaming" and "in archive recovery" states are available
        patroni_item_dict = self.rest_api("patroni")
        version = patroni_item_dict["patroni"]["version"]

        if self._version_tuple(version) >= (3, 0, 4):
            _log.debug(
                "Patroni's version is %(version)s, more detailed states can be used to check for the health of replicas.",
                {"version": version},
            )
            return True

        _log.debug(
            "Patroni's version is %(version)s, the running state and the timelines must be used to check for the health of replicas.",
            {"version": version},
        )
        return False
# Signature of the summary callables that ``handle_unknown`` accepts and returns.
HandleUnknown = Callable[[nagiosplugin.Summary, nagiosplugin.Results], Any]
def handle_unknown(func: HandleUnknown) -> HandleUnknown:
    """Decorator to handle the unknown state in Summary.problem.

    When the most significant result is in the unknown state (state code 3),
    the wrapper returns that result's hint instead of calling ``func``.
    In every other case, including when there is no significant result at
    all, the call is forwarded to ``func`` unchanged.

    Args:
        func: the summary callable to wrap.

    Returns:
        The wrapping callable, carrying the name and docstring of ``func``.
    """
    # Local import: the module only imports ``lru_cache`` from functools.
    import functools

    @functools.wraps(func)
    def wrapper(summary: nagiosplugin.Summary, results: nagiosplugin.Results) -> Any:
        significant = results.most_significant
        # get the appropriate message for all unknown error
        if significant and significant[0].state.code == 3:
            return significant[0].hint
        return func(summary, results)

    return wrapper