diff --git a/debian/changelog b/debian/changelog index b556377..34a4112 100644 --- a/debian/changelog +++ b/debian/changelog @@ -3,6 +3,10 @@ patroni (2.1.1-2) UNRELEASED; urgency=medium [ Christoph Berg ] * debian/tests/control: Give Test-Commands meaningful names. + [ Michael Banck ] + * debian/patches/dcs-last-seen.patch: New patch, taken from upstream commit + 2f31e88b. + -- Debian PostgreSQL Maintainers Fri, 20 Aug 2021 10:54:10 +0200 patroni (2.1.1-1) unstable; urgency=medium diff --git a/debian/patches/dcs-last-seen.patch b/debian/patches/dcs-last-seen.patch new file mode 100644 index 0000000..1274740 --- /dev/null +++ b/debian/patches/dcs-last-seen.patch @@ -0,0 +1,154 @@ +From 2f31e88bdc3f933f0c3fffdc6ea67a99a7c378cc Mon Sep 17 00:00:00 2001 +From: Michael Banck +Date: Wed, 22 Sep 2021 10:01:35 +0200 +Subject: [PATCH] Add dcs_last_seen field to API (#2051) + +This field notes the last time (as unix epoch) a cluster member has successfully communicated with the DCS. This is useful to identify and/or analyze network partitions. + +Also, expose dcs_last_seen in the MemberStatus class and its from_api_response() method. 
+--- + patroni/api.py | 6 ++++++ + patroni/dcs/__init__.py | 7 +++++++ + patroni/ha.py | 12 ++++++++---- + tests/test_api.py | 3 ++- + tests/test_ha.py | 7 ++++--- + 5 files changed, 27 insertions(+), 8 deletions(-) + +diff --git a/patroni/api.py b/patroni/api.py +index eeffaf2cf..c6adbb734 100644 +--- a/patroni/api.py ++++ b/patroni/api.py +@@ -288,6 +288,11 @@ def do_GET_metrics(self): + metrics.append("# TYPE patroni_postgres_timeline counter") + metrics.append("patroni_postgres_timeline{0} {1}".format(scope_label, postgres.get('timeline', 0))) + ++ metrics.append("# HELP patroni_dcs_last_seen Epoch timestamp when DCS was last contacted successfully" ++ " by Patroni.") ++ metrics.append("# TYPE patroni_dcs_last_seen gauge") ++ metrics.append("patroni_dcs_last_seen{0} {1}".format(scope_label, postgres.get('dcs_last_seen', 0))) ++ + self._write_response(200, '\n'.join(metrics)+'\n', content_type='text/plain') + + def _read_json_content(self, body_is_optional=False): +@@ -600,6 +605,7 @@ def get_postgresql_status(self, retry=False): + 'role': 'replica' if row[1] == 0 else 'master', + 'server_version': postgresql.server_version, + 'cluster_unlocked': bool(not cluster or cluster.is_unlocked()), ++ 'dcs_last_seen': self.server.patroni.dcs.last_seen, + 'xlog': ({ + 'received_location': row[4] or row[3], + 'replayed_location': row[3], +diff --git a/patroni/dcs/__init__.py b/patroni/dcs/__init__.py +index 38b1e27d9..d96087d16 100644 +--- a/patroni/dcs/__init__.py ++++ b/patroni/dcs/__init__.py +@@ -652,6 +652,7 @@ def __init__(self, config): + self._cluster_valid_till = 0 + self._cluster_thread_lock = Lock() + self._last_lsn = '' ++ self._last_seen = 0 + self._last_status = {} + self.event = Event() + +@@ -722,6 +723,10 @@ def reload_config(self, config): + def loop_wait(self): + return self._loop_wait + ++ @property ++ def last_seen(self): ++ return self._last_seen ++ + @abc.abstractmethod + def _load_cluster(self): + """Internally this method should build `Cluster` 
object which +@@ -744,6 +749,8 @@ def get_cluster(self, force=False): + self.reset_cluster() + raise + ++ self._last_seen = int(time.time()) ++ + with self._cluster_thread_lock: + self._cluster = cluster + self._cluster_valid_till = time.time() + self.ttl +diff --git a/patroni/ha.py b/patroni/ha.py +index 209a52182..83c289c51 100644 +--- a/patroni/ha.py ++++ b/patroni/ha.py +@@ -21,13 +21,15 @@ + logger = logging.getLogger(__name__) + + +-class _MemberStatus(namedtuple('_MemberStatus', ['member', 'reachable', 'in_recovery', 'timeline', +- 'wal_position', 'tags', 'watchdog_failed'])): ++class _MemberStatus(namedtuple('_MemberStatus', ['member', 'reachable', 'in_recovery', ++ 'dcs_last_seen', 'timeline', 'wal_position', ++ 'tags', 'watchdog_failed'])): + """Node status distilled from API response: + + member - dcs.Member object of the node + reachable - `!False` if the node is not reachable or is not responding with correct JSON + in_recovery - `!True` if pg_is_in_recovery() == true ++ dcs_last_seen - timestamp from JSON of last successful communication with DCS + timeline - timeline value from JSON + wal_position - maximum value of `replayed_location` or `received_location` from JSON + tags - dictionary with values of different tags (i.e. 
nofailover) +@@ -37,12 +39,14 @@ class _MemberStatus(namedtuple('_MemberStatus', ['member', 'reachable', 'in_reco + def from_api_response(cls, member, json): + is_master = json['role'] == 'master' + timeline = json.get('timeline', 0) ++ dcs_last_seen = json.get('dcs_last_seen', 0) + wal = not is_master and max(json['xlog'].get('received_location', 0), json['xlog'].get('replayed_location', 0)) +- return cls(member, True, not is_master, timeline, wal, json.get('tags', {}), json.get('watchdog_failed', False)) ++ return cls(member, True, not is_master, dcs_last_seen, timeline, wal, ++ json.get('tags', {}), json.get('watchdog_failed', False)) + + @classmethod + def unknown(cls, member): +- return cls(member, False, None, 0, 0, {}, False) ++ return cls(member, False, None, 0, 0, 0, {}, False) + + def failover_limitation(self): + """Returns reason why this node can't promote or None if everything is ok.""" +diff --git a/tests/test_api.py b/tests/test_api.py +index 34a6224eb..60c3dd038 100644 +--- a/tests/test_api.py ++++ b/tests/test_api.py +@@ -54,6 +54,7 @@ class MockHa(object): + + state_handler = MockPostgresql() + watchdog = MockWatchdog() ++ dcs_last_seen = 0 + + @staticmethod + def is_leader(): +@@ -77,7 +78,7 @@ def delete_future_restart(): + + @staticmethod + def fetch_nodes_statuses(members): +- return [_MemberStatus(None, True, None, 0, None, {}, False)] ++ return [_MemberStatus(None, True, None, 0, 0, None, {}, False)] + + @staticmethod + def schedule_future_restart(data): +diff --git a/tests/test_ha.py b/tests/test_ha.py +index 3228be697..0f30a1102 100644 +--- a/tests/test_ha.py ++++ b/tests/test_ha.py +@@ -80,13 +80,14 @@ def get_standby_cluster_initialized_with_only_leader(failover=None, sync=None): + ) + + +-def get_node_status(reachable=True, in_recovery=True, timeline=2, +- wal_position=10, nofailover=False, watchdog_failed=False): ++def get_node_status(reachable=True, in_recovery=True, dcs_last_seen=0, ++ timeline=2, wal_position=10, nofailover=False, 
++ watchdog_failed=False): + def fetch_node_status(e): + tags = {} + if nofailover: + tags['nofailover'] = True +- return _MemberStatus(e, reachable, in_recovery, timeline, wal_position, tags, watchdog_failed) ++ return _MemberStatus(e, reachable, in_recovery, dcs_last_seen, timeline, wal_position, tags, watchdog_failed) + return fetch_node_status + + diff --git a/debian/patches/series b/debian/patches/series index 29713ae..bbd6fd4 100644 --- a/debian/patches/series +++ b/debian/patches/series @@ -5,3 +5,4 @@ offline_intersphinx.patch regression_tests_disable_requirement_download.patch requirements_cdiff.patch regression_tests_disable_raft_tests.py +dcs-last-seen.patch