Node and Cluster services reviews

2022-02-07 14:18:14 +01:00 · 2022-02-07 14:18:14 +01:00 · 4de20fefdc
parent ec3b8f6806
commit 4de20fefdc
3 changed files with 28 additions and 22 deletions
--- a/README.md
+++ b/README.md
@@ -23,7 +23,7 @@ Options:
 Commands:
  cluster_config_has_changed  Check if the hash of the configuration has...
  cluster_has_leader          Check if the cluster has a leader.
-  cluster_has_replica         Check if the cluster has healthy replicates.
+  cluster_has_replica         Check if the cluster has healthy replicas.
  cluster_is_in_maintenance   Check if the cluster is in maintenance mode...
  cluster_node_count          Count the number of nodes in the cluster.
  node_is_alive               Check if the node is alive ie patroni is...
@@ -133,6 +133,8 @@ Usage: check_patroni cluster_has_leader [OPTIONS]

  Check if the cluster has a leader.

+  Note: there is no difference between a normal and standby leader.
+
  Check:
  * `OK`: if there is a leader node.
  * `CRITICAL`: otherwise
@@ -148,9 +150,9 @@ Options:
 ```
 Usage: check_patroni cluster_has_replica [OPTIONS]

-  Check if the cluster has healthy replicates.
+  Check if the cluster has healthy replicas.

-  A healthy replicate:
+  A healthy replica:
  * is in running state
  * has a replica role
  * has a lag lower or equal to max_lag
@@ -164,8 +166,10 @@ Usage: check_patroni cluster_has_replica [OPTIONS]
  * the lag of each replica labelled with  "member name"_lag

 Options:
-  -w, --warning TEXT   Warning threshold for the number of nodes.
-  -c, --critical TEXT  Critical threshold for the number of replica nodes.
+  -w, --warning TEXT   Warning threshold for the number of healthy replica
+                       nodes.
+  -c, --critical TEXT  Critical threshold for the number of healthy replica
+                       nodes.
  --max-lag TEXT       maximum allowed lag
  --help               Show this message and exit.
 ```
@@ -197,7 +201,7 @@ Usage: check_patroni cluster_node_count [OPTIONS]

  Check:
  * Compares the number of nodes against the normal and running node warning and critical thresholds.
-  * `OK`!  If they are not provided.
+  * `OK`:  If they are not provided.

  Perfdata:
  * `members`: the member count.
@@ -206,9 +210,9 @@ Usage: check_patroni cluster_node_count [OPTIONS]

 Options:
  -w, --warning TEXT       Warning threshold for the number of nodes.
-  -c, --critical TEXT      Critical threshold for the nimber of nodes.
+  -c, --critical TEXT      Critical threshold for the number of nodes.
  --running-warning TEXT   Warning threshold for the number of running nodes.
-  --running-critical TEXT  Critical threshold for the nimber of running nodes.
+  --running-critical TEXT  Critical threshold for the number of running nodes.
  --help                   Show this message and exit.
 ```

--- a/check_patroni/cli.py
+++ b/check_patroni/cli.py
@@ -164,7 +164,7 @@ def main(
    "--critical",
    "critical",
    type=str,
-    help="Critical threshold for the nimber of nodes.",
+    help="Critical threshold for the number of nodes.",
 )
 @click.option(
    "--running-warning",
@@ -176,7 +176,7 @@ def main(
    "--running-critical",
    "running_critical",
    type=str,
-    help="Critical threshold for the nimber of running nodes.",
+    help="Critical threshold for the number of running nodes.",
 )
 @click.pass_context
 @nagiosplugin.guarded
@@ -192,7 +192,7 @@ def cluster_node_count(
    \b
    Check:
    * Compares the number of nodes against the normal and running node warning and critical thresholds.
-    * `OK`!  If they are not provided.
+    * `OK`:  If they are not provided.

    \b
    Perfdata:
@@ -213,8 +213,8 @@ def cluster_node_count(
            running_warning,
            running_critical,
        ),
-        nagiosplugin.ScalarContext("members_roles"),
-        nagiosplugin.ScalarContext("members_statuses"),
+        nagiosplugin.ScalarContext("member_roles"),
+        nagiosplugin.ScalarContext("member_statuses"),
    )
    check.main(verbose=ctx.obj.verbose, timeout=ctx.obj.timeout)

@@ -225,6 +225,8 @@ def cluster_node_count(
 def cluster_has_leader(ctx: click.Context) -> None:
    """Check if the cluster has a leader.

+    Note: there is no difference between a normal and standby leader.
+
    \b
    Check:
    * `OK`: if there is a leader node.
@@ -232,7 +234,6 @@ def cluster_has_leader(ctx: click.Context) -> None:

    Perfdata: `has_leader` is 1 if there is a leader node, 0 otherwise
    """
-    # FIXME: Manage primary or standby leader in the same place ?
    check = nagiosplugin.Check()
    check.add(
        ClusterHasLeader(ctx.obj.connection_info),
@@ -248,14 +249,14 @@ def cluster_has_leader(ctx: click.Context) -> None:
    "--warning",
    "warning",
    type=str,
-    help="Warning threshold for the number of nodes.",
+    help="Warning threshold for the number of healthy replica nodes.",
 )
 @click.option(
    "-c",
    "--critical",
    "critical",
    type=str,
-    help="Critical threshold for the number of replica nodes.",
+    help="Critical threshold for the number of healthy replica nodes.",
 )
 @click.option("--max-lag", "max_lag", type=str, help="maximum allowed lag")
 @click.pass_context
@@ -263,10 +264,10 @@ def cluster_has_leader(ctx: click.Context) -> None:
 def cluster_has_replica(
    ctx: click.Context, warning: str, critical: str, max_lag: str
 ) -> None:
-    """Check if the cluster has healthy replicates.
+    """Check if the cluster has healthy replicas.

    \b
-    A healthy replicate:
+    A healthy replica:
    * is in running state
    * has a replica role
    * has a lag lower or equal to max_lag
@@ -324,7 +325,7 @@ def cluster_config_has_changed(
    Perfdata:
    * `is_configuration_changed` is 1 if the configuration has changed
    """
-    # FIXME hash in perfdata ?
+    # Note: the hash cannot be exposed as perfdata because it is not a number.
    if (config_hash is None and state_file is None) or (
        config_hash is not None and state_file is not None
    ):
--- a/check_patroni/cluster.py
+++ b/check_patroni/cluster.py
@@ -39,14 +39,14 @@ class ClusterNodeCount(PatroniResource):
        # The performance data : role
        for role in role_counters:
            yield nagiosplugin.Metric(
-                f"role_{role}", role_counters[role], context="members_roles"
+                f"role_{role}", role_counters[role], context="member_roles"
            )

        # The performance data : statuses (except running)
        for state in status_counters:
            if state != "running":
                yield nagiosplugin.Metric(
-                    f"state_{state}", status_counters[state], context="members_statuses"
+                    f"state_{state}", status_counters[state], context="member_statuses"
                )


@@ -111,7 +111,7 @@ class ClusterHasReplica(PatroniResource):
        # The actual check
        yield nagiosplugin.Metric("healthy_replica", healthy_replica)

-        # The performance data : unheakthy replica count, replicas lag
+        # The performance data : unhealthy replica count, replicas lag
        yield nagiosplugin.Metric("unhealthy_replica", unhealthy_replica)
        for replica in replicas:
            yield nagiosplugin.Metric(
@@ -165,6 +165,7 @@ class ClusterConfigHasChanged(PatroniResource):


 class ClusterConfigHasChangedSummary(nagiosplugin.Summary):
+    # TODO: It would be helpful to display the old / new hash here, but it's not a metric.
    def ok(self: "ClusterConfigHasChangedSummary", results: nagiosplugin.Result) -> str:
        return "The hash of patroni's dynamic configuration has not changed."