Blame SOURCES/021-daemon-tracking.patch

d4e586
From 9ee9fd6b98d8a5ff5eac57a14cbc0ce1009b10e4 Mon Sep 17 00:00:00 2001
d4e586
From: Klaus Wenninger <klaus.wenninger@aon.at>
d4e586
Date: Thu, 18 Nov 2021 13:23:34 +0100
d4e586
Subject: [PATCH 1/2] Feature: pacemakerd: keep tracking pacemakerd for
d4e586
 liveness
d4e586
d4e586
---
d4e586
 daemons/pacemakerd/pacemakerd.c       |   2 +
d4e586
 daemons/pacemakerd/pacemakerd.h       |   3 +-
d4e586
 daemons/pacemakerd/pcmkd_messages.c   |   6 +-
d4e586
 daemons/pacemakerd/pcmkd_subdaemons.c | 139 +++++++++++++++++---------
d4e586
 4 files changed, 98 insertions(+), 52 deletions(-)
d4e586
d4e586
diff --git a/daemons/pacemakerd/pacemakerd.c b/daemons/pacemakerd/pacemakerd.c
d4e586
index 34d64c4053..062c2d5326 100644
d4e586
--- a/daemons/pacemakerd/pacemakerd.c
d4e586
+++ b/daemons/pacemakerd/pacemakerd.c
d4e586
@@ -259,6 +259,8 @@ main(int argc, char **argv)
d4e586
     pcmk_ipc_api_t *old_instance = NULL;
d4e586
     qb_ipcs_service_t *ipcs = NULL;
d4e586
 
d4e586
+    subdaemon_check_progress = time(NULL);
d4e586
+
d4e586
     crm_log_preinit(NULL, argc, argv);
d4e586
     mainloop_add_signal(SIGHUP, pcmk_ignore);
d4e586
     mainloop_add_signal(SIGQUIT, pcmk_sigquit);
d4e586
diff --git a/daemons/pacemakerd/pacemakerd.h b/daemons/pacemakerd/pacemakerd.h
d4e586
index 7c541bbf9e..424dbbcc5d 100644
d4e586
--- a/daemons/pacemakerd/pacemakerd.h
d4e586
+++ b/daemons/pacemakerd/pacemakerd.h
d4e586
@@ -1,5 +1,5 @@
d4e586
 /*
d4e586
- * Copyright 2010-2021 the Pacemaker project contributors
d4e586
+ * Copyright 2010-2022 the Pacemaker project contributors
d4e586
  *
d4e586
  * The version control history for this file may have further details.
d4e586
  *
d4e586
@@ -21,6 +21,7 @@ extern unsigned int shutdown_complete_state_reported_to;
d4e586
 extern gboolean shutdown_complete_state_reported_client_closed;
d4e586
 extern crm_trigger_t *shutdown_trigger;
d4e586
 extern crm_trigger_t *startup_trigger;
d4e586
+extern time_t subdaemon_check_progress;
d4e586
 
d4e586
 gboolean mcp_read_config(void);
d4e586
 
d4e586
diff --git a/daemons/pacemakerd/pcmkd_messages.c b/daemons/pacemakerd/pcmkd_messages.c
d4e586
index 0439986ecf..f2cddc353e 100644
d4e586
--- a/daemons/pacemakerd/pcmkd_messages.c
d4e586
+++ b/daemons/pacemakerd/pcmkd_messages.c
d4e586
@@ -1,5 +1,5 @@
d4e586
 /*
d4e586
- * Copyright 2010-2021 the Pacemaker project contributors
d4e586
+ * Copyright 2010-2022 the Pacemaker project contributors
d4e586
  *
d4e586
  * The version control history for this file may have further details.
d4e586
  *
d4e586
@@ -25,7 +25,6 @@ pcmk_handle_ping_request(pcmk__client_t *c, xmlNode *msg, uint32_t id)
d4e586
     const char *value = NULL;
d4e586
     xmlNode *ping = NULL;
d4e586
     xmlNode *reply = NULL;
d4e586
-    time_t pinged = time(NULL);
d4e586
     const char *from = crm_element_value(msg, F_CRM_SYS_FROM);
d4e586
 
d4e586
     /* Pinged for status */
d4e586
@@ -36,7 +35,8 @@ pcmk_handle_ping_request(pcmk__client_t *c, xmlNode *msg, uint32_t id)
d4e586
     value = crm_element_value(msg, F_CRM_SYS_TO);
d4e586
     crm_xml_add(ping, XML_PING_ATTR_SYSFROM, value);
d4e586
     crm_xml_add(ping, XML_PING_ATTR_PACEMAKERDSTATE, pacemakerd_state);
d4e586
-    crm_xml_add_ll(ping, XML_ATTR_TSTAMP, (long long) pinged);
d4e586
+    crm_xml_add_ll(ping, XML_ATTR_TSTAMP,
d4e586
+                   (long long) subdaemon_check_progress);
d4e586
     crm_xml_add(ping, XML_PING_ATTR_STATUS, "ok");
d4e586
     reply = create_reply(msg, ping);
d4e586
     free_xml(ping);
d4e586
diff --git a/daemons/pacemakerd/pcmkd_subdaemons.c b/daemons/pacemakerd/pcmkd_subdaemons.c
d4e586
index a54fcce1ba..c03903c99e 100644
d4e586
--- a/daemons/pacemakerd/pcmkd_subdaemons.c
d4e586
+++ b/daemons/pacemakerd/pcmkd_subdaemons.c
d4e586
@@ -32,14 +32,16 @@ typedef struct pcmk_child_s {
d4e586
     const char *command;
d4e586
     const char *endpoint;  /* IPC server name */
d4e586
     bool needs_cluster;
d4e586
+    int check_count;
d4e586
 
d4e586
     /* Anything below here will be dynamically initialized */
d4e586
     bool needs_retry;
d4e586
     bool active_before_startup;
d4e586
 } pcmk_child_t;
d4e586
 
d4e586
-#define PCMK_PROCESS_CHECK_INTERVAL 5
d4e586
-#define SHUTDOWN_ESCALATION_PERIOD 180000  /* 3m */
d4e586
+#define PCMK_PROCESS_CHECK_INTERVAL 1
d4e586
+#define PCMK_PROCESS_CHECK_RETRIES  5
d4e586
+#define SHUTDOWN_ESCALATION_PERIOD  180000  /* 3m */
d4e586
 
d4e586
 /* Index into the array below */
d4e586
 #define PCMK_CHILD_CONTROLD  5
d4e586
@@ -82,6 +84,7 @@ static char *opts_vgrind[] = { NULL, NULL, NULL, NULL, NULL };
d4e586
 
d4e586
 crm_trigger_t *shutdown_trigger = NULL;
d4e586
 crm_trigger_t *startup_trigger = NULL;
d4e586
+time_t subdaemon_check_progress = 0;
d4e586
 
d4e586
 /* When contacted via pacemakerd-api by a client having sbd in
d4e586
  * the name we assume it is sbd-daemon which wants to know
d4e586
@@ -103,7 +106,6 @@ gboolean running_with_sbd = FALSE; /* local copy */
d4e586
 GMainLoop *mainloop = NULL;
d4e586
 
d4e586
 static gboolean fatal_error = FALSE;
d4e586
-static bool global_keep_tracking = false;
d4e586
 
d4e586
 static gboolean check_active_before_startup_processes(gpointer user_data);
d4e586
 static int child_liveness(pcmk_child_t *child);
d4e586
@@ -127,44 +129,94 @@ pcmkd_cluster_connected(void)
d4e586
 static gboolean
d4e586
 check_active_before_startup_processes(gpointer user_data)
d4e586
 {
d4e586
-    gboolean keep_tracking = FALSE;
d4e586
-
d4e586
-    for (int i = 0; i < PCMK__NELEM(pcmk_children); i++) {
d4e586
-        if (!pcmk_children[i].active_before_startup) {
d4e586
-            /* we are already tracking it as a child process. */
d4e586
-            continue;
d4e586
-        } else {
d4e586
-            int rc = child_liveness(&pcmk_children[i]);
d4e586
-
d4e586
-            switch (rc) {
d4e586
-                case pcmk_rc_ok:
d4e586
-                    break;
d4e586
-                case pcmk_rc_ipc_unresponsive:
d4e586
-                case pcmk_rc_ipc_pid_only: // This case: it was previously OK
d4e586
-                    if (pcmk_children[i].respawn) {
d4e586
-                        crm_err("%s[%lld] terminated%s", pcmk_children[i].name,
d4e586
-                                (long long) PCMK__SPECIAL_PID_AS_0(pcmk_children[i].pid),
d4e586
-                                (rc == pcmk_rc_ipc_pid_only)? " as IPC server" : "");
d4e586
-                    } else {
d4e586
-                        /* orderly shutdown */
d4e586
-                        crm_notice("%s[%lld] terminated%s", pcmk_children[i].name,
d4e586
-                                   (long long) PCMK__SPECIAL_PID_AS_0(pcmk_children[i].pid),
d4e586
-                                   (rc == pcmk_rc_ipc_pid_only)? " as IPC server" : "");
d4e586
-                    }
d4e586
-                    pcmk_process_exit(&(pcmk_children[i]));
d4e586
-                    continue;
d4e586
-                default:
d4e586
-                    crm_exit(CRM_EX_FATAL);
d4e586
-                    break;  /* static analysis/noreturn */
d4e586
+    static int next_child = 0;
d4e586
+    int rc = child_liveness(&pcmk_children[next_child]);
d4e586
+
d4e586
+    crm_trace("%s[%lld] checked as %d",
d4e586
+                           pcmk_children[next_child].name,
d4e586
+                           (long long) PCMK__SPECIAL_PID_AS_0(
d4e586
+                            pcmk_children[next_child].pid),
d4e586
+                            rc);
d4e586
+
d4e586
+    switch (rc) {
d4e586
+        case pcmk_rc_ok:
d4e586
+            pcmk_children[next_child].check_count = 0;
d4e586
+            next_child++;
d4e586
+            subdaemon_check_progress = time(NULL);
d4e586
+            break;
d4e586
+        case pcmk_rc_ipc_pid_only: // This case: it was previously OK
d4e586
+            pcmk_children[next_child].check_count++;
d4e586
+            if (pcmk_children[next_child].check_count >= PCMK_PROCESS_CHECK_RETRIES) {
d4e586
+                crm_err("%s[%lld] is unresponsive to ipc after %d tries but "
d4e586
+                        "we found the pid so have it killed that we can restart",
d4e586
+                        pcmk_children[next_child].name,
d4e586
+                        (long long) PCMK__SPECIAL_PID_AS_0(
d4e586
+                            pcmk_children[next_child].pid),
d4e586
+                        pcmk_children[next_child].check_count);
d4e586
+                stop_child(&pcmk_children[next_child], SIGKILL);
d4e586
+                if (pcmk_children[next_child].respawn) {
d4e586
+                    /* as long as the respawn-limit isn't reached
d4e586
+                       give it another round of check retries
d4e586
+                     */
d4e586
+                    pcmk_children[next_child].check_count = 0;
d4e586
+                }
d4e586
+            } else {
d4e586
+                crm_notice("%s[%lld] is unresponsive to ipc after %d tries",
d4e586
+                        pcmk_children[next_child].name,
d4e586
+                        (long long) PCMK__SPECIAL_PID_AS_0(
d4e586
+                            pcmk_children[next_child].pid),
d4e586
+                        pcmk_children[next_child].check_count);
d4e586
+                if (pcmk_children[next_child].respawn) {
d4e586
+                    /* as long as the respawn-limit isn't reached
d4e586
+                       and we haven't run out of connect retries
d4e586
+                       we account this as progress we are willing
d4e586
+                       to tell to sbd
d4e586
+                     */
d4e586
+                    subdaemon_check_progress = time(NULL);
d4e586
+                }
d4e586
             }
d4e586
-        }
d4e586
-        /* at least one of the processes found at startup
d4e586
-         * is still going, so keep this recurring timer around */
d4e586
-        keep_tracking = TRUE;
d4e586
+            /* go to the next child and see if
d4e586
+               we can make progress there
d4e586
+             */
d4e586
+            next_child++;
d4e586
+            break;
d4e586
+        case pcmk_rc_ipc_unresponsive:
d4e586
+            if (pcmk_children[next_child].respawn) {
d4e586
+                crm_err("%s[%lld] terminated",
d4e586
+                        pcmk_children[next_child].name,
d4e586
+                        (long long) PCMK__SPECIAL_PID_AS_0(
d4e586
+                            pcmk_children[next_child].pid));
d4e586
+            } else {
d4e586
+                /* orderly shutdown */
d4e586
+                crm_notice("%s[%lld] terminated",
d4e586
+                           pcmk_children[next_child].name,
d4e586
+                           (long long) PCMK__SPECIAL_PID_AS_0(
d4e586
+                                pcmk_children[next_child].pid));
d4e586
+            }
d4e586
+            pcmk_process_exit(&(pcmk_children[next_child]));
d4e586
+            if (!pcmk_children[next_child].respawn) {
d4e586
+                /* if a subdaemon is down and we don't want it
d4e586
+                   to be restarted this is a success during
d4e586
+                   shutdown. if it isn't restarted anymore
d4e586
+                   due to MAX_RESPAWN it is
d4e586
+                   rather no success.
d4e586
+                 */
d4e586
+                if (pcmk_children[next_child].respawn_count <= MAX_RESPAWN) {
d4e586
+                    subdaemon_check_progress = time(NULL);
d4e586
+                }
d4e586
+                next_child++;
d4e586
+            }
d4e586
+            break;
d4e586
+        default:
d4e586
+            crm_exit(CRM_EX_FATAL);
d4e586
+            break;  /* static analysis/noreturn */
d4e586
     }
d4e586
 
d4e586
-    global_keep_tracking = keep_tracking;
d4e586
-    return keep_tracking;
d4e586
+    if (next_child >= PCMK__NELEM(pcmk_children)) {
d4e586
+        next_child = 0;
d4e586
+    }
d4e586
+
d4e586
+    return G_SOURCE_CONTINUE;
d4e586
 }
d4e586
 
d4e586
 static gboolean
d4e586
@@ -257,11 +309,6 @@ pcmk_process_exit(pcmk_child_t * child)
d4e586
                  child->name, child->endpoint);
d4e586
         /* need to monitor how it evolves, and start new process if badly */
d4e586
         child->active_before_startup = true;
d4e586
-        if (!global_keep_tracking) {
d4e586
-            global_keep_tracking = true;
d4e586
-            g_timeout_add_seconds(PCMK_PROCESS_CHECK_INTERVAL,
d4e586
-                                  check_active_before_startup_processes, NULL);
d4e586
-        }
d4e586
 
d4e586
     } else {
d4e586
         if (child->needs_cluster && !pcmkd_cluster_connected()) {
d4e586
@@ -648,7 +695,6 @@ child_liveness(pcmk_child_t *child)
d4e586
 int
d4e586
 find_and_track_existing_processes(void)
d4e586
 {
d4e586
-    bool tracking = false;
d4e586
     bool wait_in_progress;
d4e586
     int rc;
d4e586
     size_t i, rounds;
d4e586
@@ -716,7 +762,6 @@ find_and_track_existing_processes(void)
d4e586
                                                pcmk_children[i].pid));
d4e586
                     pcmk_children[i].respawn_count = -1;  /* 0~keep watching */
d4e586
                     pcmk_children[i].active_before_startup = true;
d4e586
-                    tracking = true;
d4e586
                     break;
d4e586
                 case pcmk_rc_ipc_pid_only:
d4e586
                     if (pcmk_children[i].respawn_count == WAIT_TRIES) {
d4e586
@@ -751,10 +796,8 @@ find_and_track_existing_processes(void)
d4e586
         pcmk_children[i].respawn_count = 0;  /* restore pristine state */
d4e586
     }
d4e586
 
d4e586
-    if (tracking) {
d4e586
-        g_timeout_add_seconds(PCMK_PROCESS_CHECK_INTERVAL,
d4e586
+    g_timeout_add_seconds(PCMK_PROCESS_CHECK_INTERVAL,
d4e586
                               check_active_before_startup_processes, NULL);
d4e586
-    }
d4e586
     return pcmk_rc_ok;
d4e586
 }
d4e586
 
d4e586
-- 
d4e586
2.27.0
d4e586
d4e586
d4e586
From 4b60aa100669ff494dd3f1303ca9586dc52e95e4 Mon Sep 17 00:00:00 2001
d4e586
From: Klaus Wenninger <klaus.wenninger@aon.at>
d4e586
Date: Thu, 9 Dec 2021 11:25:22 +0100
d4e586
Subject: [PATCH 2/2] Fix: ipc_client: use libqb async API for connect
d4e586
d4e586
---
d4e586
 configure.ac            |  3 +++
d4e586
 lib/common/ipc_client.c | 22 ++++++++++++++++++++++
d4e586
 2 files changed, 25 insertions(+)
d4e586
d4e586
diff --git a/configure.ac b/configure.ac
d4e586
index f43fb724c7..c747fe1193 100644
d4e586
--- a/configure.ac
d4e586
+++ b/configure.ac
d4e586
@@ -1309,6 +1309,9 @@ PKG_CHECK_MODULES(libqb, libqb >= 0.17)
d4e586
 CPPFLAGS="$libqb_CFLAGS $CPPFLAGS"
d4e586
 LIBS="$libqb_LIBS $LIBS"
d4e586
 
d4e586
+dnl libqb 2.0.3 + ipc-connect-async-API (2022-01)
d4e586
+AC_CHECK_FUNCS([qb_ipcc_connect_async])
d4e586
+
d4e586
 dnl libqb 2.0.2+ (2020-10)
d4e586
 AC_CHECK_FUNCS(qb_ipcc_auth_get,
d4e586
                AC_DEFINE(HAVE_IPCC_AUTH_GET, 1,
d4e586
diff --git a/lib/common/ipc_client.c b/lib/common/ipc_client.c
d4e586
index c5afdf3a3d..417b9ef175 100644
d4e586
--- a/lib/common/ipc_client.c
d4e586
+++ b/lib/common/ipc_client.c
d4e586
@@ -1407,13 +1407,35 @@ pcmk__ipc_is_authentic_process_active(const char *name, uid_t refuid,
d4e586
     int32_t qb_rc;
d4e586
     pid_t found_pid = 0; uid_t found_uid = 0; gid_t found_gid = 0;
d4e586
     qb_ipcc_connection_t *c;
d4e586
+#ifdef HAVE_QB_IPCC_CONNECT_ASYNC
d4e586
+    struct pollfd pollfd = { 0, };
d4e586
+    int poll_rc;
d4e586
 
d4e586
+    c = qb_ipcc_connect_async(name, 0,
d4e586
+                              &(pollfd.fd));
d4e586
+#else
d4e586
     c = qb_ipcc_connect(name, 0);
d4e586
+#endif
d4e586
     if (c == NULL) {
d4e586
         crm_info("Could not connect to %s IPC: %s", name, strerror(errno));
d4e586
         rc = pcmk_rc_ipc_unresponsive;
d4e586
         goto bail;
d4e586
     }
d4e586
+#ifdef HAVE_QB_IPCC_CONNECT_ASYNC
d4e586
+    pollfd.events = POLLIN;
d4e586
+    do {
d4e586
+        poll_rc = poll(&pollfd, 1, 2000);
d4e586
+    } while ((poll_rc == -1) && (errno == EINTR));
d4e586
+    if ((poll_rc <= 0) || (qb_ipcc_connect_continue(c) != 0)) {
d4e586
+        crm_info("Could not connect to %s IPC: %s", name,
d4e586
+                 (poll_rc == 0)?"timeout":strerror(errno));
d4e586
+        rc = pcmk_rc_ipc_unresponsive;
d4e586
+        if (poll_rc > 0) {
d4e586
+            c = NULL; // qb_ipcc_connect_continue cleaned up for us
d4e586
+        }
d4e586
+        goto bail;
d4e586
+    }
d4e586
+#endif
d4e586
 
d4e586
     qb_rc = qb_ipcc_fd_get(c, &fd);
d4e586
     if (qb_rc != 0) {
d4e586
-- 
d4e586
2.27.0
d4e586