Blame SOURCES/021-daemon-tracking.patch

97a979
From 9ee9fd6b98d8a5ff5eac57a14cbc0ce1009b10e4 Mon Sep 17 00:00:00 2001
97a979
From: Klaus Wenninger <klaus.wenninger@aon.at>
97a979
Date: Thu, 18 Nov 2021 13:23:34 +0100
97a979
Subject: [PATCH 1/2] Feature: pacemakerd: keep tracking pacemakerd for
97a979
 liveness
97a979
97a979
---
97a979
 daemons/pacemakerd/pacemakerd.c       |   2 +
97a979
 daemons/pacemakerd/pacemakerd.h       |   3 +-
97a979
 daemons/pacemakerd/pcmkd_messages.c   |   6 +-
97a979
 daemons/pacemakerd/pcmkd_subdaemons.c | 139 +++++++++++++++++---------
97a979
 4 files changed, 98 insertions(+), 52 deletions(-)
97a979
97a979
diff --git a/daemons/pacemakerd/pacemakerd.c b/daemons/pacemakerd/pacemakerd.c
97a979
index 34d64c4053..062c2d5326 100644
97a979
--- a/daemons/pacemakerd/pacemakerd.c
97a979
+++ b/daemons/pacemakerd/pacemakerd.c
97a979
@@ -259,6 +259,8 @@ main(int argc, char **argv)
97a979
     pcmk_ipc_api_t *old_instance = NULL;
97a979
     qb_ipcs_service_t *ipcs = NULL;
97a979
 
97a979
+    subdaemon_check_progress = time(NULL);
97a979
+
97a979
     crm_log_preinit(NULL, argc, argv);
97a979
     mainloop_add_signal(SIGHUP, pcmk_ignore);
97a979
     mainloop_add_signal(SIGQUIT, pcmk_sigquit);
97a979
diff --git a/daemons/pacemakerd/pacemakerd.h b/daemons/pacemakerd/pacemakerd.h
97a979
index 7c541bbf9e..424dbbcc5d 100644
97a979
--- a/daemons/pacemakerd/pacemakerd.h
97a979
+++ b/daemons/pacemakerd/pacemakerd.h
97a979
@@ -1,5 +1,5 @@
97a979
 /*
97a979
- * Copyright 2010-2021 the Pacemaker project contributors
97a979
+ * Copyright 2010-2022 the Pacemaker project contributors
97a979
  *
97a979
  * The version control history for this file may have further details.
97a979
  *
97a979
@@ -21,6 +21,7 @@ extern unsigned int shutdown_complete_state_reported_to;
97a979
 extern gboolean shutdown_complete_state_reported_client_closed;
97a979
 extern crm_trigger_t *shutdown_trigger;
97a979
 extern crm_trigger_t *startup_trigger;
97a979
+extern time_t subdaemon_check_progress;
97a979
 
97a979
 gboolean mcp_read_config(void);
97a979
 
97a979
diff --git a/daemons/pacemakerd/pcmkd_messages.c b/daemons/pacemakerd/pcmkd_messages.c
97a979
index 0439986ecf..f2cddc353e 100644
97a979
--- a/daemons/pacemakerd/pcmkd_messages.c
97a979
+++ b/daemons/pacemakerd/pcmkd_messages.c
97a979
@@ -1,5 +1,5 @@
97a979
 /*
97a979
- * Copyright 2010-2021 the Pacemaker project contributors
97a979
+ * Copyright 2010-2022 the Pacemaker project contributors
97a979
  *
97a979
  * The version control history for this file may have further details.
97a979
  *
97a979
@@ -25,7 +25,6 @@ pcmk_handle_ping_request(pcmk__client_t *c, xmlNode *msg, uint32_t id)
97a979
     const char *value = NULL;
97a979
     xmlNode *ping = NULL;
97a979
     xmlNode *reply = NULL;
97a979
-    time_t pinged = time(NULL);
97a979
     const char *from = crm_element_value(msg, F_CRM_SYS_FROM);
97a979
 
97a979
     /* Pinged for status */
97a979
@@ -36,7 +35,8 @@ pcmk_handle_ping_request(pcmk__client_t *c, xmlNode *msg, uint32_t id)
97a979
     value = crm_element_value(msg, F_CRM_SYS_TO);
97a979
     crm_xml_add(ping, XML_PING_ATTR_SYSFROM, value);
97a979
     crm_xml_add(ping, XML_PING_ATTR_PACEMAKERDSTATE, pacemakerd_state);
97a979
-    crm_xml_add_ll(ping, XML_ATTR_TSTAMP, (long long) pinged);
97a979
+    crm_xml_add_ll(ping, XML_ATTR_TSTAMP,
97a979
+                   (long long) subdaemon_check_progress);
97a979
     crm_xml_add(ping, XML_PING_ATTR_STATUS, "ok");
97a979
     reply = create_reply(msg, ping);
97a979
     free_xml(ping);
97a979
diff --git a/daemons/pacemakerd/pcmkd_subdaemons.c b/daemons/pacemakerd/pcmkd_subdaemons.c
97a979
index a54fcce1ba..c03903c99e 100644
97a979
--- a/daemons/pacemakerd/pcmkd_subdaemons.c
97a979
+++ b/daemons/pacemakerd/pcmkd_subdaemons.c
97a979
@@ -32,14 +32,16 @@ typedef struct pcmk_child_s {
97a979
     const char *command;
97a979
     const char *endpoint;  /* IPC server name */
97a979
     bool needs_cluster;
97a979
+    int check_count;
97a979
 
97a979
     /* Anything below here will be dynamically initialized */
97a979
     bool needs_retry;
97a979
     bool active_before_startup;
97a979
 } pcmk_child_t;
97a979
 
97a979
-#define PCMK_PROCESS_CHECK_INTERVAL 5
97a979
-#define SHUTDOWN_ESCALATION_PERIOD 180000  /* 3m */
97a979
+#define PCMK_PROCESS_CHECK_INTERVAL 1
97a979
+#define PCMK_PROCESS_CHECK_RETRIES  5
97a979
+#define SHUTDOWN_ESCALATION_PERIOD  180000  /* 3m */
97a979
 
97a979
 /* Index into the array below */
97a979
 #define PCMK_CHILD_CONTROLD  5
97a979
@@ -82,6 +84,7 @@ static char *opts_vgrind[] = { NULL, NULL, NULL, NULL, NULL };
97a979
 
97a979
 crm_trigger_t *shutdown_trigger = NULL;
97a979
 crm_trigger_t *startup_trigger = NULL;
97a979
+time_t subdaemon_check_progress = 0;
97a979
 
97a979
 /* When contacted via pacemakerd-api by a client having sbd in
97a979
  * the name we assume it is sbd-daemon which wants to know
97a979
@@ -103,7 +106,6 @@ gboolean running_with_sbd = FALSE; /* local copy */
97a979
 GMainLoop *mainloop = NULL;
97a979
 
97a979
 static gboolean fatal_error = FALSE;
97a979
-static bool global_keep_tracking = false;
97a979
 
97a979
 static gboolean check_active_before_startup_processes(gpointer user_data);
97a979
 static int child_liveness(pcmk_child_t *child);
97a979
@@ -127,44 +129,94 @@ pcmkd_cluster_connected(void)
97a979
 static gboolean
97a979
 check_active_before_startup_processes(gpointer user_data)
97a979
 {
97a979
-    gboolean keep_tracking = FALSE;
97a979
-
97a979
-    for (int i = 0; i < PCMK__NELEM(pcmk_children); i++) {
97a979
-        if (!pcmk_children[i].active_before_startup) {
97a979
-            /* we are already tracking it as a child process. */
97a979
-            continue;
97a979
-        } else {
97a979
-            int rc = child_liveness(&pcmk_children[i]);
97a979
-
97a979
-            switch (rc) {
97a979
-                case pcmk_rc_ok:
97a979
-                    break;
97a979
-                case pcmk_rc_ipc_unresponsive:
97a979
-                case pcmk_rc_ipc_pid_only: // This case: it was previously OK
97a979
-                    if (pcmk_children[i].respawn) {
97a979
-                        crm_err("%s[%lld] terminated%s", pcmk_children[i].name,
97a979
-                                (long long) PCMK__SPECIAL_PID_AS_0(pcmk_children[i].pid),
97a979
-                                (rc == pcmk_rc_ipc_pid_only)? " as IPC server" : "");
97a979
-                    } else {
97a979
-                        /* orderly shutdown */
97a979
-                        crm_notice("%s[%lld] terminated%s", pcmk_children[i].name,
97a979
-                                   (long long) PCMK__SPECIAL_PID_AS_0(pcmk_children[i].pid),
97a979
-                                   (rc == pcmk_rc_ipc_pid_only)? " as IPC server" : "");
97a979
-                    }
97a979
-                    pcmk_process_exit(&(pcmk_children[i]));
97a979
-                    continue;
97a979
-                default:
97a979
-                    crm_exit(CRM_EX_FATAL);
97a979
-                    break;  /* static analysis/noreturn */
97a979
+    static int next_child = 0;
97a979
+    int rc = child_liveness(&pcmk_children[next_child]);
97a979
+
97a979
+    crm_trace("%s[%lld] checked as %d",
97a979
+                           pcmk_children[next_child].name,
97a979
+                           (long long) PCMK__SPECIAL_PID_AS_0(
97a979
+                            pcmk_children[next_child].pid),
97a979
+                            rc);
97a979
+
97a979
+    switch (rc) {
97a979
+        case pcmk_rc_ok:
97a979
+            pcmk_children[next_child].check_count = 0;
97a979
+            next_child++;
97a979
+            subdaemon_check_progress = time(NULL);
97a979
+            break;
97a979
+        case pcmk_rc_ipc_pid_only: // This case: it was previously OK
97a979
+            pcmk_children[next_child].check_count++;
97a979
+            if (pcmk_children[next_child].check_count >= PCMK_PROCESS_CHECK_RETRIES) {
97a979
+                crm_err("%s[%lld] is unresponsive to ipc after %d tries but "
97a979
+                        "we found the pid so have it killed that we can restart",
97a979
+                        pcmk_children[next_child].name,
97a979
+                        (long long) PCMK__SPECIAL_PID_AS_0(
97a979
+                            pcmk_children[next_child].pid),
97a979
+                        pcmk_children[next_child].check_count);
97a979
+                stop_child(&pcmk_children[next_child], SIGKILL);
97a979
+                if (pcmk_children[next_child].respawn) {
97a979
+                    /* as long as the respawn-limit isn't reached
97a979
+                       give it another round of check retries
97a979
+                     */
97a979
+                    pcmk_children[next_child].check_count = 0;
97a979
+                }
97a979
+            } else {
97a979
+                crm_notice("%s[%lld] is unresponsive to ipc after %d tries",
97a979
+                        pcmk_children[next_child].name,
97a979
+                        (long long) PCMK__SPECIAL_PID_AS_0(
97a979
+                            pcmk_children[next_child].pid),
97a979
+                        pcmk_children[next_child].check_count);
97a979
+                if (pcmk_children[next_child].respawn) {
97a979
+                    /* as long as the respawn-limit isn't reached
97a979
+                       and we haven't run out of connect retries
97a979
+                       we account this as progress we are willing
97a979
+                       to tell to sbd
97a979
+                     */
97a979
+                    subdaemon_check_progress = time(NULL);
97a979
+                }
97a979
             }
97a979
-        }
97a979
-        /* at least one of the processes found at startup
97a979
-         * is still going, so keep this recurring timer around */
97a979
-        keep_tracking = TRUE;
97a979
+            /* go to the next child and see if
97a979
+               we can make progress there
97a979
+             */
97a979
+            next_child++;
97a979
+            break;
97a979
+        case pcmk_rc_ipc_unresponsive:
97a979
+            if (pcmk_children[next_child].respawn) {
97a979
+                crm_err("%s[%lld] terminated",
97a979
+                        pcmk_children[next_child].name,
97a979
+                        (long long) PCMK__SPECIAL_PID_AS_0(
97a979
+                            pcmk_children[next_child].pid));
97a979
+            } else {
97a979
+                /* orderly shutdown */
97a979
+                crm_notice("%s[%lld] terminated",
97a979
+                           pcmk_children[next_child].name,
97a979
+                           (long long) PCMK__SPECIAL_PID_AS_0(
97a979
+                                pcmk_children[next_child].pid));
97a979
+            }
97a979
+            pcmk_process_exit(&(pcmk_children[next_child]));
97a979
+            if (!pcmk_children[next_child].respawn) {
97a979
+                /* if a subdaemon is down and we don't want it
97a979
+                   to be restarted this is a success during
97a979
+                   shutdown. if it isn't restarted anymore
97a979
+                   due to MAX_RESPAWN it is
97a979
+                   rather no success.
97a979
+                 */
97a979
+                if (pcmk_children[next_child].respawn_count <= MAX_RESPAWN) {
97a979
+                    subdaemon_check_progress = time(NULL);
97a979
+                }
97a979
+                next_child++;
97a979
+            }
97a979
+            break;
97a979
+        default:
97a979
+            crm_exit(CRM_EX_FATAL);
97a979
+            break;  /* static analysis/noreturn */
97a979
     }
97a979
 
97a979
-    global_keep_tracking = keep_tracking;
97a979
-    return keep_tracking;
97a979
+    if (next_child >= PCMK__NELEM(pcmk_children)) {
97a979
+        next_child = 0;
97a979
+    }
97a979
+
97a979
+    return G_SOURCE_CONTINUE;
97a979
 }
97a979
 
97a979
 static gboolean
97a979
@@ -257,11 +309,6 @@ pcmk_process_exit(pcmk_child_t * child)
97a979
                  child->name, child->endpoint);
97a979
         /* need to monitor how it evolves, and start new process if badly */
97a979
         child->active_before_startup = true;
97a979
-        if (!global_keep_tracking) {
97a979
-            global_keep_tracking = true;
97a979
-            g_timeout_add_seconds(PCMK_PROCESS_CHECK_INTERVAL,
97a979
-                                  check_active_before_startup_processes, NULL);
97a979
-        }
97a979
 
97a979
     } else {
97a979
         if (child->needs_cluster && !pcmkd_cluster_connected()) {
97a979
@@ -648,7 +695,6 @@ child_liveness(pcmk_child_t *child)
97a979
 int
97a979
 find_and_track_existing_processes(void)
97a979
 {
97a979
-    bool tracking = false;
97a979
     bool wait_in_progress;
97a979
     int rc;
97a979
     size_t i, rounds;
97a979
@@ -716,7 +762,6 @@ find_and_track_existing_processes(void)
97a979
                                                pcmk_children[i].pid));
97a979
                     pcmk_children[i].respawn_count = -1;  /* 0~keep watching */
97a979
                     pcmk_children[i].active_before_startup = true;
97a979
-                    tracking = true;
97a979
                     break;
97a979
                 case pcmk_rc_ipc_pid_only:
97a979
                     if (pcmk_children[i].respawn_count == WAIT_TRIES) {
97a979
@@ -751,10 +796,8 @@ find_and_track_existing_processes(void)
97a979
         pcmk_children[i].respawn_count = 0;  /* restore pristine state */
97a979
     }
97a979
 
97a979
-    if (tracking) {
97a979
-        g_timeout_add_seconds(PCMK_PROCESS_CHECK_INTERVAL,
97a979
+    g_timeout_add_seconds(PCMK_PROCESS_CHECK_INTERVAL,
97a979
                               check_active_before_startup_processes, NULL);
97a979
-    }
97a979
     return pcmk_rc_ok;
97a979
 }
97a979
 
97a979
-- 
97a979
2.27.0
97a979
97a979
97a979
From 4b60aa100669ff494dd3f1303ca9586dc52e95e4 Mon Sep 17 00:00:00 2001
97a979
From: Klaus Wenninger <klaus.wenninger@aon.at>
97a979
Date: Thu, 9 Dec 2021 11:25:22 +0100
97a979
Subject: [PATCH 2/2] Fix: ipc_client: use libqb async API for connect
97a979
97a979
---
97a979
 configure.ac            |  3 +++
97a979
 lib/common/ipc_client.c | 22 ++++++++++++++++++++++
97a979
 2 files changed, 25 insertions(+)
97a979
97a979
diff --git a/configure.ac b/configure.ac
97a979
index f43fb724c7..c747fe1193 100644
97a979
--- a/configure.ac
97a979
+++ b/configure.ac
97a979
@@ -1309,6 +1309,9 @@ PKG_CHECK_MODULES(libqb, libqb >= 0.17)
97a979
 CPPFLAGS="$libqb_CFLAGS $CPPFLAGS"
97a979
 LIBS="$libqb_LIBS $LIBS"
97a979
 
97a979
+dnl libqb 2.0.3 + ipc-connect-async API (2022-01)
97a979
+AC_CHECK_FUNCS([qb_ipcc_connect_async])
97a979
+
97a979
 dnl libqb 2.0.2+ (2020-10)
97a979
 AC_CHECK_FUNCS(qb_ipcc_auth_get,
97a979
                AC_DEFINE(HAVE_IPCC_AUTH_GET, 1,
97a979
diff --git a/lib/common/ipc_client.c b/lib/common/ipc_client.c
97a979
index c5afdf3a3d..417b9ef175 100644
97a979
--- a/lib/common/ipc_client.c
97a979
+++ b/lib/common/ipc_client.c
97a979
@@ -1407,13 +1407,35 @@ pcmk__ipc_is_authentic_process_active(const char *name, uid_t refuid,
97a979
     int32_t qb_rc;
97a979
     pid_t found_pid = 0; uid_t found_uid = 0; gid_t found_gid = 0;
97a979
     qb_ipcc_connection_t *c;
97a979
+#ifdef HAVE_QB_IPCC_CONNECT_ASYNC
97a979
+    struct pollfd pollfd = { 0, };
97a979
+    int poll_rc;
97a979
 
97a979
+    c = qb_ipcc_connect_async(name, 0,
97a979
+                              &(pollfd.fd));
97a979
+#else
97a979
     c = qb_ipcc_connect(name, 0);
97a979
+#endif
97a979
     if (c == NULL) {
97a979
         crm_info("Could not connect to %s IPC: %s", name, strerror(errno));
97a979
         rc = pcmk_rc_ipc_unresponsive;
97a979
         goto bail;
97a979
     }
97a979
+#ifdef HAVE_QB_IPCC_CONNECT_ASYNC
97a979
+    pollfd.events = POLLIN;
97a979
+    do {
97a979
+        poll_rc = poll(&pollfd, 1, 2000);
97a979
+    } while ((poll_rc == -1) && (errno == EINTR));
97a979
+    if ((poll_rc <= 0) || (qb_ipcc_connect_continue(c) != 0)) {
97a979
+        crm_info("Could not connect to %s IPC: %s", name,
97a979
+                 (poll_rc == 0)?"timeout":strerror(errno));
97a979
+        rc = pcmk_rc_ipc_unresponsive;
97a979
+        if (poll_rc > 0) {
97a979
+            c = NULL; // qb_ipcc_connect_continue cleaned up for us
97a979
+        }
97a979
+        goto bail;
97a979
+    }
97a979
+#endif
97a979
 
97a979
     qb_rc = qb_ipcc_fd_get(c, &fd);
97a979
     if (qb_rc != 0) {
97a979
-- 
97a979
2.27.0
97a979