Blame SOURCES/024-daemon-tracking.patch

97a979
From ac92690d8426ec4d1c8be1e0eb4b9289411afe75 Mon Sep 17 00:00:00 2001
97a979
From: Klaus Wenninger <klaus.wenninger@aon.at>
97a979
Date: Mon, 24 Jan 2022 12:18:42 +0100
97a979
Subject: [PATCH] Fix: pacemakerd: have signal-handler take care of lost
97a979
 processes
97a979
97a979
Regression from the introduction of periodic subdaemon checking
97a979
in cases where they are pacemakerd children - previously it was either
97a979
periodic checking or a signal handler per process.
97a979
---
97a979
 daemons/pacemakerd/pcmkd_subdaemons.c | 38 ++++++++++++++++-----------
97a979
 1 file changed, 22 insertions(+), 16 deletions(-)
97a979
97a979
diff --git a/daemons/pacemakerd/pcmkd_subdaemons.c b/daemons/pacemakerd/pcmkd_subdaemons.c
97a979
index c03903c99e..84ecdc1ee8 100644
97a979
--- a/daemons/pacemakerd/pcmkd_subdaemons.c
97a979
+++ b/daemons/pacemakerd/pcmkd_subdaemons.c
97a979
@@ -141,7 +141,6 @@ check_active_before_startup_processes(gpointer user_data)
97a979
     switch (rc) {
97a979
         case pcmk_rc_ok:
97a979
             pcmk_children[next_child].check_count = 0;
97a979
-            next_child++;
97a979
             subdaemon_check_progress = time(NULL);
97a979
             break;
97a979
         case pcmk_rc_ipc_pid_only: // This case: it was previously OK
97a979
@@ -178,9 +177,27 @@ check_active_before_startup_processes(gpointer user_data)
97a979
             /* go to the next child and see if
97a979
                we can make progress there
97a979
              */
97a979
-            next_child++;
97a979
             break;
97a979
         case pcmk_rc_ipc_unresponsive:
97a979
+            if (!pcmk_children[next_child].respawn) {
97a979
+                /* if a subdaemon is down and we don't want it
97a979
+                   to be restarted this is a success during
97a979
+                   shutdown. if it isn't restarted anymore
97a979
+                   due to MAX_RESPAWN it is
97a979
+                   rather no success.
97a979
+                 */
97a979
+                if (pcmk_children[next_child].respawn_count <= MAX_RESPAWN) {
97a979
+                    subdaemon_check_progress = time(NULL);
97a979
+                }
97a979
+            }
97a979
+            if (!pcmk_children[next_child].active_before_startup) {
97a979
+                crm_trace("found %s[%lld] missing - signal-handler "
97a979
+                          "will take care of it",
97a979
+                           pcmk_children[next_child].name,
97a979
+                           (long long) PCMK__SPECIAL_PID_AS_0(
97a979
+                            pcmk_children[next_child].pid));
97a979
+                break;
97a979
+            }
97a979
             if (pcmk_children[next_child].respawn) {
97a979
                 crm_err("%s[%lld] terminated",
97a979
                         pcmk_children[next_child].name,
97a979
@@ -194,24 +211,13 @@ check_active_before_startup_processes(gpointer user_data)
97a979
                                 pcmk_children[next_child].pid));
97a979
             }
97a979
             pcmk_process_exit(&(pcmk_children[next_child]));
97a979
-            if (!pcmk_children[next_child].respawn) {
97a979
-                /* if a subdaemon is down and we don't want it
97a979
-                   to be restarted this is a success during
97a979
-                   shutdown. if it isn't restarted anymore
97a979
-                   due to MAX_RESPAWN it is
97a979
-                   rather no success.
97a979
-                 */
97a979
-                if (pcmk_children[next_child].respawn_count <= MAX_RESPAWN) {
97a979
-                    subdaemon_check_progress = time(NULL);
97a979
-                }
97a979
-                next_child++;
97a979
-            }
97a979
             break;
97a979
         default:
97a979
             crm_exit(CRM_EX_FATAL);
97a979
             break;  /* static analysis/noreturn */
97a979
     }
97a979
 
97a979
+    next_child++;
97a979
     if (next_child >= PCMK__NELEM(pcmk_children)) {
97a979
         next_child = 0;
97a979
     }
97a979
@@ -285,6 +291,7 @@ pcmk_process_exit(pcmk_child_t * child)
97a979
 {
97a979
     child->pid = 0;
97a979
     child->active_before_startup = false;
97a979
+    child->check_count = 0;
97a979
 
97a979
     child->respawn_count += 1;
97a979
     if (child->respawn_count > MAX_RESPAWN) {
97a979
@@ -307,8 +314,6 @@ pcmk_process_exit(pcmk_child_t * child)
97a979
         crm_warn("One-off suppressing strict respawning of a child process %s,"
97a979
                  " appears alright per %s IPC end-point",
97a979
                  child->name, child->endpoint);
97a979
-        /* need to monitor how it evolves, and start new process if badly */
97a979
-        child->active_before_startup = true;
97a979
 
97a979
     } else {
97a979
         if (child->needs_cluster && !pcmkd_cluster_connected()) {
97a979
@@ -422,6 +427,7 @@ start_child(pcmk_child_t * child)
97a979
     const char *env_callgrind = getenv("PCMK_callgrind_enabled");
97a979
 
97a979
     child->active_before_startup = false;
97a979
+    child->check_count = 0;
97a979
 
97a979
     if (child->command == NULL) {
97a979
         crm_info("Nothing to do for child \"%s\"", child->name);
97a979
-- 
97a979
2.27.0
97a979