From ac92690d8426ec4d1c8be1e0eb4b9289411afe75 Mon Sep 17 00:00:00 2001
From: Klaus Wenninger <klaus.wenninger@aon.at>
Date: Mon, 24 Jan 2022 12:18:42 +0100
Subject: [PATCH] Fix: pacemakerd: have signal-handler take care of lost
 processes

This fixes a regression introduced with periodic subdaemon checking,
affecting subdaemons that are children of pacemakerd. Previously, each
process was handled either by periodic checking or by the signal handler.
---
 daemons/pacemakerd/pcmkd_subdaemons.c | 38 ++++++++++++++++-----------
 1 file changed, 22 insertions(+), 16 deletions(-)

diff --git a/daemons/pacemakerd/pcmkd_subdaemons.c b/daemons/pacemakerd/pcmkd_subdaemons.c
index c03903c99e..84ecdc1ee8 100644
--- a/daemons/pacemakerd/pcmkd_subdaemons.c
+++ b/daemons/pacemakerd/pcmkd_subdaemons.c
@@ -141,7 +141,6 @@ check_active_before_startup_processes(gpointer user_data)
     switch (rc) {
         case pcmk_rc_ok:
             pcmk_children[next_child].check_count = 0;
-            next_child++;
             subdaemon_check_progress = time(NULL);
             break;
         case pcmk_rc_ipc_pid_only: // This case: it was previously OK
@@ -178,9 +177,27 @@ check_active_before_startup_processes(gpointer user_data)
             /* go to the next child and see if
                we can make progress there
              */
-            next_child++;
             break;
         case pcmk_rc_ipc_unresponsive:
+            if (!pcmk_children[next_child].respawn) {
+                /* A subdaemon that is down and is not meant to be
+                   respawned counts as progress (e.g. during
+                   shutdown). If it is no longer respawned only
+                   because MAX_RESPAWN was exceeded, that does
+                   not count as progress.
+                 */
+                if (pcmk_children[next_child].respawn_count <= MAX_RESPAWN) {
+                    subdaemon_check_progress = time(NULL);
+                }
+            }
+            if (!pcmk_children[next_child].active_before_startup) {
+                crm_trace("found %s[%lld] missing - signal-handler "
+                          "will take care of it",
+                           pcmk_children[next_child].name,
+                           (long long) PCMK__SPECIAL_PID_AS_0(
+                            pcmk_children[next_child].pid));
+                break;
+            }
             if (pcmk_children[next_child].respawn) {
                 crm_err("%s[%lld] terminated",
                         pcmk_children[next_child].name,
@@ -194,24 +211,13 @@ check_active_before_startup_processes(gpointer user_data)
                                 pcmk_children[next_child].pid));
             }
             pcmk_process_exit(&(pcmk_children[next_child]));
-            if (!pcmk_children[next_child].respawn) {
-                /* if a subdaemon is down and we don't want it
-                   to be restarted this is a success during
-                   shutdown. if it isn't restarted anymore
-                   due to MAX_RESPAWN it is
-                   rather no success.
-                 */
-                if (pcmk_children[next_child].respawn_count <= MAX_RESPAWN) {
-                    subdaemon_check_progress = time(NULL);
-                }
-                next_child++;
-            }
             break;
         default:
             crm_exit(CRM_EX_FATAL);
             break;  /* static analysis/noreturn */
     }
 
+    next_child++;
     if (next_child >= PCMK__NELEM(pcmk_children)) {
         next_child = 0;
     }
@@ -285,6 +291,7 @@ pcmk_process_exit(pcmk_child_t * child)
 {
     child->pid = 0;
     child->active_before_startup = false;
+    child->check_count = 0;
 
     child->respawn_count += 1;
     if (child->respawn_count > MAX_RESPAWN) {
@@ -307,8 +314,6 @@ pcmk_process_exit(pcmk_child_t * child)
         crm_warn("One-off suppressing strict respawning of a child process %s,"
                  " appears alright per %s IPC end-point",
                  child->name, child->endpoint);
-        /* need to monitor how it evolves, and start new process if badly */
-        child->active_before_startup = true;
 
     } else {
         if (child->needs_cluster && !pcmkd_cluster_connected()) {
@@ -422,6 +427,7 @@ start_child(pcmk_child_t * child)
     const char *env_callgrind = getenv("PCMK_callgrind_enabled");
 
     child->active_before_startup = false;
+    child->check_count = 0;
 
     if (child->command == NULL) {
         crm_info("Nothing to do for child \"%s\"", child->name);
-- 
2.27.0