Blame SOURCES/024-daemon-tracking.patch

d4e586
From ac92690d8426ec4d1c8be1e0eb4b9289411afe75 Mon Sep 17 00:00:00 2001
d4e586
From: Klaus Wenninger <klaus.wenninger@aon.at>
d4e586
Date: Mon, 24 Jan 2022 12:18:42 +0100
d4e586
Subject: [PATCH] Fix: pacemakerd: have signal-handler take care of lost
d4e586
 processes
d4e586
d4e586
Regression from the introduction of periodic subdaemon checking
d4e586
in cases where the subdaemons are pacemakerd children - previously
d4e586
it was either periodic checking or a signal-handler per process.
d4e586
---
d4e586
 daemons/pacemakerd/pcmkd_subdaemons.c | 38 ++++++++++++++++-----------
d4e586
 1 file changed, 22 insertions(+), 16 deletions(-)
d4e586
d4e586
diff --git a/daemons/pacemakerd/pcmkd_subdaemons.c b/daemons/pacemakerd/pcmkd_subdaemons.c
d4e586
index c03903c99e..84ecdc1ee8 100644
d4e586
--- a/daemons/pacemakerd/pcmkd_subdaemons.c
d4e586
+++ b/daemons/pacemakerd/pcmkd_subdaemons.c
d4e586
@@ -141,7 +141,6 @@ check_active_before_startup_processes(gpointer user_data)
d4e586
     switch (rc) {
d4e586
         case pcmk_rc_ok:
d4e586
             pcmk_children[next_child].check_count = 0;
d4e586
-            next_child++;
d4e586
             subdaemon_check_progress = time(NULL);
d4e586
             break;
d4e586
         case pcmk_rc_ipc_pid_only: // This case: it was previously OK
d4e586
@@ -178,9 +177,27 @@ check_active_before_startup_processes(gpointer user_data)
d4e586
             /* go to the next child and see if
d4e586
                we can make progress there
d4e586
              */
d4e586
-            next_child++;
d4e586
             break;
d4e586
         case pcmk_rc_ipc_unresponsive:
d4e586
+            if (!pcmk_children[next_child].respawn) {
d4e586
+                /* if a subdaemon is down and we don't want it
d4e586
+                   to be restarted this is a success during
d4e586
+                   shutdown. if it isn't restarted anymore
d4e586
+                   due to MAX_RESPAWN it is
d4e586
+                   rather no success.
d4e586
+                 */
d4e586
+                if (pcmk_children[next_child].respawn_count <= MAX_RESPAWN) {
d4e586
+                    subdaemon_check_progress = time(NULL);
d4e586
+                }
d4e586
+            }
d4e586
+            if (!pcmk_children[next_child].active_before_startup) {
d4e586
+                crm_trace("found %s[%lld] missing - signal-handler "
d4e586
+                          "will take care of it",
d4e586
+                           pcmk_children[next_child].name,
d4e586
+                           (long long) PCMK__SPECIAL_PID_AS_0(
d4e586
+                            pcmk_children[next_child].pid));
d4e586
+                break;
d4e586
+            }
d4e586
             if (pcmk_children[next_child].respawn) {
d4e586
                 crm_err("%s[%lld] terminated",
d4e586
                         pcmk_children[next_child].name,
d4e586
@@ -194,24 +211,13 @@ check_active_before_startup_processes(gpointer user_data)
d4e586
                                 pcmk_children[next_child].pid));
d4e586
             }
d4e586
             pcmk_process_exit(&(pcmk_children[next_child]));
d4e586
-            if (!pcmk_children[next_child].respawn) {
d4e586
-                /* if a subdaemon is down and we don't want it
d4e586
-                   to be restarted this is a success during
d4e586
-                   shutdown. if it isn't restarted anymore
d4e586
-                   due to MAX_RESPAWN it is
d4e586
-                   rather no success.
d4e586
-                 */
d4e586
-                if (pcmk_children[next_child].respawn_count <= MAX_RESPAWN) {
d4e586
-                    subdaemon_check_progress = time(NULL);
d4e586
-                }
d4e586
-                next_child++;
d4e586
-            }
d4e586
             break;
d4e586
         default:
d4e586
             crm_exit(CRM_EX_FATAL);
d4e586
             break;  /* static analysis/noreturn */
d4e586
     }
d4e586
 
d4e586
+    next_child++;
d4e586
     if (next_child >= PCMK__NELEM(pcmk_children)) {
d4e586
         next_child = 0;
d4e586
     }
d4e586
@@ -285,6 +291,7 @@ pcmk_process_exit(pcmk_child_t * child)
d4e586
 {
d4e586
     child->pid = 0;
d4e586
     child->active_before_startup = false;
d4e586
+    child->check_count = 0;
d4e586
 
d4e586
     child->respawn_count += 1;
d4e586
     if (child->respawn_count > MAX_RESPAWN) {
d4e586
@@ -307,8 +314,6 @@ pcmk_process_exit(pcmk_child_t * child)
d4e586
         crm_warn("One-off suppressing strict respawning of a child process %s,"
d4e586
                  " appears alright per %s IPC end-point",
d4e586
                  child->name, child->endpoint);
d4e586
-        /* need to monitor how it evolves, and start new process if badly */
d4e586
-        child->active_before_startup = true;
d4e586
 
d4e586
     } else {
d4e586
         if (child->needs_cluster && !pcmkd_cluster_connected()) {
d4e586
@@ -422,6 +427,7 @@ start_child(pcmk_child_t * child)
d4e586
     const char *env_callgrind = getenv("PCMK_callgrind_enabled");
d4e586
 
d4e586
     child->active_before_startup = false;
d4e586
+    child->check_count = 0;
d4e586
 
d4e586
     if (child->command == NULL) {
d4e586
         crm_info("Nothing to do for child \"%s\"", child->name);
d4e586
-- 
d4e586
2.27.0
d4e586