|
|
97a979 |
From ac92690d8426ec4d1c8be1e0eb4b9289411afe75 Mon Sep 17 00:00:00 2001
|
|
|
97a979 |
From: Klaus Wenninger <klaus.wenninger@aon.at>
|
|
|
97a979 |
Date: Mon, 24 Jan 2022 12:18:42 +0100
|
|
|
97a979 |
Subject: [PATCH] Fix: pacemakerd: have signal-handler take care of lost
|
|
|
97a979 |
processes
|
|
|
97a979 |
|
|
|
97a979 |
Fixes a regression from the introduction of periodic subdaemon checking
|
|
|
97a979 |
in cases where the subdaemons are pacemakerd children - previously it was either
|
|
|
97a979 |
periodic checking or signal-handler per process.
|
|
|
97a979 |
---
|
|
|
97a979 |
daemons/pacemakerd/pcmkd_subdaemons.c | 38 ++++++++++++++++-----------
|
|
|
97a979 |
1 file changed, 22 insertions(+), 16 deletions(-)
|
|
|
97a979 |
|
|
|
97a979 |
diff --git a/daemons/pacemakerd/pcmkd_subdaemons.c b/daemons/pacemakerd/pcmkd_subdaemons.c
|
|
|
97a979 |
index c03903c99e..84ecdc1ee8 100644
|
|
|
97a979 |
--- a/daemons/pacemakerd/pcmkd_subdaemons.c
|
|
|
97a979 |
+++ b/daemons/pacemakerd/pcmkd_subdaemons.c
|
|
|
97a979 |
@@ -141,7 +141,6 @@ check_active_before_startup_processes(gpointer user_data)
|
|
|
97a979 |
switch (rc) {
|
|
|
97a979 |
case pcmk_rc_ok:
|
|
|
97a979 |
pcmk_children[next_child].check_count = 0;
|
|
|
97a979 |
- next_child++;
|
|
|
97a979 |
subdaemon_check_progress = time(NULL);
|
|
|
97a979 |
break;
|
|
|
97a979 |
case pcmk_rc_ipc_pid_only: // This case: it was previously OK
|
|
|
97a979 |
@@ -178,9 +177,27 @@ check_active_before_startup_processes(gpointer user_data)
|
|
|
97a979 |
/* go to the next child and see if
|
|
|
97a979 |
we can make progress there
|
|
|
97a979 |
*/
|
|
|
97a979 |
- next_child++;
|
|
|
97a979 |
break;
|
|
|
97a979 |
case pcmk_rc_ipc_unresponsive:
|
|
|
97a979 |
+ if (!pcmk_children[next_child].respawn) {
|
|
|
97a979 |
+ /* if a subdaemon is down and we don't want it
|
|
|
97a979 |
+ to be restarted this is a success during
|
|
|
97a979 |
+ shutdown. if it isn't restarted anymore
|
|
|
97a979 |
+ due to MAX_RESPAWN it is
|
|
|
97a979 |
+ rather no success.
|
|
|
97a979 |
+ */
|
|
|
97a979 |
+ if (pcmk_children[next_child].respawn_count <= MAX_RESPAWN) {
|
|
|
97a979 |
+ subdaemon_check_progress = time(NULL);
|
|
|
97a979 |
+ }
|
|
|
97a979 |
+ }
|
|
|
97a979 |
+ if (!pcmk_children[next_child].active_before_startup) {
|
|
|
97a979 |
+ crm_trace("found %s[%lld] missing - signal-handler "
|
|
|
97a979 |
+ "will take care of it",
|
|
|
97a979 |
+ pcmk_children[next_child].name,
|
|
|
97a979 |
+ (long long) PCMK__SPECIAL_PID_AS_0(
|
|
|
97a979 |
+ pcmk_children[next_child].pid));
|
|
|
97a979 |
+ break;
|
|
|
97a979 |
+ }
|
|
|
97a979 |
if (pcmk_children[next_child].respawn) {
|
|
|
97a979 |
crm_err("%s[%lld] terminated",
|
|
|
97a979 |
pcmk_children[next_child].name,
|
|
|
97a979 |
@@ -194,24 +211,13 @@ check_active_before_startup_processes(gpointer user_data)
|
|
|
97a979 |
pcmk_children[next_child].pid));
|
|
|
97a979 |
}
|
|
|
97a979 |
pcmk_process_exit(&(pcmk_children[next_child]));
|
|
|
97a979 |
- if (!pcmk_children[next_child].respawn) {
|
|
|
97a979 |
- /* if a subdaemon is down and we don't want it
|
|
|
97a979 |
- to be restarted this is a success during
|
|
|
97a979 |
- shutdown. if it isn't restarted anymore
|
|
|
97a979 |
- due to MAX_RESPAWN it is
|
|
|
97a979 |
- rather no success.
|
|
|
97a979 |
- */
|
|
|
97a979 |
- if (pcmk_children[next_child].respawn_count <= MAX_RESPAWN) {
|
|
|
97a979 |
- subdaemon_check_progress = time(NULL);
|
|
|
97a979 |
- }
|
|
|
97a979 |
- next_child++;
|
|
|
97a979 |
- }
|
|
|
97a979 |
break;
|
|
|
97a979 |
default:
|
|
|
97a979 |
crm_exit(CRM_EX_FATAL);
|
|
|
97a979 |
break; /* static analysis/noreturn */
|
|
|
97a979 |
}
|
|
|
97a979 |
|
|
|
97a979 |
+ next_child++;
|
|
|
97a979 |
if (next_child >= PCMK__NELEM(pcmk_children)) {
|
|
|
97a979 |
next_child = 0;
|
|
|
97a979 |
}
|
|
|
97a979 |
@@ -285,6 +291,7 @@ pcmk_process_exit(pcmk_child_t * child)
|
|
|
97a979 |
{
|
|
|
97a979 |
child->pid = 0;
|
|
|
97a979 |
child->active_before_startup = false;
|
|
|
97a979 |
+ child->check_count = 0;
|
|
|
97a979 |
|
|
|
97a979 |
child->respawn_count += 1;
|
|
|
97a979 |
if (child->respawn_count > MAX_RESPAWN) {
|
|
|
97a979 |
@@ -307,8 +314,6 @@ pcmk_process_exit(pcmk_child_t * child)
|
|
|
97a979 |
crm_warn("One-off suppressing strict respawning of a child process %s,"
|
|
|
97a979 |
" appears alright per %s IPC end-point",
|
|
|
97a979 |
child->name, child->endpoint);
|
|
|
97a979 |
- /* need to monitor how it evolves, and start new process if badly */
|
|
|
97a979 |
- child->active_before_startup = true;
|
|
|
97a979 |
|
|
|
97a979 |
} else {
|
|
|
97a979 |
if (child->needs_cluster && !pcmkd_cluster_connected()) {
|
|
|
97a979 |
@@ -422,6 +427,7 @@ start_child(pcmk_child_t * child)
|
|
|
97a979 |
const char *env_callgrind = getenv("PCMK_callgrind_enabled");
|
|
|
97a979 |
|
|
|
97a979 |
child->active_before_startup = false;
|
|
|
97a979 |
+ child->check_count = 0;
|
|
|
97a979 |
|
|
|
97a979 |
if (child->command == NULL) {
|
|
|
97a979 |
crm_info("Nothing to do for child \"%s\"", child->name);
|
|
|
97a979 |
--
|
|
|
97a979 |
2.27.0
|
|
|
97a979 |
|