From 9ee9fd6b98d8a5ff5eac57a14cbc0ce1009b10e4 Mon Sep 17 00:00:00 2001 From: Klaus Wenninger Date: Thu, 18 Nov 2021 13:23:34 +0100 Subject: [PATCH 1/2] Feature: pacemakerd: keep tracking pacemakerd for liveness --- daemons/pacemakerd/pacemakerd.c | 2 + daemons/pacemakerd/pacemakerd.h | 3 +- daemons/pacemakerd/pcmkd_messages.c | 6 +- daemons/pacemakerd/pcmkd_subdaemons.c | 139 +++++++++++++++++--------- 4 files changed, 98 insertions(+), 52 deletions(-) diff --git a/daemons/pacemakerd/pacemakerd.c b/daemons/pacemakerd/pacemakerd.c index 34d64c4053..062c2d5326 100644 --- a/daemons/pacemakerd/pacemakerd.c +++ b/daemons/pacemakerd/pacemakerd.c @@ -259,6 +259,8 @@ main(int argc, char **argv) pcmk_ipc_api_t *old_instance = NULL; qb_ipcs_service_t *ipcs = NULL; + subdaemon_check_progress = time(NULL); + crm_log_preinit(NULL, argc, argv); mainloop_add_signal(SIGHUP, pcmk_ignore); mainloop_add_signal(SIGQUIT, pcmk_sigquit); diff --git a/daemons/pacemakerd/pacemakerd.h b/daemons/pacemakerd/pacemakerd.h index 7c541bbf9e..424dbbcc5d 100644 --- a/daemons/pacemakerd/pacemakerd.h +++ b/daemons/pacemakerd/pacemakerd.h @@ -1,5 +1,5 @@ /* - * Copyright 2010-2021 the Pacemaker project contributors + * Copyright 2010-2022 the Pacemaker project contributors * * The version control history for this file may have further details. * @@ -21,6 +21,7 @@ extern unsigned int shutdown_complete_state_reported_to; extern gboolean shutdown_complete_state_reported_client_closed; extern crm_trigger_t *shutdown_trigger; extern crm_trigger_t *startup_trigger; +extern time_t subdaemon_check_progress; gboolean mcp_read_config(void); diff --git a/daemons/pacemakerd/pcmkd_messages.c b/daemons/pacemakerd/pcmkd_messages.c index 0439986ecf..f2cddc353e 100644 --- a/daemons/pacemakerd/pcmkd_messages.c +++ b/daemons/pacemakerd/pcmkd_messages.c @@ -1,5 +1,5 @@ /* - * Copyright 2010-2021 the Pacemaker project contributors + * Copyright 2010-2022 the Pacemaker project contributors * * The version control history for this file may have further details. * @@ -25,7 +25,6 @@ pcmk_handle_ping_request(pcmk__client_t *c, xmlNode *msg, uint32_t id) const char *value = NULL; xmlNode *ping = NULL; xmlNode *reply = NULL; - time_t pinged = time(NULL); const char *from = crm_element_value(msg, F_CRM_SYS_FROM); /* Pinged for status */ @@ -36,7 +35,8 @@ pcmk_handle_ping_request(pcmk__client_t *c, xmlNode *msg, uint32_t id) value = crm_element_value(msg, F_CRM_SYS_TO); crm_xml_add(ping, XML_PING_ATTR_SYSFROM, value); crm_xml_add(ping, XML_PING_ATTR_PACEMAKERDSTATE, pacemakerd_state); - crm_xml_add_ll(ping, XML_ATTR_TSTAMP, (long long) pinged); + crm_xml_add_ll(ping, XML_ATTR_TSTAMP, + (long long) subdaemon_check_progress); crm_xml_add(ping, XML_PING_ATTR_STATUS, "ok"); reply = create_reply(msg, ping); free_xml(ping); diff --git a/daemons/pacemakerd/pcmkd_subdaemons.c b/daemons/pacemakerd/pcmkd_subdaemons.c index a54fcce1ba..c03903c99e 100644 --- a/daemons/pacemakerd/pcmkd_subdaemons.c +++ b/daemons/pacemakerd/pcmkd_subdaemons.c @@ -32,14 +32,16 @@ typedef struct pcmk_child_s { const char *command; const char *endpoint; /* IPC server name */ bool needs_cluster; + int check_count; /* Anything below here will be dynamically initialized */ bool needs_retry; bool active_before_startup; } pcmk_child_t; -#define PCMK_PROCESS_CHECK_INTERVAL 5 -#define SHUTDOWN_ESCALATION_PERIOD 180000 /* 3m */ +#define PCMK_PROCESS_CHECK_INTERVAL 1 +#define PCMK_PROCESS_CHECK_RETRIES 5 +#define SHUTDOWN_ESCALATION_PERIOD 180000 /* 3m */ /* Index into the array below */ #define PCMK_CHILD_CONTROLD 5 @@ -82,6 +84,7 @@ static char *opts_vgrind[] = { NULL, NULL, NULL, NULL, NULL }; crm_trigger_t *shutdown_trigger = NULL; crm_trigger_t *startup_trigger = NULL; +time_t subdaemon_check_progress = 0; /* When contacted via pacemakerd-api by a client having sbd in * the name we assume it is sbd-daemon which wants to know @@ -103,7 +106,6 @@ gboolean running_with_sbd = FALSE; /* local copy */ GMainLoop *mainloop = NULL; static gboolean fatal_error = FALSE; -static bool global_keep_tracking = false; static gboolean check_active_before_startup_processes(gpointer user_data); static int child_liveness(pcmk_child_t *child); @@ -127,44 +129,94 @@ pcmkd_cluster_connected(void) static gboolean check_active_before_startup_processes(gpointer user_data) { - gboolean keep_tracking = FALSE; - - for (int i = 0; i < PCMK__NELEM(pcmk_children); i++) { - if (!pcmk_children[i].active_before_startup) { - /* we are already tracking it as a child process. */ - continue; - } else { - int rc = child_liveness(&pcmk_children[i]); - - switch (rc) { - case pcmk_rc_ok: - break; - case pcmk_rc_ipc_unresponsive: - case pcmk_rc_ipc_pid_only: // This case: it was previously OK - if (pcmk_children[i].respawn) { - crm_err("%s[%lld] terminated%s", pcmk_children[i].name, - (long long) PCMK__SPECIAL_PID_AS_0(pcmk_children[i].pid), - (rc == pcmk_rc_ipc_pid_only)? " as IPC server" : ""); - } else { - /* orderly shutdown */ - crm_notice("%s[%lld] terminated%s", pcmk_children[i].name, - (long long) PCMK__SPECIAL_PID_AS_0(pcmk_children[i].pid), - (rc == pcmk_rc_ipc_pid_only)? " as IPC server" : ""); - } - pcmk_process_exit(&(pcmk_children[i])); - continue; - default: - crm_exit(CRM_EX_FATAL); - break; /* static analysis/noreturn */ + static int next_child = 0; + int rc = child_liveness(&pcmk_children[next_child]); + + crm_trace("%s[%lld] checked as %d", + pcmk_children[next_child].name, + (long long) PCMK__SPECIAL_PID_AS_0( + pcmk_children[next_child].pid), + rc); + + switch (rc) { + case pcmk_rc_ok: + pcmk_children[next_child].check_count = 0; + next_child++; + subdaemon_check_progress = time(NULL); + break; + case pcmk_rc_ipc_pid_only: // This case: it was previously OK + pcmk_children[next_child].check_count++; + if (pcmk_children[next_child].check_count >= PCMK_PROCESS_CHECK_RETRIES) { + crm_err("%s[%lld] is unresponsive to ipc after %d tries but " + "we found the pid so have it killed that we can restart", + pcmk_children[next_child].name, + (long long) PCMK__SPECIAL_PID_AS_0( + pcmk_children[next_child].pid), + pcmk_children[next_child].check_count); + stop_child(&pcmk_children[next_child], SIGKILL); + if (pcmk_children[next_child].respawn) { + /* as long as the respawn-limit isn't reached + give it another round of check retries + */ + pcmk_children[next_child].check_count = 0; + } + } else { + crm_notice("%s[%lld] is unresponsive to ipc after %d tries", + pcmk_children[next_child].name, + (long long) PCMK__SPECIAL_PID_AS_0( + pcmk_children[next_child].pid), + pcmk_children[next_child].check_count); + if (pcmk_children[next_child].respawn) { + /* as long as the respawn-limit isn't reached + and we haven't run out of connect retries + we account this as progress we are willing + to tell to sbd + */ + subdaemon_check_progress = time(NULL); + } } - } - /* at least one of the processes found at startup - * is still going, so keep this recurring timer around */ - keep_tracking = TRUE; + /* go to the next child and see if + we can make progress there + */ + next_child++; + break; + case pcmk_rc_ipc_unresponsive: + if (pcmk_children[next_child].respawn) { + crm_err("%s[%lld] terminated", + pcmk_children[next_child].name, + (long long) PCMK__SPECIAL_PID_AS_0( + pcmk_children[next_child].pid)); + } else { + /* orderly shutdown */ + crm_notice("%s[%lld] terminated", + pcmk_children[next_child].name, + (long long) PCMK__SPECIAL_PID_AS_0( + pcmk_children[next_child].pid)); + } + pcmk_process_exit(&(pcmk_children[next_child])); + if (!pcmk_children[next_child].respawn) { + /* if a subdaemon is down and we don't want it + to be restarted this is a success during + shutdown. if it isn't restarted anymore + due to MAX_RESPAWN it is + rather no success. + */ + if (pcmk_children[next_child].respawn_count <= MAX_RESPAWN) { + subdaemon_check_progress = time(NULL); + } + next_child++; + } + break; + default: + crm_exit(CRM_EX_FATAL); + break; /* static analysis/noreturn */ } - global_keep_tracking = keep_tracking; - return keep_tracking; + if (next_child >= PCMK__NELEM(pcmk_children)) { + next_child = 0; + } + + return G_SOURCE_CONTINUE; } static gboolean @@ -257,11 +309,6 @@ pcmk_process_exit(pcmk_child_t * child) child->name, child->endpoint); /* need to monitor how it evolves, and start new process if badly */ child->active_before_startup = true; - if (!global_keep_tracking) { - global_keep_tracking = true; - g_timeout_add_seconds(PCMK_PROCESS_CHECK_INTERVAL, - check_active_before_startup_processes, NULL); - } } else { if (child->needs_cluster && !pcmkd_cluster_connected()) { @@ -648,7 +695,6 @@ child_liveness(pcmk_child_t *child) int find_and_track_existing_processes(void) { - bool tracking = false; bool wait_in_progress; int rc; size_t i, rounds; @@ -716,7 +762,6 @@ find_and_track_existing_processes(void) pcmk_children[i].pid)); pcmk_children[i].respawn_count = -1; /* 0~keep watching */ pcmk_children[i].active_before_startup = true; - tracking = true; break; case pcmk_rc_ipc_pid_only: if (pcmk_children[i].respawn_count == WAIT_TRIES) { @@ -751,10 +796,8 @@ find_and_track_existing_processes(void) pcmk_children[i].respawn_count = 0; /* restore pristine state */ } - if (tracking) { - g_timeout_add_seconds(PCMK_PROCESS_CHECK_INTERVAL, + g_timeout_add_seconds(PCMK_PROCESS_CHECK_INTERVAL, check_active_before_startup_processes, NULL); - } return pcmk_rc_ok; } -- 2.27.0 From 4b60aa100669ff494dd3f1303ca9586dc52e95e4 Mon Sep 17 00:00:00 2001 From: Klaus Wenninger Date: Thu, 9 Dec 2021 11:25:22 +0100 Subject: [PATCH 2/2] Fix: ipc_client: use libqb async API for connect --- configure.ac | 3 +++ lib/common/ipc_client.c | 22 ++++++++++++++++++++++ 2 files changed, 25 insertions(+) diff --git a/configure.ac b/configure.ac index f43fb724c7..c747fe1193 100644 --- a/configure.ac +++ b/configure.ac @@ -1309,6 +1309,9 @@ PKG_CHECK_MODULES(libqb, libqb >= 0.17) CPPFLAGS="$libqb_CFLAGS $CPPFLAGS" LIBS="$libqb_LIBS $LIBS" +dnl libqb libqb-2.0.3 + ipc-connect-async-API (2022-01) +AC_CHECK_FUNCS([qb_ipcc_connect_async]) + dnl libqb 2.0.2+ (2020-10) AC_CHECK_FUNCS(qb_ipcc_auth_get, AC_DEFINE(HAVE_IPCC_AUTH_GET, 1, diff --git a/lib/common/ipc_client.c b/lib/common/ipc_client.c index c5afdf3a3d..417b9ef175 100644 --- a/lib/common/ipc_client.c +++ b/lib/common/ipc_client.c @@ -1407,13 +1407,35 @@ pcmk__ipc_is_authentic_process_active(const char *name, uid_t refuid, int32_t qb_rc; pid_t found_pid = 0; uid_t found_uid = 0; gid_t found_gid = 0; qb_ipcc_connection_t *c; +#ifdef HAVE_QB_IPCC_CONNECT_ASYNC + struct pollfd pollfd = { 0, }; + int poll_rc; + c = qb_ipcc_connect_async(name, 0, + &(pollfd.fd)); +#else c = qb_ipcc_connect(name, 0); +#endif if (c == NULL) { crm_info("Could not connect to %s IPC: %s", name, strerror(errno)); rc = pcmk_rc_ipc_unresponsive; goto bail; } +#ifdef HAVE_QB_IPCC_CONNECT_ASYNC + pollfd.events = POLLIN; + do { + poll_rc = poll(&pollfd, 1, 2000); + } while ((poll_rc == -1) && (errno == EINTR)); + if ((poll_rc <= 0) || (qb_ipcc_connect_continue(c) != 0)) { + crm_info("Could not connect to %s IPC: %s", name, + (poll_rc == 0)?"timeout":strerror(errno)); + rc = pcmk_rc_ipc_unresponsive; + if (poll_rc > 0) { + c = NULL; // qb_ipcc_connect_continue cleaned up for us + } + goto bail; + } +#endif qb_rc = qb_ipcc_fd_get(c, &fd); if (qb_rc != 0) { -- 2.27.0