diff --git a/SOURCES/0017-Fix-sbd-pacemaker-make-handling-of-cib-connection-lo.patch b/SOURCES/0017-Fix-sbd-pacemaker-make-handling-of-cib-connection-lo.patch new file mode 100644 index 0000000..0d70c92 --- /dev/null +++ b/SOURCES/0017-Fix-sbd-pacemaker-make-handling-of-cib-connection-lo.patch @@ -0,0 +1,302 @@ +From a716a8ddd3df615009bcff3bd96dd9ae64cb5f68 Mon Sep 17 00:00:00 2001 +From: Klaus Wenninger +Date: Tue, 19 Mar 2019 21:36:15 +0100 +Subject: [PATCH] Fix: sbd-pacemaker: make handling of cib-connection loss more + robust + +Exit pcmk-servant on graceful pacemaker shutdown and go back +to state before pacemaker was detected initially. +Purge all cib-traces otherwise and try to reconnect within timeout. +--- + src/sbd-inquisitor.c | 24 ++++++++++++++++++++---- + src/sbd-md.c | 30 +++++++++++++++--------------- + src/sbd-pacemaker.c | 38 +++++++++++++++++++++++++++++--------- + src/sbd.h | 11 +++++++---- + 4 files changed, 71 insertions(+), 32 deletions(-) + +diff --git a/src/sbd-inquisitor.c b/src/sbd-inquisitor.c +index 9be6c99..77c6e4f 100644 +--- a/src/sbd-inquisitor.c ++++ b/src/sbd-inquisitor.c +@@ -490,19 +490,19 @@ void inquisitor_child(void) + if (sbd_is_disk(s)) { + if (WIFEXITED(status)) { + switch(WEXITSTATUS(status)) { +- case EXIT_MD_IO_FAIL: ++ case EXIT_MD_SERVANT_IO_FAIL: + DBGLOG(LOG_INFO, "Servant for %s requests to be disowned", + s->devname); + break; +- case EXIT_MD_REQUEST_RESET: ++ case EXIT_MD_SERVANT_REQUEST_RESET: + cl_log(LOG_WARNING, "%s requested a reset", s->devname); + do_reset(); + break; +- case EXIT_MD_REQUEST_SHUTOFF: ++ case EXIT_MD_SERVANT_REQUEST_SHUTOFF: + cl_log(LOG_WARNING, "%s requested a shutoff", s->devname); + do_off(); + break; +- case EXIT_MD_REQUEST_CRASHDUMP: ++ case EXIT_MD_SERVANT_REQUEST_CRASHDUMP: + cl_log(LOG_WARNING, "%s requested a crashdump", s->devname); + do_crashdump(); + break; +@@ -510,6 +510,22 @@ void inquisitor_child(void) + break; + } + } ++ } else if (sbd_is_pcmk(s)) { ++ if 
(WIFEXITED(status)) { ++ switch(WEXITSTATUS(status)) { ++ case EXIT_PCMK_SERVANT_GRACEFUL_SHUTDOWN: ++ DBGLOG(LOG_INFO, "PCMK-Servant has exited gracefully"); ++ /* revert to state prior to pacemaker-detection */ ++ s->restarts = 0; ++ s->restart_blocked = 0; ++ cluster_appeared = 0; ++ s->outdated = 1; ++ s->t_last.tv_sec = 0; ++ break; ++ default: ++ break; ++ } ++ } + } + cleanup_servant_by_pid(pid); + } +diff --git a/src/sbd-md.c b/src/sbd-md.c +index ba2c34d..c51d381 100644 +--- a/src/sbd-md.c ++++ b/src/sbd-md.c +@@ -1061,19 +1061,19 @@ int servant_md(const char *diskname, int mode, const void* argp) + + st = open_device(diskname, LOG_WARNING); + if (!st) { +- exit(EXIT_MD_IO_FAIL); ++ exit(EXIT_MD_SERVANT_IO_FAIL); + } + + s_header = header_get(st); + if (!s_header) { + cl_log(LOG_ERR, "Not a valid header on %s", diskname); +- exit(EXIT_MD_IO_FAIL); ++ exit(EXIT_MD_SERVANT_IO_FAIL); + } + + if (servant_check_timeout_inconsistent(s_header) < 0) { + cl_log(LOG_ERR, "Timeouts on %s do not match first device", + diskname); +- exit(EXIT_MD_IO_FAIL); ++ exit(EXIT_MD_SERVANT_IO_FAIL); + } + + if (s_header->minor_version > 0) { +@@ -1086,14 +1086,14 @@ int servant_md(const char *diskname, int mode, const void* argp) + cl_log(LOG_ERR, + "No slot allocated, and automatic allocation failed for disk %s.", + diskname); +- rc = EXIT_MD_IO_FAIL; ++ rc = EXIT_MD_SERVANT_IO_FAIL; + goto out; + } + s_node = sector_alloc(); + if (slot_read(st, mbox, s_node) < 0) { + cl_log(LOG_ERR, "Unable to read node entry on %s", + diskname); +- exit(EXIT_MD_IO_FAIL); ++ exit(EXIT_MD_SERVANT_IO_FAIL); + } + + cl_log(LOG_NOTICE, "Monitoring slot %d on disk %s", mbox, diskname); +@@ -1109,7 +1109,7 @@ int servant_md(const char *diskname, int mode, const void* argp) + if (mode > 0) { + if (mbox_read(st, mbox, s_mbox) < 0) { + cl_log(LOG_ERR, "mbox read failed during start-up in servant."); +- rc = EXIT_MD_IO_FAIL; ++ rc = EXIT_MD_SERVANT_IO_FAIL; + goto out; + } + if (s_mbox->cmd != 
SBD_MSG_EXIT && +@@ -1125,7 +1125,7 @@ int servant_md(const char *diskname, int mode, const void* argp) + DBGLOG(LOG_INFO, "First servant start - zeroing inbox"); + memset(s_mbox, 0, sizeof(*s_mbox)); + if (mbox_write(st, mbox, s_mbox) < 0) { +- rc = EXIT_MD_IO_FAIL; ++ rc = EXIT_MD_SERVANT_IO_FAIL; + goto out; + } + } +@@ -1154,28 +1154,28 @@ int servant_md(const char *diskname, int mode, const void* argp) + s_header_retry = header_get(st); + if (!s_header_retry) { + cl_log(LOG_ERR, "No longer found a valid header on %s", diskname); +- exit(EXIT_MD_IO_FAIL); ++ exit(EXIT_MD_SERVANT_IO_FAIL); + } + if (memcmp(s_header, s_header_retry, sizeof(*s_header)) != 0) { + cl_log(LOG_ERR, "Header on %s changed since start-up!", diskname); +- exit(EXIT_MD_IO_FAIL); ++ exit(EXIT_MD_SERVANT_IO_FAIL); + } + free(s_header_retry); + + s_node_retry = sector_alloc(); + if (slot_read(st, mbox, s_node_retry) < 0) { + cl_log(LOG_ERR, "slot read failed in servant."); +- exit(EXIT_MD_IO_FAIL); ++ exit(EXIT_MD_SERVANT_IO_FAIL); + } + if (memcmp(s_node, s_node_retry, sizeof(*s_node)) != 0) { + cl_log(LOG_ERR, "Node entry on %s changed since start-up!", diskname); +- exit(EXIT_MD_IO_FAIL); ++ exit(EXIT_MD_SERVANT_IO_FAIL); + } + free(s_node_retry); + + if (mbox_read(st, mbox, s_mbox) < 0) { + cl_log(LOG_ERR, "mbox read failed in servant."); +- exit(EXIT_MD_IO_FAIL); ++ exit(EXIT_MD_SERVANT_IO_FAIL); + } + + if (s_mbox->cmd > 0) { +@@ -1190,14 +1190,14 @@ int servant_md(const char *diskname, int mode, const void* argp) + sigqueue(ppid, SIG_TEST, signal_value); + break; + case SBD_MSG_RESET: +- exit(EXIT_MD_REQUEST_RESET); ++ exit(EXIT_MD_SERVANT_REQUEST_RESET); + case SBD_MSG_OFF: +- exit(EXIT_MD_REQUEST_SHUTOFF); ++ exit(EXIT_MD_SERVANT_REQUEST_SHUTOFF); + case SBD_MSG_EXIT: + sigqueue(ppid, SIG_EXITREQ, signal_value); + break; + case SBD_MSG_CRASHDUMP: +- exit(EXIT_MD_REQUEST_CRASHDUMP); ++ exit(EXIT_MD_SERVANT_REQUEST_CRASHDUMP); + default: + /* FIXME: + An "unknown" message might result 
+diff --git a/src/sbd-pacemaker.c b/src/sbd-pacemaker.c +index aac355a..c69fc55 100644 +--- a/src/sbd-pacemaker.c ++++ b/src/sbd-pacemaker.c +@@ -103,6 +103,9 @@ static pe_working_set_t *data_set = NULL; + + static long last_refresh = 0; + ++static int pcmk_clean_shutdown = 0; ++static int pcmk_shutdown = 0; ++ + static gboolean + mon_timer_reconnect(gpointer data) + { +@@ -128,10 +131,26 @@ mon_cib_connection_destroy(gpointer user_data) + { + if (cib) { + cib->cmds->signoff(cib); ++ /* retrigger as last one might have been skipped */ ++ mon_refresh_state(NULL); ++ if (pcmk_clean_shutdown) { ++ /* assume a graceful pacemaker-shutdown */ ++ clean_up(EXIT_PCMK_SERVANT_GRACEFUL_SHUTDOWN); ++ } ++ /* getting here we aren't sure about the pacemaker-state ++ so try to use the timeout to reconnect and get ++ everything sorted out again ++ */ ++ pcmk_shutdown = 0; + set_servant_health(pcmk_health_transient, LOG_WARNING, "Disconnected from CIB"); + timer_id_reconnect = g_timeout_add(reconnect_msec, mon_timer_reconnect, NULL); + } + cib_connected = 0; ++ /* no sense in looking into outdated cib, trying to apply patch, ... 
*/ ++ if (current_cib) { ++ free_xml(current_cib); ++ current_cib = NULL; ++ } + return; + } + +@@ -171,7 +190,7 @@ static gboolean + mon_timer_notify(gpointer data) + { + static int counter = 0; +- int counter_max = timeout_watchdog / timeout_loop; ++ int counter_max = timeout_watchdog / timeout_loop / 2; + + if (timer_id_notify > 0) { + g_source_remove(timer_id_notify); +@@ -280,11 +299,6 @@ compute_status(pe_working_set_t * data_set) + } else if (node->details->pending) { + set_servant_health(pcmk_health_pending, LOG_WARNING, "Node state: pending"); + +-#if 0 +- } else if (node->details->shutdown) { +- set_servant_health(pcmk_health_shutdown, LOG_WARNING, "Node state: shutting down"); +-#endif +- + } else if (data_set->flags & pe_flag_have_quorum) { + set_servant_health(pcmk_health_online, LOG_INFO, "Node state: online"); + ever_had_quorum = TRUE; +@@ -315,6 +329,12 @@ compute_status(pe_working_set_t * data_set) + } + } + ++ if (node->details->shutdown) { ++ pcmk_shutdown = 1; ++ } ++ if (pcmk_shutdown && !(node->details->running_rsc)) { ++ pcmk_clean_shutdown = 1; ++ } + notify_parent(); + return; + } +@@ -339,7 +359,7 @@ crm_diff_update(const char *event, xmlNode * msg) + static mainloop_timer_t *refresh_timer = NULL; + + if(refresh_timer == NULL) { +- refresh_timer = mainloop_timer_add("refresh", 2000, FALSE, mon_trigger_refresh, NULL); ++ refresh_timer = mainloop_timer_add("refresh", reconnect_msec, FALSE, mon_trigger_refresh, NULL); + refresh_trigger = mainloop_add_trigger(G_PRIORITY_LOW, mon_refresh_state, refresh_timer); + } + +@@ -369,9 +389,9 @@ crm_diff_update(const char *event, xmlNode * msg) + } + + /* Refresh +- * - immediately if the last update was more than 5s ago ++ * - immediately if the last update was more than 1s ago + * - every 10 updates +- * - at most 2s after the last update ++ * - at most 1s after the last update + */ + if (updates > 10 || (now - last_refresh) > (reconnect_msec / 1000)) { + mon_refresh_state(refresh_timer); +diff --git 
a/src/sbd.h b/src/sbd.h +index 6fe07f9..3b05a11 100644 +--- a/src/sbd.h ++++ b/src/sbd.h +@@ -54,10 +54,13 @@ + /* FIXME: should add dynamic check of SIG_XX >= SIGRTMAX */ + + /* exit status for disk-servant */ +-#define EXIT_MD_IO_FAIL 20 +-#define EXIT_MD_REQUEST_RESET 21 +-#define EXIT_MD_REQUEST_SHUTOFF 22 +-#define EXIT_MD_REQUEST_CRASHDUMP 23 ++#define EXIT_MD_SERVANT_IO_FAIL 20 ++#define EXIT_MD_SERVANT_REQUEST_RESET 21 ++#define EXIT_MD_SERVANT_REQUEST_SHUTOFF 22 ++#define EXIT_MD_SERVANT_REQUEST_CRASHDUMP 23 ++ ++/* exit status for pcmk-servant */ ++#define EXIT_PCMK_SERVANT_GRACEFUL_SHUTDOWN 30 + + #define HOG_CHAR 0xff + #define SECTOR_NAME_MAX 63 +-- +1.8.3.1 + diff --git a/SOURCES/0018-Fix-sbd-pacemaker-bail-out-of-status-earlier.patch b/SOURCES/0018-Fix-sbd-pacemaker-bail-out-of-status-earlier.patch new file mode 100644 index 0000000..776edea --- /dev/null +++ b/SOURCES/0018-Fix-sbd-pacemaker-bail-out-of-status-earlier.patch @@ -0,0 +1,45 @@ +From 79b778debfee5b4ab2d099b2bfc7385f45597f70 Mon Sep 17 00:00:00 2001 +From: Klaus Wenninger +Date: Tue, 26 Mar 2019 11:17:45 +0100 +Subject: [PATCH] Fix: sbd-pacemaker: bail out of status earlier + +Prevents possible subsequent null-pointer access and avoids +unnecessary search for node. 
+--- + src/sbd-pacemaker.c | 10 +++++++--- + 1 file changed, 7 insertions(+), 3 deletions(-) + +diff --git a/src/sbd-pacemaker.c b/src/sbd-pacemaker.c +index c69fc55..9a8b95f 100644 +--- a/src/sbd-pacemaker.c ++++ b/src/sbd-pacemaker.c +@@ -276,7 +276,7 @@ compute_status(pe_working_set_t * data_set) + static int updates = 0; + static int ever_had_quorum = FALSE; + +- node_t *node = pe_find_node(data_set->nodes, local_uname); ++ node_t *node = NULL; + + updates++; + +@@ -286,11 +286,15 @@ compute_status(pe_working_set_t * data_set) + return; + } + ++ node = pe_find_node(data_set->nodes, local_uname); + +- if (node == NULL) { ++ if ((node == NULL) || (node->details == NULL)) { + set_servant_health(pcmk_health_unknown, LOG_WARNING, "Node state: %s is UNKNOWN", local_uname); ++ notify_parent(); ++ return; ++ } + +- } else if (node->details->online == FALSE) { ++ if (node->details->online == FALSE) { + set_servant_health(pcmk_health_unknown, LOG_WARNING, "Node state: OFFLINE"); + + } else if (node->details->unclean) { +-- +1.8.3.1 + diff --git a/SOURCES/0019-Fix-sbd-pacemaker-assume-graceful-exit-if-leftovers-.patch b/SOURCES/0019-Fix-sbd-pacemaker-assume-graceful-exit-if-leftovers-.patch new file mode 100644 index 0000000..8c92df8 --- /dev/null +++ b/SOURCES/0019-Fix-sbd-pacemaker-assume-graceful-exit-if-leftovers-.patch @@ -0,0 +1,60 @@ +From 824fe834c67fb7bae7feb87607381f9fa8fa2945 Mon Sep 17 00:00:00 2001 +From: Klaus Wenninger +Date: Fri, 7 Jun 2019 19:09:06 +0200 +Subject: [PATCH] Fix: sbd-pacemaker: assume graceful exit if leftovers are + unmanaged + +--- + src/sbd-pacemaker.c | 32 +++++++++++++++++++++++++++++++- + 1 file changed, 31 insertions(+), 1 deletion(-) + +diff --git a/src/sbd-pacemaker.c b/src/sbd-pacemaker.c +index 9a8b95f..2b35ff6 100644 +--- a/src/sbd-pacemaker.c ++++ b/src/sbd-pacemaker.c +@@ -333,11 +333,41 @@ compute_status(pe_working_set_t * data_set) + } + } + ++ /* If we are in shutdown-state once this will go on till the end. 
++ * If we've on top reached a state of 0 locally running resources ++ * we can assume a clean shutdown. ++ * Tricky are the situations where the node is in maintenance-mode ++ * or resources are unmanaged. So if the node is in maintenance or ++ * all left-over running resources are unmanaged we assume intention. ++ */ + if (node->details->shutdown) { + pcmk_shutdown = 1; + } +- if (pcmk_shutdown && !(node->details->running_rsc)) { ++ if (pcmk_shutdown) ++ { + pcmk_clean_shutdown = 1; ++ if (!(node->details->maintenance)) { ++ GListPtr iter; ++ ++ for (iter = node->details->running_rsc; ++ iter != NULL; iter = iter->next) { ++ resource_t *rsc = (resource_t *) iter->data; ++ ++ ++ if (is_set(rsc->flags, pe_rsc_managed)) { ++ pcmk_clean_shutdown = 0; ++ crm_debug("not clean as %s managed and still running", ++ rsc->id); ++ break; ++ } ++ } ++ if (pcmk_clean_shutdown) { ++ crm_debug("pcmk_clean_shutdown because " ++ "all managed resources down"); ++ } ++ } else { ++ crm_debug("pcmk_clean_shutdown because node is in maintenance"); ++ } + } + notify_parent(); + return; +-- +1.8.3.1 + diff --git a/SOURCES/0020-Fix-sbd-pacemaker-check-for-shutdown-attribute-on-ev.patch b/SOURCES/0020-Fix-sbd-pacemaker-check-for-shutdown-attribute-on-ev.patch new file mode 100644 index 0000000..16a50e0 --- /dev/null +++ b/SOURCES/0020-Fix-sbd-pacemaker-check-for-shutdown-attribute-on-ev.patch @@ -0,0 +1,54 @@ +From c8e3de2a7e98550ea9f27a0c59e13013ce02992d Mon Sep 17 00:00:00 2001 +From: Klaus Wenninger +Date: Fri, 16 Aug 2019 12:07:32 +0200 +Subject: [PATCH] Fix: sbd-pacemaker: check for shutdown attribute on every + cib-diff + +--- + src/sbd-pacemaker.c | 22 +++++++++++++++++++++- + 1 file changed, 21 insertions(+), 1 deletion(-) + +diff --git a/src/sbd-pacemaker.c b/src/sbd-pacemaker.c +index 2b35ff6..1217acf 100644 +--- a/src/sbd-pacemaker.c ++++ b/src/sbd-pacemaker.c +@@ -383,6 +383,24 @@ mon_trigger_refresh(gpointer user_data) + return FALSE; + } + ++#define XPATH_SHUTDOWN "//" 
XML_CIB_TAG_STATE "[@uname='%s']/" \ ++ XML_TAG_TRANSIENT_NODEATTRS "/" XML_TAG_ATTR_SETS "/" \ ++ XML_CIB_TAG_NVPAIR "[@name='" XML_CIB_ATTR_SHUTDOWN "']" ++ ++static gboolean ++shutdown_attr_in_cib(void) ++{ ++ xmlNode *match = NULL; ++ char *xpath_string; ++ ++ xpath_string = crm_strdup_printf(XPATH_SHUTDOWN, local_uname); ++ if (xpath_string) { ++ match = get_xpath_object(xpath_string, current_cib, LOG_TRACE); ++ free(xpath_string); ++ } ++ return (match != NULL); ++} ++ + static void + crm_diff_update(const char *event, xmlNode * msg) + { +@@ -426,8 +444,10 @@ crm_diff_update(const char *event, xmlNode * msg) + * - immediately if the last update was more than 1s ago + * - every 10 updates + * - at most 1s after the last update ++ * - shutdown attribute for our node set for the first time + */ +- if (updates > 10 || (now - last_refresh) > (reconnect_msec / 1000)) { ++ if ((!pcmk_shutdown && shutdown_attr_in_cib()) || ++ (updates > 10 || (now - last_refresh) > (reconnect_msec / 1000))) { + mon_refresh_state(refresh_timer); + updates = 0; + +-- +1.8.3.1 + diff --git a/SPECS/sbd.spec b/SPECS/sbd.spec index 7e26a69..5326751 100644 --- a/SPECS/sbd.spec +++ b/SPECS/sbd.spec @@ -25,7 +25,7 @@ Summary: Storage-based death License: GPLv2+ Group: System Environment/Daemons Version: 1.3.1 -Release: %{buildnum}%{?dist} +Release: %{buildnum}%{?dist}.3 Url: https://github.com/%{github_owner}/%{name} Source0: https://github.com/%{github_owner}/%{name}/archive/%{commit}/%{name}-%{commit}.tar.gz Patch0: 0001-make-pacemaker-dlm-wait-for-sbd-start.patch @@ -44,6 +44,10 @@ Patch12: 0013-Refactor-sbd-common-separate-assignment-and-comparis.patch Patch13: 0014-Fix-sbd-common-avoid-statting-potential-links.patch Patch14: 0015-Refactor-use-pacemaker-s-new-pe-api-with-constructor.patch Patch15: 0016-Feature-make-timeout-action-executed-by-sbd-configur.patch +Patch16: 0017-Fix-sbd-pacemaker-make-handling-of-cib-connection-lo.patch +Patch17: 
0018-Fix-sbd-pacemaker-bail-out-of-status-earlier.patch +Patch18: 0019-Fix-sbd-pacemaker-assume-graceful-exit-if-leftovers-.patch +Patch19: 0020-Fix-sbd-pacemaker-check-for-shutdown-attribute-on-ev.patch BuildRoot: %{_tmppath}/%{name}-%{version}-build BuildRequires: autoconf BuildRequires: automake @@ -59,7 +63,7 @@ BuildRequires: pkgconfig BuildRequires: systemd %if 0%{?rhel} > 0 -ExclusiveArch: i686 x86_64 s390x ppc64le aarch64 %{arm} +ExclusiveArch: i686 x86_64 s390x ppc64le aarch64 %endif %if %{defined systemd_requires} @@ -141,8 +145,21 @@ fi %doc COPYING %changelog -* Mon May 20 2019 Pablo Greco - 1.3.1-18 -- Allow building in armhfp +* Mon Aug 26 2019 Klaus Wenninger - 1.3.1-18.3 +- added missing patch + + Resolves: rhbz#1734061 + +* Fri Aug 16 2019 Klaus Wenninger - 1.3.1-18.2 +- check for shutdown attribute on every cib-diff + + Resolves: rhbz#1734061 + +* Tue Jul 30 2019 Klaus Wenninger - 1.3.1-18.1 +- assume graceful pacemaker exit if leftovers are unmanaged +- make handling of cib-connection loss more robust + + Resolves: rhbz#1734061 * Mon Dec 17 2018 Klaus Wenninger - 1.3.1-18 - make timeout-action executed by sbd configurable