|
|
52408e |
From a716a8ddd3df615009bcff3bd96dd9ae64cb5f68 Mon Sep 17 00:00:00 2001
|
|
|
52408e |
From: Klaus Wenninger <klaus.wenninger@aon.at>
|
|
|
52408e |
Date: Tue, 19 Mar 2019 21:36:15 +0100
|
|
|
52408e |
Subject: [PATCH] Fix: sbd-pacemaker: make handling of cib-connection loss more
|
|
|
52408e |
robust
|
|
|
52408e |
|
|
|
52408e |
Exit pcmk-servant on graceful pacemaker shutdown and go back
|
|
|
52408e |
to the state before pacemaker was initially detected.
|
|
|
52408e |
Otherwise, purge all CIB traces and try to reconnect within the timeout.
|
|
|
52408e |
---
|
|
|
52408e |
src/sbd-inquisitor.c | 24 ++++++++++++++++++++----
|
|
|
52408e |
src/sbd-md.c | 30 +++++++++++++++---------------
|
|
|
52408e |
src/sbd-pacemaker.c | 38 +++++++++++++++++++++++++++++---------
|
|
|
52408e |
src/sbd.h | 11 +++++++----
|
|
|
52408e |
4 files changed, 71 insertions(+), 32 deletions(-)
|
|
|
52408e |
|
|
|
52408e |
diff --git a/src/sbd-inquisitor.c b/src/sbd-inquisitor.c
|
|
|
52408e |
index 9be6c99..77c6e4f 100644
|
|
|
52408e |
--- a/src/sbd-inquisitor.c
|
|
|
52408e |
+++ b/src/sbd-inquisitor.c
|
|
|
52408e |
@@ -490,19 +490,19 @@ void inquisitor_child(void)
|
|
|
52408e |
if (sbd_is_disk(s)) {
|
|
|
52408e |
if (WIFEXITED(status)) {
|
|
|
52408e |
switch(WEXITSTATUS(status)) {
|
|
|
52408e |
- case EXIT_MD_IO_FAIL:
|
|
|
52408e |
+ case EXIT_MD_SERVANT_IO_FAIL:
|
|
|
52408e |
DBGLOG(LOG_INFO, "Servant for %s requests to be disowned",
|
|
|
52408e |
s->devname);
|
|
|
52408e |
break;
|
|
|
52408e |
- case EXIT_MD_REQUEST_RESET:
|
|
|
52408e |
+ case EXIT_MD_SERVANT_REQUEST_RESET:
|
|
|
52408e |
cl_log(LOG_WARNING, "%s requested a reset", s->devname);
|
|
|
52408e |
do_reset();
|
|
|
52408e |
break;
|
|
|
52408e |
- case EXIT_MD_REQUEST_SHUTOFF:
|
|
|
52408e |
+ case EXIT_MD_SERVANT_REQUEST_SHUTOFF:
|
|
|
52408e |
cl_log(LOG_WARNING, "%s requested a shutoff", s->devname);
|
|
|
52408e |
do_off();
|
|
|
52408e |
break;
|
|
|
52408e |
- case EXIT_MD_REQUEST_CRASHDUMP:
|
|
|
52408e |
+ case EXIT_MD_SERVANT_REQUEST_CRASHDUMP:
|
|
|
52408e |
cl_log(LOG_WARNING, "%s requested a crashdump", s->devname);
|
|
|
52408e |
do_crashdump();
|
|
|
52408e |
break;
|
|
|
52408e |
@@ -510,6 +510,22 @@ void inquisitor_child(void)
|
|
|
52408e |
break;
|
|
|
52408e |
}
|
|
|
52408e |
}
|
|
|
52408e |
+ } else if (sbd_is_pcmk(s)) {
|
|
|
52408e |
+ if (WIFEXITED(status)) {
|
|
|
52408e |
+ switch(WEXITSTATUS(status)) {
|
|
|
52408e |
+ case EXIT_PCMK_SERVANT_GRACEFUL_SHUTDOWN:
|
|
|
52408e |
+ DBGLOG(LOG_INFO, "PCMK-Servant has exited gracefully");
|
|
|
52408e |
+ /* revert to state prior to pacemaker-detection */
|
|
|
52408e |
+ s->restarts = 0;
|
|
|
52408e |
+ s->restart_blocked = 0;
|
|
|
52408e |
+ cluster_appeared = 0;
|
|
|
52408e |
+ s->outdated = 1;
|
|
|
52408e |
+ s->t_last.tv_sec = 0;
|
|
|
52408e |
+ break;
|
|
|
52408e |
+ default:
|
|
|
52408e |
+ break;
|
|
|
52408e |
+ }
|
|
|
52408e |
+ }
|
|
|
52408e |
}
|
|
|
52408e |
cleanup_servant_by_pid(pid);
|
|
|
52408e |
}
|
|
|
52408e |
diff --git a/src/sbd-md.c b/src/sbd-md.c
|
|
|
52408e |
index ba2c34d..c51d381 100644
|
|
|
52408e |
--- a/src/sbd-md.c
|
|
|
52408e |
+++ b/src/sbd-md.c
|
|
|
52408e |
@@ -1061,19 +1061,19 @@ int servant_md(const char *diskname, int mode, const void* argp)
|
|
|
52408e |
|
|
|
52408e |
st = open_device(diskname, LOG_WARNING);
|
|
|
52408e |
if (!st) {
|
|
|
52408e |
- exit(EXIT_MD_IO_FAIL);
|
|
|
52408e |
+ exit(EXIT_MD_SERVANT_IO_FAIL);
|
|
|
52408e |
}
|
|
|
52408e |
|
|
|
52408e |
s_header = header_get(st);
|
|
|
52408e |
if (!s_header) {
|
|
|
52408e |
cl_log(LOG_ERR, "Not a valid header on %s", diskname);
|
|
|
52408e |
- exit(EXIT_MD_IO_FAIL);
|
|
|
52408e |
+ exit(EXIT_MD_SERVANT_IO_FAIL);
|
|
|
52408e |
}
|
|
|
52408e |
|
|
|
52408e |
if (servant_check_timeout_inconsistent(s_header) < 0) {
|
|
|
52408e |
cl_log(LOG_ERR, "Timeouts on %s do not match first device",
|
|
|
52408e |
diskname);
|
|
|
52408e |
- exit(EXIT_MD_IO_FAIL);
|
|
|
52408e |
+ exit(EXIT_MD_SERVANT_IO_FAIL);
|
|
|
52408e |
}
|
|
|
52408e |
|
|
|
52408e |
if (s_header->minor_version > 0) {
|
|
|
52408e |
@@ -1086,14 +1086,14 @@ int servant_md(const char *diskname, int mode, const void* argp)
|
|
|
52408e |
cl_log(LOG_ERR,
|
|
|
52408e |
"No slot allocated, and automatic allocation failed for disk %s.",
|
|
|
52408e |
diskname);
|
|
|
52408e |
- rc = EXIT_MD_IO_FAIL;
|
|
|
52408e |
+ rc = EXIT_MD_SERVANT_IO_FAIL;
|
|
|
52408e |
goto out;
|
|
|
52408e |
}
|
|
|
52408e |
s_node = sector_alloc();
|
|
|
52408e |
if (slot_read(st, mbox, s_node) < 0) {
|
|
|
52408e |
cl_log(LOG_ERR, "Unable to read node entry on %s",
|
|
|
52408e |
diskname);
|
|
|
52408e |
- exit(EXIT_MD_IO_FAIL);
|
|
|
52408e |
+ exit(EXIT_MD_SERVANT_IO_FAIL);
|
|
|
52408e |
}
|
|
|
52408e |
|
|
|
52408e |
cl_log(LOG_NOTICE, "Monitoring slot %d on disk %s", mbox, diskname);
|
|
|
52408e |
@@ -1109,7 +1109,7 @@ int servant_md(const char *diskname, int mode, const void* argp)
|
|
|
52408e |
if (mode > 0) {
|
|
|
52408e |
if (mbox_read(st, mbox, s_mbox) < 0) {
|
|
|
52408e |
cl_log(LOG_ERR, "mbox read failed during start-up in servant.");
|
|
|
52408e |
- rc = EXIT_MD_IO_FAIL;
|
|
|
52408e |
+ rc = EXIT_MD_SERVANT_IO_FAIL;
|
|
|
52408e |
goto out;
|
|
|
52408e |
}
|
|
|
52408e |
if (s_mbox->cmd != SBD_MSG_EXIT &&
|
|
|
52408e |
@@ -1125,7 +1125,7 @@ int servant_md(const char *diskname, int mode, const void* argp)
|
|
|
52408e |
DBGLOG(LOG_INFO, "First servant start - zeroing inbox");
|
|
|
52408e |
memset(s_mbox, 0, sizeof(*s_mbox));
|
|
|
52408e |
if (mbox_write(st, mbox, s_mbox) < 0) {
|
|
|
52408e |
- rc = EXIT_MD_IO_FAIL;
|
|
|
52408e |
+ rc = EXIT_MD_SERVANT_IO_FAIL;
|
|
|
52408e |
goto out;
|
|
|
52408e |
}
|
|
|
52408e |
}
|
|
|
52408e |
@@ -1154,28 +1154,28 @@ int servant_md(const char *diskname, int mode, const void* argp)
|
|
|
52408e |
s_header_retry = header_get(st);
|
|
|
52408e |
if (!s_header_retry) {
|
|
|
52408e |
cl_log(LOG_ERR, "No longer found a valid header on %s", diskname);
|
|
|
52408e |
- exit(EXIT_MD_IO_FAIL);
|
|
|
52408e |
+ exit(EXIT_MD_SERVANT_IO_FAIL);
|
|
|
52408e |
}
|
|
|
52408e |
if (memcmp(s_header, s_header_retry, sizeof(*s_header)) != 0) {
|
|
|
52408e |
cl_log(LOG_ERR, "Header on %s changed since start-up!", diskname);
|
|
|
52408e |
- exit(EXIT_MD_IO_FAIL);
|
|
|
52408e |
+ exit(EXIT_MD_SERVANT_IO_FAIL);
|
|
|
52408e |
}
|
|
|
52408e |
free(s_header_retry);
|
|
|
52408e |
|
|
|
52408e |
s_node_retry = sector_alloc();
|
|
|
52408e |
if (slot_read(st, mbox, s_node_retry) < 0) {
|
|
|
52408e |
cl_log(LOG_ERR, "slot read failed in servant.");
|
|
|
52408e |
- exit(EXIT_MD_IO_FAIL);
|
|
|
52408e |
+ exit(EXIT_MD_SERVANT_IO_FAIL);
|
|
|
52408e |
}
|
|
|
52408e |
if (memcmp(s_node, s_node_retry, sizeof(*s_node)) != 0) {
|
|
|
52408e |
cl_log(LOG_ERR, "Node entry on %s changed since start-up!", diskname);
|
|
|
52408e |
- exit(EXIT_MD_IO_FAIL);
|
|
|
52408e |
+ exit(EXIT_MD_SERVANT_IO_FAIL);
|
|
|
52408e |
}
|
|
|
52408e |
free(s_node_retry);
|
|
|
52408e |
|
|
|
52408e |
if (mbox_read(st, mbox, s_mbox) < 0) {
|
|
|
52408e |
cl_log(LOG_ERR, "mbox read failed in servant.");
|
|
|
52408e |
- exit(EXIT_MD_IO_FAIL);
|
|
|
52408e |
+ exit(EXIT_MD_SERVANT_IO_FAIL);
|
|
|
52408e |
}
|
|
|
52408e |
|
|
|
52408e |
if (s_mbox->cmd > 0) {
|
|
|
52408e |
@@ -1190,14 +1190,14 @@ int servant_md(const char *diskname, int mode, const void* argp)
|
|
|
52408e |
sigqueue(ppid, SIG_TEST, signal_value);
|
|
|
52408e |
break;
|
|
|
52408e |
case SBD_MSG_RESET:
|
|
|
52408e |
- exit(EXIT_MD_REQUEST_RESET);
|
|
|
52408e |
+ exit(EXIT_MD_SERVANT_REQUEST_RESET);
|
|
|
52408e |
case SBD_MSG_OFF:
|
|
|
52408e |
- exit(EXIT_MD_REQUEST_SHUTOFF);
|
|
|
52408e |
+ exit(EXIT_MD_SERVANT_REQUEST_SHUTOFF);
|
|
|
52408e |
case SBD_MSG_EXIT:
|
|
|
52408e |
sigqueue(ppid, SIG_EXITREQ, signal_value);
|
|
|
52408e |
break;
|
|
|
52408e |
case SBD_MSG_CRASHDUMP:
|
|
|
52408e |
- exit(EXIT_MD_REQUEST_CRASHDUMP);
|
|
|
52408e |
+ exit(EXIT_MD_SERVANT_REQUEST_CRASHDUMP);
|
|
|
52408e |
default:
|
|
|
52408e |
/* FIXME:
|
|
|
52408e |
An "unknown" message might result
|
|
|
52408e |
diff --git a/src/sbd-pacemaker.c b/src/sbd-pacemaker.c
|
|
|
52408e |
index aac355a..c69fc55 100644
|
|
|
52408e |
--- a/src/sbd-pacemaker.c
|
|
|
52408e |
+++ b/src/sbd-pacemaker.c
|
|
|
52408e |
@@ -103,6 +103,9 @@ static pe_working_set_t *data_set = NULL;
|
|
|
52408e |
|
|
|
52408e |
static long last_refresh = 0;
|
|
|
52408e |
|
|
|
52408e |
+static int pcmk_clean_shutdown = 0;
|
|
|
52408e |
+static int pcmk_shutdown = 0;
|
|
|
52408e |
+
|
|
|
52408e |
static gboolean
|
|
|
52408e |
mon_timer_reconnect(gpointer data)
|
|
|
52408e |
{
|
|
|
52408e |
@@ -128,10 +131,26 @@ mon_cib_connection_destroy(gpointer user_data)
|
|
|
52408e |
{
|
|
|
52408e |
if (cib) {
|
|
|
52408e |
cib->cmds->signoff(cib);
|
|
|
52408e |
+ /* retrigger as last one might have been skipped */
|
|
|
52408e |
+ mon_refresh_state(NULL);
|
|
|
52408e |
+ if (pcmk_clean_shutdown) {
|
|
|
52408e |
+ /* assume a graceful pacemaker-shutdown */
|
|
|
52408e |
+ clean_up(EXIT_PCMK_SERVANT_GRACEFUL_SHUTDOWN);
|
|
|
52408e |
+ }
|
|
|
52408e |
+ /* getting here we aren't sure about the pacemaker-state
|
|
|
52408e |
+ so try to use the timeout to reconnect and get
|
|
|
52408e |
+ everything sorted out again
|
|
|
52408e |
+ */
|
|
|
52408e |
+ pcmk_shutdown = 0;
|
|
|
52408e |
set_servant_health(pcmk_health_transient, LOG_WARNING, "Disconnected from CIB");
|
|
|
52408e |
timer_id_reconnect = g_timeout_add(reconnect_msec, mon_timer_reconnect, NULL);
|
|
|
52408e |
}
|
|
|
52408e |
cib_connected = 0;
|
|
|
52408e |
+ /* no sense in looking into outdated cib, trying to apply patch, ... */
|
|
|
52408e |
+ if (current_cib) {
|
|
|
52408e |
+ free_xml(current_cib);
|
|
|
52408e |
+ current_cib = NULL;
|
|
|
52408e |
+ }
|
|
|
52408e |
return;
|
|
|
52408e |
}
|
|
|
52408e |
|
|
|
52408e |
@@ -171,7 +190,7 @@ static gboolean
|
|
|
52408e |
mon_timer_notify(gpointer data)
|
|
|
52408e |
{
|
|
|
52408e |
static int counter = 0;
|
|
|
52408e |
- int counter_max = timeout_watchdog / timeout_loop;
|
|
|
52408e |
+ int counter_max = timeout_watchdog / timeout_loop / 2;
|
|
|
52408e |
|
|
|
52408e |
if (timer_id_notify > 0) {
|
|
|
52408e |
g_source_remove(timer_id_notify);
|
|
|
52408e |
@@ -280,11 +299,6 @@ compute_status(pe_working_set_t * data_set)
|
|
|
52408e |
} else if (node->details->pending) {
|
|
|
52408e |
set_servant_health(pcmk_health_pending, LOG_WARNING, "Node state: pending");
|
|
|
52408e |
|
|
|
52408e |
-#if 0
|
|
|
52408e |
- } else if (node->details->shutdown) {
|
|
|
52408e |
- set_servant_health(pcmk_health_shutdown, LOG_WARNING, "Node state: shutting down");
|
|
|
52408e |
-#endif
|
|
|
52408e |
-
|
|
|
52408e |
} else if (data_set->flags & pe_flag_have_quorum) {
|
|
|
52408e |
set_servant_health(pcmk_health_online, LOG_INFO, "Node state: online");
|
|
|
52408e |
ever_had_quorum = TRUE;
|
|
|
52408e |
@@ -315,6 +329,12 @@ compute_status(pe_working_set_t * data_set)
|
|
|
52408e |
}
|
|
|
52408e |
}
|
|
|
52408e |
|
|
|
52408e |
+ if (node->details->shutdown) {
|
|
|
52408e |
+ pcmk_shutdown = 1;
|
|
|
52408e |
+ }
|
|
|
52408e |
+ if (pcmk_shutdown && !(node->details->running_rsc)) {
|
|
|
52408e |
+ pcmk_clean_shutdown = 1;
|
|
|
52408e |
+ }
|
|
|
52408e |
notify_parent();
|
|
|
52408e |
return;
|
|
|
52408e |
}
|
|
|
52408e |
@@ -339,7 +359,7 @@ crm_diff_update(const char *event, xmlNode * msg)
|
|
|
52408e |
static mainloop_timer_t *refresh_timer = NULL;
|
|
|
52408e |
|
|
|
52408e |
if(refresh_timer == NULL) {
|
|
|
52408e |
- refresh_timer = mainloop_timer_add("refresh", 2000, FALSE, mon_trigger_refresh, NULL);
|
|
|
52408e |
+ refresh_timer = mainloop_timer_add("refresh", reconnect_msec, FALSE, mon_trigger_refresh, NULL);
|
|
|
52408e |
refresh_trigger = mainloop_add_trigger(G_PRIORITY_LOW, mon_refresh_state, refresh_timer);
|
|
|
52408e |
}
|
|
|
52408e |
|
|
|
52408e |
@@ -369,9 +389,9 @@ crm_diff_update(const char *event, xmlNode * msg)
|
|
|
52408e |
}
|
|
|
52408e |
|
|
|
52408e |
/* Refresh
|
|
|
52408e |
- * - immediately if the last update was more than 5s ago
|
|
|
52408e |
+ * - immediately if the last update was more than 1s ago
|
|
|
52408e |
* - every 10 updates
|
|
|
52408e |
- * - at most 2s after the last update
|
|
|
52408e |
+ * - at most 1s after the last update
|
|
|
52408e |
*/
|
|
|
52408e |
if (updates > 10 || (now - last_refresh) > (reconnect_msec / 1000)) {
|
|
|
52408e |
mon_refresh_state(refresh_timer);
|
|
|
52408e |
diff --git a/src/sbd.h b/src/sbd.h
|
|
|
52408e |
index 6fe07f9..3b05a11 100644
|
|
|
52408e |
--- a/src/sbd.h
|
|
|
52408e |
+++ b/src/sbd.h
|
|
|
52408e |
@@ -54,10 +54,13 @@
|
|
|
52408e |
/* FIXME: should add dynamic check of SIG_XX >= SIGRTMAX */
|
|
|
52408e |
|
|
|
52408e |
/* exit status for disk-servant */
|
|
|
52408e |
-#define EXIT_MD_IO_FAIL 20
|
|
|
52408e |
-#define EXIT_MD_REQUEST_RESET 21
|
|
|
52408e |
-#define EXIT_MD_REQUEST_SHUTOFF 22
|
|
|
52408e |
-#define EXIT_MD_REQUEST_CRASHDUMP 23
|
|
|
52408e |
+#define EXIT_MD_SERVANT_IO_FAIL 20
|
|
|
52408e |
+#define EXIT_MD_SERVANT_REQUEST_RESET 21
|
|
|
52408e |
+#define EXIT_MD_SERVANT_REQUEST_SHUTOFF 22
|
|
|
52408e |
+#define EXIT_MD_SERVANT_REQUEST_CRASHDUMP 23
|
|
|
52408e |
+
|
|
|
52408e |
+/* exit status for pcmk-servant */
|
|
|
52408e |
+#define EXIT_PCMK_SERVANT_GRACEFUL_SHUTDOWN 30
|
|
|
52408e |
|
|
|
52408e |
#define HOG_CHAR 0xff
|
|
|
52408e |
#define SECTOR_NAME_MAX 63
|
|
|
52408e |
--
|
|
|
52408e |
1.8.3.1
|
|
|
52408e |
|