From 5b5ffac4cce861f3621267a73d2ad29f6d807335 Mon Sep 17 00:00:00 2001
From: Klaus Wenninger <klaus.wenninger@aon.at>
Date: Tue, 10 Dec 2019 13:16:45 +0100
Subject: [PATCH] Fix: sbd-pacemaker: sync with pacemakerd for robustness
State query ping of pacemakerd prevents pacemakerd from
starting any sub-daemons (and thus services) if sbd can't
reach it via ipc. As a health-check, get a timestamp from
pacemakerd. On shutdown, fetch info about graceful
shutdown from pacemakerd.
Use new pacemakerd-api provided by pacemaker.
---
configure.ac | 4 ++
src/sbd-pacemaker.c | 132 ++++++++++++++++++++++++++++++++++++++++++++++++----
2 files changed, 126 insertions(+), 10 deletions(-)
diff --git a/configure.ac b/configure.ac
index 23547cf..11d12f0 100644
--- a/configure.ac
+++ b/configure.ac
@@ -81,6 +81,7 @@ AC_CHECK_LIB(crmcluster, crm_peer_init, , missing="yes")
AC_CHECK_LIB(uuid, uuid_unparse, , missing="yes")
AC_CHECK_LIB(cmap, cmap_initialize, , HAVE_cmap=0)
AC_CHECK_LIB(votequorum, votequorum_getinfo, , HAVE_votequorum=0)
+AC_CHECK_LIB(crmcommon, pcmk_pacemakerd_api_ping, HAVE_pacemakerd_api=1, HAVE_pacemakerd_api=0)
dnl pacemaker >= 1.1.8
AC_CHECK_HEADERS(crm/cluster.h)
@@ -153,6 +154,9 @@ AM_CONDITIONAL(CHECK_QDEVICE_SYNC_TIMEOUT,
test "$HAVE_DECL_VOTEQUORUM_QDEVICE_DEFAULT_SYNC_TIMEOUT" = "1" &&
test "$HAVE_cmap" = "1")
+AC_DEFINE_UNQUOTED(USE_PACEMAKERD_API, $HAVE_pacemakerd_api, Turn on synchronization between sbd & pacemakerd)
+AM_CONDITIONAL(USE_PACEMAKERD_API, test "$HAVE_pacemakerd_api" = "1")
+
CONFIGDIR=""
AC_ARG_WITH(configdir,
[ --with-configdir=DIR
diff --git a/src/sbd-pacemaker.c b/src/sbd-pacemaker.c
index 6e53557..1243bfc 100644
--- a/src/sbd-pacemaker.c
+++ b/src/sbd-pacemaker.c
@@ -83,6 +83,62 @@ pe_free_working_set(pe_working_set_t *data_set)
#endif
+static void clean_up(int rc);
+
+#if USE_PACEMAKERD_API
+#include <crm/common/ipc_pacemakerd.h>
+
+static pcmk_ipc_api_t *pacemakerd_api = NULL; /* IPC handle to pacemakerd: created in servant_pcmk(), released in clean_up() */
+static time_t last_ok = (time_t) 0; /* last_good timestamp taken from the latest accepted ping reply; stays 0 until one arrives */
+/* IPC event callback for pacemakerd (registered in servant_pcmk()): stores the health timestamp from ping replies in last_ok and calls clean_up() once pacemakerd reports shutdown complete. */
+static void
+pacemakerd_event_cb(pcmk_ipc_api_t *pacemakerd_api, /* NOTE(review): shadows the file-scope pacemakerd_api; not used in this body */
+ enum pcmk_ipc_event event_type, crm_exit_t status,
+ void *event_data, void *user_data) /* user_data is not used in this body */
+{
+ pcmk_pacemakerd_api_reply_t *reply = event_data; /* dereferenced only after the event-type and status checks below */
+ /* Filter by event type: only reply events fall through to the checks below. */
+ switch (event_type) {
+ case pcmk_ipc_event_disconnect:
+ /* Unexpected: only logged here; mon_timer_notify() calls pcmk_connect_ipc() again on each run */
+ cl_log(LOG_ERR, "Lost connection to pacemakerd\n");
+ return;
+
+ case pcmk_ipc_event_reply:
+ break;
+
+ default:
+ return; /* every other event type is ignored */
+ }
+ /* A reply with a non-OK exit status is logged and dropped without looking at its payload. */
+ if (status != CRM_EX_OK) {
+ cl_log(LOG_ERR, "Bad reply from pacemakerd: %s", /* NOTE(review): no trailing "\n" here, unlike the other messages in this function */
+ crm_exit_str(status));
+ return;
+ }
+ /* Only ping replies carry the state/timestamp data evaluated below. */
+ if (reply->reply_type != pcmk_pacemakerd_reply_ping) {
+ cl_log(LOG_ERR, "Unknown reply type %d from pacemakerd\n",
+ reply->reply_type);
+ } else {
+ if ((reply->data.ping.last_good != (time_t) 0) && /* a zero last_good timestamp is never accepted */
+ (reply->data.ping.status == pcmk_rc_ok)) { /* and the ping status itself must be OK */
+ switch (reply->data.ping.state) {
+ case pcmk_pacemakerd_state_running:
+ case pcmk_pacemakerd_state_shutting_down:
+ last_ok = reply->data.ping.last_good; /* mon_timer_notify() compares this against timeout_watchdog */
+ break;
+ case pcmk_pacemakerd_state_shutdown_complete:
+ clean_up(EXIT_PCMK_SERVANT_GRACEFUL_SHUTDOWN); /* clean_up() calls exit(rc) when rc >= 0 */
+ break;
+ default:
+ break; /* any other state leaves last_ok unchanged */
+ }
+ }
+ }
+}
+#endif
+
extern int disk_count;
static void clean_up(int rc);
@@ -133,10 +189,13 @@ mon_cib_connection_destroy(gpointer user_data)
cib->cmds->signoff(cib);
/* retrigger as last one might have been skipped */
mon_refresh_state(NULL);
+
+#if !USE_PACEMAKERD_API
if (pcmk_clean_shutdown) {
/* assume a graceful pacemaker-shutdown */
clean_up(EXIT_PCMK_SERVANT_GRACEFUL_SHUTDOWN);
}
+#endif
/* getting here we aren't sure about the pacemaker-state
so try to use the timeout to reconnect and get
everything sorted out again
@@ -196,6 +255,13 @@ mon_timer_notify(gpointer data)
g_source_remove(timer_id_notify);
}
+#if USE_PACEMAKERD_API
+ {
+ time_t now = time(NULL);
+
+ if ((last_ok <= now) && (now - last_ok < timeout_watchdog)) {
+#endif
+
if (cib_connected) {
if (counter == counter_max) {
mon_retrieve_current_cib();
@@ -207,6 +273,16 @@ mon_timer_notify(gpointer data)
counter++;
}
}
+
+#if USE_PACEMAKERD_API
+ }
+ }
+ if (pcmk_connect_ipc(pacemakerd_api,
+ pcmk_ipc_dispatch_main) == pcmk_rc_ok) {
+ pcmk_pacemakerd_api_ping(pacemakerd_api, crm_system_name);
+ }
+#endif
+
timer_id_notify = g_timeout_add(timeout_loop * 1000, mon_timer_notify, NULL);
return FALSE;
}
@@ -526,6 +602,14 @@ clean_up(int rc)
cib = NULL;
}
+#if USE_PACEMAKERD_API
+ if (pacemakerd_api != NULL) {
+ pcmk_ipc_api_t *capi = pacemakerd_api;
+ pacemakerd_api = NULL; // Ensure we can't free this twice
+ pcmk_free_ipc_api(capi);
+ }
+#endif
+
if (rc >= 0) {
exit(rc);
}
@@ -535,11 +619,11 @@ clean_up(int rc)
int
servant_pcmk(const char *diskname, int mode, const void* argp)
{
- int exit_code = 0;
+ int exit_code = 0;
- crm_system_name = strdup("sbd:pcmk");
- cl_log(LOG_NOTICE, "Monitoring Pacemaker health");
- set_proc_title("sbd: watcher: Pacemaker");
+ crm_system_name = strdup("sbd:pcmk");
+ cl_log(LOG_NOTICE, "Monitoring Pacemaker health");
+ set_proc_title("sbd: watcher: Pacemaker");
setenv("PCMK_watchdog", "true", 1);
if(debug == 0) {
@@ -548,12 +632,40 @@ servant_pcmk(const char *diskname, int mode, const void* argp)
}
- if (data_set == NULL) {
- data_set = pe_new_working_set();
- }
- if (data_set == NULL) {
- return -1;
- }
+ if (data_set == NULL) {
+ data_set = pe_new_working_set();
+ }
+ if (data_set == NULL) {
+ return -1;
+ }
+
+#if USE_PACEMAKERD_API
+ {
+ int rc;
+
+ rc = pcmk_new_ipc_api(&pacemakerd_api, pcmk_ipc_pacemakerd);
+ if (pacemakerd_api == NULL) {
+ cl_log(LOG_ERR, "Could not connect to pacemakerd: %s\n",
+ pcmk_rc_str(rc));
+ return -1;
+ }
+ pcmk_register_ipc_callback(pacemakerd_api, pacemakerd_event_cb, NULL);
+ do {
+ rc = pcmk_connect_ipc(pacemakerd_api, pcmk_ipc_dispatch_main);
+ if (rc != pcmk_rc_ok) {
+ cl_log(LOG_DEBUG, "Could not connect to pacemakerd: %s\n",
+ pcmk_rc_str(rc));
+ sleep(reconnect_msec / 1000);
+ }
+ } while (rc != pcmk_rc_ok);
+ /* send a ping to pacemakerd to wake it up */
+ pcmk_pacemakerd_api_ping(pacemakerd_api, crm_system_name);
+ /* cib should come up now as well so it's time
+ * to have the inquisitor have a closer look
+ */
+ notify_parent();
+ }
+#endif
if (current_cib == NULL) {
cib = cib_new();
--
1.8.3.1