Blob Blame History Raw
From 5b5ffac4cce861f3621267a73d2ad29f6d807335 Mon Sep 17 00:00:00 2001
From: Klaus Wenninger <klaus.wenninger@aon.at>
Date: Tue, 10 Dec 2019 13:16:45 +0100
Subject: [PATCH] Fix: sbd-pacemaker: sync with pacemakerd for robustness

State query ping of pacemakerd prevents pacemakerd from
starting any sub-daemons (and thus services) if sbd can't
reach it via ipc. As a health-check, get a timestamp from
pacemakerd. On shutdown, fetch info about graceful
shutdown from pacemakerd.
Use new pacemakerd-api provided by pacemaker.
---
 configure.ac        |   4 ++
 src/sbd-pacemaker.c | 132 ++++++++++++++++++++++++++++++++++++++++++++++++----
 2 files changed, 126 insertions(+), 10 deletions(-)

diff --git a/configure.ac b/configure.ac
index 23547cf..11d12f0 100644
--- a/configure.ac
+++ b/configure.ac
@@ -81,6 +81,7 @@ AC_CHECK_LIB(crmcluster, crm_peer_init, , missing="yes")
 AC_CHECK_LIB(uuid, uuid_unparse, , missing="yes")
 AC_CHECK_LIB(cmap, cmap_initialize, , HAVE_cmap=0)
 AC_CHECK_LIB(votequorum, votequorum_getinfo, , HAVE_votequorum=0)
+AC_CHECK_LIB(crmcommon, pcmk_pacemakerd_api_ping, HAVE_pacemakerd_api=1, HAVE_pacemakerd_api=0)
 
 dnl pacemaker >= 1.1.8
 AC_CHECK_HEADERS(crm/cluster.h)
@@ -153,6 +154,9 @@ AM_CONDITIONAL(CHECK_QDEVICE_SYNC_TIMEOUT,
                test "$HAVE_DECL_VOTEQUORUM_QDEVICE_DEFAULT_SYNC_TIMEOUT" = "1" &&
                test "$HAVE_cmap" = "1")
 
+AC_DEFINE_UNQUOTED(USE_PACEMAKERD_API, $HAVE_pacemakerd_api, Turn on synchronization between sbd & pacemakerd)
+AM_CONDITIONAL(USE_PACEMAKERD_API, test "$HAVE_pacemakerd_api" = "1")
+
 CONFIGDIR=""
 AC_ARG_WITH(configdir,
     [  --with-configdir=DIR
diff --git a/src/sbd-pacemaker.c b/src/sbd-pacemaker.c
index 6e53557..1243bfc 100644
--- a/src/sbd-pacemaker.c
+++ b/src/sbd-pacemaker.c
@@ -83,6 +83,62 @@ pe_free_working_set(pe_working_set_t *data_set)
 
 #endif
 
+static void clean_up(int rc);
+
+#if USE_PACEMAKERD_API
+#include <crm/common/ipc_pacemakerd.h>
+
+static pcmk_ipc_api_t *pacemakerd_api = NULL;
+static time_t last_ok = (time_t) 0;
+
+static void /* IPC event callback: evaluates ping replies received from pacemakerd */
+pacemakerd_event_cb(pcmk_ipc_api_t *pacemakerd_api, /* NOTE: shadows the file-scope pacemakerd_api */
+                    enum pcmk_ipc_event event_type, crm_exit_t status,
+                    void *event_data, void *user_data)
+{
+    pcmk_pacemakerd_api_reply_t *reply = event_data; /* only dereferenced after the event-type and status checks */
+
+    switch (event_type) {
+        case pcmk_ipc_event_disconnect:
+            /* Unexpected: the lost connection is only logged here */
+            cl_log(LOG_ERR, "Lost connection to pacemakerd\n");
+            return;
+
+        case pcmk_ipc_event_reply:
+            break; /* replies are evaluated below the switch */
+
+        default:
+            return; /* every other event type is ignored */
+    }
+
+    if (status != CRM_EX_OK) {
+        cl_log(LOG_ERR, "Bad reply from pacemakerd: %s",
+                crm_exit_str(status));
+        return;
+    }
+
+    if (reply->reply_type != pcmk_pacemakerd_reply_ping) {
+        cl_log(LOG_ERR, "Unknown reply type %d from pacemakerd\n",
+                reply->reply_type);
+    } else {
+        if ((reply->data.ping.last_good != (time_t) 0) &&
+            (reply->data.ping.status == pcmk_rc_ok)) {
+            switch (reply->data.ping.state) {
+                case pcmk_pacemakerd_state_running:
+                case pcmk_pacemakerd_state_shutting_down:
+                    last_ok = reply->data.ping.last_good; /* timestamp later compared against now in mon_timer_notify() */
+                    break;
+                case pcmk_pacemakerd_state_shutdown_complete:
+                    clean_up(EXIT_PCMK_SERVANT_GRACEFUL_SHUTDOWN); /* clean_up() calls exit() when rc >= 0 */
+                    break;
+                default:
+                    break; /* any other state leaves last_ok untouched */
+           }
+        }
+    }
+}
+#endif
+
 extern int disk_count;
 
 static void clean_up(int rc);
@@ -133,10 +189,13 @@ mon_cib_connection_destroy(gpointer user_data)
 		cib->cmds->signoff(cib);
 		/* retrigger as last one might have been skipped */
 		mon_refresh_state(NULL);
+
+#if !USE_PACEMAKERD_API
 		if (pcmk_clean_shutdown) {
 			/* assume a graceful pacemaker-shutdown */
 			clean_up(EXIT_PCMK_SERVANT_GRACEFUL_SHUTDOWN);
 		}
+#endif
 		/* getting here we aren't sure about the pacemaker-state
 		   so try to use the timeout to reconnect and get
 		   everything sorted out again
@@ -196,6 +255,13 @@ mon_timer_notify(gpointer data)
 		g_source_remove(timer_id_notify);
 	}
 
+#if USE_PACEMAKERD_API
+	{
+		time_t now = time(NULL);
+
+		if ((last_ok <= now) && (now - last_ok < timeout_watchdog)) {
+#endif
+
 	if (cib_connected) {
 		if (counter == counter_max) {
 			mon_retrieve_current_cib();
@@ -207,6 +273,16 @@ mon_timer_notify(gpointer data)
 			counter++;
 		}
 	}
+
+#if USE_PACEMAKERD_API
+		}
+	}
+	if (pcmk_connect_ipc(pacemakerd_api,
+			pcmk_ipc_dispatch_main) == pcmk_rc_ok) {
+		pcmk_pacemakerd_api_ping(pacemakerd_api, crm_system_name);
+	}
+#endif
+
 	timer_id_notify = g_timeout_add(timeout_loop * 1000, mon_timer_notify, NULL);
 	return FALSE;
 }
@@ -526,6 +602,14 @@ clean_up(int rc)
 		cib = NULL;
 	}
 
+#if USE_PACEMAKERD_API
+	if (pacemakerd_api != NULL) {
+		pcmk_ipc_api_t *capi = pacemakerd_api;
+		pacemakerd_api = NULL; // Ensure we can't free this twice
+		pcmk_free_ipc_api(capi);
+	}
+#endif
+
 	if (rc >= 0) {
 		exit(rc);
 	}
@@ -535,11 +619,11 @@ clean_up(int rc)
 int
 servant_pcmk(const char *diskname, int mode, const void* argp)
 {
-	int exit_code = 0;
+    int exit_code = 0;
 
-        crm_system_name = strdup("sbd:pcmk");
-	cl_log(LOG_NOTICE, "Monitoring Pacemaker health");
-	set_proc_title("sbd: watcher: Pacemaker");
+    crm_system_name = strdup("sbd:pcmk");
+    cl_log(LOG_NOTICE, "Monitoring Pacemaker health");
+    set_proc_title("sbd: watcher: Pacemaker");
         setenv("PCMK_watchdog", "true", 1);
 
         if(debug == 0) {
@@ -548,12 +632,40 @@ servant_pcmk(const char *diskname, int mode, const void* argp)
         }
 
 
-	if (data_set == NULL) {
-		data_set = pe_new_working_set();
-	}
-	if (data_set == NULL) {
-		return -1;
-	}
+    if (data_set == NULL) {
+        data_set = pe_new_working_set();
+    }
+    if (data_set == NULL) {
+        return -1;
+    }
+
+#if USE_PACEMAKERD_API
+    {
+    int rc;
+
+        rc = pcmk_new_ipc_api(&pacemakerd_api, pcmk_ipc_pacemakerd);
+        if (pacemakerd_api == NULL) {
+            cl_log(LOG_ERR, "Could not connect to pacemakerd: %s\n",
+                    pcmk_rc_str(rc));
+            return -1;
+        }
+        pcmk_register_ipc_callback(pacemakerd_api, pacemakerd_event_cb, NULL);
+        do {
+            rc = pcmk_connect_ipc(pacemakerd_api, pcmk_ipc_dispatch_main);
+            if (rc != pcmk_rc_ok) {
+                cl_log(LOG_DEBUG, "Could not connect to pacemakerd: %s\n",
+                    pcmk_rc_str(rc));
+                sleep(reconnect_msec / 1000);
+            }
+        } while (rc != pcmk_rc_ok);
+        /* send a ping to pacemakerd to wake it up */
+        pcmk_pacemakerd_api_ping(pacemakerd_api, crm_system_name);
+        /* cib should come up now as well so it's time
+         * to have the inquisitor have a closer look
+        */
+        notify_parent();
+    }
+#endif
 
 	if (current_cib == NULL) {
 		cib = cib_new();
-- 
1.8.3.1