Blame SOURCES/0005-Fix-sbd-pacemaker-sync-with-pacemakerd-for-robustnes.patch

f21030
From 5b5ffac4cce861f3621267a73d2ad29f6d807335 Mon Sep 17 00:00:00 2001
f21030
From: Klaus Wenninger <klaus.wenninger@aon.at>
f21030
Date: Tue, 10 Dec 2019 13:16:45 +0100
f21030
Subject: [PATCH] Fix: sbd-pacemaker: sync with pacemakerd for robustness
f21030
f21030
State query ping of pacemakerd prevents pacemakerd from
f21030
starting any sub-daemons (and thus services) if sbd can't
f21030
reach it via ipc. As a health-check get timestamp from
f21030
pacemakerd. On shutdown fetch info about graceful
f21030
shutdown from pacemakerd.
f21030
Use new pacemakerd-api provided by pacemaker.
f21030
---
f21030
 configure.ac        |   4 ++
f21030
 src/sbd-pacemaker.c | 132 ++++++++++++++++++++++++++++++++++++++++++++++++----
f21030
 2 files changed, 126 insertions(+), 10 deletions(-)
f21030
f21030
diff --git a/configure.ac b/configure.ac
f21030
index 23547cf..11d12f0 100644
f21030
--- a/configure.ac
f21030
+++ b/configure.ac
f21030
@@ -81,6 +81,7 @@ AC_CHECK_LIB(crmcluster, crm_peer_init, , missing="yes")
f21030
 AC_CHECK_LIB(uuid, uuid_unparse, , missing="yes")
f21030
 AC_CHECK_LIB(cmap, cmap_initialize, , HAVE_cmap=0)
f21030
 AC_CHECK_LIB(votequorum, votequorum_getinfo, , HAVE_votequorum=0)
f21030
+AC_CHECK_LIB(crmcommon, pcmk_pacemakerd_api_ping, HAVE_pacemakerd_api=1, HAVE_pacemakerd_api=0)
f21030
 
f21030
 dnl pacemaker >= 1.1.8
f21030
 AC_CHECK_HEADERS(crm/cluster.h)
f21030
@@ -153,6 +154,9 @@ AM_CONDITIONAL(CHECK_QDEVICE_SYNC_TIMEOUT,
f21030
                test "$HAVE_DECL_VOTEQUORUM_QDEVICE_DEFAULT_SYNC_TIMEOUT" = "1" &&
f21030
                test "$HAVE_cmap" = "1")
f21030
 
f21030
+AC_DEFINE_UNQUOTED(USE_PACEMAKERD_API, $HAVE_pacemakerd_api, Turn on synchronization between sbd & pacemakerd)
f21030
+AM_CONDITIONAL(USE_PACEMAKERD_API, test "$HAVE_pacemakerd_api" = "1")
f21030
+
f21030
 CONFIGDIR=""
f21030
 AC_ARG_WITH(configdir,
f21030
     [  --with-configdir=DIR
f21030
diff --git a/src/sbd-pacemaker.c b/src/sbd-pacemaker.c
f21030
index 6e53557..1243bfc 100644
f21030
--- a/src/sbd-pacemaker.c
f21030
+++ b/src/sbd-pacemaker.c
f21030
@@ -83,6 +83,62 @@ pe_free_working_set(pe_working_set_t *data_set)
f21030
 
f21030
 #endif
f21030
 
f21030
+static void clean_up(int rc);
f21030
+
f21030
+#if USE_PACEMAKERD_API
f21030
+#include <crm/common/ipc_pacemakerd.h>
f21030
+
f21030
+static pcmk_ipc_api_t *pacemakerd_api = NULL;
f21030
+static time_t last_ok = (time_t) 0;
f21030
+
f21030
+static void
f21030
+pacemakerd_event_cb(pcmk_ipc_api_t *pacemakerd_api,
f21030
+                    enum pcmk_ipc_event event_type, crm_exit_t status,
f21030
+                    void *event_data, void *user_data)
f21030
+{
f21030
+    pcmk_pacemakerd_api_reply_t *reply = event_data;
f21030
+
f21030
+    switch (event_type) {
f21030
+        case pcmk_ipc_event_disconnect:
f21030
+            /* Unexpected */
f21030
+            cl_log(LOG_ERR, "Lost connection to pacemakerd\n");
f21030
+            return;
f21030
+
f21030
+        case pcmk_ipc_event_reply:
f21030
+            break;
f21030
+
f21030
+        default:
f21030
+            return;
f21030
+    }
f21030
+
f21030
+    if (status != CRM_EX_OK) {
f21030
+        cl_log(LOG_ERR, "Bad reply from pacemakerd: %s",
f21030
+                crm_exit_str(status));
f21030
+        return;
f21030
+    }
f21030
+
f21030
+    if (reply->reply_type != pcmk_pacemakerd_reply_ping) {
f21030
+        cl_log(LOG_ERR, "Unknown reply type %d from pacemakerd\n",
f21030
+                reply->reply_type);
f21030
+    } else {
f21030
+        if ((reply->data.ping.last_good != (time_t) 0) &&
f21030
+            (reply->data.ping.status == pcmk_rc_ok)) {
f21030
+            switch (reply->data.ping.state) {
f21030
+                case pcmk_pacemakerd_state_running:
f21030
+                case pcmk_pacemakerd_state_shutting_down:
f21030
+                    last_ok = reply->data.ping.last_good;
f21030
+                    break;
f21030
+                case pcmk_pacemakerd_state_shutdown_complete:
f21030
+                    clean_up(EXIT_PCMK_SERVANT_GRACEFUL_SHUTDOWN);
f21030
+                    break;
f21030
+                default:
f21030
+                    break;
f21030
+           }
f21030
+        }
f21030
+    }
f21030
+}
f21030
+#endif
f21030
+
f21030
 extern int disk_count;
f21030
 
f21030
 static void clean_up(int rc);
f21030
@@ -133,10 +189,13 @@ mon_cib_connection_destroy(gpointer user_data)
f21030
 		cib->cmds->signoff(cib);
f21030
 		/* retrigger as last one might have been skipped */
f21030
 		mon_refresh_state(NULL);
f21030
+
f21030
+#if !USE_PACEMAKERD_API
f21030
 		if (pcmk_clean_shutdown) {
f21030
 			/* assume a graceful pacemaker-shutdown */
f21030
 			clean_up(EXIT_PCMK_SERVANT_GRACEFUL_SHUTDOWN);
f21030
 		}
f21030
+#endif
f21030
 		/* getting here we aren't sure about the pacemaker-state
f21030
 		   so try to use the timeout to reconnect and get
f21030
 		   everything sorted out again
f21030
@@ -196,6 +255,13 @@ mon_timer_notify(gpointer data)
f21030
 		g_source_remove(timer_id_notify);
f21030
 	}
f21030
 
f21030
+#if USE_PACEMAKERD_API
f21030
+	{
f21030
+		time_t now = time(NULL);
f21030
+
f21030
+		if ((last_ok <= now) && (now - last_ok < timeout_watchdog)) {
f21030
+#endif
f21030
+
f21030
 	if (cib_connected) {
f21030
 		if (counter == counter_max) {
f21030
 			mon_retrieve_current_cib();
f21030
@@ -207,6 +273,16 @@ mon_timer_notify(gpointer data)
f21030
 			counter++;
f21030
 		}
f21030
 	}
f21030
+
f21030
+#if USE_PACEMAKERD_API
f21030
+		}
f21030
+	}
f21030
+	if (pcmk_connect_ipc(pacemakerd_api,
f21030
+			pcmk_ipc_dispatch_main) == pcmk_rc_ok) {
f21030
+		pcmk_pacemakerd_api_ping(pacemakerd_api, crm_system_name);
f21030
+	}
f21030
+#endif
f21030
+
f21030
 	timer_id_notify = g_timeout_add(timeout_loop * 1000, mon_timer_notify, NULL);
f21030
 	return FALSE;
f21030
 }
f21030
@@ -526,6 +602,14 @@ clean_up(int rc)
f21030
 		cib = NULL;
f21030
 	}
f21030
 
f21030
+#if USE_PACEMAKERD_API
f21030
+	if (pacemakerd_api != NULL) {
f21030
+		pcmk_ipc_api_t *capi = pacemakerd_api;
f21030
+		pacemakerd_api = NULL; // Ensure we can't free this twice
f21030
+		pcmk_free_ipc_api(capi);
f21030
+	}
f21030
+#endif
f21030
+
f21030
 	if (rc >= 0) {
f21030
 		exit(rc);
f21030
 	}
f21030
@@ -535,11 +619,11 @@ clean_up(int rc)
f21030
 int
f21030
 servant_pcmk(const char *diskname, int mode, const void* argp)
f21030
 {
f21030
-	int exit_code = 0;
f21030
+    int exit_code = 0;
f21030
 
f21030
-        crm_system_name = strdup("sbd:pcmk");
f21030
-	cl_log(LOG_NOTICE, "Monitoring Pacemaker health");
f21030
-	set_proc_title("sbd: watcher: Pacemaker");
f21030
+    crm_system_name = strdup("sbd:pcmk");
f21030
+    cl_log(LOG_NOTICE, "Monitoring Pacemaker health");
f21030
+    set_proc_title("sbd: watcher: Pacemaker");
f21030
         setenv("PCMK_watchdog", "true", 1);
f21030
 
f21030
         if(debug == 0) {
f21030
@@ -548,12 +632,40 @@ servant_pcmk(const char *diskname, int mode, const void* argp)
f21030
         }
f21030
 
f21030
 
f21030
-	if (data_set == NULL) {
f21030
-		data_set = pe_new_working_set();
f21030
-	}
f21030
-	if (data_set == NULL) {
f21030
-		return -1;
f21030
-	}
f21030
+    if (data_set == NULL) {
f21030
+        data_set = pe_new_working_set();
f21030
+    }
f21030
+    if (data_set == NULL) {
f21030
+        return -1;
f21030
+    }
f21030
+
f21030
+#if USE_PACEMAKERD_API
f21030
+    {
f21030
+    int rc;
f21030
+
f21030
+        rc = pcmk_new_ipc_api(&pacemakerd_api, pcmk_ipc_pacemakerd);
f21030
+        if (pacemakerd_api == NULL) {
f21030
+            cl_log(LOG_ERR, "Could not connect to pacemakerd: %s\n",
f21030
+                    pcmk_rc_str(rc));
f21030
+            return -1;
f21030
+        }
f21030
+        pcmk_register_ipc_callback(pacemakerd_api, pacemakerd_event_cb, NULL);
f21030
+        do {
f21030
+            rc = pcmk_connect_ipc(pacemakerd_api, pcmk_ipc_dispatch_main);
f21030
+            if (rc != pcmk_rc_ok) {
f21030
+                cl_log(LOG_DEBUG, "Could not connect to pacemakerd: %s\n",
f21030
+                    pcmk_rc_str(rc));
f21030
+                sleep(reconnect_msec / 1000);
f21030
+            }
f21030
+        } while (rc != pcmk_rc_ok);
f21030
+        /* send a ping to pacemakerd to wake it up */
f21030
+        pcmk_pacemakerd_api_ping(pacemakerd_api, crm_system_name);
f21030
+        /* cib should come up now as well so it's time
f21030
+         * to have the inquisitor have a closer look
f21030
+        */
f21030
+        notify_parent();
f21030
+    }
f21030
+#endif
f21030
 
f21030
 	if (current_cib == NULL) {
f21030
 		cib = cib_new();
f21030
-- 
f21030
1.8.3.1
f21030