From 5b5ffac4cce861f3621267a73d2ad29f6d807335 Mon Sep 17 00:00:00 2001 From: Klaus Wenninger Date: Tue, 10 Dec 2019 13:16:45 +0100 Subject: [PATCH] Fix: sbd-pacemaker: sync with pacemakerd for robustness State query ping of pacemakerd prevents pacemakerd from starting any sub-daemons (and thus services) if sbd can't reach it via ipc. As a health-check get timestamp from pacemakerd. On shutdown fetch info about graceful shutdown from pacemakerd. Use new pacemakerd-api provided by pacemaker. --- configure.ac | 4 ++ src/sbd-pacemaker.c | 132 ++++++++++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 126 insertions(+), 10 deletions(-) diff --git a/configure.ac b/configure.ac index 23547cf..11d12f0 100644 --- a/configure.ac +++ b/configure.ac @@ -81,6 +81,7 @@ AC_CHECK_LIB(crmcluster, crm_peer_init, , missing="yes") AC_CHECK_LIB(uuid, uuid_unparse, , missing="yes") AC_CHECK_LIB(cmap, cmap_initialize, , HAVE_cmap=0) AC_CHECK_LIB(votequorum, votequorum_getinfo, , HAVE_votequorum=0) +AC_CHECK_LIB(crmcommon, pcmk_pacemakerd_api_ping, HAVE_pacemakerd_api=1, HAVE_pacemakerd_api=0) dnl pacemaker >= 1.1.8 AC_CHECK_HEADERS(crm/cluster.h) @@ -153,6 +154,9 @@ AM_CONDITIONAL(CHECK_QDEVICE_SYNC_TIMEOUT, test "$HAVE_DECL_VOTEQUORUM_QDEVICE_DEFAULT_SYNC_TIMEOUT" = "1" && test "$HAVE_cmap" = "1") +AC_DEFINE_UNQUOTED(USE_PACEMAKERD_API, $HAVE_pacemakerd_api, Turn on synchronization between sbd & pacemakerd) +AM_CONDITIONAL(USE_PACEMAKERD_API, test "$HAVE_pacemakerd_api" = "1") + CONFIGDIR="" AC_ARG_WITH(configdir, [ --with-configdir=DIR diff --git a/src/sbd-pacemaker.c b/src/sbd-pacemaker.c index 6e53557..1243bfc 100644 --- a/src/sbd-pacemaker.c +++ b/src/sbd-pacemaker.c @@ -83,6 +83,62 @@ pe_free_working_set(pe_working_set_t *data_set) #endif +static void clean_up(int rc); + +#if USE_PACEMAKERD_API +#include + +static pcmk_ipc_api_t *pacemakerd_api = NULL; +static time_t last_ok = (time_t) 0; + +static void +pacemakerd_event_cb(pcmk_ipc_api_t *pacemakerd_api, + enum 
pcmk_ipc_event event_type, crm_exit_t status, + void *event_data, void *user_data) +{ + pcmk_pacemakerd_api_reply_t *reply = event_data; + + switch (event_type) { + case pcmk_ipc_event_disconnect: + /* Unexpected */ + cl_log(LOG_ERR, "Lost connection to pacemakerd\n"); + return; + + case pcmk_ipc_event_reply: + break; + + default: + return; + } + + if (status != CRM_EX_OK) { + cl_log(LOG_ERR, "Bad reply from pacemakerd: %s", + crm_exit_str(status)); + return; + } + + if (reply->reply_type != pcmk_pacemakerd_reply_ping) { + cl_log(LOG_ERR, "Unknown reply type %d from pacemakerd\n", + reply->reply_type); + } else { + if ((reply->data.ping.last_good != (time_t) 0) && + (reply->data.ping.status == pcmk_rc_ok)) { + switch (reply->data.ping.state) { + case pcmk_pacemakerd_state_running: + case pcmk_pacemakerd_state_shutting_down: + last_ok = reply->data.ping.last_good; + break; + case pcmk_pacemakerd_state_shutdown_complete: + clean_up(EXIT_PCMK_SERVANT_GRACEFUL_SHUTDOWN); + break; + default: + break; + } + } + } +} +#endif + extern int disk_count; static void clean_up(int rc); @@ -133,10 +189,13 @@ mon_cib_connection_destroy(gpointer user_data) cib->cmds->signoff(cib); /* retrigger as last one might have been skipped */ mon_refresh_state(NULL); + +#if !USE_PACEMAKERD_API if (pcmk_clean_shutdown) { /* assume a graceful pacemaker-shutdown */ clean_up(EXIT_PCMK_SERVANT_GRACEFUL_SHUTDOWN); } +#endif /* getting here we aren't sure about the pacemaker-state so try to use the timeout to reconnect and get everything sorted out again @@ -196,6 +255,13 @@ mon_timer_notify(gpointer data) g_source_remove(timer_id_notify); } +#if USE_PACEMAKERD_API + { + time_t now = time(NULL); + + if ((last_ok <= now) && (now - last_ok < timeout_watchdog)) { +#endif + if (cib_connected) { if (counter == counter_max) { mon_retrieve_current_cib(); @@ -207,6 +273,16 @@ mon_timer_notify(gpointer data) counter++; } } + +#if USE_PACEMAKERD_API + } + } + if (pcmk_connect_ipc(pacemakerd_api, + 
pcmk_ipc_dispatch_main) == pcmk_rc_ok) { + pcmk_pacemakerd_api_ping(pacemakerd_api, crm_system_name); + } +#endif + timer_id_notify = g_timeout_add(timeout_loop * 1000, mon_timer_notify, NULL); return FALSE; } @@ -526,6 +602,14 @@ clean_up(int rc) cib = NULL; } +#if USE_PACEMAKERD_API + if (pacemakerd_api != NULL) { + pcmk_ipc_api_t *capi = pacemakerd_api; + pacemakerd_api = NULL; // Ensure we can't free this twice + pcmk_free_ipc_api(capi); + } +#endif + if (rc >= 0) { exit(rc); } @@ -535,11 +619,11 @@ clean_up(int rc) int servant_pcmk(const char *diskname, int mode, const void* argp) { - int exit_code = 0; + int exit_code = 0; - crm_system_name = strdup("sbd:pcmk"); - cl_log(LOG_NOTICE, "Monitoring Pacemaker health"); - set_proc_title("sbd: watcher: Pacemaker"); + crm_system_name = strdup("sbd:pcmk"); + cl_log(LOG_NOTICE, "Monitoring Pacemaker health"); + set_proc_title("sbd: watcher: Pacemaker"); setenv("PCMK_watchdog", "true", 1); if(debug == 0) { @@ -548,12 +632,40 @@ servant_pcmk(const char *diskname, int mode, const void* argp) } - if (data_set == NULL) { - data_set = pe_new_working_set(); - } - if (data_set == NULL) { - return -1; - } + if (data_set == NULL) { + data_set = pe_new_working_set(); + } + if (data_set == NULL) { + return -1; + } + +#if USE_PACEMAKERD_API + { + int rc; + + rc = pcmk_new_ipc_api(&pacemakerd_api, pcmk_ipc_pacemakerd); + if (pacemakerd_api == NULL) { + cl_log(LOG_ERR, "Could not connect to pacemakerd: %s\n", + pcmk_rc_str(rc)); + return -1; + } + pcmk_register_ipc_callback(pacemakerd_api, pacemakerd_event_cb, NULL); + do { + rc = pcmk_connect_ipc(pacemakerd_api, pcmk_ipc_dispatch_main); + if (rc != pcmk_rc_ok) { + cl_log(LOG_DEBUG, "Could not connect to pacemakerd: %s\n", + pcmk_rc_str(rc)); + sleep(reconnect_msec / 1000); + } + } while (rc != pcmk_rc_ok); + /* send a ping to pacemakerd to wake it up */ + pcmk_pacemakerd_api_ping(pacemakerd_api, crm_system_name); + /* cib should come up now as well so it's time + * to have the 
inquisitor have a closer look + */ + notify_parent(); + } +#endif if (current_cib == NULL) { cib = cib_new(); -- 1.8.3.1