From 4c3e4049b08799094a64dac289a48deef4d3d916 Mon Sep 17 00:00:00 2001 From: Klaus Wenninger Date: Fri, 24 Jul 2020 14:31:01 +0200 Subject: [PATCH] Fix: sbd-cluster: match qdevice-sync_timeout against wd-timeout --- configure.ac | 13 +++ src/sbd-cluster.c | 252 +++++++++++++++++++++++++++++++++++++++++++++--------- 2 files changed, 223 insertions(+), 42 deletions(-) diff --git a/configure.ac b/configure.ac index 3391c5f..23547cf 100644 --- a/configure.ac +++ b/configure.ac @@ -109,6 +109,12 @@ AC_TEST_NO_QUORUM_POLICY(no_quorum_demote) dnl check for new pe-API AC_CHECK_FUNCS(pe_new_working_set) +dnl check if votequorum comes with default for qdevice-sync_timeout +AC_CHECK_DECLS([VOTEQUORUM_QDEVICE_DEFAULT_SYNC_TIMEOUT], + HAVE_DECL_VOTEQUORUM_QDEVICE_DEFAULT_SYNC_TIMEOUT=1, + HAVE_DECL_VOTEQUORUM_QDEVICE_DEFAULT_SYNC_TIMEOUT=0, + [#include ]) + if test "$missing" = "yes"; then AC_MSG_ERROR([Missing required libraries or functions.]) fi @@ -140,6 +146,13 @@ AM_CONDITIONAL(CHECK_TWO_NODE, test "$HAVE_cmap" = "1") AC_DEFINE_UNQUOTED(CHECK_VOTEQUORUM_HANDLE, $HAVE_votequorum, Turn on periodic checking of votequorum-handle) AM_CONDITIONAL(CHECK_VOTEQUORUM_HANDLE, test "$HAVE_votequorum" = "1") +AC_DEFINE_UNQUOTED(CHECK_QDEVICE_SYNC_TIMEOUT, + ($HAVE_DECL_VOTEQUORUM_QDEVICE_DEFAULT_SYNC_TIMEOUT && $HAVE_cmap), + Turn on checking if watchdog-timeout and qdevice-sync_timeout are matching) +AM_CONDITIONAL(CHECK_QDEVICE_SYNC_TIMEOUT, + test "$HAVE_DECL_VOTEQUORUM_QDEVICE_DEFAULT_SYNC_TIMEOUT" = "1" && + test "$HAVE_cmap" = "1") + CONFIGDIR="" AC_ARG_WITH(configdir, [ --with-configdir=DIR diff --git a/src/sbd-cluster.c b/src/sbd-cluster.c index 13fa580..b6c5512 100644 --- a/src/sbd-cluster.c +++ b/src/sbd-cluster.c @@ -33,7 +33,7 @@ #include #include -#if CHECK_TWO_NODE +#if CHECK_TWO_NODE || CHECK_QDEVICE_SYNC_TIMEOUT #include #endif @@ -86,11 +86,20 @@ sbd_plugin_membership_dispatch(cpg_handle_t handle, static votequorum_handle_t votequorum_handle = 0; #endif +#if CHECK_TWO_NODE static bool two_node = false; +#endif static bool ever_seen_both = false; static int cpg_membership_entries = -1; -#if CHECK_TWO_NODE +#if CHECK_QDEVICE_SYNC_TIMEOUT +#include +static bool using_qdevice = false; +static uint32_t qdevice_sync_timeout = /* in seconds */ + VOTEQUORUM_QDEVICE_DEFAULT_SYNC_TIMEOUT / 1000; +#endif + +#if CHECK_TWO_NODE || CHECK_QDEVICE_SYNC_TIMEOUT #include static cmap_handle_t cmap_handle = 0; @@ -102,28 +111,59 @@ void sbd_cpg_membership_health_update() { if(cpg_membership_entries > 0) { - bool quorum_is_suspect = +#if CHECK_TWO_NODE + bool quorum_is_suspect_two_node = (two_node && ever_seen_both && cpg_membership_entries == 1); +#endif +#if CHECK_QDEVICE_SYNC_TIMEOUT + bool quorum_is_suspect_qdevice_timing = + using_qdevice && (qdevice_sync_timeout > timeout_watchdog); +#endif - if (!quorum_is_suspect) { + do { +#if CHECK_TWO_NODE + if (quorum_is_suspect_two_node) { + /* Alternative would be asking votequorum for number of votes. + * Using pacemaker's cpg as source for number of active nodes + * avoids binding to an additional library, is definitely + * less code to write and we wouldn't have to combine data + * from 3 sources (cmap, cpg & votequorum) in a potentially + * racy environment. + */ + set_servant_health(pcmk_health_noquorum, LOG_WARNING, + "Connected to %s but requires both nodes present", + name_for_cluster_type(get_cluster_type()) + ); + break; + } +#endif +#if CHECK_QDEVICE_SYNC_TIMEOUT + if (quorum_is_suspect_qdevice_timing) { + /* We can't really trust quorum info as qdevice-sync_timeout + * makes reaction of quorum too sluggish for our + * watchdog-timeout. + */ + set_servant_health(pcmk_health_noquorum, LOG_WARNING, + "Connected to %s but quorum using qdevice is distrusted " + "for SBD as qdevice-sync_timeout (%ds) > watchdog-timeout " + "(%lus).", + name_for_cluster_type(get_cluster_type()), + qdevice_sync_timeout, timeout_watchdog + ); + break; + } +#endif set_servant_health(pcmk_health_online, LOG_INFO, - "Connected to %s (%u members)", - name_for_cluster_type(get_cluster_type()), - cpg_membership_entries - ); - } else { - /* Alternative would be asking votequorum for number of votes. - * Using pacemaker's cpg as source for number of active nodes - * avoids binding to an additional library, is definitely - * less code to write and we wouldn't have to combine data - * from 3 sources (cmap, cpq & votequorum) in a potentially - * racy environment. - */ - set_servant_health(pcmk_health_noquorum, LOG_WARNING, - "Connected to %s but requires both nodes present", - name_for_cluster_type(get_cluster_type()) - ); - } + "Connected to %s (%u members)%s", + name_for_cluster_type(get_cluster_type()), + cpg_membership_entries, +#if CHECK_QDEVICE_SYNC_TIMEOUT + using_qdevice?" using qdevice for quorum":"" +#else + "" +#endif + ); + } while (false); if (cpg_membership_entries > 1) { ever_seen_both = true; @@ -146,7 +186,7 @@ sbd_cpg_membership_dispatch(cpg_handle_t handle, notify_parent(); } -#if CHECK_TWO_NODE +#if CHECK_TWO_NODE || CHECK_QDEVICE_SYNC_TIMEOUT static void sbd_cmap_notify_fn( cmap_handle_t cmap_handle, cmap_track_handle_t cmap_track_handle, @@ -156,21 +196,99 @@ static void sbd_cmap_notify_fn( struct cmap_notify_value old_val, void *user_data) { - if (new_val.type == CMAP_VALUETYPE_UINT8) { - switch (event) { - case CMAP_TRACK_ADD: - case CMAP_TRACK_MODIFY: - two_node = *((uint8_t *) new_val.data); - break; - case CMAP_TRACK_DELETE: - two_node = false; - break; - default: - return; - } - sbd_cpg_membership_health_update(); - notify_parent(); + switch (event) { + case CMAP_TRACK_ADD: + case CMAP_TRACK_MODIFY: + switch (new_val.type) { + case CMAP_VALUETYPE_UINT8: +#if CHECK_TWO_NODE + if (!strcmp(key_name, "quorum.two_node")) { + two_node = *((uint8_t *) new_val.data); + } else { + return; + } + break; +#else + return; +#endif + case CMAP_VALUETYPE_STRING: +#if CHECK_QDEVICE_SYNC_TIMEOUT + if (!strcmp(key_name, "quorum.device.model")) { + using_qdevice = + ((new_val.data) && strlen((char *) new_val.data)); + } else { + return; + } + break; +#else + return; +#endif + case CMAP_VALUETYPE_UINT32: +#if CHECK_QDEVICE_SYNC_TIMEOUT + if (!strcmp(key_name, "quorum.device.sync_timeout")) { + if (new_val.data) { + qdevice_sync_timeout = + *((uint32_t *) new_val.data) / 1000; + } else { + qdevice_sync_timeout = + VOTEQUORUM_QDEVICE_DEFAULT_SYNC_TIMEOUT / 1000; + } + } else { + return; + } + break; +#else + return; +#endif + default: + return; + } + break; + case CMAP_TRACK_DELETE: + switch (new_val.type) { + case CMAP_VALUETYPE_UINT8: +#if CHECK_TWO_NODE + if (!strcmp(key_name, "quorum.two_node")) { + two_node = false; + } else { + return; + } + break; +#else + return; +#endif + case CMAP_VALUETYPE_STRING: +#if CHECK_QDEVICE_SYNC_TIMEOUT + if (!strcmp(key_name, "quorum.device.model")) { + using_qdevice = false; + } else { + return; + } + break; +#else + return; +#endif + case CMAP_VALUETYPE_UINT32: +#if CHECK_QDEVICE_SYNC_TIMEOUT + if (!strcmp(key_name, "quorum.device.sync_timeout")) { + qdevice_sync_timeout = + VOTEQUORUM_QDEVICE_DEFAULT_SYNC_TIMEOUT / 1000; + } else { + return; + } + break; +#else + return; +#endif + default: + return; + } + break; + default: + return; } + sbd_cpg_membership_health_update(); + notify_parent(); } static gboolean @@ -200,9 +318,14 @@ cmap_destroy(void) } static gboolean -sbd_get_two_node(void) +verify_against_cmap_config(void) { +#if CHECK_TWO_NODE uint8_t two_node_u8 = 0; +#endif +#if CHECK_QDEVICE_SYNC_TIMEOUT + char *qdevice_model = NULL; +#endif int cmap_fd; if (!track_handle) { @@ -211,12 +334,31 @@ sbd_get_two_node(void) goto out; } +#if CHECK_TWO_NODE if (cmap_track_add(cmap_handle, "quorum.two_node", CMAP_TRACK_DELETE|CMAP_TRACK_MODIFY|CMAP_TRACK_ADD, sbd_cmap_notify_fn, NULL, &track_handle) != CS_OK) { cl_log(LOG_WARNING, "Failed adding CMAP tracker for 2Node-mode\n"); goto out; } +#endif + +#if CHECK_QDEVICE_SYNC_TIMEOUT + if (cmap_track_add(cmap_handle, "quorum.device.model", + CMAP_TRACK_DELETE|CMAP_TRACK_MODIFY|CMAP_TRACK_ADD, + sbd_cmap_notify_fn, NULL, &track_handle) != CS_OK) { + cl_log(LOG_WARNING, "Failed adding CMAP tracker for qdevice-model\n"); + goto out; + } + + if (cmap_track_add(cmap_handle, "quorum.device.sync_timeout", + CMAP_TRACK_DELETE|CMAP_TRACK_MODIFY|CMAP_TRACK_ADD, + sbd_cmap_notify_fn, NULL, &track_handle) != CS_OK) { + cl_log(LOG_WARNING, + "Failed adding CMAP tracker for qdevice-sync_timeout\n"); + goto out; + } +#endif /* add the tracker to mainloop */ if (cmap_fd_get(cmap_handle, &cmap_fd) != CS_OK) { @@ -232,13 +374,39 @@ sbd_get_two_node(void) g_source_attach(cmap_source, NULL); } - if (cmap_get_uint8(cmap_handle, "quorum.two_node", &two_node_u8) == CS_OK) { +#if CHECK_TWO_NODE + if (cmap_get_uint8(cmap_handle, "quorum.two_node", &two_node_u8) + == CS_OK) { cl_log(two_node_u8? LOG_NOTICE : LOG_INFO, "Corosync is%s in 2Node-mode", two_node_u8?"":" not"); two_node = two_node_u8; } else { cl_log(LOG_INFO, "quorum.two_node not present in cmap\n"); } +#endif + +#if CHECK_QDEVICE_SYNC_TIMEOUT + if (cmap_get_string(cmap_handle, "quorum.device.model", + &qdevice_model) == CS_OK) { + using_qdevice = qdevice_model && strlen(qdevice_model); + cl_log(using_qdevice? LOG_NOTICE : LOG_INFO, + "Corosync is%s using qdevice", using_qdevice?"":" not"); + } else { + cl_log(LOG_INFO, "quorum.device.model not present in cmap\n"); + } + + if (cmap_get_uint32(cmap_handle, "quorum.device.sync_timeout", + &qdevice_sync_timeout) == CS_OK) { + qdevice_sync_timeout /= 1000; + cl_log(LOG_INFO, + "Corosync is using qdevice-sync_timeout=%ds", + qdevice_sync_timeout); + } else { + cl_log(LOG_INFO, + "quorum.device.sync_timeout not present in cmap\n"); + } +#endif + return TRUE; out: @@ -331,15 +499,15 @@ sbd_membership_connect(void) } else { cl_log(LOG_INFO, "Attempting connection to %s", name_for_cluster_type(stack)); -#if SUPPORT_COROSYNC && CHECK_TWO_NODE - if (sbd_get_two_node()) { +#if SUPPORT_COROSYNC && (CHECK_TWO_NODE || CHECK_QDEVICE_SYNC_TIMEOUT) + if (verify_against_cmap_config()) { #endif if(crm_cluster_connect(&cluster)) { connected = true; } -#if SUPPORT_COROSYNC && CHECK_TWO_NODE +#if SUPPORT_COROSYNC && (CHECK_TWO_NODE || CHECK_QDEVICE_SYNC_TIMEOUT) } #endif } @@ -362,7 +530,7 @@ sbd_membership_destroy(gpointer user_data) cl_log(LOG_WARNING, "Lost connection to %s", name_for_cluster_type(get_cluster_type())); if (get_cluster_type() != pcmk_cluster_unknown) { -#if SUPPORT_COROSYNC && CHECK_TWO_NODE +#if SUPPORT_COROSYNC && (CHECK_TWO_NODE || CHECK_QDEVICE_SYNC_TIMEOUT) cmap_destroy(); #endif } -- 1.8.3.1