commit e073613f0727a3646732d0d9bb4f2050017476b3 Author: Andrew Beekhof Date: Tue Oct 28 10:32:02 2014 +1100 Fix: watchdog: Allow startup without sbd diff --git a/crmd/control.c b/crmd/control.c index 0332f10..8cc1cfa 100644 --- a/crmd/control.c +++ b/crmd/control.c @@ -874,7 +874,7 @@ pe_cluster_option crmd_opts[] = { { "crmd-integration-timeout", NULL, "time", NULL, "3min", &check_timer, "*** Advanced Use Only ***.", "If need to adjust this value, it probably indicates the presence of a bug." }, { "crmd-finalization-timeout", NULL, "time", NULL, "30min", &check_timer, "*** Advanced Use Only ***.", "If you need to adjust this value, it probably indicates the presence of a bug." }, { "crmd-transition-delay", NULL, "time", NULL, "0s", &check_timer, "*** Advanced Use Only ***\nEnabling this option will slow down cluster recovery under all conditions", "Delay cluster recovery for the configured interval to allow for additional/related events to occur.\nUseful if your configuration is sensitive to the order in which ping updates arrive." }, - { "stonith-watchdog-timeout", NULL, "time", NULL, "0s", &check_timer, + { "stonith-watchdog-timeout", NULL, "time", NULL, NULL, &check_timer, "How long to wait before we can assume nodes are safely down", NULL }, { "no-quorum-policy", "no_quorum_policy", "enum", "stop, freeze, ignore, suicide", "stop", &check_quorum, NULL, NULL }, @@ -911,6 +911,8 @@ config_query_callback(xmlNode * msg, int call_id, int rc, xmlNode * output, void const char *value = NULL; GHashTable *config_hash = NULL; crm_time_t *now = crm_time_new(NULL); + long st_timeout = 0; + long sbd_timeout = 0; if (rc != pcmk_ok) { fsa_data_t *msg_data = NULL; @@ -946,17 +948,36 @@ config_query_callback(xmlNode * msg, int call_id, int rc, xmlNode * output, void } value = getenv("SBD_WATCHDOG_TIMEOUT"); + sbd_timeout = crm_get_msec(value); - if(value == NULL) { - value = crmd_pref(config_hash, "stonith-watchdog-timeout"); - } + value = crmd_pref(config_hash, "stonith-watchdog-timeout"); + st_timeout = crm_get_msec(value); - if(crm_get_msec(value) > 0 && !daemon_option_enabled(crm_system_name, "watchdog")) { + if(st_timeout > 0 && !daemon_option_enabled(crm_system_name, "watchdog")) { do_crm_log_always(LOG_EMERG, "Shutting down pacemaker, no watchdog device configured"); crmd_exit(DAEMON_RESPAWN_STOP); - } else if(crm_get_msec(value) <= 0 && daemon_option_enabled(crm_system_name, "watchdog")) { - crm_warn("Watchdog enabled but no stonith-watchdog-timeout configured"); + } else if(!daemon_option_enabled(crm_system_name, "watchdog")) { + crm_trace("Watchdog disabled"); + + } else if(value == NULL && sbd_timeout > 0) { + char *timeout = NULL; + + st_timeout = 2 * sbd_timeout / 1000; + timeout = g_strdup_printf("%lds", st_timeout); + crm_notice("Setting stonith-watchdog-timeout=%s", timeout); + + update_attr_delegate(fsa_cib_conn, cib_none, XML_CIB_TAG_CRMCONFIG, NULL, NULL, NULL, NULL, + "stonith-watchdog-timeout", timeout, FALSE, NULL, NULL); + free(timeout); + + } else if(st_timeout <= 0) { + crm_notice("Watchdog enabled but stonith-watchdog-timeout is disabled"); + + } else if(st_timeout < sbd_timeout) { + do_crm_log_always(LOG_EMERG, "Shutting down pacemaker, stonith-watchdog-timeout (%ldms) is too short (must be greater than %ldms)", + st_timeout, sbd_timeout); + crmd_exit(DAEMON_RESPAWN_STOP); } value = crmd_pref(config_hash, "no-quorum-policy"); diff --git a/fencing/main.c b/fencing/main.c index fe6560d..2694452 100644 --- a/fencing/main.c +++ b/fencing/main.c @@ -1003,7 +1003,8 @@ update_cib_cache_cb(const char *event, xmlNode * msg) } if(daemon_option_enabled(crm_system_name, "watchdog")) { - const char *value = getenv("SBD_WATCHDOG_TIMEOUT"); + const char *value = NULL; + long timeout_ms = 0; if(value == NULL) { stonith_watchdog_xml = get_xpath_object("//nvpair[@name='stonith-watchdog-timeout']", local_cib, LOG_TRACE); @@ -1013,10 +1014,12 @@ update_cib_cache_cb(const char *event, xmlNode * msg) } if(value) { - stonith_watchdog_timeout_ms = crm_get_msec(value); + timeout_ms = crm_get_msec(value); + } - } else { - stonith_watchdog_timeout_ms = 0; + if(timeout_ms != stonith_watchdog_timeout_ms) { + crm_notice("New watchdog timeout %lds (was %lds)", timeout_ms/1000, stonith_watchdog_timeout_ms/1000); + stonith_watchdog_timeout_ms = timeout_ms; } } diff --git a/lib/common/utils.c b/lib/common/utils.c index 6b8b12c..eacd8e9 100644 --- a/lib/common/utils.c +++ b/lib/common/utils.c @@ -286,6 +286,9 @@ cluster_option(GHashTable * options, gboolean(*validate) (const char *), if (options == NULL) { return def_value; + + } else if(def_value == NULL) { + return def_value; } g_hash_table_insert(options, strdup(name), strdup(def_value)); @@ -319,7 +322,6 @@ get_cluster_pref(GHashTable * options, pe_cluster_option * option_list, int len, } } CRM_CHECK(found, crm_err("No option named: %s", name)); - CRM_ASSERT(value != NULL); return value; } diff --git a/mcp/pacemaker.c b/mcp/pacemaker.c index c7852c3..fa2c707 100644 --- a/mcp/pacemaker.c +++ b/mcp/pacemaker.c @@ -1040,6 +1040,8 @@ main(int argc, char **argv) if(pcmk_locate_sbd() > 0) { setenv("PCMK_watchdog", "true", 1); + } else { + setenv("PCMK_watchdog", "false", 1); } find_and_track_existing_processes();