commit e073613f0727a3646732d0d9bb4f2050017476b3
Author: Andrew Beekhof <andrew@beekhof.net>
Date: Tue Oct 28 10:32:02 2014 +1100
Fix: watchdog: Allow startup without sbd
diff --git a/crmd/control.c b/crmd/control.c
index 0332f10..8cc1cfa 100644
--- a/crmd/control.c
+++ b/crmd/control.c
@@ -874,7 +874,7 @@ pe_cluster_option crmd_opts[] = {
{ "crmd-integration-timeout", NULL, "time", NULL, "3min", &check_timer, "*** Advanced Use Only ***.", "If need to adjust this value, it probably indicates the presence of a bug." },
{ "crmd-finalization-timeout", NULL, "time", NULL, "30min", &check_timer, "*** Advanced Use Only ***.", "If you need to adjust this value, it probably indicates the presence of a bug." },
{ "crmd-transition-delay", NULL, "time", NULL, "0s", &check_timer, "*** Advanced Use Only ***\nEnabling this option will slow down cluster recovery under all conditions", "Delay cluster recovery for the configured interval to allow for additional/related events to occur.\nUseful if your configuration is sensitive to the order in which ping updates arrive." },
- { "stonith-watchdog-timeout", NULL, "time", NULL, "0s", &check_timer,
+ { "stonith-watchdog-timeout", NULL, "time", NULL, NULL, &check_timer,
"How long to wait before we can assume nodes are safely down", NULL },
{ "no-quorum-policy", "no_quorum_policy", "enum", "stop, freeze, ignore, suicide", "stop", &check_quorum, NULL, NULL },
@@ -911,6 +911,8 @@ config_query_callback(xmlNode * msg, int call_id, int rc, xmlNode * output, void
const char *value = NULL;
GHashTable *config_hash = NULL;
crm_time_t *now = crm_time_new(NULL);
+ long st_timeout = 0;
+ long sbd_timeout = 0;
if (rc != pcmk_ok) {
fsa_data_t *msg_data = NULL;
@@ -946,17 +948,36 @@ config_query_callback(xmlNode * msg, int call_id, int rc, xmlNode * output, void
}
value = getenv("SBD_WATCHDOG_TIMEOUT");
+ sbd_timeout = crm_get_msec(value);
- if(value == NULL) {
- value = crmd_pref(config_hash, "stonith-watchdog-timeout");
- }
+ value = crmd_pref(config_hash, "stonith-watchdog-timeout");
+ st_timeout = crm_get_msec(value);
- if(crm_get_msec(value) > 0 && !daemon_option_enabled(crm_system_name, "watchdog")) {
+ if(st_timeout > 0 && !daemon_option_enabled(crm_system_name, "watchdog")) {
do_crm_log_always(LOG_EMERG, "Shutting down pacemaker, no watchdog device configured");
crmd_exit(DAEMON_RESPAWN_STOP);
- } else if(crm_get_msec(value) <= 0 && daemon_option_enabled(crm_system_name, "watchdog")) {
- crm_warn("Watchdog enabled but no stonith-watchdog-timeout configured");
+ } else if(!daemon_option_enabled(crm_system_name, "watchdog")) {
+ crm_trace("Watchdog disabled");
+
+ } else if(value == NULL && sbd_timeout > 0) {
+ char *timeout = NULL;
+
+ st_timeout = 2 * sbd_timeout / 1000;
+ timeout = g_strdup_printf("%lds", st_timeout);
+ crm_notice("Setting stonith-watchdog-timeout=%s", timeout);
+
+ update_attr_delegate(fsa_cib_conn, cib_none, XML_CIB_TAG_CRMCONFIG, NULL, NULL, NULL, NULL,
+ "stonith-watchdog-timeout", timeout, FALSE, NULL, NULL);
+ free(timeout);
+
+ } else if(st_timeout <= 0) {
+ crm_notice("Watchdog enabled but stonith-watchdog-timeout is disabled");
+
+ } else if(st_timeout < sbd_timeout) {
+ do_crm_log_always(LOG_EMERG, "Shutting down pacemaker, stonith-watchdog-timeout (%ldms) is too short (must be greater than %ldms)",
+ st_timeout, sbd_timeout);
+ crmd_exit(DAEMON_RESPAWN_STOP);
}
value = crmd_pref(config_hash, "no-quorum-policy");
diff --git a/fencing/main.c b/fencing/main.c
index fe6560d..2694452 100644
--- a/fencing/main.c
+++ b/fencing/main.c
@@ -1003,7 +1003,8 @@ update_cib_cache_cb(const char *event, xmlNode * msg)
}
if(daemon_option_enabled(crm_system_name, "watchdog")) {
- const char *value = getenv("SBD_WATCHDOG_TIMEOUT");
+ const char *value = NULL;
+ long timeout_ms = 0;
if(value == NULL) {
stonith_watchdog_xml = get_xpath_object("//nvpair[@name='stonith-watchdog-timeout']", local_cib, LOG_TRACE);
@@ -1013,10 +1014,12 @@ update_cib_cache_cb(const char *event, xmlNode * msg)
}
if(value) {
- stonith_watchdog_timeout_ms = crm_get_msec(value);
+ timeout_ms = crm_get_msec(value);
+ }
- } else {
- stonith_watchdog_timeout_ms = 0;
+ if(timeout_ms != stonith_watchdog_timeout_ms) {
+ crm_notice("New watchdog timeout %lds (was %lds)", timeout_ms/1000, stonith_watchdog_timeout_ms/1000);
+ stonith_watchdog_timeout_ms = timeout_ms;
}
}
diff --git a/lib/common/utils.c b/lib/common/utils.c
index 6b8b12c..eacd8e9 100644
--- a/lib/common/utils.c
+++ b/lib/common/utils.c
@@ -286,6 +286,9 @@ cluster_option(GHashTable * options, gboolean(*validate) (const char *),
if (options == NULL) {
return def_value;
+
+ } else if(def_value == NULL) {
+ return def_value;
}
g_hash_table_insert(options, strdup(name), strdup(def_value));
@@ -319,7 +322,6 @@ get_cluster_pref(GHashTable * options, pe_cluster_option * option_list, int len,
}
}
CRM_CHECK(found, crm_err("No option named: %s", name));
- CRM_ASSERT(value != NULL);
return value;
}
diff --git a/mcp/pacemaker.c b/mcp/pacemaker.c
index c7852c3..fa2c707 100644
--- a/mcp/pacemaker.c
+++ b/mcp/pacemaker.c
@@ -1040,6 +1040,8 @@ main(int argc, char **argv)
if(pcmk_locate_sbd() > 0) {
setenv("PCMK_watchdog", "true", 1);
+ } else {
+ setenv("PCMK_watchdog", "false", 1);
}
find_and_track_existing_processes();