commit e073613f0727a3646732d0d9bb4f2050017476b3
Author: Andrew Beekhof <andrew@beekhof.net>
Date:   Tue Oct 28 10:32:02 2014 +1100

    Fix: watchdog: Allow startup without sbd

diff --git a/crmd/control.c b/crmd/control.c
index 0332f10..8cc1cfa 100644
--- a/crmd/control.c
+++ b/crmd/control.c
@@ -874,7 +874,7 @@ pe_cluster_option crmd_opts[] = {
 	{ "crmd-integration-timeout", NULL, "time", NULL, "3min", &check_timer, "*** Advanced Use Only ***.", "If need to adjust this value, it probably indicates the presence of a bug." },
 	{ "crmd-finalization-timeout", NULL, "time", NULL, "30min", &check_timer, "*** Advanced Use Only ***.", "If you need to adjust this value, it probably indicates the presence of a bug." },
 	{ "crmd-transition-delay", NULL, "time", NULL, "0s", &check_timer, "*** Advanced Use Only ***\nEnabling this option will slow down cluster recovery under all conditions", "Delay cluster recovery for the configured interval to allow for additional/related events to occur.\nUseful if your configuration is sensitive to the order in which ping updates arrive." },
-	{ "stonith-watchdog-timeout", NULL, "time", NULL, "0s", &check_timer,
+	{ "stonith-watchdog-timeout", NULL, "time", NULL, NULL, &check_timer,
 	  "How long to wait before we can assume nodes are safely down", NULL },
 	{ "no-quorum-policy", "no_quorum_policy", "enum", "stop, freeze, ignore, suicide", "stop", &check_quorum, NULL, NULL },
 
@@ -911,6 +911,8 @@ config_query_callback(xmlNode * msg, int call_id, int rc, xmlNode * output, void
     const char *value = NULL;
     GHashTable *config_hash = NULL;
     crm_time_t *now = crm_time_new(NULL);
+    long st_timeout = 0;
+    long sbd_timeout = 0;
 
     if (rc != pcmk_ok) {
         fsa_data_t *msg_data = NULL;
@@ -946,17 +948,36 @@ config_query_callback(xmlNode * msg, int call_id, int rc, xmlNode * output, void
     }
 
     value = getenv("SBD_WATCHDOG_TIMEOUT");
+    sbd_timeout = crm_get_msec(value);
 
-    if(value == NULL) {
-        value = crmd_pref(config_hash, "stonith-watchdog-timeout");
-    }
+    value = crmd_pref(config_hash, "stonith-watchdog-timeout");
+    st_timeout = crm_get_msec(value);
 
-    if(crm_get_msec(value) > 0 && !daemon_option_enabled(crm_system_name, "watchdog")) {
+    if(st_timeout > 0 && !daemon_option_enabled(crm_system_name, "watchdog")) {
         do_crm_log_always(LOG_EMERG, "Shutting down pacemaker, no watchdog device configured");
         crmd_exit(DAEMON_RESPAWN_STOP);
 
-    } else if(crm_get_msec(value) <= 0 && daemon_option_enabled(crm_system_name, "watchdog")) {
-        crm_warn("Watchdog enabled but no stonith-watchdog-timeout configured");
+    } else if(!daemon_option_enabled(crm_system_name, "watchdog")) {
+        crm_trace("Watchdog disabled");
+
+    } else if(value == NULL && sbd_timeout > 0) {
+        char *timeout = NULL;
+
+        st_timeout = 2 * sbd_timeout / 1000;
+        timeout = g_strdup_printf("%lds", st_timeout);
+        crm_notice("Setting stonith-watchdog-timeout=%s", timeout);
+
+        update_attr_delegate(fsa_cib_conn, cib_none, XML_CIB_TAG_CRMCONFIG, NULL, NULL, NULL, NULL,
+                             "stonith-watchdog-timeout", timeout, FALSE, NULL, NULL);
+        free(timeout);
+
+    } else if(st_timeout <= 0) {
+        crm_notice("Watchdog enabled but stonith-watchdog-timeout is disabled");
+
+    } else if(st_timeout < sbd_timeout) {
+        do_crm_log_always(LOG_EMERG, "Shutting down pacemaker, stonith-watchdog-timeout (%ldms) is too short (must be greater than %ldms)",
+                          st_timeout, sbd_timeout);
+        crmd_exit(DAEMON_RESPAWN_STOP);
     }
 
     value = crmd_pref(config_hash, "no-quorum-policy");
diff --git a/fencing/main.c b/fencing/main.c
index fe6560d..2694452 100644
--- a/fencing/main.c
+++ b/fencing/main.c
@@ -1003,7 +1003,8 @@ update_cib_cache_cb(const char *event, xmlNode * msg)
     }
 
     if(daemon_option_enabled(crm_system_name, "watchdog")) {
-        const char *value = getenv("SBD_WATCHDOG_TIMEOUT");
+        const char *value = NULL;
+        long timeout_ms = 0;
 
         if(value == NULL) {
             stonith_watchdog_xml = get_xpath_object("//nvpair[@name='stonith-watchdog-timeout']", local_cib, LOG_TRACE);
@@ -1013,10 +1014,12 @@ update_cib_cache_cb(const char *event, xmlNode * msg)
         }
 
         if(value) {
-            stonith_watchdog_timeout_ms = crm_get_msec(value);
+            timeout_ms = crm_get_msec(value);
+        }
 
-        } else {
-            stonith_watchdog_timeout_ms = 0;
+        if(timeout_ms != stonith_watchdog_timeout_ms) {
+            crm_notice("New watchdog timeout %lds (was %lds)", timeout_ms/1000, stonith_watchdog_timeout_ms/1000);
+            stonith_watchdog_timeout_ms = timeout_ms;
         }
     }
 
diff --git a/lib/common/utils.c b/lib/common/utils.c
index 6b8b12c..eacd8e9 100644
--- a/lib/common/utils.c
+++ b/lib/common/utils.c
@@ -286,6 +286,9 @@ cluster_option(GHashTable * options, gboolean(*validate) (const char *),
 
         if (options == NULL) {
             return def_value;
+
+        } else if(def_value == NULL) {
+            return def_value;
         }
 
         g_hash_table_insert(options, strdup(name), strdup(def_value));
@@ -319,7 +322,6 @@ get_cluster_pref(GHashTable * options, pe_cluster_option * option_list, int len,
         }
     }
     CRM_CHECK(found, crm_err("No option named: %s", name));
-    CRM_ASSERT(value != NULL);
     return value;
 }
 
diff --git a/mcp/pacemaker.c b/mcp/pacemaker.c
index c7852c3..fa2c707 100644
--- a/mcp/pacemaker.c
+++ b/mcp/pacemaker.c
@@ -1040,6 +1040,8 @@ main(int argc, char **argv)
 
     if(pcmk_locate_sbd() > 0) {
         setenv("PCMK_watchdog", "true", 1);
+    } else {
+        setenv("PCMK_watchdog", "false", 1);
     }
 
     find_and_track_existing_processes();