Blame SOURCES/050-stonith-fail-handling.patch

60de42
From b005b4f2809020304862000326b22cded7b14377 Mon Sep 17 00:00:00 2001
60de42
From: Ken Gaillot <kgaillot@redhat.com>
60de42
Date: Thu, 6 Apr 2017 15:51:47 -0500
60de42
Subject: [PATCH 01/13] Fix: libpe_status: guest nodes from bundles may have
60de42
 attributes
60de42
60de42
Previously, if a guest node created by a bundle had a node attribute,
60de42
pe_create_node() would get called twice, once when parsing the <node> entry and
60de42
once when parsing the <bundle>, resulting in any bundle primitive being unable
60de42
to run.
60de42
---
60de42
 lib/pengine/container.c | 15 ++++++++++++---
60de42
 1 file changed, 12 insertions(+), 3 deletions(-)
60de42
60de42
diff --git a/lib/pengine/container.c b/lib/pengine/container.c
60de42
index 127b144..054ef5f 100644
60de42
--- a/lib/pengine/container.c
60de42
+++ b/lib/pengine/container.c
60de42
@@ -390,9 +390,18 @@ create_remote_resource(
60de42
 
60de42
         // tuple->docker->fillers = g_list_append(tuple->docker->fillers, child);
60de42
 
60de42
-        // -INFINITY prevents anyone else from running here
60de42
-        node = pe_create_node(strdup(nodeid), nodeid, "remote", "-INFINITY",
60de42
-                              data_set);
60de42
+        /* Ensure a node has been created for the guest (it may have already
60de42
+         * been, if it has a permanent node attribute), and ensure its weight is
60de42
+         * -INFINITY so no other resources can run on it.
60de42
+         */
60de42
+        node = pe_find_node(data_set->nodes, nodeid);
60de42
+        if (node == NULL) {
60de42
+            node = pe_create_node(strdup(nodeid), nodeid, "remote", "-INFINITY",
60de42
+                                  data_set);
60de42
+        } else {
60de42
+            node->weight = -INFINITY;
60de42
+        }
60de42
+
60de42
         tuple->node = node_copy(node);
60de42
         tuple->node->weight = 500;
60de42
         nodeid = NULL;
60de42
-- 
60de42
1.8.3.1
60de42
60de42
60de42
From 7b89ff8b65fcdcad55676578361080eb23edb3e4 Mon Sep 17 00:00:00 2001
60de42
From: Ken Gaillot <kgaillot@redhat.com>
60de42
Date: Thu, 6 Apr 2017 16:56:52 -0500
60de42
Subject: [PATCH 02/13] Low: fencing: ignore empty 'action' parameter in fence
60de42
 devices
60de42
60de42
This makes the fix in 9c0c3d6 more comprehensive.
60de42
---
60de42
 fencing/commands.c | 5 ++++-
60de42
 1 file changed, 4 insertions(+), 1 deletion(-)
60de42
60de42
diff --git a/fencing/commands.c b/fencing/commands.c
60de42
index b4e6eb5..deec050 100644
60de42
--- a/fencing/commands.c
60de42
+++ b/fencing/commands.c
60de42
@@ -829,7 +829,10 @@ xml2device_params(const char *name, xmlNode *dev)
60de42
         crm_warn("%s has '%s' parameter, which should never be specified in configuration",
60de42
                  name, STONITH_ATTR_ACTION_OP);
60de42
 
60de42
-        if (strcmp(value, "reboot") == 0) {
60de42
+        if (*value == '\0') {
60de42
+            crm_warn("Ignoring empty '%s' parameter", STONITH_ATTR_ACTION_OP);
60de42
+
60de42
+        } else if (strcmp(value, "reboot") == 0) {
60de42
             crm_warn("Ignoring %s='reboot' (see stonith-action cluster property instead)",
60de42
                      STONITH_ATTR_ACTION_OP);
60de42
 
60de42
-- 
60de42
1.8.3.1
60de42
60de42
60de42
From 100dd5fda476ef526ac1964260252b30864d5ca7 Mon Sep 17 00:00:00 2001
60de42
From: Ken Gaillot <kgaillot@redhat.com>
60de42
Date: Fri, 7 Apr 2017 16:51:29 -0500
60de42
Subject: [PATCH 03/13] Fix: crmd: check for too many stonith failures only
60de42
 when aborting for that reason
60de42
60de42
Previously, crmd would check for too many stonith failures whenever aborting
60de42
a transition. This would lead to a new transition not being triggered when
60de42
aborting for some other unrelated reason, such as a configuration change.
60de42
60de42
Now, crmd checks for too many stonith failures only when aborting due to a new
60de42
stonith failure.
60de42
---
60de42
 crmd/crmd_utils.h   |  2 +-
60de42
 crmd/te_actions.c   | 12 ++++--------
60de42
 crmd/te_callbacks.c | 24 ++++++++++++++++++++++--
60de42
 crmd/te_utils.c     |  2 +-
60de42
 4 files changed, 28 insertions(+), 12 deletions(-)
60de42
60de42
diff --git a/crmd/crmd_utils.h b/crmd/crmd_utils.h
60de42
index a1aaad3..d2f8eb2 100644
60de42
--- a/crmd/crmd_utils.h
60de42
+++ b/crmd/crmd_utils.h
60de42
@@ -100,8 +100,8 @@ int crmd_join_phase_count(enum crm_join_phase phase);
60de42
 void crmd_join_phase_log(int level);
60de42
 
60de42
 const char *get_timer_desc(fsa_timer_t * timer);
60de42
-gboolean too_many_st_failures(void);
60de42
 void st_fail_count_reset(const char * target);
60de42
+void abort_for_stonith_failure(xmlNode *reason);
60de42
 void crmd_peer_down(crm_node_t *peer, bool full);
60de42
 
60de42
 /* Convenience macro for registering a CIB callback
60de42
diff --git a/crmd/te_actions.c b/crmd/te_actions.c
60de42
index a8ad86f..66dd16e 100644
60de42
--- a/crmd/te_actions.c
60de42
+++ b/crmd/te_actions.c
60de42
@@ -726,15 +726,11 @@ notify_crmd(crm_graph_t * graph)
60de42
         case tg_restart:
60de42
             type = "restart";
60de42
             if (fsa_state == S_TRANSITION_ENGINE) {
60de42
-                if (too_many_st_failures() == FALSE) {
60de42
-                    if (transition_timer->period_ms > 0) {
60de42
-                        crm_timer_stop(transition_timer);
60de42
-                        crm_timer_start(transition_timer);
60de42
-                    } else {
60de42
-                        event = I_PE_CALC;
60de42
-                    }
60de42
+                if (transition_timer->period_ms > 0) {
60de42
+                    crm_timer_stop(transition_timer);
60de42
+                    crm_timer_start(transition_timer);
60de42
                 } else {
60de42
-                    event = I_TE_SUCCESS;
60de42
+                    event = I_PE_CALC;
60de42
                 }
60de42
 
60de42
             } else if (fsa_state == S_POLICY_ENGINE) {
60de42
diff --git a/crmd/te_callbacks.c b/crmd/te_callbacks.c
60de42
index 6c0670c..a0aa081 100644
60de42
--- a/crmd/te_callbacks.c
60de42
+++ b/crmd/te_callbacks.c
60de42
@@ -635,7 +635,7 @@ struct st_fail_rec {
60de42
     int count;
60de42
 };
60de42
 
60de42
-gboolean
60de42
+static gboolean
60de42
 too_many_st_failures(void)
60de42
 {
60de42
     GHashTableIter iter;
60de42
@@ -694,6 +694,26 @@ st_fail_count_increment(const char *target, int rc)
60de42
     }
60de42
 }
60de42
 
60de42
+/*!
60de42
+ * \internal
60de42
+ * \brief Abort transition due to stonith failure
60de42
+ *
60de42
+ * \param[in] reason  Failed stonith action XML, or NULL
60de42
+ */
60de42
+void
60de42
+abort_for_stonith_failure(xmlNode *reason)
60de42
+{
60de42
+    enum transition_action abort_action = tg_restart;
60de42
+
60de42
+    /* If stonith repeatedly fails, we eventually give up on starting a new
60de42
+     * transition for that reason.
60de42
+     */
60de42
+    if (too_many_st_failures()) {
60de42
+        abort_action = tg_stop;
60de42
+    }
60de42
+    abort_transition(INFINITY, abort_action, "Stonith failed", reason);
60de42
+}
60de42
+
60de42
 void
60de42
 tengine_stonith_callback(stonith_t * stonith, stonith_callback_data_t * data)
60de42
 {
60de42
@@ -759,7 +779,7 @@ tengine_stonith_callback(stonith_t * stonith, stonith_callback_data_t * data)
60de42
         action->failed = TRUE;
60de42
         crm_notice("Stonith operation %d for %s failed (%s): aborting transition.",
60de42
                    call_id, target, pcmk_strerror(rc));
60de42
-        abort_transition(INFINITY, tg_restart, "Stonith failed", NULL);
60de42
+        abort_for_stonith_failure(NULL);
60de42
         st_fail_count_increment(target, rc);
60de42
     }
60de42
 
60de42
diff --git a/crmd/te_utils.c b/crmd/te_utils.c
60de42
index 3b67afe..4603307 100644
60de42
--- a/crmd/te_utils.c
60de42
+++ b/crmd/te_utils.c
60de42
@@ -162,7 +162,7 @@ fail_incompletable_stonith(crm_graph_t * graph)
60de42
 
60de42
     if (last_action != NULL) {
60de42
         crm_warn("STONITHd failure resulted in un-runnable actions");
60de42
-        abort_transition(INFINITY, tg_restart, "Stonith failure", last_action);
60de42
+        abort_for_stonith_failure(last_action);
60de42
         return TRUE;
60de42
     }
60de42
 
60de42
-- 
60de42
1.8.3.1
60de42
60de42
60de42
From 3c49a1cf86cb819eca18c841661d90fa65bcb185 Mon Sep 17 00:00:00 2001
60de42
From: Ken Gaillot <kgaillot@redhat.com>
60de42
Date: Fri, 7 Apr 2017 21:03:31 -0500
60de42
Subject: [PATCH 04/13] Low: crmd: consider target when checking stonith
60de42
 failures
60de42
60de42
Previously, if the crmd aborted a transition due to failure to fence a
60de42
particular node, a new transition would not be started if *any* node had
60de42
failed to be fenced too many times. Now, only failures of the particular target are
60de42
checked in that situation.
60de42
---
60de42
 crmd/crmd_utils.h   |  2 +-
60de42
 crmd/te_callbacks.c | 33 +++++++++++++++++++++++----------
60de42
 crmd/te_utils.c     |  2 +-
60de42
 3 files changed, 25 insertions(+), 12 deletions(-)
60de42
60de42
diff --git a/crmd/crmd_utils.h b/crmd/crmd_utils.h
60de42
index d2f8eb2..f0289d4 100644
60de42
--- a/crmd/crmd_utils.h
60de42
+++ b/crmd/crmd_utils.h
60de42
@@ -101,7 +101,7 @@ void crmd_join_phase_log(int level);
60de42
 
60de42
 const char *get_timer_desc(fsa_timer_t * timer);
60de42
 void st_fail_count_reset(const char * target);
60de42
-void abort_for_stonith_failure(xmlNode *reason);
60de42
+void abort_for_stonith_failure(const char *target, xmlNode *reason);
60de42
 void crmd_peer_down(crm_node_t *peer, bool full);
60de42
 
60de42
 /* Convenience macro for registering a CIB callback
60de42
diff --git a/crmd/te_callbacks.c b/crmd/te_callbacks.c
60de42
index a0aa081..6e306fd 100644
60de42
--- a/crmd/te_callbacks.c
60de42
+++ b/crmd/te_callbacks.c
60de42
@@ -636,7 +636,7 @@ struct st_fail_rec {
60de42
 };
60de42
 
60de42
 static gboolean
60de42
-too_many_st_failures(void)
60de42
+too_many_st_failures(const char *target)
60de42
 {
60de42
     GHashTableIter iter;
60de42
     const char *key = NULL;
60de42
@@ -646,14 +646,26 @@ too_many_st_failures(void)
60de42
         return FALSE;
60de42
     }
60de42
 
60de42
-    g_hash_table_iter_init(&iter, stonith_failures);
60de42
-    while (g_hash_table_iter_next(&iter, (gpointer *) & key, (gpointer *) & value)) {
60de42
-        if (value->count > stonith_max_attempts ) {
60de42
-            crm_notice("Too many failures to fence %s (%d), giving up", key, value->count);
60de42
-            return TRUE;
60de42
+    if (target == NULL) {
60de42
+        g_hash_table_iter_init(&iter, stonith_failures);
60de42
+        while (g_hash_table_iter_next(&iter, (gpointer *) & key, (gpointer *) & value)) {
60de42
+            if (value->count > stonith_max_attempts) {
60de42
+                target = (const char*)key;
60de42
+                goto too_many;
60de42
+            }
60de42
+        }
60de42
+    } else {
60de42
+        value = g_hash_table_lookup(stonith_failures, target);
60de42
+        if ((value != NULL) && (value->count > stonith_max_attempts)) {
60de42
+            goto too_many;
60de42
         }
60de42
     }
60de42
     return FALSE;
60de42
+
60de42
+too_many:
60de42
+    crm_warn("Too many failures (%d) to fence %s, giving up",
60de42
+             value->count, target);
60de42
+    return TRUE;
60de42
 }
60de42
 
60de42
 void
60de42
@@ -698,17 +710,18 @@ st_fail_count_increment(const char *target, int rc)
60de42
  * \internal
60de42
  * \brief Abort transition due to stonith failure
60de42
  *
60de42
- * \param[in] reason  Failed stonith action XML, or NULL
60de42
+ * \param[in] target  Don't restart if this (NULL for any) has too many failures
60de42
+ * \param[in] reason  Log this stonith action XML as abort reason (or NULL)
60de42
  */
60de42
 void
60de42
-abort_for_stonith_failure(xmlNode *reason)
60de42
+abort_for_stonith_failure(const char *target, xmlNode *reason)
60de42
 {
60de42
     enum transition_action abort_action = tg_restart;
60de42
 
60de42
     /* If stonith repeatedly fails, we eventually give up on starting a new
60de42
      * transition for that reason.
60de42
      */
60de42
-    if (too_many_st_failures()) {
60de42
+    if (too_many_st_failures(target)) {
60de42
         abort_action = tg_stop;
60de42
     }
60de42
     abort_transition(INFINITY, abort_action, "Stonith failed", reason);
60de42
@@ -779,7 +792,7 @@ tengine_stonith_callback(stonith_t * stonith, stonith_callback_data_t * data)
60de42
         action->failed = TRUE;
60de42
         crm_notice("Stonith operation %d for %s failed (%s): aborting transition.",
60de42
                    call_id, target, pcmk_strerror(rc));
60de42
-        abort_for_stonith_failure(NULL);
60de42
+        abort_for_stonith_failure(target, NULL);
60de42
         st_fail_count_increment(target, rc);
60de42
     }
60de42
 
60de42
diff --git a/crmd/te_utils.c b/crmd/te_utils.c
60de42
index 4603307..66b0883 100644
60de42
--- a/crmd/te_utils.c
60de42
+++ b/crmd/te_utils.c
60de42
@@ -162,7 +162,7 @@ fail_incompletable_stonith(crm_graph_t * graph)
60de42
 
60de42
     if (last_action != NULL) {
60de42
         crm_warn("STONITHd failure resulted in un-runnable actions");
60de42
-        abort_for_stonith_failure(last_action);
60de42
+        abort_for_stonith_failure(NULL, last_action);
60de42
         return TRUE;
60de42
     }
60de42
 
60de42
-- 
60de42
1.8.3.1
60de42
60de42
60de42
From 0c43785dad9be38566cccce677c54da42ff2c691 Mon Sep 17 00:00:00 2001
60de42
From: Ken Gaillot <kgaillot@redhat.com>
60de42
Date: Mon, 10 Apr 2017 14:22:45 -0500
60de42
Subject: [PATCH 05/13] Fix: crmd: forget stonith failures when forgetting node
60de42
60de42
---
60de42
 crmd/messages.c | 6 ++++++
60de42
 1 file changed, 6 insertions(+)
60de42
60de42
diff --git a/crmd/messages.c b/crmd/messages.c
60de42
index c79d96e..4307fca 100644
60de42
--- a/crmd/messages.c
60de42
+++ b/crmd/messages.c
60de42
@@ -870,6 +870,12 @@ handle_request(xmlNode * stored_msg, enum crmd_fsa_cause cause)
60de42
 
60de42
         } else {
60de42
             reap_crm_member(id, name);
60de42
+
60de42
+            /* If we're forgetting this node, also forget any failures to fence
60de42
+             * it, so we don't carry that over to any node added later with the
60de42
+             * same name.
60de42
+             */
60de42
+            st_fail_count_reset(name);
60de42
         }
60de42
 
60de42
     } else if (strcmp(op, CRM_OP_MAINTENANCE_NODES) == 0) {
60de42
-- 
60de42
1.8.3.1
60de42
60de42
60de42
From 515424f01b8ac5eb8705cecb26a60e17de3a7df6 Mon Sep 17 00:00:00 2001
60de42
From: Ken Gaillot <kgaillot@redhat.com>
60de42
Date: Mon, 10 Apr 2017 15:23:46 -0500
60de42
Subject: [PATCH 06/13] Fix: crmd: track stonith fail counts on all nodes
60de42
60de42
Previously, the stonith fail count was incremented in
60de42
tengine_stonith_callback(), which is called only on the DC. Now, it is
60de42
incremented in tengine_stonith_notify() instead, which is called on all nodes,
60de42
ensuring the count is correct when a new node takes over DC.
60de42
---
60de42
 crmd/crmd_utils.h   | 1 +
60de42
 crmd/te_callbacks.c | 5 ++---
60de42
 crmd/te_utils.c     | 9 ++++++---
60de42
 3 files changed, 9 insertions(+), 6 deletions(-)
60de42
60de42
diff --git a/crmd/crmd_utils.h b/crmd/crmd_utils.h
60de42
index f0289d4..fd8fe76 100644
60de42
--- a/crmd/crmd_utils.h
60de42
+++ b/crmd/crmd_utils.h
60de42
@@ -101,6 +101,7 @@ void crmd_join_phase_log(int level);
60de42
 
60de42
 const char *get_timer_desc(fsa_timer_t * timer);
60de42
 void st_fail_count_reset(const char * target);
60de42
+void st_fail_count_increment(const char *target);
60de42
 void abort_for_stonith_failure(const char *target, xmlNode *reason);
60de42
 void crmd_peer_down(crm_node_t *peer, bool full);
60de42
 
60de42
diff --git a/crmd/te_callbacks.c b/crmd/te_callbacks.c
60de42
index 6e306fd..aa4a141 100644
60de42
--- a/crmd/te_callbacks.c
60de42
+++ b/crmd/te_callbacks.c
60de42
@@ -682,8 +682,8 @@ st_fail_count_reset(const char *target)
60de42
     }
60de42
 }
60de42
 
60de42
-static void
60de42
-st_fail_count_increment(const char *target, int rc)
60de42
+void
60de42
+st_fail_count_increment(const char *target)
60de42
 {
60de42
     struct st_fail_rec *rec = NULL;
60de42
 
60de42
@@ -793,7 +793,6 @@ tengine_stonith_callback(stonith_t * stonith, stonith_callback_data_t * data)
60de42
         crm_notice("Stonith operation %d for %s failed (%s): aborting transition.",
60de42
                    call_id, target, pcmk_strerror(rc));
60de42
         abort_for_stonith_failure(target, NULL);
60de42
-        st_fail_count_increment(target, rc);
60de42
     }
60de42
 
60de42
     update_graph(transition_graph, action);
60de42
diff --git a/crmd/te_utils.c b/crmd/te_utils.c
60de42
index 66b0883..32ddae1 100644
60de42
--- a/crmd/te_utils.c
60de42
+++ b/crmd/te_utils.c
60de42
@@ -259,9 +259,12 @@ tengine_stonith_notify(stonith_t * st, stonith_event_t * st_event)
60de42
         return;
60de42
     }
60de42
 
60de42
-    if (st_event->result == pcmk_ok &&
60de42
-        safe_str_eq(st_event->operation, T_STONITH_NOTIFY_FENCE)) {
60de42
-        st_fail_count_reset(st_event->target);
60de42
+    if (safe_str_eq(st_event->operation, T_STONITH_NOTIFY_FENCE)) {
60de42
+        if (st_event->result == pcmk_ok) {
60de42
+            st_fail_count_reset(st_event->target);
60de42
+        } else {
60de42
+            st_fail_count_increment(st_event->target);
60de42
+        }
60de42
     }
60de42
 
60de42
     crm_notice("Peer %s was%s terminated (%s) by %s for %s: %s (ref=%s) by client %s",
60de42
-- 
60de42
1.8.3.1
60de42
60de42
60de42
From 714a8d07a500675d84e6ef779ba21e6c23e27853 Mon Sep 17 00:00:00 2001
60de42
From: Ken Gaillot <kgaillot@redhat.com>
60de42
Date: Mon, 10 Apr 2017 17:20:08 -0500
60de42
Subject: [PATCH 07/13] Low: crmd: allow clearing all stonith fail counts
60de42
60de42
for future improvements
60de42
---
60de42
 crmd/te_callbacks.c | 29 ++++++++++++++++++++++++-----
60de42
 1 file changed, 24 insertions(+), 5 deletions(-)
60de42
60de42
diff --git a/crmd/te_callbacks.c b/crmd/te_callbacks.c
60de42
index aa4a141..4f896ee 100644
60de42
--- a/crmd/te_callbacks.c
60de42
+++ b/crmd/te_callbacks.c
60de42
@@ -668,17 +668,36 @@ too_many:
60de42
     return TRUE;
60de42
 }
60de42
 
60de42
+/*!
60de42
+ * \internal
60de42
+ * \brief Reset a stonith fail count
60de42
+ *
60de42
+ * \param[in] target  Name of node to reset, or NULL for all
60de42
+ */
60de42
 void
60de42
 st_fail_count_reset(const char *target)
60de42
 {
60de42
-    struct st_fail_rec *rec = NULL;
60de42
+    if (stonith_failures == NULL) {
60de42
+        return;
60de42
+    }
60de42
+
60de42
+    if (target) {
60de42
+        struct st_fail_rec *rec = NULL;
60de42
 
60de42
-    if (stonith_failures) {
60de42
         rec = g_hash_table_lookup(stonith_failures, target);
60de42
-    }
60de42
+        if (rec) {
60de42
+            rec->count = 0;
60de42
+        }
60de42
+    } else {
60de42
+        GHashTableIter iter;
60de42
+        const char *key = NULL;
60de42
+        struct st_fail_rec *rec = NULL;
60de42
 
60de42
-    if (rec) {
60de42
-        rec->count = 0;
60de42
+        g_hash_table_iter_init(&iter, stonith_failures);
60de42
+        while (g_hash_table_iter_next(&iter, (gpointer *) &key,
60de42
+                                      (gpointer *) &rec)) {
60de42
+            rec->count = 0;
60de42
+        }
60de42
     }
60de42
 }
60de42
 
60de42
-- 
60de42
1.8.3.1
60de42
60de42
60de42
From 8fd6691558d94a8294f3d860cc9451c1a8e0c7a1 Mon Sep 17 00:00:00 2001
60de42
From: Ken Gaillot <kgaillot@redhat.com>
60de42
Date: Mon, 17 Apr 2017 13:55:19 -0500
60de42
Subject: [PATCH 08/13] Low: crmd: skip restart at (not above)
60de42
 stonith-max-attempts
60de42
60de42
---
60de42
 crmd/te_callbacks.c | 4 ++--
60de42
 1 file changed, 2 insertions(+), 2 deletions(-)
60de42
60de42
diff --git a/crmd/te_callbacks.c b/crmd/te_callbacks.c
60de42
index 4f896ee..b4d8713 100644
60de42
--- a/crmd/te_callbacks.c
60de42
+++ b/crmd/te_callbacks.c
60de42
@@ -649,14 +649,14 @@ too_many_st_failures(const char *target)
60de42
     if (target == NULL) {
60de42
         g_hash_table_iter_init(&iter, stonith_failures);
60de42
         while (g_hash_table_iter_next(&iter, (gpointer *) & key, (gpointer *) & value)) {
60de42
-            if (value->count > stonith_max_attempts) {
60de42
+            if (value->count >= stonith_max_attempts) {
60de42
                 target = (const char*)key;
60de42
                 goto too_many;
60de42
             }
60de42
         }
60de42
     } else {
60de42
         value = g_hash_table_lookup(stonith_failures, target);
60de42
-        if ((value != NULL) && (value->count > stonith_max_attempts)) {
60de42
+        if ((value != NULL) && (value->count >= stonith_max_attempts)) {
60de42
             goto too_many;
60de42
         }
60de42
     }
60de42
-- 
60de42
1.8.3.1
60de42
60de42
60de42
From 9e9a271fd666ff371487f22c28ba9e420a22434c Mon Sep 17 00:00:00 2001
60de42
From: Ken Gaillot <kgaillot@redhat.com>
60de42
Date: Mon, 17 Apr 2017 18:18:42 -0500
60de42
Subject: [PATCH 09/13] Fix: crmd: don't restart transition if no fence devices
60de42
60de42
This restores the behavior removed by ff881376, but more precisely where it's
60de42
needed.
60de42
---
60de42
 crmd/crmd_utils.h        |  4 +++-
60de42
 crmd/te_callbacks.c      | 21 ++++++++++++++++-----
60de42
 crmd/te_utils.c          |  2 +-
60de42
 include/crm/transition.h |  4 ++++
60de42
 4 files changed, 24 insertions(+), 7 deletions(-)
60de42
60de42
diff --git a/crmd/crmd_utils.h b/crmd/crmd_utils.h
60de42
index fd8fe76..9a09340 100644
60de42
--- a/crmd/crmd_utils.h
60de42
+++ b/crmd/crmd_utils.h
60de42
@@ -19,6 +19,7 @@
60de42
 #  define CRMD_UTILS__H
60de42
 
60de42
 #  include <crm/crm.h>
60de42
+#  include <crm/transition.h>
60de42
 #  include <crm/common/xml.h>
60de42
 #  include <crm/cib/internal.h> /* For CIB_OP_MODIFY */
60de42
 #  include "notify.h"
60de42
@@ -102,7 +103,8 @@ void crmd_join_phase_log(int level);
60de42
 const char *get_timer_desc(fsa_timer_t * timer);
60de42
 void st_fail_count_reset(const char * target);
60de42
 void st_fail_count_increment(const char *target);
60de42
-void abort_for_stonith_failure(const char *target, xmlNode *reason);
60de42
+void abort_for_stonith_failure(enum transition_action abort_action,
60de42
+                               const char *target, xmlNode *reason);
60de42
 void crmd_peer_down(crm_node_t *peer, bool full);
60de42
 
60de42
 /* Convenience macro for registering a CIB callback
60de42
diff --git a/crmd/te_callbacks.c b/crmd/te_callbacks.c
60de42
index b4d8713..c2b0c0d 100644
60de42
--- a/crmd/te_callbacks.c
60de42
+++ b/crmd/te_callbacks.c
60de42
@@ -729,18 +729,18 @@ st_fail_count_increment(const char *target)
60de42
  * \internal
60de42
  * \brief Abort transition due to stonith failure
60de42
  *
60de42
+ * \param[in] abort_action  Whether to restart or stop transition
60de42
  * \param[in] target  Don't restart if this (NULL for any) has too many failures
60de42
  * \param[in] reason  Log this stonith action XML as abort reason (or NULL)
60de42
  */
60de42
 void
60de42
-abort_for_stonith_failure(const char *target, xmlNode *reason)
60de42
+abort_for_stonith_failure(enum transition_action abort_action,
60de42
+                          const char *target, xmlNode *reason)
60de42
 {
60de42
-    enum transition_action abort_action = tg_restart;
60de42
-
60de42
     /* If stonith repeatedly fails, we eventually give up on starting a new
60de42
      * transition for that reason.
60de42
      */
60de42
-    if (too_many_st_failures(target)) {
60de42
+    if ((abort_action != tg_stop) && too_many_st_failures(target)) {
60de42
         abort_action = tg_stop;
60de42
     }
60de42
     abort_transition(INFINITY, abort_action, "Stonith failed", reason);
60de42
@@ -807,11 +807,22 @@ tengine_stonith_callback(stonith_t * stonith, stonith_callback_data_t * data)
60de42
 
60de42
     } else {
60de42
         const char *target = crm_element_value_const(action->xml, XML_LRM_ATTR_TARGET);
60de42
+        enum transition_action abort_action = tg_restart;
60de42
 
60de42
         action->failed = TRUE;
60de42
         crm_notice("Stonith operation %d for %s failed (%s): aborting transition.",
60de42
                    call_id, target, pcmk_strerror(rc));
60de42
-        abort_for_stonith_failure(target, NULL);
60de42
+
60de42
+        /* If no fence devices were available, there's no use in immediately
60de42
+         * checking again, so don't start a new transition in that case.
60de42
+         */
60de42
+        if (rc == -ENODEV) {
60de42
+            crm_warn("No devices found in cluster to fence %s, giving up",
60de42
+                     target);
60de42
+            abort_action = tg_stop;
60de42
+        }
60de42
+
60de42
+        abort_for_stonith_failure(abort_action, target, NULL);
60de42
     }
60de42
 
60de42
     update_graph(transition_graph, action);
60de42
diff --git a/crmd/te_utils.c b/crmd/te_utils.c
60de42
index 32ddae1..dcfbb3b 100644
60de42
--- a/crmd/te_utils.c
60de42
+++ b/crmd/te_utils.c
60de42
@@ -162,7 +162,7 @@ fail_incompletable_stonith(crm_graph_t * graph)
60de42
 
60de42
     if (last_action != NULL) {
60de42
         crm_warn("STONITHd failure resulted in un-runnable actions");
60de42
-        abort_for_stonith_failure(NULL, last_action);
60de42
+        abort_for_stonith_failure(tg_restart, NULL, last_action);
60de42
         return TRUE;
60de42
     }
60de42
 
60de42
diff --git a/include/crm/transition.h b/include/crm/transition.h
60de42
index f2069cc..21f7c55 100644
60de42
--- a/include/crm/transition.h
60de42
+++ b/include/crm/transition.h
60de42
@@ -15,6 +15,8 @@
60de42
  * License along with this library; if not, write to the Free Software
60de42
  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
60de42
  */
60de42
+#ifndef CRM_TRANSITION__H
60de42
+#  define CRM_TRANSITION__H
60de42
 
60de42
 #include <crm/crm.h>
60de42
 #include <crm/msg_xml.h>
60de42
@@ -147,3 +149,5 @@ bool update_abort_priority(crm_graph_t * graph, int priority,
60de42
 const char *actiontype2text(action_type_e type);
60de42
 lrmd_event_data_t *convert_graph_action(xmlNode * resource, crm_action_t * action, int status,
60de42
                                         int rc);
60de42
+
60de42
+#endif
60de42
-- 
60de42
1.8.3.1
60de42
60de42
60de42
From 268f70f9ab644783a8038aa82bcca3628cc942dc Mon Sep 17 00:00:00 2001
60de42
From: Ken Gaillot <kgaillot@redhat.com>
60de42
Date: Mon, 17 Apr 2017 14:39:19 -0500
60de42
Subject: [PATCH 10/13] Low: crmd: avoid DC sending offer to itself twice
60de42
60de42
---
60de42
 crmd/join_dc.c | 6 ++++--
60de42
 1 file changed, 4 insertions(+), 2 deletions(-)
60de42
60de42
diff --git a/crmd/join_dc.c b/crmd/join_dc.c
60de42
index 71311de..999996d 100644
60de42
--- a/crmd/join_dc.c
60de42
+++ b/crmd/join_dc.c
60de42
@@ -242,8 +242,10 @@ do_dc_join_offer_one(long long action,
60de42
     /* always offer to the DC (ourselves)
60de42
      * this ensures the correct value for max_generation_from
60de42
      */
60de42
-    member = crm_get_peer(0, fsa_our_uname);
60de42
-    join_make_offer(NULL, member, NULL);
60de42
+    if (strcmp(join_to, fsa_our_uname) != 0) {
60de42
+        member = crm_get_peer(0, fsa_our_uname);
60de42
+        join_make_offer(NULL, member, NULL);
60de42
+    }
60de42
 
60de42
     /* this was a genuine join request, cancel any existing
60de42
      * transition and invoke the PE
60de42
-- 
60de42
1.8.3.1
60de42
60de42
60de42
From 249c7c9a83371a67e573276a285697994fe66fed Mon Sep 17 00:00:00 2001
60de42
From: Ken Gaillot <kgaillot@redhat.com>
60de42
Date: Mon, 17 Apr 2017 15:41:18 -0500
60de42
Subject: [PATCH 11/13] Fix: crmd: don't fence old DC if it's shutting down as
60de42
 soon-to-be DC joins
60de42
60de42
Existing peers of a DC that is shutting down can avoid fencing it (by updating
60de42
its expected state) because it broadcasts its shutdown request. However, a
60de42
newly joining node won't get that broadcast.
60de42
60de42
Previously, if the joining node became the new DC, it would fence the old one.
60de42
Now, the DC notifies joining nodes (via a join message field) whether it is
60de42
shutting down, and joining nodes update its expected state accordingly.
60de42
---
60de42
 crmd/join_client.c    | 24 ++++++++++++++++++++++++
60de42
 crmd/join_dc.c        | 32 ++++++++++++++++++++++++++------
60de42
 include/crm/msg_xml.h |  1 +
60de42
 3 files changed, 51 insertions(+), 6 deletions(-)
60de42
60de42
diff --git a/crmd/join_client.c b/crmd/join_client.c
60de42
index 319272d..4510483 100644
60de42
--- a/crmd/join_client.c
60de42
+++ b/crmd/join_client.c
60de42
@@ -30,6 +30,26 @@ void join_query_callback(xmlNode * msg, int call_id, int rc, xmlNode * output, v
60de42
 
60de42
 extern ha_msg_input_t *copy_ha_msg_input(ha_msg_input_t * orig);
60de42
 
60de42
+/*!
60de42
+ * \internal
60de42
+ * \brief Remember if DC is shutting down as we join
60de42
+ *
60de42
+ * If we're joining while the current DC is shutting down, update its expected
60de42
+ * state, so we don't fence it if we become the new DC. (We weren't a peer
60de42
+ * when it broadcast its shutdown request.)
60de42
+ *
60de42
+ * \param[in] msg  A join message from the DC
60de42
+ */
60de42
+static void
60de42
+update_dc_expected(xmlNode *msg)
60de42
+{
60de42
+    if (fsa_our_dc && crm_is_true(crm_element_value(msg, F_CRM_DC_LEAVING))) {
60de42
+        crm_node_t *dc_node = crm_get_peer(0, fsa_our_dc);
60de42
+
60de42
+        crm_update_peer_expected(__FUNCTION__, dc_node, CRMD_JOINSTATE_DOWN);
60de42
+    }
60de42
+}
60de42
+
60de42
 /*	A_CL_JOIN_QUERY		*/
60de42
 /* is there a DC out there? */
60de42
 void
60de42
@@ -128,6 +148,8 @@ do_cl_join_offer_respond(long long action,
60de42
         return;
60de42
     }
60de42
 
60de42
+    update_dc_expected(input->msg);
60de42
+
60de42
     CRM_LOG_ASSERT(input != NULL);
60de42
     query_call_id =
60de42
         fsa_cib_conn->cmds->query(fsa_cib_conn, NULL, NULL, cib_scope_local | cib_no_children);
60de42
@@ -250,6 +272,8 @@ do_cl_join_finalize_respond(long long action,
60de42
         return;
60de42
     }
60de42
 
60de42
+    update_dc_expected(input->msg);
60de42
+
60de42
     /* send our status section to the DC */
60de42
     tmp1 = do_lrm_query(TRUE, fsa_our_uname);
60de42
     if (tmp1 != NULL) {
60de42
diff --git a/crmd/join_dc.c b/crmd/join_dc.c
60de42
index 999996d..ebb5059 100644
60de42
--- a/crmd/join_dc.c
60de42
+++ b/crmd/join_dc.c
60de42
@@ -106,6 +106,30 @@ initialize_join(gboolean before)
60de42
     }
60de42
 }
60de42
 
60de42
+/*!
60de42
+ * \internal
60de42
+ * \brief Create a join message from the DC
60de42
+ *
60de42
+ * \param[in] join_op  Join operation name
60de42
+ * \param[in] host_to  Recipient of message
60de42
+ */
60de42
+static xmlNode *
60de42
+create_dc_message(const char *join_op, const char *host_to)
60de42
+{
60de42
+    xmlNode *msg = create_request(join_op, NULL, host_to, CRM_SYSTEM_CRMD,
60de42
+                                  CRM_SYSTEM_DC, NULL);
60de42
+
60de42
+    /* Identify which election this is a part of */
60de42
+    crm_xml_add_int(msg, F_CRM_JOIN_ID, current_join_id);
60de42
+
60de42
+    /* Add a field specifying whether the DC is shutting down. This keeps the
60de42
+     * joining node from fencing the old DC if it becomes the new DC.
60de42
+     */
60de42
+    crm_xml_add_boolean(msg, F_CRM_DC_LEAVING,
60de42
+                        is_set(fsa_input_register, R_SHUTDOWN));
60de42
+    return msg;
60de42
+}
60de42
+
60de42
 static void
60de42
 join_make_offer(gpointer key, gpointer value, gpointer user_data)
60de42
 {
60de42
@@ -147,10 +171,8 @@ join_make_offer(gpointer key, gpointer value, gpointer user_data)
60de42
 
60de42
     crm_update_peer_join(__FUNCTION__, (crm_node_t*)member, crm_join_none);
60de42
 
60de42
-    offer = create_request(CRM_OP_JOIN_OFFER, NULL, member->uname,
60de42
-                           CRM_SYSTEM_CRMD, CRM_SYSTEM_DC, NULL);
60de42
+    offer = create_dc_message(CRM_OP_JOIN_OFFER, member->uname);
60de42
 
60de42
-    crm_xml_add_int(offer, F_CRM_JOIN_ID, current_join_id);
60de42
     /* send the welcome */
60de42
     crm_info("join-%d: Sending offer to %s", current_join_id, member->uname);
60de42
 
60de42
@@ -588,9 +610,7 @@ finalize_join_for(gpointer key, gpointer value, gpointer user_data)
60de42
     }
60de42
 
60de42
     /* send the ack/nack to the node */
60de42
-    acknak = create_request(CRM_OP_JOIN_ACKNAK, NULL, join_to,
60de42
-                            CRM_SYSTEM_CRMD, CRM_SYSTEM_DC, NULL);
60de42
-    crm_xml_add_int(acknak, F_CRM_JOIN_ID, current_join_id);
60de42
+    acknak = create_dc_message(CRM_OP_JOIN_ACKNAK, join_to);
60de42
 
60de42
     crm_debug("join-%d: ACK'ing join request from %s",
60de42
               current_join_id, join_to);
60de42
diff --git a/include/crm/msg_xml.h b/include/crm/msg_xml.h
60de42
index 7198fe5..7504744 100644
60de42
--- a/include/crm/msg_xml.h
60de42
+++ b/include/crm/msg_xml.h
60de42
@@ -64,6 +64,7 @@
60de42
 #  define F_CRM_ORIGIN			"origin"
60de42
 #  define F_CRM_USER			"crm_user"
60de42
 #  define F_CRM_JOIN_ID			"join_id"
60de42
+#  define F_CRM_DC_LEAVING      "dc-leaving"
60de42
 #  define F_CRM_ELECTION_ID		"election-id"
60de42
 #  define F_CRM_ELECTION_AGE_S		"election-age-sec"
60de42
 #  define F_CRM_ELECTION_AGE_US		"election-age-nano-sec"
60de42
-- 
60de42
1.8.3.1
60de42
60de42
60de42
From 9fe47194b7636bfe3aebdeece0ec89a7f588d77d Mon Sep 17 00:00:00 2001
60de42
From: Ken Gaillot <kgaillot@redhat.com>
60de42
Date: Mon, 17 Apr 2017 19:03:03 -0500
60de42
Subject: [PATCH 12/13] Refactor: extra: use whitespace consistently in Dummy
60de42
60de42
to make changes easier
60de42
---
60de42
 extra/resources/Dummy | 102 +++++++++++++++++++++++++-------------------------
60de42
 1 file changed, 51 insertions(+), 51 deletions(-)
60de42
60de42
diff --git a/extra/resources/Dummy b/extra/resources/Dummy
60de42
index 1fd6156..bab56e4 100644
60de42
--- a/extra/resources/Dummy
60de42
+++ b/extra/resources/Dummy
60de42
@@ -1,8 +1,8 @@
60de42
 #!/bin/sh
60de42
 #
60de42
 #
60de42
-#	Dummy OCF RA. Does nothing but wait a few seconds, can be
60de42
-#	configured to fail occassionally.
60de42
+# Dummy OCF RA. Does nothing but wait a few seconds, can be
60de42
+# configured to fail occasionally.
60de42
 #
60de42
 # Copyright (c) 2004 SUSE LINUX AG, Lars Marowsky-Brée
60de42
 #                    All Rights Reserved.
60de42
@@ -37,7 +37,7 @@
60de42
 #######################################################################
60de42
 
60de42
 meta_data() {
60de42
-	cat <<END
60de42
+    cat <<END
60de42
 <?xml version="1.0"?>
60de42
 <!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd">
60de42
 <resource-agent name="Dummy" version="1.0">
60de42
@@ -130,12 +130,12 @@ END
60de42
 # don't exit on TERM, to test that lrmd makes sure that we do exit
60de42
 trap sigterm_handler TERM
60de42
 sigterm_handler() {
60de42
-	ocf_log info "They use TERM to bring us down. No such luck."
60de42
-	return
60de42
+    ocf_log info "They use TERM to bring us down. No such luck."
60de42
+    return
60de42
 }
60de42
 
60de42
 dummy_usage() {
60de42
-	cat <<END
60de42
+    cat <<END
60de42
 usage: $0 {start|stop|monitor|migrate_to|migrate_from|validate-all|meta-data}
60de42
 
60de42
 Expects to have a fully populated OCF RA-compliant environment set.
60de42
@@ -174,34 +174,34 @@ dummy_start() {
60de42
 dummy_stop() {
60de42
     dummy_monitor
60de42
     if [ $? -eq $OCF_SUCCESS ]; then
60de42
-	rm ${OCF_RESKEY_state}
60de42
+        rm ${OCF_RESKEY_state}
60de42
     fi
60de42
     rm -f "${VERIFY_SERIALIZED_FILE}"
60de42
     return $OCF_SUCCESS
60de42
 }
60de42
 
60de42
 dummy_monitor() {
60de42
-	# Monitor _MUST!_ differentiate correctly between running
60de42
-	# (SUCCESS), failed (ERROR) or _cleanly_ stopped (NOT RUNNING).
60de42
-	# That is THREE states, not just yes/no.
60de42
-
60de42
-	if [ $OCF_RESKEY_op_sleep -ne 0 ]; then
60de42
-		if [ -f "${VERIFY_SERIALIZED_FILE}" ]; then
60de42
-			# two monitor ops have occurred at the same time.
60de42
-			# this is to verify a condition in the lrmd regression tests.
60de42
-			ocf_log err "$VERIFY_SERIALIZED_FILE exists already"
60de42
-			return $OCF_ERR_GENERIC
60de42
-		fi
60de42
-
60de42
-		touch "${VERIFY_SERIALIZED_FILE}"
60de42
-		sleep ${OCF_RESKEY_op_sleep}
60de42
-		rm "${VERIFY_SERIALIZED_FILE}"
60de42
-	fi
60de42
-	
60de42
-	if [ -f "${OCF_RESKEY_state}" ]; then
60de42
-		return $OCF_SUCCESS
60de42
-	fi
60de42
-	return $OCF_NOT_RUNNING
60de42
+    # Monitor _MUST!_ differentiate correctly between running
60de42
+    # (SUCCESS), failed (ERROR) or _cleanly_ stopped (NOT RUNNING).
60de42
+    # That is THREE states, not just yes/no.
60de42
+
60de42
+    if [ $OCF_RESKEY_op_sleep -ne 0 ]; then
60de42
+        if [ -f "${VERIFY_SERIALIZED_FILE}" ]; then
60de42
+            # two monitor ops have occurred at the same time.
60de42
+            # this is to verify a condition in the lrmd regression tests.
60de42
+            ocf_log err "$VERIFY_SERIALIZED_FILE exists already"
60de42
+            return $OCF_ERR_GENERIC
60de42
+        fi
60de42
+
60de42
+        touch "${VERIFY_SERIALIZED_FILE}"
60de42
+        sleep ${OCF_RESKEY_op_sleep}
60de42
+        rm "${VERIFY_SERIALIZED_FILE}"
60de42
+    fi
60de42
+    
60de42
+    if [ -f "${OCF_RESKEY_state}" ]; then
60de42
+        return $OCF_SUCCESS
60de42
+    fi
60de42
+    return $OCF_NOT_RUNNING
60de42
 }
60de42
 
60de42
 dummy_validate() {
60de42
@@ -210,7 +210,7 @@ dummy_validate() {
60de42
     state_dir=`dirname "$OCF_RESKEY_state"`
60de42
     touch "$state_dir/$$"
60de42
     if [ $? -ne 0 ]; then
60de42
-	return $OCF_ERR_ARGS
60de42
+        return $OCF_ERR_ARGS
60de42
     fi
60de42
     rm "$state_dir/$$"
60de42
 
60de42
@@ -235,28 +235,28 @@ VERIFY_SERIALIZED_FILE="${OCF_RESKEY_state}.serialized"
60de42
 dump_env
60de42
 
60de42
 case $__OCF_ACTION in
60de42
-meta-data)	meta_data
60de42
-		exit $OCF_SUCCESS
60de42
-		;;
60de42
-start)		dummy_start;;
60de42
-stop)		dummy_stop;;
60de42
-monitor)	dummy_monitor;;
60de42
-migrate_to)	ocf_log info "Migrating ${OCF_RESOURCE_INSTANCE} to ${OCF_RESKEY_CRM_meta_migrate_target}."
60de42
-	        dummy_stop
60de42
-		;;
60de42
-migrate_from)	ocf_log info "Migrating ${OCF_RESOURCE_INSTANCE} from ${OCF_RESKEY_CRM_meta_migrate_source}."
60de42
-	        dummy_start
60de42
-		;;
60de42
-reload)		ocf_log err "Reloading..."
60de42
-	        dummy_start
60de42
-		;;
60de42
-validate-all)	dummy_validate;;
60de42
-usage|help)	dummy_usage
60de42
-		exit $OCF_SUCCESS
60de42
-		;;
60de42
-*)		dummy_usage
60de42
-		exit $OCF_ERR_UNIMPLEMENTED
60de42
-		;;
60de42
+meta-data)      meta_data
60de42
+                exit $OCF_SUCCESS
60de42
+                ;;
60de42
+start)          dummy_start;;
60de42
+stop)           dummy_stop;;
60de42
+monitor)        dummy_monitor;;
60de42
+migrate_to)     ocf_log info "Migrating ${OCF_RESOURCE_INSTANCE} to ${OCF_RESKEY_CRM_meta_migrate_target}."
60de42
+                dummy_stop
60de42
+                ;;
60de42
+migrate_from)   ocf_log info "Migrating ${OCF_RESOURCE_INSTANCE} from ${OCF_RESKEY_CRM_meta_migrate_source}."
60de42
+                dummy_start
60de42
+                ;;
60de42
+reload)         ocf_log err "Reloading..."
60de42
+                dummy_start
60de42
+                ;;
60de42
+validate-all)   dummy_validate;;
60de42
+usage|help)     dummy_usage
60de42
+                exit $OCF_SUCCESS
60de42
+                ;;
60de42
+*)              dummy_usage
60de42
+                exit $OCF_ERR_UNIMPLEMENTED
60de42
+                ;;
60de42
 esac
60de42
 rc=$?
60de42
 ocf_log debug "${OCF_RESOURCE_INSTANCE} $__OCF_ACTION : $rc"
60de42
-- 
60de42
1.8.3.1
60de42
60de42
60de42
From 14d5eb9bcf1a00612fbe952d945b07bbe3c44844 Mon Sep 17 00:00:00 2001
60de42
From: Ken Gaillot <kgaillot@redhat.com>
60de42
Date: Mon, 17 Apr 2017 19:04:58 -0500
60de42
Subject: [PATCH 13/13] Test: extra: Dummy stop shouldn't fail if monitor is in
60de42
 progress
60de42
60de42
---
60de42
 extra/resources/Dummy | 4 ++--
60de42
 1 file changed, 2 insertions(+), 2 deletions(-)
60de42
60de42
diff --git a/extra/resources/Dummy b/extra/resources/Dummy
60de42
index bab56e4..4a6884c 100644
60de42
--- a/extra/resources/Dummy
60de42
+++ b/extra/resources/Dummy
60de42
@@ -172,7 +172,7 @@ dummy_start() {
60de42
 }
60de42
 
60de42
 dummy_stop() {
60de42
-    dummy_monitor
60de42
+    dummy_monitor --force
60de42
     if [ $? -eq $OCF_SUCCESS ]; then
60de42
         rm ${OCF_RESKEY_state}
60de42
     fi
60de42
@@ -186,7 +186,7 @@ dummy_monitor() {
60de42
     # That is THREE states, not just yes/no.
60de42
 
60de42
     if [ $OCF_RESKEY_op_sleep -ne 0 ]; then
60de42
-        if [ -f "${VERIFY_SERIALIZED_FILE}" ]; then
60de42
+        if [ "$1" = "" -a -f "${VERIFY_SERIALIZED_FILE}" ]; then
60de42
             # two monitor ops have occurred at the same time.
60de42
             # this is to verify a condition in the lrmd regression tests.
60de42
             ocf_log err "$VERIFY_SERIALIZED_FILE exists already"
60de42
-- 
60de42
1.8.3.1
60de42