Blob Blame History Raw
From: Andrew Beekhof <andrew@beekhof.net>
Date: Tue, 18 Aug 2015 10:30:49 +1000
Subject: [PATCH] Fix: PE: Bug cl#5247 - Imply resources running on a container
 are stopped when the container is stopped

(cherry picked from commit e10eff1902d5b451454e2d467ee337c964f536ab)
---
 lib/pengine/unpack.c                  | 29 ++++++++++++++++++++---------
 pengine/allocate.c                    | 17 +++++++++++++++++
 pengine/graph.c                       |  7 ++++++-
 pengine/test10/bug-rh-1097457.dot     |  2 ++
 pengine/test10/bug-rh-1097457.exp     | 12 ++++++++++--
 pengine/test10/bug-rh-1097457.summary | 10 +++++-----
 pengine/test10/whitebox-fail1.dot     |  1 +
 pengine/test10/whitebox-fail1.exp     |  6 +++++-
 pengine/test10/whitebox-fail1.summary |  8 ++++----
 pengine/test10/whitebox-fail2.dot     |  1 +
 pengine/test10/whitebox-fail2.exp     |  6 +++++-
 pengine/test10/whitebox-fail2.summary |  8 ++++----
 12 files changed, 80 insertions(+), 27 deletions(-)

diff --git a/lib/pengine/unpack.c b/lib/pengine/unpack.c
index 106c674..0f83be4 100644
--- a/lib/pengine/unpack.c
+++ b/lib/pengine/unpack.c
@@ -44,7 +44,7 @@ CRM_TRACE_INIT_DATA(pe_status);
 
 gboolean unpack_rsc_op(resource_t * rsc, node_t * node, xmlNode * xml_op,
                        enum action_fail_response *failed, pe_working_set_t * data_set);
-static gboolean determine_remote_online_status(node_t * this_node);
+static gboolean determine_remote_online_status(pe_working_set_t * data_set, node_t * this_node);
 
 static gboolean
 is_dangling_container_remote_node(node_t *node)
@@ -73,6 +73,8 @@ pe_fence_node(pe_working_set_t * data_set, node_t * node, const char *reason)
         if (is_set(rsc->flags, pe_rsc_failed) == FALSE) {
             crm_warn("Remote node %s will be fenced by recovering container resource %s",
                 node->details->uname, rsc->id, reason);
+            /* node->details->unclean = TRUE; */
+            node->details->remote_requires_reset = TRUE;
             set_bit(rsc->flags, pe_rsc_failed);
         }
     } else if (is_dangling_container_remote_node(node)) {
@@ -1157,7 +1159,7 @@ unpack_remote_status(xmlNode * status, pe_working_set_t * data_set)
         if ((this_node == NULL) || (is_remote_node(this_node) == FALSE)) {
             continue;
         }
-        determine_remote_online_status(this_node);
+        determine_remote_online_status(data_set, this_node);
     }
 
     /* process attributes */
@@ -1366,7 +1368,7 @@ determine_online_status_fencing(pe_working_set_t * data_set, xmlNode * node_stat
 }
 
 static gboolean
-determine_remote_online_status(node_t * this_node)
+determine_remote_online_status(pe_working_set_t * data_set, node_t * this_node)
 {
     resource_t *rsc = this_node->details->remote_rsc;
     resource_t *container = NULL;
@@ -1393,13 +1395,21 @@ determine_remote_online_status(node_t * this_node)
     }
 
     /* Now check all the failure conditions. */
-    if (is_set(rsc->flags, pe_rsc_failed) ||
-        (rsc->role == RSC_ROLE_STOPPED) ||
-        (container && is_set(container->flags, pe_rsc_failed)) ||
-        (container && container->role == RSC_ROLE_STOPPED)) {
+    if(container && is_set(container->flags, pe_rsc_failed)) {
+        crm_trace("Remote node %s is set to UNCLEAN. rsc failed.", this_node->details->id);
+        this_node->details->online = FALSE;
+        this_node->details->remote_requires_reset = TRUE;
 
-        crm_trace("Remote node %s is set to OFFLINE. node is stopped or rsc failed.", this_node->details->id);
+    } else if(is_set(rsc->flags, pe_rsc_failed)) {
+        crm_trace("Remote node %s is set to OFFLINE. rsc failed.", this_node->details->id);
         this_node->details->online = FALSE;
+
+    } else if (rsc->role == RSC_ROLE_STOPPED
+        || (container && container->role == RSC_ROLE_STOPPED)) {
+
+        crm_trace("Remote node %s is set to OFFLINE. node is stopped.", this_node->details->id);
+        this_node->details->online = FALSE;
+        this_node->details->remote_requires_reset = FALSE;
     }
 
 remote_online_done:
@@ -3375,7 +3385,8 @@ find_operations(const char *rsc, const char *node, gboolean active_filter,
                 continue;
 
             } else if (is_remote_node(this_node)) {
-                determine_remote_online_status(this_node);
+                determine_remote_online_status(data_set, this_node);
+
             } else {
                 determine_online_status(node_state, this_node, data_set);
             }
diff --git a/pengine/allocate.c b/pengine/allocate.c
index c2e56f9..65ae05d 100644
--- a/pengine/allocate.c
+++ b/pengine/allocate.c
@@ -1406,6 +1406,23 @@ stage6(pe_working_set_t * data_set)
 
         /* remote-nodes associated with a container resource (such as a vm) are not fenced */
         if (is_container_remote_node(node)) {
+            /* Guest node: the container's stop may stand in for a fencing op */
+            if (need_stonith
+                && node->details->remote_requires_reset
+                && pe_can_fence(data_set, node)) {
+                resource_t *container = node->details->remote_rsc->container;
+                char *key = stop_key(container);
+                GListPtr stop_list = find_actions(container->actions, key, NULL);
+
+                crm_info("Impliying node %s is down when container %s is stopped (%p)",
+                         node->details->uname, container->id, stop_list);
+                if(stop_list) {
+                    stonith_constraints(node, stop_list->data, data_set);
+                }
+
+                g_list_free(stop_list);
+                free(key);
+            }
             continue;
         }
 
diff --git a/pengine/graph.c b/pengine/graph.c
index 3d832f0..a50f15b 100644
--- a/pengine/graph.c
+++ b/pengine/graph.c
@@ -697,7 +697,12 @@ stonith_constraints(node_t * node, action_t * stonith_op, pe_working_set_t * dat
         for (lpc = data_set->resources; lpc != NULL; lpc = lpc->next) {
             resource_t *rsc = (resource_t *) lpc->data;
 
-            rsc_stonith_ordering(rsc, stonith_op, data_set);
+            if(stonith_op->rsc == NULL) {
+                rsc_stonith_ordering(rsc, stonith_op, data_set);
+
+            } else if(stonith_op->rsc != rsc && stonith_op->rsc != rsc->container) {
+                rsc_stonith_ordering(rsc, stonith_op, data_set);
+            }
         }
     }
 
diff --git a/pengine/test10/bug-rh-1097457.dot b/pengine/test10/bug-rh-1097457.dot
index 666099c..078d177 100644
--- a/pengine/test10/bug-rh-1097457.dot
+++ b/pengine/test10/bug-rh-1097457.dot
@@ -49,10 +49,12 @@ digraph "g" {
 "VM2_start_0 lama3" [ style=bold color="green" fontcolor="black"]
 "VM2_stop_0 lama3" -> "FAKE4-IP_stop_0 lamaVM2" [ style = bold]
 "VM2_stop_0 lama3" -> "FAKE4_stop_0 lamaVM2" [ style = bold]
+"VM2_stop_0 lama3" -> "FAKE6-clone_stop_0" [ style = bold]
 "VM2_stop_0 lama3" -> "FAKE6_stop_0 lamaVM2" [ style = bold]
 "VM2_stop_0 lama3" -> "FSlun3_stop_0 lamaVM2" [ style = bold]
 "VM2_stop_0 lama3" -> "VM2_start_0 lama3" [ style = bold]
 "VM2_stop_0 lama3" -> "all_stopped" [ style = bold]
+"VM2_stop_0 lama3" -> "lamaVM2-G4_stop_0" [ style = bold]
 "VM2_stop_0 lama3" [ style=bold color="green" fontcolor="black"]
 "all_stopped" [ style=bold color="green" fontcolor="orange"]
 "lamaVM2-G4_running_0" [ style=bold color="green" fontcolor="orange"]
diff --git a/pengine/test10/bug-rh-1097457.exp b/pengine/test10/bug-rh-1097457.exp
index 36af9f3..175f413 100644
--- a/pengine/test10/bug-rh-1097457.exp
+++ b/pengine/test10/bug-rh-1097457.exp
@@ -119,7 +119,11 @@
         <attributes CRM_meta_timeout="20000" />
       </pseudo_event>
     </action_set>
-    <inputs/>
+    <inputs>
+      <trigger>
+        <rsc_op id="40" operation="stop" operation_key="VM2_stop_0" on_node="lama3" on_node_uuid="2"/>
+      </trigger>
+    </inputs>
   </synapse>
   <synapse id="9">
     <action_set>
@@ -331,7 +335,11 @@
         <attributes CRM_meta_clone_max="3" CRM_meta_clone_node_max="1" CRM_meta_globally_unique="false" CRM_meta_notify="false" CRM_meta_timeout="20000" />
       </pseudo_event>
     </action_set>
-    <inputs/>
+    <inputs>
+      <trigger>
+        <rsc_op id="40" operation="stop" operation_key="VM2_stop_0" on_node="lama3" on_node_uuid="2"/>
+      </trigger>
+    </inputs>
   </synapse>
   <synapse id="22" priority="1000000">
     <action_set>
diff --git a/pengine/test10/bug-rh-1097457.summary b/pengine/test10/bug-rh-1097457.summary
index e2f235d..c8751ae 100644
--- a/pengine/test10/bug-rh-1097457.summary
+++ b/pengine/test10/bug-rh-1097457.summary
@@ -39,17 +39,17 @@ Transition Summary:
  * Restart lamaVM2	(Started lama3)
 
 Executing cluster transition:
- * Pseudo action:   lamaVM2-G4_stop_0
- * Pseudo action:   FAKE6-clone_stop_0
  * Resource action: lamaVM2         stop on lama3
  * Resource action: VM2             stop on lama3
+ * Pseudo action:   lamaVM2-G4_stop_0
  * Pseudo action:   FAKE4-IP_stop_0
- * Pseudo action:   FAKE6_stop_0
- * Pseudo action:   FAKE6-clone_stopped_0
- * Pseudo action:   FAKE6-clone_start_0
+ * Pseudo action:   FAKE6-clone_stop_0
  * Resource action: VM2             start on lama3
  * Resource action: VM2             monitor=10000 on lama3
  * Pseudo action:   FAKE4_stop_0
+ * Pseudo action:   FAKE6_stop_0
+ * Pseudo action:   FAKE6-clone_stopped_0
+ * Pseudo action:   FAKE6-clone_start_0
  * Resource action: lamaVM2         start on lama3
  * Resource action: lamaVM2         monitor=30000 on lama3
  * Resource action: FSlun3          monitor=10000 on lamaVM2
diff --git a/pengine/test10/whitebox-fail1.dot b/pengine/test10/whitebox-fail1.dot
index b595015..0f0fe26 100644
--- a/pengine/test10/whitebox-fail1.dot
+++ b/pengine/test10/whitebox-fail1.dot
@@ -26,6 +26,7 @@ digraph "g" {
 "container1_start_0 18node2" -> "lxc1_start_0 18node2" [ style = bold]
 "container1_start_0 18node2" [ style=bold color="green" fontcolor="black"]
 "container1_stop_0 18node2" -> "B_stop_0 lxc1" [ style = bold]
+"container1_stop_0 18node2" -> "M-clone_stop_0" [ style = bold]
 "container1_stop_0 18node2" -> "M_stop_0 lxc1" [ style = bold]
 "container1_stop_0 18node2" -> "all_stopped" [ style = bold]
 "container1_stop_0 18node2" -> "container1_start_0 18node2" [ style = bold]
diff --git a/pengine/test10/whitebox-fail1.exp b/pengine/test10/whitebox-fail1.exp
index 834b231..01bb142 100644
--- a/pengine/test10/whitebox-fail1.exp
+++ b/pengine/test10/whitebox-fail1.exp
@@ -96,7 +96,11 @@
         <attributes CRM_meta_clone_max="5" CRM_meta_clone_node_max="1" CRM_meta_globally_unique="false" CRM_meta_notify="false" CRM_meta_timeout="20000" />
       </pseudo_event>
     </action_set>
-    <inputs/>
+    <inputs>
+      <trigger>
+        <rsc_op id="5" operation="stop" operation_key="container1_stop_0" on_node="18node2" on_node_uuid="2"/>
+      </trigger>
+    </inputs>
   </synapse>
   <synapse id="7" priority="1000000">
     <action_set>
diff --git a/pengine/test10/whitebox-fail1.summary b/pengine/test10/whitebox-fail1.summary
index 5e5887b..1586407 100644
--- a/pengine/test10/whitebox-fail1.summary
+++ b/pengine/test10/whitebox-fail1.summary
@@ -20,17 +20,17 @@ Transition Summary:
  * Restart lxc1	(Started 18node2)
 
 Executing cluster transition:
- * Pseudo action:   M-clone_stop_0
  * Resource action: lxc1            stop on 18node2
  * Resource action: container1      stop on 18node2
+ * Pseudo action:   M-clone_stop_0
+ * Pseudo action:   B_stop_0
+ * Resource action: container1      start on 18node2
  * Pseudo action:   M_stop_0
  * Pseudo action:   M-clone_stopped_0
  * Pseudo action:   M-clone_start_0
- * Pseudo action:   B_stop_0
- * Pseudo action:   all_stopped
- * Resource action: container1      start on 18node2
  * Resource action: lxc1            start on 18node2
  * Resource action: lxc1            monitor=30000 on 18node2
+ * Pseudo action:   all_stopped
  * Resource action: M               start on lxc1
  * Pseudo action:   M-clone_running_0
  * Resource action: B               start on lxc1
diff --git a/pengine/test10/whitebox-fail2.dot b/pengine/test10/whitebox-fail2.dot
index b595015..0f0fe26 100644
--- a/pengine/test10/whitebox-fail2.dot
+++ b/pengine/test10/whitebox-fail2.dot
@@ -26,6 +26,7 @@ digraph "g" {
 "container1_start_0 18node2" -> "lxc1_start_0 18node2" [ style = bold]
 "container1_start_0 18node2" [ style=bold color="green" fontcolor="black"]
 "container1_stop_0 18node2" -> "B_stop_0 lxc1" [ style = bold]
+"container1_stop_0 18node2" -> "M-clone_stop_0" [ style = bold]
 "container1_stop_0 18node2" -> "M_stop_0 lxc1" [ style = bold]
 "container1_stop_0 18node2" -> "all_stopped" [ style = bold]
 "container1_stop_0 18node2" -> "container1_start_0 18node2" [ style = bold]
diff --git a/pengine/test10/whitebox-fail2.exp b/pengine/test10/whitebox-fail2.exp
index 834b231..01bb142 100644
--- a/pengine/test10/whitebox-fail2.exp
+++ b/pengine/test10/whitebox-fail2.exp
@@ -96,7 +96,11 @@
         <attributes CRM_meta_clone_max="5" CRM_meta_clone_node_max="1" CRM_meta_globally_unique="false" CRM_meta_notify="false" CRM_meta_timeout="20000" />
       </pseudo_event>
     </action_set>
-    <inputs/>
+    <inputs>
+      <trigger>
+        <rsc_op id="5" operation="stop" operation_key="container1_stop_0" on_node="18node2" on_node_uuid="2"/>
+      </trigger>
+    </inputs>
   </synapse>
   <synapse id="7" priority="1000000">
     <action_set>
diff --git a/pengine/test10/whitebox-fail2.summary b/pengine/test10/whitebox-fail2.summary
index 338173d..ab40d99 100644
--- a/pengine/test10/whitebox-fail2.summary
+++ b/pengine/test10/whitebox-fail2.summary
@@ -20,17 +20,17 @@ Transition Summary:
  * Recover lxc1	(Started 18node2)
 
 Executing cluster transition:
- * Pseudo action:   M-clone_stop_0
  * Resource action: lxc1            stop on 18node2
  * Resource action: container1      stop on 18node2
+ * Pseudo action:   M-clone_stop_0
+ * Pseudo action:   B_stop_0
+ * Resource action: container1      start on 18node2
  * Pseudo action:   M_stop_0
  * Pseudo action:   M-clone_stopped_0
  * Pseudo action:   M-clone_start_0
- * Pseudo action:   B_stop_0
- * Pseudo action:   all_stopped
- * Resource action: container1      start on 18node2
  * Resource action: lxc1            start on 18node2
  * Resource action: lxc1            monitor=30000 on 18node2
+ * Pseudo action:   all_stopped
  * Resource action: M               start on lxc1
  * Pseudo action:   M-clone_running_0
  * Resource action: B               start on lxc1