From: Andrew Beekhof <andrew@beekhof.net>
Date: Tue, 18 Aug 2015 10:30:49 +1000
Subject: [PATCH] Fix: PE: Bug cl#5247 - Imply resources running on a container
are stopped when the container is stopped
(cherry picked from commit e10eff1902d5b451454e2d467ee337c964f536ab)
---
lib/pengine/unpack.c | 29 ++++++++++++++++++++---------
pengine/allocate.c | 17 +++++++++++++++++
pengine/graph.c | 7 ++++++-
pengine/test10/bug-rh-1097457.dot | 2 ++
pengine/test10/bug-rh-1097457.exp | 12 ++++++++++--
pengine/test10/bug-rh-1097457.summary | 10 +++++-----
pengine/test10/whitebox-fail1.dot | 1 +
pengine/test10/whitebox-fail1.exp | 6 +++++-
pengine/test10/whitebox-fail1.summary | 8 ++++----
pengine/test10/whitebox-fail2.dot | 1 +
pengine/test10/whitebox-fail2.exp | 6 +++++-
pengine/test10/whitebox-fail2.summary | 8 ++++----
12 files changed, 80 insertions(+), 27 deletions(-)
diff --git a/lib/pengine/unpack.c b/lib/pengine/unpack.c
index 106c674..0f83be4 100644
--- a/lib/pengine/unpack.c
+++ b/lib/pengine/unpack.c
@@ -44,7 +44,7 @@ CRM_TRACE_INIT_DATA(pe_status);
gboolean unpack_rsc_op(resource_t * rsc, node_t * node, xmlNode * xml_op,
enum action_fail_response *failed, pe_working_set_t * data_set);
-static gboolean determine_remote_online_status(node_t * this_node);
+static gboolean determine_remote_online_status(pe_working_set_t * data_set, node_t * this_node);
static gboolean
is_dangling_container_remote_node(node_t *node)
@@ -73,6 +73,8 @@ pe_fence_node(pe_working_set_t * data_set, node_t * node, const char *reason)
if (is_set(rsc->flags, pe_rsc_failed) == FALSE) {
crm_warn("Remote node %s will be fenced by recovering container resource %s",
node->details->uname, rsc->id, reason);
+ /* node->details->unclean = TRUE; */
+ node->details->remote_requires_reset = TRUE;
set_bit(rsc->flags, pe_rsc_failed);
}
} else if (is_dangling_container_remote_node(node)) {
@@ -1157,7 +1159,7 @@ unpack_remote_status(xmlNode * status, pe_working_set_t * data_set)
if ((this_node == NULL) || (is_remote_node(this_node) == FALSE)) {
continue;
}
- determine_remote_online_status(this_node);
+ determine_remote_online_status(data_set, this_node);
}
/* process attributes */
@@ -1366,7 +1368,7 @@ determine_online_status_fencing(pe_working_set_t * data_set, xmlNode * node_stat
}
static gboolean
-determine_remote_online_status(node_t * this_node)
+determine_remote_online_status(pe_working_set_t * data_set, node_t * this_node)
{
resource_t *rsc = this_node->details->remote_rsc;
resource_t *container = NULL;
@@ -1393,13 +1395,21 @@ determine_remote_online_status(node_t * this_node)
}
/* Now check all the failure conditions. */
- if (is_set(rsc->flags, pe_rsc_failed) ||
- (rsc->role == RSC_ROLE_STOPPED) ||
- (container && is_set(container->flags, pe_rsc_failed)) ||
- (container && container->role == RSC_ROLE_STOPPED)) {
+ if(container && is_set(container->flags, pe_rsc_failed)) {
+ crm_trace("Remote node %s is set to UNCLEAN. container failed.", this_node->details->id);
+ this_node->details->online = FALSE;
+ this_node->details->remote_requires_reset = TRUE;
- crm_trace("Remote node %s is set to OFFLINE. node is stopped or rsc failed.", this_node->details->id);
+ } else if(is_set(rsc->flags, pe_rsc_failed)) {
+ crm_trace("Remote node %s is set to OFFLINE. rsc failed.", this_node->details->id);
this_node->details->online = FALSE;
+
+ } else if (rsc->role == RSC_ROLE_STOPPED
+ || (container && container->role == RSC_ROLE_STOPPED)) {
+
+ crm_trace("Remote node %s is set to OFFLINE. node is stopped.", this_node->details->id);
+ this_node->details->online = FALSE;
+ this_node->details->remote_requires_reset = FALSE;
}
remote_online_done:
@@ -3375,7 +3385,8 @@ find_operations(const char *rsc, const char *node, gboolean active_filter,
continue;
} else if (is_remote_node(this_node)) {
- determine_remote_online_status(this_node);
+ determine_remote_online_status(data_set, this_node);
+
} else {
determine_online_status(node_state, this_node, data_set);
}
diff --git a/pengine/allocate.c b/pengine/allocate.c
index c2e56f9..65ae05d 100644
--- a/pengine/allocate.c
+++ b/pengine/allocate.c
@@ -1406,6 +1406,23 @@ stage6(pe_working_set_t * data_set)
/* remote-nodes associated with a container resource (such as a vm) are not fenced */
if (is_container_remote_node(node)) {
+ /* Guest */
+ if (need_stonith
+ && node->details->remote_requires_reset
+ && pe_can_fence(data_set, node)) {
+ resource_t *container = node->details->remote_rsc->container;
+ char *key = stop_key(container);
+ GListPtr stop_list = find_actions(container->actions, key, NULL);
+
+ crm_info("Implying node %s is down when container %s is stopped (%p)",
+ node->details->uname, container->id, stop_list);
+ if(stop_list) {
+ stonith_constraints(node, stop_list->data, data_set);
+ }
+
+ g_list_free(stop_list);
+ free(key);
+ }
continue;
}
diff --git a/pengine/graph.c b/pengine/graph.c
index 3d832f0..a50f15b 100644
--- a/pengine/graph.c
+++ b/pengine/graph.c
@@ -697,7 +697,12 @@ stonith_constraints(node_t * node, action_t * stonith_op, pe_working_set_t * dat
for (lpc = data_set->resources; lpc != NULL; lpc = lpc->next) {
resource_t *rsc = (resource_t *) lpc->data;
- rsc_stonith_ordering(rsc, stonith_op, data_set);
+ if(stonith_op->rsc == NULL) {
+ rsc_stonith_ordering(rsc, stonith_op, data_set);
+
+ } else if(stonith_op->rsc != rsc && stonith_op->rsc != rsc->container) {
+ rsc_stonith_ordering(rsc, stonith_op, data_set);
+ }
}
}
diff --git a/pengine/test10/bug-rh-1097457.dot b/pengine/test10/bug-rh-1097457.dot
index 666099c..078d177 100644
--- a/pengine/test10/bug-rh-1097457.dot
+++ b/pengine/test10/bug-rh-1097457.dot
@@ -49,10 +49,12 @@ digraph "g" {
"VM2_start_0 lama3" [ style=bold color="green" fontcolor="black"]
"VM2_stop_0 lama3" -> "FAKE4-IP_stop_0 lamaVM2" [ style = bold]
"VM2_stop_0 lama3" -> "FAKE4_stop_0 lamaVM2" [ style = bold]
+"VM2_stop_0 lama3" -> "FAKE6-clone_stop_0" [ style = bold]
"VM2_stop_0 lama3" -> "FAKE6_stop_0 lamaVM2" [ style = bold]
"VM2_stop_0 lama3" -> "FSlun3_stop_0 lamaVM2" [ style = bold]
"VM2_stop_0 lama3" -> "VM2_start_0 lama3" [ style = bold]
"VM2_stop_0 lama3" -> "all_stopped" [ style = bold]
+"VM2_stop_0 lama3" -> "lamaVM2-G4_stop_0" [ style = bold]
"VM2_stop_0 lama3" [ style=bold color="green" fontcolor="black"]
"all_stopped" [ style=bold color="green" fontcolor="orange"]
"lamaVM2-G4_running_0" [ style=bold color="green" fontcolor="orange"]
diff --git a/pengine/test10/bug-rh-1097457.exp b/pengine/test10/bug-rh-1097457.exp
index 36af9f3..175f413 100644
--- a/pengine/test10/bug-rh-1097457.exp
+++ b/pengine/test10/bug-rh-1097457.exp
@@ -119,7 +119,11 @@
<attributes CRM_meta_timeout="20000" />
</pseudo_event>
</action_set>
- <inputs/>
+ <inputs>
+ <trigger>
+ <rsc_op id="40" operation="stop" operation_key="VM2_stop_0" on_node="lama3" on_node_uuid="2"/>
+ </trigger>
+ </inputs>
</synapse>
<synapse id="9">
<action_set>
@@ -331,7 +335,11 @@
<attributes CRM_meta_clone_max="3" CRM_meta_clone_node_max="1" CRM_meta_globally_unique="false" CRM_meta_notify="false" CRM_meta_timeout="20000" />
</pseudo_event>
</action_set>
- <inputs/>
+ <inputs>
+ <trigger>
+ <rsc_op id="40" operation="stop" operation_key="VM2_stop_0" on_node="lama3" on_node_uuid="2"/>
+ </trigger>
+ </inputs>
</synapse>
<synapse id="22" priority="1000000">
<action_set>
diff --git a/pengine/test10/bug-rh-1097457.summary b/pengine/test10/bug-rh-1097457.summary
index e2f235d..c8751ae 100644
--- a/pengine/test10/bug-rh-1097457.summary
+++ b/pengine/test10/bug-rh-1097457.summary
@@ -39,17 +39,17 @@ Transition Summary:
* Restart lamaVM2 (Started lama3)
Executing cluster transition:
- * Pseudo action: lamaVM2-G4_stop_0
- * Pseudo action: FAKE6-clone_stop_0
* Resource action: lamaVM2 stop on lama3
* Resource action: VM2 stop on lama3
+ * Pseudo action: lamaVM2-G4_stop_0
* Pseudo action: FAKE4-IP_stop_0
- * Pseudo action: FAKE6_stop_0
- * Pseudo action: FAKE6-clone_stopped_0
- * Pseudo action: FAKE6-clone_start_0
+ * Pseudo action: FAKE6-clone_stop_0
* Resource action: VM2 start on lama3
* Resource action: VM2 monitor=10000 on lama3
* Pseudo action: FAKE4_stop_0
+ * Pseudo action: FAKE6_stop_0
+ * Pseudo action: FAKE6-clone_stopped_0
+ * Pseudo action: FAKE6-clone_start_0
* Resource action: lamaVM2 start on lama3
* Resource action: lamaVM2 monitor=30000 on lama3
* Resource action: FSlun3 monitor=10000 on lamaVM2
diff --git a/pengine/test10/whitebox-fail1.dot b/pengine/test10/whitebox-fail1.dot
index b595015..0f0fe26 100644
--- a/pengine/test10/whitebox-fail1.dot
+++ b/pengine/test10/whitebox-fail1.dot
@@ -26,6 +26,7 @@ digraph "g" {
"container1_start_0 18node2" -> "lxc1_start_0 18node2" [ style = bold]
"container1_start_0 18node2" [ style=bold color="green" fontcolor="black"]
"container1_stop_0 18node2" -> "B_stop_0 lxc1" [ style = bold]
+"container1_stop_0 18node2" -> "M-clone_stop_0" [ style = bold]
"container1_stop_0 18node2" -> "M_stop_0 lxc1" [ style = bold]
"container1_stop_0 18node2" -> "all_stopped" [ style = bold]
"container1_stop_0 18node2" -> "container1_start_0 18node2" [ style = bold]
diff --git a/pengine/test10/whitebox-fail1.exp b/pengine/test10/whitebox-fail1.exp
index 834b231..01bb142 100644
--- a/pengine/test10/whitebox-fail1.exp
+++ b/pengine/test10/whitebox-fail1.exp
@@ -96,7 +96,11 @@
<attributes CRM_meta_clone_max="5" CRM_meta_clone_node_max="1" CRM_meta_globally_unique="false" CRM_meta_notify="false" CRM_meta_timeout="20000" />
</pseudo_event>
</action_set>
- <inputs/>
+ <inputs>
+ <trigger>
+ <rsc_op id="5" operation="stop" operation_key="container1_stop_0" on_node="18node2" on_node_uuid="2"/>
+ </trigger>
+ </inputs>
</synapse>
<synapse id="7" priority="1000000">
<action_set>
diff --git a/pengine/test10/whitebox-fail1.summary b/pengine/test10/whitebox-fail1.summary
index 5e5887b..1586407 100644
--- a/pengine/test10/whitebox-fail1.summary
+++ b/pengine/test10/whitebox-fail1.summary
@@ -20,17 +20,17 @@ Transition Summary:
* Restart lxc1 (Started 18node2)
Executing cluster transition:
- * Pseudo action: M-clone_stop_0
* Resource action: lxc1 stop on 18node2
* Resource action: container1 stop on 18node2
+ * Pseudo action: M-clone_stop_0
+ * Pseudo action: B_stop_0
+ * Resource action: container1 start on 18node2
* Pseudo action: M_stop_0
* Pseudo action: M-clone_stopped_0
* Pseudo action: M-clone_start_0
- * Pseudo action: B_stop_0
- * Pseudo action: all_stopped
- * Resource action: container1 start on 18node2
* Resource action: lxc1 start on 18node2
* Resource action: lxc1 monitor=30000 on 18node2
+ * Pseudo action: all_stopped
* Resource action: M start on lxc1
* Pseudo action: M-clone_running_0
* Resource action: B start on lxc1
diff --git a/pengine/test10/whitebox-fail2.dot b/pengine/test10/whitebox-fail2.dot
index b595015..0f0fe26 100644
--- a/pengine/test10/whitebox-fail2.dot
+++ b/pengine/test10/whitebox-fail2.dot
@@ -26,6 +26,7 @@ digraph "g" {
"container1_start_0 18node2" -> "lxc1_start_0 18node2" [ style = bold]
"container1_start_0 18node2" [ style=bold color="green" fontcolor="black"]
"container1_stop_0 18node2" -> "B_stop_0 lxc1" [ style = bold]
+"container1_stop_0 18node2" -> "M-clone_stop_0" [ style = bold]
"container1_stop_0 18node2" -> "M_stop_0 lxc1" [ style = bold]
"container1_stop_0 18node2" -> "all_stopped" [ style = bold]
"container1_stop_0 18node2" -> "container1_start_0 18node2" [ style = bold]
diff --git a/pengine/test10/whitebox-fail2.exp b/pengine/test10/whitebox-fail2.exp
index 834b231..01bb142 100644
--- a/pengine/test10/whitebox-fail2.exp
+++ b/pengine/test10/whitebox-fail2.exp
@@ -96,7 +96,11 @@
<attributes CRM_meta_clone_max="5" CRM_meta_clone_node_max="1" CRM_meta_globally_unique="false" CRM_meta_notify="false" CRM_meta_timeout="20000" />
</pseudo_event>
</action_set>
- <inputs/>
+ <inputs>
+ <trigger>
+ <rsc_op id="5" operation="stop" operation_key="container1_stop_0" on_node="18node2" on_node_uuid="2"/>
+ </trigger>
+ </inputs>
</synapse>
<synapse id="7" priority="1000000">
<action_set>
diff --git a/pengine/test10/whitebox-fail2.summary b/pengine/test10/whitebox-fail2.summary
index 338173d..ab40d99 100644
--- a/pengine/test10/whitebox-fail2.summary
+++ b/pengine/test10/whitebox-fail2.summary
@@ -20,17 +20,17 @@ Transition Summary:
* Recover lxc1 (Started 18node2)
Executing cluster transition:
- * Pseudo action: M-clone_stop_0
* Resource action: lxc1 stop on 18node2
* Resource action: container1 stop on 18node2
+ * Pseudo action: M-clone_stop_0
+ * Pseudo action: B_stop_0
+ * Resource action: container1 start on 18node2
* Pseudo action: M_stop_0
* Pseudo action: M-clone_stopped_0
* Pseudo action: M-clone_start_0
- * Pseudo action: B_stop_0
- * Pseudo action: all_stopped
- * Resource action: container1 start on 18node2
* Resource action: lxc1 start on 18node2
* Resource action: lxc1 monitor=30000 on 18node2
+ * Pseudo action: all_stopped
* Resource action: M start on lxc1
* Pseudo action: M-clone_running_0
* Resource action: B start on lxc1