From: Andrew Beekhof Date: Tue, 18 Aug 2015 10:30:49 +1000 Subject: [PATCH] Fix: PE: Bug cl#5247 - Imply resources running on a container are stopped when the container is stopped (cherry picked from commit e10eff1902d5b451454e2d467ee337c964f536ab) --- lib/pengine/unpack.c | 29 ++++++++++++++++++++--------- pengine/allocate.c | 17 +++++++++++++++++ pengine/graph.c | 7 ++++++- pengine/test10/bug-rh-1097457.dot | 2 ++ pengine/test10/bug-rh-1097457.exp | 12 ++++++++++-- pengine/test10/bug-rh-1097457.summary | 10 +++++----- pengine/test10/whitebox-fail1.dot | 1 + pengine/test10/whitebox-fail1.exp | 6 +++++- pengine/test10/whitebox-fail1.summary | 8 ++++---- pengine/test10/whitebox-fail2.dot | 1 + pengine/test10/whitebox-fail2.exp | 6 +++++- pengine/test10/whitebox-fail2.summary | 8 ++++---- 12 files changed, 80 insertions(+), 27 deletions(-) diff --git a/lib/pengine/unpack.c b/lib/pengine/unpack.c index 106c674..0f83be4 100644 --- a/lib/pengine/unpack.c +++ b/lib/pengine/unpack.c @@ -44,7 +44,7 @@ CRM_TRACE_INIT_DATA(pe_status); gboolean unpack_rsc_op(resource_t * rsc, node_t * node, xmlNode * xml_op, enum action_fail_response *failed, pe_working_set_t * data_set); -static gboolean determine_remote_online_status(node_t * this_node); +static gboolean determine_remote_online_status(pe_working_set_t * data_set, node_t * this_node); static gboolean is_dangling_container_remote_node(node_t *node) @@ -73,6 +73,8 @@ pe_fence_node(pe_working_set_t * data_set, node_t * node, const char *reason) if (is_set(rsc->flags, pe_rsc_failed) == FALSE) { crm_warn("Remote node %s will be fenced by recovering container resource %s", node->details->uname, rsc->id, reason); + /* node->details->unclean = TRUE; */ + node->details->remote_requires_reset = TRUE; set_bit(rsc->flags, pe_rsc_failed); } } else if (is_dangling_container_remote_node(node)) { @@ -1157,7 +1159,7 @@ unpack_remote_status(xmlNode * status, pe_working_set_t * data_set) if ((this_node == NULL) || (is_remote_node(this_node) == FALSE)) { continue; } - determine_remote_online_status(this_node); + determine_remote_online_status(data_set, this_node); } /* process attributes */ @@ -1366,7 +1368,7 @@ determine_online_status_fencing(pe_working_set_t * data_set, xmlNode * node_stat } static gboolean -determine_remote_online_status(node_t * this_node) +determine_remote_online_status(pe_working_set_t * data_set, node_t * this_node) { resource_t *rsc = this_node->details->remote_rsc; resource_t *container = NULL; @@ -1393,13 +1395,21 @@ determine_remote_online_status(node_t * this_node) } /* Now check all the failure conditions. */ - if (is_set(rsc->flags, pe_rsc_failed) || - (rsc->role == RSC_ROLE_STOPPED) || - (container && is_set(container->flags, pe_rsc_failed)) || - (container && container->role == RSC_ROLE_STOPPED)) { + if(container && is_set(container->flags, pe_rsc_failed)) { + crm_trace("Remote node %s is set to UNCLEAN. rsc failed.", this_node->details->id); + this_node->details->online = FALSE; + this_node->details->remote_requires_reset = TRUE; - crm_trace("Remote node %s is set to OFFLINE. node is stopped or rsc failed.", this_node->details->id); + } else if(is_set(rsc->flags, pe_rsc_failed)) { + crm_trace("Remote node %s is set to OFFLINE. rsc failed.", this_node->details->id); this_node->details->online = FALSE; + + } else if (rsc->role == RSC_ROLE_STOPPED + || (container && container->role == RSC_ROLE_STOPPED)) { + + crm_trace("Remote node %s is set to OFFLINE. node is stopped.", this_node->details->id); + this_node->details->online = FALSE; + this_node->details->remote_requires_reset = FALSE; } remote_online_done: @@ -3375,7 +3385,8 @@ find_operations(const char *rsc, const char *node, gboolean active_filter, continue; } else if (is_remote_node(this_node)) { - determine_remote_online_status(this_node); + determine_remote_online_status(data_set, this_node); + } else { determine_online_status(node_state, this_node, data_set); } diff --git a/pengine/allocate.c b/pengine/allocate.c index c2e56f9..65ae05d 100644 --- a/pengine/allocate.c +++ b/pengine/allocate.c @@ -1406,6 +1406,23 @@ stage6(pe_working_set_t * data_set) /* remote-nodes associated with a container resource (such as a vm) are not fenced */ if (is_container_remote_node(node)) { + /* Guest */ + if (need_stonith + && node->details->remote_requires_reset + && pe_can_fence(data_set, node)) { + resource_t *container = node->details->remote_rsc->container; + char *key = stop_key(container); + GListPtr stop_list = find_actions(container->actions, key, NULL); + + crm_info("Impliying node %s is down when container %s is stopped (%p)", + node->details->uname, container->id, stop_list); + if(stop_list) { + stonith_constraints(node, stop_list->data, data_set); + } + + g_list_free(stop_list); + free(key); + } continue; } diff --git a/pengine/graph.c b/pengine/graph.c index 3d832f0..a50f15b 100644 --- a/pengine/graph.c +++ b/pengine/graph.c @@ -697,7 +697,12 @@ stonith_constraints(node_t * node, action_t * stonith_op, pe_working_set_t * dat for (lpc = data_set->resources; lpc != NULL; lpc = lpc->next) { resource_t *rsc = (resource_t *) lpc->data; - rsc_stonith_ordering(rsc, stonith_op, data_set); + if(stonith_op->rsc == NULL) { + rsc_stonith_ordering(rsc, stonith_op, data_set); + + } else if(stonith_op->rsc != rsc && stonith_op->rsc != rsc->container) { + rsc_stonith_ordering(rsc, stonith_op, data_set); + } } } diff --git a/pengine/test10/bug-rh-1097457.dot b/pengine/test10/bug-rh-1097457.dot index 666099c..078d177 100644 --- a/pengine/test10/bug-rh-1097457.dot +++ b/pengine/test10/bug-rh-1097457.dot @@ -49,10 +49,12 @@ digraph "g" { "VM2_start_0 lama3" [ style=bold color="green" fontcolor="black"] "VM2_stop_0 lama3" -> "FAKE4-IP_stop_0 lamaVM2" [ style = bold] "VM2_stop_0 lama3" -> "FAKE4_stop_0 lamaVM2" [ style = bold] +"VM2_stop_0 lama3" -> "FAKE6-clone_stop_0" [ style = bold] "VM2_stop_0 lama3" -> "FAKE6_stop_0 lamaVM2" [ style = bold] "VM2_stop_0 lama3" -> "FSlun3_stop_0 lamaVM2" [ style = bold] "VM2_stop_0 lama3" -> "VM2_start_0 lama3" [ style = bold] "VM2_stop_0 lama3" -> "all_stopped" [ style = bold] +"VM2_stop_0 lama3" -> "lamaVM2-G4_stop_0" [ style = bold] "VM2_stop_0 lama3" [ style=bold color="green" fontcolor="black"] "all_stopped" [ style=bold color="green" fontcolor="orange"] "lamaVM2-G4_running_0" [ style=bold color="green" fontcolor="orange"] diff --git a/pengine/test10/bug-rh-1097457.exp b/pengine/test10/bug-rh-1097457.exp index 36af9f3..175f413 100644 --- a/pengine/test10/bug-rh-1097457.exp +++ b/pengine/test10/bug-rh-1097457.exp @@ -119,7 +119,11 @@ - + + + + + @@ -331,7 +335,11 @@ - + + + + + diff --git a/pengine/test10/bug-rh-1097457.summary b/pengine/test10/bug-rh-1097457.summary index e2f235d..c8751ae 100644 --- a/pengine/test10/bug-rh-1097457.summary +++ b/pengine/test10/bug-rh-1097457.summary @@ -39,17 +39,17 @@ Transition Summary: * Restart lamaVM2 (Started lama3) Executing cluster transition: - * Pseudo action: lamaVM2-G4_stop_0 - * Pseudo action: FAKE6-clone_stop_0 * Resource action: lamaVM2 stop on lama3 * Resource action: VM2 stop on lama3 + * Pseudo action: lamaVM2-G4_stop_0 * Pseudo action: FAKE4-IP_stop_0 - * Pseudo action: FAKE6_stop_0 - * Pseudo action: FAKE6-clone_stopped_0 - * Pseudo action: FAKE6-clone_start_0 + * Pseudo action: FAKE6-clone_stop_0 * Resource action: VM2 start on lama3 * Resource action: VM2 monitor=10000 on lama3 * Pseudo action: FAKE4_stop_0 + * Pseudo action: FAKE6_stop_0 + * Pseudo action: FAKE6-clone_stopped_0 + * Pseudo action: FAKE6-clone_start_0 * Resource action: lamaVM2 start on lama3 * Resource action: lamaVM2 monitor=30000 on lama3 * Resource action: FSlun3 monitor=10000 on lamaVM2 diff --git a/pengine/test10/whitebox-fail1.dot b/pengine/test10/whitebox-fail1.dot index b595015..0f0fe26 100644 --- a/pengine/test10/whitebox-fail1.dot +++ b/pengine/test10/whitebox-fail1.dot @@ -26,6 +26,7 @@ digraph "g" { "container1_start_0 18node2" -> "lxc1_start_0 18node2" [ style = bold] "container1_start_0 18node2" [ style=bold color="green" fontcolor="black"] "container1_stop_0 18node2" -> "B_stop_0 lxc1" [ style = bold] +"container1_stop_0 18node2" -> "M-clone_stop_0" [ style = bold] "container1_stop_0 18node2" -> "M_stop_0 lxc1" [ style = bold] "container1_stop_0 18node2" -> "all_stopped" [ style = bold] "container1_stop_0 18node2" -> "container1_start_0 18node2" [ style = bold] diff --git a/pengine/test10/whitebox-fail1.exp b/pengine/test10/whitebox-fail1.exp index 834b231..01bb142 100644 --- a/pengine/test10/whitebox-fail1.exp +++ b/pengine/test10/whitebox-fail1.exp @@ -96,7 +96,11 @@ - + + + + + diff --git a/pengine/test10/whitebox-fail1.summary b/pengine/test10/whitebox-fail1.summary index 5e5887b..1586407 100644 --- a/pengine/test10/whitebox-fail1.summary +++ b/pengine/test10/whitebox-fail1.summary @@ -20,17 +20,17 @@ Transition Summary: * Restart lxc1 (Started 18node2) Executing cluster transition: - * Pseudo action: M-clone_stop_0 * Resource action: lxc1 stop on 18node2 * Resource action: container1 stop on 18node2 + * Pseudo action: M-clone_stop_0 + * Pseudo action: B_stop_0 + * Resource action: container1 start on 18node2 * Pseudo action: M_stop_0 * Pseudo action: M-clone_stopped_0 * Pseudo action: M-clone_start_0 - * Pseudo action: B_stop_0 - * Pseudo action: all_stopped - * Resource action: container1 start on 18node2 * Resource action: lxc1 start on 18node2 * Resource action: lxc1 monitor=30000 on 18node2 + * Pseudo action: all_stopped * Resource action: M start on lxc1 * Pseudo action: M-clone_running_0 * Resource action: B start on lxc1 diff --git a/pengine/test10/whitebox-fail2.dot b/pengine/test10/whitebox-fail2.dot index b595015..0f0fe26 100644 --- a/pengine/test10/whitebox-fail2.dot +++ b/pengine/test10/whitebox-fail2.dot @@ -26,6 +26,7 @@ digraph "g" { "container1_start_0 18node2" -> "lxc1_start_0 18node2" [ style = bold] "container1_start_0 18node2" [ style=bold color="green" fontcolor="black"] "container1_stop_0 18node2" -> "B_stop_0 lxc1" [ style = bold] +"container1_stop_0 18node2" -> "M-clone_stop_0" [ style = bold] "container1_stop_0 18node2" -> "M_stop_0 lxc1" [ style = bold] "container1_stop_0 18node2" -> "all_stopped" [ style = bold] "container1_stop_0 18node2" -> "container1_start_0 18node2" [ style = bold] diff --git a/pengine/test10/whitebox-fail2.exp b/pengine/test10/whitebox-fail2.exp index 834b231..01bb142 100644 --- a/pengine/test10/whitebox-fail2.exp +++ b/pengine/test10/whitebox-fail2.exp @@ -96,7 +96,11 @@ - + + + + + diff --git a/pengine/test10/whitebox-fail2.summary b/pengine/test10/whitebox-fail2.summary index 338173d..ab40d99 100644 --- a/pengine/test10/whitebox-fail2.summary +++ b/pengine/test10/whitebox-fail2.summary @@ -20,17 +20,17 @@ Transition Summary: * Recover lxc1 (Started 18node2) Executing cluster transition: - * Pseudo action: M-clone_stop_0 * Resource action: lxc1 stop on 18node2 * Resource action: container1 stop on 18node2 + * Pseudo action: M-clone_stop_0 + * Pseudo action: B_stop_0 + * Resource action: container1 start on 18node2 * Pseudo action: M_stop_0 * Pseudo action: M-clone_stopped_0 * Pseudo action: M-clone_start_0 - * Pseudo action: B_stop_0 - * Pseudo action: all_stopped - * Resource action: container1 start on 18node2 * Resource action: lxc1 start on 18node2 * Resource action: lxc1 monitor=30000 on 18node2 + * Pseudo action: all_stopped * Resource action: M start on lxc1 * Pseudo action: M-clone_running_0 * Resource action: B start on lxc1