From 73da74305b69b086f8bc7cae697063e2534a79f4 Mon Sep 17 00:00:00 2001 From: Ken Gaillot Date: Tue, 22 Nov 2016 16:37:07 -0600 Subject: [PATCH 1/8] Low: pengine: remove unnecessary assert it was made obsolete with 1420ff88 --- lib/pengine/unpack.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/lib/pengine/unpack.c b/lib/pengine/unpack.c index a9fbcc0..2ef9343 100644 --- a/lib/pengine/unpack.c +++ b/lib/pengine/unpack.c @@ -1402,6 +1402,10 @@ determine_remote_online_status(pe_working_set_t * data_set, node_t * this_node) resource_t *rsc = this_node->details->remote_rsc; resource_t *container = NULL; + /* If there is a node state entry for a (former) Pacemaker Remote node + * but no resource creating that node, the node's connection resource will + * be NULL. Consider it an offline remote node in that case. + */ if (rsc == NULL) { this_node->details->online = FALSE; goto remote_online_done; @@ -1409,8 +1413,6 @@ determine_remote_online_status(pe_working_set_t * data_set, node_t * this_node) container = rsc->container; - CRM_ASSERT(rsc != NULL); - /* If the resource is currently started, mark it online. */ if (rsc->role == RSC_ROLE_STARTED) { crm_trace("Remote node %s is set to ONLINE. 
role == started", this_node->details->id); -- 1.8.3.1 From 5156074d560d85ee84de31b9d1e0bd893999fa4e Mon Sep 17 00:00:00 2001 From: Ken Gaillot Date: Wed, 23 Nov 2016 13:40:47 -0600 Subject: [PATCH 2/8] Log: pengine: improve trace messages for Pacemaker Remote nodes --- lib/pengine/unpack.c | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/lib/pengine/unpack.c b/lib/pengine/unpack.c index 2ef9343..a49e108 100644 --- a/lib/pengine/unpack.c +++ b/lib/pengine/unpack.c @@ -70,10 +70,15 @@ pe_fence_node(pe_working_set_t * data_set, node_t * node, const char *reason) /* A guest node is fenced by marking its container as failed */ if (is_container_remote_node(node)) { resource_t *rsc = node->details->remote_rsc->container; + if (is_set(rsc->flags, pe_rsc_failed) == FALSE) { crm_warn("Guest node %s will be fenced (by recovering %s) %s", node->details->uname, rsc->id, reason); - /* node->details->unclean = TRUE; */ + + /* We don't mark the node as unclean, because that would prevent the + * node from running resources. We want to allow it to run resources + * in this transition if the recovery succeeds. + */ node->details->remote_requires_reset = TRUE; set_bit(rsc->flags, pe_rsc_failed); } @@ -1415,30 +1420,35 @@ determine_remote_online_status(pe_working_set_t * data_set, node_t * this_node) /* If the resource is currently started, mark it online. */ if (rsc->role == RSC_ROLE_STARTED) { - crm_trace("Remote node %s is set to ONLINE. role == started", this_node->details->id); + crm_trace("%s node %s presumed ONLINE because connection resource is started", + (container? "Guest" : "Remote"), this_node->details->id); this_node->details->online = TRUE; } /* consider this node shutting down if transitioning start->stop */ if (rsc->role == RSC_ROLE_STARTED && rsc->next_role == RSC_ROLE_STOPPED) { - crm_trace("Remote node %s shutdown. 
transition from start to stop role", this_node->details->id); + crm_trace("%s node %s shutting down because connection resource is stopping", + (container? "Guest" : "Remote"), this_node->details->id); this_node->details->shutdown = TRUE; } /* Now check all the failure conditions. */ if(container && is_set(container->flags, pe_rsc_failed)) { - crm_trace("Remote node %s is set to UNCLEAN. rsc failed.", this_node->details->id); + crm_trace("Guest node %s UNCLEAN because guest resource failed", + this_node->details->id); this_node->details->online = FALSE; this_node->details->remote_requires_reset = TRUE; } else if(is_set(rsc->flags, pe_rsc_failed)) { - crm_trace("Remote node %s is set to OFFLINE. rsc failed.", this_node->details->id); + crm_trace("%s node %s OFFLINE because connection resource failed", + (container? "Guest" : "Remote"), this_node->details->id); this_node->details->online = FALSE; } else if (rsc->role == RSC_ROLE_STOPPED || (container && container->role == RSC_ROLE_STOPPED)) { - crm_trace("Remote node %s is set to OFFLINE. node is stopped.", this_node->details->id); + crm_trace("%s node %s OFFLINE because its resource is stopped", + (container? 
"Guest" : "Remote"), this_node->details->id); this_node->details->online = FALSE; this_node->details->remote_requires_reset = FALSE; } -- 1.8.3.1 From 225d20cacc5643e113d42159fc713071172d88da Mon Sep 17 00:00:00 2001 From: Ken Gaillot Date: Tue, 22 Nov 2016 16:40:52 -0600 Subject: [PATCH 3/8] Fix: pengine: guest node fencing doesn't require stonith enabled Comments elsewhere say as much, but stage6() didn't get the memo --- lib/pengine/utils.c | 13 +++++++++++++ pengine/allocate.c | 5 +---- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/lib/pengine/utils.c b/lib/pengine/utils.c index cc97db1..6be9bb1 100644 --- a/lib/pengine/utils.c +++ b/lib/pengine/utils.c @@ -36,6 +36,19 @@ void unpack_operation(action_t * action, xmlNode * xml_obj, resource_t * contain static xmlNode *find_rsc_op_entry_helper(resource_t * rsc, const char *key, gboolean include_disabled); +/*! + * \internal + * \brief Check whether we can fence a particular node + * + * \param[in] data_set Working set for cluster + * \param[in] node Name of node to check + * + * \return TRUE if node can be fenced, FALSE otherwise + * + * \note This function should only be called for cluster nodes and baremetal + * remote nodes; guest nodes are fenced by stopping their container + * resource, so fence execution requirements do not apply to them. + */ bool pe_can_fence(pe_working_set_t * data_set, node_t *node) { if(is_not_set(data_set->flags, pe_flag_stonith_enabled)) { diff --git a/pengine/allocate.c b/pengine/allocate.c index 82abd36..bdf03e5 100644 --- a/pengine/allocate.c +++ b/pengine/allocate.c @@ -1374,10 +1374,7 @@ stage6(pe_working_set_t * data_set) * guest's host. 
*/ if (is_container_remote_node(node)) { - /* Guest */ - if (need_stonith - && node->details->remote_requires_reset - && pe_can_fence(data_set, node)) { + if (node->details->remote_requires_reset && need_stonith) { resource_t *container = node->details->remote_rsc->container; char *key = stop_key(container); GListPtr stop_list = find_actions(container->actions, key, NULL); -- 1.8.3.1 From b11887869723f23a330af8b1b0e9ffd935b68ae0 Mon Sep 17 00:00:00 2001 From: Ken Gaillot Date: Wed, 23 Mar 2016 17:57:50 -0500 Subject: [PATCH 4/8] Test: pengine: add regression test for when a guest node's host goes away As of this commit, the PE handles this situation badly. Adding the test before the fix allows the changes in behavior to be highlighted. --- pengine/regression.sh | 1 + pengine/test10/guest-node-host-dies.dot | 107 ++++++ pengine/test10/guest-node-host-dies.exp | 539 ++++++++++++++++++++++++++++ pengine/test10/guest-node-host-dies.scores | 80 +++++ pengine/test10/guest-node-host-dies.summary | 73 ++++ pengine/test10/guest-node-host-dies.xml | 294 +++++++++++++++ 6 files changed, 1094 insertions(+) create mode 100644 pengine/test10/guest-node-host-dies.dot create mode 100644 pengine/test10/guest-node-host-dies.exp create mode 100644 pengine/test10/guest-node-host-dies.scores create mode 100644 pengine/test10/guest-node-host-dies.summary create mode 100644 pengine/test10/guest-node-host-dies.xml diff --git a/pengine/regression.sh b/pengine/regression.sh index be1734b..1bc8e1e 100755 --- a/pengine/regression.sh +++ b/pengine/regression.sh @@ -817,6 +817,7 @@ do_test whitebox-unexpectedly-running "Recover container nodes the cluster did n do_test whitebox-migrate1 "Migrate both container and connection resource" do_test whitebox-imply-stop-on-fence "imply stop action on container node rsc when host node is fenced" do_test whitebox-nested-group "Verify guest remote-node works nested in a group" +do_test guest-node-host-dies "Verify guest node is recovered if host goes 
away" echo "" do_test remote-startup-probes "Baremetal remote-node startup probes" diff --git a/pengine/test10/guest-node-host-dies.dot b/pengine/test10/guest-node-host-dies.dot new file mode 100644 index 0000000..01858b3 --- /dev/null +++ b/pengine/test10/guest-node-host-dies.dot @@ -0,0 +1,107 @@ +digraph "g" { +"Fencing_monitor_120000 rhel7-4" [ style=bold color="green" fontcolor="black"] +"Fencing_start_0 rhel7-4" -> "Fencing_monitor_120000 rhel7-4" [ style = bold] +"Fencing_start_0 rhel7-4" [ style=bold color="green" fontcolor="black"] +"Fencing_stop_0 rhel7-4" -> "Fencing_start_0 rhel7-4" [ style = bold] +"Fencing_stop_0 rhel7-4" -> "all_stopped" [ style = bold] +"Fencing_stop_0 rhel7-4" [ style=bold color="green" fontcolor="black"] +"all_stopped" [ style=bold color="green" fontcolor="orange"] +"container1_start_0 rhel7-2" -> "lxc1_start_0 rhel7-2" [ style = bold] +"container1_start_0 rhel7-2" [ style=bold color="green" fontcolor="black"] +"container1_stop_0 rhel7-1" -> "all_stopped" [ style = bold] +"container1_stop_0 rhel7-1" -> "container1_start_0 rhel7-2" [ style = bold] +"container1_stop_0 rhel7-1" [ style=bold color="green" fontcolor="orange"] +"container2_start_0 rhel7-3" -> "lxc2_start_0 rhel7-3" [ style = bold] +"container2_start_0 rhel7-3" [ style=bold color="green" fontcolor="black"] +"container2_stop_0 rhel7-1" -> "all_stopped" [ style = bold] +"container2_stop_0 rhel7-1" -> "container2_start_0 rhel7-3" [ style = bold] +"container2_stop_0 rhel7-1" [ style=bold color="green" fontcolor="orange"] +"lxc-ms-master_demote_0" -> "lxc-ms-master_demoted_0" [ style = bold] +"lxc-ms-master_demote_0" -> "lxc-ms_demote_0 lxc1" [ style = bold] +"lxc-ms-master_demote_0" [ style=bold color="green" fontcolor="orange"] +"lxc-ms-master_demoted_0" -> "lxc-ms-master_promote_0" [ style = bold] +"lxc-ms-master_demoted_0" -> "lxc-ms-master_start_0" [ style = bold] +"lxc-ms-master_demoted_0" -> "lxc-ms-master_stop_0" [ style = bold] +"lxc-ms-master_demoted_0" [ style=bold 
color="green" fontcolor="orange"] +"lxc-ms-master_promote_0" -> "lxc-ms_promote_0 lxc1" [ style = bold] +"lxc-ms-master_promote_0" [ style=bold color="green" fontcolor="orange"] +"lxc-ms-master_promoted_0" [ style=bold color="green" fontcolor="orange"] +"lxc-ms-master_running_0" -> "lxc-ms-master_promote_0" [ style = bold] +"lxc-ms-master_running_0" [ style=bold color="green" fontcolor="orange"] +"lxc-ms-master_start_0" -> "lxc-ms-master_running_0" [ style = bold] +"lxc-ms-master_start_0" -> "lxc-ms_start_0 lxc1" [ style = bold] +"lxc-ms-master_start_0" -> "lxc-ms_start_0 lxc2" [ style = bold] +"lxc-ms-master_start_0" [ style=bold color="green" fontcolor="orange"] +"lxc-ms-master_stop_0" -> "lxc-ms-master_stopped_0" [ style = bold] +"lxc-ms-master_stop_0" -> "lxc-ms_stop_0 lxc1" [ style = bold] +"lxc-ms-master_stop_0" -> "lxc-ms_stop_0 lxc2" [ style = bold] +"lxc-ms-master_stop_0" [ style=bold color="green" fontcolor="orange"] +"lxc-ms-master_stopped_0" -> "lxc-ms-master_promote_0" [ style = bold] +"lxc-ms-master_stopped_0" -> "lxc-ms-master_start_0" [ style = bold] +"lxc-ms-master_stopped_0" [ style=bold color="green" fontcolor="orange"] +"lxc-ms_demote_0 lxc1" -> "lxc-ms-master_demoted_0" [ style = bold] +"lxc-ms_demote_0 lxc1" -> "lxc-ms_promote_0 lxc1" [ style = bold] +"lxc-ms_demote_0 lxc1" -> "lxc-ms_stop_0 lxc1" [ style = bold] +"lxc-ms_demote_0 lxc1" [ style=bold color="green" fontcolor="black"] +"lxc-ms_monitor_10000 lxc2" [ style=bold color="green" fontcolor="black"] +"lxc-ms_promote_0 lxc1" -> "lxc-ms-master_promoted_0" [ style = bold] +"lxc-ms_promote_0 lxc1" [ style=bold color="green" fontcolor="black"] +"lxc-ms_start_0 lxc1" -> "lxc-ms-master_running_0" [ style = bold] +"lxc-ms_start_0 lxc1" -> "lxc-ms_promote_0 lxc1" [ style = bold] +"lxc-ms_start_0 lxc1" [ style=bold color="green" fontcolor="black"] +"lxc-ms_start_0 lxc2" -> "lxc-ms-master_running_0" [ style = bold] +"lxc-ms_start_0 lxc2" -> "lxc-ms_monitor_10000 lxc2" [ style = bold] 
+"lxc-ms_start_0 lxc2" [ style=bold color="green" fontcolor="black"] +"lxc-ms_stop_0 lxc1" -> "all_stopped" [ style = bold] +"lxc-ms_stop_0 lxc1" -> "lxc-ms-master_stopped_0" [ style = bold] +"lxc-ms_stop_0 lxc1" -> "lxc-ms_start_0 lxc1" [ style = bold] +"lxc-ms_stop_0 lxc1" [ style=bold color="green" fontcolor="orange"] +"lxc-ms_stop_0 lxc2" -> "all_stopped" [ style = bold] +"lxc-ms_stop_0 lxc2" -> "lxc-ms-master_stopped_0" [ style = bold] +"lxc-ms_stop_0 lxc2" -> "lxc-ms_start_0 lxc2" [ style = bold] +"lxc-ms_stop_0 lxc2" [ style=bold color="green" fontcolor="orange"] +"lxc1_monitor_30000 rhel7-2" [ style=bold color="green" fontcolor="black"] +"lxc1_start_0 rhel7-2" -> "lxc-ms_promote_0 lxc1" [ style = bold] +"lxc1_start_0 rhel7-2" -> "lxc-ms_start_0 lxc1" [ style = bold] +"lxc1_start_0 rhel7-2" -> "lxc1_monitor_30000 rhel7-2" [ style = bold] +"lxc1_start_0 rhel7-2" [ style=bold color="green" fontcolor="black"] +"lxc1_stop_0 rhel7-1" -> "all_stopped" [ style = bold] +"lxc1_stop_0 rhel7-1" -> "container1_stop_0 rhel7-1" [ style = bold] +"lxc1_stop_0 rhel7-1" -> "lxc1_start_0 rhel7-2" [ style = bold] +"lxc1_stop_0 rhel7-1" [ style=bold color="green" fontcolor="orange"] +"lxc2_monitor_30000 rhel7-3" [ style=bold color="green" fontcolor="black"] +"lxc2_start_0 rhel7-3" -> "lxc-ms_monitor_10000 lxc2" [ style = bold] +"lxc2_start_0 rhel7-3" -> "lxc-ms_start_0 lxc2" [ style = bold] +"lxc2_start_0 rhel7-3" -> "lxc2_monitor_30000 rhel7-3" [ style = bold] +"lxc2_start_0 rhel7-3" [ style=bold color="green" fontcolor="black"] +"lxc2_stop_0 rhel7-1" -> "all_stopped" [ style = bold] +"lxc2_stop_0 rhel7-1" -> "container2_stop_0 rhel7-1" [ style = bold] +"lxc2_stop_0 rhel7-1" -> "lxc2_start_0 rhel7-3" [ style = bold] +"lxc2_stop_0 rhel7-1" [ style=bold color="green" fontcolor="orange"] +"rsc_rhel7-1_monitor_5000 rhel7-5" [ style=bold color="green" fontcolor="black"] +"rsc_rhel7-1_start_0 rhel7-5" -> "rsc_rhel7-1_monitor_5000 rhel7-5" [ style = bold] +"rsc_rhel7-1_start_0 
rhel7-5" [ style=bold color="green" fontcolor="black"] +"rsc_rhel7-1_stop_0 rhel7-1" -> "all_stopped" [ style = bold] +"rsc_rhel7-1_stop_0 rhel7-1" -> "rsc_rhel7-1_start_0 rhel7-5" [ style = bold] +"rsc_rhel7-1_stop_0 rhel7-1" [ style=bold color="green" fontcolor="orange"] +"stonith 'reboot' rhel7-1" -> "container1_stop_0 rhel7-1" [ style = bold] +"stonith 'reboot' rhel7-1" -> "container2_stop_0 rhel7-1" [ style = bold] +"stonith 'reboot' rhel7-1" -> "lxc-ms-master_stop_0" [ style = bold] +"stonith 'reboot' rhel7-1" -> "lxc-ms_stop_0 lxc1" [ style = bold] +"stonith 'reboot' rhel7-1" -> "lxc-ms_stop_0 lxc2" [ style = bold] +"stonith 'reboot' rhel7-1" -> "lxc1_stop_0 rhel7-1" [ style = bold] +"stonith 'reboot' rhel7-1" -> "lxc2_stop_0 rhel7-1" [ style = bold] +"stonith 'reboot' rhel7-1" -> "rsc_rhel7-1_stop_0 rhel7-1" [ style = bold] +"stonith 'reboot' rhel7-1" -> "stonith_complete" [ style = bold] +"stonith 'reboot' rhel7-1" [ style=bold color="green" fontcolor="black"] +"stonith_complete" -> "all_stopped" [ style = bold] +"stonith_complete" -> "container1_start_0 rhel7-2" [ style = bold] +"stonith_complete" -> "container2_start_0 rhel7-3" [ style = bold] +"stonith_complete" -> "lxc-ms_promote_0 lxc1" [ style = bold] +"stonith_complete" -> "lxc-ms_start_0 lxc1" [ style = bold] +"stonith_complete" -> "lxc-ms_start_0 lxc2" [ style = bold] +"stonith_complete" -> "lxc1_start_0 rhel7-2" [ style = bold] +"stonith_complete" -> "lxc2_start_0 rhel7-3" [ style = bold] +"stonith_complete" -> "rsc_rhel7-1_start_0 rhel7-5" [ style = bold] +"stonith_complete" [ style=bold color="green" fontcolor="orange"] +} diff --git a/pengine/test10/guest-node-host-dies.exp b/pengine/test10/guest-node-host-dies.exp new file mode 100644 index 0000000..b3c24be --- /dev/null +++ b/pengine/test10/guest-node-host-dies.exp @@ -0,0 +1,539 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/pengine/test10/guest-node-host-dies.scores b/pengine/test10/guest-node-host-dies.scores new file mode 100644 index 0000000..0d7ad3f --- /dev/null +++ b/pengine/test10/guest-node-host-dies.scores @@ -0,0 +1,80 @@ +Allocation scores: +clone_color: lxc-ms-master allocation score on lxc1: INFINITY +clone_color: lxc-ms-master allocation score on lxc2: INFINITY +clone_color: lxc-ms-master allocation score on rhel7-1: 0 +clone_color: lxc-ms-master allocation score on rhel7-2: 0 +clone_color: lxc-ms-master allocation score on rhel7-3: 0 +clone_color: lxc-ms-master allocation score on rhel7-4: 0 +clone_color: lxc-ms-master allocation score on rhel7-5: 0 +clone_color: lxc-ms:0 allocation score on lxc1: INFINITY +clone_color: lxc-ms:0 allocation score on lxc2: INFINITY +clone_color: lxc-ms:0 allocation score on rhel7-1: 0 +clone_color: lxc-ms:0 allocation score on rhel7-2: 0 +clone_color: lxc-ms:0 allocation score on rhel7-3: 0 +clone_color: lxc-ms:0 allocation score on rhel7-4: 0 +clone_color: lxc-ms:0 allocation score on rhel7-5: 0 +clone_color: lxc-ms:1 
allocation score on lxc1: INFINITY +clone_color: lxc-ms:1 allocation score on lxc2: INFINITY +clone_color: lxc-ms:1 allocation score on rhel7-1: 0 +clone_color: lxc-ms:1 allocation score on rhel7-2: 0 +clone_color: lxc-ms:1 allocation score on rhel7-3: 0 +clone_color: lxc-ms:1 allocation score on rhel7-4: 0 +clone_color: lxc-ms:1 allocation score on rhel7-5: 0 +lxc-ms:0 promotion score on lxc1: INFINITY +lxc-ms:1 promotion score on lxc2: INFINITY +native_color: Fencing allocation score on lxc1: -INFINITY +native_color: Fencing allocation score on lxc2: -INFINITY +native_color: Fencing allocation score on rhel7-1: 0 +native_color: Fencing allocation score on rhel7-2: 0 +native_color: Fencing allocation score on rhel7-3: 0 +native_color: Fencing allocation score on rhel7-4: 0 +native_color: Fencing allocation score on rhel7-5: 0 +native_color: container1 allocation score on lxc1: -INFINITY +native_color: container1 allocation score on lxc2: -INFINITY +native_color: container1 allocation score on rhel7-1: -INFINITY +native_color: container1 allocation score on rhel7-2: 0 +native_color: container1 allocation score on rhel7-3: 0 +native_color: container1 allocation score on rhel7-4: 0 +native_color: container1 allocation score on rhel7-5: 0 +native_color: container2 allocation score on lxc1: -INFINITY +native_color: container2 allocation score on lxc2: -INFINITY +native_color: container2 allocation score on rhel7-1: -INFINITY +native_color: container2 allocation score on rhel7-2: 0 +native_color: container2 allocation score on rhel7-3: 0 +native_color: container2 allocation score on rhel7-4: 0 +native_color: container2 allocation score on rhel7-5: 0 +native_color: lxc-ms:0 allocation score on lxc1: INFINITY +native_color: lxc-ms:0 allocation score on lxc2: INFINITY +native_color: lxc-ms:0 allocation score on rhel7-1: -INFINITY +native_color: lxc-ms:0 allocation score on rhel7-2: 0 +native_color: lxc-ms:0 allocation score on rhel7-3: 0 +native_color: lxc-ms:0 allocation 
score on rhel7-4: 0 +native_color: lxc-ms:0 allocation score on rhel7-5: 0 +native_color: lxc-ms:1 allocation score on lxc1: -INFINITY +native_color: lxc-ms:1 allocation score on lxc2: INFINITY +native_color: lxc-ms:1 allocation score on rhel7-1: -INFINITY +native_color: lxc-ms:1 allocation score on rhel7-2: 0 +native_color: lxc-ms:1 allocation score on rhel7-3: 0 +native_color: lxc-ms:1 allocation score on rhel7-4: 0 +native_color: lxc-ms:1 allocation score on rhel7-5: 0 +native_color: lxc1 allocation score on lxc1: -INFINITY +native_color: lxc1 allocation score on lxc2: -INFINITY +native_color: lxc1 allocation score on rhel7-1: -INFINITY +native_color: lxc1 allocation score on rhel7-2: 0 +native_color: lxc1 allocation score on rhel7-3: -INFINITY +native_color: lxc1 allocation score on rhel7-4: -INFINITY +native_color: lxc1 allocation score on rhel7-5: -INFINITY +native_color: lxc2 allocation score on lxc1: -INFINITY +native_color: lxc2 allocation score on lxc2: -INFINITY +native_color: lxc2 allocation score on rhel7-1: -INFINITY +native_color: lxc2 allocation score on rhel7-2: -INFINITY +native_color: lxc2 allocation score on rhel7-3: 0 +native_color: lxc2 allocation score on rhel7-4: -INFINITY +native_color: lxc2 allocation score on rhel7-5: -INFINITY +native_color: rsc_rhel7-1 allocation score on lxc1: -INFINITY +native_color: rsc_rhel7-1 allocation score on lxc2: -INFINITY +native_color: rsc_rhel7-1 allocation score on rhel7-1: 100 +native_color: rsc_rhel7-1 allocation score on rhel7-2: 0 +native_color: rsc_rhel7-1 allocation score on rhel7-3: 0 +native_color: rsc_rhel7-1 allocation score on rhel7-4: 0 +native_color: rsc_rhel7-1 allocation score on rhel7-5: 0 diff --git a/pengine/test10/guest-node-host-dies.summary b/pengine/test10/guest-node-host-dies.summary new file mode 100644 index 0000000..8a1bfd4 --- /dev/null +++ b/pengine/test10/guest-node-host-dies.summary @@ -0,0 +1,73 @@ + +Current cluster status: +Node rhel7-1 (1): UNCLEAN (offline) +Online: [ 
rhel7-2 rhel7-3 rhel7-4 rhel7-5 ] +Containers: [ lxc1:container1 lxc2:container2 ] + + Fencing (stonith:fence_xvm): Started rhel7-4 + rsc_rhel7-1 (ocf::heartbeat:IPaddr2): Started rhel7-1 ( UNCLEAN ) + container1 (ocf::heartbeat:VirtualDomain): Started rhel7-1 ( UNCLEAN ) + container2 (ocf::heartbeat:VirtualDomain): Started rhel7-1 ( UNCLEAN ) + Master/Slave Set: lxc-ms-master [lxc-ms] + Masters: [ lxc1 ] + Slaves: [ lxc2 ] + +Transition Summary: + * Restart Fencing (Started rhel7-4) + * Move rsc_rhel7-1 (Started rhel7-1 -> rhel7-5) + * Move container1 (Started rhel7-1 -> rhel7-2) + * Move container2 (Started rhel7-1 -> rhel7-3) + * Restart lxc-ms:0 (Master lxc1) + * Restart lxc-ms:1 (Slave lxc2) + * Move lxc1 (Started rhel7-1 -> rhel7-2) + * Move lxc2 (Started rhel7-1 -> rhel7-3) + +Executing cluster transition: + * Resource action: Fencing stop on rhel7-4 + * Resource action: Fencing start on rhel7-4 + * Resource action: Fencing monitor=120000 on rhel7-4 + * Pseudo action: lxc-ms-master_demote_0 + * Fencing rhel7-1 (reboot) + * Pseudo action: stonith_complete + * Pseudo action: rsc_rhel7-1_stop_0 + * Resource action: lxc-ms demote on lxc1 + * Pseudo action: lxc-ms-master_demoted_0 + * Pseudo action: lxc-ms-master_stop_0 + * Pseudo action: lxc1_stop_0 + * Pseudo action: lxc2_stop_0 + * Resource action: rsc_rhel7-1 start on rhel7-5 + * Pseudo action: container1_stop_0 + * Pseudo action: container2_stop_0 + * Pseudo action: lxc-ms_stop_0 + * Pseudo action: lxc-ms_stop_0 + * Pseudo action: lxc-ms-master_stopped_0 + * Pseudo action: lxc-ms-master_start_0 + * Pseudo action: all_stopped + * Resource action: rsc_rhel7-1 monitor=5000 on rhel7-5 + * Resource action: container1 start on rhel7-2 + * Resource action: container2 start on rhel7-3 + * Resource action: lxc1 start on rhel7-2 + * Resource action: lxc2 start on rhel7-3 + * Resource action: lxc-ms start on lxc1 + * Resource action: lxc-ms start on lxc2 + * Resource action: lxc-ms monitor=10000 on lxc2 + * Pseudo 
action: lxc-ms-master_running_0 + * Resource action: lxc1 monitor=30000 on rhel7-2 + * Resource action: lxc2 monitor=30000 on rhel7-3 + * Pseudo action: lxc-ms-master_promote_0 + * Resource action: lxc-ms promote on lxc1 + * Pseudo action: lxc-ms-master_promoted_0 + +Revised cluster status: +Online: [ rhel7-2 rhel7-3 rhel7-4 rhel7-5 ] +OFFLINE: [ rhel7-1 ] +Containers: [ lxc1:container1 lxc2:container2 ] + + Fencing (stonith:fence_xvm): Started rhel7-4 + rsc_rhel7-1 (ocf::heartbeat:IPaddr2): Started rhel7-5 + container1 (ocf::heartbeat:VirtualDomain): Started rhel7-2 + container2 (ocf::heartbeat:VirtualDomain): Started rhel7-3 + Master/Slave Set: lxc-ms-master [lxc-ms] + Masters: [ lxc1 ] + Slaves: [ lxc2 ] + diff --git a/pengine/test10/guest-node-host-dies.xml b/pengine/test10/guest-node-host-dies.xml new file mode 100644 index 0000000..a840da1 --- /dev/null +++ b/pengine/test10/guest-node-host-dies.xml @@ -0,0 +1,294 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + -- 1.8.3.1 From beab7718e14a54f1b50d7c5ff4b0086e09332da3 Mon Sep 17 00:00:00 2001 From: Ken Gaillot Date: Fri, 15 Apr 2016 13:10:17 -0500 Subject: [PATCH 5/8] Fix: pengine: create a pseudo-fence for guest node recovery If a guest node needs to be recovered, the PE would previously order actions in relation to the stop action for the guest's container resource, if one was scheduled. 
This had problems: for implied stops due to fencing the guest's host, there would be no stop action, so no ordering could be done; ordering in relation to the stop action made stonith_constraints() mistakenly assume that the host node (the node for the stop action) was the fence target, and thus mistakenly mark the wrong stops/demotes as implied; and, clone notifications for fence events would not get called for guest node recoveries, whether explicit or implied. Now, a fence pseudo-event is created for guest node recovery, regardless of whether there is an explicit stop action scheduled for the container. This addresses all those issues, and will allow the crmd to be able to detect implied stops. This also allows us to simplify the implied stop/demote detection, since we will check the pseudo-op for implied actions -- we don't need to check the real fence op for implied actions on guest nodes. --- crmd/te_utils.c | 8 ++++++ pengine/allocate.c | 82 ++++++++++++++++++++++++++++++++++++++++++++---------- pengine/graph.c | 14 ++++------ pengine/native.c | 48 ++------------------------------ 4 files changed, 83 insertions(+), 69 deletions(-) diff --git a/crmd/te_utils.c b/crmd/te_utils.c index 4c708a1..e7bf7ff 100644 --- a/crmd/te_utils.c +++ b/crmd/te_utils.c @@ -331,6 +331,14 @@ tengine_stonith_notify(stonith_t * st, stonith_event_t * st_event) /* The DC always sends updates */ send_stonith_update(NULL, st_event->target, uuid); + /* @TODO Ideally, at this point, we'd check whether the fenced node + * hosted any guest nodes, and call remote_node_down() for them. + * Unfortunately, the crmd doesn't have a simple, reliable way to + * map hosts to guests. It might be possible to track this in the + * peer cache via crm_remote_peer_cache_refresh(). For now, we rely + * on the PE creating fence pseudo-events for the guests. 
+ */ + if (st_event->client_origin && safe_str_neq(st_event->client_origin, te_client_id)) { /* Abort the current transition graph if it wasn't us diff --git a/pengine/allocate.c b/pengine/allocate.c index bdf03e5..74b57fb 100644 --- a/pengine/allocate.c +++ b/pengine/allocate.c @@ -1341,6 +1341,70 @@ any_managed_resources(pe_working_set_t * data_set) return FALSE; } +/*! + * \internal + * \brief Create pseudo-op for guest node fence, and order relative to it + * + * \param[in] node Guest node to fence + * \param[in] done STONITH_DONE operation + * \param[in] data_set Working set of CIB state + */ +static void +fence_guest(pe_node_t *node, pe_action_t *done, pe_working_set_t *data_set) +{ + resource_t *container = node->details->remote_rsc->container; + pe_action_t *stop = NULL; + pe_action_t *stonith_op = NULL; + + /* The fence action is just a label; we don't do anything differently for + * off vs. reboot. We specify it explicitly, rather than let it default to + * cluster's default action, because we are not _initiating_ fencing -- we + * are creating a pseudo-event to describe fencing that is already occurring + * by other means (container recovery). + */ + const char *fence_action = "off"; + + /* Check whether guest's container resource has any explicit stop or + * start (the stop may be implied by fencing of the guest's host). + */ + if (container) { + stop = find_first_action(container->actions, NULL, CRMD_ACTION_STOP, NULL); + + if (find_first_action(container->actions, NULL, CRMD_ACTION_START, NULL)) { + fence_action = "reboot"; + } + } + + /* Create a fence pseudo-event, so we have an event to order actions + * against, and crmd can always detect it. 
+ */ + stonith_op = pe_fence_op(node, fence_action, FALSE, data_set); + update_action_flags(stonith_op, pe_action_pseudo | pe_action_runnable, + __FUNCTION__); + + /* We want to imply stops/demotes after the guest is stopped, not wait until + * it is restarted, so we always order pseudo-fencing after stop, not start + * (even though start might be closer to what is done for a real reboot). + */ + if (stop) { + order_actions(stop, stonith_op, + pe_order_runnable_left|pe_order_implies_then); + crm_info("Implying guest node %s is down (action %d) " + "after container %s is stopped (action %d)", + node->details->uname, stonith_op->id, + container->id, stop->id); + } else { + crm_info("Implying guest node %s is down (action %d) ", + node->details->uname, stonith_op->id); + } + + /* @TODO: Order pseudo-fence after any (optional) fence of guest's host */ + + /* Order/imply other actions relative to pseudo-fence as with real fence */ + stonith_constraints(node, stonith_op, data_set); + order_actions(stonith_op, done, pe_order_implies_then); +} + /* * Create dependencies for stonith and shutdown operations */ @@ -1369,24 +1433,12 @@ stage6(pe_working_set_t * data_set) for (gIter = data_set->nodes; gIter != NULL; gIter = gIter->next) { node_t *node = (node_t *) gIter->data; - /* Guest nodes are "fenced" by recovering their container resource. - * The container stop may be explicit, or implied by the fencing of the - * guest's host. + /* Guest nodes are "fenced" by recovering their container resource, + * so handle them separately. 
*/ if (is_container_remote_node(node)) { if (node->details->remote_requires_reset && need_stonith) { - resource_t *container = node->details->remote_rsc->container; - char *key = stop_key(container); - GListPtr stop_list = find_actions(container->actions, key, NULL); - - crm_info("Implying node %s is down when container %s is stopped (%p)", - node->details->uname, container->id, stop_list); - if(stop_list) { - stonith_constraints(node, stop_list->data, data_set); - } - - g_list_free(stop_list); - free(key); + fence_guest(node, done, data_set); } continue; } diff --git a/pengine/graph.c b/pengine/graph.c index ee7c7c8..569cf6e 100644 --- a/pengine/graph.c +++ b/pengine/graph.c @@ -715,13 +715,7 @@ stonith_constraints(node_t * node, action_t * stonith_op, pe_working_set_t * dat CRM_CHECK(stonith_op != NULL, return FALSE); for (r = data_set->resources; r != NULL; r = r->next) { - resource_t *rsc = (resource_t *) r->data; - - if ((stonith_op->rsc == NULL) - || ((stonith_op->rsc != rsc) && (stonith_op->rsc != rsc->container))) { - - rsc_stonith_ordering(rsc, stonith_op, data_set); - } + rsc_stonith_ordering((resource_t *) r->data, stonith_op, data_set); } return TRUE; } @@ -888,7 +882,11 @@ action2xml(action_t * action, gboolean as_input, pe_working_set_t *data_set) } if (safe_str_eq(action->task, CRM_OP_FENCE)) { - action_xml = create_xml_node(NULL, XML_GRAPH_TAG_CRM_EVENT); + /* All fences need node info; guest node fences are pseudo-events */ + action_xml = create_xml_node(NULL, + is_set(action->flags, pe_action_pseudo)? 
+ XML_GRAPH_TAG_PSEUDO_EVENT : + XML_GRAPH_TAG_CRM_EVENT); } else if (safe_str_eq(action->task, CRM_OP_SHUTDOWN)) { action_xml = create_xml_node(NULL, XML_GRAPH_TAG_CRM_EVENT); diff --git a/pengine/native.c b/pengine/native.c index 56a1434..ff4467b 100644 --- a/pengine/native.c +++ b/pengine/native.c @@ -2902,48 +2902,6 @@ native_start_constraints(resource_t * rsc, action_t * stonith_op, pe_working_set } } -/* User data to pass to guest node iterator */ -struct action_list_s { - GListPtr search_list; /* list of actions to search */ - GListPtr result_list; /* list of matching actions for this node */ - const char *key; /* action key to match */ -}; - -/*! - * \internal - * \brief Prepend a node's actions matching a key to a list - * - * \param[in] node Guest node - * \param[in/out] data User data - */ -static void prepend_node_actions(const node_t *node, void *data) -{ - GListPtr actions; - struct action_list_s *info = (struct action_list_s *) data; - - actions = find_actions(info->search_list, info->key, node); - info->result_list = g_list_concat(actions, info->result_list); -} - -static GListPtr -find_fence_target_node_actions(GListPtr search_list, const char *key, node_t *fence_target, pe_working_set_t *data_set) -{ - struct action_list_s action_list; - - /* Actions on the target that match the key are implied by the fencing */ - action_list.search_list = search_list; - action_list.result_list = find_actions(search_list, key, fence_target); - action_list.key = key; - - /* - * If the target is a host for any guest nodes, actions on those nodes - * that match the key are also implied by the fencing. 
- */ - pe_foreach_guest_node(data_set, fence_target, prepend_node_actions, &action_list); - - return action_list.result_list; -} - static void native_stop_constraints(resource_t * rsc, action_t * stonith_op, pe_working_set_t * data_set) { @@ -2963,8 +2921,7 @@ native_stop_constraints(resource_t * rsc, action_t * stonith_op, pe_working_set_ /* Get a list of stop actions potentially implied by the fencing */ key = stop_key(rsc); - action_list = find_fence_target_node_actions(rsc->actions, key, target, - data_set); + action_list = find_actions(rsc->actions, key, target); free(key); for (gIter = action_list; gIter != NULL; gIter = gIter->next) { @@ -3061,8 +3018,7 @@ native_stop_constraints(resource_t * rsc, action_t * stonith_op, pe_working_set_ /* Get a list of demote actions potentially implied by the fencing */ key = demote_key(rsc); - action_list = find_fence_target_node_actions(rsc->actions, key, target, - data_set); + action_list = find_actions(rsc->actions, key, target); free(key); for (gIter = action_list; gIter != NULL; gIter = gIter->next) { -- 1.8.3.1 From b7ce740edf3d71fcccead2288bf0ab11037f9672 Mon Sep 17 00:00:00 2001 From: Ken Gaillot Date: Wed, 23 Nov 2016 14:56:29 -0600 Subject: [PATCH 6/8] Fix: pengine: consider guest node unclean if its host is unclean --- lib/pengine/unpack.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/lib/pengine/unpack.c b/lib/pengine/unpack.c index a49e108..6737273 100644 --- a/lib/pengine/unpack.c +++ b/lib/pengine/unpack.c @@ -1406,6 +1406,7 @@ determine_remote_online_status(pe_working_set_t * data_set, node_t * this_node) { resource_t *rsc = this_node->details->remote_rsc; resource_t *container = NULL; + pe_node_t *host = NULL; /* If there is a node state entry for a (former) Pacemaker Remote node * but no resource creating that node, the node's connection resource will @@ -1418,6 +1419,10 @@ determine_remote_online_status(pe_working_set_t * data_set, node_t * this_node) container = rsc->container; + if 
(container && (g_list_length(rsc->running_on) == 1)) { + host = rsc->running_on->data; + } + /* If the resource is currently started, mark it online. */ if (rsc->role == RSC_ROLE_STARTED) { crm_trace("%s node %s presumed ONLINE because connection resource is started", @@ -1451,6 +1456,13 @@ determine_remote_online_status(pe_working_set_t * data_set, node_t * this_node) (container? "Guest" : "Remote"), this_node->details->id); this_node->details->online = FALSE; this_node->details->remote_requires_reset = FALSE; + + } else if (host && (host->details->online == FALSE) + && host->details->unclean) { + crm_trace("Guest node %s UNCLEAN because host is unclean", + this_node->details->id); + this_node->details->online = FALSE; + this_node->details->remote_requires_reset = TRUE; } remote_online_done: -- 1.8.3.1 From a9977a2dc1135d040088b90e1ea307b3ff71c0b5 Mon Sep 17 00:00:00 2001 From: Ken Gaillot Date: Tue, 22 Mar 2016 16:43:53 -0500 Subject: [PATCH 7/8] Test: pengine: update regression tests for guest node pseudo-fencing --- pengine/test10/bug-cl-5247.dot | 45 +++-- pengine/test10/bug-cl-5247.exp | 121 ++++++++----- pengine/test10/bug-cl-5247.summary | 18 +- pengine/test10/bug-rh-1097457.dot | 23 ++- pengine/test10/bug-rh-1097457.exp | 70 ++++++-- pengine/test10/bug-rh-1097457.summary | 12 +- pengine/test10/guest-node-host-dies.dot | 17 +- pengine/test10/guest-node-host-dies.exp | 190 +++++++++++++-------- pengine/test10/guest-node-host-dies.summary | 48 +++--- pengine/test10/whitebox-fail1.dot | 15 +- pengine/test10/whitebox-fail1.exp | 50 +++++- pengine/test10/whitebox-fail1.summary | 12 +- pengine/test10/whitebox-fail2.dot | 15 +- pengine/test10/whitebox-fail2.exp | 50 +++++- pengine/test10/whitebox-fail2.summary | 12 +- pengine/test10/whitebox-imply-stop-on-fence.dot | 9 +- pengine/test10/whitebox-imply-stop-on-fence.exp | 177 +++++++++++-------- .../test10/whitebox-imply-stop-on-fence.summary | 22 +-- pengine/test10/whitebox-ms-ordering.dot | 18 ++ 
pengine/test10/whitebox-ms-ordering.exp | 76 +++++++++ pengine/test10/whitebox-ms-ordering.summary | 3 + pengine/test10/whitebox-unexpectedly-running.dot | 5 + pengine/test10/whitebox-unexpectedly-running.exp | 30 ++++ .../test10/whitebox-unexpectedly-running.summary | 2 + 24 files changed, 748 insertions(+), 292 deletions(-) diff --git a/pengine/test10/bug-cl-5247.dot b/pengine/test10/bug-cl-5247.dot index ed728ac..0ab7893 100644 --- a/pengine/test10/bug-cl-5247.dot +++ b/pengine/test10/bug-cl-5247.dot @@ -1,4 +1,6 @@ digraph "g" { +"all_stopped" -> "prmStonith1-2_start_0 bl460g8n4" [ style = bold] +"all_stopped" -> "prmStonith2-2_start_0 bl460g8n3" [ style = bold] "all_stopped" [ style=bold color="green" fontcolor="orange"] "grpStonith1_running_0" [ style=bold color="green" fontcolor="orange"] "grpStonith1_start_0" -> "grpStonith1_running_0" [ style = bold] @@ -52,7 +54,7 @@ digraph "g" { "msPostgresql_post_notify_demoted_0" -> "pgsql_post_notify_demoted_0 pgsr01" [ style = bold] "msPostgresql_post_notify_demoted_0" [ style=bold color="green" fontcolor="orange"] "msPostgresql_post_notify_stopped_0" -> "msPostgresql_confirmed-post_notify_stopped_0" [ style = bold] -"msPostgresql_post_notify_stopped_0" -> "pgsql_post_notify_stop_0 pgsr01" [ style = bold] +"msPostgresql_post_notify_stopped_0" -> "pgsql_post_notify_stonith_0 pgsr01" [ style = bold] "msPostgresql_post_notify_stopped_0" [ style=bold color="green" fontcolor="orange"] "msPostgresql_pre_notify_demote_0" -> "msPostgresql_confirmed-pre_notify_demote_0" [ style = bold] "msPostgresql_pre_notify_demote_0" -> "pgsql_pre_notify_demote_0 pgsr01" [ style = bold] @@ -65,21 +67,21 @@ digraph "g" { "msPostgresql_stop_0" [ style=bold color="green" fontcolor="orange"] "msPostgresql_stopped_0" -> "msPostgresql_post_notify_stopped_0" [ style = bold] "msPostgresql_stopped_0" [ style=bold color="green" fontcolor="orange"] -"pgsql_confirmed-post_notify_stop_0" -> "all_stopped" [ style = bold] 
-"pgsql_confirmed-post_notify_stop_0" -> "pgsql_monitor_9000 pgsr01" [ style = bold] -"pgsql_confirmed-post_notify_stop_0" [ style=bold color="green" fontcolor="orange"] +"pgsql_confirmed-post_notify_stonith_0" -> "all_stopped" [ style = bold] +"pgsql_confirmed-post_notify_stonith_0" -> "pgsql_monitor_9000 pgsr01" [ style = bold] +"pgsql_confirmed-post_notify_stonith_0" [ style=bold color="green" fontcolor="orange"] "pgsql_demote_0 pgsr02" -> "msPostgresql_demoted_0" [ style = bold] "pgsql_demote_0 pgsr02" -> "pgsql_stop_0 pgsr02" [ style = bold] "pgsql_demote_0 pgsr02" [ style=bold color="green" fontcolor="orange"] "pgsql_monitor_9000 pgsr01" [ style=bold color="green" fontcolor="black"] "pgsql_post_notify_demoted_0 pgsr01" -> "msPostgresql_confirmed-post_notify_demoted_0" [ style = bold] "pgsql_post_notify_demoted_0 pgsr01" [ style=bold color="green" fontcolor="black"] -"pgsql_post_notify_stop_0 pgsr01" -> "msPostgresql_confirmed-post_notify_stopped_0" [ style = bold] -"pgsql_post_notify_stop_0 pgsr01" -> "pgsql_confirmed-post_notify_stop_0" [ style = bold] -"pgsql_post_notify_stop_0 pgsr01" [ style=bold color="green" fontcolor="black"] -"pgsql_post_notify_stop_0" -> "pgsql_confirmed-post_notify_stop_0" [ style = bold] -"pgsql_post_notify_stop_0" -> "pgsql_post_notify_stop_0 pgsr01" [ style = bold] -"pgsql_post_notify_stop_0" [ style=bold color="green" fontcolor="orange"] +"pgsql_post_notify_stonith_0 pgsr01" -> "msPostgresql_confirmed-post_notify_stopped_0" [ style = bold] +"pgsql_post_notify_stonith_0 pgsr01" -> "pgsql_confirmed-post_notify_stonith_0" [ style = bold] +"pgsql_post_notify_stonith_0 pgsr01" [ style=bold color="green" fontcolor="black"] +"pgsql_post_notify_stonith_0" -> "pgsql_confirmed-post_notify_stonith_0" [ style = bold] +"pgsql_post_notify_stonith_0" -> "pgsql_post_notify_stonith_0 pgsr01" [ style = bold] +"pgsql_post_notify_stonith_0" [ style=bold color="green" fontcolor="orange"] "pgsql_pre_notify_demote_0 pgsr01" -> 
"msPostgresql_confirmed-pre_notify_demote_0" [ style = bold] "pgsql_pre_notify_demote_0 pgsr01" [ style=bold color="green" fontcolor="black"] "pgsql_pre_notify_stop_0 pgsr01" -> "msPostgresql_confirmed-pre_notify_stop_0" [ style = bold] @@ -91,13 +93,7 @@ digraph "g" { "pgsr02_stop_0 bl460g8n4" -> "prmDB2_stop_0 bl460g8n4" [ style = bold] "pgsr02_stop_0 bl460g8n4" [ style=bold color="green" fontcolor="black"] "prmDB2_stop_0 bl460g8n4" -> "all_stopped" [ style = bold] -"prmDB2_stop_0 bl460g8n4" -> "master-group_stop_0" [ style = bold] -"prmDB2_stop_0 bl460g8n4" -> "msPostgresql_stop_0" [ style = bold] -"prmDB2_stop_0 bl460g8n4" -> "pgsql_demote_0 pgsr02" [ style = bold] -"prmDB2_stop_0 bl460g8n4" -> "pgsql_post_notify_stop_0" [ style = bold] -"prmDB2_stop_0 bl460g8n4" -> "pgsql_stop_0 pgsr02" [ style = bold] -"prmDB2_stop_0 bl460g8n4" -> "vip-master_stop_0 pgsr02" [ style = bold] -"prmDB2_stop_0 bl460g8n4" -> "vip-rep_stop_0 pgsr02" [ style = bold] +"prmDB2_stop_0 bl460g8n4" -> "stonith 'off' pgsr02" [ style = bold] "prmDB2_stop_0 bl460g8n4" [ style=bold color="green" fontcolor="black"] "prmStonith1-2_monitor_3600000 bl460g8n4" [ style=bold color="green" fontcolor="black"] "prmStonith1-2_start_0 bl460g8n4" -> "grpStonith1_running_0" [ style = bold] @@ -106,7 +102,7 @@ digraph "g" { "prmStonith1-2_stop_0 bl460g8n4" -> "all_stopped" [ style = bold] "prmStonith1-2_stop_0 bl460g8n4" -> "grpStonith1_stopped_0" [ style = bold] "prmStonith1-2_stop_0 bl460g8n4" -> "prmStonith1-2_start_0 bl460g8n4" [ style = bold] -"prmStonith1-2_stop_0 bl460g8n4" [ style=bold color="green" fontcolor="orange"] +"prmStonith1-2_stop_0 bl460g8n4" [ style=bold color="green" fontcolor="black"] "prmStonith2-2_monitor_3600000 bl460g8n3" [ style=bold color="green" fontcolor="black"] "prmStonith2-2_start_0 bl460g8n3" -> "grpStonith2_running_0" [ style = bold] "prmStonith2-2_start_0 bl460g8n3" -> "prmStonith2-2_monitor_3600000 bl460g8n3" [ style = bold] @@ -115,6 +111,19 @@ digraph "g" { 
"prmStonith2-2_stop_0 bl460g8n3" -> "grpStonith2_stopped_0" [ style = bold] "prmStonith2-2_stop_0 bl460g8n3" -> "prmStonith2-2_start_0 bl460g8n3" [ style = bold] "prmStonith2-2_stop_0 bl460g8n3" [ style=bold color="green" fontcolor="black"] +"stonith 'off' pgsr02" -> "master-group_stop_0" [ style = bold] +"stonith 'off' pgsr02" -> "msPostgresql_stop_0" [ style = bold] +"stonith 'off' pgsr02" -> "pgsql_demote_0 pgsr02" [ style = bold] +"stonith 'off' pgsr02" -> "pgsql_post_notify_stonith_0" [ style = bold] +"stonith 'off' pgsr02" -> "pgsql_stop_0 pgsr02" [ style = bold] +"stonith 'off' pgsr02" -> "stonith_complete" [ style = bold] +"stonith 'off' pgsr02" -> "vip-master_stop_0 pgsr02" [ style = bold] +"stonith 'off' pgsr02" -> "vip-rep_stop_0 pgsr02" [ style = bold] +"stonith 'off' pgsr02" [ style=bold color="green" fontcolor="orange"] +"stonith_complete" -> "all_stopped" [ style = bold] +"stonith_complete" -> "vip-master_start_0 pgsr01" [ style = bold] +"stonith_complete" -> "vip-rep_start_0 pgsr01" [ style = bold] +"stonith_complete" [ style=bold color="green" fontcolor="orange"] "vip-master_monitor_10000 pgsr01" [ style=bold color="green" fontcolor="black"] "vip-master_start_0 pgsr01" -> "master-group_running_0" [ style = bold] "vip-master_start_0 pgsr01" -> "vip-master_monitor_10000 pgsr01" [ style = bold] diff --git a/pengine/test10/bug-cl-5247.exp b/pengine/test10/bug-cl-5247.exp index 24bccdd..59a62af 100644 --- a/pengine/test10/bug-cl-5247.exp +++ b/pengine/test10/bug-cl-5247.exp @@ -20,7 +20,7 @@ - + @@ -64,9 +64,10 @@ - + + - + @@ -83,7 +84,10 @@ - + + + + @@ -175,6 +179,9 @@ + + + @@ -221,10 +228,10 @@ - + - + @@ -285,6 +292,9 @@ + + + @@ -295,14 +305,14 @@ - - - + + + @@ -335,6 +345,9 @@ + + + @@ -345,37 +358,37 @@ - + - + - - + + - + - + - - + + - + @@ -387,14 +400,14 @@ - - - + + + @@ -405,16 +418,16 @@ - + - + - + @@ -427,7 +440,7 @@ - + @@ -440,7 +453,7 @@ - + @@ -453,9 +466,9 @@ - + - + @@ -463,7 +476,7 @@ - + @@ -482,7 +495,7 @@ - + @@ -497,7 +510,7 
@@ - + @@ -527,7 +540,7 @@ - + @@ -577,7 +590,7 @@ - + @@ -607,7 +620,7 @@ - + @@ -646,14 +659,14 @@ - - - + + + @@ -670,6 +683,33 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + @@ -679,7 +719,7 @@ - + @@ -700,7 +740,10 @@ - + + + + diff --git a/pengine/test10/bug-cl-5247.summary b/pengine/test10/bug-cl-5247.summary index 5564286..09dc301 100644 --- a/pengine/test10/bug-cl-5247.summary +++ b/pengine/test10/bug-cl-5247.summary @@ -28,7 +28,7 @@ Transition Summary: Executing cluster transition: * Pseudo action: grpStonith1_stop_0 - * Pseudo action: prmStonith1-2_stop_0 + * Resource action: prmStonith1-2 stop on bl460g8n4 * Pseudo action: grpStonith2_stop_0 * Resource action: prmStonith2-2 stop on bl460g8n3 * Pseudo action: msPostgresql_pre_notify_demote_0 @@ -36,18 +36,14 @@ Executing cluster transition: * Resource action: prmDB2 stop on bl460g8n4 * Pseudo action: grpStonith1_stopped_0 * Pseudo action: grpStonith1_start_0 - * Resource action: prmStonith1-2 start on bl460g8n4 - * Resource action: prmStonith1-2 monitor=3600000 on bl460g8n4 * Pseudo action: grpStonith2_stopped_0 * Pseudo action: grpStonith2_start_0 - * Resource action: prmStonith2-2 start on bl460g8n3 - * Resource action: prmStonith2-2 monitor=3600000 on bl460g8n3 - * Pseudo action: pgsql_post_notify_stop_0 * Resource action: pgsql notify on pgsr01 * Pseudo action: msPostgresql_confirmed-pre_notify_demote_0 * Pseudo action: msPostgresql_demote_0 - * Pseudo action: grpStonith1_running_0 - * Pseudo action: grpStonith2_running_0 + * Pseudo action: stonith-pgsr02-off on pgsr02 + * Pseudo action: stonith_complete + * Pseudo action: pgsql_post_notify_stop_0 * Pseudo action: pgsql_demote_0 * Pseudo action: msPostgresql_demoted_0 * Pseudo action: msPostgresql_post_notify_demoted_0 @@ -75,6 +71,12 @@ Executing cluster transition: * Pseudo action: pgsql_notified_0 * Resource action: pgsql monitor=9000 on pgsr01 * Pseudo action: all_stopped + * Resource action: prmStonith1-2 start on bl460g8n4 + * 
Resource action: prmStonith1-2 monitor=3600000 on bl460g8n4 + * Resource action: prmStonith2-2 start on bl460g8n3 + * Resource action: prmStonith2-2 monitor=3600000 on bl460g8n3 + * Pseudo action: grpStonith1_running_0 + * Pseudo action: grpStonith2_running_0 Using the original execution date of: 2015-08-12 02:53:40Z Revised cluster status: diff --git a/pengine/test10/bug-rh-1097457.dot b/pengine/test10/bug-rh-1097457.dot index 078d177..ece2834 100644 --- a/pengine/test10/bug-rh-1097457.dot +++ b/pengine/test10/bug-rh-1097457.dot @@ -47,14 +47,9 @@ digraph "g" { "VM2_start_0 lama3" -> "VM2_monitor_10000 lama3" [ style = bold] "VM2_start_0 lama3" -> "lamaVM2_start_0 lama3" [ style = bold] "VM2_start_0 lama3" [ style=bold color="green" fontcolor="black"] -"VM2_stop_0 lama3" -> "FAKE4-IP_stop_0 lamaVM2" [ style = bold] -"VM2_stop_0 lama3" -> "FAKE4_stop_0 lamaVM2" [ style = bold] -"VM2_stop_0 lama3" -> "FAKE6-clone_stop_0" [ style = bold] -"VM2_stop_0 lama3" -> "FAKE6_stop_0 lamaVM2" [ style = bold] -"VM2_stop_0 lama3" -> "FSlun3_stop_0 lamaVM2" [ style = bold] "VM2_stop_0 lama3" -> "VM2_start_0 lama3" [ style = bold] "VM2_stop_0 lama3" -> "all_stopped" [ style = bold] -"VM2_stop_0 lama3" -> "lamaVM2-G4_stop_0" [ style = bold] +"VM2_stop_0 lama3" -> "stonith 'reboot' lamaVM2" [ style = bold] "VM2_stop_0 lama3" [ style=bold color="green" fontcolor="black"] "all_stopped" [ style=bold color="green" fontcolor="orange"] "lamaVM2-G4_running_0" [ style=bold color="green" fontcolor="orange"] @@ -83,4 +78,20 @@ digraph "g" { "lamaVM2_stop_0 lama3" -> "all_stopped" [ style = bold] "lamaVM2_stop_0 lama3" -> "lamaVM2_start_0 lama3" [ style = bold] "lamaVM2_stop_0 lama3" [ style=bold color="green" fontcolor="black"] +"stonith 'reboot' lamaVM2" -> "FAKE4-IP_stop_0 lamaVM2" [ style = bold] +"stonith 'reboot' lamaVM2" -> "FAKE4_stop_0 lamaVM2" [ style = bold] +"stonith 'reboot' lamaVM2" -> "FAKE6-clone_stop_0" [ style = bold] +"stonith 'reboot' lamaVM2" -> "FAKE6_stop_0 lamaVM2" [ 
style = bold] +"stonith 'reboot' lamaVM2" -> "FSlun3_stop_0 lamaVM2" [ style = bold] +"stonith 'reboot' lamaVM2" -> "lamaVM2-G4_stop_0" [ style = bold] +"stonith 'reboot' lamaVM2" -> "stonith_complete" [ style = bold] +"stonith 'reboot' lamaVM2" [ style=bold color="green" fontcolor="orange"] +"stonith_complete" -> "FAKE4-IP_start_0 lamaVM2" [ style = bold] +"stonith_complete" -> "FAKE4_start_0 lamaVM2" [ style = bold] +"stonith_complete" -> "FAKE6_start_0 lamaVM2" [ style = bold] +"stonith_complete" -> "FSlun3_start_0 lama2" [ style = bold] +"stonith_complete" -> "VM2_start_0 lama3" [ style = bold] +"stonith_complete" -> "all_stopped" [ style = bold] +"stonith_complete" -> "lamaVM2_start_0 lama3" [ style = bold] +"stonith_complete" [ style=bold color="green" fontcolor="orange"] } diff --git a/pengine/test10/bug-rh-1097457.exp b/pengine/test10/bug-rh-1097457.exp index 94a4e8d..0c3430c 100644 --- a/pengine/test10/bug-rh-1097457.exp +++ b/pengine/test10/bug-rh-1097457.exp @@ -10,6 +10,9 @@ + + + @@ -65,6 +68,9 @@ + + + @@ -75,10 +81,10 @@ - + - + @@ -121,7 +127,7 @@ - + @@ -175,6 +181,9 @@ + + + @@ -185,14 +194,14 @@ - - - + + + @@ -231,6 +240,9 @@ + + + @@ -241,10 +253,10 @@ - + - + @@ -281,6 +293,9 @@ + + + @@ -291,10 +306,10 @@ - + - + @@ -337,7 +352,7 @@ - + @@ -382,6 +397,9 @@ + + + @@ -411,6 +429,33 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + @@ -434,6 +479,9 @@ + + + diff --git a/pengine/test10/bug-rh-1097457.summary b/pengine/test10/bug-rh-1097457.summary index d78d951..f8d2c14 100644 --- a/pengine/test10/bug-rh-1097457.summary +++ b/pengine/test10/bug-rh-1097457.summary @@ -42,18 +42,20 @@ Transition Summary: Executing cluster transition: * Resource action: lamaVM2 stop on lama3 * Resource action: VM2 stop on lama3 + * Pseudo action: stonith-lamaVM2-reboot on lamaVM2 + * Pseudo action: stonith_complete + * Resource action: VM2 start on lama3 + * Resource action: VM2 monitor=10000 on lama3 * Pseudo action: lamaVM2-G4_stop_0 * Pseudo action: 
FAKE4-IP_stop_0 * Pseudo action: FAKE6-clone_stop_0 - * Resource action: VM2 start on lama3 - * Resource action: VM2 monitor=10000 on lama3 + * Resource action: lamaVM2 start on lama3 + * Resource action: lamaVM2 monitor=30000 on lama3 + * Resource action: FSlun3 monitor=10000 on lamaVM2 * Pseudo action: FAKE4_stop_0 * Pseudo action: FAKE6_stop_0 * Pseudo action: FAKE6-clone_stopped_0 * Pseudo action: FAKE6-clone_start_0 - * Resource action: lamaVM2 start on lama3 - * Resource action: lamaVM2 monitor=30000 on lama3 - * Resource action: FSlun3 monitor=10000 on lamaVM2 * Pseudo action: lamaVM2-G4_stopped_0 * Resource action: FAKE6 start on lamaVM2 * Resource action: FAKE6 monitor=30000 on lamaVM2 diff --git a/pengine/test10/guest-node-host-dies.dot b/pengine/test10/guest-node-host-dies.dot index 01858b3..c1ced94 100644 --- a/pengine/test10/guest-node-host-dies.dot +++ b/pengine/test10/guest-node-host-dies.dot @@ -5,16 +5,19 @@ digraph "g" { "Fencing_stop_0 rhel7-4" -> "Fencing_start_0 rhel7-4" [ style = bold] "Fencing_stop_0 rhel7-4" -> "all_stopped" [ style = bold] "Fencing_stop_0 rhel7-4" [ style=bold color="green" fontcolor="black"] +"all_stopped" -> "Fencing_start_0 rhel7-4" [ style = bold] "all_stopped" [ style=bold color="green" fontcolor="orange"] "container1_start_0 rhel7-2" -> "lxc1_start_0 rhel7-2" [ style = bold] "container1_start_0 rhel7-2" [ style=bold color="green" fontcolor="black"] "container1_stop_0 rhel7-1" -> "all_stopped" [ style = bold] "container1_stop_0 rhel7-1" -> "container1_start_0 rhel7-2" [ style = bold] +"container1_stop_0 rhel7-1" -> "stonith 'reboot' lxc1" [ style = bold] "container1_stop_0 rhel7-1" [ style=bold color="green" fontcolor="orange"] "container2_start_0 rhel7-3" -> "lxc2_start_0 rhel7-3" [ style = bold] "container2_start_0 rhel7-3" [ style=bold color="green" fontcolor="black"] "container2_stop_0 rhel7-1" -> "all_stopped" [ style = bold] "container2_stop_0 rhel7-1" -> "container2_start_0 rhel7-3" [ style = bold] 
+"container2_stop_0 rhel7-1" -> "stonith 'reboot' lxc2" [ style = bold] "container2_stop_0 rhel7-1" [ style=bold color="green" fontcolor="orange"] "lxc-ms-master_demote_0" -> "lxc-ms-master_demoted_0" [ style = bold] "lxc-ms-master_demote_0" -> "lxc-ms_demote_0 lxc1" [ style = bold] @@ -42,7 +45,7 @@ digraph "g" { "lxc-ms_demote_0 lxc1" -> "lxc-ms-master_demoted_0" [ style = bold] "lxc-ms_demote_0 lxc1" -> "lxc-ms_promote_0 lxc1" [ style = bold] "lxc-ms_demote_0 lxc1" -> "lxc-ms_stop_0 lxc1" [ style = bold] -"lxc-ms_demote_0 lxc1" [ style=bold color="green" fontcolor="black"] +"lxc-ms_demote_0 lxc1" [ style=bold color="green" fontcolor="orange"] "lxc-ms_monitor_10000 lxc2" [ style=bold color="green" fontcolor="black"] "lxc-ms_promote_0 lxc1" -> "lxc-ms-master_promoted_0" [ style = bold] "lxc-ms_promote_0 lxc1" [ style=bold color="green" fontcolor="black"] @@ -84,11 +87,17 @@ digraph "g" { "rsc_rhel7-1_stop_0 rhel7-1" -> "all_stopped" [ style = bold] "rsc_rhel7-1_stop_0 rhel7-1" -> "rsc_rhel7-1_start_0 rhel7-5" [ style = bold] "rsc_rhel7-1_stop_0 rhel7-1" [ style=bold color="green" fontcolor="orange"] +"stonith 'reboot' lxc1" -> "lxc-ms-master_stop_0" [ style = bold] +"stonith 'reboot' lxc1" -> "lxc-ms_demote_0 lxc1" [ style = bold] +"stonith 'reboot' lxc1" -> "lxc-ms_stop_0 lxc1" [ style = bold] +"stonith 'reboot' lxc1" -> "stonith_complete" [ style = bold] +"stonith 'reboot' lxc1" [ style=bold color="green" fontcolor="orange"] +"stonith 'reboot' lxc2" -> "lxc-ms-master_stop_0" [ style = bold] +"stonith 'reboot' lxc2" -> "lxc-ms_stop_0 lxc2" [ style = bold] +"stonith 'reboot' lxc2" -> "stonith_complete" [ style = bold] +"stonith 'reboot' lxc2" [ style=bold color="green" fontcolor="orange"] "stonith 'reboot' rhel7-1" -> "container1_stop_0 rhel7-1" [ style = bold] "stonith 'reboot' rhel7-1" -> "container2_stop_0 rhel7-1" [ style = bold] -"stonith 'reboot' rhel7-1" -> "lxc-ms-master_stop_0" [ style = bold] -"stonith 'reboot' rhel7-1" -> "lxc-ms_stop_0 lxc1" [ style = 
bold] -"stonith 'reboot' rhel7-1" -> "lxc-ms_stop_0 lxc2" [ style = bold] "stonith 'reboot' rhel7-1" -> "lxc1_stop_0 rhel7-1" [ style = bold] "stonith 'reboot' rhel7-1" -> "lxc2_stop_0 rhel7-1" [ style = bold] "stonith 'reboot' rhel7-1" -> "rsc_rhel7-1_stop_0 rhel7-1" [ style = bold] diff --git a/pengine/test10/guest-node-host-dies.exp b/pengine/test10/guest-node-host-dies.exp index b3c24be..6bd7a60 100644 --- a/pengine/test10/guest-node-host-dies.exp +++ b/pengine/test10/guest-node-host-dies.exp @@ -1,7 +1,7 @@ - + @@ -10,14 +10,17 @@ - + - + + + + @@ -30,33 +33,33 @@ - + - + - + - + - + @@ -65,26 +68,26 @@ - + - + - + - + @@ -93,7 +96,7 @@ - + @@ -102,20 +105,20 @@ - + - + - + @@ -124,7 +127,7 @@ - + @@ -133,23 +136,23 @@ - + - + - + - + @@ -164,69 +167,72 @@ - + + - + - + - + + + + - + - - + - + - + + + + + + + - - + - + - - - - + - - - - + - - - + + + + - + - + @@ -239,7 +245,7 @@ - + @@ -254,17 +260,16 @@ - - - - + + + - + - + @@ -276,7 +281,7 @@ - + @@ -299,7 +304,7 @@ - + @@ -329,10 +334,10 @@ - + - + @@ -350,7 +355,10 @@ - + + + + @@ -362,7 +370,7 @@ - + @@ -409,7 +417,7 @@ - + @@ -427,7 +435,7 @@ - + @@ -453,7 +461,7 @@ - + @@ -471,13 +479,13 @@ - + - + @@ -490,40 +498,76 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - + + + + + + + - + - + - + - + - + - + - + - + diff --git a/pengine/test10/guest-node-host-dies.summary b/pengine/test10/guest-node-host-dies.summary index 8a1bfd4..83af5ce 100644 --- a/pengine/test10/guest-node-host-dies.summary +++ b/pengine/test10/guest-node-host-dies.summary @@ -2,58 +2,58 @@ Current cluster status: Node rhel7-1 (1): UNCLEAN (offline) Online: [ rhel7-2 rhel7-3 rhel7-4 rhel7-5 ] -Containers: [ lxc1:container1 lxc2:container2 ] Fencing (stonith:fence_xvm): Started rhel7-4 rsc_rhel7-1 (ocf::heartbeat:IPaddr2): Started rhel7-1 ( UNCLEAN ) - container1 (ocf::heartbeat:VirtualDomain): Started rhel7-1 ( UNCLEAN ) - container2 (ocf::heartbeat:VirtualDomain): Started rhel7-1 ( UNCLEAN ) + container1 
(ocf::heartbeat:VirtualDomain): FAILED rhel7-1 (UNCLEAN) + container2 (ocf::heartbeat:VirtualDomain): FAILED rhel7-1 (UNCLEAN) Master/Slave Set: lxc-ms-master [lxc-ms] - Masters: [ lxc1 ] - Slaves: [ lxc2 ] + Stopped: [ rhel7-1 rhel7-2 rhel7-3 rhel7-4 rhel7-5 ] Transition Summary: * Restart Fencing (Started rhel7-4) * Move rsc_rhel7-1 (Started rhel7-1 -> rhel7-5) - * Move container1 (Started rhel7-1 -> rhel7-2) - * Move container2 (Started rhel7-1 -> rhel7-3) - * Restart lxc-ms:0 (Master lxc1) - * Restart lxc-ms:1 (Slave lxc2) + * Recover container1 (Started rhel7-1 -> rhel7-2) + * Recover container2 (Started rhel7-1 -> rhel7-3) + * Recover lxc-ms:0 (Master lxc1) + * Recover lxc-ms:1 (Slave lxc2) * Move lxc1 (Started rhel7-1 -> rhel7-2) * Move lxc2 (Started rhel7-1 -> rhel7-3) Executing cluster transition: * Resource action: Fencing stop on rhel7-4 - * Resource action: Fencing start on rhel7-4 - * Resource action: Fencing monitor=120000 on rhel7-4 * Pseudo action: lxc-ms-master_demote_0 * Fencing rhel7-1 (reboot) - * Pseudo action: stonith_complete * Pseudo action: rsc_rhel7-1_stop_0 - * Resource action: lxc-ms demote on lxc1 - * Pseudo action: lxc-ms-master_demoted_0 - * Pseudo action: lxc-ms-master_stop_0 * Pseudo action: lxc1_stop_0 * Pseudo action: lxc2_stop_0 - * Resource action: rsc_rhel7-1 start on rhel7-5 * Pseudo action: container1_stop_0 * Pseudo action: container2_stop_0 + * Pseudo action: stonith-lxc2-reboot on lxc2 + * Pseudo action: stonith-lxc1-reboot on lxc1 + * Pseudo action: stonith_complete + * Resource action: rsc_rhel7-1 start on rhel7-5 + * Resource action: container1 start on rhel7-2 + * Resource action: container2 start on rhel7-3 + * Pseudo action: lxc-ms_demote_0 + * Pseudo action: lxc-ms-master_demoted_0 + * Pseudo action: lxc-ms-master_stop_0 + * Resource action: lxc1 start on rhel7-2 + * Resource action: lxc2 start on rhel7-3 + * Resource action: rsc_rhel7-1 monitor=5000 on rhel7-5 * Pseudo action: lxc-ms_stop_0 * Pseudo action: 
lxc-ms_stop_0 * Pseudo action: lxc-ms-master_stopped_0 * Pseudo action: lxc-ms-master_start_0 + * Resource action: lxc1 monitor=30000 on rhel7-2 + * Resource action: lxc2 monitor=30000 on rhel7-3 * Pseudo action: all_stopped - * Resource action: rsc_rhel7-1 monitor=5000 on rhel7-5 - * Resource action: container1 start on rhel7-2 - * Resource action: container2 start on rhel7-3 - * Resource action: lxc1 start on rhel7-2 - * Resource action: lxc2 start on rhel7-3 + * Resource action: Fencing start on rhel7-4 + * Resource action: Fencing monitor=120000 on rhel7-4 * Resource action: lxc-ms start on lxc1 * Resource action: lxc-ms start on lxc2 - * Resource action: lxc-ms monitor=10000 on lxc2 * Pseudo action: lxc-ms-master_running_0 - * Resource action: lxc1 monitor=30000 on rhel7-2 - * Resource action: lxc2 monitor=30000 on rhel7-3 + * Resource action: lxc-ms monitor=10000 on lxc2 * Pseudo action: lxc-ms-master_promote_0 * Resource action: lxc-ms promote on lxc1 * Pseudo action: lxc-ms-master_promoted_0 diff --git a/pengine/test10/whitebox-fail1.dot b/pengine/test10/whitebox-fail1.dot index 0f0fe26..c6380ea 100644 --- a/pengine/test10/whitebox-fail1.dot +++ b/pengine/test10/whitebox-fail1.dot @@ -25,11 +25,9 @@ digraph "g" { "all_stopped" [ style=bold color="green" fontcolor="orange"] "container1_start_0 18node2" -> "lxc1_start_0 18node2" [ style = bold] "container1_start_0 18node2" [ style=bold color="green" fontcolor="black"] -"container1_stop_0 18node2" -> "B_stop_0 lxc1" [ style = bold] -"container1_stop_0 18node2" -> "M-clone_stop_0" [ style = bold] -"container1_stop_0 18node2" -> "M_stop_0 lxc1" [ style = bold] "container1_stop_0 18node2" -> "all_stopped" [ style = bold] "container1_stop_0 18node2" -> "container1_start_0 18node2" [ style = bold] +"container1_stop_0 18node2" -> "stonith 'reboot' lxc1" [ style = bold] "container1_stop_0 18node2" [ style=bold color="green" fontcolor="black"] "lxc1_monitor_30000 18node2" [ style=bold color="green" fontcolor="black"] 
"lxc1_start_0 18node2" -> "B_monitor_10000 lxc1" [ style = bold] @@ -42,4 +40,15 @@ digraph "g" { "lxc1_stop_0 18node2" -> "container1_stop_0 18node2" [ style = bold] "lxc1_stop_0 18node2" -> "lxc1_start_0 18node2" [ style = bold] "lxc1_stop_0 18node2" [ style=bold color="green" fontcolor="black"] +"stonith 'reboot' lxc1" -> "B_stop_0 lxc1" [ style = bold] +"stonith 'reboot' lxc1" -> "M-clone_stop_0" [ style = bold] +"stonith 'reboot' lxc1" -> "M_stop_0 lxc1" [ style = bold] +"stonith 'reboot' lxc1" -> "stonith_complete" [ style = bold] +"stonith 'reboot' lxc1" [ style=bold color="green" fontcolor="orange"] +"stonith_complete" -> "B_start_0 lxc1" [ style = bold] +"stonith_complete" -> "M_start_0 lxc1" [ style = bold] +"stonith_complete" -> "all_stopped" [ style = bold] +"stonith_complete" -> "container1_start_0 18node2" [ style = bold] +"stonith_complete" -> "lxc1_start_0 18node2" [ style = bold] +"stonith_complete" [ style=bold color="green" fontcolor="orange"] } diff --git a/pengine/test10/whitebox-fail1.exp b/pengine/test10/whitebox-fail1.exp index 9629a76..03e83c3 100644 --- a/pengine/test10/whitebox-fail1.exp +++ b/pengine/test10/whitebox-fail1.exp @@ -10,6 +10,9 @@ + + + @@ -58,6 +61,9 @@ + + + @@ -68,10 +74,10 @@ - + - + @@ -98,7 +104,7 @@ - + @@ -159,6 +165,9 @@ + + + @@ -169,7 +178,7 @@ - + @@ -187,6 +196,9 @@ + + + @@ -216,6 +228,33 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + @@ -233,6 +272,9 @@ + + + diff --git a/pengine/test10/whitebox-fail1.summary b/pengine/test10/whitebox-fail1.summary index 1586407..1872e9a 100644 --- a/pengine/test10/whitebox-fail1.summary +++ b/pengine/test10/whitebox-fail1.summary @@ -22,20 +22,22 @@ Transition Summary: Executing cluster transition: * Resource action: lxc1 stop on 18node2 * Resource action: container1 stop on 18node2 + * Pseudo action: stonith-lxc1-reboot on lxc1 + * Pseudo action: stonith_complete + * Resource action: container1 start on 18node2 * Pseudo action: M-clone_stop_0 * Pseudo action: 
B_stop_0 - * Resource action: container1 start on 18node2 + * Resource action: lxc1 start on 18node2 + * Resource action: lxc1 monitor=30000 on 18node2 * Pseudo action: M_stop_0 * Pseudo action: M-clone_stopped_0 * Pseudo action: M-clone_start_0 - * Resource action: lxc1 start on 18node2 - * Resource action: lxc1 monitor=30000 on 18node2 + * Resource action: B start on lxc1 * Pseudo action: all_stopped * Resource action: M start on lxc1 * Pseudo action: M-clone_running_0 - * Resource action: B start on lxc1 - * Resource action: M monitor=10000 on lxc1 * Resource action: B monitor=10000 on lxc1 + * Resource action: M monitor=10000 on lxc1 Revised cluster status: Online: [ 18node1 18node2 18node3 ] diff --git a/pengine/test10/whitebox-fail2.dot b/pengine/test10/whitebox-fail2.dot index 0f0fe26..c6380ea 100644 --- a/pengine/test10/whitebox-fail2.dot +++ b/pengine/test10/whitebox-fail2.dot @@ -25,11 +25,9 @@ digraph "g" { "all_stopped" [ style=bold color="green" fontcolor="orange"] "container1_start_0 18node2" -> "lxc1_start_0 18node2" [ style = bold] "container1_start_0 18node2" [ style=bold color="green" fontcolor="black"] -"container1_stop_0 18node2" -> "B_stop_0 lxc1" [ style = bold] -"container1_stop_0 18node2" -> "M-clone_stop_0" [ style = bold] -"container1_stop_0 18node2" -> "M_stop_0 lxc1" [ style = bold] "container1_stop_0 18node2" -> "all_stopped" [ style = bold] "container1_stop_0 18node2" -> "container1_start_0 18node2" [ style = bold] +"container1_stop_0 18node2" -> "stonith 'reboot' lxc1" [ style = bold] "container1_stop_0 18node2" [ style=bold color="green" fontcolor="black"] "lxc1_monitor_30000 18node2" [ style=bold color="green" fontcolor="black"] "lxc1_start_0 18node2" -> "B_monitor_10000 lxc1" [ style = bold] @@ -42,4 +40,15 @@ digraph "g" { "lxc1_stop_0 18node2" -> "container1_stop_0 18node2" [ style = bold] "lxc1_stop_0 18node2" -> "lxc1_start_0 18node2" [ style = bold] "lxc1_stop_0 18node2" [ style=bold color="green" fontcolor="black"] +"stonith 
'reboot' lxc1" -> "B_stop_0 lxc1" [ style = bold] +"stonith 'reboot' lxc1" -> "M-clone_stop_0" [ style = bold] +"stonith 'reboot' lxc1" -> "M_stop_0 lxc1" [ style = bold] +"stonith 'reboot' lxc1" -> "stonith_complete" [ style = bold] +"stonith 'reboot' lxc1" [ style=bold color="green" fontcolor="orange"] +"stonith_complete" -> "B_start_0 lxc1" [ style = bold] +"stonith_complete" -> "M_start_0 lxc1" [ style = bold] +"stonith_complete" -> "all_stopped" [ style = bold] +"stonith_complete" -> "container1_start_0 18node2" [ style = bold] +"stonith_complete" -> "lxc1_start_0 18node2" [ style = bold] +"stonith_complete" [ style=bold color="green" fontcolor="orange"] } diff --git a/pengine/test10/whitebox-fail2.exp b/pengine/test10/whitebox-fail2.exp index 9629a76..03e83c3 100644 --- a/pengine/test10/whitebox-fail2.exp +++ b/pengine/test10/whitebox-fail2.exp @@ -10,6 +10,9 @@ + + + @@ -58,6 +61,9 @@ + + + @@ -68,10 +74,10 @@ - + - + @@ -98,7 +104,7 @@ - + @@ -159,6 +165,9 @@ + + + @@ -169,7 +178,7 @@ - + @@ -187,6 +196,9 @@ + + + @@ -216,6 +228,33 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + @@ -233,6 +272,9 @@ + + + diff --git a/pengine/test10/whitebox-fail2.summary b/pengine/test10/whitebox-fail2.summary index ab40d99..5db6588 100644 --- a/pengine/test10/whitebox-fail2.summary +++ b/pengine/test10/whitebox-fail2.summary @@ -22,20 +22,22 @@ Transition Summary: Executing cluster transition: * Resource action: lxc1 stop on 18node2 * Resource action: container1 stop on 18node2 + * Pseudo action: stonith-lxc1-reboot on lxc1 + * Pseudo action: stonith_complete + * Resource action: container1 start on 18node2 * Pseudo action: M-clone_stop_0 * Pseudo action: B_stop_0 - * Resource action: container1 start on 18node2 + * Resource action: lxc1 start on 18node2 + * Resource action: lxc1 monitor=30000 on 18node2 * Pseudo action: M_stop_0 * Pseudo action: M-clone_stopped_0 * Pseudo action: M-clone_start_0 - * Resource action: lxc1 start on 18node2 - * Resource action: lxc1 
monitor=30000 on 18node2 + * Resource action: B start on lxc1 * Pseudo action: all_stopped * Resource action: M start on lxc1 * Pseudo action: M-clone_running_0 - * Resource action: B start on lxc1 - * Resource action: M monitor=10000 on lxc1 * Resource action: B monitor=10000 on lxc1 + * Resource action: M monitor=10000 on lxc1 Revised cluster status: Online: [ 18node1 18node2 18node3 ] diff --git a/pengine/test10/whitebox-imply-stop-on-fence.dot b/pengine/test10/whitebox-imply-stop-on-fence.dot index b3fd40b..0e17a16 100644 --- a/pengine/test10/whitebox-imply-stop-on-fence.dot +++ b/pengine/test10/whitebox-imply-stop-on-fence.dot @@ -6,6 +6,7 @@ "R-lxc-01_kiff-01_stop_0 kiff-01" -> "R-lxc-01_kiff-01_start_0 kiff-02" [ style = bold] "R-lxc-01_kiff-01_stop_0 kiff-01" -> "all_stopped" [ style = bold] "R-lxc-01_kiff-01_stop_0 kiff-01" -> "shared0-clone_stop_0" [ style = bold] +"R-lxc-01_kiff-01_stop_0 kiff-01" -> "stonith 'reboot' lxc-01_kiff-01" [ style = bold] "R-lxc-01_kiff-01_stop_0 kiff-01" [ style=bold color="green" fontcolor="orange"] "R-lxc-02_kiff-01_monitor_10000 kiff-02" [ style=bold color="green" fontcolor="black"] "R-lxc-02_kiff-01_start_0 kiff-02" -> "R-lxc-02_kiff-01_monitor_10000 kiff-02" [ style = bold] @@ -14,7 +15,9 @@ "R-lxc-02_kiff-01_stop_0 kiff-01" -> "R-lxc-02_kiff-01_start_0 kiff-02" [ style = bold] "R-lxc-02_kiff-01_stop_0 kiff-01" -> "all_stopped" [ style = bold] "R-lxc-02_kiff-01_stop_0 kiff-01" -> "shared0-clone_stop_0" [ style = bold] +"R-lxc-02_kiff-01_stop_0 kiff-01" -> "stonith 'reboot' lxc-02_kiff-01" [ style = bold] "R-lxc-02_kiff-01_stop_0 kiff-01" [ style=bold color="green" fontcolor="orange"] +"all_stopped" -> "fence-kiff-02_start_0 kiff-02" [ style = bold] "all_stopped" [ style=bold color="green" fontcolor="orange"] "clvmd-clone_stop_0" -> "clvmd-clone_stopped_0" [ style = bold] "clvmd-clone_stop_0" -> "clvmd_stop_0 kiff-01" [ style = bold] @@ -74,8 +77,12 @@ "stonith 'reboot' kiff-01" -> "shared0-clone_stop_0" [ style = bold] 
"stonith 'reboot' kiff-01" -> "shared0_stop_0 kiff-01" [ style = bold] "stonith 'reboot' kiff-01" -> "stonith_complete" [ style = bold] -"stonith 'reboot' kiff-01" -> "vm-fs_stop_0 lxc-01_kiff-01" [ style = bold] "stonith 'reboot' kiff-01" [ style=bold color="green" fontcolor="black"] +"stonith 'reboot' lxc-01_kiff-01" -> "stonith_complete" [ style = bold] +"stonith 'reboot' lxc-01_kiff-01" -> "vm-fs_stop_0 lxc-01_kiff-01" [ style = bold] +"stonith 'reboot' lxc-01_kiff-01" [ style=bold color="green" fontcolor="orange"] +"stonith 'reboot' lxc-02_kiff-01" -> "stonith_complete" [ style = bold] +"stonith 'reboot' lxc-02_kiff-01" [ style=bold color="green" fontcolor="orange"] "stonith_complete" -> "R-lxc-01_kiff-01_start_0 kiff-02" [ style = bold] "stonith_complete" -> "R-lxc-02_kiff-01_start_0 kiff-02" [ style = bold] "stonith_complete" -> "all_stopped" [ style = bold] diff --git a/pengine/test10/whitebox-imply-stop-on-fence.exp b/pengine/test10/whitebox-imply-stop-on-fence.exp index 0bd42b7..e38f4ea 100644 --- a/pengine/test10/whitebox-imply-stop-on-fence.exp +++ b/pengine/test10/whitebox-imply-stop-on-fence.exp @@ -1,33 +1,36 @@ - + - + - + - + + + + - + @@ -35,16 +38,16 @@ - + - + - + @@ -53,28 +56,28 @@ - + - + - + - + - + @@ -83,16 +86,16 @@ - + - + - + @@ -101,28 +104,28 @@ - + - + - + - + - + @@ -131,13 +134,13 @@ - + - + @@ -146,31 +149,31 @@ - + - + - + - + - + - + @@ -179,27 +182,27 @@ - + - + - + - + @@ -208,7 +211,7 @@ - + @@ -223,27 +226,27 @@ - + - + - + - + @@ -252,7 +255,7 @@ - + @@ -267,48 +270,48 @@ - + - + - + - - - - - - + + + + - + + + + + + + - - - - + + + - - - - + @@ -334,7 +337,7 @@ - + @@ -378,7 +381,7 @@ - + @@ -402,6 +405,36 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + @@ -413,7 +446,7 @@ - + @@ -423,35 +456,41 @@ + + + + + + - + - + - + - + - + - + - + - + - + diff --git a/pengine/test10/whitebox-imply-stop-on-fence.summary b/pengine/test10/whitebox-imply-stop-on-fence.summary index 3ee9570..50a3446 100644 --- 
a/pengine/test10/whitebox-imply-stop-on-fence.summary +++ b/pengine/test10/whitebox-imply-stop-on-fence.summary @@ -2,7 +2,7 @@ Current cluster status: Node kiff-01 (1): UNCLEAN (offline) Online: [ kiff-02 ] -Containers: [ lxc-01_kiff-01:R-lxc-01_kiff-01 lxc-01_kiff-02:R-lxc-01_kiff-02 lxc-02_kiff-01:R-lxc-02_kiff-01 lxc-02_kiff-02:R-lxc-02_kiff-02 ] +Containers: [ lxc-01_kiff-02:R-lxc-01_kiff-02 lxc-02_kiff-02:R-lxc-02_kiff-02 ] fence-kiff-01 (stonith:fence_ipmilan): Started kiff-02 fence-kiff-02 (stonith:fence_ipmilan): Started kiff-01 (UNCLEAN) @@ -18,37 +18,37 @@ Containers: [ lxc-01_kiff-01:R-lxc-01_kiff-01 lxc-01_kiff-02:R-lxc-01_kiff-02 lx shared0 (ocf::heartbeat:Filesystem): Started kiff-01 (UNCLEAN) Started: [ kiff-02 ] Stopped: [ lxc-01_kiff-01 lxc-01_kiff-02 lxc-02_kiff-01 lxc-02_kiff-02 ] - R-lxc-01_kiff-01 (ocf::heartbeat:VirtualDomain): Started kiff-01 (UNCLEAN) + R-lxc-01_kiff-01 (ocf::heartbeat:VirtualDomain): FAILED kiff-01 (UNCLEAN) R-lxc-02_kiff-01 (ocf::heartbeat:VirtualDomain): Started kiff-01 (UNCLEAN) R-lxc-01_kiff-02 (ocf::heartbeat:VirtualDomain): Started kiff-02 R-lxc-02_kiff-02 (ocf::heartbeat:VirtualDomain): Started kiff-02 - vm-fs (ocf::heartbeat:Filesystem): Started lxc-01_kiff-01 + vm-fs (ocf::heartbeat:Filesystem): FAILED lxc-01_kiff-01 Transition Summary: * Move fence-kiff-02 (Started kiff-01 -> kiff-02) * Stop dlm:0 (kiff-01) * Stop clvmd:0 (kiff-01) * Stop shared0:0 (kiff-01) - * Move R-lxc-01_kiff-01 (Started kiff-01 -> kiff-02) + * Recover R-lxc-01_kiff-01 (Started kiff-01 -> kiff-02) * Move R-lxc-02_kiff-01 (Started kiff-01 -> kiff-02) - * Restart vm-fs (Started lxc-01_kiff-01) + * Recover vm-fs (Started lxc-01_kiff-01) * Move lxc-01_kiff-01 (Started kiff-01 -> kiff-02) * Move lxc-02_kiff-01 (Started kiff-01 -> kiff-02) Executing cluster transition: * Pseudo action: fence-kiff-02_stop_0 * Fencing kiff-01 (reboot) - * Pseudo action: stonith_complete - * Resource action: fence-kiff-02 start on kiff-02 - * Pseudo action: 
vm-fs_stop_0 * Pseudo action: lxc-01_kiff-01_stop_0 * Pseudo action: lxc-02_kiff-01_stop_0 - * Resource action: fence-kiff-02 monitor=60000 on kiff-02 * Pseudo action: R-lxc-01_kiff-01_stop_0 * Pseudo action: R-lxc-02_kiff-01_stop_0 + * Pseudo action: stonith-lxc-02_kiff-01-reboot on lxc-02_kiff-01 + * Pseudo action: stonith-lxc-01_kiff-01-reboot on lxc-01_kiff-01 + * Pseudo action: stonith_complete * Pseudo action: shared0-clone_stop_0 * Resource action: R-lxc-01_kiff-01 start on kiff-02 * Resource action: R-lxc-02_kiff-01 start on kiff-02 + * Pseudo action: vm-fs_stop_0 * Resource action: lxc-01_kiff-01 start on kiff-02 * Resource action: lxc-02_kiff-01 start on kiff-02 * Pseudo action: shared0_stop_0 @@ -56,16 +56,18 @@ Executing cluster transition: * Resource action: R-lxc-01_kiff-01 monitor=10000 on kiff-02 * Resource action: R-lxc-02_kiff-01 monitor=10000 on kiff-02 * Resource action: vm-fs start on lxc-01_kiff-01 - * Resource action: vm-fs monitor=20000 on lxc-01_kiff-01 * Resource action: lxc-01_kiff-01 monitor=30000 on kiff-02 * Resource action: lxc-02_kiff-01 monitor=30000 on kiff-02 * Pseudo action: clvmd-clone_stop_0 + * Resource action: vm-fs monitor=20000 on lxc-01_kiff-01 * Pseudo action: clvmd_stop_0 * Pseudo action: clvmd-clone_stopped_0 * Pseudo action: dlm-clone_stop_0 * Pseudo action: dlm_stop_0 * Pseudo action: dlm-clone_stopped_0 * Pseudo action: all_stopped + * Resource action: fence-kiff-02 start on kiff-02 + * Resource action: fence-kiff-02 monitor=60000 on kiff-02 Revised cluster status: Online: [ kiff-02 ] diff --git a/pengine/test10/whitebox-ms-ordering.dot b/pengine/test10/whitebox-ms-ordering.dot index 7f03a65..bd77363 100644 --- a/pengine/test10/whitebox-ms-ordering.dot +++ b/pengine/test10/whitebox-ms-ordering.dot @@ -79,4 +79,22 @@ "lxc2_start_0 18node1" -> "lxc-ms_start_0 lxc2" [ style = bold] "lxc2_start_0 18node1" -> "lxc2_monitor_30000 18node1" [ style = bold] "lxc2_start_0 18node1" [ style=bold color="green" fontcolor="black"] 
+"stonith 'reboot' lxc1" -> "lxc-ms-master_stop_0" [ style = bold] +"stonith 'reboot' lxc1" -> "lxc-ms_demote_0 lxc1" [ style = bold] +"stonith 'reboot' lxc1" -> "lxc-ms_stop_0 lxc1" [ style = bold] +"stonith 'reboot' lxc1" -> "stonith_complete" [ style = bold] +"stonith 'reboot' lxc1" [ style=bold color="green" fontcolor="orange"] +"stonith 'reboot' lxc2" -> "lxc-ms-master_stop_0" [ style = bold] +"stonith 'reboot' lxc2" -> "lxc-ms_stop_0 lxc2" [ style = bold] +"stonith 'reboot' lxc2" -> "stonith_complete" [ style = bold] +"stonith 'reboot' lxc2" [ style=bold color="green" fontcolor="orange"] +"stonith_complete" -> "all_stopped" [ style = bold] +"stonith_complete" -> "container1_start_0 18node1" [ style = bold] +"stonith_complete" -> "container2_start_0 18node1" [ style = bold] +"stonith_complete" -> "lxc-ms_promote_0 lxc1" [ style = bold] +"stonith_complete" -> "lxc-ms_start_0 lxc1" [ style = bold] +"stonith_complete" -> "lxc-ms_start_0 lxc2" [ style = bold] +"stonith_complete" -> "lxc1_start_0 18node1" [ style = bold] +"stonith_complete" -> "lxc2_start_0 18node1" [ style = bold] +"stonith_complete" [ style=bold color="green" fontcolor="orange"] } diff --git a/pengine/test10/whitebox-ms-ordering.exp b/pengine/test10/whitebox-ms-ordering.exp index 0566f41..7aecfba 100644 --- a/pengine/test10/whitebox-ms-ordering.exp +++ b/pengine/test10/whitebox-ms-ordering.exp @@ -16,6 +16,9 @@ + + + @@ -62,6 +65,9 @@ + + + @@ -111,6 +117,9 @@ + + + @@ -130,6 +139,9 @@ + + + @@ -154,6 +166,9 @@ + + + @@ -166,6 +181,9 @@ + + + @@ -228,6 +246,9 @@ + + + @@ -240,6 +261,9 @@ + + + @@ -323,6 +347,12 @@ + + + + + + @@ -391,6 +421,9 @@ + + + @@ -417,10 +450,50 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + @@ -432,6 +505,9 @@ + + + diff --git a/pengine/test10/whitebox-ms-ordering.summary b/pengine/test10/whitebox-ms-ordering.summary index d6bbaaf..5d7c042 100644 --- a/pengine/test10/whitebox-ms-ordering.summary +++ 
b/pengine/test10/whitebox-ms-ordering.summary @@ -27,6 +27,9 @@ Executing cluster transition: * Resource action: lxc-ms monitor on 18node2 * Resource action: lxc-ms monitor on 18node1 * Pseudo action: lxc-ms-master_demote_0 + * Pseudo action: stonith-lxc2-reboot on lxc2 + * Pseudo action: stonith-lxc1-reboot on lxc1 + * Pseudo action: stonith_complete * Resource action: container1 start on 18node1 * Resource action: container2 start on 18node1 * Pseudo action: lxc-ms_demote_0 diff --git a/pengine/test10/whitebox-unexpectedly-running.dot b/pengine/test10/whitebox-unexpectedly-running.dot index d87344a..2915d77 100644 --- a/pengine/test10/whitebox-unexpectedly-running.dot +++ b/pengine/test10/whitebox-unexpectedly-running.dot @@ -5,9 +5,14 @@ "FAKE_start_0 18builder" [ style=bold color="green" fontcolor="black"] "FAKE_stop_0 18builder" -> "FAKE_start_0 18builder" [ style = bold] "FAKE_stop_0 18builder" -> "all_stopped" [ style = bold] +"FAKE_stop_0 18builder" -> "stonith 'reboot' remote1" [ style = bold] "FAKE_stop_0 18builder" [ style=bold color="green" fontcolor="black"] "all_stopped" [ style=bold color="green" fontcolor="orange"] "remote1_monitor_30000 18builder" [ style=bold color="green" fontcolor="black"] "remote1_start_0 18builder" -> "remote1_monitor_30000 18builder" [ style = bold] "remote1_start_0 18builder" [ style=bold color="green" fontcolor="black"] +"stonith 'reboot' remote1" -> "stonith_complete" [ style = bold] +"stonith 'reboot' remote1" [ style=bold color="green" fontcolor="orange"] +"stonith_complete" -> "all_stopped" [ style = bold] +"stonith_complete" [ style=bold color="green" fontcolor="orange"] } diff --git a/pengine/test10/whitebox-unexpectedly-running.exp b/pengine/test10/whitebox-unexpectedly-running.exp index 29cd66b..c5272f2 100644 --- a/pengine/test10/whitebox-unexpectedly-running.exp +++ b/pengine/test10/whitebox-unexpectedly-running.exp @@ -62,6 +62,33 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + @@ -70,6 +97,9 @@ + + + 
diff --git a/pengine/test10/whitebox-unexpectedly-running.summary b/pengine/test10/whitebox-unexpectedly-running.summary index f834e41..7d5b908 100644 --- a/pengine/test10/whitebox-unexpectedly-running.summary +++ b/pengine/test10/whitebox-unexpectedly-running.summary @@ -10,6 +10,8 @@ Transition Summary: Executing cluster transition: * Resource action: FAKE stop on 18builder + * Pseudo action: stonith-remote1-reboot on remote1 + * Pseudo action: stonith_complete * Pseudo action: all_stopped * Resource action: FAKE start on 18builder * Resource action: remote1 start on 18builder -- 1.8.3.1 From 92dd7d7616dc16d345ef73d0685b12e06d09b36b Mon Sep 17 00:00:00 2001 From: Ken Gaillot Date: Fri, 15 Apr 2016 15:04:03 -0500 Subject: [PATCH 8/8] Fix: crmd: update cache status for guest node whose host is fenced Normally, the remote RA's stop action handles setting the peer cache state to down (along with other side effects of a stop). However, if a guest node's host is fenced, the RA will not be called. Check for the fencing pseudo-action created by the pengine in this case. 
--- crmd/crmd_lrm.h | 1 + crmd/remote_lrmd_ra.c | 72 +++++++++++++++++++++++++++++++++++++++++++++++++-- crmd/te_actions.c | 4 +++ 3 files changed, 75 insertions(+), 2 deletions(-) diff --git a/crmd/crmd_lrm.h b/crmd/crmd_lrm.h index 412ce5b..08ba947 100644 --- a/crmd/crmd_lrm.h +++ b/crmd/crmd_lrm.h @@ -160,5 +160,6 @@ int remote_ra_exec(lrm_state_t * lrm_state, const char *rsc_id, const char *acti lrmd_key_value_t * params); void remote_ra_cleanup(lrm_state_t * lrm_state); void remote_ra_fail(const char *node_name); +void remote_ra_process_pseudo(xmlNode *xml); gboolean process_lrm_event(lrm_state_t * lrm_state, lrmd_event_data_t * op, struct recurring_op_s *pending); diff --git a/crmd/remote_lrmd_ra.c b/crmd/remote_lrmd_ra.c index b9c5068..eb995ea 100644 --- a/crmd/remote_lrmd_ra.c +++ b/crmd/remote_lrmd_ra.c @@ -226,14 +226,20 @@ remote_node_up(const char *node_name) free_xml(update); } +enum down_opts { + DOWN_KEEP_LRM, + DOWN_ERASE_LRM +}; + /*! * \internal * \brief Handle cluster communication related to pacemaker_remote node leaving * * \param[in] node_name Name of lost node + * \param[in] opts Whether to keep or erase LRM history */ static void -remote_node_down(const char *node_name) +remote_node_down(const char *node_name, const enum down_opts opts) { xmlNode *update; int call_id = 0; @@ -246,6 +252,14 @@ remote_node_down(const char *node_name) /* Purge node's transient attributes */ erase_status_tag(node_name, XML_TAG_TRANSIENT_NODEATTRS, call_opt); + /* Normally, the LRM operation history should be kept until the node comes + * back up. However, after a successful fence, we want to clear it, so we + * don't think resources are still running on the node. 
+ */ + if (opts == DOWN_ERASE_LRM) { + erase_status_tag(node_name, XML_CIB_TAG_LRM, call_opt); + } + /* Ensure node is in the remote peer cache with lost state */ node = crm_remote_peer_get(node_name); CRM_CHECK(node != NULL, return); @@ -301,7 +315,7 @@ check_remote_node_state(remote_ra_cmd_t *cmd) if (ra_data) { if (ra_data->migrate_status != takeover_complete) { /* Stop means down if we didn't successfully migrate elsewhere */ - remote_node_down(cmd->rsc_id); + remote_node_down(cmd->rsc_id, DOWN_KEEP_LRM); } else if (AM_I_DC == FALSE) { /* Only the connection host and DC track node state, * so if the connection migrated elsewhere and we aren't DC, @@ -1072,3 +1086,57 @@ remote_ra_fail(const char *node_name) } } +/* A guest node fencing implied by host fencing looks like: + * + * <pseudo_event ... operation="stonith" ...> + * <attributes .../> + * <downed> + * <node id="lxc1"/> + * </downed> + * </pseudo_event> + */ +#define XPATH_PSEUDO_FENCE "//" XML_GRAPH_TAG_PSEUDO_EVENT \ + "[@" XML_LRM_ATTR_TASK "='stonith']/" XML_GRAPH_TAG_DOWNED \ + "/" XML_CIB_TAG_NODE + +/*! + * \internal + * \brief Check a pseudo-action for Pacemaker Remote node side effects + * + * \param[in] xml XML of pseudo-action to check + */ +void +remote_ra_process_pseudo(xmlNode *xml) +{ + xmlXPathObjectPtr search = xpath_search(xml, XPATH_PSEUDO_FENCE); + + if (numXpathResults(search) == 1) { + xmlNode *result = getXpathResult(search, 0); + + /* Normally, we handle the necessary side effects of a guest node stop + * action when reporting the remote agent's result. However, if the stop + * is implied due to fencing, it will be a fencing pseudo-event, and + * there won't be a result to report. Handle that case here. + * + * This will result in a duplicate call to remote_node_down() if the + * guest stop was real instead of implied, but that shouldn't hurt. 
+ * + * There is still one corner case that isn't handled: if a guest node + * isn't running any resources when its host is fenced, it will appear + * to be cleanly stopped, so there will be no pseudo-fence, and our + * peer cache state will be incorrect unless and until the guest is + * recovered. + */ + if (result) { + const char *remote = ID(result); + + if (remote) { + remote_node_down(remote, DOWN_ERASE_LRM); + } + } + } + freeXpathObject(search); +} diff --git a/crmd/te_actions.c b/crmd/te_actions.c index c971273..01538af 100644 --- a/crmd/te_actions.c +++ b/crmd/te_actions.c @@ -27,6 +27,7 @@ #include #include +#include <crmd_lrm.h> #include #include #include @@ -52,6 +53,9 @@ te_start_action_timer(crm_graph_t * graph, crm_action_t * action) static gboolean te_pseudo_action(crm_graph_t * graph, crm_action_t * pseudo) { + /* Check action for Pacemaker Remote node side effects */ + remote_ra_process_pseudo(pseudo->xml); + crm_debug("Pseudo-action %d (%s) fired and confirmed", pseudo->id, crm_element_value(pseudo->xml, XML_LRM_ATTR_TASK_KEY)); te_action_confirmed(pseudo); -- 1.8.3.1