From 083c3a49ad41bd17387c8ae661c23b44d4b845c6 Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Tue, 30 May 2017 14:43:25 -0500
Subject: [PATCH] Log: pengine,libpe_status: revisit fencing messages

---
 lib/pengine/unpack.c | 72 ++++++++++++++++++++++++++++++++--------------------
 pengine/allocate.c   | 65 ++++++++++++++++++++++++++---------------------
 2 files changed, 81 insertions(+), 56 deletions(-)

diff --git a/lib/pengine/unpack.c b/lib/pengine/unpack.c
index 377100c..21eca90 100644
--- a/lib/pengine/unpack.c
+++ b/lib/pengine/unpack.c
@@ -63,6 +63,13 @@ is_dangling_container_remote_node(node_t *node)
 }
 
 
+/*!
+ * \brief Schedule a fence action for a node
+ *
+ * \param[in,out] data_set  Current working set of cluster
+ * \param[in,out] node      Node to fence
+ * \param[in]     reason    Text description of why fencing is needed
+ */
 void
 pe_fence_node(pe_working_set_t * data_set, node_t * node, const char *reason)
 {
@@ -74,11 +81,13 @@ pe_fence_node(pe_working_set_t * data_set, node_t * node, const char *reason)
 
         if (is_set(rsc->flags, pe_rsc_failed) == FALSE) {
             if (!is_set(rsc->flags, pe_rsc_managed)) {
-                crm_notice("Not fencing node %s due to '%s': container %s is"
-                           " unmanaged"
-                           "%s", node->details->uname, reason, rsc->id);
+                crm_notice("Not fencing guest node %s "
+                           "(otherwise would because %s): "
+                           "its guest resource %s is unmanaged",
+                           node->details->uname, reason, rsc->id);
             } else {
-                crm_warn("Remote node %s will be fenced due to '%s' by recovering %s",
+                crm_warn("Guest node %s will be fenced "
+                         "(by recovering its guest resource %s): %s",
                          node->details->uname, rsc->id, reason);
 
                 /* We don't mark the node as unclean because that would prevent the
@@ -91,8 +100,9 @@ pe_fence_node(pe_working_set_t * data_set, node_t * node, const char *reason)
         }
 
     } else if (is_dangling_container_remote_node(node)) {
-        crm_info("Cleaning up dangling connection resource for guest node %s due to '%s'"
-                 " (fencing is already done, guest resource no longer exists)",
+        crm_info("Cleaning up dangling connection for guest node %s: "
+                 "fencing was already done because %s, "
+                 "and guest resource no longer exists",
                  node->details->uname, reason);
         set_bit(node->details->remote_rsc->flags, pe_rsc_failed);
 
@@ -100,31 +110,29 @@ pe_fence_node(pe_working_set_t * data_set, node_t * node, const char *reason)
         resource_t *rsc = node->details->remote_rsc;
 
         if (rsc && (!is_set(rsc->flags, pe_rsc_managed))) {
-            crm_notice("Not fencing node %s due to '%s': connection is unmanaged",
+            crm_notice("Not fencing remote node %s "
+                       "(otherwise would because %s): connection is unmanaged",
                        node->details->uname, reason);
         } else if(node->details->remote_requires_reset == FALSE) {
             node->details->remote_requires_reset = TRUE;
-            if (pe_can_fence(data_set, node)) {
-                crm_warn("Remote node %s will be fenced due to %s", node->details->uname, reason);
-            } else {
-                crm_warn("Remote node %s is unclean due to %s", node->details->uname, reason);
-            }
+            crm_warn("Remote node %s %s: %s",
+                     node->details->uname,
+                     pe_can_fence(data_set, node)? "will be fenced" : "is unclean",
+                     reason);
         }
         node->details->unclean = TRUE;
 
     } else if (node->details->unclean) {
-        if (pe_can_fence(data_set, node)) {
-            crm_trace("Node %s would also be fenced due to '%s'", node->details->uname, reason);
-        } else {
-            crm_trace("Node %s is also unclean due to '%s'", node->details->uname, reason);
-        }
-
-    } else if (pe_can_fence(data_set, node)) {
-        crm_warn("Node %s will be fenced due to %s", node->details->uname, reason);
-        node->details->unclean = TRUE;
+        crm_trace("Cluster node %s %s because %s",
+                  node->details->uname,
+                  pe_can_fence(data_set, node)? "would also be fenced" : "also is unclean",
+                  reason);
 
     } else {
-        crm_warn("Node %s is unclean due to %s", node->details->uname, reason);
+        crm_warn("Cluster node %s %s: %s",
+                 node->details->uname,
+                 pe_can_fence(data_set, node)? "will be fenced" : "is unclean",
+                 reason);
         node->details->unclean = TRUE;
     }
 }
@@ -1878,6 +1886,8 @@ process_rsc_state(resource_t * rsc, node_t * node,
                   xmlNode * migrate_op, pe_working_set_t * data_set)
 {
     node_t *tmpnode = NULL;
+    char *reason = NULL;
+
     CRM_ASSERT(rsc);
     pe_rsc_trace(rsc, "Resource %s is %s on %s: on_fail=%s",
                  rsc->id, role2text(rsc->role), node->details->uname, fail2text(on_fail));
@@ -1907,7 +1917,6 @@ process_rsc_state(resource_t * rsc, node_t * node,
         && node->details->maintenance == FALSE
         && is_set(rsc->flags, pe_rsc_managed)) {
 
-        char *reason = NULL;
         gboolean should_fence = FALSE;
 
         /* If this is a guest node, fence it (regardless of whether fencing is
@@ -1922,14 +1931,19 @@ process_rsc_state(resource_t * rsc, node_t * node,
             should_fence = TRUE;
 
         } else if (is_set(data_set->flags, pe_flag_stonith_enabled)) {
-            if (is_baremetal_remote_node(node) && node->details->remote_rsc && is_not_set(node->details->remote_rsc->flags, pe_rsc_failed)) {
+            if (is_baremetal_remote_node(node) && node->details->remote_rsc
+                && is_not_set(node->details->remote_rsc->flags, pe_rsc_failed)) {
+
                 /* setting unseen = true means that fencing of the remote node will
                  * only occur if the connection resource is not going to start somewhere.
                  * This allows connection resources on a failed cluster-node to move to
                  * another node without requiring the baremetal remote nodes to be fenced
                  * as well. */
                 node->details->unseen = TRUE;
-                reason = crm_strdup_printf("%s is active there. Fencing will be revoked if remote-node connection can be re-established on another cluster-node.", rsc->id);
+                reason = crm_strdup_printf("%s is active there (fencing will be"
+                                           " revoked if remote connection can "
+                                           "be re-established elsewhere)",
+                                           rsc->id);
             }
             should_fence = TRUE;
         }
@@ -1959,7 +1973,9 @@ process_rsc_state(resource_t * rsc, node_t * node,
             /* treat it as if it is still running
              * but also mark the node as unclean
              */
-            pe_fence_node(data_set, node, "resource failure(s)");
+            reason = crm_strdup_printf("%s failed there", rsc->id);
+            pe_fence_node(data_set, node, reason);
+            free(reason);
             break;
 
         case action_fail_standby:
@@ -2002,6 +2018,7 @@ process_rsc_state(resource_t * rsc, node_t * node,
                 stop_action(rsc, node, FALSE);
             }
             break;
+
         case action_fail_reset_remote:
             set_bit(rsc->flags, pe_rsc_failed);
             if (is_set(data_set->flags, pe_flag_stonith_enabled)) {
@@ -2015,7 +2032,8 @@ process_rsc_state(resource_t * rsc, node_t * node,
 
                     /* connection resource to baremetal resource failed in a way that
                      * should result in fencing the remote-node. */
-                    pe_fence_node(data_set, tmpnode, "of connection failure(s)");
+                    pe_fence_node(data_set, tmpnode,
+                                  "remote connection is unrecoverable");
                 }
             }
 
diff --git a/pengine/allocate.c b/pengine/allocate.c
index 0020af6..f2987cc 100644
--- a/pengine/allocate.c
+++ b/pengine/allocate.c
@@ -467,7 +467,7 @@ check_actions_for(xmlNode * rsc_entry, resource_t * rsc, node_t * node, pe_worki
             set_bit(action_clear->flags, pe_action_runnable);
 
             crm_notice("Clearing failure of %s on %s "
-                       "action definition changed " CRM_XS " %s",
+                       "because action definition changed " CRM_XS " %s",
                        rsc->id, node->details->uname, action_clear->uuid);
         }
     }
@@ -1789,7 +1789,6 @@ apply_container_ordering(action_t *action, pe_working_set_t *data_set)
 
     CRM_ASSERT(action->node);
     CRM_ASSERT(is_remote_node(action->node));
-    CRM_ASSERT(action->node->details->remote_rsc);
 
     remote_rsc = action->node->details->remote_rsc;
     CRM_ASSERT(remote_rsc);
@@ -1801,7 +1800,13 @@ apply_container_ordering(action_t *action, pe_working_set_t *data_set)
         pe_fence_node(data_set, action->node, "container failed");
     }
 
-    crm_trace("%s %s %s %s %d", action->uuid, action->task, remote_rsc->id, container->id, is_set(container->flags, pe_rsc_failed));
+    crm_trace("Order %s action %s relative to %s%s for %s%s",
+              action->task, action->uuid,
+              is_set(remote_rsc->flags, pe_rsc_failed)? "failed " : "",
+              remote_rsc->id,
+              is_set(container->flags, pe_rsc_failed)? "failed " : "",
+              container->id);
+
     switch (task) {
         case start_rsc:
         case action_promote:
@@ -1874,6 +1879,7 @@ apply_remote_ordering(action_t *action, pe_working_set_t *data_set)
     node_t *cluster_node = NULL;
     enum action_tasks task = text2task(action->task);
     enum remote_connection_state state = remote_state_unknown;
+    enum pe_ordering order_opts = pe_order_none;
 
     if (action->rsc == NULL) {
         return;
@@ -1881,7 +1887,6 @@ apply_remote_ordering(action_t *action, pe_working_set_t *data_set)
 
     CRM_ASSERT(action->node);
     CRM_ASSERT(is_remote_node(action->node));
-    CRM_ASSERT(action->node->details->remote_rsc);
 
     remote_rsc = action->node->details->remote_rsc;
     CRM_ASSERT(remote_rsc);
@@ -1895,7 +1900,7 @@ apply_remote_ordering(action_t *action, pe_working_set_t *data_set)
      * on that remote node until after it starts elsewhere.
      */
     if(remote_rsc->next_role == RSC_ROLE_STOPPED || remote_rsc->allocated_to == NULL) {
-        /* There is no-where left to run the connection resource
+        /* There is nowhere left to run the connection resource,
         * and the resource is in a failed state (either directly
         * or because it is located on a failed node).
        *
@@ -1903,8 +1908,7 @@ apply_remote_ordering(action_t *action, pe_working_set_t *data_set)
         * or if there are resources in an unknown state (probe), we
         * must assume the worst and fence it.
         */
-
-        if(is_set(action->node->details->remote_rsc->flags, pe_rsc_failed)) {
+        if (is_set(remote_rsc->flags, pe_rsc_failed)) {
            state = remote_state_failed;
        } else if(cluster_node && cluster_node->details->unclean) {
            state = remote_state_failed;
@@ -1934,22 +1938,31 @@ apply_remote_ordering(action_t *action, pe_working_set_t *data_set)
         state = remote_state_alive;
     }
 
-    crm_trace("%s %s %s %d %d", action->uuid, action->task, action->node->details->uname, state, is_set(remote_rsc->flags, pe_rsc_failed));
+    crm_trace("Order %s action %s relative to %s%s (state %d)",
+              action->task, action->uuid,
+              is_set(remote_rsc->flags, pe_rsc_failed)? "failed " : "",
+              remote_rsc->id, state);
     switch (task) {
         case start_rsc:
         case action_promote:
-            if(state == remote_state_failed) {
-                /* Wait for the connection resource to be up and force recovery */
-                custom_action_order(remote_rsc, generate_op_key(remote_rsc->id, RSC_START, 0), NULL,
-                                    action->rsc, NULL, action,
-                                    pe_order_preserve | pe_order_implies_then | pe_order_runnable_left, data_set);
-            } else {
-                /* Ensure the connection resource is up and assume everything is as we left it */
-                custom_action_order(remote_rsc, generate_op_key(remote_rsc->id, RSC_START, 0), NULL,
-                                    action->rsc, NULL, action,
-                                    pe_order_preserve | pe_order_runnable_left, data_set);
+            /* This as an internally generated constraint exempt from
+             * user constraint prohibitions, and this action isn't runnable
+             * if the connection start isn't runnable.
+             */
+            order_opts = pe_order_preserve | pe_order_runnable_left;
+
+            if (state == remote_state_failed) {
+                /* Force recovery, by making this action required */
+                order_opts |= pe_order_implies_then;
             }
+
+            /* Ensure connection is up before running this action */
+            custom_action_order(remote_rsc,
+                                generate_op_key(remote_rsc->id, RSC_START, 0),
+                                NULL, action->rsc, NULL, action, order_opts,
+                                data_set);
             break;
+
         case stop_rsc:
             /* Handle special case with remote node where stop actions need to be
              * ordered after the connection resource starts somewhere else.
@@ -1975,22 +1988,19 @@ apply_remote_ordering(action_t *action, pe_working_set_t *data_set)
                                     pe_order_preserve | pe_order_implies_first, data_set);
             }
             break;
-        case action_demote:
 
-            /* If the connection is being torn down, we don't want
-             * to build a constraint between a resource's demotion and
-             * the connection resource starting... because the connection
-             * resource can not start. The connection might already be up,
-             * but the "start" action would not be allowed, which in turn would
-             * block the demotion of any resources living in the node.
+        case action_demote:
+            /* Only order this demote relative to the connection start if the
+             * connection isn't being torn down. Otherwise, the demote would be
+             * blocked because the connection start would not be allowed.
              */
-
             if(state == remote_state_resting || state == remote_state_unknown) {
                 custom_action_order(remote_rsc, generate_op_key(remote_rsc->id, RSC_START, 0), NULL,
                                     action->rsc, NULL, action,
                                     pe_order_preserve, data_set);
             } /* Otherwise we can rely on the stop ordering */
             break;
+
         default:
             /* Wait for the connection resource to be up */
             if (is_recurring_action(action)) {
@@ -2261,15 +2271,12 @@ stage7(pe_working_set_t * data_set)
     order_probes(data_set);
 
     crm_trace("Updating %d actions", g_list_length(data_set->actions));
-
     for (gIter = data_set->actions; gIter != NULL; gIter = gIter->next) {
         action_t *action = (action_t *) gIter->data;
 
         update_action(action);
     }
 
-    crm_trace("Processing reloads");
-
     LogNodeActions(data_set, FALSE);
     for (gIter = data_set->resources; gIter != NULL; gIter = gIter->next) {
         resource_t *rsc = (resource_t *) gIter->data;
-- 
1.8.3.1
