From 083c3a49ad41bd17387c8ae661c23b44d4b845c6 Mon Sep 17 00:00:00 2001
From: Ken Gaillot
Date: Tue, 30 May 2017 14:43:25 -0500
Subject: [PATCH] Log: pengine,libpe_status: revisit fencing messages

---
 lib/pengine/unpack.c | 72 ++++++++++++++++++++++++++++++++--------------------
 pengine/allocate.c   | 65 ++++++++++++++++++++++++++---------------------
 2 files changed, 81 insertions(+), 56 deletions(-)

diff --git a/lib/pengine/unpack.c b/lib/pengine/unpack.c
index 377100c..21eca90 100644
--- a/lib/pengine/unpack.c
+++ b/lib/pengine/unpack.c
@@ -63,6 +63,13 @@ is_dangling_container_remote_node(node_t *node)
 }
 
+/*!
+ * \brief Schedule a fence action for a node
+ *
+ * \param[in,out] data_set Current working set of cluster
+ * \param[in,out] node     Node to fence
+ * \param[in]     reason   Text description of why fencing is needed
+ */
 void
 pe_fence_node(pe_working_set_t * data_set, node_t * node, const char *reason)
 {
@@ -74,11 +81,13 @@ pe_fence_node(pe_working_set_t * data_set, node_t * node, const char *reason)
 
         if (is_set(rsc->flags, pe_rsc_failed) == FALSE) {
             if (!is_set(rsc->flags, pe_rsc_managed)) {
-                crm_notice("Not fencing node %s due to '%s': container %s is"
-                           " unmanaged"
-                           "%s", node->details->uname, reason, rsc->id);
+                crm_notice("Not fencing guest node %s "
+                           "(otherwise would because %s): "
+                           "its guest resource %s is unmanaged",
+                           node->details->uname, reason, rsc->id);
             } else {
-                crm_warn("Remote node %s will be fenced due to '%s' by recovering %s",
+                crm_warn("Guest node %s will be fenced "
+                         "(by recovering its guest resource %s): %s",
                          node->details->uname, rsc->id, reason);
 
                 /* We don't mark the node as unclean because that would prevent the
@@ -91,8 +100,9 @@ pe_fence_node(pe_working_set_t * data_set, node_t * node, const char *reason)
         }
 
     } else if (is_dangling_container_remote_node(node)) {
-        crm_info("Cleaning up dangling connection resource for guest node %s due to '%s'"
-                 " (fencing is already done, guest resource no longer exists)",
+        crm_info("Cleaning up dangling connection for guest node %s: "
+                 "fencing was already done because %s, "
+                 "and guest resource no longer exists",
                  node->details->uname, reason);
         set_bit(node->details->remote_rsc->flags, pe_rsc_failed);
 
@@ -100,31 +110,29 @@ pe_fence_node(pe_working_set_t * data_set, node_t * node, const char *reason)
         resource_t *rsc = node->details->remote_rsc;
 
         if (rsc && (!is_set(rsc->flags, pe_rsc_managed))) {
-            crm_notice("Not fencing node %s due to '%s': connection is unmanaged",
+            crm_notice("Not fencing remote node %s "
+                       "(otherwise would because %s): connection is unmanaged",
                        node->details->uname, reason);
         } else if(node->details->remote_requires_reset == FALSE) {
             node->details->remote_requires_reset = TRUE;
-            if (pe_can_fence(data_set, node)) {
-                crm_warn("Remote node %s will be fenced due to %s", node->details->uname, reason);
-            } else {
-                crm_warn("Remote node %s is unclean due to %s", node->details->uname, reason);
-            }
+            crm_warn("Remote node %s %s: %s",
+                     node->details->uname,
+                     pe_can_fence(data_set, node)? "will be fenced" : "is unclean",
+                     reason);
         }
         node->details->unclean = TRUE;
 
     } else if (node->details->unclean) {
-        if (pe_can_fence(data_set, node)) {
-            crm_trace("Node %s would also be fenced due to '%s'", node->details->uname, reason);
-        } else {
-            crm_trace("Node %s is also unclean due to '%s'", node->details->uname, reason);
-        }
-
-    } else if (pe_can_fence(data_set, node)) {
-        crm_warn("Node %s will be fenced due to %s", node->details->uname, reason);
-        node->details->unclean = TRUE;
+        crm_trace("Cluster node %s %s because %s",
+                  node->details->uname,
+                  pe_can_fence(data_set, node)? "would also be fenced" : "is also unclean",
+                  reason);
 
     } else {
-        crm_warn("Node %s is unclean due to %s", node->details->uname, reason);
+        crm_warn("Cluster node %s %s: %s",
+                 node->details->uname,
+                 pe_can_fence(data_set, node)? "will be fenced" : "is unclean",
+                 reason);
         node->details->unclean = TRUE;
     }
 }
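The rewritten messages above all follow one convention: the verb is chosen by pe_can_fence(), so a node is only said to be fenced when fencing can actually be carried out, and is otherwise merely reported as unclean. A minimal sketch of that convention (the helper name fence_verb is hypothetical; pe_can_fence() and the types are the real APIs used in the hunks above):

    /* Hypothetical helper illustrating the message convention above:
     * claim "will be fenced" only when pe_can_fence() reports that fencing
     * is actually possible; otherwise describe the node as merely unclean.
     */
    static const char *
    fence_verb(pe_working_set_t *data_set, node_t *node)
    {
        return pe_can_fence(data_set, node)? "will be fenced" : "is unclean";
    }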
"will be fenced" : "is unclean", + reason); } node->details->unclean = TRUE; } else if (node->details->unclean) { - if (pe_can_fence(data_set, node)) { - crm_trace("Node %s would also be fenced due to '%s'", node->details->uname, reason); - } else { - crm_trace("Node %s is also unclean due to '%s'", node->details->uname, reason); - } - - } else if (pe_can_fence(data_set, node)) { - crm_warn("Node %s will be fenced due to %s", node->details->uname, reason); - node->details->unclean = TRUE; + crm_trace("Cluster node %s %s because %s", + node->details->uname, + pe_can_fence(data_set, node)? "would also be fenced" : "also is unclean", + reason); } else { - crm_warn("Node %s is unclean due to %s", node->details->uname, reason); + crm_warn("Cluster node %s %s: %s", + node->details->uname, + pe_can_fence(data_set, node)? "will be fenced" : "is unclean", + reason); node->details->unclean = TRUE; } } @@ -1878,6 +1886,8 @@ process_rsc_state(resource_t * rsc, node_t * node, xmlNode * migrate_op, pe_working_set_t * data_set) { node_t *tmpnode = NULL; + char *reason = NULL; + CRM_ASSERT(rsc); pe_rsc_trace(rsc, "Resource %s is %s on %s: on_fail=%s", rsc->id, role2text(rsc->role), node->details->uname, fail2text(on_fail)); @@ -1907,7 +1917,6 @@ process_rsc_state(resource_t * rsc, node_t * node, && node->details->maintenance == FALSE && is_set(rsc->flags, pe_rsc_managed)) { - char *reason = NULL; gboolean should_fence = FALSE; /* If this is a guest node, fence it (regardless of whether fencing is @@ -1922,14 +1931,19 @@ process_rsc_state(resource_t * rsc, node_t * node, should_fence = TRUE; } else if (is_set(data_set->flags, pe_flag_stonith_enabled)) { - if (is_baremetal_remote_node(node) && node->details->remote_rsc && is_not_set(node->details->remote_rsc->flags, pe_rsc_failed)) { + if (is_baremetal_remote_node(node) && node->details->remote_rsc + && is_not_set(node->details->remote_rsc->flags, pe_rsc_failed)) { + /* setting unseen = true means that fencing of the remote node will * only occur if the connection resource is not going to start somewhere. * This allows connection resources on a failed cluster-node to move to * another node without requiring the baremetal remote nodes to be fenced * as well. */ node->details->unseen = TRUE; - reason = crm_strdup_printf("%s is active there. Fencing will be revoked if remote-node connection can be re-established on another cluster-node.", rsc->id); + reason = crm_strdup_printf("%s is active there (fencing will be" + " revoked if remote connection can " + "be re-established elsewhere)", + rsc->id); } should_fence = TRUE; } @@ -1959,7 +1973,9 @@ process_rsc_state(resource_t * rsc, node_t * node, /* treat it as if it is still running * but also mark the node as unclean */ - pe_fence_node(data_set, node, "resource failure(s)"); + reason = crm_strdup_printf("%s failed there", rsc->id); + pe_fence_node(data_set, node, reason); + free(reason); break; case action_fail_standby: @@ -2002,6 +2018,7 @@ process_rsc_state(resource_t * rsc, node_t * node, stop_action(rsc, node, FALSE); } break; + case action_fail_reset_remote: set_bit(rsc->flags, pe_rsc_failed); if (is_set(data_set->flags, pe_flag_stonith_enabled)) { @@ -2015,7 +2032,8 @@ process_rsc_state(resource_t * rsc, node_t * node, /* connection resource to baremetal resource failed in a way that * should result in fencing the remote-node. 
diff --git a/pengine/allocate.c b/pengine/allocate.c
index 0020af6..f2987cc 100644
--- a/pengine/allocate.c
+++ b/pengine/allocate.c
@@ -467,7 +467,7 @@ check_actions_for(xmlNode * rsc_entry, resource_t * rsc, node_t * node, pe_worki
             set_bit(action_clear->flags, pe_action_runnable);
 
             crm_notice("Clearing failure of %s on %s "
-                       "action definition changed " CRM_XS " %s",
+                       "because action definition changed " CRM_XS " %s",
                        rsc->id, node->details->uname, action_clear->uuid);
         }
     }
@@ -1789,7 +1789,6 @@ apply_container_ordering(action_t *action, pe_working_set_t *data_set)
 
     CRM_ASSERT(action->node);
     CRM_ASSERT(is_remote_node(action->node));
-    CRM_ASSERT(action->node->details->remote_rsc);
 
     remote_rsc = action->node->details->remote_rsc;
     CRM_ASSERT(remote_rsc);
@@ -1801,7 +1800,13 @@ apply_container_ordering(action_t *action, pe_working_set_t *data_set)
         pe_fence_node(data_set, action->node, "container failed");
     }
 
-    crm_trace("%s %s %s %s %d", action->uuid, action->task, remote_rsc->id, container->id, is_set(container->flags, pe_rsc_failed));
+    crm_trace("Order %s action %s relative to %s%s for %s%s",
+              action->task, action->uuid,
+              is_set(remote_rsc->flags, pe_rsc_failed)? "failed " : "",
+              remote_rsc->id,
+              is_set(container->flags, pe_rsc_failed)? "failed " : "",
+              container->id);
+
     switch (task) {
         case start_rsc:
         case action_promote:
@@ -1874,6 +1879,7 @@ apply_remote_ordering(action_t *action, pe_working_set_t *data_set)
     node_t *cluster_node = NULL;
     enum action_tasks task = text2task(action->task);
     enum remote_connection_state state = remote_state_unknown;
+    enum pe_ordering order_opts = pe_order_none;
 
     if (action->rsc == NULL) {
         return;
@@ -1881,7 +1887,6 @@ apply_remote_ordering(action_t *action, pe_working_set_t *data_set)
 
     CRM_ASSERT(action->node);
     CRM_ASSERT(is_remote_node(action->node));
-    CRM_ASSERT(action->node->details->remote_rsc);
 
     remote_rsc = action->node->details->remote_rsc;
     CRM_ASSERT(remote_rsc);
@@ -1895,7 +1900,7 @@ apply_remote_ordering(action_t *action, pe_working_set_t *data_set)
          * on that remote node until after it starts elsewhere.
          */
         if(remote_rsc->next_role == RSC_ROLE_STOPPED || remote_rsc->allocated_to == NULL) {
-            /* There is no-where left to run the connection resource
+            /* There is nowhere left to run the connection resource,
              * and the resource is in a failed state (either directly
              * or because it is located on a failed node).
              *
@@ -1903,8 +1908,7 @@ apply_remote_ordering(action_t *action, pe_working_set_t *data_set)
              * or if there are resources in an unknown state (probe), we
              * must assume the worst and fence it.
              */
-
-            if(is_set(action->node->details->remote_rsc->flags, pe_rsc_failed)) {
+            if (is_set(remote_rsc->flags, pe_rsc_failed)) {
                 state = remote_state_failed;
             } else if(cluster_node && cluster_node->details->unclean) {
                 state = remote_state_failed;
@@ -1934,22 +1938,31 @@ apply_remote_ordering(action_t *action, pe_working_set_t *data_set)
         state = remote_state_alive;
     }
 
-    crm_trace("%s %s %s %d %d", action->uuid, action->task, action->node->details->uname, state, is_set(remote_rsc->flags, pe_rsc_failed));
+    crm_trace("Order %s action %s relative to %s%s (state %d)",
+              action->task, action->uuid,
+              is_set(remote_rsc->flags, pe_rsc_failed)? "failed " : "",
+              remote_rsc->id, state);
     switch (task) {
         case start_rsc:
         case action_promote:
-            if(state == remote_state_failed) {
-                /* Wait for the connection resource to be up and force recovery */
-                custom_action_order(remote_rsc, generate_op_key(remote_rsc->id, RSC_START, 0), NULL,
-                                    action->rsc, NULL, action,
-                                    pe_order_preserve | pe_order_implies_then | pe_order_runnable_left, data_set);
-            } else {
-                /* Ensure the connection resource is up and assume everything is as we left it */
-                custom_action_order(remote_rsc, generate_op_key(remote_rsc->id, RSC_START, 0), NULL,
-                                    action->rsc, NULL, action,
-                                    pe_order_preserve | pe_order_runnable_left, data_set);
+            /* This is an internally generated constraint, exempt from
+             * user constraint prohibitions, and this action isn't runnable
+             * if the connection start isn't runnable.
+             */
+            order_opts = pe_order_preserve | pe_order_runnable_left;
+
+            if (state == remote_state_failed) {
+                /* Force recovery, by making this action required */
+                order_opts |= pe_order_implies_then;
             }
+
+            /* Ensure connection is up before running this action */
+            custom_action_order(remote_rsc,
+                                generate_op_key(remote_rsc->id, RSC_START, 0),
+                                NULL, action->rsc, NULL, action, order_opts,
+                                data_set);
             break;
+
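The order_opts composition just above is the core of this hunk: two near-duplicate custom_action_order() calls collapse into one whose flags vary with the connection state. Expanded, the new logic is equivalent to the following sketch (the calls mirror the removed code; it is shown only to make the two resulting flag sets explicit):

    /* Equivalent expansion of the order_opts logic above: a failed
     * connection additionally gets pe_order_implies_then, which makes the
     * connection start mandatory and thereby forces recovery; both paths
     * keep pe_order_preserve and pe_order_runnable_left.
     */
    if (state == remote_state_failed) {
        custom_action_order(remote_rsc, generate_op_key(remote_rsc->id, RSC_START, 0), NULL,
                            action->rsc, NULL, action,
                            pe_order_preserve | pe_order_runnable_left
                            | pe_order_implies_then, data_set);
    } else {
        custom_action_order(remote_rsc, generate_op_key(remote_rsc->id, RSC_START, 0), NULL,
                            action->rsc, NULL, action,
                            pe_order_preserve | pe_order_runnable_left, data_set);
    }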
"failed " : "", + remote_rsc->id, state); switch (task) { case start_rsc: case action_promote: - if(state == remote_state_failed) { - /* Wait for the connection resource to be up and force recovery */ - custom_action_order(remote_rsc, generate_op_key(remote_rsc->id, RSC_START, 0), NULL, - action->rsc, NULL, action, - pe_order_preserve | pe_order_implies_then | pe_order_runnable_left, data_set); - } else { - /* Ensure the connection resource is up and assume everything is as we left it */ - custom_action_order(remote_rsc, generate_op_key(remote_rsc->id, RSC_START, 0), NULL, - action->rsc, NULL, action, - pe_order_preserve | pe_order_runnable_left, data_set); + /* This as an internally generated constraint exempt from + * user constraint prohibitions, and this action isn't runnable + * if the connection start isn't runnable. + */ + order_opts = pe_order_preserve | pe_order_runnable_left; + + if (state == remote_state_failed) { + /* Force recovery, by making this action required */ + order_opts |= pe_order_implies_then; } + + /* Ensure connection is up before running this action */ + custom_action_order(remote_rsc, + generate_op_key(remote_rsc->id, RSC_START, 0), + NULL, action->rsc, NULL, action, order_opts, + data_set); break; + case stop_rsc: /* Handle special case with remote node where stop actions need to be * ordered after the connection resource starts somewhere else. @@ -1975,22 +1988,19 @@ apply_remote_ordering(action_t *action, pe_working_set_t *data_set) pe_order_preserve | pe_order_implies_first, data_set); } break; - case action_demote: - /* If the connection is being torn down, we don't want - * to build a constraint between a resource's demotion and - * the connection resource starting... because the connection - * resource can not start. The connection might already be up, - * but the "start" action would not be allowed, which in turn would - * block the demotion of any resources living in the node. + case action_demote: + /* Only order this demote relative to the connection start if the + * connection isn't being torn down. Otherwise, the demote would be + * blocked because the connection start would not be allowed. */ - if(state == remote_state_resting || state == remote_state_unknown) { custom_action_order(remote_rsc, generate_op_key(remote_rsc->id, RSC_START, 0), NULL, action->rsc, NULL, action, pe_order_preserve, data_set); } /* Otherwise we can rely on the stop ordering */ break; + default: /* Wait for the connection resource to be up */ if (is_recurring_action(action)) { @@ -2261,15 +2271,12 @@ stage7(pe_working_set_t * data_set) order_probes(data_set); crm_trace("Updating %d actions", g_list_length(data_set->actions)); - for (gIter = data_set->actions; gIter != NULL; gIter = gIter->next) { action_t *action = (action_t *) gIter->data; update_action(action); } - crm_trace("Processing reloads"); - LogNodeActions(data_set, FALSE); for (gIter = data_set->resources; gIter != NULL; gIter = gIter->next) { resource_t *rsc = (resource_t *) gIter->data; -- 1.8.3.1