From c97ac6cda7f148dfdf09b8b4f4ce9762d9c59bcd Mon Sep 17 00:00:00 2001
From: Andrew Beekhof <andrew@beekhof.net>
Date: Tue, 4 Jul 2017 14:16:52 +1000
Subject: [PATCH] Fix: PE: Ensure unrecoverable remote nodes are fenced even if
no resources can run on them
---
pengine/allocate.c | 106 ++++++++++++++++++++++++++++++++---------------------
1 file changed, 65 insertions(+), 41 deletions(-)
diff --git a/pengine/allocate.c b/pengine/allocate.c
index 3a95be6..3a883ad 100644
--- a/pengine/allocate.c
+++ b/pengine/allocate.c
@@ -39,6 +39,16 @@ void migrate_reload_madness(pe_working_set_t * data_set);
extern void ReloadRsc(resource_t * rsc, node_t *node, pe_working_set_t * data_set);
extern gboolean DeleteRsc(resource_t * rsc, node_t * node, gboolean optional, pe_working_set_t * data_set);
static void apply_remote_node_ordering(pe_working_set_t *data_set);
+enum remote_connection_state
+{
+    remote_state_unknown = 0, /* node is NULL, or the connection is not running anywhere yet */
+    remote_state_alive = 1,   /* none of the other states applies */
+    remote_state_resting = 2, /* connection host is unclean/offline, or the connection is mid-migration */
+    remote_state_failed = 3,  /* connection is treated as unrecoverable; probe_resources() fences an offline remote node in this state */
+    remote_state_stopped = 4  /* NOTE(review): alternative to 'failed' in a branch whose guarding condition is outside this patch's context - confirm */
+};
+static enum remote_connection_state get_remote_node_state(pe_node_t *node); /* must follow the enum: ISO C forbids forward references to enum types */
+
resource_alloc_functions_t resource_class_alloc_functions[] = {
{
@@ -886,21 +896,25 @@ probe_resources(pe_working_set_t * data_set)
{
action_t *probe_node_complete = NULL;
- GListPtr gIter = NULL;
- GListPtr gIter2 = NULL;
-
- for (gIter = data_set->nodes; gIter != NULL; gIter = gIter->next) {
+ for (GListPtr gIter = data_set->nodes; gIter != NULL; gIter = gIter->next) {
node_t *node = (node_t *) gIter->data;
const char *probed = g_hash_table_lookup(node->details->attrs, CRM_OP_PROBED);
- if (node->details->online == FALSE) {
+ if (is_container_remote_node(node)) {
+ /* TODO enable guest node probes once ordered probing is implemented */
continue;
- } else if (node->details->unclean) {
+ } else if (node->details->online == FALSE && node->details->remote_rsc) {
+ enum remote_connection_state state = get_remote_node_state(node);
+ if(state == remote_state_failed) {
+ pe_fence_node(data_set, node, "the connection is unrecoverable");
+ }
continue;
- } else if (is_container_remote_node(node)) {
- /* TODO enable guest node probes once ordered probing is implemented */
+ } else if(node->details->online == FALSE) {
+ continue;
+
+ } else if (node->details->unclean) {
continue;
} else if (node->details->rsc_discovery_enabled == FALSE) {
@@ -916,7 +930,7 @@ probe_resources(pe_working_set_t * data_set)
continue;
}
- for (gIter2 = data_set->resources; gIter2 != NULL; gIter2 = gIter2->next) {
+ for (GListPtr gIter2 = data_set->resources; gIter2 != NULL; gIter2 = gIter2->next) {
resource_t *rsc = (resource_t *) gIter2->data;
rsc->cmds->create_probe(rsc, node, probe_node_complete, FALSE, data_set);
@@ -1749,15 +1763,6 @@ rsc_order_first(resource_t * lh_rsc, order_constraint_t * order, pe_working_set_
extern gboolean update_action(action_t * action);
extern void update_colo_start_chain(action_t * action);
-enum remote_connection_state
-{
- remote_state_unknown = 0,
- remote_state_alive = 1,
- remote_state_resting = 2,
- remote_state_failed = 3,
- remote_state_stopped = 4
-};
-
static int
is_recurring_action(action_t *action)
{
@@ -1874,29 +1879,24 @@ apply_container_ordering(action_t *action, pe_working_set_t *data_set)
}
}
-static void
-apply_remote_ordering(action_t *action, pe_working_set_t *data_set)
+static enum remote_connection_state
+get_remote_node_state(pe_node_t *node)
{
resource_t *remote_rsc = NULL;
node_t *cluster_node = NULL;
- enum action_tasks task = text2task(action->task);
- enum remote_connection_state state = remote_state_unknown;
- enum pe_ordering order_opts = pe_order_none;
- if (action->rsc == NULL) {
- return;
+ if(node == NULL) {
+ return remote_state_unknown;
}
- CRM_ASSERT(action->node);
- CRM_ASSERT(is_remote_node(action->node));
-
- remote_rsc = action->node->details->remote_rsc;
+ remote_rsc = node->details->remote_rsc;
CRM_ASSERT(remote_rsc);
if(remote_rsc->running_on) {
cluster_node = remote_rsc->running_on->data;
}
+
/* If the cluster node the remote connection resource resides on
* is unclean or went offline, we can't process any operations
* on that remote node until after it starts elsewhere.
@@ -1911,21 +1911,21 @@ apply_remote_ordering(action_t *action, pe_working_set_t *data_set)
* must assume the worst and fence it.
*/
if (is_set(remote_rsc->flags, pe_rsc_failed)) {
- state = remote_state_failed;
+ return remote_state_failed;
} else if(cluster_node && cluster_node->details->unclean) {
- state = remote_state_failed;
+ return remote_state_failed;
} else {
- state = remote_state_stopped;
+ return remote_state_stopped;
}
} else if (cluster_node == NULL) {
/* Connection is recoverable but not currently running anywhere, see if we can recover it first */
- state = remote_state_unknown;
+ return remote_state_unknown;
} else if(cluster_node->details->unclean == TRUE
|| cluster_node->details->online == FALSE) {
/* Connection is running on a dead node, see if we can recover it first */
- state = remote_state_resting;
+ return remote_state_resting;
} else if (g_list_length(remote_rsc->running_on) > 1
&& remote_rsc->partial_migration_source
@@ -1934,10 +1934,34 @@ apply_remote_ordering(action_t *action, pe_working_set_t *data_set)
* wait until after the resource migrates before performing
* any actions.
*/
- state = remote_state_resting;
+ return remote_state_resting;
- } else {
- state = remote_state_alive;
+ }
+ return remote_state_alive;
+}
+
+static void
+apply_remote_ordering(action_t *action, pe_working_set_t *data_set)
+{
+ resource_t *remote_rsc = NULL;
+ node_t *cluster_node = NULL;
+ enum action_tasks task = text2task(action->task);
+ enum remote_connection_state state = get_remote_node_state(action->node);
+
+ enum pe_ordering order_opts = pe_order_none;
+
+ if (action->rsc == NULL) {
+ return;
+ }
+
+ CRM_ASSERT(action->node);
+ CRM_ASSERT(is_remote_node(action->node));
+
+ remote_rsc = action->node->details->remote_rsc;
+ CRM_ASSERT(remote_rsc);
+
+ if(remote_rsc->running_on) {
+ cluster_node = remote_rsc->running_on->data;
}
crm_trace("Order %s action %s relative to %s%s (state %d)",
@@ -2049,13 +2073,11 @@ apply_remote_ordering(action_t *action, pe_working_set_t *data_set)
static void
apply_remote_node_ordering(pe_working_set_t *data_set)
{
- GListPtr gIter = data_set->actions;
-
if (is_set(data_set->flags, pe_flag_have_remote_nodes) == FALSE) {
return;
}
- for (; gIter != NULL; gIter = gIter->next) {
+ for (GListPtr gIter = data_set->actions; gIter != NULL; gIter = gIter->next) {
action_t *action = (action_t *) gIter->data;
if (action->rsc == NULL) {
@@ -2092,12 +2114,14 @@ apply_remote_node_ordering(pe_working_set_t *data_set)
is_remote_node(action->node) == FALSE ||
action->node->details->remote_rsc == NULL ||
is_set(action->flags, pe_action_pseudo)) {
- crm_trace("Nothing required for %s", action->uuid);
+ crm_trace("Nothing required for %s on %s", action->uuid, action->node?action->node->details->uname:"NA");
} else if(action->node->details->remote_rsc->container) {
+ crm_trace("Container ordering for %s", action->uuid);
apply_container_ordering(action, data_set);
} else {
+ crm_trace("Remote ordering for %s", action->uuid);
apply_remote_ordering(action, data_set);
}
}
--
1.8.3.1