Blob Blame History Raw
From c97ac6cda7f148dfdf09b8b4f4ce9762d9c59bcd Mon Sep 17 00:00:00 2001
From: Andrew Beekhof <andrew@beekhof.net>
Date: Tue, 4 Jul 2017 14:16:52 +1000
Subject: [PATCH] Fix: PE: Ensure unrecoverable remote nodes are fenced even if
 no resources can run on them

---
 pengine/allocate.c | 106 ++++++++++++++++++++++++++++++++---------------------
 1 file changed, 65 insertions(+), 41 deletions(-)

diff --git a/pengine/allocate.c b/pengine/allocate.c
index 3a95be6..3a883ad 100644
--- a/pengine/allocate.c
+++ b/pengine/allocate.c
@@ -39,6 +39,16 @@ void migrate_reload_madness(pe_working_set_t * data_set);
 extern void ReloadRsc(resource_t * rsc, node_t *node, pe_working_set_t * data_set);
 extern gboolean DeleteRsc(resource_t * rsc, node_t * node, gboolean optional, pe_working_set_t * data_set);
 static void apply_remote_node_ordering(pe_working_set_t *data_set);
+enum remote_connection_state /* state of a remote node's connection resource */
+{
+    remote_state_unknown = 0, /* no node given, or connection not running anywhere yet */
+    remote_state_alive = 1,   /* default when no other state applies */
+    remote_state_resting = 2, /* host is unclean/offline, or connection is mid-migration */
+    remote_state_failed = 3,  /* probe_resources() fences the remote node in this state */
+    remote_state_stopped = 4
+};
+static enum remote_connection_state get_remote_node_state(pe_node_t *node); /* after the enum: ISO C forbids forward enum references */
+
 
 resource_alloc_functions_t resource_class_alloc_functions[] = {
     {
@@ -886,21 +896,25 @@ probe_resources(pe_working_set_t * data_set)
 {
     action_t *probe_node_complete = NULL;
 
-    GListPtr gIter = NULL;
-    GListPtr gIter2 = NULL;
-
-    for (gIter = data_set->nodes; gIter != NULL; gIter = gIter->next) {
+    for (GListPtr gIter = data_set->nodes; gIter != NULL; gIter = gIter->next) {
         node_t *node = (node_t *) gIter->data;
         const char *probed = g_hash_table_lookup(node->details->attrs, CRM_OP_PROBED);
 
-        if (node->details->online == FALSE) {
+        if (is_container_remote_node(node)) {
+            /* TODO enable guest node probes once ordered probing is implemented */
             continue;
 
-        } else if (node->details->unclean) {
+        } else if (node->details->online == FALSE && node->details->remote_rsc) {
+            enum remote_connection_state state = get_remote_node_state(node);
+            if(state == remote_state_failed) {
+                pe_fence_node(data_set, node, "the connection is unrecoverable");
+            }
             continue;
 
-        } else if (is_container_remote_node(node)) {
-            /* TODO enable guest node probes once ordered probing is implemented */
+        } else if(node->details->online == FALSE) {
+            continue;
+
+        } else if (node->details->unclean) {
             continue;
 
         } else if (node->details->rsc_discovery_enabled == FALSE) {
@@ -916,7 +930,7 @@ probe_resources(pe_working_set_t * data_set)
             continue;
         }
 
-        for (gIter2 = data_set->resources; gIter2 != NULL; gIter2 = gIter2->next) {
+        for (GListPtr gIter2 = data_set->resources; gIter2 != NULL; gIter2 = gIter2->next) {
             resource_t *rsc = (resource_t *) gIter2->data;
 
             rsc->cmds->create_probe(rsc, node, probe_node_complete, FALSE, data_set);
@@ -1749,15 +1763,6 @@ rsc_order_first(resource_t * lh_rsc, order_constraint_t * order, pe_working_set_
 extern gboolean update_action(action_t * action);
 extern void update_colo_start_chain(action_t * action);
 
-enum remote_connection_state 
-{
-    remote_state_unknown = 0,
-    remote_state_alive = 1,
-    remote_state_resting = 2,
-    remote_state_failed = 3,
-    remote_state_stopped = 4
-};
-
 static int
 is_recurring_action(action_t *action) 
 {
@@ -1874,29 +1879,24 @@ apply_container_ordering(action_t *action, pe_working_set_t *data_set)
     }
 }
 
-static void
-apply_remote_ordering(action_t *action, pe_working_set_t *data_set)
+static enum remote_connection_state
+get_remote_node_state(pe_node_t *node) 
 {
     resource_t *remote_rsc = NULL;
     node_t *cluster_node = NULL;
-    enum action_tasks task = text2task(action->task);
-    enum remote_connection_state state = remote_state_unknown;
-    enum pe_ordering order_opts = pe_order_none;
 
-    if (action->rsc == NULL) {
-        return;
+    if(node == NULL) {
+        return remote_state_unknown;
     }
 
-    CRM_ASSERT(action->node);
-    CRM_ASSERT(is_remote_node(action->node));
-
-    remote_rsc = action->node->details->remote_rsc;
+    remote_rsc = node->details->remote_rsc;
     CRM_ASSERT(remote_rsc);
 
     if(remote_rsc->running_on) {
         cluster_node = remote_rsc->running_on->data;
     }
 
+
     /* If the cluster node the remote connection resource resides on
      * is unclean or went offline, we can't process any operations
      * on that remote node until after it starts elsewhere.
@@ -1911,21 +1911,21 @@ apply_remote_ordering(action_t *action, pe_working_set_t *data_set)
          * must assume the worst and fence it.
          */
         if (is_set(remote_rsc->flags, pe_rsc_failed)) {
-            state = remote_state_failed;
+            return remote_state_failed;
         } else if(cluster_node && cluster_node->details->unclean) {
-            state = remote_state_failed;
+            return remote_state_failed;
         } else {
-            state = remote_state_stopped;
+            return remote_state_stopped;
         }
 
     } else if (cluster_node == NULL) {
         /* Connection is recoverable but not currently running anywhere, see if we can recover it first */
-        state = remote_state_unknown;
+        return remote_state_unknown;
 
     } else if(cluster_node->details->unclean == TRUE
               || cluster_node->details->online == FALSE) {
         /* Connection is running on a dead node, see if we can recover it first */
-        state = remote_state_resting;
+        return remote_state_resting;
 
     } else if (g_list_length(remote_rsc->running_on) > 1
                && remote_rsc->partial_migration_source
@@ -1934,10 +1934,34 @@ apply_remote_ordering(action_t *action, pe_working_set_t *data_set)
          * wait until after the resource migrates before performing
          * any actions.
          */
-        state = remote_state_resting;
+        return remote_state_resting;
 
-    } else {
-        state = remote_state_alive;
+    }
+    return remote_state_alive;
+}
+
+static void
+apply_remote_ordering(action_t *action, pe_working_set_t *data_set)
+{
+    resource_t *remote_rsc = NULL;
+    node_t *cluster_node = NULL;
+    enum action_tasks task = text2task(action->task);
+    enum remote_connection_state state = get_remote_node_state(action->node);
+
+    enum pe_ordering order_opts = pe_order_none;
+
+    if (action->rsc == NULL) {
+        return;
+    }
+
+    CRM_ASSERT(action->node);
+    CRM_ASSERT(is_remote_node(action->node));
+
+    remote_rsc = action->node->details->remote_rsc;
+    CRM_ASSERT(remote_rsc);
+
+    if(remote_rsc->running_on) {
+        cluster_node = remote_rsc->running_on->data;
     }
 
     crm_trace("Order %s action %s relative to %s%s (state %d)",
@@ -2049,13 +2073,11 @@ apply_remote_ordering(action_t *action, pe_working_set_t *data_set)
 static void
 apply_remote_node_ordering(pe_working_set_t *data_set)
 {
-    GListPtr gIter = data_set->actions;
-
     if (is_set(data_set->flags, pe_flag_have_remote_nodes) == FALSE) {
         return;
     }
 
-    for (; gIter != NULL; gIter = gIter->next) {
+    for (GListPtr gIter = data_set->actions; gIter != NULL; gIter = gIter->next) {
         action_t *action = (action_t *) gIter->data;
 
         if (action->rsc == NULL) {
@@ -2092,12 +2114,14 @@ apply_remote_node_ordering(pe_working_set_t *data_set)
             is_remote_node(action->node) == FALSE ||
             action->node->details->remote_rsc == NULL ||
             is_set(action->flags, pe_action_pseudo)) {
-            crm_trace("Nothing required for %s", action->uuid);
+            crm_trace("Nothing required for %s on %s", action->uuid, action->node?action->node->details->uname:"NA");
 
         } else if(action->node->details->remote_rsc->container) {
+            crm_trace("Container ordering for %s", action->uuid);
             apply_container_ordering(action, data_set);
 
         } else {
+            crm_trace("Remote ordering for %s", action->uuid);
             apply_remote_ordering(action, data_set);
         }
     }
-- 
1.8.3.1