Blame SOURCES/082-unrecoverable-remotes.patch

356a11
From c97ac6cda7f148dfdf09b8b4f4ce9762d9c59bcd Mon Sep 17 00:00:00 2001
356a11
From: Andrew Beekhof <andrew@beekhof.net>
356a11
Date: Tue, 4 Jul 2017 14:16:52 +1000
356a11
Subject: [PATCH] Fix: PE: Ensure unrecoverable remote nodes are fenced even if
356a11
 no resources can run on them
356a11
356a11
---
356a11
 pengine/allocate.c | 106 ++++++++++++++++++++++++++++++++---------------------
356a11
 1 file changed, 65 insertions(+), 41 deletions(-)
356a11
356a11
diff --git a/pengine/allocate.c b/pengine/allocate.c
356a11
index 3a95be6..3a883ad 100644
356a11
--- a/pengine/allocate.c
356a11
+++ b/pengine/allocate.c
356a11
@@ -39,6 +39,16 @@ void migrate_reload_madness(pe_working_set_t * data_set);
356a11
 extern void ReloadRsc(resource_t * rsc, node_t *node, pe_working_set_t * data_set);
356a11
 extern gboolean DeleteRsc(resource_t * rsc, node_t * node, gboolean optional, pe_working_set_t * data_set);
356a11
 static void apply_remote_node_ordering(pe_working_set_t *data_set);
356a11
+enum remote_connection_state /* result of get_remote_node_state() */
356a11
+{
356a11
+    remote_state_unknown = 0,  /* no node given, or connection not running anywhere */
356a11
+    remote_state_alive = 1,    /* none of the other cases matched */
356a11
+    remote_state_resting = 2,  /* connection host unclean/offline, or connection mid-migration */
356a11
+    remote_state_failed = 3,   /* connection failed or its host is unclean; treated as unrecoverable */
356a11
+    remote_state_stopped = 4   /* same branch as failed, but neither failure condition holds */
356a11
+};
356a11
+/* Defined further down; declared after the enum so the type is complete (ISO C) */
356a11
+static enum remote_connection_state get_remote_node_state(pe_node_t *node);
356a11
 
356a11
 resource_alloc_functions_t resource_class_alloc_functions[] = {
356a11
     {
356a11
@@ -886,21 +896,25 @@ probe_resources(pe_working_set_t * data_set)
356a11
 {
356a11
     action_t *probe_node_complete = NULL;
356a11
 
356a11
-    GListPtr gIter = NULL;
356a11
-    GListPtr gIter2 = NULL;
356a11
-
356a11
-    for (gIter = data_set->nodes; gIter != NULL; gIter = gIter->next) {
356a11
+    for (GListPtr gIter = data_set->nodes; gIter != NULL; gIter = gIter->next) {
356a11
         node_t *node = (node_t *) gIter->data;
356a11
         const char *probed = g_hash_table_lookup(node->details->attrs, CRM_OP_PROBED);
356a11
 
356a11
-        if (node->details->online == FALSE) {
356a11
+        if (is_container_remote_node(node)) {
356a11
+            /* TODO enable guest node probes once ordered probing is implemented */
356a11
             continue;
356a11
 
356a11
-        } else if (node->details->unclean) {
356a11
+        } else if (node->details->online == FALSE && node->details->remote_rsc) {
356a11
+            enum remote_connection_state state = get_remote_node_state(node);
356a11
+            if(state == remote_state_failed) {
356a11
+                pe_fence_node(data_set, node, "the connection is unrecoverable");
356a11
+            }
356a11
             continue;
356a11
 
356a11
-        } else if (is_container_remote_node(node)) {
356a11
-            /* TODO enable guest node probes once ordered probing is implemented */
356a11
+        } else if(node->details->online == FALSE) {
356a11
+            continue;
356a11
+
356a11
+        } else if (node->details->unclean) {
356a11
             continue;
356a11
 
356a11
         } else if (node->details->rsc_discovery_enabled == FALSE) {
356a11
@@ -916,7 +930,7 @@ probe_resources(pe_working_set_t * data_set)
356a11
             continue;
356a11
         }
356a11
 
356a11
-        for (gIter2 = data_set->resources; gIter2 != NULL; gIter2 = gIter2->next) {
356a11
+        for (GListPtr gIter2 = data_set->resources; gIter2 != NULL; gIter2 = gIter2->next) {
356a11
             resource_t *rsc = (resource_t *) gIter2->data;
356a11
 
356a11
             rsc->cmds->create_probe(rsc, node, probe_node_complete, FALSE, data_set);
356a11
@@ -1749,15 +1763,6 @@ rsc_order_first(resource_t * lh_rsc, order_constraint_t * order, pe_working_set_
356a11
 extern gboolean update_action(action_t * action);
356a11
 extern void update_colo_start_chain(action_t * action);
356a11
 
356a11
-enum remote_connection_state 
356a11
-{
356a11
-    remote_state_unknown = 0,
356a11
-    remote_state_alive = 1,
356a11
-    remote_state_resting = 2,
356a11
-    remote_state_failed = 3,
356a11
-    remote_state_stopped = 4
356a11
-};
356a11
-
356a11
 static int
356a11
 is_recurring_action(action_t *action) 
356a11
 {
356a11
@@ -1874,29 +1879,24 @@ apply_container_ordering(action_t *action, pe_working_set_t *data_set)
356a11
     }
356a11
 }
356a11
 
356a11
-static void
356a11
-apply_remote_ordering(action_t *action, pe_working_set_t *data_set)
356a11
+static enum remote_connection_state
356a11
+get_remote_node_state(pe_node_t *node) 
356a11
 {
356a11
     resource_t *remote_rsc = NULL;
356a11
     node_t *cluster_node = NULL;
356a11
-    enum action_tasks task = text2task(action->task);
356a11
-    enum remote_connection_state state = remote_state_unknown;
356a11
-    enum pe_ordering order_opts = pe_order_none;
356a11
 
356a11
-    if (action->rsc == NULL) {
356a11
-        return;
356a11
+    if(node == NULL) {
356a11
+        return remote_state_unknown;
356a11
     }
356a11
 
356a11
-    CRM_ASSERT(action->node);
356a11
-    CRM_ASSERT(is_remote_node(action->node));
356a11
-
356a11
-    remote_rsc = action->node->details->remote_rsc;
356a11
+    remote_rsc = node->details->remote_rsc;
356a11
     CRM_ASSERT(remote_rsc);
356a11
 
356a11
     if(remote_rsc->running_on) {
356a11
         cluster_node = remote_rsc->running_on->data;
356a11
     }
356a11
 
356a11
+
356a11
     /* If the cluster node the remote connection resource resides on
356a11
      * is unclean or went offline, we can't process any operations
356a11
      * on that remote node until after it starts elsewhere.
356a11
@@ -1911,21 +1911,21 @@ apply_remote_ordering(action_t *action, pe_working_set_t *data_set)
356a11
          * must assume the worst and fence it.
356a11
          */
356a11
         if (is_set(remote_rsc->flags, pe_rsc_failed)) {
356a11
-            state = remote_state_failed;
356a11
+            return remote_state_failed;
356a11
         } else if(cluster_node && cluster_node->details->unclean) {
356a11
-            state = remote_state_failed;
356a11
+            return remote_state_failed;
356a11
         } else {
356a11
-            state = remote_state_stopped;
356a11
+            return remote_state_stopped;
356a11
         }
356a11
 
356a11
     } else if (cluster_node == NULL) {
356a11
         /* Connection is recoverable but not currently running anywhere, see if we can recover it first */
356a11
-        state = remote_state_unknown;
356a11
+        return remote_state_unknown;
356a11
 
356a11
     } else if(cluster_node->details->unclean == TRUE
356a11
               || cluster_node->details->online == FALSE) {
356a11
         /* Connection is running on a dead node, see if we can recover it first */
356a11
-        state = remote_state_resting;
356a11
+        return remote_state_resting;
356a11
 
356a11
     } else if (g_list_length(remote_rsc->running_on) > 1
356a11
                && remote_rsc->partial_migration_source
356a11
@@ -1934,10 +1934,34 @@ apply_remote_ordering(action_t *action, pe_working_set_t *data_set)
356a11
          * wait until after the resource migrates before performing
356a11
          * any actions.
356a11
          */
356a11
-        state = remote_state_resting;
356a11
+        return remote_state_resting;
356a11
 
356a11
-    } else {
356a11
-        state = remote_state_alive;
356a11
+    }
356a11
+    return remote_state_alive;
356a11
+}
356a11
+
356a11
+static void
356a11
+apply_remote_ordering(action_t *action, pe_working_set_t *data_set)
356a11
+{
356a11
+    resource_t *remote_rsc = NULL;
356a11
+    node_t *cluster_node = NULL;
356a11
+    enum action_tasks task = text2task(action->task);
356a11
+    enum remote_connection_state state = get_remote_node_state(action->node);
356a11
+
356a11
+    enum pe_ordering order_opts = pe_order_none;
356a11
+
356a11
+    if (action->rsc == NULL) {
356a11
+        return;
356a11
+    }
356a11
+
356a11
+    CRM_ASSERT(action->node);
356a11
+    CRM_ASSERT(is_remote_node(action->node));
356a11
+
356a11
+    remote_rsc = action->node->details->remote_rsc;
356a11
+    CRM_ASSERT(remote_rsc);
356a11
+
356a11
+    if(remote_rsc->running_on) {
356a11
+        cluster_node = remote_rsc->running_on->data;
356a11
     }
356a11
 
356a11
     crm_trace("Order %s action %s relative to %s%s (state %d)",
356a11
@@ -2049,13 +2073,11 @@ apply_remote_ordering(action_t *action, pe_working_set_t *data_set)
356a11
 static void
356a11
 apply_remote_node_ordering(pe_working_set_t *data_set)
356a11
 {
356a11
-    GListPtr gIter = data_set->actions;
356a11
-
356a11
     if (is_set(data_set->flags, pe_flag_have_remote_nodes) == FALSE) {
356a11
         return;
356a11
     }
356a11
 
356a11
-    for (; gIter != NULL; gIter = gIter->next) {
356a11
+    for (GListPtr gIter = data_set->actions; gIter != NULL; gIter = gIter->next) {
356a11
         action_t *action = (action_t *) gIter->data;
356a11
 
356a11
         if (action->rsc == NULL) {
356a11
@@ -2092,12 +2114,14 @@ apply_remote_node_ordering(pe_working_set_t *data_set)
356a11
             is_remote_node(action->node) == FALSE ||
356a11
             action->node->details->remote_rsc == NULL ||
356a11
             is_set(action->flags, pe_action_pseudo)) {
356a11
-            crm_trace("Nothing required for %s", action->uuid);
356a11
+            crm_trace("Nothing required for %s on %s", action->uuid, action->node?action->node->details->uname:"NA");
356a11
 
356a11
         } else if(action->node->details->remote_rsc->container) {
356a11
+            crm_trace("Container ordering for %s", action->uuid);
356a11
             apply_container_ordering(action, data_set);
356a11
 
356a11
         } else {
356a11
+            crm_trace("Remote ordering for %s", action->uuid);
356a11
             apply_remote_ordering(action, data_set);
356a11
         }
356a11
     }
356a11
-- 
356a11
1.8.3.1
356a11