542ee2
From 11685256d35035ae69985d1f4536d0ed68951efe Mon Sep 17 00:00:00 2001
542ee2
From: Ken Gaillot <kgaillot@redhat.com>
542ee2
Date: Thu, 24 Oct 2019 17:35:48 -0500
542ee2
Subject: [PATCH 4/5] Fix: scheduler: properly detect whether guest node is
542ee2
 fenceable
542ee2
542ee2
Guest nodes are "fenced" by stopping their container resource. Previously, we
542ee2
assumed that this was always possible. However, it may not be if the
542ee2
container's host is failed and not fenceable (e.g. due to lack of quorum).
542ee2
542ee2
Now, we check guest nodes for fenceability as we do for other nodes,
542ee2
with the criteria being that the guest's host must be either online or
542ee2
fenceable. Additionally, when creating a new action that normally does not
542ee2
require fencing, we make the action unrunnable if it is on a non-fenceable
542ee2
guest node, because the action cannot be attempted in that case.
542ee2
---
542ee2
 lib/pengine/utils.c | 55 ++++++++++++++++++++++++++++++++++++++---------------
542ee2
 pengine/allocate.c  |  3 ++-
542ee2
 2 files changed, 42 insertions(+), 16 deletions(-)
542ee2
542ee2
diff --git a/lib/pengine/utils.c b/lib/pengine/utils.c
542ee2
index 97241af..671ef76 100644
542ee2
--- a/lib/pengine/utils.c
542ee2
+++ b/lib/pengine/utils.c
542ee2
@@ -92,36 +92,49 @@ pe_free_rsc_action_details(pe_action_t *action)
542ee2
  * \param[in] data_set  Working set for cluster
542ee2
  * \param[in] node      Name of node to check
542ee2
  *
542ee2
- * \return TRUE if node can be fenced, FALSE otherwise
542ee2
- *
542ee2
- * \note This function should only be called for cluster nodes and baremetal
542ee2
- *       remote nodes; guest nodes are fenced by stopping their container
542ee2
- *       resource, so fence execution requirements do not apply to them.
542ee2
+ * \return true if node can be fenced, false otherwise
542ee2
  */
542ee2
-bool pe_can_fence(pe_working_set_t * data_set, node_t *node)
542ee2
+bool
542ee2
+pe_can_fence(pe_working_set_t *data_set, pe_node_t *node)
542ee2
 {
542ee2
-    if(is_not_set(data_set->flags, pe_flag_stonith_enabled)) {
542ee2
-        return FALSE; /* Turned off */
542ee2
+    if (is_container_remote_node(node)) {
542ee2
+        /* Guest nodes are fenced by stopping their container resource. We can
542ee2
+         * do that if the container's host is either online or fenceable.
542ee2
+         */
542ee2
+        pe_resource_t *rsc = node->details->remote_rsc->container;
542ee2
+
542ee2
+        for (GList *n = rsc->running_on; n != NULL; n = n->next) {
542ee2
+            pe_node_t *container_node = n->data;
542ee2
+
542ee2
+            if (!container_node->details->online
542ee2
+                && !pe_can_fence(data_set, container_node)) {
542ee2
+                return false;
542ee2
+            }
542ee2
+        }
542ee2
+        return true;
542ee2
+
542ee2
+    } else if(is_not_set(data_set->flags, pe_flag_stonith_enabled)) {
542ee2
+        return false; /* Turned off */
542ee2
 
542ee2
     } else if (is_not_set(data_set->flags, pe_flag_have_stonith_resource)) {
542ee2
-        return FALSE; /* No devices */
542ee2
+        return false; /* No devices */
542ee2
 
542ee2
     } else if (is_set(data_set->flags, pe_flag_have_quorum)) {
542ee2
-        return TRUE;
542ee2
+        return true;
542ee2
 
542ee2
     } else if (data_set->no_quorum_policy == no_quorum_ignore) {
542ee2
-        return TRUE;
542ee2
+        return true;
542ee2
 
542ee2
     } else if(node == NULL) {
542ee2
-        return FALSE;
542ee2
+        return false;
542ee2
 
542ee2
     } else if(node->details->online) {
542ee2
         crm_notice("We can fence %s without quorum because they're in our membership", node->details->uname);
542ee2
-        return TRUE;
542ee2
+        return true;
542ee2
     }
542ee2
 
542ee2
     crm_trace("Cannot fence %s", node->details->uname);
542ee2
-    return FALSE;
542ee2
+    return false;
542ee2
 }
542ee2
 
542ee2
 node_t *
542ee2
@@ -576,7 +589,19 @@ custom_action(resource_t * rsc, char *key, const char *task,
542ee2
         } else if (action->needs == rsc_req_nothing) {
542ee2
             pe_rsc_trace(rsc, "Action %s does not require anything", action->uuid);
542ee2
             pe_action_set_reason(action, NULL, TRUE);
542ee2
-            pe_set_action_bit(action, pe_action_runnable);
542ee2
+            if (is_container_remote_node(action->node)
542ee2
+                && !pe_can_fence(data_set, action->node)) {
542ee2
+                /* An action that requires nothing usually does not require any
542ee2
+                 * fencing in order to be runnable. However, there is an
542ee2
+                 * exception: an action cannot be completed if it is on a guest
542ee2
+                 * node whose host is unclean and cannot be fenced.
542ee2
+                 */
542ee2
+                pe_clear_action_bit(action, pe_action_runnable);
542ee2
+                crm_debug("%s\t%s (cancelled : host cannot be fenced)",
542ee2
+                          action->node->details->uname, action->uuid);
542ee2
+            } else {
542ee2
+                pe_set_action_bit(action, pe_action_runnable);
542ee2
+            }
542ee2
 #if 0
542ee2
             /*
542ee2
              * No point checking this
542ee2
diff --git a/pengine/allocate.c b/pengine/allocate.c
542ee2
index e30cb1c..b819af3 100644
542ee2
--- a/pengine/allocate.c
542ee2
+++ b/pengine/allocate.c
542ee2
@@ -1584,7 +1584,8 @@ stage6(pe_working_set_t * data_set)
542ee2
          * so handle them separately.
542ee2
          */
542ee2
         if (is_container_remote_node(node)) {
542ee2
-            if (node->details->remote_requires_reset && need_stonith) {
542ee2
+            if (node->details->remote_requires_reset && need_stonith
542ee2
+                && pe_can_fence(data_set, node)) {
542ee2
                 fence_guest(node, data_set);
542ee2
             }
542ee2
             continue;
542ee2
-- 
542ee2
1.8.3.1
542ee2