From 083c3a49ad41bd17387c8ae661c23b44d4b845c6 Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Tue, 30 May 2017 14:43:25 -0500
Subject: [PATCH] Log: pengine,libpe_status: revisit fencing messages

---
 lib/pengine/unpack.c | 72 ++++++++++++++++++++++++++++++++--------------------
 pengine/allocate.c   | 65 ++++++++++++++++++++++++++---------------------
 2 files changed, 81 insertions(+), 56 deletions(-)

diff --git a/lib/pengine/unpack.c b/lib/pengine/unpack.c
index 377100c..21eca90 100644
--- a/lib/pengine/unpack.c
+++ b/lib/pengine/unpack.c
@@ -63,6 +63,13 @@ is_dangling_container_remote_node(node_t *node)
 }
 
 
+/*!
+ * \brief Schedule a fence action for a node
+ *
+ * \param[in,out] data_set  Current working set of cluster
+ * \param[in,out] node      Node to fence
+ * \param[in]     reason    Text description of why fencing is needed
+ */
 void
 pe_fence_node(pe_working_set_t * data_set, node_t * node, const char *reason)
 {
@@ -74,11 +81,13 @@ pe_fence_node(pe_working_set_t * data_set, node_t * node, const char *reason)
 
         if (is_set(rsc->flags, pe_rsc_failed) == FALSE) {
             if (!is_set(rsc->flags, pe_rsc_managed)) {
-                crm_notice("Not fencing node %s due to '%s': container %s is"
-                           " unmanaged"
-                           "%s", node->details->uname, reason, rsc->id);
+                crm_notice("Not fencing guest node %s "
+                           "(otherwise would because %s): "
+                           "its guest resource %s is unmanaged",
+                           node->details->uname, reason, rsc->id);
             } else {
-                crm_warn("Remote node %s will be fenced due to '%s' by recovering %s",
+                crm_warn("Guest node %s will be fenced "
+                         "(by recovering its guest resource %s): %s",
                          node->details->uname, rsc->id, reason);
 
                 /* We don't mark the node as unclean because that would prevent the
@@ -91,8 +100,9 @@ pe_fence_node(pe_working_set_t * data_set, node_t * node, const char *reason)
         }
 
     } else if (is_dangling_container_remote_node(node)) {
-        crm_info("Cleaning up dangling connection resource for guest node %s due to '%s'"
-                 " (fencing is already done, guest resource no longer exists)",
+        crm_info("Cleaning up dangling connection for guest node %s: "
+                 "fencing was already done because %s, "
+                 "and guest resource no longer exists",
                  node->details->uname, reason);
         set_bit(node->details->remote_rsc->flags, pe_rsc_failed);
 
@@ -100,31 +110,29 @@ pe_fence_node(pe_working_set_t * data_set, node_t * node, const char *reason)
         resource_t *rsc = node->details->remote_rsc;
 
         if (rsc && (!is_set(rsc->flags, pe_rsc_managed))) {
-            crm_notice("Not fencing node %s due to '%s': connection is unmanaged",
+            crm_notice("Not fencing remote node %s "
+                       "(otherwise would because %s): connection is unmanaged",
                        node->details->uname, reason);
         } else if(node->details->remote_requires_reset == FALSE) {
             node->details->remote_requires_reset = TRUE;
-            if (pe_can_fence(data_set, node)) {
-                crm_warn("Remote node %s will be fenced due to %s", node->details->uname, reason);
-            } else {
-                crm_warn("Remote node %s is unclean due to %s", node->details->uname, reason);
-            }
+            crm_warn("Remote node %s %s: %s",
+                     node->details->uname,
+                     pe_can_fence(data_set, node)? "will be fenced" : "is unclean",
+                     reason);
         }
         node->details->unclean = TRUE;
 
     } else if (node->details->unclean) {
-        if (pe_can_fence(data_set, node)) {
-            crm_trace("Node %s would also be fenced due to '%s'", node->details->uname, reason);
-        } else {
-            crm_trace("Node %s is also unclean due to '%s'", node->details->uname, reason);
-        }
-
-    } else if (pe_can_fence(data_set, node)) {
-        crm_warn("Node %s will be fenced due to %s", node->details->uname, reason);
-        node->details->unclean = TRUE;
+        crm_trace("Cluster node %s %s because %s",
+                  node->details->uname,
+                  pe_can_fence(data_set, node)? "would also be fenced" : "also is unclean",
+                  reason);
 
     } else {
-        crm_warn("Node %s is unclean due to %s", node->details->uname, reason);
+        crm_warn("Cluster node %s %s: %s",
+                 node->details->uname,
+                 pe_can_fence(data_set, node)? "will be fenced" : "is unclean",
+                 reason);
         node->details->unclean = TRUE;
     }
 }
@@ -1878,6 +1886,8 @@ process_rsc_state(resource_t * rsc, node_t * node,
                   xmlNode * migrate_op, pe_working_set_t * data_set)
 {
     node_t *tmpnode = NULL;
+    char *reason = NULL;
+
     CRM_ASSERT(rsc);
     pe_rsc_trace(rsc, "Resource %s is %s on %s: on_fail=%s",
                  rsc->id, role2text(rsc->role), node->details->uname, fail2text(on_fail));
@@ -1907,7 +1917,6 @@ process_rsc_state(resource_t * rsc, node_t * node,
         && node->details->maintenance == FALSE
         && is_set(rsc->flags, pe_rsc_managed)) {
 
-        char *reason = NULL;
         gboolean should_fence = FALSE;
 
         /* If this is a guest node, fence it (regardless of whether fencing is
@@ -1922,14 +1931,19 @@ process_rsc_state(resource_t * rsc, node_t * node,
             should_fence = TRUE;
 
         } else if (is_set(data_set->flags, pe_flag_stonith_enabled)) {
-            if (is_baremetal_remote_node(node) && node->details->remote_rsc && is_not_set(node->details->remote_rsc->flags, pe_rsc_failed)) {
+            if (is_baremetal_remote_node(node) && node->details->remote_rsc
+                && is_not_set(node->details->remote_rsc->flags, pe_rsc_failed)) {
+
                 /* setting unseen = true means that fencing of the remote node will
                  * only occur if the connection resource is not going to start somewhere.
                  * This allows connection resources on a failed cluster-node to move to
                  * another node without requiring the baremetal remote nodes to be fenced
                  * as well. */
                 node->details->unseen = TRUE;
-                reason = crm_strdup_printf("%s is active there. Fencing will be revoked if remote-node connection can be re-established on another cluster-node.", rsc->id);
+                reason = crm_strdup_printf("%s is active there (fencing will be"
+                                           " revoked if remote connection can "
+                                           "be re-established elsewhere)",
+                                           rsc->id);
             }
             should_fence = TRUE;
         }
@@ -1959,7 +1973,9 @@ process_rsc_state(resource_t * rsc, node_t * node,
             /* treat it as if it is still running
              * but also mark the node as unclean
              */
-            pe_fence_node(data_set, node, "resource failure(s)");
+            reason = crm_strdup_printf("%s failed there", rsc->id);
+            pe_fence_node(data_set, node, reason);
+            free(reason);
             break;
 
         case action_fail_standby:
@@ -2002,6 +2018,7 @@ process_rsc_state(resource_t * rsc, node_t * node,
                 stop_action(rsc, node, FALSE);
             }
             break;
+
         case action_fail_reset_remote:
             set_bit(rsc->flags, pe_rsc_failed);
             if (is_set(data_set->flags, pe_flag_stonith_enabled)) {
@@ -2015,7 +2032,8 @@ process_rsc_state(resource_t * rsc, node_t * node,
 
                     /* connection resource to baremetal resource failed in a way that
                      * should result in fencing the remote-node. */
-                    pe_fence_node(data_set, tmpnode, "of connection failure(s)");
+                    pe_fence_node(data_set, tmpnode,
+                                  "remote connection is unrecoverable");
                 }
             }
 
diff --git a/pengine/allocate.c b/pengine/allocate.c
index 0020af6..f2987cc 100644
--- a/pengine/allocate.c
+++ b/pengine/allocate.c
@@ -467,7 +467,7 @@ check_actions_for(xmlNode * rsc_entry, resource_t * rsc, node_t * node, pe_worki
             set_bit(action_clear->flags, pe_action_runnable);
 
             crm_notice("Clearing failure of %s on %s "
-                       "action definition changed " CRM_XS " %s",
+                       "because action definition changed " CRM_XS " %s",
                        rsc->id, node->details->uname, action_clear->uuid);
         }
     }
@@ -1789,7 +1789,6 @@ apply_container_ordering(action_t *action, pe_working_set_t *data_set)
 
     CRM_ASSERT(action->node);
     CRM_ASSERT(is_remote_node(action->node));
-    CRM_ASSERT(action->node->details->remote_rsc);
 
     remote_rsc = action->node->details->remote_rsc;
     CRM_ASSERT(remote_rsc);
@@ -1801,7 +1800,13 @@ apply_container_ordering(action_t *action, pe_working_set_t *data_set)
         pe_fence_node(data_set, action->node, "container failed");
     }
 
-    crm_trace("%s %s %s %s %d", action->uuid, action->task, remote_rsc->id, container->id, is_set(container->flags, pe_rsc_failed));
+    crm_trace("Order %s action %s relative to %s%s for %s%s",
+              action->task, action->uuid,
+              is_set(remote_rsc->flags, pe_rsc_failed)? "failed " : "",
+              remote_rsc->id,
+              is_set(container->flags, pe_rsc_failed)? "failed " : "",
+              container->id);
+
     switch (task) {
         case start_rsc:
         case action_promote:
@@ -1874,6 +1879,7 @@ apply_remote_ordering(action_t *action, pe_working_set_t *data_set)
     node_t *cluster_node = NULL;
     enum action_tasks task = text2task(action->task);
     enum remote_connection_state state = remote_state_unknown;
+    enum pe_ordering order_opts = pe_order_none;
 
     if (action->rsc == NULL) {
         return;
@@ -1881,7 +1887,6 @@ apply_remote_ordering(action_t *action, pe_working_set_t *data_set)
 
     CRM_ASSERT(action->node);
     CRM_ASSERT(is_remote_node(action->node));
-    CRM_ASSERT(action->node->details->remote_rsc);
 
     remote_rsc = action->node->details->remote_rsc;
     CRM_ASSERT(remote_rsc);
@@ -1895,7 +1900,7 @@ apply_remote_ordering(action_t *action, pe_working_set_t *data_set)
      * on that remote node until after it starts elsewhere.
      */
     if(remote_rsc->next_role == RSC_ROLE_STOPPED || remote_rsc->allocated_to == NULL) {
-        /* There is no-where left to run the connection resource
+        /* There is nowhere left to run the connection resource,
          * and the resource is in a failed state (either directly
          * or because it is located on a failed node).
          *
@@ -1903,8 +1908,7 @@ apply_remote_ordering(action_t *action, pe_working_set_t *data_set)
          * or if there are resources in an unknown state (probe), we
          * must assume the worst and fence it.
          */
-
-        if(is_set(action->node->details->remote_rsc->flags, pe_rsc_failed)) {
+        if (is_set(remote_rsc->flags, pe_rsc_failed)) {
             state = remote_state_failed;
         } else if(cluster_node && cluster_node->details->unclean) {
             state = remote_state_failed;
@@ -1934,22 +1938,31 @@ apply_remote_ordering(action_t *action, pe_working_set_t *data_set)
         state = remote_state_alive;
     }
 
-    crm_trace("%s %s %s %d %d", action->uuid, action->task, action->node->details->uname, state, is_set(remote_rsc->flags, pe_rsc_failed));
+    crm_trace("Order %s action %s relative to %s%s (state %d)",
+              action->task, action->uuid,
+              is_set(remote_rsc->flags, pe_rsc_failed)? "failed " : "",
+              remote_rsc->id, state);
     switch (task) {
         case start_rsc:
         case action_promote:
-            if(state == remote_state_failed) {
-                /* Wait for the connection resource to be up and force recovery */
-                custom_action_order(remote_rsc, generate_op_key(remote_rsc->id, RSC_START, 0), NULL,
-                                    action->rsc, NULL, action,
-                                    pe_order_preserve | pe_order_implies_then | pe_order_runnable_left, data_set);
-            } else {
-                /* Ensure the connection resource is up and assume everything is as we left it */
-                custom_action_order(remote_rsc, generate_op_key(remote_rsc->id, RSC_START, 0), NULL,
-                                    action->rsc, NULL, action,
-                                    pe_order_preserve | pe_order_runnable_left, data_set);
+            /* This is an internally generated constraint exempt from
+             * user constraint prohibitions, and this action isn't runnable
+             * if the connection start isn't runnable.
+             */
+            order_opts = pe_order_preserve | pe_order_runnable_left;
+
+            if (state == remote_state_failed) {
+                /* Force recovery, by making this action required */
+                order_opts |= pe_order_implies_then;
             }
+
+            /* Ensure connection is up before running this action */
+            custom_action_order(remote_rsc,
+                                generate_op_key(remote_rsc->id, RSC_START, 0),
+                                NULL, action->rsc, NULL, action, order_opts,
+                                data_set);
             break;
+
         case stop_rsc:
             /* Handle special case with remote node where stop actions need to be
              * ordered after the connection resource starts somewhere else.
@@ -1975,22 +1988,19 @@ apply_remote_ordering(action_t *action, pe_working_set_t *data_set)
                                     pe_order_preserve | pe_order_implies_first, data_set);
             }
             break;
-        case action_demote:
 
-            /* If the connection is being torn down, we don't want
-             * to build a constraint between a resource's demotion and
-             * the connection resource starting... because the connection
-             * resource can not start. The connection might already be up,
-             * but the "start" action would not be allowed, which in turn would
-             * block the demotion of any resources living in the node.
+        case action_demote:
+            /* Only order this demote relative to the connection start if the
+             * connection isn't being torn down. Otherwise, the demote would be
+             * blocked because the connection start would not be allowed.
              */
-
             if(state == remote_state_resting || state == remote_state_unknown) {
                 custom_action_order(remote_rsc, generate_op_key(remote_rsc->id, RSC_START, 0), NULL,
                                     action->rsc, NULL, action,
                                     pe_order_preserve, data_set);
             } /* Otherwise we can rely on the stop ordering */
             break;
+
         default:
             /* Wait for the connection resource to be up */
             if (is_recurring_action(action)) {
@@ -2261,15 +2271,12 @@ stage7(pe_working_set_t * data_set)
     order_probes(data_set);
 
     crm_trace("Updating %d actions", g_list_length(data_set->actions));
-
     for (gIter = data_set->actions; gIter != NULL; gIter = gIter->next) {
         action_t *action = (action_t *) gIter->data;
 
         update_action(action);
     }
 
-    crm_trace("Processing reloads");
-
     LogNodeActions(data_set, FALSE);
     for (gIter = data_set->resources; gIter != NULL; gIter = gIter->next) {
         resource_t *rsc = (resource_t *) gIter->data;
-- 
1.8.3.1