From 083c3a49ad41bd17387c8ae661c23b44d4b845c6 Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Tue, 30 May 2017 14:43:25 -0500
Subject: [PATCH] Log: pengine,libpe_status: revisit fencing messages
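
Make the fencing-related log messages more consistent and informative:

* In pe_fence_node(), distinguish guest, remote, and cluster nodes, log
  the outcome and the reason in one consistent message per case (using
  pe_can_fence() to choose between "will be fenced" and "is unclean"
  instead of duplicating whole log calls), and add a doxygen block.
* Pass more specific fencing reasons from process_rsc_state(), such as
  which resource failed there.
* Make the trace messages in apply_container_ordering() and
  apply_remote_ordering() human-readable, collapse the start/promote
  ordering branches into a single custom_action_order() call with
  accumulated flags, and drop redundant assertions and an obsolete
  "Processing reloads" trace.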
---
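Notes (reviewer illustration, ignored by git-am): the reworked warnings
share a "<node type> <name> <outcome>: <reason>" shape, for example
(node names, and all reasons except the middle one, are hypothetical):

    Guest node guest1 will be fenced (by recovering its guest resource vm-guest1): guest is unresponsive
    Remote node remote1 will be fenced: remote connection is unrecoverable
    Cluster node node3 is unclean: node is no longer part of the cluster
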
 lib/pengine/unpack.c | 72 ++++++++++++++++++++++++++++++++--------------------
 pengine/allocate.c   | 65 ++++++++++++++++++++++++++---------------------
 2 files changed, 81 insertions(+), 56 deletions(-)
diff --git a/lib/pengine/unpack.c b/lib/pengine/unpack.c
index 377100c..21eca90 100644
--- a/lib/pengine/unpack.c
+++ b/lib/pengine/unpack.c
@@ -63,6 +63,13 @@ is_dangling_container_remote_node(node_t *node)
 }
 
 
+/*!
+ * \brief Schedule a fence action for a node
+ *
+ * \param[in,out] data_set  Current working set of the cluster
+ * \param[in,out] node      Node to fence
+ * \param[in]     reason    Text description of why fencing is needed
+ */
 void
 pe_fence_node(pe_working_set_t * data_set, node_t * node, const char *reason)
 {
@@ -74,11 +81,13 @@ pe_fence_node(pe_working_set_t * data_set, node_t * node, const char *reason)
 
         if (is_set(rsc->flags, pe_rsc_failed) == FALSE) {
             if (!is_set(rsc->flags, pe_rsc_managed)) {
-                crm_notice("Not fencing node %s due to '%s': container %s is"
-                           " unmanaged"
-                           "%s", node->details->uname, reason, rsc->id);
+                crm_notice("Not fencing guest node %s "
+                           "(otherwise would because %s): "
+                           "its guest resource %s is unmanaged",
+                           node->details->uname, reason, rsc->id);
             } else {
-                crm_warn("Remote node %s will be fenced due to '%s' by recovering %s",
+                crm_warn("Guest node %s will be fenced "
+                         "(by recovering its guest resource %s): %s",
                          node->details->uname, rsc->id, reason);
 
                 /* We don't mark the node as unclean because that would prevent the
@@ -91,8 +100,9 @@ pe_fence_node(pe_working_set_t * data_set, node_t * node, const char *reason)
         }
 
     } else if (is_dangling_container_remote_node(node)) {
-        crm_info("Cleaning up dangling connection resource for guest node %s due to '%s'"
-                 " (fencing is already done, guest resource no longer exists)",
+        crm_info("Cleaning up dangling connection for guest node %s: "
+                 "fencing was already done because %s, "
+                 "and guest resource no longer exists",
                  node->details->uname, reason);
         set_bit(node->details->remote_rsc->flags, pe_rsc_failed);
 
@@ -100,31 +110,29 @@ pe_fence_node(pe_working_set_t * data_set, node_t * node, const char *reason)
         resource_t *rsc = node->details->remote_rsc;
 
         if (rsc && (!is_set(rsc->flags, pe_rsc_managed))) {
-            crm_notice("Not fencing node %s due to '%s': connection is unmanaged",
+            crm_notice("Not fencing remote node %s "
+                       "(otherwise would because %s): connection is unmanaged",
                        node->details->uname, reason);
         } else if(node->details->remote_requires_reset == FALSE) {
             node->details->remote_requires_reset = TRUE;
-            if (pe_can_fence(data_set, node)) {
-                crm_warn("Remote node %s will be fenced due to %s", node->details->uname, reason);
-            } else {
-                crm_warn("Remote node %s is unclean due to %s", node->details->uname, reason);
-            }
+            crm_warn("Remote node %s %s: %s",
+                     node->details->uname,
+                     pe_can_fence(data_set, node)? "will be fenced" : "is unclean",
+                     reason);
         }
         node->details->unclean = TRUE;
 
     } else if (node->details->unclean) {
-        if (pe_can_fence(data_set, node)) {
-            crm_trace("Node %s would also be fenced due to '%s'", node->details->uname, reason);
-        } else {
-            crm_trace("Node %s is also unclean due to '%s'", node->details->uname, reason);
-        }
-
-    } else if (pe_can_fence(data_set, node)) {
-        crm_warn("Node %s will be fenced due to %s", node->details->uname, reason);
-        node->details->unclean = TRUE;
+        crm_trace("Cluster node %s %s because %s",
+                  node->details->uname,
+                  pe_can_fence(data_set, node)? "would also be fenced" : "is also unclean",
+                  reason);
 
     } else {
-        crm_warn("Node %s is unclean due to %s", node->details->uname, reason);
+        crm_warn("Cluster node %s %s: %s",
+                 node->details->uname,
+                 pe_can_fence(data_set, node)? "will be fenced" : "is unclean",
+                 reason);
         node->details->unclean = TRUE;
     }
 }
@@ -1878,6 +1886,8 @@ process_rsc_state(resource_t * rsc, node_t * node,
                   xmlNode * migrate_op, pe_working_set_t * data_set)
 {
     node_t *tmpnode = NULL;
+    char *reason = NULL;
+
     CRM_ASSERT(rsc);
     pe_rsc_trace(rsc, "Resource %s is %s on %s: on_fail=%s",
                  rsc->id, role2text(rsc->role), node->details->uname, fail2text(on_fail));
@@ -1907,7 +1917,6 @@ process_rsc_state(resource_t * rsc, node_t * node,
         && node->details->maintenance == FALSE
         && is_set(rsc->flags, pe_rsc_managed)) {
 
-        char *reason = NULL;
         gboolean should_fence = FALSE;
 
         /* If this is a guest node, fence it (regardless of whether fencing is
@@ -1922,14 +1931,19 @@ process_rsc_state(resource_t * rsc, node_t * node,
             should_fence = TRUE;
 
         } else if (is_set(data_set->flags, pe_flag_stonith_enabled)) {
-            if (is_baremetal_remote_node(node) && node->details->remote_rsc && is_not_set(node->details->remote_rsc->flags, pe_rsc_failed)) {
+            if (is_baremetal_remote_node(node) && node->details->remote_rsc
+                && is_not_set(node->details->remote_rsc->flags, pe_rsc_failed)) {
+
                 /* setting unseen = true means that fencing of the remote node will
                  * only occur if the connection resource is not going to start somewhere.
                  * This allows connection resources on a failed cluster-node to move to
                  * another node without requiring the baremetal remote nodes to be fenced
                  * as well. */
                 node->details->unseen = TRUE;
-                reason = crm_strdup_printf("%s is active there. Fencing will be revoked if remote-node connection can be re-established on another cluster-node.", rsc->id);
+                reason = crm_strdup_printf("%s is active there (fencing will be"
+                                           " revoked if remote connection can "
+                                           "be re-established elsewhere)",
+                                           rsc->id);
             }
             should_fence = TRUE;
         }
@@ -1959,7 +1973,9 @@ process_rsc_state(resource_t * rsc, node_t * node,
             /* treat it as if it is still running
              * but also mark the node as unclean
              */
-            pe_fence_node(data_set, node, "resource failure(s)");
+            reason = crm_strdup_printf("%s failed there", rsc->id);
+            pe_fence_node(data_set, node, reason);
+            free(reason);
             break;
 
         case action_fail_standby:
@@ -2002,6 +2018,7 @@ process_rsc_state(resource_t * rsc, node_t * node,
                 stop_action(rsc, node, FALSE);
             }
             break;
+
         case action_fail_reset_remote:
             set_bit(rsc->flags, pe_rsc_failed);
             if (is_set(data_set->flags, pe_flag_stonith_enabled)) {
@@ -2015,7 +2032,8 @@ process_rsc_state(resource_t * rsc, node_t * node,
 
                     /* connection resource to baremetal resource failed in a way that
                      * should result in fencing the remote-node. */
-                    pe_fence_node(data_set, tmpnode, "of connection failure(s)");
+                    pe_fence_node(data_set, tmpnode,
+                                  "remote connection is unrecoverable");
                 }
             }
 
diff --git a/pengine/allocate.c b/pengine/allocate.c
index 0020af6..f2987cc 100644
--- a/pengine/allocate.c
+++ b/pengine/allocate.c
@@ -467,7 +467,7 @@ check_actions_for(xmlNode * rsc_entry, resource_t * rsc, node_t * node, pe_worki
             set_bit(action_clear->flags, pe_action_runnable);
 
             crm_notice("Clearing failure of %s on %s "
-                       "action definition changed " CRM_XS " %s",
+                       "because action definition changed " CRM_XS " %s",
                        rsc->id, node->details->uname, action_clear->uuid);
         }
     }
@@ -1789,7 +1789,6 @@ apply_container_ordering(action_t *action, pe_working_set_t *data_set)
 
     CRM_ASSERT(action->node);
     CRM_ASSERT(is_remote_node(action->node));
-    CRM_ASSERT(action->node->details->remote_rsc);
 
     remote_rsc = action->node->details->remote_rsc;
     CRM_ASSERT(remote_rsc);
@@ -1801,7 +1800,13 @@ apply_container_ordering(action_t *action, pe_working_set_t *data_set)
         pe_fence_node(data_set, action->node, "container failed");
     }
 
-    crm_trace("%s %s %s %s %d", action->uuid, action->task, remote_rsc->id, container->id, is_set(container->flags, pe_rsc_failed));
+    crm_trace("Order %s action %s relative to %s%s for %s%s",
+              action->task, action->uuid,
+              is_set(remote_rsc->flags, pe_rsc_failed)? "failed " : "",
+              remote_rsc->id,
+              is_set(container->flags, pe_rsc_failed)? "failed " : "",
+              container->id);
+
     switch (task) {
         case start_rsc:
         case action_promote:
@@ -1874,6 +1879,7 @@ apply_remote_ordering(action_t *action, pe_working_set_t *data_set)
     node_t *cluster_node = NULL;
     enum action_tasks task = text2task(action->task);
     enum remote_connection_state state = remote_state_unknown;
+    enum pe_ordering order_opts = pe_order_none;
 
     if (action->rsc == NULL) {
         return;
@@ -1881,7 +1887,6 @@ apply_remote_ordering(action_t *action, pe_working_set_t *data_set)
 
     CRM_ASSERT(action->node);
     CRM_ASSERT(is_remote_node(action->node));
-    CRM_ASSERT(action->node->details->remote_rsc);
 
     remote_rsc = action->node->details->remote_rsc;
     CRM_ASSERT(remote_rsc);
@@ -1895,7 +1900,7 @@ apply_remote_ordering(action_t *action, pe_working_set_t *data_set)
      * on that remote node until after it starts elsewhere.
      */
     if(remote_rsc->next_role == RSC_ROLE_STOPPED || remote_rsc->allocated_to == NULL) {
-        /* There is no-where left to run the connection resource
+        /* There is nowhere left to run the connection resource,
          * and the resource is in a failed state (either directly
          * or because it is located on a failed node).
          *
@@ -1903,8 +1908,7 @@ apply_remote_ordering(action_t *action, pe_working_set_t *data_set)
          * or if there are resources in an unknown state (probe), we
          * must assume the worst and fence it.
          */
-
-        if(is_set(action->node->details->remote_rsc->flags, pe_rsc_failed)) {
+        if (is_set(remote_rsc->flags, pe_rsc_failed)) {
             state = remote_state_failed;
         } else if(cluster_node && cluster_node->details->unclean) {
             state = remote_state_failed;
@@ -1934,22 +1938,31 @@ apply_remote_ordering(action_t *action, pe_working_set_t *data_set)
         state = remote_state_alive;
     }
 
-    crm_trace("%s %s %s %d %d", action->uuid, action->task, action->node->details->uname, state, is_set(remote_rsc->flags, pe_rsc_failed));
+    crm_trace("Order %s action %s relative to %s%s (state %d)",
+              action->task, action->uuid,
+              is_set(remote_rsc->flags, pe_rsc_failed)? "failed " : "",
+              remote_rsc->id, state);
     switch (task) {
         case start_rsc:
         case action_promote:
-            if(state == remote_state_failed) {
-                /* Wait for the connection resource to be up and force recovery */
-                custom_action_order(remote_rsc, generate_op_key(remote_rsc->id, RSC_START, 0), NULL,
-                                    action->rsc, NULL, action,
-                                    pe_order_preserve | pe_order_implies_then | pe_order_runnable_left, data_set);
-            } else {
-                /* Ensure the connection resource is up and assume everything is as we left it */
-                custom_action_order(remote_rsc, generate_op_key(remote_rsc->id, RSC_START, 0), NULL,
-                                    action->rsc, NULL, action,
-                                    pe_order_preserve | pe_order_runnable_left, data_set);
+             /* This is an internally generated constraint exempt from
+             * user constraint prohibitions, and this action isn't runnable
+             * if the connection start isn't runnable.
+             */
+            order_opts = pe_order_preserve | pe_order_runnable_left;
+
+            if (state == remote_state_failed) {
+                /* Force recovery, by making this action required */
+                order_opts |= pe_order_implies_then;
             }
+
+            /* Ensure connection is up before running this action */
+            custom_action_order(remote_rsc,
+                                generate_op_key(remote_rsc->id, RSC_START, 0),
+                                NULL, action->rsc, NULL, action, order_opts,
+                                data_set);
             break;
+
         case stop_rsc:
             /* Handle special case with remote node where stop actions need to be
              * ordered after the connection resource starts somewhere else.
@@ -1975,22 +1988,19 @@ apply_remote_ordering(action_t *action, pe_working_set_t *data_set)
                                     pe_order_preserve | pe_order_implies_first, data_set);
             }
             break;
-        case action_demote:
 
-            /* If the connection is being torn down, we don't want
-             * to build a constraint between a resource's demotion and
-             * the connection resource starting... because the connection
-             * resource can not start. The connection might already be up,
-             * but the "start" action would not be allowed, which in turn would
-             * block the demotion of any resources living in the node.
+        case action_demote:
+            /* Only order this demote relative to the connection start if the
+             * connection isn't being torn down. Otherwise, the demote would be
+             * blocked because the connection start would not be allowed.
              */
-
             if(state == remote_state_resting || state == remote_state_unknown) {
                 custom_action_order(remote_rsc, generate_op_key(remote_rsc->id, RSC_START, 0), NULL,
                                     action->rsc, NULL, action,
                                     pe_order_preserve, data_set);
             } /* Otherwise we can rely on the stop ordering */
             break;
+
         default:
             /* Wait for the connection resource to be up */
             if (is_recurring_action(action)) {
@@ -2261,15 +2271,12 @@ stage7(pe_working_set_t * data_set)
     order_probes(data_set);
 
     crm_trace("Updating %d actions", g_list_length(data_set->actions));
-
     for (gIter = data_set->actions; gIter != NULL; gIter = gIter->next) {
         action_t *action = (action_t *) gIter->data;
 
         update_action(action);
     }
 
-    crm_trace("Processing reloads");
-
     LogNodeActions(data_set, FALSE);
     for (gIter = data_set->resources; gIter != NULL; gIter = gIter->next) {
         resource_t *rsc = (resource_t *) gIter->data;
-- 
1.8.3.1
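
A minimal standalone C sketch of the two refactoring patterns the patch
applies, in case their shape is easier to see outside diff context:
choosing a log message's verdict inline with a ternary instead of
duplicating the whole call in an if/else, and accumulating ordering
flags so one ordering call replaces two near-identical branches. This is
illustration only, not Pacemaker code; enum order_opts and
order_actions() are simplified stand-ins for pe_ordering and
custom_action_order().

#include <stdio.h>
#include <stdbool.h>

/* Simplified stand-in for the pe_order_* flag bits */
enum order_opts {
    opt_none          = 0,
    opt_preserve      = (1 << 0),
    opt_runnable_left = (1 << 1),
    opt_implies_then  = (1 << 2),
};

/* Simplified stand-in for custom_action_order() */
static void
order_actions(const char *first, const char *then, unsigned int opts)
{
    printf("order %s -> %s (opts 0x%x)\n", first, then, opts);
}

int
main(void)
{
    bool can_fence = false;  /* stand-in for a pe_can_fence() result */
    bool failed = true;      /* stand-in for the pe_rsc_failed flag */

    /* Pattern 1: pick the verdict inline, so a single log call replaces
     * an if/else that duplicated everything but a few words. */
    printf("Cluster node %s %s: %s\n", "node1",
           can_fence ? "will be fenced" : "is unclean",
           "example reason");

    /* Pattern 2: start from the flags common to both branches, OR in
     * the branch-specific flag, then make the ordering call once. */
    unsigned int opts = opt_preserve | opt_runnable_left;

    if (failed) {
        opts |= opt_implies_then;  /* force recovery */
    }
    order_actions("remote1_start_0", "rsc1_monitor_10000", opts);
    return 0;
}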