Blame SOURCES/011-fencing-reasons.patch

533c21
From 6db8e3adef0441953ec18dd0339c0a67c5c26bdf Mon Sep 17 00:00:00 2001
533c21
From: Ken Gaillot <kgaillot@redhat.com>
533c21
Date: Tue, 14 Dec 2021 16:25:21 -0600
533c21
Subject: [PATCH 01/17] Doc: Pacemaker Development: update for recent function
533c21
 renames
533c21
533c21
---
533c21
 doc/sphinx/Pacemaker_Development/components.rst | 16 ++++++++--------
533c21
 1 file changed, 8 insertions(+), 8 deletions(-)
533c21
533c21
diff --git a/doc/sphinx/Pacemaker_Development/components.rst b/doc/sphinx/Pacemaker_Development/components.rst
533c21
index a51220cac9..68158484ce 100644
533c21
--- a/doc/sphinx/Pacemaker_Development/components.rst
533c21
+++ b/doc/sphinx/Pacemaker_Development/components.rst
533c21
@@ -106,7 +106,7 @@ or messaging layer callback, which calls:
533c21
       the number of active peers), and if this is the last expected reply,
533c21
       calls
533c21
 
533c21
-      * ``call_remote_stonith()``, which calculates the timeout and sends
533c21
+      * ``request_peer_fencing()``, which calculates the timeout and sends
533c21
         ``STONITH_OP_FENCE`` request(s) to carry out the fencing. If the target
533c21
 	node has a fencing "topology" (which allows specifications such as
533c21
 	"this node can be fenced either with device A, or devices B and C in
533c21
@@ -156,7 +156,7 @@ returns, and calls
533c21
   * done callback (``st_child_done()``), which calls ``schedule_stonith_command()``
533c21
     for a new device if there are further required actions to execute or if the
533c21
     original action failed, then builds and sends an XML reply to the original
533c21
-    fencer (via ``stonith_send_async_reply()``), then checks whether any
533c21
+    fencer (via ``send_async_reply()``), then checks whether any
533c21
     pending actions are the same as the one just executed and merges them if so.
533c21
 
533c21
 Fencing replies
533c21
@@ -169,18 +169,18 @@ messaging layer callback, which calls:
533c21
 
533c21
   * ``handle_reply()``, which calls
533c21
 
533c21
-    * ``process_remote_stonith_exec()``, which calls either
533c21
-      ``call_remote_stonith()`` (to retry a failed operation, or try the next
533c21
-       device in a topology is appropriate, which issues a new
533c21
+    * ``fenced_process_fencing_reply()``, which calls either
533c21
+      ``request_peer_fencing()`` (to retry a failed operation, or try the next
533c21
+      device in a topology is appropriate, which issues a new
533c21
       ``STONITH_OP_FENCE`` request, proceeding as before) or
533c21
-      ``remote_op_done()`` (if the operation is definitively failed or
533c21
+      ``finalize_op()`` (if the operation is definitively failed or
533c21
       successful).
533c21
 
533c21
-      * remote_op_done() broadcasts the result to all peers.
533c21
+      * ``finalize_op()`` broadcasts the result to all peers.
533c21
 
533c21
 Finally, all peers receive the broadcast result and call
533c21
 
533c21
-* ``remote_op_done()``, which sends the result to all local clients.
533c21
+* ``finalize_op()``, which sends the result to all local clients.
533c21
 
533c21
 
533c21
 .. index::
533c21
-- 
533c21
2.27.0
533c21
533c21
533c21
From 47db9e5fb410b1e911710727d646eb7180a70c90 Mon Sep 17 00:00:00 2001
533c21
From: Ken Gaillot <kgaillot@redhat.com>
533c21
Date: Fri, 12 Nov 2021 09:58:16 -0600
533c21
Subject: [PATCH 02/17] Refactor: fencing: add full result to fence action
533c21
 callback data
533c21
533c21
stonith_callback_data_t previously contained only the legacy return code for
533c21
the action. Store the full result in its new opaque member, and add accessors
533c21
for it (available only internally for now).
533c21
---
533c21
 include/crm/fencing/internal.h |  3 ++
533c21
 lib/fencing/st_client.c        | 99 ++++++++++++++++++++++++++--------
533c21
 2 files changed, 81 insertions(+), 21 deletions(-)
533c21
533c21
diff --git a/include/crm/fencing/internal.h b/include/crm/fencing/internal.h
533c21
index f0d294a0b3..eff689e59b 100644
533c21
--- a/include/crm/fencing/internal.h
533c21
+++ b/include/crm/fencing/internal.h
533c21
@@ -187,6 +187,9 @@ bool stonith__event_state_eq(stonith_history_t *history, void *user_data);
533c21
 bool stonith__event_state_neq(stonith_history_t *history, void *user_data);
533c21
 
533c21
 int stonith__legacy2status(int rc);
533c21
+int stonith__exit_status(stonith_callback_data_t *data);
533c21
+int stonith__execution_status(stonith_callback_data_t *data);
533c21
+const char *stonith__exit_reason(stonith_callback_data_t *data);
533c21
 
533c21
 /*!
533c21
  * \internal
533c21
diff --git a/lib/fencing/st_client.c b/lib/fencing/st_client.c
533c21
index 2ca094566b..9d93ffd481 100644
533c21
--- a/lib/fencing/st_client.c
533c21
+++ b/lib/fencing/st_client.c
533c21
@@ -854,20 +854,23 @@ stonith_api_del_callback(stonith_t * stonith, int call_id, bool all_callbacks)
533c21
  * \param[in] st        Fencer API connection
533c21
  * \param[in] call_id   If positive, call ID of completed fence action, otherwise
533c21
  *                      legacy return code for early action failure
533c21
- * \param[in] rc        Legacy return code for action result
533c21
+ * \param[in] result    Full result for action
533c21
  * \param[in] userdata  User data to pass to callback
533c21
  * \param[in] callback  Fence action callback to invoke
533c21
  */
533c21
 static void
533c21
-invoke_fence_action_callback(stonith_t *st, int call_id, int rc, void *userdata,
533c21
+invoke_fence_action_callback(stonith_t *st, int call_id,
533c21
+                             pcmk__action_result_t *result,
533c21
+                             void *userdata,
533c21
                              void (*callback) (stonith_t *st,
533c21
                                                stonith_callback_data_t *data))
533c21
 {
533c21
     stonith_callback_data_t data = { 0, };
533c21
 
533c21
     data.call_id = call_id;
533c21
-    data.rc = rc;
533c21
+    data.rc = pcmk_rc2legacy(stonith__result2rc(result));
533c21
     data.userdata = userdata;
533c21
+    data.opaque = (void *) result;
533c21
 
533c21
     callback(st, &data);
533c21
 }
533c21
@@ -888,7 +891,7 @@ invoke_registered_callbacks(stonith_t *stonith, xmlNode *msg, int call_id)
533c21
 {
533c21
     stonith_private_t *private = NULL;
533c21
     stonith_callback_client_t *cb_info = NULL;
533c21
-    int rc = pcmk_ok;
533c21
+    pcmk__action_result_t result = PCMK__UNKNOWN_RESULT;
533c21
 
533c21
     CRM_CHECK(stonith != NULL, return);
533c21
     CRM_CHECK(stonith->st_private != NULL, return);
533c21
@@ -897,20 +900,17 @@ invoke_registered_callbacks(stonith_t *stonith, xmlNode *msg, int call_id)
533c21
 
533c21
     if (msg == NULL) {
533c21
         // Fencer didn't reply in time
533c21
-        rc = -ETIME;
533c21
+        pcmk__set_result(&result, CRM_EX_ERROR, PCMK_EXEC_TIMEOUT,
533c21
+                         "Timeout waiting for reply from fencer");
533c21
         CRM_LOG_ASSERT(call_id > 0);
533c21
 
533c21
     } else {
533c21
         // We have the fencer reply
533c21
-
533c21
-        if (crm_element_value_int(msg, F_STONITH_RC, &rc) != 0) {
533c21
-            rc = -pcmk_err_generic;
533c21
-        }
533c21
-
533c21
         if ((crm_element_value_int(msg, F_STONITH_CALLID, &call_id) != 0)
533c21
             || (call_id <= 0)) {
533c21
             crm_log_xml_warn(msg, "Bad fencer reply");
533c21
         }
533c21
+        stonith__xe_get_result(msg, &result);
533c21
     }
533c21
 
533c21
     if (call_id > 0) {
533c21
@@ -919,27 +919,29 @@ invoke_registered_callbacks(stonith_t *stonith, xmlNode *msg, int call_id)
533c21
     }
533c21
 
533c21
     if ((cb_info != NULL) && (cb_info->callback != NULL)
533c21
-        && (rc == pcmk_ok || !(cb_info->only_success))) {
533c21
+        && (pcmk__result_ok(&result) || !(cb_info->only_success))) {
533c21
         crm_trace("Invoking callback %s for call %d",
533c21
                   crm_str(cb_info->id), call_id);
533c21
-        invoke_fence_action_callback(stonith, call_id, rc, cb_info->user_data,
533c21
-                                     cb_info->callback);
533c21
+        invoke_fence_action_callback(stonith, call_id, &result,
533c21
+                                     cb_info->user_data, cb_info->callback);
533c21
 
533c21
-    } else if ((private->op_callback == NULL) && (rc != pcmk_ok)) {
533c21
-        crm_warn("Fencing action without registered callback failed: %s",
533c21
-                 pcmk_strerror(rc));
533c21
+    } else if ((private->op_callback == NULL) && !pcmk__result_ok(&result)) {
533c21
+        crm_warn("Fencing action without registered callback failed: %d (%s)",
533c21
+                 result.exit_status,
533c21
+                 pcmk_exec_status_str(result.execution_status));
533c21
         crm_log_xml_debug(msg, "Failed fence update");
533c21
     }
533c21
 
533c21
     if (private->op_callback != NULL) {
533c21
         crm_trace("Invoking global callback for call %d", call_id);
533c21
-        invoke_fence_action_callback(stonith, call_id, rc, NULL,
533c21
+        invoke_fence_action_callback(stonith, call_id, &result, NULL,
533c21
                                      private->op_callback);
533c21
     }
533c21
 
533c21
     if (cb_info != NULL) {
533c21
         stonith_api_del_callback(stonith, call_id, FALSE);
533c21
     }
533c21
+    pcmk__reset_result(&result);
533c21
 }
533c21
 
533c21
 static gboolean
533c21
@@ -1252,14 +1254,18 @@ stonith_api_add_callback(stonith_t * stonith, int call_id, int timeout, int opti
533c21
     CRM_CHECK(stonith->st_private != NULL, return -EINVAL);
533c21
     private = stonith->st_private;
533c21
 
533c21
-    if (call_id == 0) {
533c21
+    if (call_id == 0) { // Add global callback
533c21
         private->op_callback = callback;
533c21
 
533c21
-    } else if (call_id < 0) {
533c21
+    } else if (call_id < 0) { // Call failed immediately, so call callback now
533c21
         if (!(options & st_opt_report_only_success)) {
533c21
+            pcmk__action_result_t result = PCMK__UNKNOWN_RESULT;
533c21
+
533c21
             crm_trace("Call failed, calling %s: %s", callback_name, pcmk_strerror(call_id));
533c21
-            invoke_fence_action_callback(stonith, call_id, call_id, user_data,
533c21
-                                         callback);
533c21
+            pcmk__set_result(&result, CRM_EX_ERROR,
533c21
+                             stonith__legacy2status(call_id), NULL);
533c21
+            invoke_fence_action_callback(stonith, call_id, &result,
533c21
+                                         user_data, callback);
533c21
         } else {
533c21
             crm_warn("Fencer call failed: %s", pcmk_strerror(call_id));
533c21
         }
533c21
@@ -2293,6 +2299,57 @@ stonith__device_parameter_flags(uint32_t *device_flags, const char *device_name,
533c21
     freeXpathObject(xpath);
533c21
 }
533c21
 
533c21
+/*!
533c21
+ * \internal
533c21
+ * \brief Return the exit status from an async action callback
533c21
+ *
533c21
+ * \param[in] data  Callback data
533c21
+ *
533c21
+ * \return Exit status from callback data
533c21
+ */
533c21
+int
533c21
+stonith__exit_status(stonith_callback_data_t *data)
533c21
+{
533c21
+    if ((data == NULL) || (data->opaque == NULL)) {
533c21
+        return CRM_EX_ERROR;
533c21
+    }
533c21
+    return ((pcmk__action_result_t *) data->opaque)->exit_status;
533c21
+}
533c21
+
533c21
+/*!
533c21
+ * \internal
533c21
+ * \brief Return the execution status from an async action callback
533c21
+ *
533c21
+ * \param[in] data  Callback data
533c21
+ *
533c21
+ * \return Execution status from callback data
533c21
+ */
533c21
+int
533c21
+stonith__execution_status(stonith_callback_data_t *data)
533c21
+{
533c21
+    if ((data == NULL) || (data->opaque == NULL)) {
533c21
+        return PCMK_EXEC_UNKNOWN;
533c21
+    }
533c21
+    return ((pcmk__action_result_t *) data->opaque)->execution_status;
533c21
+}
533c21
+
533c21
+/*!
533c21
+ * \internal
533c21
+ * \brief Return the exit reason from an async action callback
533c21
+ *
533c21
+ * \param[in] data  Callback data
533c21
+ *
533c21
+ * \return Exit reason from callback data
533c21
+ */
533c21
+const char *
533c21
+stonith__exit_reason(stonith_callback_data_t *data)
533c21
+{
533c21
+    if ((data == NULL) || (data->opaque == NULL)) {
533c21
+        return NULL;
533c21
+    }
533c21
+    return ((pcmk__action_result_t *) data->opaque)->exit_reason;
533c21
+}
533c21
+
533c21
 // Deprecated functions kept only for backward API compatibility
533c21
 // LCOV_EXCL_START
533c21
 
533c21
-- 
533c21
2.27.0
533c21
533c21
533c21
From 1e076370ef4ac7993b5ff21ed1cdfb3c4a494cf0 Mon Sep 17 00:00:00 2001
533c21
From: Ken Gaillot <kgaillot@redhat.com>
533c21
Date: Tue, 9 Nov 2021 16:16:03 -0600
533c21
Subject: [PATCH 03/17] Log: controller: improve fencing result messages
533c21
533c21
Now that fence callbacks get the full result, we can log a better message.
533c21
Also check error conditions more thoroughly, improve message wording, and
533c21
ensure only a single message is logged per result.
533c21
---
533c21
 daemons/controld/controld_fencing.c | 83 +++++++++++++++++++----------
533c21
 1 file changed, 56 insertions(+), 27 deletions(-)
533c21
533c21
diff --git a/daemons/controld/controld_fencing.c b/daemons/controld/controld_fencing.c
533c21
index f5a252c813..f8d2fc13f4 100644
533c21
--- a/daemons/controld/controld_fencing.c
533c21
+++ b/daemons/controld/controld_fencing.c
533c21
@@ -714,45 +714,64 @@ tengine_stonith_callback(stonith_t *stonith, stonith_callback_data_t *data)
533c21
     int stonith_id = -1;
533c21
     int transition_id = -1;
533c21
     crm_action_t *action = NULL;
533c21
-    int call_id = data->call_id;
533c21
-    int rc = data->rc;
533c21
-    char *userdata = data->userdata;
533c21
-
533c21
-    CRM_CHECK(userdata != NULL, return);
533c21
-    crm_notice("Stonith operation %d/%s: %s (%d)", call_id, (char *)userdata,
533c21
-               pcmk_strerror(rc), rc);
533c21
+    const char *target = NULL;
533c21
 
533c21
-    if (AM_I_DC == FALSE) {
533c21
+    if ((data == NULL) || (data->userdata == NULL)) {
533c21
+        crm_err("Ignoring fence operation %d result: "
533c21
+                "No transition key given (bug?)",
533c21
+                ((data == NULL)? -1 : data->call_id));
533c21
         return;
533c21
     }
533c21
 
533c21
-    /* crm_info("call=%d, optype=%d, node_name=%s, result=%d, node_list=%s, action=%s", */
533c21
-    /*       op->call_id, op->optype, op->node_name, op->op_result, */
533c21
-    /*       (char *)op->node_list, op->private_data); */
533c21
+    if (!AM_I_DC) {
533c21
+        const char *reason = stonith__exit_reason(data);
533c21
+
533c21
+        if (reason == NULL) {
533c21
+           reason = pcmk_exec_status_str(stonith__execution_status(data));
533c21
+        }
533c21
+        crm_notice("Result of fence operation %d: %d (%s) " CRM_XS " key=%s",
533c21
+                   data->call_id, stonith__exit_status(data), reason,
533c21
+                   (const char *) data->userdata);
533c21
+        return;
533c21
+    }
533c21
 
533c21
-    /* filter out old STONITH actions */
533c21
-    CRM_CHECK(decode_transition_key(userdata, &uuid, &transition_id, &stonith_id, NULL),
533c21
+    CRM_CHECK(decode_transition_key(data->userdata, &uuid, &transition_id,
533c21
+                                    &stonith_id, NULL),
533c21
               goto bail);
533c21
 
533c21
-    if (transition_graph->complete || stonith_id < 0 || !pcmk__str_eq(uuid, te_uuid, pcmk__str_casei)
533c21
-        || transition_graph->id != transition_id) {
533c21
-        crm_info("Ignoring STONITH action initiated outside of the current transition");
533c21
+    if (transition_graph->complete || (stonith_id < 0)
533c21
+        || !pcmk__str_eq(uuid, te_uuid, pcmk__str_none)
533c21
+        || (transition_graph->id != transition_id)) {
533c21
+        crm_info("Ignoring fence operation %d result: "
533c21
+                 "Not from current transition " CRM_XS
533c21
+                 " complete=%s action=%d uuid=%s (vs %s) transition=%d (vs %d)",
533c21
+                 data->call_id, pcmk__btoa(transition_graph->complete),
533c21
+                 stonith_id, uuid, te_uuid, transition_id, transition_graph->id);
533c21
         goto bail;
533c21
     }
533c21
 
533c21
     action = controld_get_action(stonith_id);
533c21
     if (action == NULL) {
533c21
-        crm_err("Stonith action not matched");
533c21
+        crm_err("Ignoring fence operation %d result: "
533c21
+                "Action %d not found in transition graph (bug?) "
533c21
+                CRM_XS " uuid=%s transition=%d",
533c21
+                data->call_id, stonith_id, uuid, transition_id);
533c21
+        goto bail;
533c21
+    }
533c21
+
533c21
+    target = crm_element_value(action->xml, XML_LRM_ATTR_TARGET);
533c21
+    if (target == NULL) {
533c21
+        crm_err("Ignoring fence operation %d result: No target given (bug?)",
533c21
+                data->call_id);
533c21
         goto bail;
533c21
     }
533c21
 
533c21
     stop_te_timer(action->timer);
533c21
-    if (rc == pcmk_ok) {
533c21
-        const char *target = crm_element_value(action->xml, XML_LRM_ATTR_TARGET);
533c21
+    if (stonith__exit_status(data) == CRM_EX_OK) {
533c21
         const char *uuid = crm_element_value(action->xml, XML_LRM_ATTR_TARGET_UUID);
533c21
         const char *op = crm_meta_value(action->params, "stonith_action");
533c21
 
533c21
-        crm_info("Stonith operation %d for %s passed", call_id, target);
533c21
+        crm_notice("Fence operation %d for %s passed", data->call_id, target);
533c21
         if (!(pcmk_is_set(action->flags, pcmk__graph_action_confirmed))) {
533c21
             te_action_confirmed(action, NULL);
533c21
             if (pcmk__str_eq("on", op, pcmk__str_casei)) {
533c21
@@ -791,20 +810,30 @@ tengine_stonith_callback(stonith_t *stonith, stonith_callback_data_t *data)
533c21
         st_fail_count_reset(target);
533c21
 
533c21
     } else {
533c21
-        const char *target = crm_element_value(action->xml, XML_LRM_ATTR_TARGET);
533c21
         enum transition_action abort_action = tg_restart;
533c21
+        int status = stonith__execution_status(data);
533c21
+        const char *reason = stonith__exit_reason(data);
533c21
 
533c21
+        if (reason == NULL) {
533c21
+            if (status == PCMK_EXEC_DONE) {
533c21
+                reason = "Agent returned error";
533c21
+            } else {
533c21
+                reason = pcmk_exec_status_str(status);
533c21
+            }
533c21
+        }
533c21
         crm__set_graph_action_flags(action, pcmk__graph_action_failed);
533c21
-        crm_notice("Stonith operation %d for %s failed (%s): aborting transition.",
533c21
-                   call_id, target, pcmk_strerror(rc));
533c21
 
533c21
         /* If no fence devices were available, there's no use in immediately
533c21
          * checking again, so don't start a new transition in that case.
533c21
          */
533c21
-        if (rc == -ENODEV) {
533c21
-            crm_warn("No devices found in cluster to fence %s, giving up",
533c21
-                     target);
533c21
+        if (status == PCMK_EXEC_NO_FENCE_DEVICE) {
533c21
+            crm_warn("Fence operation %d for %s failed: %s "
533c21
+                     "(aborting transition and giving up for now)",
533c21
+                     data->call_id, target, reason);
533c21
             abort_action = tg_stop;
533c21
+        } else {
533c21
+            crm_notice("Fence operation %d for %s failed: %s "
533c21
+                       "(aborting transition)", data->call_id, target, reason);
533c21
         }
533c21
 
533c21
         /* Increment the fail count now, so abort_for_stonith_failure() can
533c21
@@ -818,7 +847,7 @@ tengine_stonith_callback(stonith_t *stonith, stonith_callback_data_t *data)
533c21
     trigger_graph();
533c21
 
533c21
   bail:
533c21
-    free(userdata);
533c21
+    free(data->userdata);
533c21
     free(uuid);
533c21
     return;
533c21
 }
533c21
-- 
533c21
2.27.0
533c21
533c21
533c21
From 25547e3b7e6eb23efad1c359388d6e8d0df62363 Mon Sep 17 00:00:00 2001
533c21
From: Ken Gaillot <kgaillot@redhat.com>
533c21
Date: Mon, 22 Nov 2021 12:37:16 -0600
533c21
Subject: [PATCH 04/17] Refactor: executor: drop action_get_uniform_rc()
533c21
 function
533c21
533c21
action_get_uniform_rc() called stonith2uniform_rc() or services_result2ocf() as
533c21
appropriate to the action standard. However, it was called only from a place
533c21
that did not process stonith actions, so that place can just call
533c21
services_result2ocf() directly.
533c21
533c21
This will simplify planned changes.
533c21
---
533c21
 daemons/execd/execd_commands.c | 24 ++++++------------------
533c21
 1 file changed, 6 insertions(+), 18 deletions(-)
533c21
533c21
diff --git a/daemons/execd/execd_commands.c b/daemons/execd/execd_commands.c
533c21
index 5bb2aab692..5e123e322e 100644
533c21
--- a/daemons/execd/execd_commands.c
533c21
+++ b/daemons/execd/execd_commands.c
533c21
@@ -780,23 +780,6 @@ stonith2uniform_rc(const char *action, int rc)
533c21
     return rc;
533c21
 }
533c21
 
533c21
-static int
533c21
-action_get_uniform_rc(svc_action_t *action)
533c21
-{
533c21
-    lrmd_cmd_t *cmd = action->cb_data;
533c21
-
533c21
-    if (pcmk__str_eq(action->standard, PCMK_RESOURCE_CLASS_STONITH,
533c21
-                            pcmk__str_casei)) {
533c21
-        return stonith2uniform_rc(cmd->action, action->rc);
533c21
-    } else {
533c21
-        enum ocf_exitcode code = services_result2ocf(action->standard,
533c21
-                                                     cmd->action, action->rc);
533c21
-
533c21
-        // Cast variable instead of function return to keep compilers happy
533c21
-        return (int) code;
533c21
-    }
533c21
-}
533c21
-
533c21
 struct notify_new_client_data {
533c21
     xmlNode *notify;
533c21
     pcmk__client_t *new_client;
533c21
@@ -848,6 +831,7 @@ action_complete(svc_action_t * action)
533c21
 {
533c21
     lrmd_rsc_t *rsc;
533c21
     lrmd_cmd_t *cmd = action->cb_data;
533c21
+    enum ocf_exitcode code;
533c21
 
533c21
 #ifdef PCMK__TIME_USE_CGT
533c21
     const char *rclass = NULL;
533c21
@@ -867,8 +851,12 @@ action_complete(svc_action_t * action)
533c21
 #endif
533c21
 
533c21
     cmd->last_pid = action->pid;
533c21
-    pcmk__set_result(&(cmd->result), action_get_uniform_rc(action),
533c21
+
533c21
+    // Cast variable instead of function return to keep compilers happy
533c21
+    code = services_result2ocf(action->standard, cmd->action, action->rc);
533c21
+    pcmk__set_result(&(cmd->result), (int) code,
533c21
                      action->status, services__exit_reason(action));
533c21
+
533c21
     rsc = cmd->rsc_id ? g_hash_table_lookup(rsc_list, cmd->rsc_id) : NULL;
533c21
 
533c21
 #ifdef PCMK__TIME_USE_CGT
533c21
-- 
533c21
2.27.0
533c21
533c21
533c21
From b5e31ba2539da4e94c124c3f0c8c72f7039f9a7a Mon Sep 17 00:00:00 2001
533c21
From: Ken Gaillot <kgaillot@redhat.com>
533c21
Date: Mon, 22 Nov 2021 12:39:30 -0600
533c21
Subject: [PATCH 05/17] Feature: executor: use full result from fencer for
533c21
 fence actions
533c21
533c21
Now that fence callbacks get the full result, we can improve the executor
533c21
command result for fence actions. stonith_action_complete() now takes a
533c21
full result, allowing the executor to use that directly rather than map a
533c21
legacy return code.
533c21
---
533c21
 daemons/execd/execd_commands.c | 140 +++++++++++++++++++--------------
533c21
 1 file changed, 80 insertions(+), 60 deletions(-)
533c21
533c21
diff --git a/daemons/execd/execd_commands.c b/daemons/execd/execd_commands.c
533c21
index 5e123e322e..e722994012 100644
533c21
--- a/daemons/execd/execd_commands.c
533c21
+++ b/daemons/execd/execd_commands.c
533c21
@@ -8,6 +8,7 @@
533c21
  */
533c21
 
533c21
 #include <crm_internal.h>
533c21
+#include <crm/fencing/internal.h>
533c21
 
533c21
 #include <glib.h>
533c21
 
533c21
@@ -748,38 +749,6 @@ cmd_finalize(lrmd_cmd_t * cmd, lrmd_rsc_t * rsc)
533c21
     }
533c21
 }
533c21
 
533c21
-static int
533c21
-stonith2uniform_rc(const char *action, int rc)
533c21
-{
533c21
-    switch (rc) {
533c21
-        case pcmk_ok:
533c21
-            rc = PCMK_OCF_OK;
533c21
-            break;
533c21
-
533c21
-        case -ENODEV:
533c21
-            /* This should be possible only for probes in practice, but
533c21
-             * interpret for all actions to be safe.
533c21
-             */
533c21
-            if (pcmk__str_eq(action, "monitor", pcmk__str_casei)) {
533c21
-                rc = PCMK_OCF_NOT_RUNNING;
533c21
-            } else if (pcmk__str_eq(action, "stop", pcmk__str_casei)) {
533c21
-                rc = PCMK_OCF_OK;
533c21
-            } else {
533c21
-                rc = PCMK_OCF_NOT_INSTALLED;
533c21
-            }
533c21
-            break;
533c21
-
533c21
-        case -EOPNOTSUPP:
533c21
-            rc = PCMK_OCF_UNIMPLEMENT_FEATURE;
533c21
-            break;
533c21
-
533c21
-        default:
533c21
-            rc = PCMK_OCF_UNKNOWN_ERROR;
533c21
-            break;
533c21
-    }
533c21
-    return rc;
533c21
-}
533c21
-
533c21
 struct notify_new_client_data {
533c21
     xmlNode *notify;
533c21
     pcmk__client_t *new_client;
533c21
@@ -988,46 +957,84 @@ action_complete(svc_action_t * action)
533c21
     cmd_finalize(cmd, rsc);
533c21
 }
533c21
 
533c21
+/*!
533c21
+ * \internal
533c21
+ * \brief Process the result of a fence device action (start, stop, or monitor)
533c21
+ *
533c21
+ * \param[in] cmd               Fence device action that completed
533c21
+ * \param[in] exit_status       Fencer API exit status for action
533c21
+ * \param[in] execution_status  Fencer API execution status for action
533c21
+ * \param[in] exit_reason       Human-friendly detail, if action failed
533c21
+ */
533c21
 static void
533c21
-stonith_action_complete(lrmd_cmd_t * cmd, int rc)
533c21
+stonith_action_complete(lrmd_cmd_t *cmd, int exit_status,
533c21
+                        enum pcmk_exec_status execution_status,
533c21
+                        const char *exit_reason)
533c21
 {
533c21
     // This can be NULL if resource was removed before command completed
533c21
     lrmd_rsc_t *rsc = g_hash_table_lookup(rsc_list, cmd->rsc_id);
533c21
 
533c21
-    cmd->result.exit_status = stonith2uniform_rc(cmd->action, rc);
533c21
+    // Simplify fencer exit status to uniform exit status
533c21
+    if (exit_status != CRM_EX_OK) {
533c21
+        exit_status = PCMK_OCF_UNKNOWN_ERROR;
533c21
+    }
533c21
 
533c21
-    /* This function may be called with status already set to cancelled, if a
533c21
-     * pending action was aborted. Otherwise, we need to determine status from
533c21
-     * the fencer return code.
533c21
-     */
533c21
-    if (cmd->result.execution_status != PCMK_EXEC_CANCELLED) {
533c21
-        cmd->result.execution_status = stonith__legacy2status(rc);
533c21
+    if (cmd->result.execution_status == PCMK_EXEC_CANCELLED) {
533c21
+        /* An in-flight fence action was cancelled. The execution status is
533c21
+         * already correct, so don't overwrite it.
533c21
+         */
533c21
+        execution_status = PCMK_EXEC_CANCELLED;
533c21
 
533c21
-        // Simplify status codes from fencer
533c21
-        switch (cmd->result.execution_status) {
533c21
+    } else {
533c21
+        /* Some execution status codes have specific meanings for the fencer
533c21
+         * that executor clients may not expect, so map them to a simple error
533c21
+         * status.
533c21
+         */
533c21
+        switch (execution_status) {
533c21
             case PCMK_EXEC_NOT_CONNECTED:
533c21
             case PCMK_EXEC_INVALID:
533c21
-            case PCMK_EXEC_NO_FENCE_DEVICE:
533c21
             case PCMK_EXEC_NO_SECRETS:
533c21
-                cmd->result.execution_status = PCMK_EXEC_ERROR;
533c21
+                execution_status = PCMK_EXEC_ERROR;
533c21
                 break;
533c21
-            default:
533c21
+
533c21
+            case PCMK_EXEC_NO_FENCE_DEVICE:
533c21
+                /* This should be possible only for probes in practice, but
533c21
+                 * interpret for all actions to be safe.
533c21
+                 */
533c21
+                if (pcmk__str_eq(cmd->action, CRMD_ACTION_STATUS,
533c21
+                                 pcmk__str_none)) {
533c21
+                    exit_status = PCMK_OCF_NOT_RUNNING;
533c21
+
533c21
+                } else if (pcmk__str_eq(cmd->action, CRMD_ACTION_STOP,
533c21
+                                        pcmk__str_none)) {
533c21
+                    exit_status = PCMK_OCF_OK;
533c21
+
533c21
+                } else {
533c21
+                    exit_status = PCMK_OCF_NOT_INSTALLED;
533c21
+                }
533c21
+                execution_status = PCMK_EXEC_ERROR;
533c21
                 break;
533c21
-        }
533c21
 
533c21
-        // Certain successful actions change the known state of the resource
533c21
-        if ((rsc != NULL) && pcmk__result_ok(&(cmd->result))) {
533c21
-            if (pcmk__str_eq(cmd->action, "start", pcmk__str_casei)) {
533c21
-                rsc->st_probe_rc = pcmk_ok; // maps to PCMK_OCF_OK
533c21
-            } else if (pcmk__str_eq(cmd->action, "stop", pcmk__str_casei)) {
533c21
-                rsc->st_probe_rc = -ENODEV; // maps to PCMK_OCF_NOT_RUNNING
533c21
-            }
533c21
+            case PCMK_EXEC_NOT_SUPPORTED:
533c21
+                exit_status = PCMK_OCF_UNIMPLEMENT_FEATURE;
533c21
+                break;
533c21
+
533c21
+            default:
533c21
+                break;
533c21
         }
533c21
     }
533c21
 
533c21
-    // Give the user more detail than an OCF code
533c21
-    if (rc != -pcmk_err_generic) {
533c21
-        cmd->result.exit_reason = strdup(pcmk_strerror(rc));
533c21
+    pcmk__set_result(&cmd->result, exit_status, execution_status, exit_reason);
533c21
+
533c21
+    // Certain successful actions change the known state of the resource
533c21
+    if ((rsc != NULL) && pcmk__result_ok(&(cmd->result))) {
533c21
+
533c21
+        if (pcmk__str_eq(cmd->action, "start", pcmk__str_casei)) {
533c21
+            rsc->st_probe_rc = pcmk_ok; // maps to PCMK_OCF_OK
533c21
+
533c21
+        } else if (pcmk__str_eq(cmd->action, "stop", pcmk__str_casei)) {
533c21
+            rsc->st_probe_rc = -ENODEV; // maps to PCMK_OCF_NOT_RUNNING
533c21
+        }
533c21
     }
533c21
 
533c21
     /* The recurring timer should not be running at this point in any case, but
533c21
@@ -1050,7 +1057,15 @@ stonith_action_complete(lrmd_cmd_t * cmd, int rc)
533c21
 static void
533c21
 lrmd_stonith_callback(stonith_t * stonith, stonith_callback_data_t * data)
533c21
 {
533c21
-    stonith_action_complete(data->userdata, data->rc);
533c21
+    if ((data == NULL) || (data->userdata == NULL)) {
533c21
+        crm_err("Ignoring fence action result: "
533c21
+                "Invalid callback arguments (bug?)");
533c21
+    } else {
533c21
+        stonith_action_complete((lrmd_cmd_t *) data->userdata,
533c21
+                                stonith__exit_status(data),
533c21
+                                stonith__execution_status(data),
533c21
+                                stonith__exit_reason(data));
533c21
+    }
533c21
 }
533c21
 
533c21
 void
533c21
@@ -1097,7 +1112,9 @@ stonith_connection_failed(void)
533c21
     crm_err("Connection to fencer failed, finalizing %d pending operations",
533c21
             g_list_length(cmd_list));
533c21
     for (cmd_iter = cmd_list; cmd_iter; cmd_iter = cmd_iter->next) {
533c21
-        stonith_action_complete(cmd_iter->data, -ENOTCONN);
533c21
+        stonith_action_complete((lrmd_cmd_t *) cmd_iter->data,
533c21
+                                CRM_EX_ERROR, PCMK_EXEC_NOT_CONNECTED,
533c21
+                                "Lost connection to fencer");
533c21
     }
533c21
     g_list_free(cmd_list);
533c21
 }
533c21
@@ -1210,7 +1227,7 @@ lrmd_rsc_execute_stonith(lrmd_rsc_t * rsc, lrmd_cmd_t * cmd)
533c21
 
533c21
     } else if (pcmk__str_eq(cmd->action, "start", pcmk__str_casei)) {
533c21
         rc = execd_stonith_start(stonith_api, rsc, cmd);
533c21
-        if (rc == 0) {
533c21
+        if (rc == pcmk_ok) {
533c21
             do_monitor = TRUE;
533c21
         }
533c21
 
533c21
@@ -1233,7 +1250,10 @@ lrmd_rsc_execute_stonith(lrmd_rsc_t * rsc, lrmd_cmd_t * cmd)
533c21
         }
533c21
     }
533c21
 
533c21
-    stonith_action_complete(cmd, rc);
533c21
+    stonith_action_complete(cmd,
533c21
+                            ((rc == pcmk_ok)? CRM_EX_OK : CRM_EX_ERROR),
533c21
+                            stonith__legacy2status(rc),
533c21
+                            rc == -pcmk_err_generic? NULL : pcmk_strerror(rc));
533c21
 }
533c21
 
533c21
 static int
533c21
-- 
533c21
2.27.0
533c21
533c21
533c21
From 0cdc8506c2383cf05c2f62ab1ac9438958daf210 Mon Sep 17 00:00:00 2001
533c21
From: Ken Gaillot <kgaillot@redhat.com>
533c21
Date: Mon, 22 Nov 2021 16:15:05 -0600
533c21
Subject: [PATCH 06/17] Fix: executor,scheduler: treat "no secrets" fence
533c21
 results as a hard error
533c21
533c21
Previously, the executor mapped the fencer's PCMK_EXEC_NO_SECRETS status to
533c21
PCMK_EXEC_ERROR to keep handling of that situation the same as before the new
533c21
code was added.
533c21
533c21
However, the earlier handling was less than ideal -- a resource action that
533c21
failed due to missing secrets would be retried on the same node, and almost
533c21
certainly fail again for the same reason. Now, the executor passes along
533c21
PCMK_EXEC_NO_SECRETS to clients; the controller will record the result in the
533c21
CIB status, and the scheduler will treat it as a hard error (i.e. not retrying
533c21
on the same node).
533c21
533c21
Backward compatibility isn't a problem because the scheduler treats unknown
533c21
status codes the same as PCMK_EXEC_ERROR, so an older DC will continue to
533c21
handle it as before. The CRM feature set has been bumped so the handling can't
533c21
flip back and forth in a mixed-version cluster.
533c21
---
533c21
 daemons/execd/execd_commands.c | 1 -
533c21
 include/crm/crm.h              | 4 ++--
533c21
 lib/pengine/unpack.c           | 3 ---
533c21
 3 files changed, 2 insertions(+), 6 deletions(-)
533c21
533c21
diff --git a/daemons/execd/execd_commands.c b/daemons/execd/execd_commands.c
533c21
index e722994012..4ced6d1d5c 100644
533c21
--- a/daemons/execd/execd_commands.c
533c21
+++ b/daemons/execd/execd_commands.c
533c21
@@ -993,7 +993,6 @@ stonith_action_complete(lrmd_cmd_t *cmd, int exit_status,
533c21
         switch (execution_status) {
533c21
             case PCMK_EXEC_NOT_CONNECTED:
533c21
             case PCMK_EXEC_INVALID:
533c21
-            case PCMK_EXEC_NO_SECRETS:
533c21
                 execution_status = PCMK_EXEC_ERROR;
533c21
                 break;
533c21
 
533c21
diff --git a/include/crm/crm.h b/include/crm/crm.h
533c21
index 16b35e9c55..56b07cb12a 100644
533c21
--- a/include/crm/crm.h
533c21
+++ b/include/crm/crm.h
533c21
@@ -1,5 +1,5 @@
533c21
 /*
533c21
- * Copyright 2004-2021 the Pacemaker project contributors
533c21
+ * Copyright 2004-2022 the Pacemaker project contributors
533c21
  *
533c21
  * The version control history for this file may have further details.
533c21
  *
533c21
@@ -66,7 +66,7 @@ extern "C" {
533c21
  * >=3.0.13: Fail counts include operation name and interval
533c21
  * >=3.2.0:  DC supports PCMK_EXEC_INVALID and PCMK_EXEC_NOT_CONNECTED
533c21
  */
533c21
-#  define CRM_FEATURE_SET		"3.12.0"
533c21
+#  define CRM_FEATURE_SET		"3.13.0"
533c21
 
533c21
 /* Pacemaker's CPG protocols use fixed-width binary fields for the sender and
533c21
  * recipient of a CPG message. This imposes an arbitrary limit on cluster node
533c21
diff --git a/lib/pengine/unpack.c b/lib/pengine/unpack.c
533c21
index 3e0384cd2a..8a2d2a6d6d 100644
533c21
--- a/lib/pengine/unpack.c
533c21
+++ b/lib/pengine/unpack.c
533c21
@@ -3879,9 +3879,6 @@ unpack_rsc_op(pe_resource_t *rsc, pe_node_t *node, xmlNode *xml_op,
533c21
         case PCMK_EXEC_INVALID:
533c21
             break; // Not done, do error handling
533c21
 
533c21
-        /* These should only be possible in fence action results, not operation
533c21
-         * history, but have some handling in place as a fail-safe.
533c21
-         */
533c21
         case PCMK_EXEC_NO_FENCE_DEVICE:
533c21
         case PCMK_EXEC_NO_SECRETS:
533c21
             status = PCMK_EXEC_ERROR_HARD;
533c21
-- 
533c21
2.27.0
533c21
533c21
533c21
From 75c1bdcf3ffc406e6fa286fd5fcff83e1e65591a Mon Sep 17 00:00:00 2001
533c21
From: Ken Gaillot <kgaillot@redhat.com>
533c21
Date: Wed, 10 Nov 2021 12:05:20 -0600
533c21
Subject: [PATCH 07/17] Low: executor: improve result for fence device probes
533c21
533c21
Now that lrmd_rsc_execute_stonith() sets a full result instead of just a legacy
533c21
return code, refactor lrmd_rsc_t's st_probe_rc as an execution status (and
533c21
rename to fence_probe_result). Set an appropriate exit reason when available.
533c21
---
533c21
 daemons/execd/execd_commands.c  | 57 ++++++++++++++++++++++++++-------
533c21
 daemons/execd/pacemaker-execd.h |  9 +++++-
533c21
 2 files changed, 54 insertions(+), 12 deletions(-)
533c21
533c21
diff --git a/daemons/execd/execd_commands.c b/daemons/execd/execd_commands.c
533c21
index 4ced6d1d5c..6e5505e973 100644
533c21
--- a/daemons/execd/execd_commands.c
533c21
+++ b/daemons/execd/execd_commands.c
533c21
@@ -285,7 +285,9 @@ build_rsc_from_xml(xmlNode * msg)
533c21
     rsc->provider = crm_element_value_copy(rsc_xml, F_LRMD_PROVIDER);
533c21
     rsc->type = crm_element_value_copy(rsc_xml, F_LRMD_TYPE);
533c21
     rsc->work = mainloop_add_trigger(G_PRIORITY_HIGH, lrmd_rsc_dispatch, rsc);
533c21
-    rsc->st_probe_rc = -ENODEV; // if stonith, initialize to "not running"
533c21
+
533c21
+    // Initialize fence device probes (to return "not running")
533c21
+    rsc->fence_probe_result = PCMK_EXEC_NO_FENCE_DEVICE;
533c21
     return rsc;
533c21
 }
533c21
 
533c21
@@ -1029,10 +1031,10 @@ stonith_action_complete(lrmd_cmd_t *cmd, int exit_status,
533c21
     if ((rsc != NULL) && pcmk__result_ok(&(cmd->result))) {
533c21
 
533c21
         if (pcmk__str_eq(cmd->action, "start", pcmk__str_casei)) {
533c21
-            rsc->st_probe_rc = pcmk_ok; // maps to PCMK_OCF_OK
533c21
+            rsc->fence_probe_result = PCMK_EXEC_DONE; // "running"
533c21
 
533c21
         } else if (pcmk__str_eq(cmd->action, "stop", pcmk__str_casei)) {
533c21
-            rsc->st_probe_rc = -ENODEV; // maps to PCMK_OCF_NOT_RUNNING
533c21
+            rsc->fence_probe_result = PCMK_EXEC_NO_FENCE_DEVICE; // "not running"
533c21
         }
533c21
     }
533c21
 
533c21
@@ -1081,14 +1083,13 @@ stonith_connection_failed(void)
533c21
         if (pcmk__str_eq(rsc->class, PCMK_RESOURCE_CLASS_STONITH, pcmk__str_casei)) {
533c21
             /* If we registered this fence device, we don't know whether the
533c21
              * fencer still has the registration or not. Cause future probes to
533c21
-             * return PCMK_OCF_UNKNOWN_ERROR until the resource is stopped or
533c21
-             * started successfully. This is especially important if the
533c21
-             * controller also went away (possibly due to a cluster layer
533c21
-             * restart) and won't receive our client notification of any
533c21
-             * monitors finalized below.
533c21
+             * return an error until the resource is stopped or started
533c21
+             * successfully. This is especially important if the controller also
533c21
+             * went away (possibly due to a cluster layer restart) and won't
533c21
+             * receive our client notification of any monitors finalized below.
533c21
              */
533c21
-            if (rsc->st_probe_rc == pcmk_ok) {
533c21
-                rsc->st_probe_rc = pcmk_err_generic;
533c21
+            if (rsc->fence_probe_result == PCMK_EXEC_DONE) {
533c21
+                rsc->fence_probe_result = PCMK_EXEC_NOT_CONNECTED;
533c21
             }
533c21
 
533c21
             if (rsc->active) {
533c21
@@ -1213,6 +1214,39 @@ execd_stonith_monitor(stonith_t *stonith_api, lrmd_rsc_t *rsc, lrmd_cmd_t *cmd)
533c21
     return rc;
533c21
 }
533c21
 
533c21
+/*!
533c21
+ * \internal
533c21
+ * \brief  Finalize the result of a fence device probe
533c21
+ *
533c21
+ * \param[in] cmd           Probe action
533c21
+ * \param[in] probe_result  Probe result
533c21
+ */
533c21
+static void
533c21
+finalize_fence_device_probe(lrmd_cmd_t *cmd, enum pcmk_exec_status probe_result)
533c21
+{
533c21
+    int exit_status = CRM_EX_ERROR;
533c21
+    const char *reason = NULL;
533c21
+
533c21
+    switch (probe_result) {
533c21
+        case PCMK_EXEC_DONE: // Device is "running"
533c21
+            exit_status = CRM_EX_OK;
533c21
+            break;
533c21
+
533c21
+        case PCMK_EXEC_NO_FENCE_DEVICE: // Device is "not running"
533c21
+            break;
533c21
+
533c21
+        case PCMK_EXEC_NOT_CONNECTED: // stonith_connection_failed()
533c21
+            reason = "Lost connection to fencer";
533c21
+            break;
533c21
+
533c21
+        default: // Shouldn't be possible
533c21
+            probe_result = PCMK_EXEC_ERROR;
533c21
+            reason = "Invalid fence device probe result (bug?)";
533c21
+            break;
533c21
+    }
533c21
+    stonith_action_complete(cmd, exit_status, probe_result, reason);
533c21
+}
533c21
+
533c21
 static void
533c21
 lrmd_rsc_execute_stonith(lrmd_rsc_t * rsc, lrmd_cmd_t * cmd)
533c21
 {
533c21
@@ -1237,7 +1271,8 @@ lrmd_rsc_execute_stonith(lrmd_rsc_t * rsc, lrmd_cmd_t * cmd)
533c21
         if (cmd->interval_ms > 0) {
533c21
             do_monitor = TRUE;
533c21
         } else {
533c21
-            rc = rsc->st_probe_rc;
533c21
+            finalize_fence_device_probe(cmd, rsc->fence_probe_result);
533c21
+            return;
533c21
         }
533c21
     }
533c21
 
533c21
diff --git a/daemons/execd/pacemaker-execd.h b/daemons/execd/pacemaker-execd.h
533c21
index 51ef8d22e6..057d889584 100644
533c21
--- a/daemons/execd/pacemaker-execd.h
533c21
+++ b/daemons/execd/pacemaker-execd.h
533c21
@@ -41,7 +41,14 @@ typedef struct lrmd_rsc_s {
533c21
      * that have been handed off from the pending ops list. */
533c21
     GList *recurring_ops;
533c21
 
533c21
-    int st_probe_rc; // What value should be returned for a probe if stonith
533c21
+    /* If this resource is a fence device, probes are handled internally by the
533c21
+     * executor, and this value indicates the result that should currently be
533c21
+     * returned for probes. It should be one of:
533c21
+     * PCMK_EXEC_DONE (to indicate "running"),
533c21
+     * PCMK_EXEC_NO_FENCE_DEVICE ("not running"), or
533c21
+     * PCMK_EXEC_NOT_CONNECTED ("unknown because fencer connection was lost").
533c21
+     */
533c21
+    enum pcmk_exec_status fence_probe_result;
533c21
 
533c21
     crm_trigger_t *work;
533c21
 } lrmd_rsc_t;
533c21
-- 
533c21
2.27.0
533c21
533c21
533c21
From 1ab799d945171ab8d91bd0aada64e70a71193e5c Mon Sep 17 00:00:00 2001
533c21
From: Ken Gaillot <kgaillot@redhat.com>
533c21
Date: Wed, 10 Nov 2021 12:14:48 -0600
533c21
Subject: [PATCH 08/17] Low: executor: don't require a fencer connection for
533c21
 probes
533c21
533c21
For fence devices, probe results are based on earlier state determinations,
533c21
so handle them before requiring an active fencer connection. The effect may be
533c21
negligible, but it would allow probes to proceed while waiting for a
533c21
reconnection.
533c21
---
533c21
 daemons/execd/execd_commands.c | 15 ++++++++-------
533c21
 1 file changed, 8 insertions(+), 7 deletions(-)
533c21
533c21
diff --git a/daemons/execd/execd_commands.c b/daemons/execd/execd_commands.c
533c21
index 6e5505e973..5999ba19c9 100644
533c21
--- a/daemons/execd/execd_commands.c
533c21
+++ b/daemons/execd/execd_commands.c
533c21
@@ -1255,7 +1255,13 @@ lrmd_rsc_execute_stonith(lrmd_rsc_t * rsc, lrmd_cmd_t * cmd)
533c21
 
533c21
     stonith_t *stonith_api = get_stonith_connection();
533c21
 
533c21
-    if (!stonith_api) {
533c21
+    if (pcmk__str_eq(cmd->action, "monitor", pcmk__str_casei)
533c21
+        && (cmd->interval_ms == 0)) {
533c21
+        // Probes don't require a fencer connection
533c21
+        finalize_fence_device_probe(cmd, rsc->fence_probe_result);
533c21
+        return;
533c21
+
533c21
+    } else if (stonith_api == NULL) {
533c21
         rc = -ENOTCONN;
533c21
 
533c21
     } else if (pcmk__str_eq(cmd->action, "start", pcmk__str_casei)) {
533c21
@@ -1268,12 +1274,7 @@ lrmd_rsc_execute_stonith(lrmd_rsc_t * rsc, lrmd_cmd_t * cmd)
533c21
         rc = execd_stonith_stop(stonith_api, rsc);
533c21
 
533c21
     } else if (pcmk__str_eq(cmd->action, "monitor", pcmk__str_casei)) {
533c21
-        if (cmd->interval_ms > 0) {
533c21
-            do_monitor = TRUE;
533c21
-        } else {
533c21
-            finalize_fence_device_probe(cmd, rsc->fence_probe_result);
533c21
-            return;
533c21
-        }
533c21
+        do_monitor = TRUE;
533c21
     }
533c21
 
533c21
     if (do_monitor) {
533c21
-- 
533c21
2.27.0
533c21
533c21
533c21
From adf41fb1637bcc9a6e057be52d61a0b26e4535cc Mon Sep 17 00:00:00 2001
533c21
From: Ken Gaillot <kgaillot@redhat.com>
533c21
Date: Wed, 10 Nov 2021 12:20:34 -0600
533c21
Subject: [PATCH 09/17] Low: executor: return an error for unsupported fence
533c21
 device actions
533c21
533c21
... and set an exit reason. Previously, it would return success for unsupported
533c21
actions. It shouldn't be possible, but it would be nice to have an indication
533c21
of what is wrong if a bug is introduced.
533c21
---
533c21
 daemons/execd/execd_commands.c | 6 ++++++
533c21
 1 file changed, 6 insertions(+)
533c21
533c21
diff --git a/daemons/execd/execd_commands.c b/daemons/execd/execd_commands.c
533c21
index 5999ba19c9..772d6446dc 100644
533c21
--- a/daemons/execd/execd_commands.c
533c21
+++ b/daemons/execd/execd_commands.c
533c21
@@ -1275,6 +1275,12 @@ lrmd_rsc_execute_stonith(lrmd_rsc_t * rsc, lrmd_cmd_t * cmd)
533c21
 
533c21
     } else if (pcmk__str_eq(cmd->action, "monitor", pcmk__str_casei)) {
533c21
         do_monitor = TRUE;
533c21
+
533c21
+    } else {
533c21
+        stonith_action_complete(cmd, PCMK_OCF_UNIMPLEMENT_FEATURE,
533c21
+                                PCMK_EXEC_ERROR,
533c21
+                                "Invalid fence device action (bug?)");
533c21
+        return;
533c21
     }
533c21
 
533c21
     if (do_monitor) {
533c21
-- 
533c21
2.27.0
533c21
533c21
533c21
From af59dfe85bc83f5609d0a3b3b7939271549cb76f Mon Sep 17 00:00:00 2001
533c21
From: Ken Gaillot <kgaillot@redhat.com>
533c21
Date: Wed, 10 Nov 2021 12:24:07 -0600
533c21
Subject: [PATCH 10/17] Low: executor: set exit reason if no fencer connection
533c21
533c21
---
533c21
 daemons/execd/execd_commands.c | 5 ++++-
533c21
 1 file changed, 4 insertions(+), 1 deletion(-)
533c21
533c21
diff --git a/daemons/execd/execd_commands.c b/daemons/execd/execd_commands.c
533c21
index 772d6446dc..7ae309d94c 100644
533c21
--- a/daemons/execd/execd_commands.c
533c21
+++ b/daemons/execd/execd_commands.c
533c21
@@ -1262,7 +1262,10 @@ lrmd_rsc_execute_stonith(lrmd_rsc_t * rsc, lrmd_cmd_t * cmd)
533c21
         return;
533c21
 
533c21
     } else if (stonith_api == NULL) {
533c21
-        rc = -ENOTCONN;
533c21
+        stonith_action_complete(cmd, PCMK_OCF_UNKNOWN_ERROR,
533c21
+                                PCMK_EXEC_NOT_CONNECTED,
533c21
+                                "No connection to fencer");
533c21
+        return;
533c21
 
533c21
     } else if (pcmk__str_eq(cmd->action, "start", pcmk__str_casei)) {
533c21
         rc = execd_stonith_start(stonith_api, rsc, cmd);
533c21
-- 
533c21
2.27.0
533c21
533c21
533c21
From ad0930b75d5617490c3a0dc3c6b83411b3c4536d Mon Sep 17 00:00:00 2001
533c21
From: Ken Gaillot <kgaillot@redhat.com>
533c21
Date: Wed, 10 Nov 2021 14:42:26 -0600
533c21
Subject: [PATCH 11/17] Test: cts-fence-helper: log full result in fence
533c21
 callback
533c21
533c21
---
533c21
 daemons/fenced/cts-fence-helper.c | 7 +++++--
533c21
 1 file changed, 5 insertions(+), 2 deletions(-)
533c21
533c21
diff --git a/daemons/fenced/cts-fence-helper.c b/daemons/fenced/cts-fence-helper.c
533c21
index 2adb032f24..c2b55d73b9 100644
533c21
--- a/daemons/fenced/cts-fence-helper.c
533c21
+++ b/daemons/fenced/cts-fence-helper.c
533c21
@@ -1,5 +1,5 @@
533c21
 /*
533c21
- * Copyright 2009-2020 the Pacemaker project contributors
533c21
+ * Copyright 2009-2021 the Pacemaker project contributors
533c21
  *
533c21
  * This source code is licensed under the GNU General Public License version 2
533c21
  * or later (GPLv2+) WITHOUT ANY WARRANTY.
533c21
@@ -132,7 +132,10 @@ st_callback(stonith_t * st, stonith_event_t * e)
533c21
 static void
533c21
 st_global_callback(stonith_t * stonith, stonith_callback_data_t * data)
533c21
 {
533c21
-    crm_notice("Call id %d completed with rc %d", data->call_id, data->rc);
533c21
+    crm_notice("Call %d exited %d: %s (%s)",
533c21
+               data->call_id, stonith__exit_status(data),
533c21
+               pcmk_exec_status_str(stonith__execution_status(data)),
533c21
+               crm_str(stonith__exit_reason(data)));
533c21
 }
533c21
 
533c21
 static void
533c21
-- 
533c21
2.27.0
533c21
533c21
533c21
From 1b50ff4d83b7a96cd70389891b7b6568812f66f6 Mon Sep 17 00:00:00 2001
533c21
From: Ken Gaillot <kgaillot@redhat.com>
533c21
Date: Wed, 10 Nov 2021 15:10:14 -0600
533c21
Subject: [PATCH 12/17] Test: cts-fence-helper: track full result instead of
533c21
 legacy return code
533c21
533c21
---
533c21
 daemons/fenced/cts-fence-helper.c | 77 +++++++++++++++----------------
533c21
 1 file changed, 37 insertions(+), 40 deletions(-)
533c21
533c21
diff --git a/daemons/fenced/cts-fence-helper.c b/daemons/fenced/cts-fence-helper.c
533c21
index c2b55d73b9..2739f57804 100644
533c21
--- a/daemons/fenced/cts-fence-helper.c
533c21
+++ b/daemons/fenced/cts-fence-helper.c
533c21
@@ -34,23 +34,12 @@
533c21
 static GMainLoop *mainloop = NULL;
533c21
 static crm_trigger_t *trig = NULL;
533c21
 static int mainloop_iter = 0;
533c21
-static int callback_rc = 0;
533c21
+static pcmk__action_result_t result = PCMK__UNKNOWN_RESULT;
533c21
+
533c21
 typedef void (*mainloop_test_iteration_cb) (int check_event);
533c21
 
533c21
 #define MAINLOOP_DEFAULT_TIMEOUT 2
533c21
 
533c21
-#define mainloop_test_done(pass) \
533c21
-    if (pass) { \
533c21
-        crm_info("SUCCESS - %s", __func__); \
533c21
-        mainloop_iter++;   \
533c21
-        mainloop_set_trigger(trig);  \
533c21
-    } else { \
533c21
-        crm_err("FAILURE = %s async_callback %d", __func__, callback_rc); \
533c21
-        crm_exit(CRM_EX_ERROR); \
533c21
-    } \
533c21
-    callback_rc = 0; \
533c21
-
533c21
-
533c21
 enum test_modes {
533c21
     test_standard = 0,  // test using a specific developer environment
533c21
     test_passive,       // watch notifications only
533c21
@@ -93,6 +82,23 @@ static const int st_opts = st_opt_sync_call;
533c21
 static int expected_notifications = 0;
533c21
 static int verbose = 0;
533c21
 
533c21
+static void
533c21
+mainloop_test_done(const char *origin, bool pass)
533c21
+{
533c21
+    if (pass) {
533c21
+        crm_info("SUCCESS - %s", origin);
533c21
+        mainloop_iter++;
533c21
+        mainloop_set_trigger(trig);
533c21
+        result.execution_status = PCMK_EXEC_UNKNOWN;
533c21
+        result.exit_status = CRM_EX_OK;
533c21
+    } else {
533c21
+        crm_err("FAILURE - %s (%d: %s)", origin, result.exit_status,
533c21
+                pcmk_exec_status_str(result.execution_status));
533c21
+        crm_exit(CRM_EX_ERROR);
533c21
+    }
533c21
+}
533c21
+
533c21
+
533c21
 static void
533c21
 dispatch_helper(int timeout)
533c21
 {
533c21
@@ -385,7 +391,9 @@ static void
533c21
 static void
533c21
 mainloop_callback(stonith_t * stonith, stonith_callback_data_t * data)
533c21
 {
533c21
-    callback_rc = data->rc;
533c21
+    pcmk__set_result(&result, stonith__exit_status(data),
533c21
+                     stonith__execution_status(data),
533c21
+                     stonith__exit_reason(data));
533c21
     iterate_mainloop_tests(TRUE);
533c21
 }
533c21
 
533c21
@@ -404,18 +412,14 @@ test_async_fence_pass(int check_event)
533c21
     int rc = 0;
533c21
 
533c21
     if (check_event) {
533c21
-        if (callback_rc != 0) {
533c21
-            mainloop_test_done(FALSE);
533c21
-        } else {
533c21
-            mainloop_test_done(TRUE);
533c21
-        }
533c21
+        mainloop_test_done(__func__, (result.exit_status == CRM_EX_OK));
533c21
         return;
533c21
     }
533c21
 
533c21
     rc = st->cmds->fence(st, 0, "true_1_node1", "off", MAINLOOP_DEFAULT_TIMEOUT, 0);
533c21
     if (rc < 0) {
533c21
         crm_err("fence failed with rc %d", rc);
533c21
-        mainloop_test_done(FALSE);
533c21
+        mainloop_test_done(__func__, false);
533c21
     }
533c21
     register_callback_helper(rc);
533c21
     /* wait for event */
533c21
@@ -431,15 +435,15 @@ test_async_fence_custom_timeout(int check_event)
533c21
     if (check_event) {
533c21
         uint32_t diff = (time(NULL) - begin);
533c21
 
533c21
-        if (callback_rc != -ETIME) {
533c21
-            mainloop_test_done(FALSE);
533c21
+        if (result.execution_status != PCMK_EXEC_TIMEOUT) {
533c21
+            mainloop_test_done(__func__, false);
533c21
         } else if (diff < CUSTOM_TIMEOUT_ADDITION + MAINLOOP_DEFAULT_TIMEOUT) {
533c21
             crm_err
533c21
                 ("Custom timeout test failed, callback expiration should be updated to %d, actual timeout was %d",
533c21
                  CUSTOM_TIMEOUT_ADDITION + MAINLOOP_DEFAULT_TIMEOUT, diff);
533c21
-            mainloop_test_done(FALSE);
533c21
+            mainloop_test_done(__func__, false);
533c21
         } else {
533c21
-            mainloop_test_done(TRUE);
533c21
+            mainloop_test_done(__func__, true);
533c21
         }
533c21
         return;
533c21
     }
533c21
@@ -448,7 +452,7 @@ test_async_fence_custom_timeout(int check_event)
533c21
     rc = st->cmds->fence(st, 0, "custom_timeout_node1", "off", MAINLOOP_DEFAULT_TIMEOUT, 0);
533c21
     if (rc < 0) {
533c21
         crm_err("fence failed with rc %d", rc);
533c21
-        mainloop_test_done(FALSE);
533c21
+        mainloop_test_done(__func__, false);
533c21
     }
533c21
     register_callback_helper(rc);
533c21
     /* wait for event */
533c21
@@ -460,18 +464,15 @@ test_async_fence_timeout(int check_event)
533c21
     int rc = 0;
533c21
 
533c21
     if (check_event) {
533c21
-        if (callback_rc != -ENODEV) {
533c21
-            mainloop_test_done(FALSE);
533c21
-        } else {
533c21
-            mainloop_test_done(TRUE);
533c21
-        }
533c21
+        mainloop_test_done(__func__,
533c21
+                           (result.execution_status == PCMK_EXEC_NO_FENCE_DEVICE));
533c21
         return;
533c21
     }
533c21
 
533c21
     rc = st->cmds->fence(st, 0, "false_1_node2", "off", MAINLOOP_DEFAULT_TIMEOUT, 0);
533c21
     if (rc < 0) {
533c21
         crm_err("fence failed with rc %d", rc);
533c21
-        mainloop_test_done(FALSE);
533c21
+        mainloop_test_done(__func__, false);
533c21
     }
533c21
     register_callback_helper(rc);
533c21
     /* wait for event */
533c21
@@ -483,18 +484,14 @@ test_async_monitor(int check_event)
533c21
     int rc = 0;
533c21
 
533c21
     if (check_event) {
533c21
-        if (callback_rc) {
533c21
-            mainloop_test_done(FALSE);
533c21
-        } else {
533c21
-            mainloop_test_done(TRUE);
533c21
-        }
533c21
+        mainloop_test_done(__func__, (result.exit_status == CRM_EX_OK));
533c21
         return;
533c21
     }
533c21
 
533c21
     rc = st->cmds->monitor(st, 0, "false_1", MAINLOOP_DEFAULT_TIMEOUT);
533c21
     if (rc < 0) {
533c21
         crm_err("monitor failed with rc %d", rc);
533c21
-        mainloop_test_done(FALSE);
533c21
+        mainloop_test_done(__func__, false);
533c21
     }
533c21
 
533c21
     register_callback_helper(rc);
533c21
@@ -531,7 +528,7 @@ test_register_async_devices(int check_event)
533c21
                               params);
533c21
     stonith_key_value_freeall(params, 1, 1);
533c21
 
533c21
-    mainloop_test_done(TRUE);
533c21
+    mainloop_test_done(__func__, true);
533c21
 }
533c21
 
533c21
 static void
533c21
@@ -540,11 +537,11 @@ try_mainloop_connect(int check_event)
533c21
     int rc = stonith_api_connect_retry(st, crm_system_name, 10);
533c21
 
533c21
     if (rc == pcmk_ok) {
533c21
-        mainloop_test_done(TRUE);
533c21
+        mainloop_test_done(__func__, true);
533c21
         return;
533c21
     }
533c21
     crm_err("API CONNECTION FAILURE");
533c21
-    mainloop_test_done(FALSE);
533c21
+    mainloop_test_done(__func__, false);
533c21
 }
533c21
 
533c21
 static void
533c21
-- 
533c21
2.27.0
533c21
533c21
533c21
From 8ff4b384a34828a4a9eebe896324ba8c89e5d66c Mon Sep 17 00:00:00 2001
533c21
From: Ken Gaillot <kgaillot@redhat.com>
533c21
Date: Mon, 10 Jan 2022 10:27:45 -0600
533c21
Subject: [PATCH 13/17] Doc: Pacemaker Development: correct typo
533c21
533c21
caught in review
533c21
---
533c21
 doc/sphinx/Pacemaker_Development/components.rst | 2 +-
533c21
 1 file changed, 1 insertion(+), 1 deletion(-)
533c21
533c21
diff --git a/doc/sphinx/Pacemaker_Development/components.rst b/doc/sphinx/Pacemaker_Development/components.rst
533c21
index 68158484ce..c4d10fc9f5 100644
533c21
--- a/doc/sphinx/Pacemaker_Development/components.rst
533c21
+++ b/doc/sphinx/Pacemaker_Development/components.rst
533c21
@@ -171,7 +171,7 @@ messaging layer callback, which calls:
533c21
 
533c21
     * ``fenced_process_fencing_reply()``, which calls either
533c21
       ``request_peer_fencing()`` (to retry a failed operation, or try the next
533c21
-      device in a topology is appropriate, which issues a new
533c21
+      device in a topology if appropriate, which issues a new
533c21
       ``STONITH_OP_FENCE`` request, proceeding as before) or
533c21
       ``finalize_op()`` (if the operation is definitively failed or
533c21
       successful).
533c21
-- 
533c21
2.27.0
533c21
533c21
533c21
From 822ee6fbd8583a2939c636b3bccceffcc338c567 Mon Sep 17 00:00:00 2001
533c21
From: Ken Gaillot <kgaillot@redhat.com>
533c21
Date: Mon, 10 Jan 2022 11:05:40 -0600
533c21
Subject: [PATCH 14/17] Doc: Pacemaker Development: add a placeholder for how
533c21
 fencing history works
533c21
533c21
---
533c21
 doc/sphinx/Pacemaker_Development/components.rst | 15 +++++++++++++++
533c21
 1 file changed, 15 insertions(+)
533c21
533c21
diff --git a/doc/sphinx/Pacemaker_Development/components.rst b/doc/sphinx/Pacemaker_Development/components.rst
533c21
index c4d10fc9f5..760da77c9b 100644
533c21
--- a/doc/sphinx/Pacemaker_Development/components.rst
533c21
+++ b/doc/sphinx/Pacemaker_Development/components.rst
533c21
@@ -183,6 +183,21 @@ Finally, all peers receive the broadcast result and call
533c21
 * ``finalize_op()``, which sends the result to all local clients.
533c21
 
533c21
 
533c21
+.. index::
533c21
+   single: fence history
533c21
+
533c21
+Fencing History
533c21
+_______________
533c21
+
533c21
+The fencer keeps a running history of all fencing operations. The bulk of the
533c21
+relevant code is in ``fenced_history.c`` and ensures the history is synchronized
533c21
+across all nodes even if a node leaves and rejoins the cluster.
533c21
+
533c21
+In libstonithd, this information is represented by ``stonith_history_t`` and is
533c21
+queryable by the ``stonith_api_operations_t:history()`` method. ``crm_mon`` and
533c21
+``stonith_admin`` use this API to display the history.
533c21
+
533c21
+
533c21
 .. index::
533c21
    single: scheduler
533c21
    single: pacemaker-schedulerd
533c21
-- 
533c21
2.27.0
533c21
533c21
533c21
From d9b4060f2dadb40d5ee7535e0b2890a83d216c1e Mon Sep 17 00:00:00 2001
533c21
From: Ken Gaillot <kgaillot@redhat.com>
533c21
Date: Mon, 10 Jan 2022 11:25:31 -0600
533c21
Subject: [PATCH 15/17] Log: fencing: add exit reason for results without a
533c21
 callback
533c21
533c21
---
533c21
 lib/fencing/st_client.c | 6 ++++--
533c21
 1 file changed, 4 insertions(+), 2 deletions(-)
533c21
533c21
diff --git a/lib/fencing/st_client.c b/lib/fencing/st_client.c
533c21
index 9d93ffd481..4823751267 100644
533c21
--- a/lib/fencing/st_client.c
533c21
+++ b/lib/fencing/st_client.c
533c21
@@ -926,9 +926,11 @@ invoke_registered_callbacks(stonith_t *stonith, xmlNode *msg, int call_id)
533c21
                                      cb_info->user_data, cb_info->callback);
533c21
 
533c21
     } else if ((private->op_callback == NULL) && !pcmk__result_ok(&result)) {
533c21
-        crm_warn("Fencing action without registered callback failed: %d (%s)",
533c21
+        crm_warn("Fencing action without registered callback failed: %d (%s%s%s)",
533c21
                  result.exit_status,
533c21
-                 pcmk_exec_status_str(result.execution_status));
533c21
+                 pcmk_exec_status_str(result.execution_status),
533c21
+                 ((result.exit_reason == NULL)? "" : ": "),
533c21
+                 ((result.exit_reason == NULL)? "" : result.exit_reason));
533c21
         crm_log_xml_debug(msg, "Failed fence update");
533c21
     }
533c21
 
533c21
-- 
533c21
2.27.0
533c21
533c21
533c21
From 9956b3ad2f1c6fba305252616ad0b35a38ab96da Mon Sep 17 00:00:00 2001
533c21
From: Ken Gaillot <kgaillot@redhat.com>
533c21
Date: Tue, 11 Jan 2022 09:28:27 -0600
533c21
Subject: [PATCH 16/17] Refactor: executor: keep formatting consistent
533c21
533c21
... even if the line runs a little long
533c21
---
533c21
 daemons/execd/execd_commands.c | 4 ++--
533c21
 1 file changed, 2 insertions(+), 2 deletions(-)
533c21
533c21
diff --git a/daemons/execd/execd_commands.c b/daemons/execd/execd_commands.c
533c21
index 7ae309d94c..bc3b392b2c 100644
533c21
--- a/daemons/execd/execd_commands.c
533c21
+++ b/daemons/execd/execd_commands.c
533c21
@@ -1,5 +1,5 @@
533c21
 /*
533c21
- * Copyright 2012-2021 the Pacemaker project contributors
533c21
+ * Copyright 2012-2022 the Pacemaker project contributors
533c21
  *
533c21
  * The version control history for this file may have further details.
533c21
  *
533c21
@@ -1297,7 +1297,7 @@ lrmd_rsc_execute_stonith(lrmd_rsc_t * rsc, lrmd_cmd_t * cmd)
533c21
     stonith_action_complete(cmd,
533c21
                             ((rc == pcmk_ok)? CRM_EX_OK : CRM_EX_ERROR),
533c21
                             stonith__legacy2status(rc),
533c21
-                            rc == -pcmk_err_generic? NULL : pcmk_strerror(rc));
533c21
+                            ((rc == -pcmk_err_generic)? NULL : pcmk_strerror(rc)));
533c21
 }
533c21
 
533c21
 static int
533c21
-- 
533c21
2.27.0
533c21
533c21
533c21
From 69d8ecb17568d6c3ecad0e5735756f58a4bce5a1 Mon Sep 17 00:00:00 2001
533c21
From: Ken Gaillot <kgaillot@redhat.com>
533c21
Date: Tue, 11 Jan 2022 09:29:03 -0600
533c21
Subject: [PATCH 17/17] Test: cts-fence-helper: use more intuitive execution
533c21
 status for completed tests
533c21
533c21
It doesn't matter since the value is only checked against a couple of specific
533c21
failure values, but this is less confusing.
533c21
---
533c21
 daemons/fenced/cts-fence-helper.c | 4 ++--
533c21
 1 file changed, 2 insertions(+), 2 deletions(-)
533c21
533c21
diff --git a/daemons/fenced/cts-fence-helper.c b/daemons/fenced/cts-fence-helper.c
533c21
index 2739f57804..e222a59f9f 100644
533c21
--- a/daemons/fenced/cts-fence-helper.c
533c21
+++ b/daemons/fenced/cts-fence-helper.c
533c21
@@ -1,5 +1,5 @@
533c21
 /*
533c21
- * Copyright 2009-2021 the Pacemaker project contributors
533c21
+ * Copyright 2009-2022 the Pacemaker project contributors
533c21
  *
533c21
  * This source code is licensed under the GNU General Public License version 2
533c21
  * or later (GPLv2+) WITHOUT ANY WARRANTY.
533c21
@@ -89,7 +89,7 @@ mainloop_test_done(const char *origin, bool pass)
533c21
         crm_info("SUCCESS - %s", origin);
533c21
         mainloop_iter++;
533c21
         mainloop_set_trigger(trig);
533c21
-        result.execution_status = PCMK_EXEC_UNKNOWN;
533c21
+        result.execution_status = PCMK_EXEC_DONE;
533c21
         result.exit_status = CRM_EX_OK;
533c21
     } else {
533c21
         crm_err("FAILURE - %s (%d: %s)", origin, result.exit_status,
533c21
-- 
533c21
2.27.0
533c21