Blame SOURCES/011-fencing-reasons.patch

97a979
From 6db8e3adef0441953ec18dd0339c0a67c5c26bdf Mon Sep 17 00:00:00 2001
97a979
From: Ken Gaillot <kgaillot@redhat.com>
97a979
Date: Tue, 14 Dec 2021 16:25:21 -0600
97a979
Subject: [PATCH 01/17] Doc: Pacemaker Development: update for recent function
97a979
 renames
97a979
97a979
---
97a979
 doc/sphinx/Pacemaker_Development/components.rst | 16 ++++++++--------
97a979
 1 file changed, 8 insertions(+), 8 deletions(-)
97a979
97a979
diff --git a/doc/sphinx/Pacemaker_Development/components.rst b/doc/sphinx/Pacemaker_Development/components.rst
97a979
index a51220cac9..68158484ce 100644
97a979
--- a/doc/sphinx/Pacemaker_Development/components.rst
97a979
+++ b/doc/sphinx/Pacemaker_Development/components.rst
97a979
@@ -106,7 +106,7 @@ or messaging layer callback, which calls:
97a979
       the number of active peers), and if this is the last expected reply,
97a979
       calls
97a979
 
97a979
-      * ``call_remote_stonith()``, which calculates the timeout and sends
97a979
+      * ``request_peer_fencing()``, which calculates the timeout and sends
97a979
         ``STONITH_OP_FENCE`` request(s) to carry out the fencing. If the target
97a979
 	node has a fencing "topology" (which allows specifications such as
97a979
 	"this node can be fenced either with device A, or devices B and C in
97a979
@@ -156,7 +156,7 @@ returns, and calls
97a979
   * done callback (``st_child_done()``), which calls ``schedule_stonith_command()``
97a979
     for a new device if there are further required actions to execute or if the
97a979
     original action failed, then builds and sends an XML reply to the original
97a979
-    fencer (via ``stonith_send_async_reply()``), then checks whether any
97a979
+    fencer (via ``send_async_reply()``), then checks whether any
97a979
     pending actions are the same as the one just executed and merges them if so.
97a979
 
97a979
 Fencing replies
97a979
@@ -169,18 +169,18 @@ messaging layer callback, which calls:
97a979
 
97a979
   * ``handle_reply()``, which calls
97a979
 
97a979
-    * ``process_remote_stonith_exec()``, which calls either
97a979
-      ``call_remote_stonith()`` (to retry a failed operation, or try the next
97a979
-       device in a topology is appropriate, which issues a new
97a979
+    * ``fenced_process_fencing_reply()``, which calls either
97a979
+      ``request_peer_fencing()`` (to retry a failed operation, or try the next
97a979
+      device in a topology if appropriate, which issues a new
97a979
       ``STONITH_OP_FENCE`` request, proceeding as before) or
97a979
-      ``remote_op_done()`` (if the operation is definitively failed or
97a979
+      ``finalize_op()`` (if the operation is definitively failed or
97a979
       successful).
97a979
 
97a979
-      * remote_op_done() broadcasts the result to all peers.
97a979
+      * ``finalize_op()`` broadcasts the result to all peers.
97a979
 
97a979
 Finally, all peers receive the broadcast result and call
97a979
 
97a979
-* ``remote_op_done()``, which sends the result to all local clients.
97a979
+* ``finalize_op()``, which sends the result to all local clients.
97a979
 
97a979
 
97a979
 .. index::
97a979
-- 
97a979
2.27.0
97a979
97a979
97a979
From 47db9e5fb410b1e911710727d646eb7180a70c90 Mon Sep 17 00:00:00 2001
97a979
From: Ken Gaillot <kgaillot@redhat.com>
97a979
Date: Fri, 12 Nov 2021 09:58:16 -0600
97a979
Subject: [PATCH 02/17] Refactor: fencing: add full result to fence action
97a979
 callback data
97a979
97a979
stonith_callback_data_t previously only contained the legacy return code for
97a979
the action. Use its new opaque member to store the full result, along with
97a979
accessors (available only internally for now).
97a979
---
97a979
 include/crm/fencing/internal.h |  3 ++
97a979
 lib/fencing/st_client.c        | 99 ++++++++++++++++++++++++++--------
97a979
 2 files changed, 81 insertions(+), 21 deletions(-)
97a979
97a979
diff --git a/include/crm/fencing/internal.h b/include/crm/fencing/internal.h
97a979
index f0d294a0b3..eff689e59b 100644
97a979
--- a/include/crm/fencing/internal.h
97a979
+++ b/include/crm/fencing/internal.h
97a979
@@ -187,6 +187,9 @@ bool stonith__event_state_eq(stonith_history_t *history, void *user_data);
97a979
 bool stonith__event_state_neq(stonith_history_t *history, void *user_data);
97a979
 
97a979
 int stonith__legacy2status(int rc);
97a979
+int stonith__exit_status(stonith_callback_data_t *data);
97a979
+int stonith__execution_status(stonith_callback_data_t *data);
97a979
+const char *stonith__exit_reason(stonith_callback_data_t *data);
97a979
 
97a979
 /*!
97a979
  * \internal
97a979
diff --git a/lib/fencing/st_client.c b/lib/fencing/st_client.c
97a979
index 2ca094566b..9d93ffd481 100644
97a979
--- a/lib/fencing/st_client.c
97a979
+++ b/lib/fencing/st_client.c
97a979
@@ -854,20 +854,23 @@ stonith_api_del_callback(stonith_t * stonith, int call_id, bool all_callbacks)
97a979
  * \param[in] st        Fencer API connection
97a979
  * \param[in] call_id   If positive, call ID of completed fence action, otherwise
97a979
  *                      legacy return code for early action failure
97a979
- * \param[in] rc        Legacy return code for action result
97a979
+ * \param[in] result    Full result for action
97a979
  * \param[in] userdata  User data to pass to callback
97a979
  * \param[in] callback  Fence action callback to invoke
97a979
  */
97a979
 static void
97a979
-invoke_fence_action_callback(stonith_t *st, int call_id, int rc, void *userdata,
97a979
+invoke_fence_action_callback(stonith_t *st, int call_id,
97a979
+                             pcmk__action_result_t *result,
97a979
+                             void *userdata,
97a979
                              void (*callback) (stonith_t *st,
97a979
                                                stonith_callback_data_t *data))
97a979
 {
97a979
     stonith_callback_data_t data = { 0, };
97a979
 
97a979
     data.call_id = call_id;
97a979
-    data.rc = rc;
97a979
+    data.rc = pcmk_rc2legacy(stonith__result2rc(result));
97a979
     data.userdata = userdata;
97a979
+    data.opaque = (void *) result;
97a979
 
97a979
     callback(st, &data);
97a979
 }
97a979
@@ -888,7 +891,7 @@ invoke_registered_callbacks(stonith_t *stonith, xmlNode *msg, int call_id)
97a979
 {
97a979
     stonith_private_t *private = NULL;
97a979
     stonith_callback_client_t *cb_info = NULL;
97a979
-    int rc = pcmk_ok;
97a979
+    pcmk__action_result_t result = PCMK__UNKNOWN_RESULT;
97a979
 
97a979
     CRM_CHECK(stonith != NULL, return);
97a979
     CRM_CHECK(stonith->st_private != NULL, return);
97a979
@@ -897,20 +900,17 @@ invoke_registered_callbacks(stonith_t *stonith, xmlNode *msg, int call_id)
97a979
 
97a979
     if (msg == NULL) {
97a979
         // Fencer didn't reply in time
97a979
-        rc = -ETIME;
97a979
+        pcmk__set_result(&result, CRM_EX_ERROR, PCMK_EXEC_TIMEOUT,
97a979
+                         "Timeout waiting for reply from fencer");
97a979
         CRM_LOG_ASSERT(call_id > 0);
97a979
 
97a979
     } else {
97a979
         // We have the fencer reply
97a979
-
97a979
-        if (crm_element_value_int(msg, F_STONITH_RC, &rc) != 0) {
97a979
-            rc = -pcmk_err_generic;
97a979
-        }
97a979
-
97a979
         if ((crm_element_value_int(msg, F_STONITH_CALLID, &call_id) != 0)
97a979
             || (call_id <= 0)) {
97a979
             crm_log_xml_warn(msg, "Bad fencer reply");
97a979
         }
97a979
+        stonith__xe_get_result(msg, &result);
97a979
     }
97a979
 
97a979
     if (call_id > 0) {
97a979
@@ -919,27 +919,29 @@ invoke_registered_callbacks(stonith_t *stonith, xmlNode *msg, int call_id)
97a979
     }
97a979
 
97a979
     if ((cb_info != NULL) && (cb_info->callback != NULL)
97a979
-        && (rc == pcmk_ok || !(cb_info->only_success))) {
97a979
+        && (pcmk__result_ok(&result) || !(cb_info->only_success))) {
97a979
         crm_trace("Invoking callback %s for call %d",
97a979
                   crm_str(cb_info->id), call_id);
97a979
-        invoke_fence_action_callback(stonith, call_id, rc, cb_info->user_data,
97a979
-                                     cb_info->callback);
97a979
+        invoke_fence_action_callback(stonith, call_id, &result,
97a979
+                                     cb_info->user_data, cb_info->callback);
97a979
 
97a979
-    } else if ((private->op_callback == NULL) && (rc != pcmk_ok)) {
97a979
-        crm_warn("Fencing action without registered callback failed: %s",
97a979
-                 pcmk_strerror(rc));
97a979
+    } else if ((private->op_callback == NULL) && !pcmk__result_ok(&result)) {
97a979
+        crm_warn("Fencing action without registered callback failed: %d (%s)",
97a979
+                 result.exit_status,
97a979
+                 pcmk_exec_status_str(result.execution_status));
97a979
         crm_log_xml_debug(msg, "Failed fence update");
97a979
     }
97a979
 
97a979
     if (private->op_callback != NULL) {
97a979
         crm_trace("Invoking global callback for call %d", call_id);
97a979
-        invoke_fence_action_callback(stonith, call_id, rc, NULL,
97a979
+        invoke_fence_action_callback(stonith, call_id, &result, NULL,
97a979
                                      private->op_callback);
97a979
     }
97a979
 
97a979
     if (cb_info != NULL) {
97a979
         stonith_api_del_callback(stonith, call_id, FALSE);
97a979
     }
97a979
+    pcmk__reset_result(&result);
97a979
 }
97a979
 
97a979
 static gboolean
97a979
@@ -1252,14 +1254,18 @@ stonith_api_add_callback(stonith_t * stonith, int call_id, int timeout, int opti
97a979
     CRM_CHECK(stonith->st_private != NULL, return -EINVAL);
97a979
     private = stonith->st_private;
97a979
 
97a979
-    if (call_id == 0) {
97a979
+    if (call_id == 0) { // Add global callback
97a979
         private->op_callback = callback;
97a979
 
97a979
-    } else if (call_id < 0) {
97a979
+    } else if (call_id < 0) { // Call failed immediately, so call callback now
97a979
         if (!(options & st_opt_report_only_success)) {
97a979
+            pcmk__action_result_t result = PCMK__UNKNOWN_RESULT;
97a979
+
97a979
             crm_trace("Call failed, calling %s: %s", callback_name, pcmk_strerror(call_id));
97a979
-            invoke_fence_action_callback(stonith, call_id, call_id, user_data,
97a979
-                                         callback);
97a979
+            pcmk__set_result(&result, CRM_EX_ERROR,
97a979
+                             stonith__legacy2status(call_id), NULL);
97a979
+            invoke_fence_action_callback(stonith, call_id, &result,
97a979
+                                         user_data, callback);
97a979
         } else {
97a979
             crm_warn("Fencer call failed: %s", pcmk_strerror(call_id));
97a979
         }
97a979
@@ -2293,6 +2299,57 @@ stonith__device_parameter_flags(uint32_t *device_flags, const char *device_name,
97a979
     freeXpathObject(xpath);
97a979
 }
97a979
 
97a979
+/*!
97a979
+ * \internal
97a979
+ * \brief Return the exit status from an async action callback
97a979
+ *
97a979
+ * \param[in] data  Callback data
97a979
+ *
97a979
+ * \return Exit status from callback data
97a979
+ */
97a979
+int
97a979
+stonith__exit_status(stonith_callback_data_t *data)
97a979
+{
97a979
+    if ((data == NULL) || (data->opaque == NULL)) {
97a979
+        return CRM_EX_ERROR;
97a979
+    }
97a979
+    return ((pcmk__action_result_t *) data->opaque)->exit_status;
97a979
+}
97a979
+
97a979
+/*!
97a979
+ * \internal
97a979
+ * \brief Return the execution status from an async action callback
97a979
+ *
97a979
+ * \param[in] data  Callback data
97a979
+ *
97a979
+ * \return Execution status from callback data
97a979
+ */
97a979
+int
97a979
+stonith__execution_status(stonith_callback_data_t *data)
97a979
+{
97a979
+    if ((data == NULL) || (data->opaque == NULL)) {
97a979
+        return PCMK_EXEC_UNKNOWN;
97a979
+    }
97a979
+    return ((pcmk__action_result_t *) data->opaque)->execution_status;
97a979
+}
97a979
+
97a979
+/*!
97a979
+ * \internal
97a979
+ * \brief Return the exit reason from an async action callback
97a979
+ *
97a979
+ * \param[in] data  Callback data
97a979
+ *
97a979
+ * \return Exit reason from callback data
97a979
+ */
97a979
+const char *
97a979
+stonith__exit_reason(stonith_callback_data_t *data)
97a979
+{
97a979
+    if ((data == NULL) || (data->opaque == NULL)) {
97a979
+        return NULL;
97a979
+    }
97a979
+    return ((pcmk__action_result_t *) data->opaque)->exit_reason;
97a979
+}
97a979
+
97a979
 // Deprecated functions kept only for backward API compatibility
97a979
 // LCOV_EXCL_START
97a979
 
97a979
-- 
97a979
2.27.0
97a979
97a979
97a979
From 1e076370ef4ac7993b5ff21ed1cdfb3c4a494cf0 Mon Sep 17 00:00:00 2001
97a979
From: Ken Gaillot <kgaillot@redhat.com>
97a979
Date: Tue, 9 Nov 2021 16:16:03 -0600
97a979
Subject: [PATCH 03/17] Log: controller: improve fencing result messages
97a979
97a979
Now that fence callbacks get the full result, we can log a better message.
97a979
Also check for error conditions better, improve message wording, and ensure
97a979
only a single message is logged per result.
97a979
---
97a979
 daemons/controld/controld_fencing.c | 83 +++++++++++++++++++----------
97a979
 1 file changed, 56 insertions(+), 27 deletions(-)
97a979
97a979
diff --git a/daemons/controld/controld_fencing.c b/daemons/controld/controld_fencing.c
97a979
index f5a252c813..f8d2fc13f4 100644
97a979
--- a/daemons/controld/controld_fencing.c
97a979
+++ b/daemons/controld/controld_fencing.c
97a979
@@ -714,45 +714,64 @@ tengine_stonith_callback(stonith_t *stonith, stonith_callback_data_t *data)
97a979
     int stonith_id = -1;
97a979
     int transition_id = -1;
97a979
     crm_action_t *action = NULL;
97a979
-    int call_id = data->call_id;
97a979
-    int rc = data->rc;
97a979
-    char *userdata = data->userdata;
97a979
-
97a979
-    CRM_CHECK(userdata != NULL, return);
97a979
-    crm_notice("Stonith operation %d/%s: %s (%d)", call_id, (char *)userdata,
97a979
-               pcmk_strerror(rc), rc);
97a979
+    const char *target = NULL;
97a979
 
97a979
-    if (AM_I_DC == FALSE) {
97a979
+    if ((data == NULL) || (data->userdata == NULL)) {
97a979
+        crm_err("Ignoring fence operation %d result: "
97a979
+                "No transition key given (bug?)",
97a979
+                ((data == NULL)? -1 : data->call_id));
97a979
         return;
97a979
     }
97a979
 
97a979
-    /* crm_info("call=%d, optype=%d, node_name=%s, result=%d, node_list=%s, action=%s", */
97a979
-    /*       op->call_id, op->optype, op->node_name, op->op_result, */
97a979
-    /*       (char *)op->node_list, op->private_data); */
97a979
+    if (!AM_I_DC) {
97a979
+        const char *reason = stonith__exit_reason(data);
97a979
+
97a979
+        if (reason == NULL) {
97a979
+           reason = pcmk_exec_status_str(stonith__execution_status(data));
97a979
+        }
97a979
+        crm_notice("Result of fence operation %d: %d (%s) " CRM_XS " key=%s",
97a979
+                   data->call_id, stonith__exit_status(data), reason,
97a979
+                   (const char *) data->userdata);
97a979
+        return;
97a979
+    }
97a979
 
97a979
-    /* filter out old STONITH actions */
97a979
-    CRM_CHECK(decode_transition_key(userdata, &uuid, &transition_id, &stonith_id, NULL),
97a979
+    CRM_CHECK(decode_transition_key(data->userdata, &uuid, &transition_id,
97a979
+                                    &stonith_id, NULL),
97a979
               goto bail);
97a979
 
97a979
-    if (transition_graph->complete || stonith_id < 0 || !pcmk__str_eq(uuid, te_uuid, pcmk__str_casei)
97a979
-        || transition_graph->id != transition_id) {
97a979
-        crm_info("Ignoring STONITH action initiated outside of the current transition");
97a979
+    if (transition_graph->complete || (stonith_id < 0)
97a979
+        || !pcmk__str_eq(uuid, te_uuid, pcmk__str_none)
97a979
+        || (transition_graph->id != transition_id)) {
97a979
+        crm_info("Ignoring fence operation %d result: "
97a979
+                 "Not from current transition " CRM_XS
97a979
+                 " complete=%s action=%d uuid=%s (vs %s) transition=%d (vs %d)",
97a979
+                 data->call_id, pcmk__btoa(transition_graph->complete),
97a979
+                 stonith_id, uuid, te_uuid, transition_id, transition_graph->id);
97a979
         goto bail;
97a979
     }
97a979
 
97a979
     action = controld_get_action(stonith_id);
97a979
     if (action == NULL) {
97a979
-        crm_err("Stonith action not matched");
97a979
+        crm_err("Ignoring fence operation %d result: "
97a979
+                "Action %d not found in transition graph (bug?) "
97a979
+                CRM_XS " uuid=%s transition=%d",
97a979
+                data->call_id, stonith_id, uuid, transition_id);
97a979
+        goto bail;
97a979
+    }
97a979
+
97a979
+    target = crm_element_value(action->xml, XML_LRM_ATTR_TARGET);
97a979
+    if (target == NULL) {
97a979
+        crm_err("Ignoring fence operation %d result: No target given (bug?)",
97a979
+                data->call_id);
97a979
         goto bail;
97a979
     }
97a979
 
97a979
     stop_te_timer(action->timer);
97a979
-    if (rc == pcmk_ok) {
97a979
-        const char *target = crm_element_value(action->xml, XML_LRM_ATTR_TARGET);
97a979
+    if (stonith__exit_status(data) == CRM_EX_OK) {
97a979
         const char *uuid = crm_element_value(action->xml, XML_LRM_ATTR_TARGET_UUID);
97a979
         const char *op = crm_meta_value(action->params, "stonith_action");
97a979
 
97a979
-        crm_info("Stonith operation %d for %s passed", call_id, target);
97a979
+        crm_notice("Fence operation %d for %s passed", data->call_id, target);
97a979
         if (!(pcmk_is_set(action->flags, pcmk__graph_action_confirmed))) {
97a979
             te_action_confirmed(action, NULL);
97a979
             if (pcmk__str_eq("on", op, pcmk__str_casei)) {
97a979
@@ -791,20 +810,30 @@ tengine_stonith_callback(stonith_t *stonith, stonith_callback_data_t *data)
97a979
         st_fail_count_reset(target);
97a979
 
97a979
     } else {
97a979
-        const char *target = crm_element_value(action->xml, XML_LRM_ATTR_TARGET);
97a979
         enum transition_action abort_action = tg_restart;
97a979
+        int status = stonith__execution_status(data);
97a979
+        const char *reason = stonith__exit_reason(data);
97a979
 
97a979
+        if (reason == NULL) {
97a979
+            if (status == PCMK_EXEC_DONE) {
97a979
+                reason = "Agent returned error";
97a979
+            } else {
97a979
+                reason = pcmk_exec_status_str(status);
97a979
+            }
97a979
+        }
97a979
         crm__set_graph_action_flags(action, pcmk__graph_action_failed);
97a979
-        crm_notice("Stonith operation %d for %s failed (%s): aborting transition.",
97a979
-                   call_id, target, pcmk_strerror(rc));
97a979
 
97a979
         /* If no fence devices were available, there's no use in immediately
97a979
          * checking again, so don't start a new transition in that case.
97a979
          */
97a979
-        if (rc == -ENODEV) {
97a979
-            crm_warn("No devices found in cluster to fence %s, giving up",
97a979
-                     target);
97a979
+        if (status == PCMK_EXEC_NO_FENCE_DEVICE) {
97a979
+            crm_warn("Fence operation %d for %s failed: %s "
97a979
+                     "(aborting transition and giving up for now)",
97a979
+                     data->call_id, target, reason);
97a979
             abort_action = tg_stop;
97a979
+        } else {
97a979
+            crm_notice("Fence operation %d for %s failed: %s "
97a979
+                       "(aborting transition)", data->call_id, target, reason);
97a979
         }
97a979
 
97a979
         /* Increment the fail count now, so abort_for_stonith_failure() can
97a979
@@ -818,7 +847,7 @@ tengine_stonith_callback(stonith_t *stonith, stonith_callback_data_t *data)
97a979
     trigger_graph();
97a979
 
97a979
   bail:
97a979
-    free(userdata);
97a979
+    free(data->userdata);
97a979
     free(uuid);
97a979
     return;
97a979
 }
97a979
-- 
97a979
2.27.0
97a979
97a979
97a979
From 25547e3b7e6eb23efad1c359388d6e8d0df62363 Mon Sep 17 00:00:00 2001
97a979
From: Ken Gaillot <kgaillot@redhat.com>
97a979
Date: Mon, 22 Nov 2021 12:37:16 -0600
97a979
Subject: [PATCH 04/17] Refactor: executor: drop action_get_uniform_rc()
97a979
 function
97a979
97a979
action_get_uniform_rc() called stonith2uniform_rc() or services_result2ocf() as
97a979
appropriate to the action standard. However, it was called only from a place
97a979
that did not process stonith actions, so that place can just call
97a979
services_result2ocf() directly.
97a979
97a979
This will simplify planned changes.
97a979
---
97a979
 daemons/execd/execd_commands.c | 24 ++++++------------------
97a979
 1 file changed, 6 insertions(+), 18 deletions(-)
97a979
97a979
diff --git a/daemons/execd/execd_commands.c b/daemons/execd/execd_commands.c
97a979
index 5bb2aab692..5e123e322e 100644
97a979
--- a/daemons/execd/execd_commands.c
97a979
+++ b/daemons/execd/execd_commands.c
97a979
@@ -780,23 +780,6 @@ stonith2uniform_rc(const char *action, int rc)
97a979
     return rc;
97a979
 }
97a979
 
97a979
-static int
97a979
-action_get_uniform_rc(svc_action_t *action)
97a979
-{
97a979
-    lrmd_cmd_t *cmd = action->cb_data;
97a979
-
97a979
-    if (pcmk__str_eq(action->standard, PCMK_RESOURCE_CLASS_STONITH,
97a979
-                            pcmk__str_casei)) {
97a979
-        return stonith2uniform_rc(cmd->action, action->rc);
97a979
-    } else {
97a979
-        enum ocf_exitcode code = services_result2ocf(action->standard,
97a979
-                                                     cmd->action, action->rc);
97a979
-
97a979
-        // Cast variable instead of function return to keep compilers happy
97a979
-        return (int) code;
97a979
-    }
97a979
-}
97a979
-
97a979
 struct notify_new_client_data {
97a979
     xmlNode *notify;
97a979
     pcmk__client_t *new_client;
97a979
@@ -848,6 +831,7 @@ action_complete(svc_action_t * action)
97a979
 {
97a979
     lrmd_rsc_t *rsc;
97a979
     lrmd_cmd_t *cmd = action->cb_data;
97a979
+    enum ocf_exitcode code;
97a979
 
97a979
 #ifdef PCMK__TIME_USE_CGT
97a979
     const char *rclass = NULL;
97a979
@@ -867,8 +851,12 @@ action_complete(svc_action_t * action)
97a979
 #endif
97a979
 
97a979
     cmd->last_pid = action->pid;
97a979
-    pcmk__set_result(&(cmd->result), action_get_uniform_rc(action),
97a979
+
97a979
+    // Cast variable instead of function return to keep compilers happy
97a979
+    code = services_result2ocf(action->standard, cmd->action, action->rc);
97a979
+    pcmk__set_result(&(cmd->result), (int) code,
97a979
                      action->status, services__exit_reason(action));
97a979
+
97a979
     rsc = cmd->rsc_id ? g_hash_table_lookup(rsc_list, cmd->rsc_id) : NULL;
97a979
 
97a979
 #ifdef PCMK__TIME_USE_CGT
97a979
-- 
97a979
2.27.0
97a979
97a979
97a979
From b5e31ba2539da4e94c124c3f0c8c72f7039f9a7a Mon Sep 17 00:00:00 2001
97a979
From: Ken Gaillot <kgaillot@redhat.com>
97a979
Date: Mon, 22 Nov 2021 12:39:30 -0600
97a979
Subject: [PATCH 05/17] Feature: executor: use full result from fencer for
97a979
 fence actions
97a979
97a979
Now that fence callbacks get the full result, we can improve the executor
97a979
command result for fence actions. stonith_action_complete() now takes a
97a979
full result, allowing the executor to use that directly rather than map a
97a979
legacy return code.
97a979
---
97a979
 daemons/execd/execd_commands.c | 140 +++++++++++++++++++--------------
97a979
 1 file changed, 80 insertions(+), 60 deletions(-)
97a979
97a979
diff --git a/daemons/execd/execd_commands.c b/daemons/execd/execd_commands.c
97a979
index 5e123e322e..e722994012 100644
97a979
--- a/daemons/execd/execd_commands.c
97a979
+++ b/daemons/execd/execd_commands.c
97a979
@@ -8,6 +8,7 @@
97a979
  */
97a979
 
97a979
 #include <crm_internal.h>
97a979
+#include <crm/fencing/internal.h>
97a979
 
97a979
 #include <glib.h>
97a979
 
97a979
@@ -748,38 +749,6 @@ cmd_finalize(lrmd_cmd_t * cmd, lrmd_rsc_t * rsc)
97a979
     }
97a979
 }
97a979
 
97a979
-static int
97a979
-stonith2uniform_rc(const char *action, int rc)
97a979
-{
97a979
-    switch (rc) {
97a979
-        case pcmk_ok:
97a979
-            rc = PCMK_OCF_OK;
97a979
-            break;
97a979
-
97a979
-        case -ENODEV:
97a979
-            /* This should be possible only for probes in practice, but
97a979
-             * interpret for all actions to be safe.
97a979
-             */
97a979
-            if (pcmk__str_eq(action, "monitor", pcmk__str_casei)) {
97a979
-                rc = PCMK_OCF_NOT_RUNNING;
97a979
-            } else if (pcmk__str_eq(action, "stop", pcmk__str_casei)) {
97a979
-                rc = PCMK_OCF_OK;
97a979
-            } else {
97a979
-                rc = PCMK_OCF_NOT_INSTALLED;
97a979
-            }
97a979
-            break;
97a979
-
97a979
-        case -EOPNOTSUPP:
97a979
-            rc = PCMK_OCF_UNIMPLEMENT_FEATURE;
97a979
-            break;
97a979
-
97a979
-        default:
97a979
-            rc = PCMK_OCF_UNKNOWN_ERROR;
97a979
-            break;
97a979
-    }
97a979
-    return rc;
97a979
-}
97a979
-
97a979
 struct notify_new_client_data {
97a979
     xmlNode *notify;
97a979
     pcmk__client_t *new_client;
97a979
@@ -988,46 +957,84 @@ action_complete(svc_action_t * action)
97a979
     cmd_finalize(cmd, rsc);
97a979
 }
97a979
 
97a979
+/*!
97a979
+ * \internal
97a979
+ * \brief Process the result of a fence device action (start, stop, or monitor)
97a979
+ *
97a979
+ * \param[in] cmd               Fence device action that completed
97a979
+ * \param[in] exit_status       Fencer API exit status for action
97a979
+ * \param[in] execution_status  Fencer API execution status for action
97a979
+ * \param[in] exit_reason       Human-friendly detail, if action failed
97a979
+ */
97a979
 static void
97a979
-stonith_action_complete(lrmd_cmd_t * cmd, int rc)
97a979
+stonith_action_complete(lrmd_cmd_t *cmd, int exit_status,
97a979
+                        enum pcmk_exec_status execution_status,
97a979
+                        const char *exit_reason)
97a979
 {
97a979
     // This can be NULL if resource was removed before command completed
97a979
     lrmd_rsc_t *rsc = g_hash_table_lookup(rsc_list, cmd->rsc_id);
97a979
 
97a979
-    cmd->result.exit_status = stonith2uniform_rc(cmd->action, rc);
97a979
+    // Simplify fencer exit status to uniform exit status
97a979
+    if (exit_status != CRM_EX_OK) {
97a979
+        exit_status = PCMK_OCF_UNKNOWN_ERROR;
97a979
+    }
97a979
 
97a979
-    /* This function may be called with status already set to cancelled, if a
97a979
-     * pending action was aborted. Otherwise, we need to determine status from
97a979
-     * the fencer return code.
97a979
-     */
97a979
-    if (cmd->result.execution_status != PCMK_EXEC_CANCELLED) {
97a979
-        cmd->result.execution_status = stonith__legacy2status(rc);
97a979
+    if (cmd->result.execution_status == PCMK_EXEC_CANCELLED) {
97a979
+        /* An in-flight fence action was cancelled. The execution status is
97a979
+         * already correct, so don't overwrite it.
97a979
+         */
97a979
+        execution_status = PCMK_EXEC_CANCELLED;
97a979
 
97a979
-        // Simplify status codes from fencer
97a979
-        switch (cmd->result.execution_status) {
97a979
+    } else {
97a979
+        /* Some execution status codes have specific meanings for the fencer
97a979
+         * that executor clients may not expect, so map them to a simple error
97a979
+         * status.
97a979
+         */
97a979
+        switch (execution_status) {
97a979
             case PCMK_EXEC_NOT_CONNECTED:
97a979
             case PCMK_EXEC_INVALID:
97a979
-            case PCMK_EXEC_NO_FENCE_DEVICE:
97a979
             case PCMK_EXEC_NO_SECRETS:
97a979
-                cmd->result.execution_status = PCMK_EXEC_ERROR;
97a979
+                execution_status = PCMK_EXEC_ERROR;
97a979
                 break;
97a979
-            default:
97a979
+
97a979
+            case PCMK_EXEC_NO_FENCE_DEVICE:
97a979
+                /* This should be possible only for probes in practice, but
97a979
+                 * interpret for all actions to be safe.
97a979
+                 */
97a979
+                if (pcmk__str_eq(cmd->action, CRMD_ACTION_STATUS,
97a979
+                                 pcmk__str_none)) {
97a979
+                    exit_status = PCMK_OCF_NOT_RUNNING;
97a979
+
97a979
+                } else if (pcmk__str_eq(cmd->action, CRMD_ACTION_STOP,
97a979
+                                        pcmk__str_none)) {
97a979
+                    exit_status = PCMK_OCF_OK;
97a979
+
97a979
+                } else {
97a979
+                    exit_status = PCMK_OCF_NOT_INSTALLED;
97a979
+                }
97a979
+                execution_status = PCMK_EXEC_ERROR;
97a979
                 break;
97a979
-        }
97a979
 
97a979
-        // Certain successful actions change the known state of the resource
97a979
-        if ((rsc != NULL) && pcmk__result_ok(&(cmd->result))) {
97a979
-            if (pcmk__str_eq(cmd->action, "start", pcmk__str_casei)) {
97a979
-                rsc->st_probe_rc = pcmk_ok; // maps to PCMK_OCF_OK
97a979
-            } else if (pcmk__str_eq(cmd->action, "stop", pcmk__str_casei)) {
97a979
-                rsc->st_probe_rc = -ENODEV; // maps to PCMK_OCF_NOT_RUNNING
97a979
-            }
97a979
+            case PCMK_EXEC_NOT_SUPPORTED:
97a979
+                exit_status = PCMK_OCF_UNIMPLEMENT_FEATURE;
97a979
+                break;
97a979
+
97a979
+            default:
97a979
+                break;
97a979
         }
97a979
     }
97a979
 
97a979
-    // Give the user more detail than an OCF code
97a979
-    if (rc != -pcmk_err_generic) {
97a979
-        cmd->result.exit_reason = strdup(pcmk_strerror(rc));
97a979
+    pcmk__set_result(&cmd->result, exit_status, execution_status, exit_reason);
97a979
+
97a979
+    // Certain successful actions change the known state of the resource
97a979
+    if ((rsc != NULL) && pcmk__result_ok(&(cmd->result))) {
97a979
+
97a979
+        if (pcmk__str_eq(cmd->action, "start", pcmk__str_casei)) {
97a979
+            rsc->st_probe_rc = pcmk_ok; // maps to PCMK_OCF_OK
97a979
+
97a979
+        } else if (pcmk__str_eq(cmd->action, "stop", pcmk__str_casei)) {
97a979
+            rsc->st_probe_rc = -ENODEV; // maps to PCMK_OCF_NOT_RUNNING
97a979
+        }
97a979
     }
97a979
 
97a979
     /* The recurring timer should not be running at this point in any case, but
97a979
@@ -1050,7 +1057,15 @@ stonith_action_complete(lrmd_cmd_t * cmd, int rc)
97a979
 static void
97a979
 lrmd_stonith_callback(stonith_t * stonith, stonith_callback_data_t * data)
97a979
 {
97a979
-    stonith_action_complete(data->userdata, data->rc);
97a979
+    if ((data == NULL) || (data->userdata == NULL)) {
97a979
+        crm_err("Ignoring fence action result: "
97a979
+                "Invalid callback arguments (bug?)");
97a979
+    } else {
97a979
+        stonith_action_complete((lrmd_cmd_t *) data->userdata,
97a979
+                                stonith__exit_status(data),
97a979
+                                stonith__execution_status(data),
97a979
+                                stonith__exit_reason(data));
97a979
+    }
97a979
 }
97a979
 
97a979
 void
97a979
@@ -1097,7 +1112,9 @@ stonith_connection_failed(void)
97a979
     crm_err("Connection to fencer failed, finalizing %d pending operations",
97a979
             g_list_length(cmd_list));
97a979
     for (cmd_iter = cmd_list; cmd_iter; cmd_iter = cmd_iter->next) {
97a979
-        stonith_action_complete(cmd_iter->data, -ENOTCONN);
97a979
+        stonith_action_complete((lrmd_cmd_t *) cmd_iter->data,
97a979
+                                CRM_EX_ERROR, PCMK_EXEC_NOT_CONNECTED,
97a979
+                                "Lost connection to fencer");
97a979
     }
97a979
     g_list_free(cmd_list);
97a979
 }
97a979
@@ -1210,7 +1227,7 @@ lrmd_rsc_execute_stonith(lrmd_rsc_t * rsc, lrmd_cmd_t * cmd)
97a979
 
97a979
     } else if (pcmk__str_eq(cmd->action, "start", pcmk__str_casei)) {
97a979
         rc = execd_stonith_start(stonith_api, rsc, cmd);
97a979
-        if (rc == 0) {
97a979
+        if (rc == pcmk_ok) {
97a979
             do_monitor = TRUE;
97a979
         }
97a979
 
97a979
@@ -1233,7 +1250,10 @@ lrmd_rsc_execute_stonith(lrmd_rsc_t * rsc, lrmd_cmd_t * cmd)
97a979
         }
97a979
     }
97a979
 
97a979
-    stonith_action_complete(cmd, rc);
97a979
+    stonith_action_complete(cmd,
97a979
+                            ((rc == pcmk_ok)? CRM_EX_OK : CRM_EX_ERROR),
97a979
+                            stonith__legacy2status(rc),
97a979
+                            rc == -pcmk_err_generic? NULL : pcmk_strerror(rc));
97a979
 }
97a979
 
97a979
 static int
97a979
-- 
97a979
2.27.0
97a979
97a979
97a979
From 0cdc8506c2383cf05c2f62ab1ac9438958daf210 Mon Sep 17 00:00:00 2001
97a979
From: Ken Gaillot <kgaillot@redhat.com>
97a979
Date: Mon, 22 Nov 2021 16:15:05 -0600
97a979
Subject: [PATCH 06/17] Fix: executor,scheduler: treat "no secrets" fence
97a979
 results as a hard error
97a979
97a979
Previously, the executor mapped the fencer's PCMK_EXEC_NO_SECRETS status to
97a979
PCMK_EXEC_ERROR to keep handling of that situation the same as before the new
97a979
code was added.
97a979
97a979
However, the earlier handling was less than ideal -- a resource action that
97a979
failed due to missing secrets would be retried on the same node, and almost
97a979
certainly fail again for the same reason. Now, the executor passes along
97a979
PCMK_EXEC_NO_SECRETS to clients; the controller will record the result in the
97a979
CIB status, and the scheduler will treat it as a hard error (i.e. not retrying
97a979
on the same node).
97a979
97a979
Backward compatibility isn't a problem because the scheduler treats unknown
97a979
status codes the same as PCMK_EXEC_ERROR, so an older DC will continue to
97a979
handle it as before. The CRM feature set has been bumped so the handling can't
97a979
flip back and forth in a mixed-version cluster.
97a979
---
97a979
 daemons/execd/execd_commands.c | 1 -
97a979
 include/crm/crm.h              | 4 ++--
97a979
 lib/pengine/unpack.c           | 3 ---
97a979
 3 files changed, 2 insertions(+), 6 deletions(-)
97a979
97a979
diff --git a/daemons/execd/execd_commands.c b/daemons/execd/execd_commands.c
97a979
index e722994012..4ced6d1d5c 100644
97a979
--- a/daemons/execd/execd_commands.c
97a979
+++ b/daemons/execd/execd_commands.c
97a979
@@ -993,7 +993,6 @@ stonith_action_complete(lrmd_cmd_t *cmd, int exit_status,
97a979
         switch (execution_status) {
97a979
             case PCMK_EXEC_NOT_CONNECTED:
97a979
             case PCMK_EXEC_INVALID:
97a979
-            case PCMK_EXEC_NO_SECRETS:
97a979
                 execution_status = PCMK_EXEC_ERROR;
97a979
                 break;
97a979
 
97a979
diff --git a/include/crm/crm.h b/include/crm/crm.h
97a979
index 16b35e9c55..56b07cb12a 100644
97a979
--- a/include/crm/crm.h
97a979
+++ b/include/crm/crm.h
97a979
@@ -1,5 +1,5 @@
97a979
 /*
97a979
- * Copyright 2004-2021 the Pacemaker project contributors
97a979
+ * Copyright 2004-2022 the Pacemaker project contributors
97a979
  *
97a979
  * The version control history for this file may have further details.
97a979
  *
97a979
@@ -66,7 +66,7 @@ extern "C" {
97a979
  * >=3.0.13: Fail counts include operation name and interval
97a979
  * >=3.2.0:  DC supports PCMK_EXEC_INVALID and PCMK_EXEC_NOT_CONNECTED
97a979
  */
97a979
-#  define CRM_FEATURE_SET		"3.12.0"
97a979
+#  define CRM_FEATURE_SET		"3.13.0"
97a979
 
97a979
 /* Pacemaker's CPG protocols use fixed-width binary fields for the sender and
97a979
  * recipient of a CPG message. This imposes an arbitrary limit on cluster node
97a979
diff --git a/lib/pengine/unpack.c b/lib/pengine/unpack.c
97a979
index 3e0384cd2a..8a2d2a6d6d 100644
97a979
--- a/lib/pengine/unpack.c
97a979
+++ b/lib/pengine/unpack.c
97a979
@@ -3879,9 +3879,6 @@ unpack_rsc_op(pe_resource_t *rsc, pe_node_t *node, xmlNode *xml_op,
97a979
         case PCMK_EXEC_INVALID:
97a979
             break; // Not done, do error handling
97a979
 
97a979
-        /* These should only be possible in fence action results, not operation
97a979
-         * history, but have some handling in place as a fail-safe.
97a979
-         */
97a979
         case PCMK_EXEC_NO_FENCE_DEVICE:
97a979
         case PCMK_EXEC_NO_SECRETS:
97a979
             status = PCMK_EXEC_ERROR_HARD;
97a979
-- 
97a979
2.27.0
97a979
97a979
97a979
From 75c1bdcf3ffc406e6fa286fd5fcff83e1e65591a Mon Sep 17 00:00:00 2001
97a979
From: Ken Gaillot <kgaillot@redhat.com>
97a979
Date: Wed, 10 Nov 2021 12:05:20 -0600
97a979
Subject: [PATCH 07/17] Low: executor: improve result for fence device probes
97a979
97a979
Now that lrmd_rsc_execute_stonith() sets a full result instead of just a legacy
97a979
return code, refactor lrmd_rsc_t's st_probe_rc as an execution status (and
97a979
rename to fence_probe_result). Set an appropriate exit reason when available.
97a979
---
97a979
 daemons/execd/execd_commands.c  | 57 ++++++++++++++++++++++++++-------
97a979
 daemons/execd/pacemaker-execd.h |  9 +++++-
97a979
 2 files changed, 54 insertions(+), 12 deletions(-)
97a979
97a979
diff --git a/daemons/execd/execd_commands.c b/daemons/execd/execd_commands.c
97a979
index 4ced6d1d5c..6e5505e973 100644
97a979
--- a/daemons/execd/execd_commands.c
97a979
+++ b/daemons/execd/execd_commands.c
97a979
@@ -285,7 +285,9 @@ build_rsc_from_xml(xmlNode * msg)
97a979
     rsc->provider = crm_element_value_copy(rsc_xml, F_LRMD_PROVIDER);
97a979
     rsc->type = crm_element_value_copy(rsc_xml, F_LRMD_TYPE);
97a979
     rsc->work = mainloop_add_trigger(G_PRIORITY_HIGH, lrmd_rsc_dispatch, rsc);
97a979
-    rsc->st_probe_rc = -ENODEV; // if stonith, initialize to "not running"
97a979
+
97a979
+    // Initialize fence device probes (to return "not running")
97a979
+    rsc->fence_probe_result = PCMK_EXEC_NO_FENCE_DEVICE;
97a979
     return rsc;
97a979
 }
97a979
 
97a979
@@ -1029,10 +1031,10 @@ stonith_action_complete(lrmd_cmd_t *cmd, int exit_status,
97a979
     if ((rsc != NULL) && pcmk__result_ok(&(cmd->result))) {
97a979
 
97a979
         if (pcmk__str_eq(cmd->action, "start", pcmk__str_casei)) {
97a979
-            rsc->st_probe_rc = pcmk_ok; // maps to PCMK_OCF_OK
97a979
+            rsc->fence_probe_result = PCMK_EXEC_DONE; // "running"
97a979
 
97a979
         } else if (pcmk__str_eq(cmd->action, "stop", pcmk__str_casei)) {
97a979
-            rsc->st_probe_rc = -ENODEV; // maps to PCMK_OCF_NOT_RUNNING
97a979
+            rsc->fence_probe_result = PCMK_EXEC_NO_FENCE_DEVICE; // "not running"
97a979
         }
97a979
     }
97a979
 
97a979
@@ -1081,14 +1083,13 @@ stonith_connection_failed(void)
97a979
         if (pcmk__str_eq(rsc->class, PCMK_RESOURCE_CLASS_STONITH, pcmk__str_casei)) {
97a979
             /* If we registered this fence device, we don't know whether the
97a979
              * fencer still has the registration or not. Cause future probes to
97a979
-             * return PCMK_OCF_UNKNOWN_ERROR until the resource is stopped or
97a979
-             * started successfully. This is especially important if the
97a979
-             * controller also went away (possibly due to a cluster layer
97a979
-             * restart) and won't receive our client notification of any
97a979
-             * monitors finalized below.
97a979
+             * return an error until the resource is stopped or started
97a979
+             * successfully. This is especially important if the controller also
97a979
+             * went away (possibly due to a cluster layer restart) and won't
97a979
+             * receive our client notification of any monitors finalized below.
97a979
              */
97a979
-            if (rsc->st_probe_rc == pcmk_ok) {
97a979
-                rsc->st_probe_rc = pcmk_err_generic;
97a979
+            if (rsc->fence_probe_result == PCMK_EXEC_DONE) {
97a979
+                rsc->fence_probe_result = PCMK_EXEC_NOT_CONNECTED;
97a979
             }
97a979
 
97a979
             if (rsc->active) {
97a979
@@ -1213,6 +1214,39 @@ execd_stonith_monitor(stonith_t *stonith_api, lrmd_rsc_t *rsc, lrmd_cmd_t *cmd)
97a979
     return rc;
97a979
 }
97a979
 
97a979
+/*!
97a979
+ * \internal
97a979
+ * \brief  Finalize the result of a fence device probe
97a979
+ *
97a979
+ * \param[in] cmd           Probe action
97a979
+ * \param[in] probe_result  Probe result
97a979
+ */
97a979
+static void
97a979
+finalize_fence_device_probe(lrmd_cmd_t *cmd, enum pcmk_exec_status probe_result)
97a979
+{
97a979
+    int exit_status = CRM_EX_ERROR;
97a979
+    const char *reason = NULL;
97a979
+
97a979
+    switch (probe_result) {
97a979
+        case PCMK_EXEC_DONE: // Device is "running"
97a979
+            exit_status = CRM_EX_OK;
97a979
+            break;
97a979
+
97a979
+        case PCMK_EXEC_NO_FENCE_DEVICE: // Device is "not running"
97a979
+            break;
97a979
+
97a979
+        case PCMK_EXEC_NOT_CONNECTED: // stonith_connection_failed()
97a979
+            reason = "Lost connection to fencer";
97a979
+            break;
97a979
+
97a979
+        default: // Shouldn't be possible
97a979
+            probe_result = PCMK_EXEC_ERROR;
97a979
+            reason = "Invalid fence device probe result (bug?)";
97a979
+            break;
97a979
+    }
97a979
+    stonith_action_complete(cmd, exit_status, probe_result, reason);
97a979
+}
97a979
+
97a979
 static void
97a979
 lrmd_rsc_execute_stonith(lrmd_rsc_t * rsc, lrmd_cmd_t * cmd)
97a979
 {
97a979
@@ -1237,7 +1271,8 @@ lrmd_rsc_execute_stonith(lrmd_rsc_t * rsc, lrmd_cmd_t * cmd)
97a979
         if (cmd->interval_ms > 0) {
97a979
             do_monitor = TRUE;
97a979
         } else {
97a979
-            rc = rsc->st_probe_rc;
97a979
+            finalize_fence_device_probe(cmd, rsc->fence_probe_result);
97a979
+            return;
97a979
         }
97a979
     }
97a979
 
97a979
diff --git a/daemons/execd/pacemaker-execd.h b/daemons/execd/pacemaker-execd.h
97a979
index 51ef8d22e6..057d889584 100644
97a979
--- a/daemons/execd/pacemaker-execd.h
97a979
+++ b/daemons/execd/pacemaker-execd.h
97a979
@@ -41,7 +41,14 @@ typedef struct lrmd_rsc_s {
97a979
      * that have been handed off from the pending ops list. */
97a979
     GList *recurring_ops;
97a979
 
97a979
-    int st_probe_rc; // What value should be returned for a probe if stonith
97a979
+    /* If this resource is a fence device, probes are handled internally by the
97a979
+     * executor, and this value indicates the result that should currently be
97a979
+     * returned for probes. It should be one of:
97a979
+     * PCMK_EXEC_DONE (to indicate "running"),
97a979
+     * PCMK_EXEC_NO_FENCE_DEVICE ("not running"), or
97a979
+     * PCMK_EXEC_NOT_CONNECTED ("unknown because fencer connection was lost").
97a979
+     */
97a979
+    enum pcmk_exec_status fence_probe_result;
97a979
 
97a979
     crm_trigger_t *work;
97a979
 } lrmd_rsc_t;
97a979
-- 
97a979
2.27.0
97a979
97a979
97a979
From 1ab799d945171ab8d91bd0aada64e70a71193e5c Mon Sep 17 00:00:00 2001
97a979
From: Ken Gaillot <kgaillot@redhat.com>
97a979
Date: Wed, 10 Nov 2021 12:14:48 -0600
97a979
Subject: [PATCH 08/17] Low: executor: don't require a fencer connection for
97a979
 probes
97a979
97a979
For fence devices, probe results are based on earlier state determinations,
97a979
so handle them before requiring an active fencer connection. The effect may be
97a979
negligible, but it would allow probes to proceed while waiting for a
97a979
reconnection.
97a979
---
97a979
 daemons/execd/execd_commands.c | 15 ++++++++-------
97a979
 1 file changed, 8 insertions(+), 7 deletions(-)
97a979
97a979
diff --git a/daemons/execd/execd_commands.c b/daemons/execd/execd_commands.c
97a979
index 6e5505e973..5999ba19c9 100644
97a979
--- a/daemons/execd/execd_commands.c
97a979
+++ b/daemons/execd/execd_commands.c
97a979
@@ -1255,7 +1255,13 @@ lrmd_rsc_execute_stonith(lrmd_rsc_t * rsc, lrmd_cmd_t * cmd)
97a979
 
97a979
     stonith_t *stonith_api = get_stonith_connection();
97a979
 
97a979
-    if (!stonith_api) {
97a979
+    if (pcmk__str_eq(cmd->action, "monitor", pcmk__str_casei)
97a979
+        && (cmd->interval_ms == 0)) {
97a979
+        // Probes don't require a fencer connection
97a979
+        finalize_fence_device_probe(cmd, rsc->fence_probe_result);
97a979
+        return;
97a979
+
97a979
+    } else if (stonith_api == NULL) {
97a979
         rc = -ENOTCONN;
97a979
 
97a979
     } else if (pcmk__str_eq(cmd->action, "start", pcmk__str_casei)) {
97a979
@@ -1268,12 +1274,7 @@ lrmd_rsc_execute_stonith(lrmd_rsc_t * rsc, lrmd_cmd_t * cmd)
97a979
         rc = execd_stonith_stop(stonith_api, rsc);
97a979
 
97a979
     } else if (pcmk__str_eq(cmd->action, "monitor", pcmk__str_casei)) {
97a979
-        if (cmd->interval_ms > 0) {
97a979
-            do_monitor = TRUE;
97a979
-        } else {
97a979
-            finalize_fence_device_probe(cmd, rsc->fence_probe_result);
97a979
-            return;
97a979
-        }
97a979
+        do_monitor = TRUE;
97a979
     }
97a979
 
97a979
     if (do_monitor) {
97a979
-- 
97a979
2.27.0
97a979
97a979
97a979
From adf41fb1637bcc9a6e057be52d61a0b26e4535cc Mon Sep 17 00:00:00 2001
97a979
From: Ken Gaillot <kgaillot@redhat.com>
97a979
Date: Wed, 10 Nov 2021 12:20:34 -0600
97a979
Subject: [PATCH 09/17] Low: executor: return an error for unsupported fence
97a979
 device actions
97a979
97a979
... and set an exit reason. Previously, it would return success for unsupported
97a979
actions. It shouldn't be possible, but it would be nice to have an indication
97a979
of what is wrong if a bug is introduced.
97a979
---
97a979
 daemons/execd/execd_commands.c | 6 ++++++
97a979
 1 file changed, 6 insertions(+)
97a979
97a979
diff --git a/daemons/execd/execd_commands.c b/daemons/execd/execd_commands.c
97a979
index 5999ba19c9..772d6446dc 100644
97a979
--- a/daemons/execd/execd_commands.c
97a979
+++ b/daemons/execd/execd_commands.c
97a979
@@ -1275,6 +1275,12 @@ lrmd_rsc_execute_stonith(lrmd_rsc_t * rsc, lrmd_cmd_t * cmd)
97a979
 
97a979
     } else if (pcmk__str_eq(cmd->action, "monitor", pcmk__str_casei)) {
97a979
         do_monitor = TRUE;
97a979
+
97a979
+    } else {
97a979
+        stonith_action_complete(cmd, PCMK_OCF_UNIMPLEMENT_FEATURE,
97a979
+                                PCMK_EXEC_ERROR,
97a979
+                                "Invalid fence device action (bug?)");
97a979
+        return;
97a979
     }
97a979
 
97a979
     if (do_monitor) {
97a979
-- 
97a979
2.27.0
97a979
97a979
97a979
From af59dfe85bc83f5609d0a3b3b7939271549cb76f Mon Sep 17 00:00:00 2001
97a979
From: Ken Gaillot <kgaillot@redhat.com>
97a979
Date: Wed, 10 Nov 2021 12:24:07 -0600
97a979
Subject: [PATCH 10/17] Low: executor: set exit reason if no fencer connection
97a979
97a979
---
97a979
 daemons/execd/execd_commands.c | 5 ++++-
97a979
 1 file changed, 4 insertions(+), 1 deletion(-)
97a979
97a979
diff --git a/daemons/execd/execd_commands.c b/daemons/execd/execd_commands.c
97a979
index 772d6446dc..7ae309d94c 100644
97a979
--- a/daemons/execd/execd_commands.c
97a979
+++ b/daemons/execd/execd_commands.c
97a979
@@ -1262,7 +1262,10 @@ lrmd_rsc_execute_stonith(lrmd_rsc_t * rsc, lrmd_cmd_t * cmd)
97a979
         return;
97a979
 
97a979
     } else if (stonith_api == NULL) {
97a979
-        rc = -ENOTCONN;
97a979
+        stonith_action_complete(cmd, PCMK_OCF_UNKNOWN_ERROR,
97a979
+                                PCMK_EXEC_NOT_CONNECTED,
97a979
+                                "No connection to fencer");
97a979
+        return;
97a979
 
97a979
     } else if (pcmk__str_eq(cmd->action, "start", pcmk__str_casei)) {
97a979
         rc = execd_stonith_start(stonith_api, rsc, cmd);
97a979
-- 
97a979
2.27.0
97a979
97a979
97a979
From ad0930b75d5617490c3a0dc3c6b83411b3c4536d Mon Sep 17 00:00:00 2001
97a979
From: Ken Gaillot <kgaillot@redhat.com>
97a979
Date: Wed, 10 Nov 2021 14:42:26 -0600
97a979
Subject: [PATCH 11/17] Test: cts-fence-helper: log full result in fence
97a979
 callback
97a979
97a979
---
97a979
 daemons/fenced/cts-fence-helper.c | 7 +++++--
97a979
 1 file changed, 5 insertions(+), 2 deletions(-)
97a979
97a979
diff --git a/daemons/fenced/cts-fence-helper.c b/daemons/fenced/cts-fence-helper.c
97a979
index 2adb032f24..c2b55d73b9 100644
97a979
--- a/daemons/fenced/cts-fence-helper.c
97a979
+++ b/daemons/fenced/cts-fence-helper.c
97a979
@@ -1,5 +1,5 @@
97a979
 /*
97a979
- * Copyright 2009-2020 the Pacemaker project contributors
97a979
+ * Copyright 2009-2021 the Pacemaker project contributors
97a979
  *
97a979
  * This source code is licensed under the GNU General Public License version 2
97a979
  * or later (GPLv2+) WITHOUT ANY WARRANTY.
97a979
@@ -132,7 +132,10 @@ st_callback(stonith_t * st, stonith_event_t * e)
97a979
 static void
97a979
 st_global_callback(stonith_t * stonith, stonith_callback_data_t * data)
97a979
 {
97a979
-    crm_notice("Call id %d completed with rc %d", data->call_id, data->rc);
97a979
+    crm_notice("Call %d exited %d: %s (%s)",
97a979
+               data->call_id, stonith__exit_status(data),
97a979
+               pcmk_exec_status_str(stonith__execution_status(data)),
97a979
+               crm_str(stonith__exit_reason(data)));
97a979
 }
97a979
 
97a979
 static void
97a979
-- 
97a979
2.27.0
97a979
97a979
97a979
From 1b50ff4d83b7a96cd70389891b7b6568812f66f6 Mon Sep 17 00:00:00 2001
97a979
From: Ken Gaillot <kgaillot@redhat.com>
97a979
Date: Wed, 10 Nov 2021 15:10:14 -0600
97a979
Subject: [PATCH 12/17] Test: cts-fence-helper: track full result instead of
97a979
 legacy return code
97a979
97a979
---
97a979
 daemons/fenced/cts-fence-helper.c | 77 +++++++++++++++----------------
97a979
 1 file changed, 37 insertions(+), 40 deletions(-)
97a979
97a979
diff --git a/daemons/fenced/cts-fence-helper.c b/daemons/fenced/cts-fence-helper.c
97a979
index c2b55d73b9..2739f57804 100644
97a979
--- a/daemons/fenced/cts-fence-helper.c
97a979
+++ b/daemons/fenced/cts-fence-helper.c
97a979
@@ -34,23 +34,12 @@
97a979
 static GMainLoop *mainloop = NULL;
97a979
 static crm_trigger_t *trig = NULL;
97a979
 static int mainloop_iter = 0;
97a979
-static int callback_rc = 0;
97a979
+static pcmk__action_result_t result = PCMK__UNKNOWN_RESULT;
97a979
+
97a979
 typedef void (*mainloop_test_iteration_cb) (int check_event);
97a979
 
97a979
 #define MAINLOOP_DEFAULT_TIMEOUT 2
97a979
 
97a979
-#define mainloop_test_done(pass) \
97a979
-    if (pass) { \
97a979
-        crm_info("SUCCESS - %s", __func__); \
97a979
-        mainloop_iter++;   \
97a979
-        mainloop_set_trigger(trig);  \
97a979
-    } else { \
97a979
-        crm_err("FAILURE = %s async_callback %d", __func__, callback_rc); \
97a979
-        crm_exit(CRM_EX_ERROR); \
97a979
-    } \
97a979
-    callback_rc = 0; \
97a979
-
97a979
-
97a979
 enum test_modes {
97a979
     test_standard = 0,  // test using a specific developer environment
97a979
     test_passive,       // watch notifications only
97a979
@@ -93,6 +82,23 @@ static const int st_opts = st_opt_sync_call;
97a979
 static int expected_notifications = 0;
97a979
 static int verbose = 0;
97a979
 
97a979
+static void
97a979
+mainloop_test_done(const char *origin, bool pass)
97a979
+{
97a979
+    if (pass) {
97a979
+        crm_info("SUCCESS - %s", origin);
97a979
+        mainloop_iter++;
97a979
+        mainloop_set_trigger(trig);
97a979
+        result.execution_status = PCMK_EXEC_UNKNOWN;
97a979
+        result.exit_status = CRM_EX_OK;
97a979
+    } else {
97a979
+        crm_err("FAILURE - %s (%d: %s)", origin, result.exit_status,
97a979
+                pcmk_exec_status_str(result.execution_status));
97a979
+        crm_exit(CRM_EX_ERROR);
97a979
+    }
97a979
+}
97a979
+
97a979
+
97a979
 static void
97a979
 dispatch_helper(int timeout)
97a979
 {
97a979
@@ -385,7 +391,9 @@ static void
97a979
 static void
97a979
 mainloop_callback(stonith_t * stonith, stonith_callback_data_t * data)
97a979
 {
97a979
-    callback_rc = data->rc;
97a979
+    pcmk__set_result(&result, stonith__exit_status(data),
97a979
+                     stonith__execution_status(data),
97a979
+                     stonith__exit_reason(data));
97a979
     iterate_mainloop_tests(TRUE);
97a979
 }
97a979
 
97a979
@@ -404,18 +412,14 @@ test_async_fence_pass(int check_event)
97a979
     int rc = 0;
97a979
 
97a979
     if (check_event) {
97a979
-        if (callback_rc != 0) {
97a979
-            mainloop_test_done(FALSE);
97a979
-        } else {
97a979
-            mainloop_test_done(TRUE);
97a979
-        }
97a979
+        mainloop_test_done(__func__, (result.exit_status == CRM_EX_OK));
97a979
         return;
97a979
     }
97a979
 
97a979
     rc = st->cmds->fence(st, 0, "true_1_node1", "off", MAINLOOP_DEFAULT_TIMEOUT, 0);
97a979
     if (rc < 0) {
97a979
         crm_err("fence failed with rc %d", rc);
97a979
-        mainloop_test_done(FALSE);
97a979
+        mainloop_test_done(__func__, false);
97a979
     }
97a979
     register_callback_helper(rc);
97a979
     /* wait for event */
97a979
@@ -431,15 +435,15 @@ test_async_fence_custom_timeout(int check_event)
97a979
     if (check_event) {
97a979
         uint32_t diff = (time(NULL) - begin);
97a979
 
97a979
-        if (callback_rc != -ETIME) {
97a979
-            mainloop_test_done(FALSE);
97a979
+        if (result.execution_status != PCMK_EXEC_TIMEOUT) {
97a979
+            mainloop_test_done(__func__, false);
97a979
         } else if (diff < CUSTOM_TIMEOUT_ADDITION + MAINLOOP_DEFAULT_TIMEOUT) {
97a979
             crm_err
97a979
                 ("Custom timeout test failed, callback expiration should be updated to %d, actual timeout was %d",
97a979
                  CUSTOM_TIMEOUT_ADDITION + MAINLOOP_DEFAULT_TIMEOUT, diff);
97a979
-            mainloop_test_done(FALSE);
97a979
+            mainloop_test_done(__func__, false);
97a979
         } else {
97a979
-            mainloop_test_done(TRUE);
97a979
+            mainloop_test_done(__func__, true);
97a979
         }
97a979
         return;
97a979
     }
97a979
@@ -448,7 +452,7 @@ test_async_fence_custom_timeout(int check_event)
97a979
     rc = st->cmds->fence(st, 0, "custom_timeout_node1", "off", MAINLOOP_DEFAULT_TIMEOUT, 0);
97a979
     if (rc < 0) {
97a979
         crm_err("fence failed with rc %d", rc);
97a979
-        mainloop_test_done(FALSE);
97a979
+        mainloop_test_done(__func__, false);
97a979
     }
97a979
     register_callback_helper(rc);
97a979
     /* wait for event */
97a979
@@ -460,18 +464,15 @@ test_async_fence_timeout(int check_event)
97a979
     int rc = 0;
97a979
 
97a979
     if (check_event) {
97a979
-        if (callback_rc != -ENODEV) {
97a979
-            mainloop_test_done(FALSE);
97a979
-        } else {
97a979
-            mainloop_test_done(TRUE);
97a979
-        }
97a979
+        mainloop_test_done(__func__,
97a979
+                           (result.execution_status == PCMK_EXEC_NO_FENCE_DEVICE));
97a979
         return;
97a979
     }
97a979
 
97a979
     rc = st->cmds->fence(st, 0, "false_1_node2", "off", MAINLOOP_DEFAULT_TIMEOUT, 0);
97a979
     if (rc < 0) {
97a979
         crm_err("fence failed with rc %d", rc);
97a979
-        mainloop_test_done(FALSE);
97a979
+        mainloop_test_done(__func__, false);
97a979
     }
97a979
     register_callback_helper(rc);
97a979
     /* wait for event */
97a979
@@ -483,18 +484,14 @@ test_async_monitor(int check_event)
97a979
     int rc = 0;
97a979
 
97a979
     if (check_event) {
97a979
-        if (callback_rc) {
97a979
-            mainloop_test_done(FALSE);
97a979
-        } else {
97a979
-            mainloop_test_done(TRUE);
97a979
-        }
97a979
+        mainloop_test_done(__func__, (result.exit_status == CRM_EX_OK));
97a979
         return;
97a979
     }
97a979
 
97a979
     rc = st->cmds->monitor(st, 0, "false_1", MAINLOOP_DEFAULT_TIMEOUT);
97a979
     if (rc < 0) {
97a979
         crm_err("monitor failed with rc %d", rc);
97a979
-        mainloop_test_done(FALSE);
97a979
+        mainloop_test_done(__func__, false);
97a979
     }
97a979
 
97a979
     register_callback_helper(rc);
97a979
@@ -531,7 +528,7 @@ test_register_async_devices(int check_event)
97a979
                               params);
97a979
     stonith_key_value_freeall(params, 1, 1);
97a979
 
97a979
-    mainloop_test_done(TRUE);
97a979
+    mainloop_test_done(__func__, true);
97a979
 }
97a979
 
97a979
 static void
97a979
@@ -540,11 +537,11 @@ try_mainloop_connect(int check_event)
97a979
     int rc = stonith_api_connect_retry(st, crm_system_name, 10);
97a979
 
97a979
     if (rc == pcmk_ok) {
97a979
-        mainloop_test_done(TRUE);
97a979
+        mainloop_test_done(__func__, true);
97a979
         return;
97a979
     }
97a979
     crm_err("API CONNECTION FAILURE");
97a979
-    mainloop_test_done(FALSE);
97a979
+    mainloop_test_done(__func__, false);
97a979
 }
97a979
 
97a979
 static void
97a979
-- 
97a979
2.27.0
97a979
97a979
97a979
From 8ff4b384a34828a4a9eebe896324ba8c89e5d66c Mon Sep 17 00:00:00 2001
97a979
From: Ken Gaillot <kgaillot@redhat.com>
97a979
Date: Mon, 10 Jan 2022 10:27:45 -0600
97a979
Subject: [PATCH 13/17] Doc: Pacemaker Development: correct typo
97a979
97a979
caught in review
97a979
---
97a979
 doc/sphinx/Pacemaker_Development/components.rst | 2 +-
97a979
 1 file changed, 1 insertion(+), 1 deletion(-)
97a979
97a979
diff --git a/doc/sphinx/Pacemaker_Development/components.rst b/doc/sphinx/Pacemaker_Development/components.rst
97a979
index 68158484ce..c4d10fc9f5 100644
97a979
--- a/doc/sphinx/Pacemaker_Development/components.rst
97a979
+++ b/doc/sphinx/Pacemaker_Development/components.rst
97a979
@@ -171,7 +171,7 @@ messaging layer callback, which calls:
97a979
 
97a979
     * ``fenced_process_fencing_reply()``, which calls either
97a979
       ``request_peer_fencing()`` (to retry a failed operation, or try the next
97a979
-      device in a topology is appropriate, which issues a new
97a979
+      device in a topology if appropriate, which issues a new
97a979
       ``STONITH_OP_FENCE`` request, proceeding as before) or
97a979
       ``finalize_op()`` (if the operation is definitively failed or
97a979
       successful).
97a979
-- 
97a979
2.27.0
97a979
97a979
97a979
From 822ee6fbd8583a2939c636b3bccceffcc338c567 Mon Sep 17 00:00:00 2001
97a979
From: Ken Gaillot <kgaillot@redhat.com>
97a979
Date: Mon, 10 Jan 2022 11:05:40 -0600
97a979
Subject: [PATCH 14/17] Doc: Pacemaker Development: add a placeholder for how
97a979
 fencing history works
97a979
97a979
---
97a979
 doc/sphinx/Pacemaker_Development/components.rst | 15 +++++++++++++++
97a979
 1 file changed, 15 insertions(+)
97a979
97a979
diff --git a/doc/sphinx/Pacemaker_Development/components.rst b/doc/sphinx/Pacemaker_Development/components.rst
97a979
index c4d10fc9f5..760da77c9b 100644
97a979
--- a/doc/sphinx/Pacemaker_Development/components.rst
97a979
+++ b/doc/sphinx/Pacemaker_Development/components.rst
97a979
@@ -183,6 +183,21 @@ Finally, all peers receive the broadcast result and call
97a979
 * ``finalize_op()``, which sends the result to all local clients.
97a979
 
97a979
 
97a979
+.. index::
97a979
+   single: fence history
97a979
+
97a979
+Fencing History
97a979
+_______________
97a979
+
97a979
+The fencer keeps a running history of all fencing operations. The bulk of the
97a979
+relevant code is in `fenced_history.c` and ensures the history is synchronized
97a979
+across all nodes even if a node leaves and rejoins the cluster.
97a979
+
97a979
+In libstonithd, this information is represented by `stonith_history_t` and is
97a979
+queryable by the `stonith_api_operations_t:history()` method. `crm_mon` and
97a979
+`stonith_admin` use this API to display the history.
97a979
+
97a979
+
97a979
 .. index::
97a979
    single: scheduler
97a979
    single: pacemaker-schedulerd
97a979
-- 
97a979
2.27.0
97a979
97a979
97a979
From d9b4060f2dadb40d5ee7535e0b2890a83d216c1e Mon Sep 17 00:00:00 2001
97a979
From: Ken Gaillot <kgaillot@redhat.com>
97a979
Date: Mon, 10 Jan 2022 11:25:31 -0600
97a979
Subject: [PATCH 15/17] Log: fencing: add exit reason for results without a
97a979
 callback
97a979
97a979
---
97a979
 lib/fencing/st_client.c | 6 ++++--
97a979
 1 file changed, 4 insertions(+), 2 deletions(-)
97a979
97a979
diff --git a/lib/fencing/st_client.c b/lib/fencing/st_client.c
97a979
index 9d93ffd481..4823751267 100644
97a979
--- a/lib/fencing/st_client.c
97a979
+++ b/lib/fencing/st_client.c
97a979
@@ -926,9 +926,11 @@ invoke_registered_callbacks(stonith_t *stonith, xmlNode *msg, int call_id)
97a979
                                      cb_info->user_data, cb_info->callback);
97a979
 
97a979
     } else if ((private->op_callback == NULL) && !pcmk__result_ok(&result)) {
97a979
-        crm_warn("Fencing action without registered callback failed: %d (%s)",
97a979
+        crm_warn("Fencing action without registered callback failed: %d (%s%s%s)",
97a979
                  result.exit_status,
97a979
-                 pcmk_exec_status_str(result.execution_status));
97a979
+                 pcmk_exec_status_str(result.execution_status),
97a979
+                 ((result.exit_reason == NULL)? "" : ": "),
97a979
+                 ((result.exit_reason == NULL)? "" : result.exit_reason));
97a979
         crm_log_xml_debug(msg, "Failed fence update");
97a979
     }
97a979
 
97a979
-- 
97a979
2.27.0
97a979
97a979
97a979
From 9956b3ad2f1c6fba305252616ad0b35a38ab96da Mon Sep 17 00:00:00 2001
97a979
From: Ken Gaillot <kgaillot@redhat.com>
97a979
Date: Tue, 11 Jan 2022 09:28:27 -0600
97a979
Subject: [PATCH 16/17] Refactor: executor: keep formatting consistent
97a979
97a979
... even if the line runs a little long
97a979
---
97a979
 daemons/execd/execd_commands.c | 4 ++--
97a979
 1 file changed, 2 insertions(+), 2 deletions(-)
97a979
97a979
diff --git a/daemons/execd/execd_commands.c b/daemons/execd/execd_commands.c
97a979
index 7ae309d94c..bc3b392b2c 100644
97a979
--- a/daemons/execd/execd_commands.c
97a979
+++ b/daemons/execd/execd_commands.c
97a979
@@ -1,5 +1,5 @@
97a979
 /*
97a979
- * Copyright 2012-2021 the Pacemaker project contributors
97a979
+ * Copyright 2012-2022 the Pacemaker project contributors
97a979
  *
97a979
  * The version control history for this file may have further details.
97a979
  *
97a979
@@ -1297,7 +1297,7 @@ lrmd_rsc_execute_stonith(lrmd_rsc_t * rsc, lrmd_cmd_t * cmd)
97a979
     stonith_action_complete(cmd,
97a979
                             ((rc == pcmk_ok)? CRM_EX_OK : CRM_EX_ERROR),
97a979
                             stonith__legacy2status(rc),
97a979
-                            rc == -pcmk_err_generic? NULL : pcmk_strerror(rc));
97a979
+                            ((rc == -pcmk_err_generic)? NULL : pcmk_strerror(rc)));
97a979
 }
97a979
 
97a979
 static int
97a979
-- 
97a979
2.27.0
97a979
97a979
97a979
From 69d8ecb17568d6c3ecad0e5735756f58a4bce5a1 Mon Sep 17 00:00:00 2001
97a979
From: Ken Gaillot <kgaillot@redhat.com>
97a979
Date: Tue, 11 Jan 2022 09:29:03 -0600
97a979
Subject: [PATCH 17/17] Test: cts-fence-helper: use more intuitive execution
97a979
 status for completed tests
97a979
97a979
It doesn't matter since the value is only checked against a couple of specific
97a979
failure values, but this is less confusing.
97a979
---
97a979
 daemons/fenced/cts-fence-helper.c | 4 ++--
97a979
 1 file changed, 2 insertions(+), 2 deletions(-)
97a979
97a979
diff --git a/daemons/fenced/cts-fence-helper.c b/daemons/fenced/cts-fence-helper.c
97a979
index 2739f57804..e222a59f9f 100644
97a979
--- a/daemons/fenced/cts-fence-helper.c
97a979
+++ b/daemons/fenced/cts-fence-helper.c
97a979
@@ -1,5 +1,5 @@
97a979
 /*
97a979
- * Copyright 2009-2021 the Pacemaker project contributors
97a979
+ * Copyright 2009-2022 the Pacemaker project contributors
97a979
  *
97a979
  * This source code is licensed under the GNU General Public License version 2
97a979
  * or later (GPLv2+) WITHOUT ANY WARRANTY.
97a979
@@ -89,7 +89,7 @@ mainloop_test_done(const char *origin, bool pass)
97a979
         crm_info("SUCCESS - %s", origin);
97a979
         mainloop_iter++;
97a979
         mainloop_set_trigger(trig);
97a979
-        result.execution_status = PCMK_EXEC_UNKNOWN;
97a979
+        result.execution_status = PCMK_EXEC_DONE;
97a979
         result.exit_status = CRM_EX_OK;
97a979
     } else {
97a979
         crm_err("FAILURE - %s (%d: %s)", origin, result.exit_status,
97a979
-- 
97a979
2.27.0
97a979