Blob Blame History Raw
From d6e5e62f7c084fc0aabfae7e6391b4b59f63252f Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Tue, 13 Sep 2022 14:40:24 -0500
Subject: [PATCH 01/24] Low: fencing: use a default timeout with metadata and
 validate

If the caller did not specify a timeout, use a default in
stonith_api_operations_t:metadata() and validate(). (Timeout is currently
ignored past that point, so this has no effect yet.)

Also, rename timeout argument for clarity.
---
 lib/fencing/st_client.c | 23 ++++++++++++++++-------
 1 file changed, 16 insertions(+), 7 deletions(-)

diff --git a/lib/fencing/st_client.c b/lib/fencing/st_client.c
index 2b0d308..28791ff 100644
--- a/lib/fencing/st_client.c
+++ b/lib/fencing/st_client.c
@@ -504,7 +504,8 @@ stonith_api_device_list(stonith_t * stonith, int call_options, const char *names
 
 static int
 stonith_api_device_metadata(stonith_t * stonith, int call_options, const char *agent,
-                            const char *namespace, char **output, int timeout)
+                            const char *namespace, char **output,
+                            int timeout_sec)
 {
     /* By executing meta-data directly, we can get it from stonith_admin when
      * the cluster is not running, which is important for higher-level tools.
@@ -512,16 +513,20 @@ stonith_api_device_metadata(stonith_t * stonith, int call_options, const char *a
 
     enum stonith_namespace ns = stonith_get_namespace(agent, namespace);
 
+    if (timeout_sec <= 0) {
+        timeout_sec = CRMD_METADATA_CALL_TIMEOUT;
+    }
+
     crm_trace("Looking up metadata for %s agent %s",
               stonith_namespace2text(ns), agent);
 
     switch (ns) {
         case st_namespace_rhcs:
-            return stonith__rhcs_metadata(agent, timeout, output);
+            return stonith__rhcs_metadata(agent, timeout_sec, output);
 
 #if HAVE_STONITH_STONITH_H
         case st_namespace_lha:
-            return stonith__lha_metadata(agent, timeout, output);
+            return stonith__lha_metadata(agent, timeout_sec, output);
 #endif
 
         default:
@@ -1684,8 +1689,8 @@ stonith_api_delete(stonith_t * stonith)
 static int
 stonith_api_validate(stonith_t *st, int call_options, const char *rsc_id,
                      const char *namespace_s, const char *agent,
-                     stonith_key_value_t *params, int timeout, char **output,
-                     char **error_output)
+                     stonith_key_value_t *params, int timeout_sec,
+                     char **output, char **error_output)
 {
     /* Validation should be done directly via the agent, so we can get it from
      * stonith_admin when the cluster is not running, which is important for
@@ -1731,17 +1736,21 @@ stonith_api_validate(stonith_t *st, int call_options, const char *rsc_id,
         *error_output = NULL;
     }
 
+    if (timeout_sec <= 0) {
+        timeout_sec = CRMD_METADATA_CALL_TIMEOUT; // Questionable
+    }
+
     switch (stonith_get_namespace(agent, namespace_s)) {
         case st_namespace_rhcs:
             rc = stonith__rhcs_validate(st, call_options, target, agent,
-                                        params_table, host_arg, timeout,
+                                        params_table, host_arg, timeout_sec,
                                         output, error_output);
             break;
 
 #if HAVE_STONITH_STONITH_H
         case st_namespace_lha:
             rc = stonith__lha_validate(st, call_options, target, agent,
-                                       params_table, timeout, output,
+                                       params_table, timeout_sec, output,
                                        error_output);
             break;
 #endif
-- 
2.31.1

From 7404fc1238253b80eb97fd81af123c3f5aa1fde8 Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Tue, 13 Sep 2022 14:00:00 -0500
Subject: [PATCH 02/24] Doc: fencer: improve
 stonith_api_operations_t:metadata() description

---
 include/crm/stonith-ng.h | 15 +++++++++++----
 lib/fencing/st_client.c  |  7 ++++---
 2 files changed, 15 insertions(+), 7 deletions(-)

diff --git a/include/crm/stonith-ng.h b/include/crm/stonith-ng.h
index 4fe52ef..a41d411 100644
--- a/include/crm/stonith-ng.h
+++ b/include/crm/stonith-ng.h
@@ -206,14 +206,21 @@ typedef struct stonith_api_operations_s
         stonith_t *st, int options, const char *node, int level, stonith_key_value_t *device_list);
 
     /*!
-     * \brief Get the metadata documentation for a resource.
+     * \brief Retrieve a fence agent's metadata
      *
-     * \note Value is returned in output.  Output must be freed when set.
+     * \param[in,out] stonith       Fencer connection
+     * \param[in]     call_options  Group of enum stonith_call_options
+     *                              (currently ignored)
+     * \param[in]     agent         Fence agent to query
+     * \param[in]     namespace     Namespace of fence agent to query (optional)
+     * \param[out]    output        Where to store metadata
+     * \param[in]     timeout_sec   Error if not complete within this time
      *
      * \return Legacy Pacemaker return code
+     * \note The caller is responsible for freeing *output using free().
      */
-    int (*metadata)(stonith_t *st, int options,
-            const char *device, const char *provider, char **output, int timeout);
+    int (*metadata)(stonith_t *stonith, int call_options, const char *agent,
+                    const char *namespace, char **output, int timeout_sec);
 
     /*!
      * \brief Retrieve a list of installed stonith agents
diff --git a/lib/fencing/st_client.c b/lib/fencing/st_client.c
index 28791ff..6c252bc 100644
--- a/lib/fencing/st_client.c
+++ b/lib/fencing/st_client.c
@@ -502,10 +502,11 @@ stonith_api_device_list(stonith_t * stonith, int call_options, const char *names
     return count;
 }
 
+// See stonith_api_operations_t:metadata() documentation
 static int
-stonith_api_device_metadata(stonith_t * stonith, int call_options, const char *agent,
-                            const char *namespace, char **output,
-                            int timeout_sec)
+stonith_api_device_metadata(stonith_t *stonith, int call_options,
+                            const char *agent, const char *namespace,
+                            char **output, int timeout_sec)
 {
     /* By executing meta-data directly, we can get it from stonith_admin when
      * the cluster is not running, which is important for higher-level tools.
-- 
2.31.1

From 7b5d1610231252e209a57ef6d82a23e3667812a0 Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Tue, 13 Sep 2022 14:16:54 -0500
Subject: [PATCH 03/24] Doc: fencing: add doxygen block for
 stonith__action_create()

... and rename a couple arguments for clarity
---
 include/crm/fencing/internal.h |  4 ++--
 lib/fencing/st_actions.c       | 33 ++++++++++++++++++++++++---------
 2 files changed, 26 insertions(+), 11 deletions(-)

diff --git a/include/crm/fencing/internal.h b/include/crm/fencing/internal.h
index d2b49f8..e2ca85e 100644
--- a/include/crm/fencing/internal.h
+++ b/include/crm/fencing/internal.h
@@ -50,10 +50,10 @@ struct stonith_action_s;
 typedef struct stonith_action_s stonith_action_t;
 
 stonith_action_t *stonith_action_create(const char *agent,
-                                        const char *_action,
+                                        const char *action_name,
                                         const char *victim,
                                         uint32_t victim_nodeid,
-                                        int timeout,
+                                        int timeout_sec,
                                         GHashTable * device_args,
                                         GHashTable * port_map,
                                         const char * host_arg);
diff --git a/lib/fencing/st_actions.c b/lib/fencing/st_actions.c
index b3429f6..d16fa33 100644
--- a/lib/fencing/st_actions.c
+++ b/lib/fencing/st_actions.c
@@ -232,27 +232,42 @@ stonith__action_result(stonith_action_t *action)
 }
 
 #define FAILURE_MAX_RETRIES 2
+
+/*!
+ * \internal
+ * \brief Create a new fencing action to be executed
+ *
+ * \param[in] agent          Fence agent to use
+ * \param[in] action_name    Fencing action to be executed
+ * \param[in] victim         Name of target of fencing action (if known)
+ * \param[in] victim_nodeid  Node ID of target of fencing action (if known)
+ * \param[in] timeout_sec    Timeout to be used when executing action
+ * \param[in] device_args    Parameters to pass to fence agent
+ * \param[in] port_map       Mapping of target names to device ports
+ * \param[in] host_arg       Agent parameter used to pass target name
+ *
+ * \return Newly created fencing action (asserts on error, never NULL)
+ */
 stonith_action_t *
 stonith_action_create(const char *agent,
-                      const char *_action,
+                      const char *action_name,
                       const char *victim,
                       uint32_t victim_nodeid,
-                      int timeout, GHashTable * device_args,
+                      int timeout_sec, GHashTable * device_args,
                       GHashTable * port_map, const char *host_arg)
 {
-    stonith_action_t *action;
+    stonith_action_t *action = calloc(1, sizeof(stonith_action_t));
 
-    action = calloc(1, sizeof(stonith_action_t));
     CRM_ASSERT(action != NULL);
 
-    action->args = make_args(agent, _action, victim, victim_nodeid,
+    action->args = make_args(agent, action_name, victim, victim_nodeid,
                              device_args, port_map, host_arg);
     crm_debug("Preparing '%s' action for %s using agent %s",
-              _action, (victim? victim : "no target"), agent);
+              action_name, (victim? victim : "no target"), agent);
     action->agent = strdup(agent);
-    action->action = strdup(_action);
+    action->action = strdup(action_name);
     pcmk__str_update(&action->victim, victim);
-    action->timeout = action->remaining_timeout = timeout;
+    action->timeout = action->remaining_timeout = timeout_sec;
     action->max_retries = FAILURE_MAX_RETRIES;
 
     pcmk__set_result(&(action->result), PCMK_OCF_UNKNOWN, PCMK_EXEC_UNKNOWN,
@@ -262,7 +277,7 @@ stonith_action_create(const char *agent,
         char buffer[512];
         const char *value = NULL;
 
-        snprintf(buffer, sizeof(buffer), "pcmk_%s_retries", _action);
+        snprintf(buffer, sizeof(buffer), "pcmk_%s_retries", action_name);
         value = g_hash_table_lookup(device_args, buffer);
 
         if (value) {
-- 
2.31.1

From a82339a81c451566b9835e61bca7e1bf84b97c46 Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Tue, 13 Sep 2022 14:20:24 -0500
Subject: [PATCH 04/24] Low: fencing: use requested timeout with RHCS metadata
 actions

... instead of hardcoded 5 seconds, and rename timeout argument for clarity
---
 lib/fencing/st_rhcs.c | 35 ++++++++++++++++-------------------
 1 file changed, 16 insertions(+), 19 deletions(-)

diff --git a/lib/fencing/st_rhcs.c b/lib/fencing/st_rhcs.c
index dfccff2..1b3cd57 100644
--- a/lib/fencing/st_rhcs.c
+++ b/lib/fencing/st_rhcs.c
@@ -112,25 +112,24 @@ stonith_rhcs_parameter_not_required(xmlNode *metadata, const char *parameter)
 }
 
 /*!
- * \brief Execute RHCS-compatible agent's meta-data action
+ * \brief Execute RHCS-compatible agent's metadata action
  *
- * \param[in]  agent    Agent to execute
- * \param[in]  timeout  Action timeout
- * \param[out] metadata Where to store output xmlNode (or NULL to ignore)
- *
- * \todo timeout is currently ignored; shouldn't we use it?
+ * \param[in]  agent        Agent to execute
+ * \param[in]  timeout_sec  Action timeout
+ * \param[out] metadata     Where to store output xmlNode (or NULL to ignore)
  */
 static int
-stonith__rhcs_get_metadata(const char *agent, int timeout, xmlNode **metadata)
+stonith__rhcs_get_metadata(const char *agent, int timeout_sec,
+                           xmlNode **metadata)
 {
     xmlNode *xml = NULL;
     xmlNode *actions = NULL;
     xmlXPathObject *xpathObj = NULL;
-    pcmk__action_result_t *result = NULL;
-    stonith_action_t *action = stonith_action_create(agent, "metadata", NULL, 0,
-                                                     5, NULL, NULL, NULL);
+    stonith_action_t *action = stonith_action_create(agent, "metadata", NULL,
+                                                     0, timeout_sec, NULL,
+                                                     NULL, NULL);
     int rc = stonith__execute(action);
-    result = stonith__action_result(action);
+    pcmk__action_result_t *result = stonith__action_result(action);
 
     if (result == NULL) {
         if (rc < 0) {
@@ -208,21 +207,19 @@ stonith__rhcs_get_metadata(const char *agent, int timeout, xmlNode **metadata)
 }
 
 /*!
- * \brief Execute RHCS-compatible agent's meta-data action
- *
- * \param[in]  agent    Agent to execute
- * \param[in]  timeout  Action timeout
- * \param[out] output   Where to store action output (or NULL to ignore)
+ * \brief Retrieve metadata for RHCS-compatible fence agent
  *
- * \todo timeout is currently ignored; shouldn't we use it?
+ * \param[in]  agent        Agent to execute
+ * \param[in]  timeout_sec  Action timeout
+ * \param[out] output       Where to store action output (or NULL to ignore)
  */
 int
-stonith__rhcs_metadata(const char *agent, int timeout, char **output)
+stonith__rhcs_metadata(const char *agent, int timeout_sec, char **output)
 {
     char *buffer = NULL;
     xmlNode *xml = NULL;
 
-    int rc = stonith__rhcs_get_metadata(agent, timeout, &xml);
+    int rc = stonith__rhcs_get_metadata(agent, timeout_sec, &xml);
 
     if (rc != pcmk_ok) {
         free_xml(xml);
-- 
2.31.1

From 73d12e7bcab4d12b1d60ac3431fb713a36c5b6d3 Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Tue, 13 Sep 2022 14:32:44 -0500
Subject: [PATCH 05/24] Refactor: fencing: make stonith_action_t:async bool

---
 lib/fencing/st_actions.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/lib/fencing/st_actions.c b/lib/fencing/st_actions.c
index d16fa33..abd0d5a 100644
--- a/lib/fencing/st_actions.c
+++ b/lib/fencing/st_actions.c
@@ -9,6 +9,7 @@
 
 #include <crm_internal.h>
 
+#include <stdbool.h>
 #include <stdlib.h>
 #include <stdio.h>
 #include <string.h>
@@ -32,7 +33,7 @@ struct stonith_action_s {
     char *victim;
     GHashTable *args;
     int timeout;
-    int async;
+    bool async;
     void *userdata;
     void (*done_cb) (int pid, const pcmk__action_result_t *result,
                      void *user_data);
@@ -671,7 +672,7 @@ stonith_action_execute_async(stonith_action_t * action,
     action->userdata = userdata;
     action->done_cb = done;
     action->fork_cb = fork_cb;
-    action->async = 1;
+    action->async = true;
 
     return internal_stonith_action_execute(action);
 }
-- 
2.31.1

From 537955ab3c19a1bfacc43fb739e38f036c5788fb Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Tue, 13 Sep 2022 14:59:28 -0500
Subject: [PATCH 06/24] Refactor: fencing: rename
 stonith_action_execute_async()

... to stonith__execute_async(), since it's internal
---
 daemons/fenced/fenced_commands.c |  4 ++--
 include/crm/fencing/internal.h   | 12 +++++-------
 lib/fencing/st_actions.c         | 11 +++++------
 3 files changed, 12 insertions(+), 15 deletions(-)

diff --git a/daemons/fenced/fenced_commands.c b/daemons/fenced/fenced_commands.c
index 94aa6b8..41a1936 100644
--- a/daemons/fenced/fenced_commands.c
+++ b/daemons/fenced/fenced_commands.c
@@ -510,8 +510,8 @@ stonith_device_execute(stonith_device_t * device)
     /* for async exec, exec_rc is negative for early error exit
        otherwise handling of success/errors is done via callbacks */
     cmd->activating_on = device;
-    exec_rc = stonith_action_execute_async(action, (void *)cmd,
-                                           cmd->done_cb, fork_cb);
+    exec_rc = stonith__execute_async(action, (void *)cmd, cmd->done_cb,
+                                     fork_cb);
     if (exec_rc < 0) {
         cmd->activating_on = NULL;
         cmd->done_cb(0, stonith__action_result(action), cmd);
diff --git a/include/crm/fencing/internal.h b/include/crm/fencing/internal.h
index e2ca85e..1797d9a 100644
--- a/include/crm/fencing/internal.h
+++ b/include/crm/fencing/internal.h
@@ -64,13 +64,11 @@ void stonith__xe_set_result(xmlNode *xml, const pcmk__action_result_t *result);
 void stonith__xe_get_result(xmlNode *xml, pcmk__action_result_t *result);
 xmlNode *stonith__find_xe_with_result(xmlNode *xml);
 
-int
-stonith_action_execute_async(stonith_action_t * action,
-                             void *userdata,
-                             void (*done) (int pid,
-                                           const pcmk__action_result_t *result,
-                                           void *user_data),
-                             void (*fork_cb) (int pid, void *user_data));
+int stonith__execute_async(stonith_action_t *action, void *userdata,
+                           void (*done) (int pid,
+                                         const pcmk__action_result_t *result,
+                                         void *user_data),
+                           void (*fork_cb) (int pid, void *user_data));
 
 xmlNode *create_level_registration_xml(const char *node, const char *pattern,
                                        const char *attr, const char *value,
diff --git a/lib/fencing/st_actions.c b/lib/fencing/st_actions.c
index abd0d5a..c4e32bd 100644
--- a/lib/fencing/st_actions.c
+++ b/lib/fencing/st_actions.c
@@ -658,12 +658,11 @@ internal_stonith_action_execute(stonith_action_t * action)
  * \return pcmk_ok if ownership of action has been taken, -errno otherwise
  */
 int
-stonith_action_execute_async(stonith_action_t * action,
-                             void *userdata,
-                             void (*done) (int pid,
-                                           const pcmk__action_result_t *result,
-                                           void *user_data),
-                             void (*fork_cb) (int pid, void *user_data))
+stonith__execute_async(stonith_action_t * action, void *userdata,
+                       void (*done) (int pid,
+                                     const pcmk__action_result_t *result,
+                                     void *user_data),
+                       void (*fork_cb) (int pid, void *user_data))
 {
     if (!action) {
         return -EINVAL;
-- 
2.31.1

From b56edde80bfe4cf900a5eb021986c8f6189d3307 Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Tue, 13 Sep 2022 16:43:57 -0500
Subject: [PATCH 07/24] Refactor: fencing: add internal API for getting
 metadata async

Nothing uses it yet
---
 include/crm/fencing/internal.h |  6 +++
 lib/fencing/st_client.c        | 80 ++++++++++++++++++++++++++++++++++
 2 files changed, 86 insertions(+)

diff --git a/include/crm/fencing/internal.h b/include/crm/fencing/internal.h
index 1797d9a..513d1c4 100644
--- a/include/crm/fencing/internal.h
+++ b/include/crm/fencing/internal.h
@@ -70,6 +70,12 @@ int stonith__execute_async(stonith_action_t *action, void *userdata,
                                          void *user_data),
                            void (*fork_cb) (int pid, void *user_data));
 
+int stonith__metadata_async(const char *agent, int timeout_sec,
+                            void (*callback)(int pid,
+                                             const pcmk__action_result_t *result,
+                                             void *user_data),
+                            void *user_data);
+
 xmlNode *create_level_registration_xml(const char *node, const char *pattern,
                                        const char *attr, const char *value,
                                        int level,
diff --git a/lib/fencing/st_client.c b/lib/fencing/st_client.c
index 6c252bc..91075bd 100644
--- a/lib/fencing/st_client.c
+++ b/lib/fencing/st_client.c
@@ -2386,6 +2386,86 @@ stonith__device_parameter_flags(uint32_t *device_flags, const char *device_name,
     freeXpathObject(xpath);
 }
 
+/*!
+ * \internal
+ * \brief Retrieve fence agent meta-data asynchronously
+ *
+ * \param[in] agent        Agent to execute
+ * \param[in] timeout_sec  Error if not complete within this time
+ * \param[in] callback     Function to call with result (this will always be
+ *                         called, whether by this function directly or later
+ *                         via the main loop, and on success the metadata will
+ *                         be in its result argument's action_stdout)
+ * \param[in] user_data    User data to pass to callback
+ *
+ * \return Standard Pacemaker return code
+ * \note The caller must use a main loop. This function is not a
+ *       stonith_api_operations_t method because it does not need a stonith_t
+ *       object and does not go through the fencer, but executes the agent
+ *       directly.
+ */
+int
+stonith__metadata_async(const char *agent, int timeout_sec,
+                        void (*callback)(int pid,
+                                         const pcmk__action_result_t *result,
+                                         void *user_data),
+                        void *user_data)
+{
+    switch (stonith_get_namespace(agent, NULL)) {
+        case st_namespace_rhcs:
+            {
+                stonith_action_t *action = NULL;
+                int rc = pcmk_ok;
+
+                action = stonith_action_create(agent, "metadata", NULL, 0,
+                                               timeout_sec, NULL, NULL, NULL);
+
+                rc = stonith__execute_async(action, user_data, callback, NULL);
+                if (rc != pcmk_ok) {
+                    callback(0, stonith__action_result(action), user_data);
+                    stonith__destroy_action(action);
+                }
+                return pcmk_legacy2rc(rc);
+            }
+
+#if HAVE_STONITH_STONITH_H
+        case st_namespace_lha:
+            // LHA metadata is simply synthesized, so simulate async
+            {
+                pcmk__action_result_t result = {
+                    .exit_status = CRM_EX_OK,
+                    .execution_status = PCMK_EXEC_DONE,
+                    .exit_reason = NULL,
+                    .action_stdout = NULL,
+                    .action_stderr = NULL,
+                };
+
+                stonith__lha_metadata(agent, timeout_sec,
+                                      &result.action_stdout);
+                callback(0, &result, user_data);
+                pcmk__reset_result(&result);
+                return pcmk_rc_ok;
+            }
+#endif
+
+        default:
+            {
+                pcmk__action_result_t result = {
+                    .exit_status = CRM_EX_ERROR,
+                    .execution_status = PCMK_EXEC_ERROR_HARD,
+                    .exit_reason = crm_strdup_printf("No such agent '%s'",
+                                                     agent),
+                    .action_stdout = NULL,
+                    .action_stderr = NULL,
+                };
+
+                callback(0, &result, user_data);
+                pcmk__reset_result(&result);
+                return ENOENT;
+            }
+    }
+}
+
 /*!
  * \internal
  * \brief Return the exit status from an async action callback
-- 
2.31.1

From f12cdd204982548b4d5fed16c0a460d4a5ced217 Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Tue, 13 Sep 2022 17:33:10 -0500
Subject: [PATCH 08/24] Refactor: liblrmd: add internal API for getting
 metadata async

Nothing uses it yet
---
 include/crm/lrmd_internal.h |  10 +++-
 lib/lrmd/lrmd_client.c      | 115 ++++++++++++++++++++++++++++++++++++
 2 files changed, 123 insertions(+), 2 deletions(-)

diff --git a/include/crm/lrmd_internal.h b/include/crm/lrmd_internal.h
index 284c4d6..5cb00d5 100644
--- a/include/crm/lrmd_internal.h
+++ b/include/crm/lrmd_internal.h
@@ -1,5 +1,5 @@
 /*
- * Copyright 2015-2021 the Pacemaker project contributors
+ * Copyright 2015-2022 the Pacemaker project contributors
  *
  * The version control history for this file may have further details.
  *
@@ -17,7 +17,7 @@
 #include <crm/common/mainloop.h>        // mainloop_io_t, ipc_client_callbacks
 #include <crm/common/output_internal.h> // pcmk__output_t
 #include <crm/common/remote_internal.h> // pcmk__remote_t
-#include <crm/lrmd.h>                   // lrmd_t, lrmd_event_data_t
+#include <crm/lrmd.h>           // lrmd_t, lrmd_event_data_t, lrmd_rsc_info_t
 
 int lrmd__new(lrmd_t **api, const char *nodename, const char *server, int port);
 
@@ -35,6 +35,12 @@ int lrmd_send_resource_alert(lrmd_t *lrmd, GList *alert_list,
 int lrmd__remote_send_xml(pcmk__remote_t *session, xmlNode *msg, uint32_t id,
                           const char *msg_type);
 
+int lrmd__metadata_async(lrmd_rsc_info_t *rsc,
+                         void (*callback)(int pid,
+                                          const pcmk__action_result_t *result,
+                                          void *user_data),
+                         void *user_data);
+
 void lrmd__set_result(lrmd_event_data_t *event, enum ocf_exitcode rc,
                       int op_status, const char *exit_reason);
 
diff --git a/lib/lrmd/lrmd_client.c b/lib/lrmd/lrmd_client.c
index 82afd6c..4b16bf0 100644
--- a/lib/lrmd/lrmd_client.c
+++ b/lib/lrmd/lrmd_client.c
@@ -2343,6 +2343,121 @@ lrmd_api_delete(lrmd_t * lrmd)
     free(lrmd);
 }
 
+struct metadata_cb {
+     void (*callback)(int pid, const pcmk__action_result_t *result,
+                      void *user_data);
+     void *user_data;
+};
+
+/*!
+ * \internal
+ * \brief Process asynchronous metadata completion
+ *
+ * \param[in] action  Metadata action that completed
+ */
+static void
+metadata_complete(svc_action_t *action)
+{
+    struct metadata_cb *metadata_cb = (struct metadata_cb *) action->cb_data;
+    pcmk__action_result_t result = PCMK__UNKNOWN_RESULT;
+
+    pcmk__set_result(&result, action->rc, action->status,
+                     services__exit_reason(action));
+    pcmk__set_result_output(&result, action->stdout_data, action->stderr_data);
+
+    metadata_cb->callback(0, &result, metadata_cb->user_data);
+    result.action_stdout = NULL; // Prevent free, because action owns it
+    result.action_stderr = NULL; // Prevent free, because action owns it
+    pcmk__reset_result(&result);
+    free(metadata_cb);
+}
+
+/*!
+ * \internal
+ * \brief Retrieve agent metadata asynchronously
+ *
+ * \param[in] rsc        Resource agent specification
+ * \param[in] callback   Function to call with result (this will always be
+ *                       called, whether by this function directly or later via
+ *                       the main loop, and on success the metadata will be in
+ *                       its result argument's action_stdout)
+ * \param[in] user_data  User data to pass to callback
+ *
+ * \return Standard Pacemaker return code
+ * \note This function is not a lrmd_api_operations_t method because it does not
+ *       need an lrmd_t object and does not go through the executor, but
+ *       executes the agent directly.
+ */
+int
+lrmd__metadata_async(lrmd_rsc_info_t *rsc,
+                     void (*callback)(int pid,
+                                      const pcmk__action_result_t *result,
+                                      void *user_data),
+                     void *user_data)
+{
+    svc_action_t *action = NULL;
+    struct metadata_cb *metadata_cb = NULL;
+    pcmk__action_result_t result = PCMK__UNKNOWN_RESULT;
+
+    CRM_CHECK(callback != NULL, return EINVAL);
+
+    if ((rsc == NULL) || (rsc->standard == NULL) || (rsc->type == NULL)) {
+        pcmk__set_result(&result, PCMK_OCF_NOT_CONFIGURED, PCMK_EXEC_ERROR,
+                         "Invalid resource specification");
+        callback(0, &result, user_data);
+        pcmk__reset_result(&result);
+        return EINVAL;
+    }
+
+    if (strcmp(rsc->standard, PCMK_RESOURCE_CLASS_STONITH) == 0) {
+        return stonith__metadata_async(rsc->type,
+                                       CRMD_METADATA_CALL_TIMEOUT / 1000,
+                                       callback, user_data);
+    }
+
+    action = services__create_resource_action(rsc->type, rsc->standard,
+                                              rsc->provider, rsc->type,
+                                              CRMD_ACTION_METADATA, 0,
+                                              CRMD_METADATA_CALL_TIMEOUT, NULL,
+                                              0);
+    if (action == NULL) {
+        pcmk__set_result(&result, PCMK_OCF_UNKNOWN_ERROR, PCMK_EXEC_ERROR,
+                         "Out of memory");
+        callback(0, &result, user_data);
+        pcmk__reset_result(&result);
+        return ENOMEM;
+    }
+    if (action->rc != PCMK_OCF_UNKNOWN) {
+        pcmk__set_result(&result, action->rc, action->status,
+                         services__exit_reason(action));
+        callback(0, &result, user_data);
+        pcmk__reset_result(&result);
+        services_action_free(action);
+        return EINVAL;
+    }
+
+    action->cb_data = calloc(1, sizeof(struct metadata_cb));
+    if (action->cb_data == NULL) {
+        services_action_free(action);
+        pcmk__set_result(&result, PCMK_OCF_UNKNOWN_ERROR, PCMK_EXEC_ERROR,
+                         "Out of memory");
+        callback(0, &result, user_data);
+        pcmk__reset_result(&result);
+        return ENOMEM;
+    }
+
+    metadata_cb = (struct metadata_cb *) action->cb_data;
+    metadata_cb->callback = callback;
+    metadata_cb->user_data = user_data;
+    if (!services_action_async(action, metadata_complete)) {
+        services_action_free(action);
+        return pcmk_rc_error; // @TODO Derive from action->rc and ->status
+    }
+
+    // The services library has taken responsibility for action
+    return pcmk_rc_ok;
+}
+
 /*!
  * \internal
  * \brief Set the result of an executor event
-- 
2.31.1

From 48f3165bda7eb59b6ade45ecbafc1ad396564c67 Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Wed, 14 Sep 2022 16:34:37 -0500
Subject: [PATCH 09/24] Low: controller: ignore CRM_OP_LRM_REFRESH

This was only sent by crm_resource --refresh in versions 1.1.9 and earlier.
Since the local crm_resource is the same version as the controller, and
Pacemaker Remote was introduced in 1.1.9, this means that only remote nodes
running 1.1.9 can possibly send it.

It didn't really do anything useful anyway, so just ignore it.
---
 daemons/controld/controld_execd.c    | 33 +++++-----------------------
 daemons/controld/controld_messages.c |  2 +-
 include/crm/crm.h                    |  2 +-
 lib/pacemaker/pcmk_graph_producer.c  |  3 +--
 lib/pengine/common.c                 |  2 --
 5 files changed, 9 insertions(+), 33 deletions(-)

diff --git a/daemons/controld/controld_execd.c b/daemons/controld/controld_execd.c
index fa411a6..719fab0 100644
--- a/daemons/controld/controld_execd.c
+++ b/daemons/controld/controld_execd.c
@@ -1553,32 +1553,6 @@ fail_lrm_resource(xmlNode *xml, lrm_state_t *lrm_state, const char *user_name,
     lrmd_free_event(op);
 }
 
-static void
-handle_refresh_op(lrm_state_t *lrm_state, const char *user_name,
-                  const char *from_host, const char *from_sys)
-{
-    int rc = pcmk_ok;
-    xmlNode *fragment = do_lrm_query_internal(lrm_state, node_update_all);
-
-    fsa_cib_update(XML_CIB_TAG_STATUS, fragment, cib_quorum_override, rc, user_name);
-    crm_info("Forced a local resource history refresh: call=%d", rc);
-
-    if (!pcmk__str_eq(CRM_SYSTEM_CRMD, from_sys, pcmk__str_casei)) {
-        xmlNode *reply = create_request(CRM_OP_INVOKE_LRM, fragment, from_host,
-                                        from_sys, CRM_SYSTEM_LRMD,
-                                        fsa_our_uuid);
-
-        crm_debug("ACK'ing refresh from %s (%s)", from_sys, from_host);
-
-        if (relay_message(reply, TRUE) == FALSE) {
-            crm_log_xml_err(reply, "Unable to route reply");
-        }
-        free_xml(reply);
-    }
-
-    free_xml(fragment);
-}
-
 static void
 handle_query_op(xmlNode *msg, lrm_state_t *lrm_state)
 {
@@ -1787,7 +1761,12 @@ do_lrm_invoke(long long action,
     }
 
     if (pcmk__str_eq(crm_op, CRM_OP_LRM_REFRESH, pcmk__str_casei)) {
-        handle_refresh_op(lrm_state, user_name, from_host, from_sys);
+        /* @COMPAT This can only be sent by crm_resource --refresh on a
+         * Pacemaker Remote node running Pacemaker 1.1.9, which is extremely
+         * unlikely. It previously would cause the controller to re-write its
+         * resource history to the CIB. Just ignore it.
+         */
+        crm_notice("Ignoring refresh request from Pacemaker Remote 1.1.9 node");
 
     } else if (pcmk__str_eq(crm_op, CRM_OP_LRM_QUERY, pcmk__str_casei)) {
         handle_query_op(input->msg, lrm_state);
diff --git a/daemons/controld/controld_messages.c b/daemons/controld/controld_messages.c
index 31d3524..957fc20 100644
--- a/daemons/controld/controld_messages.c
+++ b/daemons/controld/controld_messages.c
@@ -1061,7 +1061,7 @@ handle_request(xmlNode *stored_msg, enum crmd_fsa_cause cause)
         return handle_lrm_delete(stored_msg);
 
     } else if ((strcmp(op, CRM_OP_LRM_FAIL) == 0)
-               || (strcmp(op, CRM_OP_LRM_REFRESH) == 0)
+               || (strcmp(op, CRM_OP_LRM_REFRESH) == 0) // @COMPAT
                || (strcmp(op, CRM_OP_REPROBE) == 0)) {
 
         crm_xml_add(stored_msg, F_CRM_SYS_TO, CRM_SYSTEM_LRMD);
diff --git a/include/crm/crm.h b/include/crm/crm.h
index 5ec66d2..f2e536e 100644
--- a/include/crm/crm.h
+++ b/include/crm/crm.h
@@ -146,7 +146,7 @@ extern char *crm_system_name;
 #  define CRM_OP_REGISTER		"register"
 #  define CRM_OP_IPC_FWD		"ipc_fwd"
 #  define CRM_OP_INVOKE_LRM	"lrm_invoke"
-#  define CRM_OP_LRM_REFRESH	"lrm_refresh" /* Deprecated */
+#  define CRM_OP_LRM_REFRESH "lrm_refresh" //!< Deprecated since 1.1.10
 #  define CRM_OP_LRM_QUERY	"lrm_query"
 #  define CRM_OP_LRM_DELETE	"lrm_delete"
 #  define CRM_OP_LRM_FAIL		"lrm_fail"
diff --git a/lib/pacemaker/pcmk_graph_producer.c b/lib/pacemaker/pcmk_graph_producer.c
index 4c1b5a6..0077719 100644
--- a/lib/pacemaker/pcmk_graph_producer.c
+++ b/lib/pacemaker/pcmk_graph_producer.c
@@ -446,8 +446,7 @@ create_graph_action(xmlNode *parent, pe_action_t *action, bool skip_details,
 
     } else if (pcmk__str_any_of(action->task,
                                 CRM_OP_SHUTDOWN,
-                                CRM_OP_CLEAR_FAILCOUNT,
-                                CRM_OP_LRM_REFRESH, NULL)) {
+                                CRM_OP_CLEAR_FAILCOUNT, NULL)) {
         action_xml = create_xml_node(parent, XML_GRAPH_TAG_CRM_EVENT);
 
     } else if (pcmk__str_eq(action->task, CRM_OP_LRM_DELETE, pcmk__str_none)) {
diff --git a/lib/pengine/common.c b/lib/pengine/common.c
index 93ba3fe..7db9d0e 100644
--- a/lib/pengine/common.c
+++ b/lib/pengine/common.c
@@ -384,8 +384,6 @@ text2task(const char *task)
         return no_action;
     } else if (pcmk__str_eq(task, CRMD_ACTION_STATUS, pcmk__str_casei)) {
         return no_action;
-    } else if (pcmk__str_eq(task, CRM_OP_LRM_REFRESH, pcmk__str_casei)) {
-        return no_action;
     } else if (pcmk__str_eq(task, CRMD_ACTION_MIGRATE, pcmk__str_casei)) {
         return no_action;
     } else if (pcmk__str_eq(task, CRMD_ACTION_MIGRATED, pcmk__str_casei)) {
-- 
2.31.1

From 366f943f30f2d3d67b74db2bafa0098874f1b67e Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Wed, 14 Sep 2022 16:46:15 -0500
Subject: [PATCH 10/24] API: libcrmcommon: deprecate CRM_OP_LRM_QUERY

This has been unused since at least Pacemaker 1.0.0, and since we don't support
rolling upgrades from anything that old, and Pacemaker Remote didn't exist
then, we can just drop support for it entirely.
---
 daemons/controld/controld_execd.c | 17 -----------------
 include/crm/crm.h                 |  1 -
 include/crm/crm_compat.h          |  5 ++++-
 3 files changed, 4 insertions(+), 19 deletions(-)

diff --git a/daemons/controld/controld_execd.c b/daemons/controld/controld_execd.c
index 719fab0..54e6818 100644
--- a/daemons/controld/controld_execd.c
+++ b/daemons/controld/controld_execd.c
@@ -1553,20 +1553,6 @@ fail_lrm_resource(xmlNode *xml, lrm_state_t *lrm_state, const char *user_name,
     lrmd_free_event(op);
 }
 
-static void
-handle_query_op(xmlNode *msg, lrm_state_t *lrm_state)
-{
-    xmlNode *data = do_lrm_query_internal(lrm_state, node_update_all);
-    xmlNode *reply = create_reply(msg, data);
-
-    if (relay_message(reply, TRUE) == FALSE) {
-        crm_err("Unable to route reply");
-        crm_log_xml_err(reply, "reply");
-    }
-    free_xml(reply);
-    free_xml(data);
-}
-
 static void
 handle_reprobe_op(lrm_state_t *lrm_state, const char *from_sys,
                   const char *from_host, const char *user_name,
@@ -1768,9 +1754,6 @@ do_lrm_invoke(long long action,
          */
         crm_notice("Ignoring refresh request from Pacemaker Remote 1.1.9 node");
 
-    } else if (pcmk__str_eq(crm_op, CRM_OP_LRM_QUERY, pcmk__str_casei)) {
-        handle_query_op(input->msg, lrm_state);
-
     // @COMPAT DCs <1.1.14 in a rolling upgrade might schedule this op
     } else if (pcmk__str_eq(operation, CRM_OP_PROBED, pcmk__str_casei)) {
         update_attrd(lrm_state->node_name, CRM_OP_PROBED, XML_BOOLEAN_TRUE,
diff --git a/include/crm/crm.h b/include/crm/crm.h
index f2e536e..38915e3 100644
--- a/include/crm/crm.h
+++ b/include/crm/crm.h
@@ -147,7 +147,6 @@ extern char *crm_system_name;
 #  define CRM_OP_IPC_FWD		"ipc_fwd"
 #  define CRM_OP_INVOKE_LRM	"lrm_invoke"
 #  define CRM_OP_LRM_REFRESH "lrm_refresh" //!< Deprecated since 1.1.10
-#  define CRM_OP_LRM_QUERY	"lrm_query"
 #  define CRM_OP_LRM_DELETE	"lrm_delete"
 #  define CRM_OP_LRM_FAIL		"lrm_fail"
 #  define CRM_OP_PROBED		"probe_complete"
diff --git a/include/crm/crm_compat.h b/include/crm/crm_compat.h
index 3b35a5e..8a4b368 100644
--- a/include/crm/crm_compat.h
+++ b/include/crm/crm_compat.h
@@ -1,5 +1,5 @@
 /*
- * Copyright 2004-2021 the Pacemaker project contributors
+ * Copyright 2004-2022 the Pacemaker project contributors
  *
  * The version control history for this file may have further details.
  *
@@ -31,6 +31,9 @@ extern "C" {
 //! \deprecated This defined constant will be removed in a future release
 #define MAX_IPC_DELAY 120
 
+//! \deprecated This defined constant will be removed in a future release
+#define CRM_OP_LRM_QUERY "lrm_query"
+
 //!@{
 //! \deprecated This macro will be removed in a future release
 
-- 
2.31.1

From 7107aaceffb243990846e4a6cd5d2f0c82b3c0cf Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Wed, 14 Sep 2022 16:49:48 -0500
Subject: [PATCH 11/24] Refactor: controller: drop do_lrm_query_internal()

Now that there's only one (short) caller, just move its contents there
---
 daemons/controld/controld_execd.c | 28 +++++++++++-----------------
 1 file changed, 11 insertions(+), 17 deletions(-)

diff --git a/daemons/controld/controld_execd.c b/daemons/controld/controld_execd.c
index 54e6818..99c9193 100644
--- a/daemons/controld/controld_execd.c
+++ b/daemons/controld/controld_execd.c
@@ -811,19 +811,26 @@ build_active_RAs(lrm_state_t * lrm_state, xmlNode * rsc_list)
     return FALSE;
 }
 
-static xmlNode *
-do_lrm_query_internal(lrm_state_t *lrm_state, int update_flags)
+xmlNode *
+controld_query_executor_state(const char *node_name)
 {
     xmlNode *xml_state = NULL;
     xmlNode *xml_data = NULL;
     xmlNode *rsc_list = NULL;
     crm_node_t *peer = NULL;
+    lrm_state_t *lrm_state = lrm_state_find(node_name);
+
+    if (!lrm_state) {
+        crm_err("Could not find executor state for node %s", node_name);
+        return NULL;
+    }
 
     peer = crm_get_peer_full(0, lrm_state->node_name, CRM_GET_PEER_ANY);
     CRM_CHECK(peer != NULL, return NULL);
 
-    xml_state = create_node_state_update(peer, update_flags, NULL,
-                                         __func__);
+    xml_state = create_node_state_update(peer,
+                                         node_update_cluster|node_update_peer,
+                                         NULL, __func__);
     if (xml_state == NULL) {
         return NULL;
     }
@@ -840,19 +847,6 @@ do_lrm_query_internal(lrm_state_t *lrm_state, int update_flags)
     return xml_state;
 }
 
-xmlNode *
-controld_query_executor_state(const char *node_name)
-{
-    lrm_state_t *lrm_state = lrm_state_find(node_name);
-
-    if (!lrm_state) {
-        crm_err("Could not find executor state for node %s", node_name);
-        return NULL;
-    }
-    return do_lrm_query_internal(lrm_state,
-                                 node_update_cluster|node_update_peer);
-}
-
 /*!
  * \internal
  * \brief Map standard Pacemaker return code to operation status and OCF code
-- 
2.31.1

From 9549e2bdd0781a6b09546da6df793d549d4db0cc Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Wed, 14 Sep 2022 15:48:44 -0500
Subject: [PATCH 12/24] Doc: controller: drop pointless comment

It's (likely?) impossible for a live cluster to have been doing rolling
upgrades since 2006.
---
 daemons/controld/controld_execd.c | 12 +-----------
 1 file changed, 1 insertion(+), 11 deletions(-)

diff --git a/daemons/controld/controld_execd.c b/daemons/controld/controld_execd.c
index 99c9193..53b1156 100644
--- a/daemons/controld/controld_execd.c
+++ b/daemons/controld/controld_execd.c
@@ -678,18 +678,8 @@ build_operation_update(xmlNode * parent, lrmd_rsc_info_t * rsc, lrmd_event_data_
 
     target_rc = rsc_op_expected_rc(op);
 
-    /* there is a small risk in formerly mixed clusters that it will
-     * be sub-optimal.
-     *
-     * however with our upgrade policy, the update we send should
-     * still be completely supported anyway
-     */
     caller_version = g_hash_table_lookup(op->params, XML_ATTR_CRM_VERSION);
-    CRM_LOG_ASSERT(caller_version != NULL);
-
-    if(caller_version == NULL) {
-        caller_version = CRM_FEATURE_SET;
-    }
+    CRM_CHECK(caller_version != NULL, caller_version = CRM_FEATURE_SET);
 
     xml_op = pcmk__create_history_xml(parent, op, caller_version, target_rc,
                                       fsa_our_uname, src);
-- 
2.31.1

From 393b6f25be30e7e5d5d63c70dc479aa5e2cd14e4 Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Thu, 15 Sep 2022 11:24:28 -0500
Subject: [PATCH 13/24] Refactor: controller: move where reload actions get
 remapped

... from do_lrm_invoke() to do_lrm_rsc_op(), which will make planned changes
easier
---
 daemons/controld/controld_execd.c | 38 ++++++++++++++++---------------
 1 file changed, 20 insertions(+), 18 deletions(-)

diff --git a/daemons/controld/controld_execd.c b/daemons/controld/controld_execd.c
index 53b1156..c9f0cc7 100644
--- a/daemons/controld/controld_execd.c
+++ b/daemons/controld/controld_execd.c
@@ -43,7 +43,8 @@ static gboolean stop_recurring_actions(gpointer key, gpointer value, gpointer us
 static lrmd_event_data_t *construct_op(lrm_state_t * lrm_state, xmlNode * rsc_op,
                                        const char *rsc_id, const char *operation);
 static void do_lrm_rsc_op(lrm_state_t *lrm_state, lrmd_rsc_info_t *rsc,
-                          const char *operation, xmlNode *msg);
+                          const char *operation, xmlNode *msg,
+                          struct ra_metadata_s *md);
 
 static gboolean lrm_state_verify_stopped(lrm_state_t * lrm_state, enum crmd_fsa_state cur_state,
                                          int log_level);
@@ -1808,26 +1809,12 @@ do_lrm_invoke(long long action,
             do_lrm_delete(input, lrm_state, rsc, from_sys, from_host,
                           crm_rsc_delete, user_name);
 
-        } else if (pcmk__str_any_of(operation, CRMD_ACTION_RELOAD,
-                                    CRMD_ACTION_RELOAD_AGENT, NULL)) {
-            /* Pre-2.1.0 DCs will schedule reload actions only, and 2.1.0+ DCs
-             * will schedule reload-agent actions only. In either case, we need
-             * to map that to whatever the resource agent actually supports.
-             * Default to the OCF 1.1 name.
-             */
+        } else {
             struct ra_metadata_s *md = NULL;
-            const char *reload_name = CRMD_ACTION_RELOAD_AGENT;
 
             md = controld_get_rsc_metadata(lrm_state, rsc,
                                            controld_metadata_from_cache);
-            if ((md != NULL)
-                && pcmk_is_set(md->ra_flags, ra_supports_legacy_reload)) {
-                reload_name = CRMD_ACTION_RELOAD;
-            }
-            do_lrm_rsc_op(lrm_state, rsc, reload_name, input->xml);
-
-        } else {
-            do_lrm_rsc_op(lrm_state, rsc, operation, input->xml);
+            do_lrm_rsc_op(lrm_state, rsc, operation, input->xml, md);
         }
 
         lrmd_free_rsc_info(rsc);
@@ -2176,7 +2163,7 @@ record_pending_op(const char *node_name, lrmd_rsc_info_t *rsc, lrmd_event_data_t
 
 static void
 do_lrm_rsc_op(lrm_state_t *lrm_state, lrmd_rsc_info_t *rsc,
-              const char *operation, xmlNode *msg)
+              const char *operation, xmlNode *msg, struct ra_metadata_s *md)
 {
     int rc;
     int call_id = 0;
@@ -2198,6 +2185,21 @@ do_lrm_rsc_op(lrm_state_t *lrm_state, lrmd_rsc_info_t *rsc,
         }
     }
 
+    if (pcmk__str_any_of(operation, CRMD_ACTION_RELOAD,
+                         CRMD_ACTION_RELOAD_AGENT, NULL)) {
+        /* Pre-2.1.0 DCs will schedule reload actions only, and 2.1.0+ DCs
+         * will schedule reload-agent actions only. In either case, we need
+         * to map that to whatever the resource agent actually supports.
+         * Default to the OCF 1.1 name.
+         */
+        if ((md != NULL)
+            && pcmk_is_set(md->ra_flags, ra_supports_legacy_reload)) {
+            operation = CRMD_ACTION_RELOAD;
+        } else {
+            operation = CRMD_ACTION_RELOAD_AGENT;
+        }
+    }
+
     op = construct_op(lrm_state, msg, rsc->id, operation);
     CRM_CHECK(op != NULL, return);
 
-- 
2.31.1

From b134383ef516a7c676a714b3b51594fcfaec9ede Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Thu, 15 Sep 2022 15:12:06 -0500
Subject: [PATCH 14/24] Refactor: controller: drop operation argument to
 do_lrm_rsc_op()

It can be derived from the XML argument
---
 daemons/controld/controld_execd.c | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/daemons/controld/controld_execd.c b/daemons/controld/controld_execd.c
index c9f0cc7..3590f98 100644
--- a/daemons/controld/controld_execd.c
+++ b/daemons/controld/controld_execd.c
@@ -43,8 +43,7 @@ static gboolean stop_recurring_actions(gpointer key, gpointer value, gpointer us
 static lrmd_event_data_t *construct_op(lrm_state_t * lrm_state, xmlNode * rsc_op,
                                        const char *rsc_id, const char *operation);
 static void do_lrm_rsc_op(lrm_state_t *lrm_state, lrmd_rsc_info_t *rsc,
-                          const char *operation, xmlNode *msg,
-                          struct ra_metadata_s *md);
+                          xmlNode *msg, struct ra_metadata_s *md);
 
 static gboolean lrm_state_verify_stopped(lrm_state_t * lrm_state, enum crmd_fsa_state cur_state,
                                          int log_level);
@@ -1814,7 +1813,7 @@ do_lrm_invoke(long long action,
 
             md = controld_get_rsc_metadata(lrm_state, rsc,
                                            controld_metadata_from_cache);
-            do_lrm_rsc_op(lrm_state, rsc, operation, input->xml, md);
+            do_lrm_rsc_op(lrm_state, rsc, input->xml, md);
         }
 
         lrmd_free_rsc_info(rsc);
@@ -2162,8 +2161,8 @@ record_pending_op(const char *node_name, lrmd_rsc_info_t *rsc, lrmd_event_data_t
 }
 
 static void
-do_lrm_rsc_op(lrm_state_t *lrm_state, lrmd_rsc_info_t *rsc,
-              const char *operation, xmlNode *msg, struct ra_metadata_s *md)
+do_lrm_rsc_op(lrm_state_t *lrm_state, lrmd_rsc_info_t *rsc, xmlNode *msg,
+              struct ra_metadata_s *md)
 {
     int rc;
     int call_id = 0;
@@ -2172,17 +2171,18 @@ do_lrm_rsc_op(lrm_state_t *lrm_state, lrmd_rsc_info_t *rsc,
     lrmd_key_value_t *params = NULL;
     fsa_data_t *msg_data = NULL;
     const char *transition = NULL;
+    const char *operation = NULL;
     gboolean stop_recurring = FALSE;
     const char *nack_reason = NULL;
 
-    CRM_CHECK(rsc != NULL, return);
-    CRM_CHECK(operation != NULL, return);
+    CRM_CHECK((rsc != NULL) && (msg != NULL), return);
 
-    if (msg != NULL) {
-        transition = crm_element_value(msg, XML_ATTR_TRANSITION_KEY);
-        if (transition == NULL) {
-            crm_log_xml_err(msg, "Missing transition number");
-        }
+    operation = crm_element_value(msg, XML_LRM_ATTR_TASK);
+    CRM_CHECK(!pcmk__str_empty(operation), return);
+
+    transition = crm_element_value(msg, XML_ATTR_TRANSITION_KEY);
+    if (pcmk__str_empty(transition)) {
+        crm_log_xml_err(msg, "Missing transition number");
     }
 
     if (pcmk__str_any_of(operation, CRMD_ACTION_RELOAD,
-- 
2.31.1

From 563f6e8cebb2b64ba5426b87e5923cbc5bb1df11 Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Thu, 15 Sep 2022 15:40:38 -0500
Subject: [PATCH 15/24] Low: controller: add failsafe for no executor
 connection

... in do_lrm_rsc_op(), to make planned changes easier
---
 daemons/controld/controld_execd.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/daemons/controld/controld_execd.c b/daemons/controld/controld_execd.c
index 3590f98..274ca95 100644
--- a/daemons/controld/controld_execd.c
+++ b/daemons/controld/controld_execd.c
@@ -2185,6 +2185,17 @@ do_lrm_rsc_op(lrm_state_t *lrm_state, lrmd_rsc_info_t *rsc, xmlNode *msg,
         crm_log_xml_err(msg, "Missing transition number");
     }
 
+    if (lrm_state == NULL) {
+        // This shouldn't be possible, but provide a failsafe just in case
+        crm_err("Cannot execute %s of %s: No executor connection "
+                CRM_XS " transition_key=%s",
+                operation, rsc->id, (transition != NULL) ? transition : "");
+        synthesize_lrmd_failure(NULL, msg, PCMK_EXEC_INVALID,
+                                PCMK_OCF_UNKNOWN_ERROR,
+                                "No executor connection");
+        return;
+    }
+
     if (pcmk__str_any_of(operation, CRMD_ACTION_RELOAD,
                          CRMD_ACTION_RELOAD_AGENT, NULL)) {
         /* Pre-2.1.0 DCs will schedule reload actions only, and 2.1.0+ DCs
-- 
2.31.1

From 571a5766e025f2a1235be169180e6eb5a9b7b7ea Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Thu, 15 Sep 2022 12:04:31 -0500
Subject: [PATCH 16/24] Log: controller: improve messages when metadata cache
 update fails

Previously, metadata_cache_update() or ra_param_from_xml() would log an error,
then controld_get_rsc_metadata() (but not the other caller,
process_lrm_event()) would log another warning with the agent info.

Combine these into a single message always logged by metadata_cache_update(),
which also has been renamed to controld_cache_metadata().
---
 daemons/controld/controld_execd.c    |  2 +-
 daemons/controld/controld_metadata.c | 27 ++++++++++++---------------
 daemons/controld/controld_metadata.h |  6 +++---
 3 files changed, 16 insertions(+), 19 deletions(-)

diff --git a/daemons/controld/controld_execd.c b/daemons/controld/controld_execd.c
index 274ca95..1ffcc64 100644
--- a/daemons/controld/controld_execd.c
+++ b/daemons/controld/controld_execd.c
@@ -2858,7 +2858,7 @@ process_lrm_event(lrm_state_t *lrm_state, lrmd_event_data_t *op,
         } else if (rsc && (op->rc == PCMK_OCF_OK)) {
             char *metadata = unescape_newlines(op->output);
 
-            metadata_cache_update(lrm_state->metadata_cache, rsc, metadata);
+            controld_cache_metadata(lrm_state->metadata_cache, rsc, metadata);
             free(metadata);
         }
     }
diff --git a/daemons/controld/controld_metadata.c b/daemons/controld/controld_metadata.c
index 8c6f195..ddcd5db 100644
--- a/daemons/controld/controld_metadata.c
+++ b/daemons/controld/controld_metadata.c
@@ -149,13 +149,11 @@ ra_param_from_xml(xmlNode *param_xml)
 
     p = calloc(1, sizeof(struct ra_param_s));
     if (p == NULL) {
-        crm_crit("Could not allocate memory for resource metadata");
         return NULL;
     }
 
     p->rap_name = strdup(param_name);
     if (p->rap_name == NULL) {
-        crm_crit("Could not allocate memory for resource metadata");
         free(p);
         return NULL;
     }
@@ -196,10 +194,11 @@ log_ra_ocf_version(const char *ra_key, const char *ra_ocf_version)
 }
 
 struct ra_metadata_s *
-metadata_cache_update(GHashTable *mdc, lrmd_rsc_info_t *rsc,
-                      const char *metadata_str)
+controld_cache_metadata(GHashTable *mdc, const lrmd_rsc_info_t *rsc,
+                        const char *metadata_str)
 {
     char *key = NULL;
+    const char *reason = NULL;
     xmlNode *metadata = NULL;
     xmlNode *match = NULL;
     struct ra_metadata_s *md = NULL;
@@ -210,20 +209,19 @@ metadata_cache_update(GHashTable *mdc, lrmd_rsc_info_t *rsc,
 
     key = crm_generate_ra_key(rsc->standard, rsc->provider, rsc->type);
     if (!key) {
-        crm_crit("Could not allocate memory for resource metadata");
+        reason = "Invalid resource agent standard or type";
         goto err;
     }
 
     metadata = string2xml(metadata_str);
     if (!metadata) {
-        crm_err("Metadata for %s:%s:%s is not valid XML",
-                rsc->standard, rsc->provider, rsc->type);
+        reason = "Metadata is not valid XML";
         goto err;
     }
 
     md = calloc(1, sizeof(struct ra_metadata_s));
     if (md == NULL) {
-        crm_crit("Could not allocate memory for resource metadata");
+        reason = "Could not allocate memory";
         goto err;
     }
 
@@ -281,6 +279,7 @@ metadata_cache_update(GHashTable *mdc, lrmd_rsc_info_t *rsc,
             struct ra_param_s *p = ra_param_from_xml(match);
 
             if (p == NULL) {
+                reason = "Could not allocate memory";
                 goto err;
             }
             if (pcmk_is_set(p->rap_flags, ra_param_private)) {
@@ -311,6 +310,9 @@ metadata_cache_update(GHashTable *mdc, lrmd_rsc_info_t *rsc,
     return md;
 
 err:
+    crm_warn("Unable to update metadata for %s (%s%s%s:%s): %s",
+             rsc->id, rsc->standard, ((rsc->provider == NULL)? "" : ":"),
+             ((rsc->provider == NULL)? "" : rsc->provider), rsc->type, reason);
     free(key);
     free_xml(metadata);
     metadata_free(md);
@@ -377,13 +379,8 @@ controld_get_rsc_metadata(lrm_state_t *lrm_state, lrmd_rsc_info_t *rsc,
         return NULL;
     }
 
-    metadata = metadata_cache_update(lrm_state->metadata_cache, rsc,
-                                     metadata_str);
+    metadata = controld_cache_metadata(lrm_state->metadata_cache, rsc,
+                                       metadata_str);
     free(metadata_str);
-    if (metadata == NULL) {
-        crm_warn("Failed to update metadata for %s (%s%s%s:%s)",
-                 rsc->id, rsc->standard, ((rsc->provider == NULL)? "" : ":"),
-                 ((rsc->provider == NULL)? "" : rsc->provider), rsc->type);
-    }
     return metadata;
 }
diff --git a/daemons/controld/controld_metadata.h b/daemons/controld/controld_metadata.h
index 7354f94..3903cce 100644
--- a/daemons/controld/controld_metadata.h
+++ b/daemons/controld/controld_metadata.h
@@ -73,9 +73,9 @@ void metadata_cache_free(GHashTable *mdc);
 void metadata_cache_reset(GHashTable *mdc);
 void metadata_cache_fini(void);
 
-struct ra_metadata_s *metadata_cache_update(GHashTable *mdc,
-                                            lrmd_rsc_info_t *rsc,
-                                            const char *metadata_str);
+struct ra_metadata_s *controld_cache_metadata(GHashTable *mdc,
+                                              const lrmd_rsc_info_t *rsc,
+                                              const char *metadata_str);
 struct ra_metadata_s *controld_get_rsc_metadata(lrm_state_t *lrm_state,
                                                 lrmd_rsc_info_t *rsc,
                                                 uint32_t source);
-- 
2.31.1

From 6738e4d1aef78955c796f0afa431d656f25c8179 Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Thu, 15 Sep 2022 13:33:36 -0500
Subject: [PATCH 17/24] Fix: controller: pre-load agent metadata asynchronously

The controller needs resource agent metadata to record digests with pending and
completed resource actions.

Previously, metadata was collected synchronously when needed. This caused
several problems, two of which are fixed here for most actions: synchronous
execution blocks the controller from doing anything else (and if the agent's
metadata action tries to contact the controller, that blocks everything until
the action times out), and the metadata action ate into the real action's
timeout.

Now, if we're likely to need metadata for an action, attempt to get it
asynchronously before executing that action, so the metadata is available in
cache when needed.

This is not a complete solution, as there are other code paths that might
require metadata and still lead to synchronous execution, but it handles the
most important cases.

Fixes T554
---
 daemons/controld/controld_execd.c    | 105 +++++++++++++++++++++++----
 daemons/controld/controld_metadata.c |  22 +++---
 2 files changed, 102 insertions(+), 25 deletions(-)

diff --git a/daemons/controld/controld_execd.c b/daemons/controld/controld_execd.c
index 1ffcc64..6b65f52 100644
--- a/daemons/controld/controld_execd.c
+++ b/daemons/controld/controld_execd.c
@@ -670,7 +670,6 @@ build_operation_update(xmlNode * parent, lrmd_rsc_info_t * rsc, lrmd_event_data_
     struct ra_metadata_s *metadata = NULL;
     const char *caller_version = NULL;
     lrm_state_t *lrm_state = NULL;
-    uint32_t metadata_source = controld_metadata_from_agent;
 
     if (op == NULL) {
         return FALSE;
@@ -703,19 +702,14 @@ build_operation_update(xmlNode * parent, lrmd_rsc_info_t * rsc, lrmd_event_data_
         return TRUE;
     }
 
-    /* Getting meta-data from cache is OK unless this is a successful start
-     * action -- always refresh from the agent for those, in case the
-     * resource agent was updated.
+    /* Ideally the metadata is cached, and the agent is just a fallback.
      *
-     * @TODO Only refresh the meta-data after starts if the agent actually
-     * changed (using something like inotify, or a hash or modification time of
-     * the agent executable).
+     * @TODO Go through all callers and ensure they get metadata asynchronously
+     * first.
      */
-    if ((op->op_status != PCMK_EXEC_DONE) || (op->rc != target_rc)
-        || !pcmk__str_eq(op->op_type, CRMD_ACTION_START, pcmk__str_none)) {
-        metadata_source |= controld_metadata_from_cache;
-    }
-    metadata = controld_get_rsc_metadata(lrm_state, rsc, metadata_source);
+    metadata = controld_get_rsc_metadata(lrm_state, rsc,
+                                         controld_metadata_from_agent
+                                         |controld_metadata_from_cache);
     if (metadata == NULL) {
         return TRUE;
     }
@@ -1673,6 +1667,56 @@ do_lrm_delete(ha_msg_input_t *input, lrm_state_t *lrm_state,
                     user_name, input, unregister);
 }
 
+// User data for asynchronous metadata execution
+struct metadata_cb_data {
+    lrmd_rsc_info_t *rsc;   // Copy of resource information
+    xmlNode *input_xml;     // Copy of FSA input XML
+};
+
+static struct metadata_cb_data *
+new_metadata_cb_data(lrmd_rsc_info_t *rsc, xmlNode *input_xml)
+{
+    struct metadata_cb_data *data = NULL;
+
+    data = calloc(1, sizeof(struct metadata_cb_data));
+    CRM_ASSERT(data != NULL);
+    data->input_xml = copy_xml(input_xml);
+    data->rsc = lrmd_copy_rsc_info(rsc);
+    return data;
+}
+
+static void
+free_metadata_cb_data(struct metadata_cb_data *data)
+{
+    lrmd_free_rsc_info(data->rsc);
+    free_xml(data->input_xml);
+    free(data);
+}
+
+/*!
+ * \internal
+ * \brief Execute an action after metadata has been retrieved
+ *
+ * \param[in] pid        Ignored
+ * \param[in] result     Result of metadata action
+ * \param[in] user_data  Metadata callback data
+ */
+static void
+metadata_complete(int pid, const pcmk__action_result_t *result, void *user_data)
+{
+    struct metadata_cb_data *data = (struct metadata_cb_data *) user_data;
+
+    struct ra_metadata_s *md = NULL;
+    lrm_state_t *lrm_state = lrm_state_find(lrm_op_target(data->input_xml));
+
+    if ((lrm_state != NULL) && pcmk__result_ok(result)) {
+        md = controld_cache_metadata(lrm_state->metadata_cache, data->rsc,
+                                     result->action_stdout);
+    }
+    do_lrm_rsc_op(lrm_state, data->rsc, data->input_xml, md);
+    free_metadata_cb_data(data);
+}
+
 /*	 A_LRM_INVOKE	*/
 void
 do_lrm_invoke(long long action,
@@ -1811,9 +1855,40 @@ do_lrm_invoke(long long action,
         } else {
             struct ra_metadata_s *md = NULL;
 
-            md = controld_get_rsc_metadata(lrm_state, rsc,
-                                           controld_metadata_from_cache);
-            do_lrm_rsc_op(lrm_state, rsc, input->xml, md);
+            /* Getting metadata from cache is OK except for start actions --
+             * always refresh from the agent for those, in case the resource
+             * agent was updated.
+             *
+             * @TODO Only refresh metadata for starts if the agent actually
+             * changed (using something like inotify, or a hash or modification
+             * time of the agent executable).
+             */
+            if (strcmp(operation, CRMD_ACTION_START) != 0) {
+                md = controld_get_rsc_metadata(lrm_state, rsc,
+                                               controld_metadata_from_cache);
+            }
+
+            if ((md == NULL) && crm_op_needs_metadata(rsc->standard,
+                                                      operation)) {
+                /* Most likely, we'll need the agent metadata to record the
+                 * pending operation and the operation result. Get it now rather
+                 * than wait until then, so the metadata action doesn't eat into
+                 * the real action's timeout.
+                 *
+                 * @TODO Metadata is retrieved via direct execution of the
+                 * agent, which has a couple of related issues: the executor
+                 * should execute agents, not the controller; and metadata for
+                 * Pacemaker Remote nodes should be collected on those nodes,
+                 * not locally.
+                 */
+                struct metadata_cb_data *data = NULL;
+
+                data = new_metadata_cb_data(rsc, input->xml);
+                (void) lrmd__metadata_async(rsc, metadata_complete,
+                                            (void *) data);
+            } else {
+                do_lrm_rsc_op(lrm_state, rsc, input->xml, md);
+            }
         }
 
         lrmd_free_rsc_info(rsc);
diff --git a/daemons/controld/controld_metadata.c b/daemons/controld/controld_metadata.c
index ddcd5db..f4b7560 100644
--- a/daemons/controld/controld_metadata.c
+++ b/daemons/controld/controld_metadata.c
@@ -356,17 +356,19 @@ controld_get_rsc_metadata(lrm_state_t *lrm_state, lrmd_rsc_info_t *rsc,
         return NULL;
     }
 
-    /* For now, we always collect resource agent meta-data via a local,
-     * synchronous, direct execution of the agent. This has multiple issues:
-     * the executor should execute agents, not the controller; meta-data for
-     * Pacemaker Remote nodes should be collected on those nodes, not
-     * locally; and the meta-data call shouldn't eat into the timeout of the
-     * real action being performed.
+    /* For most actions, metadata was cached asynchronously before action
+     * execution (via metadata_complete()).
      *
-     * These issues are planned to be addressed by having the scheduler
-     * schedule a meta-data cache check at the beginning of each transition.
-     * Once that is working, this block will only be a fallback in case the
-     * initial collection fails.
+     * However if that failed, and for other actions, retrieve the metadata now
+     * via a local, synchronous, direct execution of the agent.
+     *
+     * This has multiple issues, which is why this is just a fallback: the
+     * executor should execute agents, not the controller; metadata for
+     * Pacemaker Remote nodes should be collected on those nodes, not locally;
+     * the metadata call shouldn't eat into the timeout of the real action being
+     * performed; and the synchronous call blocks the controller (which also
+     * means that if the metadata action tries to contact the controller,
+     * everything will hang until the timeout).
      */
     rc = lrm_state_get_metadata(lrm_state, rsc->standard, rsc->provider,
                                 rsc->type, &metadata_str, 0);
-- 
2.31.1

From 1fad9acf0c8544dbbe2a4092ed3debe99453712a Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Mon, 19 Sep 2022 15:19:06 -0500
Subject: [PATCH 18/24] Low: libstonithd: return CRM_EX_NOSUCH for bad agent
 namespace

Callers can't rely on a particular exit code scheme at this point,
but it doesn't hurt
---
 lib/fencing/st_client.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/fencing/st_client.c b/lib/fencing/st_client.c
index 91075bd..d41b066 100644
--- a/lib/fencing/st_client.c
+++ b/lib/fencing/st_client.c
@@ -2451,7 +2451,7 @@ stonith__metadata_async(const char *agent, int timeout_sec,
         default:
             {
                 pcmk__action_result_t result = {
-                    .exit_status = CRM_EX_ERROR,
+                    .exit_status = CRM_EX_NOSUCH,
                     .execution_status = PCMK_EXEC_ERROR_HARD,
                     .exit_reason = crm_strdup_printf("No such agent '%s'",
                                                      agent),
-- 
2.31.1

From f54b7807b8a642311258d407c95d49916ed09fa8 Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Mon, 19 Sep 2022 15:23:43 -0500
Subject: [PATCH 19/24] Low: liblrmd: consider invalid agent specification a
 fatal error

---
 lib/lrmd/lrmd_client.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/lib/lrmd/lrmd_client.c b/lib/lrmd/lrmd_client.c
index 4b16bf0..d691dce 100644
--- a/lib/lrmd/lrmd_client.c
+++ b/lib/lrmd/lrmd_client.c
@@ -2402,7 +2402,8 @@ lrmd__metadata_async(lrmd_rsc_info_t *rsc,
     CRM_CHECK(callback != NULL, return EINVAL);
 
     if ((rsc == NULL) || (rsc->standard == NULL) || (rsc->type == NULL)) {
-        pcmk__set_result(&result, PCMK_OCF_NOT_CONFIGURED, PCMK_EXEC_ERROR,
+        pcmk__set_result(&result, PCMK_OCF_NOT_CONFIGURED,
+                         PCMK_EXEC_ERROR_FATAL,
                          "Invalid resource specification");
         callback(0, &result, user_data);
         pcmk__reset_result(&result);
-- 
2.31.1

From 2c5eaf642eccb1a9960eca7db3158bf52b843385 Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Mon, 19 Sep 2022 15:25:12 -0500
Subject: [PATCH 20/24] Low: liblrmd: use resource ID for metadata actions when
 available

---
 lib/lrmd/lrmd_client.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/lib/lrmd/lrmd_client.c b/lib/lrmd/lrmd_client.c
index d691dce..a6a237c 100644
--- a/lib/lrmd/lrmd_client.c
+++ b/lib/lrmd/lrmd_client.c
@@ -2416,11 +2416,11 @@ lrmd__metadata_async(lrmd_rsc_info_t *rsc,
                                        callback, user_data);
     }
 
-    action = services__create_resource_action(rsc->type, rsc->standard,
-                                              rsc->provider, rsc->type,
-                                              CRMD_ACTION_METADATA, 0,
-                                              CRMD_METADATA_CALL_TIMEOUT, NULL,
-                                              0);
+    action = services__create_resource_action((rsc->id != NULL) ? rsc->id : rsc->type,
+                                              rsc->standard, rsc->provider,
+                                              rsc->type, CRMD_ACTION_METADATA,
+                                              0, CRMD_METADATA_CALL_TIMEOUT,
+                                              NULL, 0);
     if (action == NULL) {
         pcmk__set_result(&result, PCMK_OCF_UNKNOWN_ERROR, PCMK_EXEC_ERROR,
                          "Out of memory");
-- 
2.31.1

From d60727b2681bf222284f3cdac0cfa452cf7bfaf9 Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Mon, 19 Sep 2022 15:27:11 -0500
Subject: [PATCH 21/24] Refactor: controller: executor query can assume local
 node

---
 daemons/controld/controld_execd.c       | 6 +++---
 daemons/controld/controld_fsa.h         | 4 ++--
 daemons/controld/controld_join_client.c | 2 +-
 daemons/controld/controld_join_dc.c     | 2 +-
 4 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/daemons/controld/controld_execd.c b/daemons/controld/controld_execd.c
index 6b65f52..16c13ba 100644
--- a/daemons/controld/controld_execd.c
+++ b/daemons/controld/controld_execd.c
@@ -796,16 +796,16 @@ build_active_RAs(lrm_state_t * lrm_state, xmlNode * rsc_list)
 }
 
 xmlNode *
-controld_query_executor_state(const char *node_name)
+controld_query_executor_state(void)
 {
     xmlNode *xml_state = NULL;
     xmlNode *xml_data = NULL;
     xmlNode *rsc_list = NULL;
     crm_node_t *peer = NULL;
-    lrm_state_t *lrm_state = lrm_state_find(node_name);
+    lrm_state_t *lrm_state = lrm_state_find(fsa_our_uname);
 
     if (!lrm_state) {
-        crm_err("Could not find executor state for node %s", node_name);
+        crm_err("Could not find executor state for node %s", fsa_our_uname);
         return NULL;
     }
 
diff --git a/daemons/controld/controld_fsa.h b/daemons/controld/controld_fsa.h
index 296232f..d137310 100644
--- a/daemons/controld/controld_fsa.h
+++ b/daemons/controld/controld_fsa.h
@@ -1,5 +1,5 @@
 /*
- * Copyright 2004-2021 the Pacemaker project contributors
+ * Copyright 2004-2022 the Pacemaker project contributors
  *
  * The version control history for this file may have further details.
  *
@@ -518,7 +518,7 @@ extern gboolean ever_had_quorum;
 // These should be moved elsewhere
 void do_update_cib_nodes(gboolean overwrite, const char *caller);
 int crmd_cib_smart_opt(void);
-xmlNode *controld_query_executor_state(const char *node_name);
+xmlNode *controld_query_executor_state(void);
 
 const char *fsa_input2string(enum crmd_fsa_input input);
 const char *fsa_state2string(enum crmd_fsa_state state);
diff --git a/daemons/controld/controld_join_client.c b/daemons/controld/controld_join_client.c
index 6485856..bfec430 100644
--- a/daemons/controld/controld_join_client.c
+++ b/daemons/controld/controld_join_client.c
@@ -268,7 +268,7 @@ do_cl_join_finalize_respond(long long action,
     update_dc_expected(input->msg);
 
     /* send our status section to the DC */
-    tmp1 = controld_query_executor_state(fsa_our_uname);
+    tmp1 = controld_query_executor_state();
     if (tmp1 != NULL) {
         xmlNode *reply = create_request(CRM_OP_JOIN_CONFIRM, tmp1, fsa_our_dc,
                                         CRM_SYSTEM_DC, CRM_SYSTEM_CRMD, NULL);
diff --git a/daemons/controld/controld_join_dc.c b/daemons/controld/controld_join_dc.c
index 9386182..9a8ea3e 100644
--- a/daemons/controld/controld_join_dc.c
+++ b/daemons/controld/controld_join_dc.c
@@ -591,7 +591,7 @@ do_dc_join_ack(long long action,
     }
     controld_delete_node_state(join_from, section, cib_scope_local);
     if (pcmk__str_eq(join_from, fsa_our_uname, pcmk__str_casei)) {
-        xmlNode *now_dc_lrmd_state = controld_query_executor_state(fsa_our_uname);
+        xmlNode *now_dc_lrmd_state = controld_query_executor_state();
 
         if (now_dc_lrmd_state != NULL) {
             fsa_cib_update(XML_CIB_TAG_STATUS, now_dc_lrmd_state,
-- 
2.31.1

From 5069bbfeedd91e86b8faa3a641c6064900c664a9 Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Tue, 20 Sep 2022 10:18:48 -0500
Subject: [PATCH 22/24] Log: controller: add messages when getting agent
 metadata

---
 daemons/controld/controld_execd.c    |  5 +++++
 daemons/controld/controld_metadata.c | 10 ++++++++++
 2 files changed, 15 insertions(+)

diff --git a/daemons/controld/controld_execd.c b/daemons/controld/controld_execd.c
index 16c13ba..aa1c607 100644
--- a/daemons/controld/controld_execd.c
+++ b/daemons/controld/controld_execd.c
@@ -1884,6 +1884,11 @@ do_lrm_invoke(long long action,
                 struct metadata_cb_data *data = NULL;
 
                 data = new_metadata_cb_data(rsc, input->xml);
+                crm_info("Retrieving metadata for %s (%s%s%s:%s) asynchronously",
+                         rsc->id, rsc->standard,
+                         ((rsc->provider == NULL)? "" : ":"),
+                         ((rsc->provider == NULL)? "" : rsc->provider),
+                         rsc->type);
                 (void) lrmd__metadata_async(rsc, metadata_complete,
                                             (void *) data);
             } else {
diff --git a/daemons/controld/controld_metadata.c b/daemons/controld/controld_metadata.c
index f4b7560..c756a79 100644
--- a/daemons/controld/controld_metadata.c
+++ b/daemons/controld/controld_metadata.c
@@ -348,6 +348,11 @@ controld_get_rsc_metadata(lrm_state_t *lrm_state, lrmd_rsc_info_t *rsc,
             free(key);
         }
         if (metadata != NULL) {
+            crm_debug("Retrieved metadata for %s (%s%s%s:%s) from cache",
+                      rsc->id, rsc->standard,
+                      ((rsc->provider == NULL)? "" : ":"),
+                      ((rsc->provider == NULL)? "" : rsc->provider),
+                      rsc->type);
             return metadata;
         }
     }
@@ -370,6 +375,11 @@ controld_get_rsc_metadata(lrm_state_t *lrm_state, lrmd_rsc_info_t *rsc,
      * means that if the metadata action tries to contact the controller,
      * everything will hang until the timeout).
      */
+    crm_debug("Retrieving metadata for %s (%s%s%s:%s) synchronously",
+              rsc->id, rsc->standard,
+              ((rsc->provider == NULL)? "" : ":"),
+              ((rsc->provider == NULL)? "" : rsc->provider),
+              rsc->type);
     rc = lrm_state_get_metadata(lrm_state, rsc->standard, rsc->provider,
                                 rsc->type, &metadata_str, 0);
     if (rc != pcmk_ok) {
-- 
2.31.1

From 67d81f7d4bdcb6a888e13bcb29954165dce6cace Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Wed, 14 Sep 2022 14:36:44 -0500
Subject: [PATCH 23/24] Test: cts-lab: allow any whitespace in "Recover"
 messages

This seems to have always been multiple spaces, not sure what happened
---
 cts/lab/CTStests.py | 12 ++++++------
 cts/lab/patterns.py |  4 ++--
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/cts/lab/CTStests.py b/cts/lab/CTStests.py
index 5535177..8b56758 100644
--- a/cts/lab/CTStests.py
+++ b/cts/lab/CTStests.py
@@ -1,7 +1,7 @@
 """ Test-specific classes for Pacemaker's Cluster Test Suite (CTS)
 """
 
-__copyright__ = "Copyright 2000-2021 the Pacemaker project contributors"
+__copyright__ = "Copyright 2000-2022 the Pacemaker project contributors"
 __license__ = "GNU General Public License version 2 or later (GPLv2+) WITHOUT ANY WARRANTY"
 
 #
@@ -1225,7 +1225,7 @@ class MaintenanceMode(CTSTest):
         '''Return list of errors which should be ignored'''
         return [
             r"Updating failcount for %s" % self.rid,
-            r"schedulerd.*: Recover %s\s*\(.*\)" % self.rid,
+            r"schedulerd.*: Recover\s+%s\s+\(.*\)" % self.rid,
             r"Unknown operation: fail",
             self.templates["Pat:RscOpOK"] % (self.action, self.rid),
             r"(ERROR|error).*: Action %s_%s_%d .* initiated outside of a transition" % (self.rid, self.action, self.interval),
@@ -1324,7 +1324,7 @@ class ResourceRecover(CTSTest):
         '''Return list of errors which should be ignored'''
         return [
             r"Updating failcount for %s" % self.rid,
-            r"schedulerd.*: Recover (%s|%s)\s*\(.*\)" % (self.rid, self.rid_alt),
+            r"schedulerd.*: Recover\s+(%s|%s)\s+\(.*\)" % (self.rid, self.rid_alt),
             r"Unknown operation: fail",
             self.templates["Pat:RscOpOK"] % (self.action, self.rid),
             r"(ERROR|error).*: Action %s_%s_%d .* initiated outside of a transition" % (self.rid, self.action, self.interval),
@@ -2559,7 +2559,7 @@ class RemoteLXC(CTSTest):
         '''Return list of errors which should be ignored'''
         return [
             r"Updating failcount for ping",
-            r"schedulerd.*: Recover (ping|lxc-ms|container)\s*\(.*\)",
+            r"schedulerd.*: Recover\s+(ping|lxc-ms|container)\s+\(.*\)",
             # The orphaned lxc-ms resource causes an expected transition error
             # that is a result of the scheduler not having knowledge that the
             # promotable resource used to be a clone. As a result, it looks like that 
@@ -3054,7 +3054,7 @@ class RemoteStonithd(RemoteDriver):
             r"Software caused connection abort",
             r"pacemaker-controld.*:\s+error.*: Operation remote-.*_monitor",
             r"pacemaker-controld.*:\s+error.*: Result of monitor operation for remote-.*",
-            r"schedulerd.*:\s+Recover remote-.*\s*\(.*\)",
+            r"schedulerd.*:\s+Recover\s+remote-.*\s+\(.*\)",
             r"error: Result of monitor operation for .* on remote-.*: Internal communication failure",
         ]
 
@@ -3120,7 +3120,7 @@ class RemoteRscFailure(RemoteDriver):
 
     def errorstoignore(self):
         ignore_pats = [
-            r"schedulerd.*: Recover remote-rsc\s*\(.*\)",
+            r"schedulerd.*: Recover\s+remote-rsc\s+\(.*\)",
             r"Dummy.*: No process state file found",
         ]
 
diff --git a/cts/lab/patterns.py b/cts/lab/patterns.py
index 90cac73..6e718f7 100644
--- a/cts/lab/patterns.py
+++ b/cts/lab/patterns.py
@@ -66,7 +66,7 @@ class BasePatterns(object):
 
             "Pat:Fencing_start"   : r"Requesting peer fencing .* targeting %s",
             "Pat:Fencing_ok"      : r"pacemaker-fenced.*:\s*Operation .* targeting %s by .* for .*@.*: OK",
-            "Pat:Fencing_recover" : r"pacemaker-schedulerd.*: Recover %s",
+            "Pat:Fencing_recover" : r"pacemaker-schedulerd.*: Recover\s+%s",
             "Pat:Fencing_active"  : r"stonith resource .* is active on 2 nodes (attempting recovery)",
             "Pat:Fencing_probe"   : r"pacemaker-controld.* Result of probe operation for %s on .*: Error",
 
@@ -180,7 +180,7 @@ class crm_corosync(BasePatterns):
             r"Parameters to .* action changed:",
             r"Parameters to .* changed",
             r"pacemakerd.*\[[0-9]+\] terminated( with signal| as IPC server|$)",
-            r"pacemaker-schedulerd.*Recover .*\(.* -\> .*\)",
+            r"pacemaker-schedulerd.*Recover\s+.*\(.* -\> .*\)",
             r"rsyslogd.* imuxsock lost .* messages from pid .* due to rate-limiting",
             r"Peer is not part of our cluster",
             r"We appear to be in an election loop",
-- 
2.31.1

From aefc2a846ad7400fdbc5185e5dd44bceb3f9fcb5 Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Mon, 19 Sep 2022 15:55:42 -0500
Subject: [PATCH 24/24] Test: cts-lab: match parentheses correctly

---
 cts/lab/patterns.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/cts/lab/patterns.py b/cts/lab/patterns.py
index 6e718f7..856fffb 100644
--- a/cts/lab/patterns.py
+++ b/cts/lab/patterns.py
@@ -271,6 +271,7 @@ class crm_corosync(BasePatterns):
         ]
         self.components["pacemaker-based-ignore"] = [
             r"pacemaker-execd.*Connection to (fencer|stonith-ng).* (closed|failed|lost)",
+            r"pacemaker-controld.*:\s+Result of .* operation for Fencing.*Error \(Lost connection to fencer\)",
             # This is overbroad, but we don't have a way to say that only
             # certain transition errors are acceptable (if the fencer respawns,
             # fence devices may appear multiply active). We have to rely on
@@ -328,7 +329,7 @@ class crm_corosync(BasePatterns):
             r"crit:.*Fencing daemon connection failed",
             r"error:.*Fencer connection failed \(will retry\)",
             r"Connection to (fencer|stonith-ng) failed, finalizing .* pending operations",
-            r"pacemaker-controld.*:\s+Result of .* operation for Fencing.*Error",
+            r"pacemaker-controld.*:\s+Result of .* operation for Fencing.*Error \(Lost connection to fencer\)",
             # This is overbroad, but we don't have a way to say that only
             # certain transition errors are acceptable (if the fencer respawns,
             # fence devices may appear multiply active). We have to rely on
-- 
2.31.1