From a35dfe0b76555f30dda4c9d96630866de40322b3 Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Tue, 13 Sep 2022 14:40:24 -0500
Subject: [PATCH 01/24] Low: fencing: use a default timeout with metadata and
 validate

If the caller did not specify a timeout, use a default in
stonith_api_operations_t:metadata() and validate(). (Timeout is currently
ignored past that point, so this has no effect yet.)

Also, rename timeout argument for clarity.
---
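A minimal caller sketch (not part of the patch) of the new default-timeout
behavior, using the public libstonithd API from <crm/stonith-ng.h>;
"fence_xvm" is only an example agent name. Passing a nonpositive timeout_sec
now selects CRMD_METADATA_CALL_TIMEOUT instead of being used literally:

    #include <stdio.h>
    #include <stdlib.h>
    #include <crm/common/results.h>
    #include <crm/stonith-ng.h>

    int
    main(void)
    {
        stonith_t *st = stonith_api_new();
        char *output = NULL;

        if (st == NULL) {
            return 1;
        }
        /* No fencer connection is needed: metadata is executed directly */
        if (st->cmds->metadata(st, st_opt_sync_call, "fence_xvm", NULL,
                               &output, 0) == pcmk_ok) {
            printf("%s\n", output);
        }
        free(output);
        stonith_api_delete(st);
        return 0;
    }
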
 lib/fencing/st_client.c | 23 ++++++++++++++++-------
 1 file changed, 16 insertions(+), 7 deletions(-)

diff --git a/lib/fencing/st_client.c b/lib/fencing/st_client.c
index 2b0d308..28791ff 100644
--- a/lib/fencing/st_client.c
+++ b/lib/fencing/st_client.c
@@ -504,7 +504,8 @@ stonith_api_device_list(stonith_t * stonith, int call_options, const char *names
 
 static int
 stonith_api_device_metadata(stonith_t * stonith, int call_options, const char *agent,
-                            const char *namespace, char **output, int timeout)
+                            const char *namespace, char **output,
+                            int timeout_sec)
 {
     /* By executing meta-data directly, we can get it from stonith_admin when
      * the cluster is not running, which is important for higher-level tools.
@@ -512,16 +513,20 @@ stonith_api_device_metadata(stonith_t * stonith, int call_options, const char *a
 
     enum stonith_namespace ns = stonith_get_namespace(agent, namespace);
 
+    if (timeout_sec <= 0) {
+        timeout_sec = CRMD_METADATA_CALL_TIMEOUT;
+    }
+
     crm_trace("Looking up metadata for %s agent %s",
               stonith_namespace2text(ns), agent);
 
     switch (ns) {
         case st_namespace_rhcs:
-            return stonith__rhcs_metadata(agent, timeout, output);
+            return stonith__rhcs_metadata(agent, timeout_sec, output);
 
 #if HAVE_STONITH_STONITH_H
         case st_namespace_lha:
-            return stonith__lha_metadata(agent, timeout, output);
+            return stonith__lha_metadata(agent, timeout_sec, output);
 #endif
 
         default:
@@ -1684,8 +1689,8 @@ stonith_api_delete(stonith_t * stonith)
 static int
 stonith_api_validate(stonith_t *st, int call_options, const char *rsc_id,
                      const char *namespace_s, const char *agent,
-                     stonith_key_value_t *params, int timeout, char **output,
-                     char **error_output)
+                     stonith_key_value_t *params, int timeout_sec,
+                     char **output, char **error_output)
 {
     /* Validation should be done directly via the agent, so we can get it from
      * stonith_admin when the cluster is not running, which is important for
@@ -1731,17 +1736,21 @@ stonith_api_validate(stonith_t *st, int call_options, const char *rsc_id,
         *error_output = NULL;
     }
 
+    if (timeout_sec <= 0) {
+        timeout_sec = CRMD_METADATA_CALL_TIMEOUT; // Questionable
+    }
+
     switch (stonith_get_namespace(agent, namespace_s)) {
         case st_namespace_rhcs:
             rc = stonith__rhcs_validate(st, call_options, target, agent,
-                                        params_table, host_arg, timeout,
+                                        params_table, host_arg, timeout_sec,
                                         output, error_output);
             break;
 
 #if HAVE_STONITH_STONITH_H
         case st_namespace_lha:
             rc = stonith__lha_validate(st, call_options, target, agent,
-                                       params_table, timeout, output,
+                                       params_table, timeout_sec, output,
                                        error_output);
             break;
 #endif
-- 
2.31.1

From c2a863b7daeb829c0210d87a2f1503c1cf4dc7a5 Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Tue, 13 Sep 2022 14:00:00 -0500
Subject: [PATCH 02/24] Doc: fencer: improve
 stonith_api_operations_t:metadata() description

---
 include/crm/stonith-ng.h | 15 +++++++++++----
 lib/fencing/st_client.c  |  7 ++++---
 2 files changed, 15 insertions(+), 7 deletions(-)

diff --git a/include/crm/stonith-ng.h b/include/crm/stonith-ng.h
index 4fe52ef..a41d411 100644
--- a/include/crm/stonith-ng.h
+++ b/include/crm/stonith-ng.h
@@ -206,14 +206,21 @@ typedef struct stonith_api_operations_s
         stonith_t *st, int options, const char *node, int level, stonith_key_value_t *device_list);
 
     /*!
-     * \brief Get the metadata documentation for a resource.
+     * \brief Retrieve a fence agent's metadata
      *
-     * \note Value is returned in output.  Output must be freed when set.
+     * \param[in,out] stonith       Fencer connection
+     * \param[in]     call_options  Group of enum stonith_call_options
+     *                              (currently ignored)
+     * \param[in]     agent         Fence agent to query
+     * \param[in]     namespace     Namespace of fence agent to query (optional)
+     * \param[out]    output        Where to store metadata
+     * \param[in]     timeout_sec   Error if not complete within this time
      *
      * \return Legacy Pacemaker return code
+     * \note The caller is responsible for freeing *output using free().
      */
-    int (*metadata)(stonith_t *st, int options,
-            const char *device, const char *provider, char **output, int timeout);
+    int (*metadata)(stonith_t *stonith, int call_options, const char *agent,
+                    const char *namespace, char **output, int timeout_sec);
 
     /*!
      * \brief Retrieve a list of installed stonith agents
diff --git a/lib/fencing/st_client.c b/lib/fencing/st_client.c
index 28791ff..6c252bc 100644
--- a/lib/fencing/st_client.c
+++ b/lib/fencing/st_client.c
@@ -502,10 +502,11 @@ stonith_api_device_list(stonith_t * stonith, int call_options, const char *names
     return count;
 }
 
+// See stonith_api_operations_t:metadata() documentation
 static int
-stonith_api_device_metadata(stonith_t * stonith, int call_options, const char *agent,
-                            const char *namespace, char **output,
-                            int timeout_sec)
+stonith_api_device_metadata(stonith_t *stonith, int call_options,
+                            const char *agent, const char *namespace,
+                            char **output, int timeout_sec)
 {
     /* By executing meta-data directly, we can get it from stonith_admin when
      * the cluster is not running, which is important for higher-level tools.
-- 
2.31.1

From 9beff34a0d39425ef470e59e251a8ca7c08e69a0 Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Tue, 13 Sep 2022 14:16:54 -0500
Subject: [PATCH 03/24] Doc: fencing: add doxygen block for
 stonith__action_create()

... and rename a couple arguments for clarity
---
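As a sketch only (not part of the patch), the internal API documented below is
typically used along these lines inside libfencing and the fencer; the agent,
target, and parameter names are placeholders, and the in-tree internal headers
are assumed:

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include <glib.h>
    #include <crm/common/results_internal.h>
    #include <crm/fencing/internal.h>

    static void
    example_off_action(void)
    {
        GHashTable *args = g_hash_table_new_full(g_str_hash, g_str_equal,
                                                 free, free);
        stonith_action_t *action = NULL;
        pcmk__action_result_t *result = NULL;
        int rc;

        g_hash_table_insert(args, strdup("port"), strdup("node1"));

        /* 30-second timeout; "port" names the agent parameter that should
         * receive the target name
         */
        action = stonith_action_create("fence_dummy", "off", "node1", 0,
                                       30, args, NULL, "port");
        rc = stonith__execute(action);      /* synchronous execution */
        result = stonith__action_result(action);
        if (result != NULL) {
            printf("exit status %d (execution rc %d)\n",
                   result->exit_status, rc);
        }
        stonith__destroy_action(action);
        g_hash_table_destroy(args);
    }
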
 include/crm/fencing/internal.h |  4 ++--
 lib/fencing/st_actions.c       | 33 ++++++++++++++++++++++++---------
 2 files changed, 26 insertions(+), 11 deletions(-)

diff --git a/include/crm/fencing/internal.h b/include/crm/fencing/internal.h
index d2b49f8..e2ca85e 100644
--- a/include/crm/fencing/internal.h
+++ b/include/crm/fencing/internal.h
@@ -50,10 +50,10 @@ struct stonith_action_s;
 typedef struct stonith_action_s stonith_action_t;
 
 stonith_action_t *stonith_action_create(const char *agent,
-                                        const char *_action,
+                                        const char *action_name,
                                         const char *victim,
                                         uint32_t victim_nodeid,
-                                        int timeout,
+                                        int timeout_sec,
                                         GHashTable * device_args,
                                         GHashTable * port_map,
                                         const char * host_arg);
diff --git a/lib/fencing/st_actions.c b/lib/fencing/st_actions.c
index b3429f6..d16fa33 100644
--- a/lib/fencing/st_actions.c
+++ b/lib/fencing/st_actions.c
@@ -232,27 +232,42 @@ stonith__action_result(stonith_action_t *action)
 }
 
 #define FAILURE_MAX_RETRIES 2
+
+/*!
+ * \internal
+ * \brief Create a new fencing action to be executed
+ *
+ * \param[in] agent          Fence agent to use
+ * \param[in] action_name    Fencing action to be executed
+ * \param[in] victim         Name of target of fencing action (if known)
+ * \param[in] victim_nodeid  Node ID of target of fencing action (if known)
+ * \param[in] timeout_sec    Timeout to be used when executing action
+ * \param[in] device_args    Parameters to pass to fence agent
+ * \param[in] port_map       Mapping of target names to device ports
+ * \param[in] host_arg       Agent parameter used to pass target name
+ *
+ * \return Newly created fencing action (asserts on error, never NULL)
+ */
 stonith_action_t *
 stonith_action_create(const char *agent,
-                      const char *_action,
+                      const char *action_name,
                       const char *victim,
                       uint32_t victim_nodeid,
-                      int timeout, GHashTable * device_args,
+                      int timeout_sec, GHashTable * device_args,
                       GHashTable * port_map, const char *host_arg)
 {
-    stonith_action_t *action;
+    stonith_action_t *action = calloc(1, sizeof(stonith_action_t));
 
-    action = calloc(1, sizeof(stonith_action_t));
     CRM_ASSERT(action != NULL);
 
-    action->args = make_args(agent, _action, victim, victim_nodeid,
+    action->args = make_args(agent, action_name, victim, victim_nodeid,
                              device_args, port_map, host_arg);
     crm_debug("Preparing '%s' action for %s using agent %s",
-              _action, (victim? victim : "no target"), agent);
+              action_name, (victim? victim : "no target"), agent);
     action->agent = strdup(agent);
-    action->action = strdup(_action);
+    action->action = strdup(action_name);
     pcmk__str_update(&action->victim, victim);
-    action->timeout = action->remaining_timeout = timeout;
+    action->timeout = action->remaining_timeout = timeout_sec;
     action->max_retries = FAILURE_MAX_RETRIES;
 
     pcmk__set_result(&(action->result), PCMK_OCF_UNKNOWN, PCMK_EXEC_UNKNOWN,
@@ -262,7 +277,7 @@ stonith_action_create(const char *agent,
         char buffer[512];
         const char *value = NULL;
 
-        snprintf(buffer, sizeof(buffer), "pcmk_%s_retries", _action);
+        snprintf(buffer, sizeof(buffer), "pcmk_%s_retries", action_name);
         value = g_hash_table_lookup(device_args, buffer);
 
         if (value) {
-- 
2.31.1

From 3001cb016eefff55c55e709247b0c14c331fb330 Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Tue, 13 Sep 2022 14:20:24 -0500
Subject: [PATCH 04/24] Low: fencing: use requested timeout with RHCS metadata
 actions

... instead of a hardcoded 5 seconds, and rename the timeout argument for clarity
---
 lib/fencing/st_rhcs.c | 35 ++++++++++++++++-------------------
 1 file changed, 16 insertions(+), 19 deletions(-)

diff --git a/lib/fencing/st_rhcs.c b/lib/fencing/st_rhcs.c
index dfccff2..5e600d2 100644
--- a/lib/fencing/st_rhcs.c
+++ b/lib/fencing/st_rhcs.c
@@ -112,25 +112,24 @@ stonith_rhcs_parameter_not_required(xmlNode *metadata, const char *parameter)
 }
 
 /*!
- * \brief Execute RHCS-compatible agent's meta-data action
+ * \brief Execute RHCS-compatible agent's metadata action
  *
- * \param[in]  agent    Agent to execute
- * \param[in]  timeout  Action timeout
- * \param[out] metadata Where to store output xmlNode (or NULL to ignore)
- *
- * \todo timeout is currently ignored; shouldn't we use it?
+ * \param[in]  agent        Agent to execute
+ * \param[in]  timeout_sec  Action timeout
+ * \param[out] metadata     Where to store output xmlNode (or NULL to ignore)
  */
 static int
-stonith__rhcs_get_metadata(const char *agent, int timeout, xmlNode **metadata)
+stonith__rhcs_get_metadata(const char *agent, int timeout_sec,
+		           xmlNode **metadata)
 {
     xmlNode *xml = NULL;
     xmlNode *actions = NULL;
     xmlXPathObject *xpathObj = NULL;
-    pcmk__action_result_t *result = NULL;
-    stonith_action_t *action = stonith_action_create(agent, "metadata", NULL, 0,
-                                                     5, NULL, NULL, NULL);
+    stonith_action_t *action = stonith_action_create(agent, "metadata", NULL, 
+                                                     0, timeout_sec, NULL,
+                                                     NULL, NULL);
     int rc = stonith__execute(action);
-    result = stonith__action_result(action);
+    pcmk__action_result_t *result = stonith__action_result(action);
 
     if (result == NULL) {
         if (rc < 0) {
@@ -208,21 +207,19 @@ stonith__rhcs_get_metadata(const char *agent, int timeout, xmlNode **metadata)
 }
 
 /*!
- * \brief Execute RHCS-compatible agent's meta-data action
- *
- * \param[in]  agent    Agent to execute
- * \param[in]  timeout  Action timeout
- * \param[out] output   Where to store action output (or NULL to ignore)
+ * \brief Retrieve metadata for RHCS-compatible fence agent
  *
- * \todo timeout is currently ignored; shouldn't we use it?
+ * \param[in]  agent        Agent to execute
+ * \param[in]  timeout_sec  Action timeout
+ * \param[out] output       Where to store action output (or NULL to ignore)
  */
 int
-stonith__rhcs_metadata(const char *agent, int timeout, char **output)
+stonith__rhcs_metadata(const char *agent, int timeout_sec, char **output)
 {
     char *buffer = NULL;
     xmlNode *xml = NULL;
 
-    int rc = stonith__rhcs_get_metadata(agent, timeout, &xml);
+    int rc = stonith__rhcs_get_metadata(agent, timeout_sec, &xml);
 
     if (rc != pcmk_ok) {
         free_xml(xml);
-- 
2.31.1

From 17dbf449d8b51ea27a89a13f47160a95b0a45149 Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Tue, 13 Sep 2022 14:32:44 -0500
Subject: [PATCH 05/24] Refactor: fencing: make stonith_action_t:async bool

---
 lib/fencing/st_actions.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/lib/fencing/st_actions.c b/lib/fencing/st_actions.c
index d16fa33..abd0d5a 100644
--- a/lib/fencing/st_actions.c
+++ b/lib/fencing/st_actions.c
@@ -9,6 +9,7 @@
 
 #include <crm_internal.h>
 
+#include <stdbool.h>
 #include <stdlib.h>
 #include <stdio.h>
 #include <string.h>
@@ -32,7 +33,7 @@ struct stonith_action_s {
     char *victim;
     GHashTable *args;
     int timeout;
-    int async;
+    bool async;
     void *userdata;
     void (*done_cb) (int pid, const pcmk__action_result_t *result,
                      void *user_data);
@@ -671,7 +672,7 @@ stonith_action_execute_async(stonith_action_t * action,
     action->userdata = userdata;
     action->done_cb = done;
     action->fork_cb = fork_cb;
-    action->async = 1;
+    action->async = true;
 
     return internal_stonith_action_execute(action);
 }
-- 
2.31.1

From 9b0f568dddc928104e6d2d54d5138e0c7ca5b537 Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Tue, 13 Sep 2022 14:59:28 -0500
Subject: [PATCH 06/24] Refactor: fencing: rename
 stonith_action_execute_async()

... to stonith__execute_async(), since it's internal
---
 daemons/fenced/fenced_commands.c |  4 ++--
 include/crm/fencing/internal.h   | 12 +++++-------
 lib/fencing/st_actions.c         | 11 +++++------
 3 files changed, 12 insertions(+), 15 deletions(-)

diff --git a/daemons/fenced/fenced_commands.c b/daemons/fenced/fenced_commands.c
index 94aa6b8..41a1936 100644
--- a/daemons/fenced/fenced_commands.c
+++ b/daemons/fenced/fenced_commands.c
@@ -510,8 +510,8 @@ stonith_device_execute(stonith_device_t * device)
     /* for async exec, exec_rc is negative for early error exit
        otherwise handling of success/errors is done via callbacks */
     cmd->activating_on = device;
-    exec_rc = stonith_action_execute_async(action, (void *)cmd,
-                                           cmd->done_cb, fork_cb);
+    exec_rc = stonith__execute_async(action, (void *)cmd, cmd->done_cb,
+                                     fork_cb);
     if (exec_rc < 0) {
         cmd->activating_on = NULL;
         cmd->done_cb(0, stonith__action_result(action), cmd);
diff --git a/include/crm/fencing/internal.h b/include/crm/fencing/internal.h
index e2ca85e..1797d9a 100644
--- a/include/crm/fencing/internal.h
+++ b/include/crm/fencing/internal.h
@@ -64,13 +64,11 @@ void stonith__xe_set_result(xmlNode *xml, const pcmk__action_result_t *result);
 void stonith__xe_get_result(xmlNode *xml, pcmk__action_result_t *result);
 xmlNode *stonith__find_xe_with_result(xmlNode *xml);
 
-int
-stonith_action_execute_async(stonith_action_t * action,
-                             void *userdata,
-                             void (*done) (int pid,
-                                           const pcmk__action_result_t *result,
-                                           void *user_data),
-                             void (*fork_cb) (int pid, void *user_data));
+int stonith__execute_async(stonith_action_t *action, void *userdata,
+                           void (*done) (int pid,
+                                         const pcmk__action_result_t *result,
+                                         void *user_data),
+                           void (*fork_cb) (int pid, void *user_data));
 
 xmlNode *create_level_registration_xml(const char *node, const char *pattern,
                                        const char *attr, const char *value,
diff --git a/lib/fencing/st_actions.c b/lib/fencing/st_actions.c
index abd0d5a..c4e32bd 100644
--- a/lib/fencing/st_actions.c
+++ b/lib/fencing/st_actions.c
@@ -658,12 +658,11 @@ internal_stonith_action_execute(stonith_action_t * action)
  * \return pcmk_ok if ownership of action has been taken, -errno otherwise
  */
 int
-stonith_action_execute_async(stonith_action_t * action,
-                             void *userdata,
-                             void (*done) (int pid,
-                                           const pcmk__action_result_t *result,
-                                           void *user_data),
-                             void (*fork_cb) (int pid, void *user_data))
+stonith__execute_async(stonith_action_t * action, void *userdata,
+                       void (*done) (int pid,
+                                     const pcmk__action_result_t *result,
+                                     void *user_data),
+                       void (*fork_cb) (int pid, void *user_data))
 {
     if (!action) {
         return -EINVAL;
-- 
2.31.1

From 1d8fbd12b302b5029a341f269bd00def79e6a0ea Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Tue, 13 Sep 2022 16:43:57 -0500
Subject: [PATCH 07/24] Refactor: fencing: add internal API for getting
 metadata async

Nothing uses it yet
---
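As a usage sketch (not part of the patch), a caller with a running GLib main
loop might use the new function as follows; the callback signature and the
main-loop requirement come from the doxygen block in the diff, and the agent
name and timeout are placeholders:

    #include <string.h>
    #include <crm/common/logging.h>
    #include <crm/common/results.h>
    #include <crm/common/results_internal.h>
    #include <crm/fencing/internal.h>

    static void
    metadata_cb(int pid, const pcmk__action_result_t *result, void *user_data)
    {
        if ((result->exit_status == CRM_EX_OK)
            && (result->action_stdout != NULL)) {
            crm_info("Retrieved agent metadata (%zu bytes)",
                     strlen(result->action_stdout));
        } else {
            crm_warn("Metadata action failed: %s",
                     ((result->exit_reason != NULL)?
                      result->exit_reason : "unknown reason"));
        }
    }

    /* Later, with the main loop running: */
    // stonith__metadata_async("fence_xvm", 30, metadata_cb, NULL);
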
 include/crm/fencing/internal.h |  6 +++
 lib/fencing/st_client.c        | 80 ++++++++++++++++++++++++++++++++++
 2 files changed, 86 insertions(+)

diff --git a/include/crm/fencing/internal.h b/include/crm/fencing/internal.h
index 1797d9a..513d1c4 100644
--- a/include/crm/fencing/internal.h
+++ b/include/crm/fencing/internal.h
@@ -70,6 +70,12 @@ int stonith__execute_async(stonith_action_t *action, void *userdata,
                                          void *user_data),
                            void (*fork_cb) (int pid, void *user_data));
 
+int stonith__metadata_async(const char *agent, int timeout_sec,
+                            void (*callback)(int pid,
+                                             const pcmk__action_result_t *result,
+                                             void *user_data),
+                            void *user_data);
+
 xmlNode *create_level_registration_xml(const char *node, const char *pattern,
                                        const char *attr, const char *value,
                                        int level,
diff --git a/lib/fencing/st_client.c b/lib/fencing/st_client.c
index 6c252bc..91075bd 100644
--- a/lib/fencing/st_client.c
+++ b/lib/fencing/st_client.c
@@ -2386,6 +2386,86 @@ stonith__device_parameter_flags(uint32_t *device_flags, const char *device_name,
     freeXpathObject(xpath);
 }
 
+/*!
+ * \internal
+ * \brief Retrieve fence agent meta-data asynchronously
+ *
+ * \param[in] agent        Agent to execute
+ * \param[in] timeout_sec  Error if not complete within this time
+ * \param[in] callback     Function to call with result (this will always be
+ *                         called, whether by this function directly or later
+ *                         via the main loop, and on success the metadata will
+ *                         be in its result argument's action_stdout)
+ * \param[in] user_data    User data to pass to callback
+ *
+ * \return Standard Pacemaker return code
+ * \note The caller must use a main loop. This function is not a
+ *       stonith_api_operations_t method because it does not need a stonith_t
+ *       object and does not go through the fencer, but executes the agent
+ *       directly.
+ */
+int
+stonith__metadata_async(const char *agent, int timeout_sec,
+                        void (*callback)(int pid,
+                                         const pcmk__action_result_t *result,
+                                         void *user_data),
+                        void *user_data)
+{
+    switch (stonith_get_namespace(agent, NULL)) {
+        case st_namespace_rhcs:
+            {
+                stonith_action_t *action = NULL;
+                int rc = pcmk_ok;
+
+                action = stonith_action_create(agent, "metadata", NULL, 0,
+                                               timeout_sec, NULL, NULL, NULL);
+
+                rc = stonith__execute_async(action, user_data, callback, NULL);
+                if (rc != pcmk_ok) {
+                    callback(0, stonith__action_result(action), user_data);
+                    stonith__destroy_action(action);
+                }
+                return pcmk_legacy2rc(rc);
+            }
+
+#if HAVE_STONITH_STONITH_H
+        case st_namespace_lha:
+            // LHA metadata is simply synthesized, so simulate async
+            {
+                pcmk__action_result_t result = {
+                    .exit_status = CRM_EX_OK,
+                    .execution_status = PCMK_EXEC_DONE,
+                    .exit_reason = NULL,
+                    .action_stdout = NULL,
+                    .action_stderr = NULL,
+                };
+
+                stonith__lha_metadata(agent, timeout_sec,
+                                      &result.action_stdout);
+                callback(0, &result, user_data);
+                pcmk__reset_result(&result);
+                return pcmk_rc_ok;
+            }
+#endif
+
+        default:
+            {
+                pcmk__action_result_t result = {
+                    .exit_status = CRM_EX_ERROR,
+                    .execution_status = PCMK_EXEC_ERROR_HARD,
+                    .exit_reason = crm_strdup_printf("No such agent '%s'",
+                                                     agent),
+                    .action_stdout = NULL,
+                    .action_stderr = NULL,
+                };
+
+                callback(0, &result, user_data);
+                pcmk__reset_result(&result);
+                return ENOENT;
+            }
+    }
+}
+
 /*!
  * \internal
  * \brief Return the exit status from an async action callback
-- 
2.31.1

From 1869cc181ef9599bd938fc545d302b2721169755 Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Tue, 13 Sep 2022 17:33:10 -0500
Subject: [PATCH 08/24] Refactor: liblrmd: add internal API for getting
 metadata async

Nothing uses it yet
---
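A usage sketch (not part of the patch): the resource specification below is a
placeholder, and metadata_cb() is assumed to be a callback with the
(int pid, const pcmk__action_result_t *result, void *user_data) signature used
throughout this series. Stonith-class resources are routed to
stonith__metadata_async(), everything else is executed via the services
library, and the callback is invoked either way:

    #include <crm/common/logging.h>
    #include <crm/common/results.h>
    #include <crm/common/results_internal.h>
    #include <crm/lrmd.h>
    #include <crm/lrmd_internal.h>

    void metadata_cb(int pid, const pcmk__action_result_t *result,
                     void *user_data);

    static void
    request_example_metadata(void)
    {
        lrmd_rsc_info_t rsc = {
            .id = (char *) "example-ip",
            .standard = (char *) "ocf",
            .provider = (char *) "heartbeat",
            .type = (char *) "IPaddr2",
        };
        int rc = lrmd__metadata_async(&rsc, metadata_cb, NULL);

        if (rc != pcmk_rc_ok) {
            crm_warn("Could not initiate metadata action: %s",
                     pcmk_rc_str(rc));
        }
    }
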
 include/crm/lrmd_internal.h |  10 +++-
 lib/lrmd/lrmd_client.c      | 115 ++++++++++++++++++++++++++++++++++++
 2 files changed, 123 insertions(+), 2 deletions(-)

diff --git a/include/crm/lrmd_internal.h b/include/crm/lrmd_internal.h
index 284c4d6..5cb00d5 100644
--- a/include/crm/lrmd_internal.h
+++ b/include/crm/lrmd_internal.h
@@ -1,5 +1,5 @@
 /*
- * Copyright 2015-2021 the Pacemaker project contributors
+ * Copyright 2015-2022 the Pacemaker project contributors
  *
  * The version control history for this file may have further details.
  *
@@ -17,7 +17,7 @@
 #include <crm/common/mainloop.h>        // mainloop_io_t, ipc_client_callbacks
 #include <crm/common/output_internal.h> // pcmk__output_t
 #include <crm/common/remote_internal.h> // pcmk__remote_t
-#include <crm/lrmd.h>                   // lrmd_t, lrmd_event_data_t
+#include <crm/lrmd.h>           // lrmd_t, lrmd_event_data_t, lrmd_rsc_info_t
 
 int lrmd__new(lrmd_t **api, const char *nodename, const char *server, int port);
 
@@ -35,6 +35,12 @@ int lrmd_send_resource_alert(lrmd_t *lrmd, GList *alert_list,
 int lrmd__remote_send_xml(pcmk__remote_t *session, xmlNode *msg, uint32_t id,
                           const char *msg_type);
 
+int lrmd__metadata_async(lrmd_rsc_info_t *rsc,
+                         void (*callback)(int pid,
+                                          const pcmk__action_result_t *result,
+                                          void *user_data),
+                         void *user_data);
+
 void lrmd__set_result(lrmd_event_data_t *event, enum ocf_exitcode rc,
                       int op_status, const char *exit_reason);
 
diff --git a/lib/lrmd/lrmd_client.c b/lib/lrmd/lrmd_client.c
index 82afd6c..4b16bf0 100644
--- a/lib/lrmd/lrmd_client.c
+++ b/lib/lrmd/lrmd_client.c
@@ -2343,6 +2343,121 @@ lrmd_api_delete(lrmd_t * lrmd)
     free(lrmd);
 }
 
+struct metadata_cb {
+     void (*callback)(int pid, const pcmk__action_result_t *result,
+                      void *user_data);
+     void *user_data;
+};
+
+/*!
+ * \internal
+ * \brief Process asynchronous metadata completion
+ *
+ * \param[in] action  Metadata action that completed
+ */
+static void
+metadata_complete(svc_action_t *action)
+{
+    struct metadata_cb *metadata_cb = (struct metadata_cb *) action->cb_data;
+    pcmk__action_result_t result = PCMK__UNKNOWN_RESULT;
+
+    pcmk__set_result(&result, action->rc, action->status,
+                     services__exit_reason(action));
+    pcmk__set_result_output(&result, action->stdout_data, action->stderr_data);
+
+    metadata_cb->callback(0, &result, metadata_cb->user_data);
+    result.action_stdout = NULL; // Prevent free, because action owns it
+    result.action_stderr = NULL; // Prevent free, because action owns it
+    pcmk__reset_result(&result);
+    free(metadata_cb);
+}
+
+/*!
+ * \internal
+ * \brief Retrieve agent metadata asynchronously
+ *
+ * \param[in] rsc        Resource agent specification
+ * \param[in] callback   Function to call with result (this will always be
+ *                       called, whether by this function directly or later via
+ *                       the main loop, and on success the metadata will be in
+ *                       its result argument's action_stdout)
+ * \param[in] user_data  User data to pass to callback
+ *
+ * \return Standard Pacemaker return code
+ * \note This function is not a lrmd_api_operations_t method because it does not
+ *       need an lrmd_t object and does not go through the executor, but
+ *       executes the agent directly.
+ */
+int
+lrmd__metadata_async(lrmd_rsc_info_t *rsc,
+                     void (*callback)(int pid,
+                                      const pcmk__action_result_t *result,
+                                      void *user_data),
+                     void *user_data)
+{
+    svc_action_t *action = NULL;
+    struct metadata_cb *metadata_cb = NULL;
+    pcmk__action_result_t result = PCMK__UNKNOWN_RESULT;
+
+    CRM_CHECK(callback != NULL, return EINVAL);
+
+    if ((rsc == NULL) || (rsc->standard == NULL) || (rsc->type == NULL)) {
+        pcmk__set_result(&result, PCMK_OCF_NOT_CONFIGURED, PCMK_EXEC_ERROR,
+                         "Invalid resource specification");
+        callback(0, &result, user_data);
+        pcmk__reset_result(&result);
+        return EINVAL;
+    }
+
+    if (strcmp(rsc->standard, PCMK_RESOURCE_CLASS_STONITH) == 0) {
+        return stonith__metadata_async(rsc->type,
+                                       CRMD_METADATA_CALL_TIMEOUT / 1000,
+                                       callback, user_data);
+    }
+
+    action = services__create_resource_action(rsc->type, rsc->standard,
+                                              rsc->provider, rsc->type,
+                                              CRMD_ACTION_METADATA, 0,
+                                              CRMD_METADATA_CALL_TIMEOUT, NULL,
+                                              0);
+    if (action == NULL) {
+        pcmk__set_result(&result, PCMK_OCF_UNKNOWN_ERROR, PCMK_EXEC_ERROR,
+                         "Out of memory");
+        callback(0, &result, user_data);
+        pcmk__reset_result(&result);
+        return ENOMEM;
+    }
+    if (action->rc != PCMK_OCF_UNKNOWN) {
+        pcmk__set_result(&result, action->rc, action->status,
+                         services__exit_reason(action));
+        callback(0, &result, user_data);
+        pcmk__reset_result(&result);
+        services_action_free(action);
+        return EINVAL;
+    }
+
+    action->cb_data = calloc(1, sizeof(struct metadata_cb));
+    if (action->cb_data == NULL) {
+        services_action_free(action);
+        pcmk__set_result(&result, PCMK_OCF_UNKNOWN_ERROR, PCMK_EXEC_ERROR,
+                         "Out of memory");
+        callback(0, &result, user_data);
+        pcmk__reset_result(&result);
+        return ENOMEM;
+    }
+
+    metadata_cb = (struct metadata_cb *) action->cb_data;
+    metadata_cb->callback = callback;
+    metadata_cb->user_data = user_data;
+    if (!services_action_async(action, metadata_complete)) {
+        services_action_free(action);
+        return pcmk_rc_error; // @TODO Derive from action->rc and ->status
+    }
+
+    // The services library has taken responsibility for action
+    return pcmk_rc_ok;
+}
+
 /*!
  * \internal
  * \brief Set the result of an executor event
-- 
2.31.1

From de89164053cde8f44ca74a007703e0827ffd67ec Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Wed, 14 Sep 2022 16:34:37 -0500
Subject: [PATCH 09/24] Low: controller: ignore CRM_OP_LRM_REFRESH

This was only sent by crm_resource --refresh in versions 1.1.9 and earlier.
Since the local crm_resource is the same version as the controller, and
Pacemaker Remote was introduced in 1.1.9, this means that only remote nodes
running 1.1.9 can possibly send it.

It didn't really do anything useful anyway, so just ignore it.
---
 daemons/controld/controld_execd.c    | 33 +++++-----------------------
 daemons/controld/controld_messages.c |  2 +-
 include/crm/crm.h                    |  2 +-
 lib/pacemaker/pcmk_graph_producer.c  |  3 +--
 lib/pengine/common.c                 |  2 --
 5 files changed, 9 insertions(+), 33 deletions(-)

diff --git a/daemons/controld/controld_execd.c b/daemons/controld/controld_execd.c
index fa411a6..719fab0 100644
--- a/daemons/controld/controld_execd.c
+++ b/daemons/controld/controld_execd.c
@@ -1553,32 +1553,6 @@ fail_lrm_resource(xmlNode *xml, lrm_state_t *lrm_state, const char *user_name,
     lrmd_free_event(op);
 }
 
-static void
-handle_refresh_op(lrm_state_t *lrm_state, const char *user_name,
-                  const char *from_host, const char *from_sys)
-{
-    int rc = pcmk_ok;
-    xmlNode *fragment = do_lrm_query_internal(lrm_state, node_update_all);
-
-    fsa_cib_update(XML_CIB_TAG_STATUS, fragment, cib_quorum_override, rc, user_name);
-    crm_info("Forced a local resource history refresh: call=%d", rc);
-
-    if (!pcmk__str_eq(CRM_SYSTEM_CRMD, from_sys, pcmk__str_casei)) {
-        xmlNode *reply = create_request(CRM_OP_INVOKE_LRM, fragment, from_host,
-                                        from_sys, CRM_SYSTEM_LRMD,
-                                        fsa_our_uuid);
-
-        crm_debug("ACK'ing refresh from %s (%s)", from_sys, from_host);
-
-        if (relay_message(reply, TRUE) == FALSE) {
-            crm_log_xml_err(reply, "Unable to route reply");
-        }
-        free_xml(reply);
-    }
-
-    free_xml(fragment);
-}
-
 static void
 handle_query_op(xmlNode *msg, lrm_state_t *lrm_state)
 {
@@ -1787,7 +1761,12 @@ do_lrm_invoke(long long action,
     }
 
     if (pcmk__str_eq(crm_op, CRM_OP_LRM_REFRESH, pcmk__str_casei)) {
-        handle_refresh_op(lrm_state, user_name, from_host, from_sys);
+        /* @COMPAT This can only be sent by crm_resource --refresh on a
+         * Pacemaker Remote node running Pacemaker 1.1.9, which is extremely
+         * unlikely. It previously would cause the controller to re-write its
+         * resource history to the CIB. Just ignore it.
+         */
+        crm_notice("Ignoring refresh request from Pacemaker Remote 1.1.9 node");
 
     } else if (pcmk__str_eq(crm_op, CRM_OP_LRM_QUERY, pcmk__str_casei)) {
         handle_query_op(input->msg, lrm_state);
diff --git a/daemons/controld/controld_messages.c b/daemons/controld/controld_messages.c
index 31d3524..957fc20 100644
--- a/daemons/controld/controld_messages.c
+++ b/daemons/controld/controld_messages.c
@@ -1061,7 +1061,7 @@ handle_request(xmlNode *stored_msg, enum crmd_fsa_cause cause)
         return handle_lrm_delete(stored_msg);
 
     } else if ((strcmp(op, CRM_OP_LRM_FAIL) == 0)
-               || (strcmp(op, CRM_OP_LRM_REFRESH) == 0)
+               || (strcmp(op, CRM_OP_LRM_REFRESH) == 0) // @COMPAT
                || (strcmp(op, CRM_OP_REPROBE) == 0)) {
 
         crm_xml_add(stored_msg, F_CRM_SYS_TO, CRM_SYSTEM_LRMD);
diff --git a/include/crm/crm.h b/include/crm/crm.h
index 5ec66d2..f2e536e 100644
--- a/include/crm/crm.h
+++ b/include/crm/crm.h
@@ -146,7 +146,7 @@ extern char *crm_system_name;
 #  define CRM_OP_REGISTER		"register"
 #  define CRM_OP_IPC_FWD		"ipc_fwd"
 #  define CRM_OP_INVOKE_LRM	"lrm_invoke"
-#  define CRM_OP_LRM_REFRESH	"lrm_refresh" /* Deprecated */
+#  define CRM_OP_LRM_REFRESH "lrm_refresh" //!< Deprecated since 1.1.10
 #  define CRM_OP_LRM_QUERY	"lrm_query"
 #  define CRM_OP_LRM_DELETE	"lrm_delete"
 #  define CRM_OP_LRM_FAIL		"lrm_fail"
diff --git a/lib/pacemaker/pcmk_graph_producer.c b/lib/pacemaker/pcmk_graph_producer.c
index 4c1b5a6..0077719 100644
--- a/lib/pacemaker/pcmk_graph_producer.c
+++ b/lib/pacemaker/pcmk_graph_producer.c
@@ -446,8 +446,7 @@ create_graph_action(xmlNode *parent, pe_action_t *action, bool skip_details,
 
     } else if (pcmk__str_any_of(action->task,
                                 CRM_OP_SHUTDOWN,
-                                CRM_OP_CLEAR_FAILCOUNT,
-                                CRM_OP_LRM_REFRESH, NULL)) {
+                                CRM_OP_CLEAR_FAILCOUNT, NULL)) {
         action_xml = create_xml_node(parent, XML_GRAPH_TAG_CRM_EVENT);
 
     } else if (pcmk__str_eq(action->task, CRM_OP_LRM_DELETE, pcmk__str_none)) {
diff --git a/lib/pengine/common.c b/lib/pengine/common.c
index 93ba3fe..7db9d0e 100644
--- a/lib/pengine/common.c
+++ b/lib/pengine/common.c
@@ -384,8 +384,6 @@ text2task(const char *task)
         return no_action;
     } else if (pcmk__str_eq(task, CRMD_ACTION_STATUS, pcmk__str_casei)) {
         return no_action;
-    } else if (pcmk__str_eq(task, CRM_OP_LRM_REFRESH, pcmk__str_casei)) {
-        return no_action;
     } else if (pcmk__str_eq(task, CRMD_ACTION_MIGRATE, pcmk__str_casei)) {
         return no_action;
     } else if (pcmk__str_eq(task, CRMD_ACTION_MIGRATED, pcmk__str_casei)) {
-- 
2.31.1

From 406fbc52ed652915887e78138f8f3c2eeaeabfb6 Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Wed, 14 Sep 2022 16:46:15 -0500
Subject: [PATCH 10/24] API: libcrmcommon: deprecate CRM_OP_LRM_QUERY

This has been unused since at least Pacemaker 1.0.0. Because we don't support
rolling upgrades from anything that old, and Pacemaker Remote didn't exist
then, we can drop support for it entirely.
---
 daemons/controld/controld_execd.c | 17 -----------------
 include/crm/crm.h                 |  1 -
 include/crm/crm_compat.h          |  5 ++++-
 3 files changed, 4 insertions(+), 19 deletions(-)

diff --git a/daemons/controld/controld_execd.c b/daemons/controld/controld_execd.c
index 719fab0..54e6818 100644
--- a/daemons/controld/controld_execd.c
+++ b/daemons/controld/controld_execd.c
@@ -1553,20 +1553,6 @@ fail_lrm_resource(xmlNode *xml, lrm_state_t *lrm_state, const char *user_name,
     lrmd_free_event(op);
 }
 
-static void
-handle_query_op(xmlNode *msg, lrm_state_t *lrm_state)
-{
-    xmlNode *data = do_lrm_query_internal(lrm_state, node_update_all);
-    xmlNode *reply = create_reply(msg, data);
-
-    if (relay_message(reply, TRUE) == FALSE) {
-        crm_err("Unable to route reply");
-        crm_log_xml_err(reply, "reply");
-    }
-    free_xml(reply);
-    free_xml(data);
-}
-
 static void
 handle_reprobe_op(lrm_state_t *lrm_state, const char *from_sys,
                   const char *from_host, const char *user_name,
@@ -1768,9 +1754,6 @@ do_lrm_invoke(long long action,
          */
         crm_notice("Ignoring refresh request from Pacemaker Remote 1.1.9 node");
 
-    } else if (pcmk__str_eq(crm_op, CRM_OP_LRM_QUERY, pcmk__str_casei)) {
-        handle_query_op(input->msg, lrm_state);
-
     // @COMPAT DCs <1.1.14 in a rolling upgrade might schedule this op
     } else if (pcmk__str_eq(operation, CRM_OP_PROBED, pcmk__str_casei)) {
         update_attrd(lrm_state->node_name, CRM_OP_PROBED, XML_BOOLEAN_TRUE,
diff --git a/include/crm/crm.h b/include/crm/crm.h
index f2e536e..38915e3 100644
--- a/include/crm/crm.h
+++ b/include/crm/crm.h
@@ -147,7 +147,6 @@ extern char *crm_system_name;
 #  define CRM_OP_IPC_FWD		"ipc_fwd"
 #  define CRM_OP_INVOKE_LRM	"lrm_invoke"
 #  define CRM_OP_LRM_REFRESH "lrm_refresh" //!< Deprecated since 1.1.10
-#  define CRM_OP_LRM_QUERY	"lrm_query"
 #  define CRM_OP_LRM_DELETE	"lrm_delete"
 #  define CRM_OP_LRM_FAIL		"lrm_fail"
 #  define CRM_OP_PROBED		"probe_complete"
diff --git a/include/crm/crm_compat.h b/include/crm/crm_compat.h
index 3b35a5e..8a4b368 100644
--- a/include/crm/crm_compat.h
+++ b/include/crm/crm_compat.h
@@ -1,5 +1,5 @@
 /*
- * Copyright 2004-2021 the Pacemaker project contributors
+ * Copyright 2004-2022 the Pacemaker project contributors
  *
  * The version control history for this file may have further details.
  *
@@ -31,6 +31,9 @@ extern "C" {
 //! \deprecated This defined constant will be removed in a future release
 #define MAX_IPC_DELAY 120
 
+//! \deprecated This defined constant will be removed in a future release
+#define CRM_OP_LRM_QUERY "lrm_query"
+
 //!@{
 //! \deprecated This macro will be removed in a future release
 
-- 
2.31.1

From 7c3d2f58d387d2ec0d5c5d340f8816f324e816bf Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Wed, 14 Sep 2022 16:49:48 -0500
Subject: [PATCH 11/24] Refactor: controller: drop do_lrm_query_internal()

Now that there's only one (short) caller, just move its contents there
---
 daemons/controld/controld_execd.c | 28 +++++++++++-----------------
 1 file changed, 11 insertions(+), 17 deletions(-)

diff --git a/daemons/controld/controld_execd.c b/daemons/controld/controld_execd.c
index 54e6818..99c9193 100644
--- a/daemons/controld/controld_execd.c
+++ b/daemons/controld/controld_execd.c
@@ -811,19 +811,26 @@ build_active_RAs(lrm_state_t * lrm_state, xmlNode * rsc_list)
     return FALSE;
 }
 
-static xmlNode *
-do_lrm_query_internal(lrm_state_t *lrm_state, int update_flags)
+xmlNode *
+controld_query_executor_state(const char *node_name)
 {
     xmlNode *xml_state = NULL;
     xmlNode *xml_data = NULL;
     xmlNode *rsc_list = NULL;
     crm_node_t *peer = NULL;
+    lrm_state_t *lrm_state = lrm_state_find(node_name);
+
+    if (!lrm_state) {
+        crm_err("Could not find executor state for node %s", node_name);
+        return NULL;
+    }
 
     peer = crm_get_peer_full(0, lrm_state->node_name, CRM_GET_PEER_ANY);
     CRM_CHECK(peer != NULL, return NULL);
 
-    xml_state = create_node_state_update(peer, update_flags, NULL,
-                                         __func__);
+    xml_state = create_node_state_update(peer,
+                                         node_update_cluster|node_update_peer,
+                                         NULL, __func__);
     if (xml_state == NULL) {
         return NULL;
     }
@@ -840,19 +847,6 @@ do_lrm_query_internal(lrm_state_t *lrm_state, int update_flags)
     return xml_state;
 }
 
-xmlNode *
-controld_query_executor_state(const char *node_name)
-{
-    lrm_state_t *lrm_state = lrm_state_find(node_name);
-
-    if (!lrm_state) {
-        crm_err("Could not find executor state for node %s", node_name);
-        return NULL;
-    }
-    return do_lrm_query_internal(lrm_state,
-                                 node_update_cluster|node_update_peer);
-}
-
 /*!
  * \internal
  * \brief Map standard Pacemaker return code to operation status and OCF code
-- 
2.31.1

From 5cab259417a06f64a607f99c478459093ed1b5ed Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Wed, 14 Sep 2022 15:48:44 -0500
Subject: [PATCH 12/24] Doc: controller: drop pointless comment

It's (likely?) impossible for a live cluster to have been doing rolling
upgrades since 2006.
---
 daemons/controld/controld_execd.c | 12 +-----------
 1 file changed, 1 insertion(+), 11 deletions(-)

diff --git a/daemons/controld/controld_execd.c b/daemons/controld/controld_execd.c
index 99c9193..53b1156 100644
--- a/daemons/controld/controld_execd.c
+++ b/daemons/controld/controld_execd.c
@@ -678,18 +678,8 @@ build_operation_update(xmlNode * parent, lrmd_rsc_info_t * rsc, lrmd_event_data_
 
     target_rc = rsc_op_expected_rc(op);
 
-    /* there is a small risk in formerly mixed clusters that it will
-     * be sub-optimal.
-     *
-     * however with our upgrade policy, the update we send should
-     * still be completely supported anyway
-     */
     caller_version = g_hash_table_lookup(op->params, XML_ATTR_CRM_VERSION);
-    CRM_LOG_ASSERT(caller_version != NULL);
-
-    if(caller_version == NULL) {
-        caller_version = CRM_FEATURE_SET;
-    }
+    CRM_CHECK(caller_version != NULL, caller_version = CRM_FEATURE_SET);
 
     xml_op = pcmk__create_history_xml(parent, op, caller_version, target_rc,
                                       fsa_our_uname, src);
-- 
2.31.1

From b4541d7ecd9551674c4546415751a223ff3013ed Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Thu, 15 Sep 2022 11:24:28 -0500
Subject: [PATCH 13/24] Refactor: controller: move where reload actions get
 remapped

... from do_lrm_invoke() to do_lrm_rsc_op(), which will make planned changes
easier
---
 daemons/controld/controld_execd.c | 38 ++++++++++++++++---------------
 1 file changed, 20 insertions(+), 18 deletions(-)

diff --git a/daemons/controld/controld_execd.c b/daemons/controld/controld_execd.c
index 53b1156..c9f0cc7 100644
--- a/daemons/controld/controld_execd.c
+++ b/daemons/controld/controld_execd.c
@@ -43,7 +43,8 @@ static gboolean stop_recurring_actions(gpointer key, gpointer value, gpointer us
 static lrmd_event_data_t *construct_op(lrm_state_t * lrm_state, xmlNode * rsc_op,
                                        const char *rsc_id, const char *operation);
 static void do_lrm_rsc_op(lrm_state_t *lrm_state, lrmd_rsc_info_t *rsc,
-                          const char *operation, xmlNode *msg);
+                          const char *operation, xmlNode *msg,
+                          struct ra_metadata_s *md);
 
 static gboolean lrm_state_verify_stopped(lrm_state_t * lrm_state, enum crmd_fsa_state cur_state,
                                          int log_level);
@@ -1808,26 +1809,12 @@ do_lrm_invoke(long long action,
             do_lrm_delete(input, lrm_state, rsc, from_sys, from_host,
                           crm_rsc_delete, user_name);
 
-        } else if (pcmk__str_any_of(operation, CRMD_ACTION_RELOAD,
-                                    CRMD_ACTION_RELOAD_AGENT, NULL)) {
-            /* Pre-2.1.0 DCs will schedule reload actions only, and 2.1.0+ DCs
-             * will schedule reload-agent actions only. In either case, we need
-             * to map that to whatever the resource agent actually supports.
-             * Default to the OCF 1.1 name.
-             */
+        } else {
             struct ra_metadata_s *md = NULL;
-            const char *reload_name = CRMD_ACTION_RELOAD_AGENT;
 
             md = controld_get_rsc_metadata(lrm_state, rsc,
                                            controld_metadata_from_cache);
-            if ((md != NULL)
-                && pcmk_is_set(md->ra_flags, ra_supports_legacy_reload)) {
-                reload_name = CRMD_ACTION_RELOAD;
-            }
-            do_lrm_rsc_op(lrm_state, rsc, reload_name, input->xml);
-
-        } else {
-            do_lrm_rsc_op(lrm_state, rsc, operation, input->xml);
+            do_lrm_rsc_op(lrm_state, rsc, operation, input->xml, md);
         }
 
         lrmd_free_rsc_info(rsc);
@@ -2176,7 +2163,7 @@ record_pending_op(const char *node_name, lrmd_rsc_info_t *rsc, lrmd_event_data_t
 
 static void
 do_lrm_rsc_op(lrm_state_t *lrm_state, lrmd_rsc_info_t *rsc,
-              const char *operation, xmlNode *msg)
+              const char *operation, xmlNode *msg, struct ra_metadata_s *md)
 {
     int rc;
     int call_id = 0;
@@ -2198,6 +2185,21 @@ do_lrm_rsc_op(lrm_state_t *lrm_state, lrmd_rsc_info_t *rsc,
         }
     }
 
+    if (pcmk__str_any_of(operation, CRMD_ACTION_RELOAD,
+                         CRMD_ACTION_RELOAD_AGENT, NULL)) {
+        /* Pre-2.1.0 DCs will schedule reload actions only, and 2.1.0+ DCs
+         * will schedule reload-agent actions only. In either case, we need
+         * to map that to whatever the resource agent actually supports.
+         * Default to the OCF 1.1 name.
+         */
+        if ((md != NULL)
+            && pcmk_is_set(md->ra_flags, ra_supports_legacy_reload)) {
+            operation = CRMD_ACTION_RELOAD;
+        } else {
+            operation = CRMD_ACTION_RELOAD_AGENT;
+        }
+    }
+
     op = construct_op(lrm_state, msg, rsc->id, operation);
     CRM_CHECK(op != NULL, return);
 
-- 
2.31.1

From a4f6e394a61712da750aabffca2b6dd02f0c5ae6 Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Thu, 15 Sep 2022 15:12:06 -0500
Subject: [PATCH 14/24] Refactor: controller: drop operation argument to
 do_lrm_rsc_op()

It can be derived from the XML argument
---
 daemons/controld/controld_execd.c | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/daemons/controld/controld_execd.c b/daemons/controld/controld_execd.c
index c9f0cc7..89a993b 100644
--- a/daemons/controld/controld_execd.c
+++ b/daemons/controld/controld_execd.c
@@ -43,8 +43,7 @@ static gboolean stop_recurring_actions(gpointer key, gpointer value, gpointer us
 static lrmd_event_data_t *construct_op(lrm_state_t * lrm_state, xmlNode * rsc_op,
                                        const char *rsc_id, const char *operation);
 static void do_lrm_rsc_op(lrm_state_t *lrm_state, lrmd_rsc_info_t *rsc,
-                          const char *operation, xmlNode *msg,
-                          struct ra_metadata_s *md);
+                          xmlNode *msg, struct ra_metadata_s *md);
 
 static gboolean lrm_state_verify_stopped(lrm_state_t * lrm_state, enum crmd_fsa_state cur_state,
                                          int log_level);
@@ -1814,7 +1813,7 @@ do_lrm_invoke(long long action,
 
             md = controld_get_rsc_metadata(lrm_state, rsc,
                                            controld_metadata_from_cache);
-            do_lrm_rsc_op(lrm_state, rsc, operation, input->xml, md);
+            do_lrm_rsc_op(lrm_state, rsc, input->xml, md);
         }
 
         lrmd_free_rsc_info(rsc);
@@ -2162,8 +2161,8 @@ record_pending_op(const char *node_name, lrmd_rsc_info_t *rsc, lrmd_event_data_t
 }
 
 static void
-do_lrm_rsc_op(lrm_state_t *lrm_state, lrmd_rsc_info_t *rsc,
-              const char *operation, xmlNode *msg, struct ra_metadata_s *md)
+do_lrm_rsc_op(lrm_state_t *lrm_state, lrmd_rsc_info_t *rsc, xmlNode *msg,
+              struct ra_metadata_s *md)
 {
     int rc;
     int call_id = 0;
@@ -2172,17 +2171,18 @@ do_lrm_rsc_op(lrm_state_t *lrm_state, lrmd_rsc_info_t *rsc,
     lrmd_key_value_t *params = NULL;
     fsa_data_t *msg_data = NULL;
     const char *transition = NULL;
+    const char *operation = NULL;
     gboolean stop_recurring = FALSE;
     const char *nack_reason = NULL;
 
-    CRM_CHECK(rsc != NULL, return);
-    CRM_CHECK(operation != NULL, return);
+    CRM_CHECK((rsc != NULL) && (msg != NULL), return);
 
-    if (msg != NULL) {
-        transition = crm_element_value(msg, XML_ATTR_TRANSITION_KEY);
-        if (transition == NULL) {
-            crm_log_xml_err(msg, "Missing transition number");
-        }
+    operation = crm_element_value(msg, XML_LRM_ATTR_TASK);
+    CRM_CHECK(!pcmk__str_empty(operation), return);
+
+    transition = crm_element_value(msg, XML_ATTR_TRANSITION_KEY);
+    if (pcmk__str_empty(transition)) {
+        crm_log_xml_err(msg, "Missing transition number");
     }
 
     if (pcmk__str_any_of(operation, CRMD_ACTION_RELOAD,
@@ -2241,7 +2241,7 @@ do_lrm_rsc_op(lrm_state_t *lrm_state, lrmd_rsc_info_t *rsc,
     crm_notice("Requesting local execution of %s operation for %s on %s "
                CRM_XS " transition_key=%s op_key=" PCMK__OP_FMT,
                crm_action_str(op->op_type, op->interval_ms), rsc->id, lrm_state->node_name,
-               transition, rsc->id, operation, op->interval_ms);
+               (transition != NULL ? transition : ""), rsc->id, operation, op->interval_ms);
 
     if (pcmk_is_set(fsa_input_register, R_SHUTDOWN)
         && pcmk__str_eq(operation, RSC_START, pcmk__str_casei)) {
-- 
2.31.1

From 486dbdf023f82a82a02207d8fb7921f8f2ac0588 Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Thu, 15 Sep 2022 15:40:38 -0500
Subject: [PATCH 15/24] Low: controller: add failsafe for no executor
 connection

... in do_lrm_rsc_op(), to make planned changes easier
---
 daemons/controld/controld_execd.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/daemons/controld/controld_execd.c b/daemons/controld/controld_execd.c
index 89a993b..8986b9b 100644
--- a/daemons/controld/controld_execd.c
+++ b/daemons/controld/controld_execd.c
@@ -2185,6 +2185,17 @@ do_lrm_rsc_op(lrm_state_t *lrm_state, lrmd_rsc_info_t *rsc, xmlNode *msg,
         crm_log_xml_err(msg, "Missing transition number");
     }
 
+    if (lrm_state == NULL) {
+        // This shouldn't be possible, but provide a failsafe just in case
+        crm_err("Cannot execute %s of %s: No executor connection "
+                CRM_XS " transition_key=%s",
+                operation, rsc->id, (transition != NULL ? transition : ""));
+        synthesize_lrmd_failure(NULL, msg, PCMK_EXEC_INVALID,
+                                PCMK_OCF_UNKNOWN_ERROR,
+                                "No executor connection");
+        return;
+    }
+
     if (pcmk__str_any_of(operation, CRMD_ACTION_RELOAD,
                          CRMD_ACTION_RELOAD_AGENT, NULL)) {
         /* Pre-2.1.0 DCs will schedule reload actions only, and 2.1.0+ DCs
-- 
2.31.1

From afd53bba7dfb5109d844318dff0f82e4687d9e32 Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Thu, 15 Sep 2022 12:04:31 -0500
Subject: [PATCH 16/24] Log: controller: improve messages when metadata cache
 update fails

Previously, metadata_cache_update() or ra_param_from_xml() would log an error,
then controld_get_rsc_metadata() (but not the other caller,
process_lrm_event()) would log another warning with the agent info.

Combine these into a single message always logged by metadata_cache_update(),
which has also been renamed to controld_cache_metadata().
---
 daemons/controld/controld_execd.c    |  2 +-
 daemons/controld/controld_metadata.c | 27 ++++++++++++---------------
 daemons/controld/controld_metadata.h |  6 +++---
 3 files changed, 16 insertions(+), 19 deletions(-)

diff --git a/daemons/controld/controld_execd.c b/daemons/controld/controld_execd.c
index 8986b9b..fe16c96 100644
--- a/daemons/controld/controld_execd.c
+++ b/daemons/controld/controld_execd.c
@@ -2858,7 +2858,7 @@ process_lrm_event(lrm_state_t *lrm_state, lrmd_event_data_t *op,
         } else if (rsc && (op->rc == PCMK_OCF_OK)) {
             char *metadata = unescape_newlines(op->output);
 
-            metadata_cache_update(lrm_state->metadata_cache, rsc, metadata);
+            controld_cache_metadata(lrm_state->metadata_cache, rsc, metadata);
             free(metadata);
         }
     }
diff --git a/daemons/controld/controld_metadata.c b/daemons/controld/controld_metadata.c
index 8c6f195..91a6a10 100644
--- a/daemons/controld/controld_metadata.c
+++ b/daemons/controld/controld_metadata.c
@@ -149,13 +149,11 @@ ra_param_from_xml(xmlNode *param_xml)
 
     p = calloc(1, sizeof(struct ra_param_s));
     if (p == NULL) {
-        crm_crit("Could not allocate memory for resource metadata");
         return NULL;
     }
 
     p->rap_name = strdup(param_name);
     if (p->rap_name == NULL) {
-        crm_crit("Could not allocate memory for resource metadata");
         free(p);
         return NULL;
     }
@@ -196,10 +194,11 @@ log_ra_ocf_version(const char *ra_key, const char *ra_ocf_version)
 }
 
 struct ra_metadata_s *
-metadata_cache_update(GHashTable *mdc, lrmd_rsc_info_t *rsc,
-                      const char *metadata_str)
+controld_cache_metadata(GHashTable *mdc, lrmd_rsc_info_t *rsc,
+                        const char *metadata_str)
 {
     char *key = NULL;
+    const char *reason = NULL;
     xmlNode *metadata = NULL;
     xmlNode *match = NULL;
     struct ra_metadata_s *md = NULL;
@@ -210,20 +209,19 @@ metadata_cache_update(GHashTable *mdc, lrmd_rsc_info_t *rsc,
 
     key = crm_generate_ra_key(rsc->standard, rsc->provider, rsc->type);
     if (!key) {
-        crm_crit("Could not allocate memory for resource metadata");
+        reason = "Invalid resource agent standard or type";
         goto err;
     }
 
     metadata = string2xml(metadata_str);
     if (!metadata) {
-        crm_err("Metadata for %s:%s:%s is not valid XML",
-                rsc->standard, rsc->provider, rsc->type);
+        reason = "Metadata is not valid XML";
         goto err;
     }
 
     md = calloc(1, sizeof(struct ra_metadata_s));
     if (md == NULL) {
-        crm_crit("Could not allocate memory for resource metadata");
+        reason = "Could not allocate memory";
         goto err;
     }
 
@@ -281,6 +279,7 @@ metadata_cache_update(GHashTable *mdc, lrmd_rsc_info_t *rsc,
             struct ra_param_s *p = ra_param_from_xml(match);
 
             if (p == NULL) {
+                reason = "Could not allocate memory";
                 goto err;
             }
             if (pcmk_is_set(p->rap_flags, ra_param_private)) {
@@ -311,6 +310,9 @@ metadata_cache_update(GHashTable *mdc, lrmd_rsc_info_t *rsc,
     return md;
 
 err:
+    crm_warn("Unable to update metadata for %s (%s%s%s:%s): %s",
+             rsc->id, rsc->standard, ((rsc->provider == NULL)? "" : ":"),
+             (rsc->provider != NULL ? rsc->provider : ""), rsc->type, reason);
     free(key);
     free_xml(metadata);
     metadata_free(md);
@@ -377,13 +379,8 @@ controld_get_rsc_metadata(lrm_state_t *lrm_state, lrmd_rsc_info_t *rsc,
         return NULL;
     }
 
-    metadata = metadata_cache_update(lrm_state->metadata_cache, rsc,
-                                     metadata_str);
+    metadata = controld_cache_metadata(lrm_state->metadata_cache, rsc,
+                                       metadata_str);
     free(metadata_str);
-    if (metadata == NULL) {
-        crm_warn("Failed to update metadata for %s (%s%s%s:%s)",
-                 rsc->id, rsc->standard, ((rsc->provider == NULL)? "" : ":"),
-                 ((rsc->provider == NULL)? "" : rsc->provider), rsc->type);
-    }
     return metadata;
 }
diff --git a/daemons/controld/controld_metadata.h b/daemons/controld/controld_metadata.h
index 7354f94..52d3336 100644
--- a/daemons/controld/controld_metadata.h
+++ b/daemons/controld/controld_metadata.h
@@ -73,9 +73,9 @@ void metadata_cache_free(GHashTable *mdc);
 void metadata_cache_reset(GHashTable *mdc);
 void metadata_cache_fini(void);
 
-struct ra_metadata_s *metadata_cache_update(GHashTable *mdc,
-                                            lrmd_rsc_info_t *rsc,
-                                            const char *metadata_str);
+struct ra_metadata_s *controld_cache_metadata(GHashTable *mdc,
+                                              lrmd_rsc_info_t *rsc,
+                                              const char *metadata_str);
 struct ra_metadata_s *controld_get_rsc_metadata(lrm_state_t *lrm_state,
                                                 lrmd_rsc_info_t *rsc,
                                                 uint32_t source);
-- 
2.31.1

From caeed447d0d8a980d431efd70e5b6f9c91ffac7f Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Thu, 15 Sep 2022 13:33:36 -0500
Subject: [PATCH 17/24] Fix: controller: pre-load agent metadata asynchronously

The controller needs resource agent metadata to record digests with pending and
completed resource actions.

Previously, metadata was collected synchronously when needed. This caused
several problems, two of which are fixed here for most actions: synchronous
execution blocked the controller from doing anything else (and if the agent's
metadata action tried to contact the controller, everything hung until the
action timed out), and the metadata action ate into the real action's timeout.

Now, if we're likely to need metadata for an action, attempt to get it
asynchronously before executing that action, so the metadata is available in
cache when needed.

This is not a complete solution, as there are other code paths that might
require metadata and still lead to synchronous execution, but it handles the
most important cases.

Fixes T554
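
As an illustration, the call-site pattern added below boils down to roughly
the following (a condensed sketch using names from the diff; surrounding
checks and error handling are omitted):

    /* Sketch only: defer the real action until the agent metadata is cached */
    if ((md == NULL) && crm_op_needs_metadata(rsc->standard, operation)) {
        struct metadata_cb_data *data = new_metadata_cb_data(rsc, input->xml);

        /* metadata_complete() caches the result, then calls do_lrm_rsc_op() */
        (void) lrmd__metadata_async(rsc, metadata_complete, (void *) data);
    } else {
        do_lrm_rsc_op(lrm_state, rsc, input->xml, md); /* run immediately */
    }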
---
 daemons/controld/controld_execd.c    | 105 +++++++++++++++++++++++----
 daemons/controld/controld_metadata.c |  22 +++---
 2 files changed, 102 insertions(+), 25 deletions(-)

diff --git a/daemons/controld/controld_execd.c b/daemons/controld/controld_execd.c
index fe16c96..c56fdf5 100644
--- a/daemons/controld/controld_execd.c
+++ b/daemons/controld/controld_execd.c
@@ -670,7 +670,6 @@ build_operation_update(xmlNode * parent, lrmd_rsc_info_t * rsc, lrmd_event_data_
     struct ra_metadata_s *metadata = NULL;
     const char *caller_version = NULL;
     lrm_state_t *lrm_state = NULL;
-    uint32_t metadata_source = controld_metadata_from_agent;
 
     if (op == NULL) {
         return FALSE;
@@ -703,19 +702,14 @@ build_operation_update(xmlNode * parent, lrmd_rsc_info_t * rsc, lrmd_event_data_
         return TRUE;
     }
 
-    /* Getting meta-data from cache is OK unless this is a successful start
-     * action -- always refresh from the agent for those, in case the
-     * resource agent was updated.
+    /* Ideally the metadata is cached, and the agent is just a fallback.
      *
-     * @TODO Only refresh the meta-data after starts if the agent actually
-     * changed (using something like inotify, or a hash or modification time of
-     * the agent executable).
+     * @TODO Go through all callers and ensure they get metadata asynchronously
+     * first.
      */
-    if ((op->op_status != PCMK_EXEC_DONE) || (op->rc != target_rc)
-        || !pcmk__str_eq(op->op_type, CRMD_ACTION_START, pcmk__str_none)) {
-        metadata_source |= controld_metadata_from_cache;
-    }
-    metadata = controld_get_rsc_metadata(lrm_state, rsc, metadata_source);
+    metadata = controld_get_rsc_metadata(lrm_state, rsc,
+                                         controld_metadata_from_agent
+                                         |controld_metadata_from_cache);
     if (metadata == NULL) {
         return TRUE;
     }
@@ -1673,6 +1667,56 @@ do_lrm_delete(ha_msg_input_t *input, lrm_state_t *lrm_state,
                     user_name, input, unregister);
 }
 
+// User data for asynchronous metadata execution
+struct metadata_cb_data {
+    lrmd_rsc_info_t *rsc;   // Copy of resource information
+    xmlNode *input_xml;     // Copy of FSA input XML
+};
+
+static struct metadata_cb_data *
+new_metadata_cb_data(lrmd_rsc_info_t *rsc, xmlNode *input_xml)
+{
+    struct metadata_cb_data *data = NULL;
+
+    data = calloc(1, sizeof(struct metadata_cb_data));
+    CRM_ASSERT(data != NULL);
+    data->input_xml = copy_xml(input_xml);
+    data->rsc = lrmd_copy_rsc_info(rsc);
+    return data;
+}
+
+static void
+free_metadata_cb_data(struct metadata_cb_data *data)
+{
+    lrmd_free_rsc_info(data->rsc);
+    free_xml(data->input_xml);
+    free(data);
+}
+
+/*!
+ * \internal
+ * \brief Execute an action after metadata has been retrieved
+ *
+ * \param[in] pid        Ignored
+ * \param[in] result     Result of metadata action
+ * \param[in] user_data  Metadata callback data
+ */
+static void
+metadata_complete(int pid, const pcmk__action_result_t *result, void *user_data)
+{
+    struct metadata_cb_data *data = (struct metadata_cb_data *) user_data;
+
+    struct ra_metadata_s *md = NULL;
+    lrm_state_t *lrm_state = lrm_state_find(lrm_op_target(data->input_xml));
+
+    if ((lrm_state != NULL) && pcmk__result_ok(result)) {
+        md = controld_cache_metadata(lrm_state->metadata_cache, data->rsc,
+                                     result->action_stdout);
+    }
+    do_lrm_rsc_op(lrm_state, data->rsc, data->input_xml, md);
+    free_metadata_cb_data(data);
+}
+
 /*	 A_LRM_INVOKE	*/
 void
 do_lrm_invoke(long long action,
@@ -1811,9 +1855,40 @@ do_lrm_invoke(long long action,
         } else {
             struct ra_metadata_s *md = NULL;
 
-            md = controld_get_rsc_metadata(lrm_state, rsc,
-                                           controld_metadata_from_cache);
-            do_lrm_rsc_op(lrm_state, rsc, input->xml, md);
+            /* Getting metadata from cache is OK except for start actions --
+             * always refresh from the agent for those, in case the resource
+             * agent was updated.
+             *
+             * @TODO Only refresh metadata for starts if the agent actually
+             * changed (using something like inotify, or a hash or modification
+             * time of the agent executable).
+             */
+            if (strcmp(operation, CRMD_ACTION_START) != 0) {
+                md = controld_get_rsc_metadata(lrm_state, rsc,
+                                               controld_metadata_from_cache);
+            }
+
+            if ((md == NULL) && crm_op_needs_metadata(rsc->standard,
+                                                      operation)) {
+                /* Most likely, we'll need the agent metadata to record the
+                 * pending operation and the operation result. Get it now rather
+                 * than wait until then, so the metadata action doesn't eat into
+                 * the real action's timeout.
+                 *
+                 * @TODO Metadata is retrieved via direct execution of the
+                 * agent, which has a couple of related issues: the executor
+                 * should execute agents, not the controller; and metadata for
+                 * Pacemaker Remote nodes should be collected on those nodes,
+                 * not locally.
+                 */
+                struct metadata_cb_data *data = NULL;
+
+                data = new_metadata_cb_data(rsc, input->xml);
+                (void) lrmd__metadata_async(rsc, metadata_complete,
+                                            (void *) data);
+            } else {
+                do_lrm_rsc_op(lrm_state, rsc, input->xml, md);
+            }
         }
 
         lrmd_free_rsc_info(rsc);
diff --git a/daemons/controld/controld_metadata.c b/daemons/controld/controld_metadata.c
index 91a6a10..a954ebd 100644
--- a/daemons/controld/controld_metadata.c
+++ b/daemons/controld/controld_metadata.c
@@ -356,17 +356,19 @@ controld_get_rsc_metadata(lrm_state_t *lrm_state, lrmd_rsc_info_t *rsc,
         return NULL;
     }
 
-    /* For now, we always collect resource agent meta-data via a local,
-     * synchronous, direct execution of the agent. This has multiple issues:
-     * the executor should execute agents, not the controller; meta-data for
-     * Pacemaker Remote nodes should be collected on those nodes, not
-     * locally; and the meta-data call shouldn't eat into the timeout of the
-     * real action being performed.
+    /* For most actions, metadata was cached asynchronously before action
+     * execution (via metadata_complete()).
      *
-     * These issues are planned to be addressed by having the scheduler
-     * schedule a meta-data cache check at the beginning of each transition.
-     * Once that is working, this block will only be a fallback in case the
-     * initial collection fails.
+     * However, if that failed, and for other actions, retrieve the metadata now
+     * via a local, synchronous, direct execution of the agent.
+     *
+     * This has multiple issues, which is why this is just a fallback: the
+     * executor should execute agents, not the controller; metadata for
+     * Pacemaker Remote nodes should be collected on those nodes, not locally;
+     * the metadata call shouldn't eat into the timeout of the real action being
+     * performed; and the synchronous call blocks the controller (which also
+     * means that if the metadata action tries to contact the controller,
+     * everything will hang until the timeout).
      */
     rc = lrm_state_get_metadata(lrm_state, rsc->standard, rsc->provider,
                                 rsc->type, &metadata_str, 0);
-- 
2.31.1

From fddf663d5285740771145e83c41f33c0bfb86dfb Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Mon, 19 Sep 2022 15:19:06 -0500
Subject: [PATCH 18/24] Low: libstonithd: return CRM_EX_NOSUCH for bad agent
 namespace

Callers can't rely on a particular exit code scheme at this point, but
returning a more specific code doesn't hurt.
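
For example, an asynchronous caller could eventually branch on the new code
(a hypothetical sketch only; no caller does this yet):

    /* Hypothetical callback distinguishing the new exit status */
    static void
    metadata_cb(int pid, const pcmk__action_result_t *result, void *user_data)
    {
        if (result->exit_status == CRM_EX_NOSUCH) {
            /* Unknown agent or namespace -- retrying will not help */
        }
    }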
---
 lib/fencing/st_client.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/fencing/st_client.c b/lib/fencing/st_client.c
index 91075bd..d41b066 100644
--- a/lib/fencing/st_client.c
+++ b/lib/fencing/st_client.c
@@ -2451,7 +2451,7 @@ stonith__metadata_async(const char *agent, int timeout_sec,
         default:
             {
                 pcmk__action_result_t result = {
-                    .exit_status = CRM_EX_ERROR,
+                    .exit_status = CRM_EX_NOSUCH,
                     .execution_status = PCMK_EXEC_ERROR_HARD,
                     .exit_reason = crm_strdup_printf("No such agent '%s'",
                                                      agent),
-- 
2.31.1

From 2de926f5b2b5dbf28f994bc35477d59ce46d5ab1 Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Mon, 19 Sep 2022 15:23:43 -0500
Subject: [PATCH 19/24] Low: liblrmd: consider invalid agent specification a
 fatal error

---
 lib/lrmd/lrmd_client.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/lib/lrmd/lrmd_client.c b/lib/lrmd/lrmd_client.c
index 4b16bf0..d691dce 100644
--- a/lib/lrmd/lrmd_client.c
+++ b/lib/lrmd/lrmd_client.c
@@ -2402,7 +2402,8 @@ lrmd__metadata_async(lrmd_rsc_info_t *rsc,
     CRM_CHECK(callback != NULL, return EINVAL);
 
     if ((rsc == NULL) || (rsc->standard == NULL) || (rsc->type == NULL)) {
-        pcmk__set_result(&result, PCMK_OCF_NOT_CONFIGURED, PCMK_EXEC_ERROR,
+        pcmk__set_result(&result, PCMK_OCF_NOT_CONFIGURED,
+                         PCMK_EXEC_ERROR_FATAL,
                          "Invalid resource specification");
         callback(0, &result, user_data);
         pcmk__reset_result(&result);
-- 
2.31.1

From 2d526dae9dbfc6f8658ff96f5f6d58ee09ea879c Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Mon, 19 Sep 2022 15:25:12 -0500
Subject: [PATCH 20/24] Low: liblrmd: use resource ID for metadata actions when
 available

---
 lib/lrmd/lrmd_client.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/lib/lrmd/lrmd_client.c b/lib/lrmd/lrmd_client.c
index d691dce..570a2b8 100644
--- a/lib/lrmd/lrmd_client.c
+++ b/lib/lrmd/lrmd_client.c
@@ -2416,11 +2416,11 @@ lrmd__metadata_async(lrmd_rsc_info_t *rsc,
                                        callback, user_data);
     }
 
-    action = services__create_resource_action(rsc->type, rsc->standard,
-                                              rsc->provider, rsc->type,
-                                              CRMD_ACTION_METADATA, 0,
-                                              CRMD_METADATA_CALL_TIMEOUT, NULL,
-                                              0);
+    action = services__create_resource_action((rsc->id != NULL ? rsc->id : rsc->type),
+                                              rsc->standard, rsc->provider,
+                                              rsc->type, CRMD_ACTION_METADATA,
+                                              0, CRMD_METADATA_CALL_TIMEOUT,
+                                              NULL, 0);
     if (action == NULL) {
         pcmk__set_result(&result, PCMK_OCF_UNKNOWN_ERROR, PCMK_EXEC_ERROR,
                          "Out of memory");
-- 
2.31.1

From 3d632be58dca13293e4ae974da5dfe2838fcdf12 Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Mon, 19 Sep 2022 15:27:11 -0500
Subject: [PATCH 21/24] Refactor: controller: executor query can assume local
 node

---
 daemons/controld/controld_execd.c       | 6 +++---
 daemons/controld/controld_fsa.h         | 4 ++--
 daemons/controld/controld_join_client.c | 2 +-
 daemons/controld/controld_join_dc.c     | 2 +-
 4 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/daemons/controld/controld_execd.c b/daemons/controld/controld_execd.c
index c56fdf5..039b194 100644
--- a/daemons/controld/controld_execd.c
+++ b/daemons/controld/controld_execd.c
@@ -796,16 +796,16 @@ build_active_RAs(lrm_state_t * lrm_state, xmlNode * rsc_list)
 }
 
 xmlNode *
-controld_query_executor_state(const char *node_name)
+controld_query_executor_state(void)
 {
     xmlNode *xml_state = NULL;
     xmlNode *xml_data = NULL;
     xmlNode *rsc_list = NULL;
     crm_node_t *peer = NULL;
-    lrm_state_t *lrm_state = lrm_state_find(node_name);
+    lrm_state_t *lrm_state = lrm_state_find(fsa_our_uname);
 
     if (!lrm_state) {
-        crm_err("Could not find executor state for node %s", node_name);
+        crm_err("Could not find executor state for node %s", fsa_our_uname);
         return NULL;
     }
 
diff --git a/daemons/controld/controld_fsa.h b/daemons/controld/controld_fsa.h
index 296232f..d137310 100644
--- a/daemons/controld/controld_fsa.h
+++ b/daemons/controld/controld_fsa.h
@@ -1,5 +1,5 @@
 /*
- * Copyright 2004-2021 the Pacemaker project contributors
+ * Copyright 2004-2022 the Pacemaker project contributors
  *
  * The version control history for this file may have further details.
  *
@@ -518,7 +518,7 @@ extern gboolean ever_had_quorum;
 // These should be moved elsewhere
 void do_update_cib_nodes(gboolean overwrite, const char *caller);
 int crmd_cib_smart_opt(void);
-xmlNode *controld_query_executor_state(const char *node_name);
+xmlNode *controld_query_executor_state(void);
 
 const char *fsa_input2string(enum crmd_fsa_input input);
 const char *fsa_state2string(enum crmd_fsa_state state);
diff --git a/daemons/controld/controld_join_client.c b/daemons/controld/controld_join_client.c
index 6485856..bfec430 100644
--- a/daemons/controld/controld_join_client.c
+++ b/daemons/controld/controld_join_client.c
@@ -268,7 +268,7 @@ do_cl_join_finalize_respond(long long action,
     update_dc_expected(input->msg);
 
     /* send our status section to the DC */
-    tmp1 = controld_query_executor_state(fsa_our_uname);
+    tmp1 = controld_query_executor_state();
     if (tmp1 != NULL) {
         xmlNode *reply = create_request(CRM_OP_JOIN_CONFIRM, tmp1, fsa_our_dc,
                                         CRM_SYSTEM_DC, CRM_SYSTEM_CRMD, NULL);
diff --git a/daemons/controld/controld_join_dc.c b/daemons/controld/controld_join_dc.c
index 9386182..9a8ea3e 100644
--- a/daemons/controld/controld_join_dc.c
+++ b/daemons/controld/controld_join_dc.c
@@ -591,7 +591,7 @@ do_dc_join_ack(long long action,
     }
     controld_delete_node_state(join_from, section, cib_scope_local);
     if (pcmk__str_eq(join_from, fsa_our_uname, pcmk__str_casei)) {
-        xmlNode *now_dc_lrmd_state = controld_query_executor_state(fsa_our_uname);
+        xmlNode *now_dc_lrmd_state = controld_query_executor_state();
 
         if (now_dc_lrmd_state != NULL) {
             fsa_cib_update(XML_CIB_TAG_STATUS, now_dc_lrmd_state,
-- 
2.31.1

From d852ec335bd5b518a3f06c7f1b597370094311ae Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Tue, 20 Sep 2022 10:18:48 -0500
Subject: [PATCH 22/24] Log: controller: add messages when getting agent
 metadata

---
 daemons/controld/controld_execd.c    |  5 +++++
 daemons/controld/controld_metadata.c | 10 ++++++++++
 2 files changed, 15 insertions(+)

diff --git a/daemons/controld/controld_execd.c b/daemons/controld/controld_execd.c
index 039b194..f02da82 100644
--- a/daemons/controld/controld_execd.c
+++ b/daemons/controld/controld_execd.c
@@ -1884,6 +1884,11 @@ do_lrm_invoke(long long action,
                 struct metadata_cb_data *data = NULL;
 
                 data = new_metadata_cb_data(rsc, input->xml);
+                crm_info("Retrieving metadata for %s (%s%s%s:%s) asynchronously",
+                         rsc->id, rsc->standard,
+                         ((rsc->provider == NULL)? "" : ":"),
+                         ((rsc->provider == NULL)? "" : rsc->provider),
+                         rsc->type);
                 (void) lrmd__metadata_async(rsc, metadata_complete,
                                             (void *) data);
             } else {
diff --git a/daemons/controld/controld_metadata.c b/daemons/controld/controld_metadata.c
index a954ebd..39b43b0 100644
--- a/daemons/controld/controld_metadata.c
+++ b/daemons/controld/controld_metadata.c
@@ -348,6 +348,11 @@ controld_get_rsc_metadata(lrm_state_t *lrm_state, lrmd_rsc_info_t *rsc,
             free(key);
         }
         if (metadata != NULL) {
+            crm_debug("Retrieved metadata for %s (%s%s%s:%s) from cache",
+                      rsc->id, rsc->standard,
+                      ((rsc->provider == NULL)? "" : ":"),
+                      ((rsc->provider == NULL)? "" : rsc->provider),
+                      rsc->type);
             return metadata;
         }
     }
@@ -370,6 +375,11 @@ controld_get_rsc_metadata(lrm_state_t *lrm_state, lrmd_rsc_info_t *rsc,
      * means that if the metadata action tries to contact the controller,
      * everything will hang until the timeout).
      */
+    crm_debug("Retrieving metadata for %s (%s%s%s:%s) synchronously",
+              rsc->id, rsc->standard,
+              ((rsc->provider == NULL)? "" : ":"),
+              ((rsc->provider == NULL)? "" : rsc->provider),
+              rsc->type);
     rc = lrm_state_get_metadata(lrm_state, rsc->standard, rsc->provider,
                                 rsc->type, &metadata_str, 0);
     if (rc != pcmk_ok) {
-- 
2.31.1

From 5aec773a20e1ded971a4082358e266353615f196 Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Wed, 14 Sep 2022 14:36:44 -0500
Subject: [PATCH 23/24] Test: cts-lab: allow any whitespace in "Recover"
 messages

This seems to have always been multiple spaces; it's not clear what happened.
---
 cts/lab/CTStests.py | 12 ++++++------
 cts/lab/patterns.py |  4 ++--
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/cts/lab/CTStests.py b/cts/lab/CTStests.py
index 5535177..8b56758 100644
--- a/cts/lab/CTStests.py
+++ b/cts/lab/CTStests.py
@@ -1,7 +1,7 @@
 """ Test-specific classes for Pacemaker's Cluster Test Suite (CTS)
 """
 
-__copyright__ = "Copyright 2000-2021 the Pacemaker project contributors"
+__copyright__ = "Copyright 2000-2022 the Pacemaker project contributors"
 __license__ = "GNU General Public License version 2 or later (GPLv2+) WITHOUT ANY WARRANTY"
 
 #
@@ -1225,7 +1225,7 @@ class MaintenanceMode(CTSTest):
         '''Return list of errors which should be ignored'''
         return [
             r"Updating failcount for %s" % self.rid,
-            r"schedulerd.*: Recover %s\s*\(.*\)" % self.rid,
+            r"schedulerd.*: Recover\s+%s\s+\(.*\)" % self.rid,
             r"Unknown operation: fail",
             self.templates["Pat:RscOpOK"] % (self.action, self.rid),
             r"(ERROR|error).*: Action %s_%s_%d .* initiated outside of a transition" % (self.rid, self.action, self.interval),
@@ -1324,7 +1324,7 @@ class ResourceRecover(CTSTest):
         '''Return list of errors which should be ignored'''
         return [
             r"Updating failcount for %s" % self.rid,
-            r"schedulerd.*: Recover (%s|%s)\s*\(.*\)" % (self.rid, self.rid_alt),
+            r"schedulerd.*: Recover\s+(%s|%s)\s+\(.*\)" % (self.rid, self.rid_alt),
             r"Unknown operation: fail",
             self.templates["Pat:RscOpOK"] % (self.action, self.rid),
             r"(ERROR|error).*: Action %s_%s_%d .* initiated outside of a transition" % (self.rid, self.action, self.interval),
@@ -2559,7 +2559,7 @@ class RemoteLXC(CTSTest):
         '''Return list of errors which should be ignored'''
         return [
             r"Updating failcount for ping",
-            r"schedulerd.*: Recover (ping|lxc-ms|container)\s*\(.*\)",
+            r"schedulerd.*: Recover\s+(ping|lxc-ms|container)\s+\(.*\)",
             # The orphaned lxc-ms resource causes an expected transition error
             # that is a result of the scheduler not having knowledge that the
             # promotable resource used to be a clone. As a result, it looks like that 
@@ -3054,7 +3054,7 @@ class RemoteStonithd(RemoteDriver):
             r"Software caused connection abort",
             r"pacemaker-controld.*:\s+error.*: Operation remote-.*_monitor",
             r"pacemaker-controld.*:\s+error.*: Result of monitor operation for remote-.*",
-            r"schedulerd.*:\s+Recover remote-.*\s*\(.*\)",
+            r"schedulerd.*:\s+Recover\s+remote-.*\s+\(.*\)",
             r"error: Result of monitor operation for .* on remote-.*: Internal communication failure",
         ]
 
@@ -3120,7 +3120,7 @@ class RemoteRscFailure(RemoteDriver):
 
     def errorstoignore(self):
         ignore_pats = [
-            r"schedulerd.*: Recover remote-rsc\s*\(.*\)",
+            r"schedulerd.*: Recover\s+remote-rsc\s+\(.*\)",
             r"Dummy.*: No process state file found",
         ]
 
diff --git a/cts/lab/patterns.py b/cts/lab/patterns.py
index 90cac73..6e718f7 100644
--- a/cts/lab/patterns.py
+++ b/cts/lab/patterns.py
@@ -66,7 +66,7 @@ class BasePatterns(object):
 
             "Pat:Fencing_start"   : r"Requesting peer fencing .* targeting %s",
             "Pat:Fencing_ok"      : r"pacemaker-fenced.*:\s*Operation .* targeting %s by .* for .*@.*: OK",
-            "Pat:Fencing_recover" : r"pacemaker-schedulerd.*: Recover %s",
+            "Pat:Fencing_recover" : r"pacemaker-schedulerd.*: Recover\s+%s",
             "Pat:Fencing_active"  : r"stonith resource .* is active on 2 nodes (attempting recovery)",
             "Pat:Fencing_probe"   : r"pacemaker-controld.* Result of probe operation for %s on .*: Error",
 
@@ -180,7 +180,7 @@ class crm_corosync(BasePatterns):
             r"Parameters to .* action changed:",
             r"Parameters to .* changed",
             r"pacemakerd.*\[[0-9]+\] terminated( with signal| as IPC server|$)",
-            r"pacemaker-schedulerd.*Recover .*\(.* -\> .*\)",
+            r"pacemaker-schedulerd.*Recover\s+.*\(.* -\> .*\)",
             r"rsyslogd.* imuxsock lost .* messages from pid .* due to rate-limiting",
             r"Peer is not part of our cluster",
             r"We appear to be in an election loop",
-- 
2.31.1

From 338cf55d19cb4ebebedf092dd0a5969ac2eda295 Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Mon, 19 Sep 2022 15:55:42 -0500
Subject: [PATCH 24/24] Test: cts-lab: match parentheses correctly

---
 cts/lab/patterns.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/cts/lab/patterns.py b/cts/lab/patterns.py
index 6e718f7..856fffb 100644
--- a/cts/lab/patterns.py
+++ b/cts/lab/patterns.py
@@ -271,6 +271,7 @@ class crm_corosync(BasePatterns):
         ]
         self.components["pacemaker-based-ignore"] = [
             r"pacemaker-execd.*Connection to (fencer|stonith-ng).* (closed|failed|lost)",
+            r"pacemaker-controld.*:\s+Result of .* operation for Fencing.*Error \(Lost connection to fencer\)",
             # This is overbroad, but we don't have a way to say that only
             # certain transition errors are acceptable (if the fencer respawns,
             # fence devices may appear multiply active). We have to rely on
@@ -328,7 +329,7 @@ class crm_corosync(BasePatterns):
             r"crit:.*Fencing daemon connection failed",
             r"error:.*Fencer connection failed \(will retry\)",
             r"Connection to (fencer|stonith-ng) failed, finalizing .* pending operations",
-            r"pacemaker-controld.*:\s+Result of .* operation for Fencing.*Error",
+            r"pacemaker-controld.*:\s+Result of .* operation for Fencing.*Error \(Lost connection to fencer\)",
             # This is overbroad, but we don't have a way to say that only
             # certain transition errors are acceptable (if the fencer respawns,
             # fence devices may appear multiply active). We have to rely on
-- 
2.31.1