From 28566d6832274c59f27bb7b2f1f54420a3f3d822 Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Thu, 9 May 2019 20:26:08 -0500
Subject: [PATCH 01/13] Refactor: libpe_status: functionize unfencing digest
 code more

... for readability, reusability, and avoiding unnecessary function calls or
memory allocation.
---
 lib/pengine/utils.c | 159 ++++++++++++++++++++++++++++++++++++++--------------
 1 file changed, 118 insertions(+), 41 deletions(-)

diff --git a/lib/pengine/utils.c b/lib/pengine/utils.c
index 2f4dc1e..f80f8d4 100644
--- a/lib/pengine/utils.c
+++ b/lib/pengine/utils.c
@@ -2080,57 +2080,134 @@ rsc_action_digest_cmp(resource_t * rsc, xmlNode * xml_op, node_t * node,
     return data;
 }
 
+/*!
+ * \internal
+ * \brief Create an unfencing summary for use in special node attribute
+ *
+ * Create a string combining a fence device's resource ID, agent type, and
+ * parameter digest (whether for all parameters or just non-private parameters).
+ * This can be stored in a special node attribute, allowing us to detect changes
+ * in either the agent type or parameters, to know whether unfencing must be
+ * redone or can be safely skipped when the device's history is cleaned.
+ *
+ * \param[in] rsc_id        Fence device resource ID
+ * \param[in] agent_type    Fence device agent
+ * \param[in] param_digest  Fence device parameter digest
+ *
+ * \return Newly allocated string with unfencing digest
+ * \note The caller is responsible for freeing the result.
+ */
+static inline char *
+create_unfencing_summary(const char *rsc_id, const char *agent_type,
+                         const char *param_digest)
+{
+    return crm_strdup_printf("%s:%s:%s", rsc_id, agent_type, param_digest);
+}
+
+/*!
+ * \internal
+ * \brief Check whether a node can skip unfencing
+ *
+ * Check whether a fence device's current definition matches a node's
+ * stored summary of when it was last unfenced by the device.
+ *
+ * \param[in] rsc_id        Fence device's resource ID
+ * \param[in] agent         Fence device's agent type
+ * \param[in] digest_calc   Fence device's current parameter digest
+ * \param[in] node_summary  Value of node's special unfencing node attribute
+ *                          (a comma-separated list of unfencing summaries for
+ *                          all devices that have unfenced this node)
+ *
+ * \return TRUE if digest matches, FALSE otherwise
+ */
+static bool
+unfencing_digest_matches(const char *rsc_id, const char *agent,
+                         const char *digest_calc, const char *node_summary)
+{
+    bool matches = FALSE;
+
+    if (rsc_id && agent && digest_calc && node_summary) {
+        char *search_secure = create_unfencing_summary(rsc_id, agent,
+                                                       digest_calc);
+
+        /* The digest was calculated including the device ID and agent,
+         * so there is no risk of collision using strstr().
+         */
+        matches = (strstr(node_summary, search_secure) != NULL);
+        crm_trace("Calculated unfencing digest '%s' %sfound in '%s'",
+                  search_secure, matches? "" : "not ", node_summary);
+        free(search_secure);
+    }
+    return matches;
+}
+
+/* Magic string to use as action name for digest cache entries used for
+ * unfencing checks. This is not a real action name (i.e. "on"), so
+ * check_action_definition() won't confuse these entries with real actions.
+ */
 #define STONITH_DIGEST_TASK "stonith-on"
 
+/*!
+ * \internal
+ * \brief Calculate fence device digests and digest comparison result
+ *
+ * \param[in] rsc       Fence device resource
+ * \param[in] agent     Fence device's agent type
+ * \param[in] node      Node with digest cache to use
+ * \param[in] data_set  Cluster working set
+ *
+ * \return Node's digest cache entry
+ */
 static op_digest_cache_t *
-fencing_action_digest_cmp(resource_t * rsc, node_t * node, pe_working_set_t * data_set)
+fencing_action_digest_cmp(pe_resource_t *rsc, const char *agent,
+                          pe_node_t *node, pe_working_set_t *data_set)
 {
-    char *key = generate_op_key(rsc->id, STONITH_DIGEST_TASK, 0);
-    op_digest_cache_t *data = rsc_action_digest(rsc, STONITH_DIGEST_TASK, key, node, NULL, data_set);
+    const char *node_summary = NULL;
 
-    const char *digest_all = pe_node_attribute_raw(node, CRM_ATTR_DIGESTS_ALL);
-    const char *digest_secure = pe_node_attribute_raw(node, CRM_ATTR_DIGESTS_SECURE);
+    // Calculate device's current parameter digests
+    char *key = generate_op_key(rsc->id, STONITH_DIGEST_TASK, 0);
+    op_digest_cache_t *data = rsc_action_digest(rsc, STONITH_DIGEST_TASK, key,
+                                                node, NULL, data_set);
 
-    /* No 'reloads' for fencing device changes
-     *
-     * We use the resource id + agent + digest so that we can detect
-     * changes to the agent and/or the parameters used
-     */
-    char *search_all = crm_strdup_printf("%s:%s:%s", rsc->id, (const char*)g_hash_table_lookup(rsc->meta, XML_ATTR_TYPE), data->digest_all_calc);
-    char *search_secure = crm_strdup_printf("%s:%s:%s", rsc->id, (const char*)g_hash_table_lookup(rsc->meta, XML_ATTR_TYPE), data->digest_secure_calc);
+    free(key);
 
-    data->rc = RSC_DIGEST_ALL;
-    if (digest_all == NULL) {
-        /* it is unknown what the previous op digest was */
+    // Check whether node has special unfencing summary node attribute
+    node_summary = pe_node_attribute_raw(node, CRM_ATTR_DIGESTS_ALL);
+    if (node_summary == NULL) {
         data->rc = RSC_DIGEST_UNKNOWN;
+        return data;
+    }
 
-    } else if (strstr(digest_all, search_all)) {
+    // Check whether full parameter digest matches
+    if (unfencing_digest_matches(rsc->id, agent, data->digest_all_calc,
+                                 node_summary)) {
         data->rc = RSC_DIGEST_MATCH;
+        return data;
+    }
 
-    } else if(digest_secure && data->digest_secure_calc) {
-        if(strstr(digest_secure, search_secure)) {
-            if (is_set(data_set->flags, pe_flag_stdout)) {
-                printf("Only 'private' parameters to %s for unfencing %s changed\n",
-                       rsc->id, node->details->uname);
-            }
-            data->rc = RSC_DIGEST_MATCH;
+    // Check whether secure parameter digest matches
+    node_summary = pe_node_attribute_raw(node, CRM_ATTR_DIGESTS_SECURE);
+    if (unfencing_digest_matches(rsc->id, agent, data->digest_secure_calc,
+                                 node_summary)) {
+        data->rc = RSC_DIGEST_MATCH;
+        if (is_set(data_set->flags, pe_flag_stdout)) {
+            printf("Only 'private' parameters to %s for unfencing %s changed\n",
+                   rsc->id, node->details->uname);
         }
+        return data;
     }
 
-    if (is_set(data_set->flags, pe_flag_sanitized)
-        && is_set(data_set->flags, pe_flag_stdout)
-        && (data->rc == RSC_DIGEST_ALL)
+    // Parameters don't match
+    data->rc = RSC_DIGEST_ALL;
+    if (is_set(data_set->flags, (pe_flag_sanitized|pe_flag_stdout))
         && data->digest_secure_calc) {
-        printf("Parameters to %s for unfencing %s changed, try '%s:%s:%s'\n",
-               rsc->id, node->details->uname, rsc->id,
-               (const char *) g_hash_table_lookup(rsc->meta, XML_ATTR_TYPE),
-               data->digest_secure_calc);
-    }
-
-    free(key);
-    free(search_all);
-    free(search_secure);
+        char *digest = create_unfencing_summary(rsc->id, agent,
+                                                data->digest_secure_calc);
 
+        printf("Parameters to %s for unfencing %s changed, try '%s'\n",
+               rsc->id, node->details->uname, digest);
+        free(digest);
+    }
     return data;
 }
 
@@ -2218,9 +2295,6 @@ pe_fence_op(node_t * node, const char *op, bool optional, const char *reason, pe
              *
              * We may do this for all nodes in the future, but for now
              * the check_action_definition() based stuff works fine.
-             *
-             * Use "stonith-on" to avoid creating cache entries for
-             * operations check_action_definition() would look for.
              */
             long max = 1024;
             long digests_all_offset = 0;
@@ -2232,8 +2306,11 @@ pe_fence_op(node_t * node, const char *op, bool optional, const char *reason, pe
 
             for (GListPtr gIter = matches; gIter != NULL; gIter = gIter->next) {
                 resource_t *match = gIter->data;
-                op_digest_cache_t *data = fencing_action_digest_cmp(match, node, data_set);
+                const char *agent = g_hash_table_lookup(match->meta,
+                                                        XML_ATTR_TYPE);
+                op_digest_cache_t *data = NULL;
 
+                data = fencing_action_digest_cmp(match, agent, node, data_set);
                 if(data->rc == RSC_DIGEST_ALL) {
                     optional = FALSE;
                     crm_notice("Unfencing %s (remote): because the definition of %s changed", node->details->uname, match->id);
@@ -2244,11 +2321,11 @@ pe_fence_op(node_t * node, const char *op, bool optional, const char *reason, pe
 
                 digests_all_offset += snprintf(
                     digests_all+digests_all_offset, max-digests_all_offset,
-                    "%s:%s:%s,", match->id, (const char*)g_hash_table_lookup(match->meta, XML_ATTR_TYPE), data->digest_all_calc);
+                    "%s:%s:%s,", match->id, agent, data->digest_all_calc);
 
                 digests_secure_offset += snprintf(
                     digests_secure+digests_secure_offset, max-digests_secure_offset,
-                    "%s:%s:%s,", match->id, (const char*)g_hash_table_lookup(match->meta, XML_ATTR_TYPE), data->digest_secure_calc);
+                    "%s:%s:%s,", match->id, agent, data->digest_secure_calc);
             }
             g_hash_table_insert(stonith_op->meta,
                                 strdup(XML_OP_ATTR_DIGESTS_ALL),
-- 
1.8.3.1


From fd6e06ff419c95f4423202163d2d4dca3f03a4c5 Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Fri, 10 May 2019 11:57:31 -0500
Subject: [PATCH 02/13] Fix: libpe_status: calculate secure digests for
 unfencing ops

The calculation of digests for detecting when unfencing is needed reused
rsc_action_digest(). However, that function would only add secure digests when
the pe_flag_sanitized flag was set, which is only set by crm_simulate, so secure
digests would never be added in normal cluster operation. This led to
node attributes such as name="#digests-secure"
value="stonith-fence_compute-fence-nova:fence_compute:(null),".

Now, rsc_action_digest() takes a new argument that selects whether secure
digests are added; it is always set to TRUE when calculating unfencing digests.
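
For illustration only (a standalone sketch, not part of the patch): the summary
stored in the #digests-all/#digests-secure node attributes is a comma-separated
list of "<rsc-id>:<agent>:<digest>" entries, so with this fix the last field
becomes a real digest instead of "(null)". The values below are made up.

    #include <stdio.h>

    int main(void)
    {
        /* Hypothetical values; the real code uses the fence device's resource
         * ID, its agent type from the resource metadata, and the calculated
         * parameter digest (see create_unfencing_summary()).
         */
        const char *rsc_id = "stonith-fence_compute-fence-nova";
        const char *agent = "fence_compute";
        const char *digest = "0123456789abcdef0123456789abcdef"; /* made up */
        char summary[256];

        snprintf(summary, sizeof(summary), "%s:%s:%s,", rsc_id, agent, digest);
        printf("#digests-secure value: %s\n", summary);
        return 0;
    }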
---
 lib/pengine/utils.c | 27 ++++++++++++++++++++++-----
 1 file changed, 22 insertions(+), 5 deletions(-)

diff --git a/lib/pengine/utils.c b/lib/pengine/utils.c
index f80f8d4..5b893f7 100644
--- a/lib/pengine/utils.c
+++ b/lib/pengine/utils.c
@@ -1936,9 +1936,24 @@ append_versioned_params(xmlNode *versioned_params, const char *ra_version, xmlNo
 }
 #endif
 
+/*!
+ * \internal
+ * \brief Calculate action digests and store in node's digest cache
+ *
+ * \param[in] rsc          Resource that action was for
+ * \param[in] task         Name of action performed
+ * \param[in] key          Action's task key
+ * \param[in] node         Node action was performed on
+ * \param[in] xml_op       XML of operation in CIB status (if available)
+ * \param[in] calc_secure  Whether to calculate secure digest
+ * \param[in] data_set     Cluster working set
+ *
+ * \return Pointer to node's digest cache entry
+ */
 static op_digest_cache_t *
-rsc_action_digest(resource_t * rsc, const char *task, const char *key,
-                  node_t * node, xmlNode * xml_op, pe_working_set_t * data_set) 
+rsc_action_digest(pe_resource_t *rsc, const char *task, const char *key,
+                  pe_node_t *node, xmlNode *xml_op, bool calc_secure,
+                  pe_working_set_t *data_set)
 {
     op_digest_cache_t *data = NULL;
 
@@ -2007,7 +2022,7 @@ rsc_action_digest(resource_t * rsc, const char *task, const char *key,
 
         data->digest_all_calc = calculate_operation_digest(data->params_all, op_version);
 
-        if (is_set(data_set->flags, pe_flag_sanitized)) {
+        if (calc_secure) {
             data->params_secure = copy_xml(data->params_all);
             if(secure_list) {
                 filter_parameters(data->params_secure, secure_list, FALSE);
@@ -2053,7 +2068,9 @@ rsc_action_digest_cmp(resource_t * rsc, xmlNode * xml_op, node_t * node,
 
     interval_ms = crm_parse_ms(interval_ms_s);
     key = generate_op_key(rsc->id, task, interval_ms);
-    data = rsc_action_digest(rsc, task, key, node, xml_op, data_set);
+    data = rsc_action_digest(rsc, task, key, node, xml_op,
+                             is_set(data_set->flags, pe_flag_sanitized),
+                             data_set);
 
     data->rc = RSC_DIGEST_MATCH;
     if (digest_restart && data->digest_restart_calc && strcmp(data->digest_restart_calc, digest_restart) != 0) {
@@ -2167,7 +2184,7 @@ fencing_action_digest_cmp(pe_resource_t *rsc, const char *agent,
     // Calculate device's current parameter digests
     char *key = generate_op_key(rsc->id, STONITH_DIGEST_TASK, 0);
     op_digest_cache_t *data = rsc_action_digest(rsc, STONITH_DIGEST_TASK, key,
-                                                node, NULL, data_set);
+                                                node, NULL, TRUE, data_set);
 
     free(key);
 
-- 
1.8.3.1


From 7886c8ec4dd209078cdc76274ed9d2804ea09b6a Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Wed, 5 Jun 2019 12:54:34 -0500
Subject: [PATCH 03/13] Refactor: controller: pass desired op status when
 synthesizing failure

so we can use new status codes later
---
 daemons/controld/controld_execd.c | 27 +++++++++++++++++++++------
 1 file changed, 21 insertions(+), 6 deletions(-)

diff --git a/daemons/controld/controld_execd.c b/daemons/controld/controld_execd.c
index 8e89216..fed9419 100644
--- a/daemons/controld/controld_execd.c
+++ b/daemons/controld/controld_execd.c
@@ -1424,8 +1424,22 @@ force_reprobe(lrm_state_t *lrm_state, const char *from_sys,
     update_attrd(lrm_state->node_name, CRM_OP_PROBED, NULL, user_name, is_remote_node);
 }
 
+/*!
+ * \internal
+ * \brief Fail a requested action without actually executing it
+ *
+ * For an action that can't be executed, process it similarly to an actual
+ * execution result, with specified error status (except for notify actions,
+ * which will always be treated as successful).
+ *
+ * \param[in] lrm_state  Executor connection that action is for
+ * \param[in] action     Action XML from request
+ * \param[in] rc         Desired return code to use
+ * \param[in] op_status  Desired operation status to use
+ */
 static void
-synthesize_lrmd_failure(lrm_state_t *lrm_state, xmlNode *action, int rc) 
+synthesize_lrmd_failure(lrm_state_t *lrm_state, xmlNode *action,
+                        int op_status, enum ocf_exitcode rc)
 {
     lrmd_event_data_t *op = NULL;
     const char *operation = crm_element_value(action, XML_LRM_ATTR_TASK);
@@ -1451,7 +1465,7 @@ synthesize_lrmd_failure(lrm_state_t *lrm_state, xmlNode *action, int rc)
     if (safe_str_eq(operation, RSC_NOTIFY)) { // Notifications can't fail
         fake_op_status(lrm_state, op, PCMK_LRM_OP_DONE, PCMK_OCF_OK);
     } else {
-        fake_op_status(lrm_state, op, PCMK_LRM_OP_ERROR, rc);
+        fake_op_status(lrm_state, op, op_status, rc);
     }
 
     crm_info("Faking " CRM_OP_FMT " result (%d) on %s",
@@ -1744,7 +1758,8 @@ do_lrm_invoke(long long action,
     if ((lrm_state == NULL) && is_remote_node) {
         crm_err("Failing action because local node has never had connection to remote node %s",
                 target_node);
-        synthesize_lrmd_failure(NULL, input->xml, PCMK_OCF_CONNECTION_DIED);
+        synthesize_lrmd_failure(NULL, input->xml, PCMK_LRM_OP_ERROR,
+                                PCMK_OCF_CONNECTION_DIED);
         return;
     }
     CRM_ASSERT(lrm_state != NULL);
@@ -1800,7 +1815,7 @@ do_lrm_invoke(long long action,
 
         rc = get_lrm_resource(lrm_state, xml_rsc, create_rsc, &rsc);
         if (rc == -ENOTCONN) {
-            synthesize_lrmd_failure(lrm_state, input->xml,
+            synthesize_lrmd_failure(lrm_state, input->xml, PCMK_LRM_OP_ERROR,
                                     PCMK_OCF_CONNECTION_DIED);
             return;
 
@@ -1822,7 +1837,7 @@ do_lrm_invoke(long long action,
             // Resource operation on malformed resource
             crm_err("Invalid resource definition for %s", ID(xml_rsc));
             crm_log_xml_warn(input->msg, "invalid resource");
-            synthesize_lrmd_failure(lrm_state, input->xml,
+            synthesize_lrmd_failure(lrm_state, input->xml, PCMK_LRM_OP_ERROR,
                                     PCMK_OCF_NOT_CONFIGURED); // fatal error
             return;
 
@@ -1832,7 +1847,7 @@ do_lrm_invoke(long long action,
                     CRM_XS " rc=%d",
                     ID(xml_rsc), pcmk_strerror(rc), rc);
             crm_log_xml_warn(input->msg, "failed registration");
-            synthesize_lrmd_failure(lrm_state, input->xml,
+            synthesize_lrmd_failure(lrm_state, input->xml, PCMK_LRM_OP_ERROR,
                                     PCMK_OCF_INVALID_PARAM); // hard error
             return;
         }
-- 
1.8.3.1


From ddc3942d7131db9c9874031ca4b3b4a531221573 Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Wed, 5 Jun 2019 13:08:15 -0500
Subject: [PATCH 04/13] Fix: controller: use op status, not rc, for executor
 disconnection

Previously, if an action was requested of an executor (local or remote) that
the controller had no connection to, the action's rc would be set to
PCMK_OCF_CONNECTION_DIED and its op status to PCMK_LRM_OP_ERROR.

This was undesirable for a couple of reasons: PCMK_OCF_CONNECTION_DIED is a
nonstandard extension to the OCF return codes, which can confuse users trying
to look up its meaning or interpret cluster status output; and the condition
really is an operation execution status, not an operation result.

This changes the result to PCMK_OCF_UNKNOWN_ERROR with a new op status
PCMK_LRM_OP_NOT_CONNECTED. The new codes are mapped to the old ones for older
DCs that don't understand them.
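
As a rough standalone sketch of the compatibility idea (not part of the patch,
and using stand-in constants rather than the real enum values): before a result
is reported to an older DC, the new execution status is translated back into
the old rc/status pair it replaces.

    #include <stdio.h>
    #include <string.h>

    /* Illustrative stand-ins; the real remapping lives in process_lrm_event()
     * and uses the PCMK_LRM_OP_* / PCMK_OCF_* enums and compare_version().
     */
    #define OLD_OCF_CONNECTION_DIED 189
    #define OLD_OP_ERROR              4
    #define NEW_OP_NOT_CONNECTED      8

    static void
    remap_for_old_dc(const char *dc_version, int *op_status, int *rc)
    {
        /* strcmp() is a crude stand-in for compare_version(); it happens to
         * work for the sample versions used here.
         */
        if ((strcmp(dc_version, "3.2.0") < 0)
            && (*op_status == NEW_OP_NOT_CONNECTED)) {
            *op_status = OLD_OP_ERROR;
            *rc = OLD_OCF_CONNECTION_DIED;
        }
    }

    int main(void)
    {
        int status = NEW_OP_NOT_CONNECTED;
        int rc = 1; /* generic OCF error */

        remap_for_old_dc("3.1.0", &status, &rc);
        printf("status=%d rc=%d\n", status, rc); /* prints status=4 rc=189 */
        return 0;
    }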
---
 cts/CTStests.py                         |  2 +-
 daemons/controld/controld_execd.c       | 21 +++++++++++++++++----
 daemons/controld/controld_execd_state.c |  6 ++++--
 include/crm/services.h                  |  4 +++-
 lib/common/operations.c                 |  1 +
 lib/pengine/unpack.c                    |  3 ++-
 6 files changed, 28 insertions(+), 9 deletions(-)

diff --git a/cts/CTStests.py b/cts/CTStests.py
index 32945cb..be7fd7f 100644
--- a/cts/CTStests.py
+++ b/cts/CTStests.py
@@ -3068,7 +3068,7 @@ class RemoteStonithd(RemoteDriver):
             r"schedulerd.*:\s+Recover remote-.*\s*\(.*\)",
             r"Calculated [Tt]ransition .*pe-error",
             r"error.*: Resource .*ocf::.* is active on 2 nodes attempting recovery",
-            r"error: Result of monitor operation for .* on remote-.*: Error",
+            r"error: Result of monitor operation for .* on remote-.*: No executor connection",
         ]
 
         ignore_pats.extend(RemoteDriver.errorstoignore(self))
diff --git a/daemons/controld/controld_execd.c b/daemons/controld/controld_execd.c
index fed9419..ac215b6 100644
--- a/daemons/controld/controld_execd.c
+++ b/daemons/controld/controld_execd.c
@@ -1758,8 +1758,8 @@ do_lrm_invoke(long long action,
     if ((lrm_state == NULL) && is_remote_node) {
         crm_err("Failing action because local node has never had connection to remote node %s",
                 target_node);
-        synthesize_lrmd_failure(NULL, input->xml, PCMK_LRM_OP_ERROR,
-                                PCMK_OCF_CONNECTION_DIED);
+        synthesize_lrmd_failure(NULL, input->xml, PCMK_LRM_OP_NOT_CONNECTED,
+                                PCMK_OCF_UNKNOWN_ERROR);
         return;
     }
     CRM_ASSERT(lrm_state != NULL);
@@ -1815,8 +1815,9 @@ do_lrm_invoke(long long action,
 
         rc = get_lrm_resource(lrm_state, xml_rsc, create_rsc, &rsc);
         if (rc == -ENOTCONN) {
-            synthesize_lrmd_failure(lrm_state, input->xml, PCMK_LRM_OP_ERROR,
-                                    PCMK_OCF_CONNECTION_DIED);
+            synthesize_lrmd_failure(lrm_state, input->xml,
+                                    PCMK_LRM_OP_NOT_CONNECTED,
+                                    PCMK_OCF_UNKNOWN_ERROR);
             return;
 
         } else if ((rc < 0) && !create_rsc) {
@@ -2532,6 +2533,18 @@ process_lrm_event(lrm_state_t *lrm_state, lrmd_event_data_t *op,
     CRM_CHECK(op != NULL, return);
     CRM_CHECK(op->rsc_id != NULL, return);
 
+    // Remap new status codes for older DCs
+    if (compare_version(fsa_our_dc_version, "3.2.0") < 0) {
+        switch (op->op_status) {
+            case PCMK_LRM_OP_NOT_CONNECTED:
+                op->op_status = PCMK_LRM_OP_ERROR;
+                op->rc = PCMK_OCF_CONNECTION_DIED;
+                break;
+            default:
+                break;
+        }
+    }
+
     op_id = make_stop_id(op->rsc_id, op->call_id);
     op_key = generate_op_key(op->rsc_id, op->op_type, op->interval_ms);
 
diff --git a/daemons/controld/controld_execd_state.c b/daemons/controld/controld_execd_state.c
index 4e9f096..63e6b33 100644
--- a/daemons/controld/controld_execd_state.c
+++ b/daemons/controld/controld_execd_state.c
@@ -1,6 +1,8 @@
 /*
  * Copyright 2012-2019 the Pacemaker project contributors
  *
+ * The version control history for this file may have further details.
+ *
  * This source code is licensed under the GNU General Public License version 2
  * or later (GPLv2+) WITHOUT ANY WARRANTY.
  */
@@ -76,8 +78,8 @@ fail_pending_op(gpointer key, gpointer value, gpointer user_data)
     event.user_data = op->user_data;
     event.timeout = 0;
     event.interval_ms = op->interval_ms;
-    event.rc = PCMK_OCF_CONNECTION_DIED;
-    event.op_status = PCMK_LRM_OP_ERROR;
+    event.rc = PCMK_OCF_UNKNOWN_ERROR;
+    event.op_status = PCMK_LRM_OP_NOT_CONNECTED;
     event.t_run = op->start_time;
     event.t_rcchange = op->start_time;
 
diff --git a/include/crm/services.h b/include/crm/services.h
index 4bdd21a..ca9470b 100644
--- a/include/crm/services.h
+++ b/include/crm/services.h
@@ -100,7 +100,7 @@ enum ocf_exitcode {
 
 
     /* 150-199	reserved for application use */
-    PCMK_OCF_CONNECTION_DIED = 189, /* Operation failure implied by disconnection of the LRM API to a local or remote node */
+    PCMK_OCF_CONNECTION_DIED = 189, // Deprecated (see PCMK_LRM_OP_NOT_CONNECTED)
 
     PCMK_OCF_DEGRADED        = 190, /* Active resource that is no longer 100% functional */
     PCMK_OCF_DEGRADED_MASTER = 191, /* Promoted resource that is no longer 100% functional */
@@ -126,6 +126,7 @@ enum op_status {
     PCMK_LRM_OP_ERROR_HARD,
     PCMK_LRM_OP_ERROR_FATAL,
     PCMK_LRM_OP_NOT_INSTALLED,
+    PCMK_LRM_OP_NOT_CONNECTED,
 };
 
 enum nagios_exitcode {
@@ -337,6 +338,7 @@ gboolean services_alert_async(svc_action_t *action,
                 case PCMK_LRM_OP_NOTSUPPORTED:return "NOT SUPPORTED";
                 case PCMK_LRM_OP_ERROR:return "Error";
                 case PCMK_LRM_OP_NOT_INSTALLED:return "Not installed";
+                case PCMK_LRM_OP_NOT_CONNECTED:return "No executor connection";
                 default:return "UNKNOWN!";
         }
     }
diff --git a/lib/common/operations.c b/lib/common/operations.c
index 2144cc6..c6b16cb 100644
--- a/lib/common/operations.c
+++ b/lib/common/operations.c
@@ -395,6 +395,7 @@ did_rsc_op_fail(lrmd_event_data_t * op, int target_rc)
         case PCMK_LRM_OP_NOTSUPPORTED:
         case PCMK_LRM_OP_TIMEOUT:
         case PCMK_LRM_OP_ERROR:
+        case PCMK_LRM_OP_NOT_CONNECTED:
             return TRUE;
             break;
 
diff --git a/lib/pengine/unpack.c b/lib/pengine/unpack.c
index 0e8177b..671f0c4 100644
--- a/lib/pengine/unpack.c
+++ b/lib/pengine/unpack.c
@@ -3163,7 +3163,7 @@ unpack_rsc_op(resource_t * rsc, node_t * node, xmlNode * xml_op, xmlNode ** last
     crm_element_value_ms(xml_op, XML_LRM_ATTR_INTERVAL_MS, &interval_ms);
 
     CRM_CHECK(task != NULL, return FALSE);
-    CRM_CHECK(status <= PCMK_LRM_OP_NOT_INSTALLED, return FALSE);
+    CRM_CHECK(status <= PCMK_LRM_OP_NOT_CONNECTED, return FALSE);
     CRM_CHECK(status >= PCMK_LRM_OP_PENDING, return FALSE);
 
     if (safe_str_eq(task, CRMD_ACTION_NOTIFY) ||
@@ -3304,6 +3304,7 @@ unpack_rsc_op(resource_t * rsc, node_t * node, xmlNode * xml_op, xmlNode ** last
         case PCMK_LRM_OP_ERROR_FATAL:
         case PCMK_LRM_OP_TIMEOUT:
         case PCMK_LRM_OP_NOTSUPPORTED:
+        case PCMK_LRM_OP_NOT_CONNECTED:
 
             failure_strategy = get_action_on_fail(rsc, task_key, task, data_set);
             if ((failure_strategy == action_fail_ignore)
-- 
1.8.3.1


From fc135cb441fb7c66a44fbffe74dcae26c112be3f Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Wed, 5 Jun 2019 13:43:08 -0500
Subject: [PATCH 05/13] Fix: controller: use op status, not rc, for execution
 in invalid state

Previously, if an action was requested while the controller could not execute
actions (i.e. while shutting down), the action's rc would be set to
CRM_DIRECT_NACK_RC and its op status to PCMK_LRM_OP_ERROR.

This was undesirable for a couple of reasons: rc should hold only OCF return
codes, and the condition really is an operation execution status, not an
operation result.

This changes the result to PCMK_OCF_UNKNOWN_ERROR with a new op status
PCMK_LRM_OP_INVALID. The new codes are mapped to the old ones for older
DCs that don't understand them.
---
 daemons/controld/controld_execd.c     |  8 ++++++--
 daemons/controld/controld_fsa.h       |  6 +-----
 daemons/controld/controld_te_events.c | 13 ++++++-------
 include/crm/services.h                |  2 ++
 lib/common/operations.c               |  1 +
 lib/pengine/unpack.c                  |  3 ++-
 6 files changed, 18 insertions(+), 15 deletions(-)

diff --git a/daemons/controld/controld_execd.c b/daemons/controld/controld_execd.c
index ac215b6..a20f96a 100644
--- a/daemons/controld/controld_execd.c
+++ b/daemons/controld/controld_execd.c
@@ -2254,8 +2254,8 @@ do_lrm_rsc_op(lrm_state_t * lrm_state, lrmd_rsc_info_t * rsc, const char *operat
                    operation, rsc->id, fsa_state2string(fsa_state),
                    is_set(fsa_input_register, R_SHUTDOWN)?"true":"false");
 
-        op->rc = CRM_DIRECT_NACK_RC;
-        op->op_status = PCMK_LRM_OP_ERROR;
+        op->rc = PCMK_OCF_UNKNOWN_ERROR;
+        op->op_status = PCMK_LRM_OP_INVALID;
         send_direct_ack(NULL, NULL, rsc, op, rsc->id);
         lrmd_free_event(op);
         free(op_id);
@@ -2540,6 +2540,10 @@ process_lrm_event(lrm_state_t *lrm_state, lrmd_event_data_t *op,
                 op->op_status = PCMK_LRM_OP_ERROR;
                 op->rc = PCMK_OCF_CONNECTION_DIED;
                 break;
+            case PCMK_LRM_OP_INVALID:
+                op->op_status = PCMK_LRM_OP_ERROR;
+                op->rc = CRM_DIRECT_NACK_RC;
+                break;
             default:
                 break;
         }
diff --git a/daemons/controld/controld_fsa.h b/daemons/controld/controld_fsa.h
index 397a9cd..7527ed9 100644
--- a/daemons/controld/controld_fsa.h
+++ b/daemons/controld/controld_fsa.h
@@ -426,11 +426,7 @@ enum crmd_fsa_input {
 
 #  define R_IN_RECOVERY     0x80000000ULL
 
-/*
- * Magic RC used within the controller to indicate direct nacks
- * (operation is invalid in current state)
- */
-#define CRM_DIRECT_NACK_RC (99)
+#define CRM_DIRECT_NACK_RC (99) // Deprecated (see PCMK_LRM_OP_INVALID)
 
 enum crmd_fsa_cause {
     C_UNKNOWN = 0,
diff --git a/daemons/controld/controld_te_events.c b/daemons/controld/controld_te_events.c
index b7b48a4..d297241 100644
--- a/daemons/controld/controld_te_events.c
+++ b/daemons/controld/controld_te_events.c
@@ -123,10 +123,8 @@ update_failcount(xmlNode * event, const char *event_node_uuid, int rc,
     const char *on_uname = crm_peer_uname(event_node_uuid);
     const char *origin = crm_element_value(event, XML_ATTR_ORIGIN);
 
-    /* Nothing needs to be done for success, lrm status refresh,
-     * or direct nack (internal code for "busy, try again")
-     */
-    if ((rc == CRM_DIRECT_NACK_RC) || (rc == target_rc)) {
+    // Nothing needs to be done for success or status refresh
+    if (rc == target_rc) {
         return FALSE;
     } else if (safe_str_eq(origin, "build_active_RAs")) {
         crm_debug("No update for %s (rc=%d) on %s: Old failure from lrm status refresh",
@@ -225,7 +223,7 @@ status_from_rc(crm_action_t * action, int orig_status, int rc, int target_rc)
         return PCMK_LRM_OP_DONE;
     }
 
-    if (rc != CRM_DIRECT_NACK_RC) {
+    if (orig_status != PCMK_LRM_OP_INVALID) {
         const char *task = crm_element_value(action->xml, XML_LRM_ATTR_TASK_KEY);
         const char *uname = crm_element_value(action->xml, XML_LRM_ATTR_TARGET);
 
@@ -541,8 +539,9 @@ process_graph_event(xmlNode *event, const char *event_node)
     if (action && (rc == target_rc)) {
         crm_trace("Processed update to %s: %s", id, magic);
     } else {
-        if (update_failcount(event, event_node, rc, target_rc,
-                             (transition_num == -1), ignore_failures)) {
+        if ((status != PCMK_LRM_OP_INVALID)
+            && update_failcount(event, event_node, rc, target_rc,
+                               (transition_num == -1), ignore_failures)) {
             desc = "failed";
         }
         crm_info("Detected action (%d.%d) %s.%d=%s: %s", transition_num,
diff --git a/include/crm/services.h b/include/crm/services.h
index ca9470b..0771241 100644
--- a/include/crm/services.h
+++ b/include/crm/services.h
@@ -127,6 +127,7 @@ enum op_status {
     PCMK_LRM_OP_ERROR_FATAL,
     PCMK_LRM_OP_NOT_INSTALLED,
     PCMK_LRM_OP_NOT_CONNECTED,
+    PCMK_LRM_OP_INVALID,
 };
 
 enum nagios_exitcode {
@@ -339,6 +340,7 @@ gboolean services_alert_async(svc_action_t *action,
                 case PCMK_LRM_OP_ERROR:return "Error";
                 case PCMK_LRM_OP_NOT_INSTALLED:return "Not installed";
                 case PCMK_LRM_OP_NOT_CONNECTED:return "No executor connection";
+                case PCMK_LRM_OP_INVALID:return "Cannot execute now";
                 default:return "UNKNOWN!";
         }
     }
diff --git a/lib/common/operations.c b/lib/common/operations.c
index c6b16cb..480bddc 100644
--- a/lib/common/operations.c
+++ b/lib/common/operations.c
@@ -396,6 +396,7 @@ did_rsc_op_fail(lrmd_event_data_t * op, int target_rc)
         case PCMK_LRM_OP_TIMEOUT:
         case PCMK_LRM_OP_ERROR:
         case PCMK_LRM_OP_NOT_CONNECTED:
+        case PCMK_LRM_OP_INVALID:
             return TRUE;
             break;
 
diff --git a/lib/pengine/unpack.c b/lib/pengine/unpack.c
index 671f0c4..fb1ab60 100644
--- a/lib/pengine/unpack.c
+++ b/lib/pengine/unpack.c
@@ -3163,7 +3163,7 @@ unpack_rsc_op(resource_t * rsc, node_t * node, xmlNode * xml_op, xmlNode ** last
     crm_element_value_ms(xml_op, XML_LRM_ATTR_INTERVAL_MS, &interval_ms);
 
     CRM_CHECK(task != NULL, return FALSE);
-    CRM_CHECK(status <= PCMK_LRM_OP_NOT_CONNECTED, return FALSE);
+    CRM_CHECK(status <= PCMK_LRM_OP_INVALID, return FALSE);
     CRM_CHECK(status >= PCMK_LRM_OP_PENDING, return FALSE);
 
     if (safe_str_eq(task, CRMD_ACTION_NOTIFY) ||
@@ -3305,6 +3305,7 @@ unpack_rsc_op(resource_t * rsc, node_t * node, xmlNode * xml_op, xmlNode ** last
         case PCMK_LRM_OP_TIMEOUT:
         case PCMK_LRM_OP_NOTSUPPORTED:
         case PCMK_LRM_OP_NOT_CONNECTED:
+        case PCMK_LRM_OP_INVALID:
 
             failure_strategy = get_action_on_fail(rsc, task_key, task, data_set);
             if ((failure_strategy == action_fail_ignore)
-- 
1.8.3.1


From f5ea526b211e95ece16acb0f72bfbbbda60ec437 Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Wed, 12 Jun 2019 20:48:59 -0500
Subject: [PATCH 06/13] Doc: libcrmcommon: document CRM_FEATURE_SET in API docs

---
 include/crm/crm.h | 23 ++++++++++++++++++++++-
 1 file changed, 22 insertions(+), 1 deletion(-)

diff --git a/include/crm/crm.h b/include/crm/crm.h
index 5f323e8..56a2048 100644
--- a/include/crm/crm.h
+++ b/include/crm/crm.h
@@ -1,5 +1,5 @@
 /*
- * Copyright 2004-2018 the Pacemaker project contributors
+ * Copyright 2004-2019 the Pacemaker project contributors
  *
  * The version control history for this file may have further details.
  *
@@ -29,6 +29,27 @@ extern "C" {
 
 #  include <libxml/tree.h>
 
+/*!
+ * The CRM feature set assists with compatibility in mixed-version clusters.
+ * The major version number increases when nodes with different versions
+ * would not work (rolling upgrades are not allowed). The minor version
+ * number increases when mixed-version clusters are allowed only during
+ * rolling upgrades (a node with the oldest feature set will be elected DC). The
+ * minor-minor version number is ignored, but allows resource agents to detect
+ * cluster support for various features.
+ *
+ * The feature set also affects the processing of old saved CIBs (such as for
+ * many scheduler regression tests).
+ *
+ * Particular feature points currently used by pacemaker:
+ *
+ * >2.1:     Operation updates include timing data
+ * >=3.0.5:  XML v2 digests are created
+ * >=3.0.8:  Peers do not need acks for cancellations
+ * >=3.0.9:  DC will send its own shutdown request to all peers
+ *           XML v2 patchsets are created by default
+ * >=3.0.13: Fail counts include operation name and interval
+ */
 #  define CRM_FEATURE_SET		"3.1.0"
 
 #  define EOS		'\0'
-- 
1.8.3.1


From 1ff54a448b1178a34f2dd4f615221087e08468de Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Wed, 12 Jun 2019 20:51:21 -0500
Subject: [PATCH 07/13] Feature: libcrmcommon: bump CRM feature set

... for the new LRM op status codes
---
 include/crm/crm.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/include/crm/crm.h b/include/crm/crm.h
index 56a2048..cbf72d3 100644
--- a/include/crm/crm.h
+++ b/include/crm/crm.h
@@ -49,8 +49,9 @@ extern "C" {
  * >=3.0.9:  DC will send its own shutdown request to all peers
  *           XML v2 patchsets are created by default
  * >=3.0.13: Fail counts include operation name and interval
+ * >=3.2.0:  DC supports PCMK_LRM_OP_INVALID and PCMK_LRM_OP_NOT_CONNECTED
  */
-#  define CRM_FEATURE_SET		"3.1.0"
+#  define CRM_FEATURE_SET		"3.2.0"
 
 #  define EOS		'\0'
 #  define DIMOF(a)	((int) (sizeof(a)/sizeof(a[0])) )
-- 
1.8.3.1


From efc639cc835fba27fa5af4a0539e995d95660520 Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Wed, 5 Jun 2019 15:12:20 -0500
Subject: [PATCH 08/13] Low: libpe_status: fail connection resource if remote
 action gets "not connected"

---
 lib/pengine/unpack.c | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/lib/pengine/unpack.c b/lib/pengine/unpack.c
index fb1ab60..081df07 100644
--- a/lib/pengine/unpack.c
+++ b/lib/pengine/unpack.c
@@ -3299,12 +3299,25 @@ unpack_rsc_op(resource_t * rsc, node_t * node, xmlNode * xml_op, xmlNode ** last
             unpack_rsc_op_failure(rsc, node, rc, xml_op, last_failure, on_fail, data_set);
             break;
 
+        case PCMK_LRM_OP_NOT_CONNECTED:
+            if (pe__is_guest_or_remote_node(node)
+                && is_set(node->details->remote_rsc->flags, pe_rsc_managed)) {
+                /* We should never get into a situation where a managed remote
+                 * connection resource is considered OK but a resource action
+                 * behind the connection gets a "not connected" status. But as a
+                 * fail-safe in case a bug or unusual circumstances do lead to
+                 * that, ensure the remote connection is considered failed.
+                 */
+                set_bit(node->details->remote_rsc->flags, pe_rsc_failed);
+            }
+
+            // fall through
+
         case PCMK_LRM_OP_ERROR:
         case PCMK_LRM_OP_ERROR_HARD:
         case PCMK_LRM_OP_ERROR_FATAL:
         case PCMK_LRM_OP_TIMEOUT:
         case PCMK_LRM_OP_NOTSUPPORTED:
-        case PCMK_LRM_OP_NOT_CONNECTED:
         case PCMK_LRM_OP_INVALID:
 
             failure_strategy = get_action_on_fail(rsc, task_key, task, data_set);
-- 
1.8.3.1


From dad337a96dfeca4dbde7bbd97f99f24956440fc2 Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Sat, 8 Jun 2019 16:25:04 -0500
Subject: [PATCH 09/13] Refactor: libpe_status: add function for checking
 shutdown attribute

... to reduce code duplication and allow further reuse
---
 include/crm/pengine/internal.h |  2 ++
 lib/pengine/unpack.c           |  8 ++------
 lib/pengine/utils.c            | 20 ++++++++++++++++++++
 3 files changed, 24 insertions(+), 6 deletions(-)

diff --git a/include/crm/pengine/internal.h b/include/crm/pengine/internal.h
index fd55bb9..a2a3d52 100644
--- a/include/crm/pengine/internal.h
+++ b/include/crm/pengine/internal.h
@@ -359,4 +359,6 @@ void pe__foreach_param_check(pe_working_set_t *data_set,
                                         enum pe_check_parameters,
                                         pe_working_set_t*));
 void pe__free_param_checks(pe_working_set_t *data_set);
+
+bool pe__shutdown_requested(pe_node_t *node);
 #endif
diff --git a/lib/pengine/unpack.c b/lib/pengine/unpack.c
index 081df07..9d13a57 100644
--- a/lib/pengine/unpack.c
+++ b/lib/pengine/unpack.c
@@ -909,7 +909,6 @@ unpack_handle_remote_attrs(node_t *this_node, xmlNode *state, pe_working_set_t *
     const char *resource_discovery_enabled = NULL;
     xmlNode *attrs = NULL;
     resource_t *rsc = NULL;
-    const char *shutdown = NULL;
 
     if (crm_str_eq((const char *)state->name, XML_CIB_TAG_STATE, TRUE) == FALSE) {
         return;
@@ -931,8 +930,7 @@ unpack_handle_remote_attrs(node_t *this_node, xmlNode *state, pe_working_set_t *
     attrs = find_xml_node(state, XML_TAG_TRANSIENT_NODEATTRS, FALSE);
     add_node_attrs(attrs, this_node, TRUE, data_set);
 
-    shutdown = pe_node_attribute_raw(this_node, XML_CIB_ATTR_SHUTDOWN);
-    if (shutdown != NULL && safe_str_neq("0", shutdown)) {
+    if (pe__shutdown_requested(this_node)) {
         crm_info("Node %s is shutting down", this_node->details->uname);
         this_node->details->shutdown = TRUE;
         if (rsc) {
@@ -1392,7 +1390,6 @@ gboolean
 determine_online_status(xmlNode * node_state, node_t * this_node, pe_working_set_t * data_set)
 {
     gboolean online = FALSE;
-    const char *shutdown = NULL;
     const char *exp_state = crm_element_value(node_state, XML_NODE_EXPECTED);
 
     if (this_node == NULL) {
@@ -1402,9 +1399,8 @@ determine_online_status(xmlNode * node_state, node_t * this_node, pe_working_set
 
     this_node->details->shutdown = FALSE;
     this_node->details->expected_up = FALSE;
-    shutdown = pe_node_attribute_raw(this_node, XML_CIB_ATTR_SHUTDOWN);
 
-    if (shutdown != NULL && safe_str_neq("0", shutdown)) {
+    if (pe__shutdown_requested(this_node)) {
         this_node->details->shutdown = TRUE;
 
     } else if (safe_str_eq(exp_state, CRMD_JOINSTATE_MEMBER)) {
diff --git a/lib/pengine/utils.c b/lib/pengine/utils.c
index 5b893f7..c5fd0f7 100644
--- a/lib/pengine/utils.c
+++ b/lib/pengine/utils.c
@@ -2510,3 +2510,23 @@ void pe_action_set_reason(pe_action_t *action, const char *reason, bool overwrit
         }
     }
 }
+
+/*!
+ * \internal
+ * \brief Check whether shutdown has been requested for a node
+ *
+ * \param[in] node  Node to check
+ *
+ * \return TRUE if node has shutdown attribute set and nonzero, FALSE otherwise
+ * \note This differs from simply using node->details->shutdown in that it can
+ *       be used before that has been determined (and in fact to determine it),
+ *       and it can also be used to distinguish requested shutdown from implicit
+ *       shutdown of remote nodes by virtue of their connection stopping.
+ */
+bool
+pe__shutdown_requested(pe_node_t *node)
+{
+    const char *shutdown = pe_node_attribute_raw(node, XML_CIB_ATTR_SHUTDOWN);
+
+    return shutdown && strcmp(shutdown, "0");
+}
-- 
1.8.3.1


From 1e9903326a59f58d9dd2f2618d709f8aa61e41e9 Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Wed, 5 Jun 2019 16:37:26 -0500
Subject: [PATCH 10/13] Fix: scheduler: remote state is failed if node is
 shutting down with connection failure

When determining remote state, if the connection resource is failed and not
being started again, we consider the state to be unknown if the connection has
a reconnect interval, because we won't know whether the connection can be
recovered until the interval expires and we re-attempt connection.

However, if the node is shutting down at the time, we won't re-attempt
connection, so we consider the state failed in that case. (Note that we check
the actual shutdown node attribute rather than node->details->shutdown, since
the latter is set for remote nodes whenever the connection is stopping.)

This avoids a situation where actions that cannot succeed can be scheduled on a
remote node that's shutting down.
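
A minimal, self-contained sketch of the decision described above (the real
logic is in get_remote_node_state() and works on pe_node_t/pe_resource_t; the
boolean inputs here are assumptions standing in for those checks):

    #include <stdbool.h>
    #include <stdio.h>

    enum remote_state { remote_state_failed, remote_state_unknown };

    /* For a failed connection that will not be restarted */
    static enum remote_state
    state_when_connection_failed(bool has_reconnect_interval, bool was_fenced,
                                 bool shutdown_requested)
    {
        if (has_reconnect_interval && was_fenced && !shutdown_requested) {
            /* Recoverability is unknown until the reconnect interval expires
             * and reconnection is attempted.
             */
            return remote_state_unknown;
        }
        /* Otherwise (e.g. shutting down, or no reconnect interval) no
         * reconnection will be attempted, so treat the state as failed.
         */
        return remote_state_failed;
    }

    int main(void)
    {
        printf("%d\n", state_when_connection_failed(true, true, false)); /* unknown */
        printf("%d\n", state_when_connection_failed(true, true, true));  /* failed */
        return 0;
    }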
---
 lib/pacemaker/pcmk_sched_allocate.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/lib/pacemaker/pcmk_sched_allocate.c b/lib/pacemaker/pcmk_sched_allocate.c
index 3363a72..b7d1b48 100644
--- a/lib/pacemaker/pcmk_sched_allocate.c
+++ b/lib/pacemaker/pcmk_sched_allocate.c
@@ -1972,7 +1972,8 @@ get_remote_node_state(pe_node_t *node)
 
         if ((remote_rsc->next_role == RSC_ROLE_STOPPED)
             && remote_rsc->remote_reconnect_ms
-            && node->details->remote_was_fenced) {
+            && node->details->remote_was_fenced
+            && !pe__shutdown_requested(node)) {
 
             /* We won't know whether the connection is recoverable until the
              * reconnect interval expires and we reattempt connection.
-- 
1.8.3.1


From ea70750d04219618b5feeda04443b27616e441a0 Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Wed, 5 Jun 2019 16:43:19 -0500
Subject: [PATCH 11/13] Fix: libpe_status: don't order implied stops relative
 to a remote connection

Actions behind a remote connection are ordered relative to any start or stop of
the remote connection. However, if the action is a stop implied due to fencing,
it does not require the remote connection, and the ordering should not be done.

This avoids a delay in remote connection recovery when the connection has
failed. Previously, the ordering would look like:

   fence remote node -> implied stop of resource on remote -> stop connection

Now, the connection stop can proceed simultaneously with the remote node
fencing.
---
 lib/pacemaker/pcmk_sched_allocate.c | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/lib/pacemaker/pcmk_sched_allocate.c b/lib/pacemaker/pcmk_sched_allocate.c
index b7d1b48..9f82c00 100644
--- a/lib/pacemaker/pcmk_sched_allocate.c
+++ b/lib/pacemaker/pcmk_sched_allocate.c
@@ -2065,14 +2065,13 @@ apply_remote_ordering(action_t *action, pe_working_set_t *data_set)
                                        pe_order_implies_first, data_set);
 
             } else if(state == remote_state_failed) {
-                /* We would only be here if the resource is
-                 * running on the remote node.  Since we have no
-                 * way to stop it, it is necessary to fence the
-                 * node.
+                /* The resource is active on the node, but since we don't have a
+                 * valid connection, the only way to stop the resource is by
+                 * fencing the node. There is no need to order the stop relative
+                 * to the remote connection, since the stop will become implied
+                 * by the fencing.
                  */
                 pe_fence_node(data_set, action->node, "resources are active and the connection is unrecoverable");
-                order_action_then_stop(action, remote_rsc,
-                                       pe_order_implies_first, data_set);
 
             } else if(remote_rsc->next_role == RSC_ROLE_STOPPED) {
                 /* State must be remote_state_unknown or remote_state_stopped.
-- 
1.8.3.1


From 091c367369b892d26fe0de99d35cf521b6249d10 Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Sat, 8 Jun 2019 16:51:20 -0500
Subject: [PATCH 12/13] Test: cts-scheduler: update regression tests for remote
 connection ordering change

Remote connection stops no longer have to wait for implied stops of resources
behind the connection.

As before, if the remote connection stops are themselves implied,
they can be confirmed immediately without waiting for their host's fencing,
because remote connections have "requires" set to "quorum" rather than
"fencing".
---
 cts/scheduler/order-expired-failure.dot               |  1 -
 cts/scheduler/order-expired-failure.exp               |  6 +-----
 cts/scheduler/order-expired-failure.summary           |  2 +-
 cts/scheduler/remote-connection-unrecoverable.dot     |  2 --
 cts/scheduler/remote-connection-unrecoverable.exp     |  9 +--------
 cts/scheduler/remote-connection-unrecoverable.summary |  2 +-
 cts/scheduler/remote-fence-before-reconnect.dot       |  1 -
 cts/scheduler/remote-fence-before-reconnect.exp       |  6 +-----
 cts/scheduler/remote-fence-before-reconnect.summary   |  2 +-
 cts/scheduler/remote-recover-all.dot                  |  2 --
 cts/scheduler/remote-recover-all.exp                  | 12 ++----------
 cts/scheduler/remote-recover-all.summary              |  4 ++--
 cts/scheduler/remote-recover-no-resources.dot         |  1 -
 cts/scheduler/remote-recover-no-resources.exp         |  6 +-----
 cts/scheduler/remote-recover-no-resources.summary     |  2 +-
 cts/scheduler/remote-recover-unknown.dot              |  1 -
 cts/scheduler/remote-recover-unknown.exp              |  6 +-----
 cts/scheduler/remote-recover-unknown.summary          |  2 +-
 18 files changed, 14 insertions(+), 53 deletions(-)

diff --git a/cts/scheduler/order-expired-failure.dot b/cts/scheduler/order-expired-failure.dot
index 2e9963b..5c21d5d 100644
--- a/cts/scheduler/order-expired-failure.dot
+++ b/cts/scheduler/order-expired-failure.dot
@@ -4,7 +4,6 @@ digraph "g" {
 "compute-unfence-trigger-clone_stop_0" [ style=bold color="green" fontcolor="orange"]
 "compute-unfence-trigger-clone_stopped_0" [ style=bold color="green" fontcolor="orange"]
 "compute-unfence-trigger_stop_0 overcloud-novacompute-1" -> "compute-unfence-trigger-clone_stopped_0" [ style = bold]
-"compute-unfence-trigger_stop_0 overcloud-novacompute-1" -> "overcloud-novacompute-1_stop_0 controller-1" [ style = bold]
 "compute-unfence-trigger_stop_0 overcloud-novacompute-1" [ style=bold color="green" fontcolor="orange"]
 "ip-10.0.0.110_monitor_10000 controller-1" [ style=bold color="green" fontcolor="black"]
 "ip-10.0.0.110_start_0 controller-1" -> "ip-10.0.0.110_monitor_10000 controller-1" [ style = bold]
diff --git a/cts/scheduler/order-expired-failure.exp b/cts/scheduler/order-expired-failure.exp
index c476bc2..4a50493 100644
--- a/cts/scheduler/order-expired-failure.exp
+++ b/cts/scheduler/order-expired-failure.exp
@@ -9,11 +9,7 @@
         </downed>
       </rsc_op>
     </action_set>
-    <inputs>
-      <trigger>
-        <pseudo_event id="220" operation="stop" operation_key="compute-unfence-trigger_stop_0" internal_operation_key="compute-unfence-trigger:1_stop_0"/>
-      </trigger>
-    </inputs>
+    <inputs/>
   </synapse>
   <synapse id="1">
     <action_set>
diff --git a/cts/scheduler/order-expired-failure.summary b/cts/scheduler/order-expired-failure.summary
index c86bb91..2cf43ed 100644
--- a/cts/scheduler/order-expired-failure.summary
+++ b/cts/scheduler/order-expired-failure.summary
@@ -52,6 +52,7 @@ Transition Summary:
  * Stop       compute-unfence-trigger:1            ( overcloud-novacompute-1 )   due to node availability
 
 Executing cluster transition:
+ * Resource action: overcloud-novacompute-1 stop on controller-1
  * Resource action: stonith-fence_compute-fence-nova stop on controller-2
  * Fencing overcloud-novacompute-1 (reboot)
  * Cluster action:  clear_failcount for overcloud-novacompute-1 on controller-1
@@ -62,7 +63,6 @@ Executing cluster transition:
  * Resource action: ip-10.0.0.110   monitor=10000 on controller-1
  * Pseudo action:   compute-unfence-trigger_stop_0
  * Pseudo action:   compute-unfence-trigger-clone_stopped_0
- * Resource action: overcloud-novacompute-1 stop on controller-1
 Using the original execution date of: 2018-04-09 07:55:35Z
 
 Revised cluster status:
diff --git a/cts/scheduler/remote-connection-unrecoverable.dot b/cts/scheduler/remote-connection-unrecoverable.dot
index 7728425..1017d2b 100644
--- a/cts/scheduler/remote-connection-unrecoverable.dot
+++ b/cts/scheduler/remote-connection-unrecoverable.dot
@@ -7,14 +7,12 @@ digraph "g" {
 "remote1_stop_0 node1" [ style=bold color="green" fontcolor="orange"]
 "rsc1_delete_0 remote1" -> "rsc1_start_0 node2" [ style = dashed]
 "rsc1_delete_0 remote1" [ style=dashed color="red" fontcolor="black"]
-"rsc1_monitor_0 node2" -> "remote1_stop_0 node1" [ style = bold]
 "rsc1_monitor_0 node2" -> "rsc1_start_0 node2" [ style = bold]
 "rsc1_monitor_0 node2" -> "rsc2-master_demote_0" [ style = bold]
 "rsc1_monitor_0 node2" [ style=bold color="green" fontcolor="black"]
 "rsc1_monitor_10000 node2" [ style=bold color="green" fontcolor="black"]
 "rsc1_start_0 node2" -> "rsc1_monitor_10000 node2" [ style = bold]
 "rsc1_start_0 node2" [ style=bold color="green" fontcolor="black"]
-"rsc1_stop_0 remote1" -> "remote1_stop_0 node1" [ style = bold]
 "rsc1_stop_0 remote1" -> "rsc1_delete_0 remote1" [ style = dashed]
 "rsc1_stop_0 remote1" -> "rsc1_start_0 node2" [ style = bold]
 "rsc1_stop_0 remote1" -> "rsc2-master_demote_0" [ style = bold]
diff --git a/cts/scheduler/remote-connection-unrecoverable.exp b/cts/scheduler/remote-connection-unrecoverable.exp
index 2c9357b..d57c106 100644
--- a/cts/scheduler/remote-connection-unrecoverable.exp
+++ b/cts/scheduler/remote-connection-unrecoverable.exp
@@ -5,14 +5,7 @@
         <attributes CRM_meta_timeout="20000"  reconnect_interval="60"/>
       </pseudo_event>
     </action_set>
-    <inputs>
-      <trigger>
-        <pseudo_event id="6" operation="stop" operation_key="rsc1_stop_0"/>
-      </trigger>
-      <trigger>
-        <rsc_op id="8" operation="monitor" operation_key="rsc1_monitor_0" on_node="node2" on_node_uuid="2"/>
-      </trigger>
-    </inputs>
+    <inputs/>
   </synapse>
   <synapse id="1">
     <action_set>
diff --git a/cts/scheduler/remote-connection-unrecoverable.summary b/cts/scheduler/remote-connection-unrecoverable.summary
index 23fa9ca..caff564 100644
--- a/cts/scheduler/remote-connection-unrecoverable.summary
+++ b/cts/scheduler/remote-connection-unrecoverable.summary
@@ -21,6 +21,7 @@ Transition Summary:
  * Stop       rsc2:0      (     Master node1 )   due to node availability
 
 Executing cluster transition:
+ * Pseudo action:   remote1_stop_0
  * Resource action: killer          stop on node2
  * Resource action: rsc1            monitor on node2
  * Fencing node1 (reboot)
@@ -29,7 +30,6 @@ Executing cluster transition:
  * Resource action: killer          monitor=60000 on node2
  * Pseudo action:   rsc1_stop_0
  * Pseudo action:   rsc2-master_demote_0
- * Pseudo action:   remote1_stop_0
  * Resource action: rsc1            start on node2
  * Pseudo action:   rsc2_demote_0
  * Pseudo action:   rsc2-master_demoted_0
diff --git a/cts/scheduler/remote-fence-before-reconnect.dot b/cts/scheduler/remote-fence-before-reconnect.dot
index 4ced43e..5812b7f 100644
--- a/cts/scheduler/remote-fence-before-reconnect.dot
+++ b/cts/scheduler/remote-fence-before-reconnect.dot
@@ -3,7 +3,6 @@
 "fake2_monitor_10000 c7auto1" [ style=bold color="green" fontcolor="black"]
 "fake2_start_0 c7auto1" -> "fake2_monitor_10000 c7auto1" [ style = bold]
 "fake2_start_0 c7auto1" [ style=bold color="green" fontcolor="black"]
-"fake2_stop_0 c7auto4" -> "c7auto4_stop_0 c7auto1" [ style = bold]
 "fake2_stop_0 c7auto4" -> "fake2_start_0 c7auto1" [ style = bold]
 "fake2_stop_0 c7auto4" [ style=bold color="green" fontcolor="orange"]
 "stonith 'reboot' c7auto4" -> "fake2_start_0 c7auto1" [ style = bold]
diff --git a/cts/scheduler/remote-fence-before-reconnect.exp b/cts/scheduler/remote-fence-before-reconnect.exp
index f99d9ef..f506f85 100644
--- a/cts/scheduler/remote-fence-before-reconnect.exp
+++ b/cts/scheduler/remote-fence-before-reconnect.exp
@@ -9,11 +9,7 @@
         </downed>
       </rsc_op>
     </action_set>
-    <inputs>
-      <trigger>
-        <pseudo_event id="13" operation="stop" operation_key="fake2_stop_0"/>
-      </trigger>
-    </inputs>
+    <inputs/>
   </synapse>
   <synapse id="1">
     <action_set>
diff --git a/cts/scheduler/remote-fence-before-reconnect.summary b/cts/scheduler/remote-fence-before-reconnect.summary
index f61e18b..03eac20 100644
--- a/cts/scheduler/remote-fence-before-reconnect.summary
+++ b/cts/scheduler/remote-fence-before-reconnect.summary
@@ -17,9 +17,9 @@ Transition Summary:
  * Move       fake2       ( c7auto4 -> c7auto1 )  
 
 Executing cluster transition:
+ * Resource action: c7auto4         stop on c7auto1
  * Fencing c7auto4 (reboot)
  * Pseudo action:   fake2_stop_0
- * Resource action: c7auto4         stop on c7auto1
  * Resource action: fake2           start on c7auto1
  * Resource action: fake2           monitor=10000 on c7auto1
 
diff --git a/cts/scheduler/remote-recover-all.dot b/cts/scheduler/remote-recover-all.dot
index deed802..4128b10 100644
--- a/cts/scheduler/remote-recover-all.dot
+++ b/cts/scheduler/remote-recover-all.dot
@@ -19,7 +19,6 @@ digraph "g" {
 "galera_demote_0 galera-2" -> "galera_stop_0 galera-2" [ style = bold]
 "galera_demote_0 galera-2" [ style=bold color="green" fontcolor="orange"]
 "galera_monitor_10000 galera-0" [ style=bold color="green" fontcolor="black"]
-"galera_stop_0 galera-2" -> "galera-2_stop_0 controller-1" [ style = bold]
 "galera_stop_0 galera-2" -> "galera-master_stopped_0" [ style = bold]
 "galera_stop_0 galera-2" [ style=bold color="green" fontcolor="orange"]
 "haproxy-clone_stop_0" -> "haproxy-clone_stopped_0" [ style = bold]
@@ -60,7 +59,6 @@ digraph "g" {
 "rabbitmq_post_notify_stonith_0" -> "rabbitmq_post_notify_stonith_0 messaging-0" [ style = bold]
 "rabbitmq_post_notify_stonith_0" -> "rabbitmq_post_notify_stonith_0 messaging-2" [ style = bold]
 "rabbitmq_post_notify_stonith_0" [ style=bold color="green" fontcolor="orange"]
-"rabbitmq_stop_0 messaging-1" -> "messaging-1_stop_0 controller-1" [ style = bold]
 "rabbitmq_stop_0 messaging-1" -> "rabbitmq-clone_stopped_0" [ style = bold]
 "rabbitmq_stop_0 messaging-1" [ style=bold color="green" fontcolor="orange"]
 "redis-master_confirmed-post_notify_stopped_0" [ style=bold color="green" fontcolor="orange"]
diff --git a/cts/scheduler/remote-recover-all.exp b/cts/scheduler/remote-recover-all.exp
index 8137ffb..0cb51f6 100644
--- a/cts/scheduler/remote-recover-all.exp
+++ b/cts/scheduler/remote-recover-all.exp
@@ -5,11 +5,7 @@
         <attributes CRM_meta_name="stop" CRM_meta_timeout="60000"  reconnect_interval="60"/>
       </pseudo_event>
     </action_set>
-    <inputs>
-      <trigger>
-        <pseudo_event id="39" operation="stop" operation_key="rabbitmq_stop_0" internal_operation_key="rabbitmq:2_stop_0"/>
-      </trigger>
-    </inputs>
+    <inputs/>
   </synapse>
   <synapse id="1">
     <action_set>
@@ -57,11 +53,7 @@
         <attributes CRM_meta_name="stop" CRM_meta_timeout="60000"  reconnect_interval="60"/>
       </pseudo_event>
     </action_set>
-    <inputs>
-      <trigger>
-        <pseudo_event id="49" operation="stop" operation_key="galera_stop_0" internal_operation_key="galera:1_stop_0"/>
-      </trigger>
-    </inputs>
+    <inputs/>
   </synapse>
   <synapse id="5" priority="1000000">
     <action_set>
diff --git a/cts/scheduler/remote-recover-all.summary b/cts/scheduler/remote-recover-all.summary
index 2ac0c6a..d095fdd 100644
--- a/cts/scheduler/remote-recover-all.summary
+++ b/cts/scheduler/remote-recover-all.summary
@@ -56,7 +56,9 @@ Transition Summary:
  * Move       stonith-fence_ipmilan-5254005bdbb5     ( controller-1 -> controller-2 )  
 
 Executing cluster transition:
+ * Pseudo action:   messaging-1_stop_0
  * Pseudo action:   galera-0_stop_0
+ * Pseudo action:   galera-2_stop_0
  * Pseudo action:   galera-master_demote_0
  * Pseudo action:   redis-master_pre_notify_stop_0
  * Resource action: stonith-fence_ipmilan-525400bbf613 stop on controller-0
@@ -94,7 +96,6 @@ Executing cluster transition:
  * Resource action: stonith-fence_ipmilan-525400b4f6bd monitor=60000 on controller-0
  * Resource action: stonith-fence_ipmilan-5254005bdbb5 start on controller-2
  * Resource action: galera-0        monitor=20000 on controller-2
- * Pseudo action:   galera-2_stop_0
  * Resource action: rabbitmq        notify on messaging-2
  * Resource action: rabbitmq        notify on messaging-0
  * Pseudo action:   rabbitmq_notified_0
@@ -107,7 +108,6 @@ Executing cluster transition:
  * Resource action: ip-172.17.1.17  start on controller-2
  * Resource action: ip-172.17.4.11  start on controller-2
  * Resource action: stonith-fence_ipmilan-5254005bdbb5 monitor=60000 on controller-2
- * Pseudo action:   messaging-1_stop_0
  * Pseudo action:   redis_notified_0
  * Resource action: ip-172.17.1.14  monitor=10000 on controller-2
  * Resource action: ip-172.17.1.17  monitor=10000 on controller-2
diff --git a/cts/scheduler/remote-recover-no-resources.dot b/cts/scheduler/remote-recover-no-resources.dot
index ef78aa6..a2f8ce0 100644
--- a/cts/scheduler/remote-recover-no-resources.dot
+++ b/cts/scheduler/remote-recover-no-resources.dot
@@ -45,7 +45,6 @@ digraph "g" {
 "rabbitmq_post_notify_stonith_0" -> "rabbitmq_post_notify_stonith_0 messaging-0" [ style = bold]
 "rabbitmq_post_notify_stonith_0" -> "rabbitmq_post_notify_stonith_0 messaging-2" [ style = bold]
 "rabbitmq_post_notify_stonith_0" [ style=bold color="green" fontcolor="orange"]
-"rabbitmq_stop_0 messaging-1" -> "messaging-1_stop_0 controller-1" [ style = bold]
 "rabbitmq_stop_0 messaging-1" -> "rabbitmq-clone_stopped_0" [ style = bold]
 "rabbitmq_stop_0 messaging-1" [ style=bold color="green" fontcolor="orange"]
 "redis-master_confirmed-post_notify_stopped_0" [ style=bold color="green" fontcolor="orange"]
diff --git a/cts/scheduler/remote-recover-no-resources.exp b/cts/scheduler/remote-recover-no-resources.exp
index 8a67c11..90470fb 100644
--- a/cts/scheduler/remote-recover-no-resources.exp
+++ b/cts/scheduler/remote-recover-no-resources.exp
@@ -5,11 +5,7 @@
         <attributes CRM_meta_name="stop" CRM_meta_timeout="60000"  reconnect_interval="60"/>
       </pseudo_event>
     </action_set>
-    <inputs>
-      <trigger>
-        <pseudo_event id="38" operation="stop" operation_key="rabbitmq_stop_0" internal_operation_key="rabbitmq:2_stop_0"/>
-      </trigger>
-    </inputs>
+    <inputs/>
   </synapse>
   <synapse id="1">
     <action_set>
diff --git a/cts/scheduler/remote-recover-no-resources.summary b/cts/scheduler/remote-recover-no-resources.summary
index 89da784..18a989b 100644
--- a/cts/scheduler/remote-recover-no-resources.summary
+++ b/cts/scheduler/remote-recover-no-resources.summary
@@ -54,6 +54,7 @@ Transition Summary:
  * Move       stonith-fence_ipmilan-5254005bdbb5     ( controller-1 -> controller-2 )  
 
 Executing cluster transition:
+ * Pseudo action:   messaging-1_stop_0
  * Pseudo action:   galera-0_stop_0
  * Pseudo action:   galera-2_stop_0
  * Pseudo action:   redis-master_pre_notify_stop_0
@@ -92,7 +93,6 @@ Executing cluster transition:
  * Pseudo action:   ip-172.17.1.17_stop_0
  * Pseudo action:   ip-172.17.4.11_stop_0
  * Resource action: stonith-fence_ipmilan-5254005bdbb5 monitor=60000 on controller-2
- * Pseudo action:   messaging-1_stop_0
  * Resource action: redis           notify on controller-0
  * Resource action: redis           notify on controller-2
  * Pseudo action:   redis-master_confirmed-post_notify_stopped_0
diff --git a/cts/scheduler/remote-recover-unknown.dot b/cts/scheduler/remote-recover-unknown.dot
index 5cd760b..29ab59f 100644
--- a/cts/scheduler/remote-recover-unknown.dot
+++ b/cts/scheduler/remote-recover-unknown.dot
@@ -46,7 +46,6 @@ digraph "g" {
 "rabbitmq_post_notify_stonith_0" -> "rabbitmq_post_notify_stonith_0 messaging-0" [ style = bold]
 "rabbitmq_post_notify_stonith_0" -> "rabbitmq_post_notify_stonith_0 messaging-2" [ style = bold]
 "rabbitmq_post_notify_stonith_0" [ style=bold color="green" fontcolor="orange"]
-"rabbitmq_stop_0 messaging-1" -> "messaging-1_stop_0 controller-1" [ style = bold]
 "rabbitmq_stop_0 messaging-1" -> "rabbitmq-clone_stopped_0" [ style = bold]
 "rabbitmq_stop_0 messaging-1" [ style=bold color="green" fontcolor="orange"]
 "redis-master_confirmed-post_notify_stopped_0" [ style=bold color="green" fontcolor="orange"]
diff --git a/cts/scheduler/remote-recover-unknown.exp b/cts/scheduler/remote-recover-unknown.exp
index ac6f004..82cb65f7 100644
--- a/cts/scheduler/remote-recover-unknown.exp
+++ b/cts/scheduler/remote-recover-unknown.exp
@@ -5,11 +5,7 @@
         <attributes CRM_meta_name="stop" CRM_meta_timeout="60000"  reconnect_interval="60"/>
       </pseudo_event>
     </action_set>
-    <inputs>
-      <trigger>
-        <pseudo_event id="39" operation="stop" operation_key="rabbitmq_stop_0" internal_operation_key="rabbitmq:2_stop_0"/>
-      </trigger>
-    </inputs>
+    <inputs/>
   </synapse>
   <synapse id="1">
     <action_set>
diff --git a/cts/scheduler/remote-recover-unknown.summary b/cts/scheduler/remote-recover-unknown.summary
index 2c60713..4d7a411 100644
--- a/cts/scheduler/remote-recover-unknown.summary
+++ b/cts/scheduler/remote-recover-unknown.summary
@@ -55,6 +55,7 @@ Transition Summary:
  * Move       stonith-fence_ipmilan-5254005bdbb5     ( controller-1 -> controller-2 )  
 
 Executing cluster transition:
+ * Pseudo action:   messaging-1_stop_0
  * Pseudo action:   galera-0_stop_0
  * Pseudo action:   galera-2_stop_0
  * Pseudo action:   redis-master_pre_notify_stop_0
@@ -94,7 +95,6 @@ Executing cluster transition:
  * Pseudo action:   ip-172.17.1.17_stop_0
  * Pseudo action:   ip-172.17.4.11_stop_0
  * Resource action: stonith-fence_ipmilan-5254005bdbb5 monitor=60000 on controller-2
- * Pseudo action:   messaging-1_stop_0
  * Resource action: redis           notify on controller-0
  * Resource action: redis           notify on controller-2
  * Pseudo action:   redis-master_confirmed-post_notify_stopped_0
-- 
1.8.3.1


From 9a5f7952c921f7f8eea3c7b0af711df2995a4e60 Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Fri, 7 Jun 2019 17:11:27 -0500
Subject: [PATCH 13/13] Low: libpe_status: don't add /var/log mount to bundles
 if user did

---
 lib/pengine/bundle.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)
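
Note (below the ---, so not part of the commit message): with this change, a
bundle whose configuration already maps a host directory onto /var/log inside
the container keeps only its own mapping, and the implicit
CRM_BUNDLE_DIR -> /var/log mount is skipped. A minimal sketch of such a
configuration follows, assuming the usual <storage-mapping> CIB syntax; the
IDs, image name, and source paths are illustrative only:

    <bundle id="httpd-bundle">
      <docker image="pcmk:httpd" replicas="1"/>
      <storage>
        <!-- target-dir is exactly /var/log, so need_log_mount stays FALSE
             and no extra /var/log mount is generated for this bundle -->
        <storage-mapping id="httpd-log" source-dir="/var/log/httpd-bundle"
                         target-dir="/var/log" options="rw"/>
      </storage>
      <primitive id="httpd" class="ocf" provider="heartbeat" type="apache"/>
    </bundle>

Bundles that do not map /var/log themselves still get the default
CRM_BUNDLE_DIR mount exactly as before.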

diff --git a/lib/pengine/bundle.c b/lib/pengine/bundle.c
index b223f03..060e73a 100644
--- a/lib/pengine/bundle.c
+++ b/lib/pengine/bundle.c
@@ -1027,6 +1027,7 @@ pe__unpack_bundle(pe_resource_t *rsc, pe_working_set_t *data_set)
     xmlNode *xml_obj = NULL;
     xmlNode *xml_resource = NULL;
     pe__bundle_variant_data_t *bundle_data = NULL;
+    bool need_log_mount = TRUE;
 
     CRM_ASSERT(rsc != NULL);
     pe_rsc_trace(rsc, "Processing resource %s...", rsc->id);
@@ -1151,6 +1152,9 @@ pe__unpack_bundle(pe_resource_t *rsc, pe_working_set_t *data_set)
 
         if (source && target) {
             mount_add(bundle_data, source, target, options, flags);
+            if (strcmp(target, "/var/log") == 0) {
+                need_log_mount = FALSE;
+            }
         } else {
             pe_err("Invalid mount directive %s", ID(xml_child));
         }
@@ -1253,8 +1257,10 @@ pe__unpack_bundle(pe_resource_t *rsc, pe_working_set_t *data_set)
         mount_add(bundle_data, DEFAULT_REMOTE_KEY_LOCATION,
                   DEFAULT_REMOTE_KEY_LOCATION, NULL, pe__bundle_mount_none);
 
-        mount_add(bundle_data, CRM_BUNDLE_DIR, "/var/log", NULL,
-                  pe__bundle_mount_subdir);
+        if (need_log_mount) {
+            mount_add(bundle_data, CRM_BUNDLE_DIR, "/var/log", NULL,
+                      pe__bundle_mount_subdir);
+        }
 
         port = calloc(1, sizeof(pe__bundle_port_t));
         if(bundle_data->control_port) {
-- 
1.8.3.1