Blob Blame History Raw
From 4e190ebc5460563bae2586b28afb0415f2eb3d1a Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Wed, 1 Jul 2020 20:38:16 -0500
Subject: [PATCH 1/4] Test: CTS: libqb shared memory creates directories now

... so use "rm -rf" instead of "rm -f"
---
 cts/CTS.py.in    | 2 +-
 cts/CTSaudits.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/cts/CTS.py.in b/cts/CTS.py.in
index c418318..091bb1f 100644
--- a/cts/CTS.py.in
+++ b/cts/CTS.py.in
@@ -546,7 +546,7 @@ class ClusterManager(UserDict):
         if self.rsh(node, self.templates["StopCmd"]) == 0:
             # Make sure we can continue even if corosync leaks
             # fdata-* is the old name
-            #self.rsh(node, "rm -f /dev/shm/qb-* /dev/shm/fdata-*")
+            #self.rsh(node, "rm -rf /dev/shm/qb-* /dev/shm/fdata-*")
             self.ShouldBeStatus[node] = "down"
             self.cluster_stable(self.Env["DeadTime"])
             return 1
diff --git a/cts/CTSaudits.py b/cts/CTSaudits.py
index b7e0827..cc82171 100755
--- a/cts/CTSaudits.py
+++ b/cts/CTSaudits.py
@@ -233,7 +233,7 @@ class FileAudit(ClusterAudit):
                     for line in lsout:
                         self.CM.debug("ps[%s]: %s" % (node, line))
 
-                    self.CM.rsh(node, "rm -f /dev/shm/qb-*")
+                    self.CM.rsh(node, "rm -rf /dev/shm/qb-*")
 
             else:
                 self.CM.debug("Skipping %s" % node)
-- 
1.8.3.1


From 4316507d50d51c7864d8d34aac1da31a232b9f42 Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Thu, 2 Jul 2020 16:09:20 -0500
Subject: [PATCH 2/4] Test: CTS: ignore error logged by recent pcs versions

... because it is expected when a node is fenced, and we should already see
pacemaker errors if a node is unexpectedly fenced
---
 cts/patterns.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/cts/patterns.py b/cts/patterns.py
index 96d6471..7eed90c 100644
--- a/cts/patterns.py
+++ b/cts/patterns.py
@@ -21,6 +21,10 @@ class BasePatterns(object):
 
             # Logging bug in some versions of libvirtd
             r"libvirtd.*: internal error: Failed to parse PCI config address",
+
+            # pcs can log this when node is fenced, but fencing is OK in some
+            # tests (and we will catch it in pacemaker logs when not OK)
+            r"pcs.daemon:No response from: .* request: get_configs, error:",
         ]
         self.BadNews = []
         self.components = {}
-- 
1.8.3.1


From 598ae0f65bad6ed16978d1ab6e24e8e358e0a1a4 Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Thu, 2 Jul 2020 20:40:00 -0500
Subject: [PATCH 3/4] Low: libcrmcommon: avoid assertion on controller protocol
 errors

Previously, after a protocol error, we would set reply to NULL and then try to
call crm_element_value() on it, which would log an assertion.
---
 lib/common/ipc_controld.c | 46 ++++++++++++++++++++++------------------------
 1 file changed, 22 insertions(+), 24 deletions(-)

diff --git a/lib/common/ipc_controld.c b/lib/common/ipc_controld.c
index 5917cc5..22cb9e0 100644
--- a/lib/common/ipc_controld.c
+++ b/lib/common/ipc_controld.c
@@ -187,53 +187,51 @@ dispatch(pcmk_ipc_api_t *api, xmlNode *reply)
         crm_debug("Unrecognizable controller message: invalid message type '%s'",
                   crm_str(value));
         status = CRM_EX_PROTOCOL;
-        reply = NULL;
+        goto done;
     }
 
     if (crm_element_value(reply, XML_ATTR_REFERENCE) == NULL) {
         crm_debug("Unrecognizable controller message: no reference");
         status = CRM_EX_PROTOCOL;
-        reply = NULL;
+        goto done;
     }
 
     value = crm_element_value(reply, F_CRM_TASK);
     if (value == NULL) {
         crm_debug("Unrecognizable controller message: no command name");
         status = CRM_EX_PROTOCOL;
-        reply = NULL;
+        goto done;
     }
 
     // Parse useful info from reply
 
-    if (reply != NULL) {
-        reply_data.feature_set = crm_element_value(reply, XML_ATTR_VERSION);
-        reply_data.host_from = crm_element_value(reply, F_CRM_HOST_FROM);
-        msg_data = get_message_xml(reply, F_CRM_DATA);
+    reply_data.feature_set = crm_element_value(reply, XML_ATTR_VERSION);
+    reply_data.host_from = crm_element_value(reply, F_CRM_HOST_FROM);
+    msg_data = get_message_xml(reply, F_CRM_DATA);
 
-        if (!strcmp(value, CRM_OP_REPROBE)) {
-            reply_data.reply_type = pcmk_controld_reply_reprobe;
+    if (!strcmp(value, CRM_OP_REPROBE)) {
+        reply_data.reply_type = pcmk_controld_reply_reprobe;
 
-        } else if (!strcmp(value, CRM_OP_NODE_INFO)) {
-            set_node_info_data(&reply_data, msg_data);
+    } else if (!strcmp(value, CRM_OP_NODE_INFO)) {
+        set_node_info_data(&reply_data, msg_data);
 
-        } else if (!strcmp(value, CRM_OP_INVOKE_LRM)) {
-            reply_data.reply_type = pcmk_controld_reply_resource;
-            reply_data.data.resource.node_state = msg_data;
+    } else if (!strcmp(value, CRM_OP_INVOKE_LRM)) {
+        reply_data.reply_type = pcmk_controld_reply_resource;
+        reply_data.data.resource.node_state = msg_data;
 
-        } else if (!strcmp(value, CRM_OP_PING)) {
-            set_ping_data(&reply_data, msg_data);
+    } else if (!strcmp(value, CRM_OP_PING)) {
+        set_ping_data(&reply_data, msg_data);
 
-        } else if (!strcmp(value, PCMK__CONTROLD_CMD_NODES)) {
-            set_nodes_data(&reply_data, msg_data);
+    } else if (!strcmp(value, PCMK__CONTROLD_CMD_NODES)) {
+        set_nodes_data(&reply_data, msg_data);
 
-        } else {
-            crm_debug("Unrecognizable controller message: unknown command '%s'",
-                      value);
-            status = CRM_EX_PROTOCOL;
-            reply = NULL;
-        }
+    } else {
+        crm_debug("Unrecognizable controller message: unknown command '%s'",
+                  value);
+        status = CRM_EX_PROTOCOL;
     }
 
+done:
     pcmk__call_ipc_callback(api, pcmk_ipc_event_reply, status, &reply_data);
 
     // Free any reply data that was allocated
-- 
1.8.3.1


From 5ae4101b60f8c0cd96eb2097a65a59aaa1750d73 Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Fri, 17 Jul 2020 17:20:23 -0500
Subject: [PATCH 4/4] Log: fencer: don't log assertion if unable to create full
 request reply

Previously, we would log an assertion and a warning if asked to create a reply
to a NULL request. However, there is a possible sequence in which this can happen:

- Some nodes are up and some down at cluster start-up
- One node is elected DC and schedules fencing of the down nodes
- Fencing is initiated for one of the down nodes
- One of the other down nodes comes up and is elected DC
- The fencing result comes back and all peers (including new DC) are notified
- New DC tries to create a notification for its client (the controller)
  but doesn't know anything about the initial request

For now, just log a warning and drop the assertion. Longer term, maybe we
should synchronize in-flight request information when a fencer joins the
process group.
---
 daemons/fenced/fenced_commands.c | 55 +++++++++++++++++++++++-----------------
 1 file changed, 32 insertions(+), 23 deletions(-)

diff --git a/daemons/fenced/fenced_commands.c b/daemons/fenced/fenced_commands.c
index 05c5437..9c27d61 100644
--- a/daemons/fenced/fenced_commands.c
+++ b/daemons/fenced/fenced_commands.c
@@ -2336,22 +2336,8 @@ stonith_fence(xmlNode * msg)
 xmlNode *
 stonith_construct_reply(xmlNode * request, const char *output, xmlNode * data, int rc)
 {
-    int lpc = 0;
     xmlNode *reply = NULL;
 
-    const char *name = NULL;
-    const char *value = NULL;
-
-    const char *names[] = {
-        F_STONITH_OPERATION,
-        F_STONITH_CALLID,
-        F_STONITH_CLIENTID,
-        F_STONITH_CLIENTNAME,
-        F_STONITH_REMOTE_OP_ID,
-        F_STONITH_CALLOPTS
-    };
-
-    crm_trace("Creating a basic reply");
     reply = create_xml_node(NULL, T_STONITH_REPLY);
 
     crm_xml_add(reply, "st_origin", __FUNCTION__);
@@ -2359,16 +2345,39 @@ stonith_construct_reply(xmlNode * request, const char *output, xmlNode * data, i
     crm_xml_add(reply, "st_output", output);
     crm_xml_add_int(reply, F_STONITH_RC, rc);
 
-    CRM_CHECK(request != NULL, crm_warn("Can't create a sane reply"); return reply);
-    for (lpc = 0; lpc < DIMOF(names); lpc++) {
-        name = names[lpc];
-        value = crm_element_value(request, name);
-        crm_xml_add(reply, name, value);
-    }
+    if (request == NULL) {
+        /* Most likely, this is the result of a stonith operation that was
+         * initiated before we came up. Unfortunately that means we lack enough
+         * information to provide clients with a full result.
+         *
+         * @TODO Maybe synchronize this information at start-up?
+         */
+        crm_warn("Missing request information for client notifications for "
+                 "operation with result %d (initiated before we came up?)", rc);
 
-    if (data != NULL) {
-        crm_trace("Attaching reply output");
-        add_message_xml(reply, F_STONITH_CALLDATA, data);
+    } else {
+        const char *name = NULL;
+        const char *value = NULL;
+
+        const char *names[] = {
+            F_STONITH_OPERATION,
+            F_STONITH_CALLID,
+            F_STONITH_CLIENTID,
+            F_STONITH_CLIENTNAME,
+            F_STONITH_REMOTE_OP_ID,
+            F_STONITH_CALLOPTS
+        };
+
+        crm_trace("Creating a result reply with%s reply output (rc=%d)",
+                  (data? "" : "out"), rc);
+        for (int lpc = 0; lpc < DIMOF(names); lpc++) {
+            name = names[lpc];
+            value = crm_element_value(request, name);
+            crm_xml_add(reply, name, value);
+        }
+        if (data != NULL) {
+            add_message_xml(reply, F_STONITH_CALLDATA, data);
+        }
     }
     return reply;
 }
-- 
1.8.3.1