From 4e190ebc5460563bae2586b28afb0415f2eb3d1a Mon Sep 17 00:00:00 2001 From: Ken Gaillot Date: Wed, 1 Jul 2020 20:38:16 -0500 Subject: [PATCH 1/4] Test: CTS: libqb shared memory creates directories now ... so use "rm -rf" instead of "rm -f" --- cts/CTS.py.in | 2 +- cts/CTSaudits.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cts/CTS.py.in b/cts/CTS.py.in index c418318..091bb1f 100644 --- a/cts/CTS.py.in +++ b/cts/CTS.py.in @@ -546,7 +546,7 @@ class ClusterManager(UserDict): if self.rsh(node, self.templates["StopCmd"]) == 0: # Make sure we can continue even if corosync leaks # fdata-* is the old name - #self.rsh(node, "rm -f /dev/shm/qb-* /dev/shm/fdata-*") + #self.rsh(node, "rm -rf /dev/shm/qb-* /dev/shm/fdata-*") self.ShouldBeStatus[node] = "down" self.cluster_stable(self.Env["DeadTime"]) return 1 diff --git a/cts/CTSaudits.py b/cts/CTSaudits.py index b7e0827..cc82171 100755 --- a/cts/CTSaudits.py +++ b/cts/CTSaudits.py @@ -233,7 +233,7 @@ class FileAudit(ClusterAudit): for line in lsout: self.CM.debug("ps[%s]: %s" % (node, line)) - self.CM.rsh(node, "rm -f /dev/shm/qb-*") + self.CM.rsh(node, "rm -rf /dev/shm/qb-*") else: self.CM.debug("Skipping %s" % node) -- 1.8.3.1 From 4316507d50d51c7864d8d34aac1da31a232b9f42 Mon Sep 17 00:00:00 2001 From: Ken Gaillot Date: Thu, 2 Jul 2020 16:09:20 -0500 Subject: [PATCH 2/4] Test: CTS: ignore error logged by recent pcs versions ... 
because it is expected when a node is fenced, and we should already see pacemaker errors if a node is unexpectedly fenced --- cts/patterns.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/cts/patterns.py b/cts/patterns.py index 96d6471..7eed90c 100644 --- a/cts/patterns.py +++ b/cts/patterns.py @@ -21,6 +21,10 @@ class BasePatterns(object): # Logging bug in some versions of libvirtd r"libvirtd.*: internal error: Failed to parse PCI config address", + + # pcs can log this when node is fenced, but fencing is OK in some + # tests (and we will catch it in pacemaker logs when not OK) + r"pcs.daemon:No response from: .* request: get_configs, error:", ] self.BadNews = [] self.components = {} -- 1.8.3.1 From 598ae0f65bad6ed16978d1ab6e24e8e358e0a1a4 Mon Sep 17 00:00:00 2001 From: Ken Gaillot Date: Thu, 2 Jul 2020 20:40:00 -0500 Subject: [PATCH 3/4] Low: libcrmcommon: avoid assertion on controller protocol errors Previously, after a protocol error, we would set reply to NULL and then try to call crm_element_value() on it, which would log an assertion. 
--- lib/common/ipc_controld.c | 46 ++++++++++++++++++++++------------------------ 1 file changed, 22 insertions(+), 24 deletions(-) diff --git a/lib/common/ipc_controld.c b/lib/common/ipc_controld.c index 5917cc5..22cb9e0 100644 --- a/lib/common/ipc_controld.c +++ b/lib/common/ipc_controld.c @@ -187,53 +187,51 @@ dispatch(pcmk_ipc_api_t *api, xmlNode *reply) crm_debug("Unrecognizable controller message: invalid message type '%s'", crm_str(value)); status = CRM_EX_PROTOCOL; - reply = NULL; + goto done; } if (crm_element_value(reply, XML_ATTR_REFERENCE) == NULL) { crm_debug("Unrecognizable controller message: no reference"); status = CRM_EX_PROTOCOL; - reply = NULL; + goto done; } value = crm_element_value(reply, F_CRM_TASK); if (value == NULL) { crm_debug("Unrecognizable controller message: no command name"); status = CRM_EX_PROTOCOL; - reply = NULL; + goto done; } // Parse useful info from reply - if (reply != NULL) { - reply_data.feature_set = crm_element_value(reply, XML_ATTR_VERSION); - reply_data.host_from = crm_element_value(reply, F_CRM_HOST_FROM); - msg_data = get_message_xml(reply, F_CRM_DATA); + reply_data.feature_set = crm_element_value(reply, XML_ATTR_VERSION); + reply_data.host_from = crm_element_value(reply, F_CRM_HOST_FROM); + msg_data = get_message_xml(reply, F_CRM_DATA); - if (!strcmp(value, CRM_OP_REPROBE)) { - reply_data.reply_type = pcmk_controld_reply_reprobe; + if (!strcmp(value, CRM_OP_REPROBE)) { + reply_data.reply_type = pcmk_controld_reply_reprobe; - } else if (!strcmp(value, CRM_OP_NODE_INFO)) { - set_node_info_data(&reply_data, msg_data); + } else if (!strcmp(value, CRM_OP_NODE_INFO)) { + set_node_info_data(&reply_data, msg_data); - } else if (!strcmp(value, CRM_OP_INVOKE_LRM)) { - reply_data.reply_type = pcmk_controld_reply_resource; - reply_data.data.resource.node_state = msg_data; + } else if (!strcmp(value, CRM_OP_INVOKE_LRM)) { + reply_data.reply_type = pcmk_controld_reply_resource; + reply_data.data.resource.node_state = msg_data; - 
} else if (!strcmp(value, CRM_OP_PING)) { + } else if (!strcmp(value, CRM_OP_PING)) { - set_ping_data(&reply_data, msg_data); + set_ping_data(&reply_data, msg_data); - } else if (!strcmp(value, PCMK__CONTROLD_CMD_NODES)) { - set_nodes_data(&reply_data, msg_data); + } else if (!strcmp(value, PCMK__CONTROLD_CMD_NODES)) { + set_nodes_data(&reply_data, msg_data); - } else { - crm_debug("Unrecognizable controller message: unknown command '%s'", - value); - status = CRM_EX_PROTOCOL; - reply = NULL; - } + } else { + crm_debug("Unrecognizable controller message: unknown command '%s'", + value); + status = CRM_EX_PROTOCOL; } +done: pcmk__call_ipc_callback(api, pcmk_ipc_event_reply, status, &reply_data); // Free any reply data that was allocated -- 1.8.3.1 From 5ae4101b60f8c0cd96eb2097a65a59aaa1750d73 Mon Sep 17 00:00:00 2001 From: Ken Gaillot Date: Fri, 17 Jul 2020 17:20:23 -0500 Subject: [PATCH 4/4] Log: fencer: don't log assertion if unable to create full request reply Previously, we would log an assertion and a warning if asked to create a reply to a NULL request. However, there is a legitimate sequence of events in which this can happen: - Some nodes are up and some down at cluster start-up - One node is elected DC and schedules fencing of the down nodes - Fencing is initiated for one of the down nodes - One of the other down nodes comes up and is elected DC - The fencing result comes back and all peers (including the new DC) are notified - The new DC tries to create a notification for its client (the controller) but doesn't know anything about the initial request For now, just log a warning and drop the assertion. Longer term, maybe we should synchronize in-flight request information when a fencer joins the process group. 
--- daemons/fenced/fenced_commands.c | 55 +++++++++++++++++++++++----------------- 1 file changed, 32 insertions(+), 23 deletions(-) diff --git a/daemons/fenced/fenced_commands.c b/daemons/fenced/fenced_commands.c index 05c5437..9c27d61 100644 --- a/daemons/fenced/fenced_commands.c +++ b/daemons/fenced/fenced_commands.c @@ -2336,22 +2336,8 @@ stonith_fence(xmlNode * msg) xmlNode * stonith_construct_reply(xmlNode * request, const char *output, xmlNode * data, int rc) { - int lpc = 0; xmlNode *reply = NULL; - const char *name = NULL; - const char *value = NULL; - - const char *names[] = { - F_STONITH_OPERATION, - F_STONITH_CALLID, - F_STONITH_CLIENTID, - F_STONITH_CLIENTNAME, - F_STONITH_REMOTE_OP_ID, - F_STONITH_CALLOPTS - }; - - crm_trace("Creating a basic reply"); reply = create_xml_node(NULL, T_STONITH_REPLY); crm_xml_add(reply, "st_origin", __FUNCTION__); @@ -2359,16 +2345,39 @@ stonith_construct_reply(xmlNode * request, const char *output, xmlNode * data, i crm_xml_add(reply, "st_output", output); crm_xml_add_int(reply, F_STONITH_RC, rc); - CRM_CHECK(request != NULL, crm_warn("Can't create a sane reply"); return reply); - for (lpc = 0; lpc < DIMOF(names); lpc++) { - name = names[lpc]; - value = crm_element_value(request, name); - crm_xml_add(reply, name, value); - } + if (request == NULL) { + /* Most likely, this is the result of a stonith operation that was + * initiated before we came up. Unfortunately that means we lack enough + * information to provide clients with a full result. + * + * @TODO Maybe synchronize this information at start-up? 
+ */ + crm_warn("Missing request information for client notifications for " + "operation with result %d (initiated before we came up?)", rc); - if (data != NULL) { - crm_trace("Attaching reply output"); - add_message_xml(reply, F_STONITH_CALLDATA, data); + } else { + const char *name = NULL; + const char *value = NULL; + + const char *names[] = { + F_STONITH_OPERATION, + F_STONITH_CALLID, + F_STONITH_CLIENTID, + F_STONITH_CLIENTNAME, + F_STONITH_REMOTE_OP_ID, + F_STONITH_CALLOPTS + }; + + crm_trace("Creating a result reply with%s reply output (rc=%d)", + (data? "" : "out"), rc); + for (int lpc = 0; lpc < DIMOF(names); lpc++) { + name = names[lpc]; + value = crm_element_value(request, name); + crm_xml_add(reply, name, value); + } + if (data != NULL) { + add_message_xml(reply, F_STONITH_CALLDATA, data); + } } return reply; } -- 1.8.3.1