diff --git a/SOURCES/009-fencing-reasons.patch b/SOURCES/009-fencing-reasons.patch new file mode 100644 index 0000000..3fb5bc7 --- /dev/null +++ b/SOURCES/009-fencing-reasons.patch @@ -0,0 +1,2985 @@ +From fcd42a5926e9a63d425586552ecc7b543838d352 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Thu, 11 Nov 2021 16:57:03 -0600 +Subject: [PATCH 01/23] Feature: fencer: pass full result in async command + replies + +The services library callbacks for async commands, which call +send_async_reply() -> construct_async_reply() to create the reply, now add +fields for exit status, operation status, and exit reason, in addition to the +existing action standard output and legacy return code. + +Nothing uses the new fields yet. +--- + daemons/fenced/fenced_commands.c | 10 ++++------ + 1 file changed, 4 insertions(+), 6 deletions(-) + +diff --git a/daemons/fenced/fenced_commands.c b/daemons/fenced/fenced_commands.c +index f34cb4f136..3497428c18 100644 +--- a/daemons/fenced/fenced_commands.c ++++ b/daemons/fenced/fenced_commands.c +@@ -2415,9 +2415,8 @@ send_async_reply(async_command_t *cmd, const pcmk__action_result_t *result, + if (stand_alone) { + /* Do notification with a clean data object */ + xmlNode *notify_data = create_xml_node(NULL, T_STONITH_NOTIFY_FENCE); +- int rc = pcmk_rc2legacy(stonith__result2rc(result)); + +- crm_xml_add_int(notify_data, F_STONITH_RC, rc); ++ stonith__xe_set_result(notify_data, result); + crm_xml_add(notify_data, F_STONITH_TARGET, cmd->victim); + crm_xml_add(notify_data, F_STONITH_OPERATION, cmd->op); + crm_xml_add(notify_data, F_STONITH_DELEGATE, "localhost"); +@@ -2425,7 +2424,7 @@ send_async_reply(async_command_t *cmd, const pcmk__action_result_t *result, + crm_xml_add(notify_data, F_STONITH_REMOTE_OP_ID, cmd->remote_op_id); + crm_xml_add(notify_data, F_STONITH_ORIGIN, cmd->client); + +- do_stonith_notify(T_STONITH_NOTIFY_FENCE, rc, notify_data); ++ do_stonith_notify(T_STONITH_NOTIFY_FENCE, pcmk_rc2legacy(stonith__result2rc(result)), 
notify_data); + do_stonith_notify(T_STONITH_NOTIFY_HISTORY, pcmk_ok, NULL); + } + } +@@ -2728,9 +2727,8 @@ construct_async_reply(async_command_t *cmd, const pcmk__action_result_t *result) + crm_xml_add(reply, F_STONITH_ORIGIN, cmd->origin); + crm_xml_add_int(reply, F_STONITH_CALLID, cmd->id); + crm_xml_add_int(reply, F_STONITH_CALLOPTS, cmd->options); +- crm_xml_add_int(reply, F_STONITH_RC, +- pcmk_rc2legacy(stonith__result2rc(result))); +- crm_xml_add(reply, F_STONITH_OUTPUT, result->action_stdout); ++ ++ stonith__xe_set_result(reply, result); + return reply; + } + +-- +2.27.0 + + +From 4bac2e9811872f92571e4f5a47d8c5032cfc3016 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Thu, 18 Nov 2021 12:41:29 -0600 +Subject: [PATCH 02/23] Refactor: fencer: track full result for direct agent + actions + +This renames stonith_device_action() to execute_agent_action() for readability, +and has it set a full result rather than return a legacy return code. + +As of this commit, handle_request() just maps the result back to a legacy code, +but it will make better use of it with planned changes. +--- + daemons/fenced/fenced_commands.c | 95 +++++++++++++++++++------------- + 1 file changed, 56 insertions(+), 39 deletions(-) + +diff --git a/daemons/fenced/fenced_commands.c b/daemons/fenced/fenced_commands.c +index 3497428c18..2f59ef84b7 100644 +--- a/daemons/fenced/fenced_commands.c ++++ b/daemons/fenced/fenced_commands.c +@@ -1729,23 +1729,6 @@ stonith_level_remove(xmlNode *msg, char **desc) + return pcmk_ok; + } + +-/*! +- * \internal +- * \brief Schedule an (asynchronous) action directly on a stonith device +- * +- * Handle a STONITH_OP_EXEC API message by scheduling a requested agent action +- * directly on a specified device. Only list, monitor, and status actions are +- * expected to use this call, though it should work with any agent command. 
+- * +- * \param[in] msg API message XML with desired action +- * \param[out] output Unused +- * +- * \return -EINPROGRESS on success, -errno otherwise +- * \note If the action is monitor, the device must be registered via the API +- * (CIB registration is not sufficient), because monitor should not be +- * possible unless the device is "started" (API registered). +- */ +- + static char * + list_to_string(GList *list, const char *delim, gboolean terminate_with_delim) + { +@@ -1778,8 +1761,23 @@ list_to_string(GList *list, const char *delim, gboolean terminate_with_delim) + return rv; + } + +-static int +-stonith_device_action(xmlNode * msg, char **output) ++/*! ++ * \internal ++ * \brief Execute a fence agent action directly (and asynchronously) ++ * ++ * Handle a STONITH_OP_EXEC API message by scheduling a requested agent action ++ * directly on a specified device. Only list, monitor, and status actions are ++ * expected to use this call, though it should work with any agent command. ++ * ++ * \param[in] msg Request XML specifying action ++ * \param[out] result Where to store result of action ++ * ++ * \note If the action is monitor, the device must be registered via the API ++ * (CIB registration is not sufficient), because monitor should not be ++ * possible unless the device is "started" (API registered). ++ */ ++static void ++execute_agent_action(xmlNode *msg, pcmk__action_result_t *result) + { + xmlNode *dev = get_xpath_object("//" F_STONITH_DEVICE, msg, LOG_ERR); + xmlNode *op = get_xpath_object("//@" F_STONITH_ACTION, msg, LOG_ERR); +@@ -1792,39 +1790,56 @@ stonith_device_action(xmlNode * msg, char **output) + crm_info("Malformed API action request: device %s, action %s", + (id? id : "not specified"), + (action? 
action : "not specified")); +- return -EPROTO; ++ fenced_set_protocol_error(result); ++ return; + } + + if (pcmk__str_eq(id, STONITH_WATCHDOG_ID, pcmk__str_none)) { ++ // Watchdog agent actions are implemented internally + if (stonith_watchdog_timeout_ms <= 0) { +- return -ENODEV; +- } else { +- if (pcmk__str_eq(action, "list", pcmk__str_casei)) { +- *output = list_to_string(stonith_watchdog_targets, "\n", TRUE); +- return pcmk_ok; +- } else if (pcmk__str_eq(action, "monitor", pcmk__str_casei)) { +- return pcmk_ok; +- } ++ pcmk__set_result(result, CRM_EX_ERROR, PCMK_EXEC_NO_FENCE_DEVICE, ++ "Watchdog fence device not configured"); ++ return; ++ ++ } else if (pcmk__str_eq(action, "list", pcmk__str_casei)) { ++ pcmk__set_result(result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); ++ pcmk__set_result_output(result, ++ list_to_string(stonith_watchdog_targets, ++ "\n", TRUE), ++ NULL); ++ return; ++ ++ } else if (pcmk__str_eq(action, "monitor", pcmk__str_casei)) { ++ pcmk__set_result(result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); ++ return; + } + } + + device = g_hash_table_lookup(device_list, id); +- if ((device == NULL) +- || (!device->api_registered && !strcmp(action, "monitor"))) { ++ if (device == NULL) { ++ crm_info("Ignoring API '%s' action request because device %s not found", ++ action, id); ++ pcmk__set_result(result, CRM_EX_ERROR, PCMK_EXEC_NO_FENCE_DEVICE, ++ NULL); ++ return; + ++ } else if (!device->api_registered && !strcmp(action, "monitor")) { + // Monitors may run only on "started" (API-registered) devices +- crm_info("Ignoring API '%s' action request because device %s not found", ++ crm_info("Ignoring API '%s' action request because device %s not active", + action, id); +- return -ENODEV; ++ pcmk__set_result(result, CRM_EX_ERROR, PCMK_EXEC_NO_FENCE_DEVICE, ++ "Fence device not active"); ++ return; + } + + cmd = create_async_command(msg); + if (cmd == NULL) { +- return -EPROTO; ++ fenced_set_protocol_error(result); ++ return; + } + + schedule_stonith_command(cmd, 
device); +- return -EINPROGRESS; ++ pcmk__set_result(result, CRM_EX_OK, PCMK_EXEC_PENDING, NULL); + } + + static void +@@ -2911,8 +2926,8 @@ handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, + + xmlNode *data = NULL; + bool need_reply = true; ++ pcmk__action_result_t result = PCMK__UNKNOWN_RESULT; + +- char *output = NULL; + const char *op = crm_element_value(request, F_STONITH_OPERATION); + const char *client_id = crm_element_value(request, F_STONITH_CLIENTID); + +@@ -2935,8 +2950,9 @@ handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, + need_reply = false; + + } else if (pcmk__str_eq(op, STONITH_OP_EXEC, pcmk__str_none)) { +- rc = stonith_device_action(request, &output); +- need_reply = (rc != -EINPROGRESS); ++ execute_agent_action(request, &result); ++ need_reply = (result.execution_status != PCMK_EXEC_PENDING); ++ rc = pcmk_rc2legacy(stonith__result2rc(&result)); + + } else if (pcmk__str_eq(op, STONITH_OP_TIMEOUT_UPDATE, pcmk__str_none)) { + const char *call_id = crm_element_value(request, F_STONITH_CALLID); +@@ -3150,19 +3166,20 @@ handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, + done: + // Reply if result is known + if (need_reply) { +- xmlNode *reply = stonith_construct_reply(request, output, data, rc); ++ xmlNode *reply = stonith_construct_reply(request, result.action_stdout, data, rc); + + stonith_send_reply(reply, call_options, remote_peer, client_id); + free_xml(reply); + } + +- free(output); + free_xml(data); + + crm_debug("Processed %s request from %s %s: %s (rc=%d)", + op, ((client == NULL)? "peer" : "client"), + ((client == NULL)? remote_peer : pcmk__client_name(client)), + ((rc > 0)? 
"" : pcmk_strerror(rc)), rc); ++ ++ pcmk__reset_result(&result); + } + + static void +-- +2.27.0 + + +From 9601b2aff1ea6a4eef0bb2701c22c1e971a657eb Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Thu, 18 Nov 2021 17:31:20 -0600 +Subject: [PATCH 03/23] Refactor: fencer: track full result for local fencing + +This renames stonith_fence() to fence_locally() for readability, and has it set +a full result rather than return a legacy return code. + +As of this commit, handle_request() just maps the result back to a legacy code, +but it will make better use of it with planned changes. +--- + daemons/fenced/fenced_commands.c | 38 +++++++++++++++++++++----------- + 1 file changed, 25 insertions(+), 13 deletions(-) + +diff --git a/daemons/fenced/fenced_commands.c b/daemons/fenced/fenced_commands.c +index 2f59ef84b7..bfb0d71e5f 100644 +--- a/daemons/fenced/fenced_commands.c ++++ b/daemons/fenced/fenced_commands.c +@@ -2626,37 +2626,49 @@ stonith_fence_get_devices_cb(GList * devices, void *user_data) + } + } + +-static int +-stonith_fence(xmlNode * msg) ++/*! 
++ * \internal ++ * \brief Execute a fence action via the local node ++ * ++ * \param[in] msg Fencing request ++ * \param[out] result Where to store result of fence action ++ */ ++static void ++fence_locally(xmlNode *msg, pcmk__action_result_t *result) + { + const char *device_id = NULL; + stonith_device_t *device = NULL; + async_command_t *cmd = create_async_command(msg); + xmlNode *dev = get_xpath_object("//@" F_STONITH_TARGET, msg, LOG_ERR); + ++ CRM_CHECK(result != NULL, return); ++ + if (cmd == NULL) { +- return -EPROTO; ++ fenced_set_protocol_error(result); ++ return; + } + + device_id = crm_element_value(dev, F_STONITH_DEVICE); +- if (device_id) { ++ if (device_id != NULL) { + device = g_hash_table_lookup(device_list, device_id); + if (device == NULL) { + crm_err("Requested device '%s' is not available", device_id); +- return -ENODEV; ++ pcmk__set_result(result, CRM_EX_ERROR, PCMK_EXEC_NO_FENCE_DEVICE, ++ "Requested fence device not found"); ++ return; + } + schedule_stonith_command(cmd, device); + + } else { + const char *host = crm_element_value(dev, F_STONITH_TARGET); + +- if (cmd->options & st_opt_cs_nodeid) { +- int nodeid; +- crm_node_t *node; ++ if (pcmk_is_set(cmd->options, st_opt_cs_nodeid)) { ++ int nodeid = 0; ++ crm_node_t *node = NULL; + + pcmk__scan_min_int(host, &nodeid, 0); + node = pcmk__search_known_node_cache(nodeid, NULL, CRM_GET_PEER_ANY); +- if (node) { ++ if (node != NULL) { + host = node->uname; + } + } +@@ -2666,7 +2678,7 @@ stonith_fence(xmlNode * msg) + TRUE, cmd, stonith_fence_get_devices_cb); + } + +- return -EINPROGRESS; ++ pcmk__set_result(result, CRM_EX_OK, PCMK_EXEC_PENDING, NULL); + } + + xmlNode * +@@ -3016,9 +3028,9 @@ handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, + } + + } else if (pcmk__str_eq(op, STONITH_OP_FENCE, pcmk__str_none)) { +- +- if (remote_peer || stand_alone) { +- rc = stonith_fence(request); ++ if ((remote_peer != NULL) || stand_alone) { ++ fence_locally(request, &result); ++ rc = 
pcmk_rc2legacy(stonith__result2rc(&result)); + + } else if (pcmk_is_set(call_options, st_opt_manual_ack)) { + switch (fenced_handle_manual_confirmation(client, request)) { +-- +2.27.0 + + +From b7c7676cfd36fd72d3b29e86a23db97081e19b03 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Mon, 29 Nov 2021 17:06:52 -0600 +Subject: [PATCH 04/23] Low: fencer: handle topology level registration errors + better + +Rename stonith_level_register() to fenced_register_level() for consistency, and +refactor it to return a full result rather than a legacy return code. + +Return a protocol error for missing information in the request XML, and log +invalid level numbers at warning level. Use a new combination of +PCMK_EXEC_INVALID with CRM_EX_INVALID_PARAM for invalid levels, so it gets +mapped back to the legacy code -EINVAL (which was returned before). +--- + daemons/fenced/fenced_commands.c | 52 +++++++++++++++++++++---------- + daemons/fenced/pacemaker-fenced.c | 9 +++--- + daemons/fenced/pacemaker-fenced.h | 3 +- + lib/fencing/st_actions.c | 1 + + 4 files changed, 44 insertions(+), 21 deletions(-) + +diff --git a/daemons/fenced/fenced_commands.c b/daemons/fenced/fenced_commands.c +index bfb0d71e5f..975f8633a4 100644 +--- a/daemons/fenced/fenced_commands.c ++++ b/daemons/fenced/fenced_commands.c +@@ -1583,20 +1583,19 @@ parse_device_list(const char *devices) + + /*! + * \internal +- * \brief Register a STONITH level for a target ++ * \brief Register a fencing topology level for a target + * + * Given an XML request specifying the target name, level index, and device IDs + * for the level, this will create an entry for the target in the global topology + * table if one does not already exist, then append the specified device IDs to + * the entry's device list for the specified level. 
+ * +- * \param[in] msg XML request for STONITH level registration +- * \param[out] desc If not NULL, will be set to string representation ("TARGET[LEVEL]") +- * +- * \return pcmk_ok on success, -EINVAL if XML does not specify valid level index ++ * \param[in] msg XML request for STONITH level registration ++ * \param[out] desc If not NULL, set to string representation "TARGET[LEVEL]" ++ * \param[out] result Where to set result of registration + */ +-int +-stonith_level_register(xmlNode *msg, char **desc) ++void ++fenced_register_level(xmlNode *msg, char **desc, pcmk__action_result_t *result) + { + int id = 0; + xmlNode *level; +@@ -1607,6 +1606,13 @@ stonith_level_register(xmlNode *msg, char **desc) + stonith_key_value_t *dIter = NULL; + stonith_key_value_t *devices = NULL; + ++ CRM_CHECK(result != NULL, return); ++ ++ if (msg == NULL) { ++ fenced_set_protocol_error(result); ++ return; ++ } ++ + /* Allow the XML here to point to the level tag directly, or wrapped in + * another tag. If directly, don't search by xpath, because it might give + * multiple hits (e.g. if the XML is the CIB). 
+@@ -1614,11 +1620,15 @@ stonith_level_register(xmlNode *msg, char **desc) + if (pcmk__str_eq(TYPE(msg), XML_TAG_FENCING_LEVEL, pcmk__str_casei)) { + level = msg; + } else { +- level = get_xpath_object("//" XML_TAG_FENCING_LEVEL, msg, LOG_ERR); ++ level = get_xpath_object("//" XML_TAG_FENCING_LEVEL, msg, LOG_WARNING); ++ } ++ if (level == NULL) { ++ fenced_set_protocol_error(result); ++ return; + } +- CRM_CHECK(level != NULL, return -EINVAL); + + mode = stonith_level_kind(level); ++ + target = stonith_level_key(level, mode); + crm_element_value_int(level, XML_ATTR_STONITH_INDEX, &id); + +@@ -1626,18 +1636,26 @@ stonith_level_register(xmlNode *msg, char **desc) + *desc = crm_strdup_printf("%s[%d]", target, id); + } + +- /* Sanity-check arguments */ +- if (mode >= 3 || (id <= 0) || (id >= ST_LEVEL_MAX)) { +- crm_trace("Could not add %s[%d] (%d) to the topology (%d active entries)", target, id, mode, g_hash_table_size(topology)); ++ // Ensure level ID is in allowed range ++ if ((id <= 0) || (id >= ST_LEVEL_MAX)) { ++ crm_warn("Ignoring topology registration for %s with invalid level %d", ++ target, id); + free(target); +- crm_log_xml_err(level, "Bad topology"); +- return -EINVAL; ++ crm_log_xml_warn(level, "Bad level"); ++ pcmk__set_result(result, CRM_EX_INVALID_PARAM, PCMK_EXEC_INVALID, ++ "Invalid topology level"); ++ return; + } + + /* Find or create topology table entry */ + tp = g_hash_table_lookup(topology, target); + if (tp == NULL) { + tp = calloc(1, sizeof(stonith_topology_t)); ++ if (tp == NULL) { ++ pcmk__set_result(result, CRM_EX_ERROR, PCMK_EXEC_ERROR, ++ strerror(ENOMEM)); ++ return; ++ } + tp->kind = mode; + tp->target = target; + tp->target_value = crm_element_value_copy(level, XML_ATTR_STONITH_TARGET_VALUE); +@@ -1671,7 +1689,8 @@ stonith_level_register(xmlNode *msg, char **desc) + crm_info("Target %s has %d active fencing level%s", + tp->target, nlevels, pcmk__plural_s(nlevels)); + } +- return pcmk_ok; ++ ++ pcmk__set_result(result, CRM_EX_OK, 
PCMK_EXEC_DONE, NULL); + } + + int +@@ -3142,7 +3161,8 @@ handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, + char *device_id = NULL; + + if (is_privileged(client, op)) { +- rc = stonith_level_register(request, &device_id); ++ fenced_register_level(request, &device_id, &result); ++ rc = pcmk_rc2legacy(stonith__result2rc(&result)); + } else { + rc = -EACCES; + } +diff --git a/daemons/fenced/pacemaker-fenced.c b/daemons/fenced/pacemaker-fenced.c +index 0a8b3bf6f2..469304f67c 100644 +--- a/daemons/fenced/pacemaker-fenced.c ++++ b/daemons/fenced/pacemaker-fenced.c +@@ -452,8 +452,8 @@ remove_cib_device(xmlXPathObjectPtr xpathObj) + static void + handle_topology_change(xmlNode *match, bool remove) + { +- int rc; + char *desc = NULL; ++ pcmk__action_result_t result = PCMK__UNKNOWN_RESULT; + + CRM_CHECK(match != NULL, return); + crm_trace("Updating %s", ID(match)); +@@ -467,9 +467,10 @@ handle_topology_change(xmlNode *match, bool remove) + free(key); + } + +- rc = stonith_level_register(match, &desc); +- do_stonith_notify_level(STONITH_OP_LEVEL_ADD, rc, desc); +- ++ fenced_register_level(match, &desc, &result); ++ do_stonith_notify_level(STONITH_OP_LEVEL_ADD, ++ pcmk_rc2legacy(stonith__result2rc(&result)), desc); ++ pcmk__reset_result(&result); + free(desc); + } + +diff --git a/daemons/fenced/pacemaker-fenced.h b/daemons/fenced/pacemaker-fenced.h +index 5162ada75d..cf114fb979 100644 +--- a/daemons/fenced/pacemaker-fenced.h ++++ b/daemons/fenced/pacemaker-fenced.h +@@ -218,7 +218,8 @@ void stonith_device_remove(const char *id, bool from_cib); + + char *stonith_level_key(xmlNode * msg, int mode); + int stonith_level_kind(xmlNode * msg); +-int stonith_level_register(xmlNode * msg, char **desc); ++void fenced_register_level(xmlNode *msg, char **desc, ++ pcmk__action_result_t *result); + + int stonith_level_remove(xmlNode * msg, char **desc); + +diff --git a/lib/fencing/st_actions.c b/lib/fencing/st_actions.c +index 7eaa8b0f2b..37fa849847 100644 +--- 
a/lib/fencing/st_actions.c ++++ b/lib/fencing/st_actions.c +@@ -325,6 +325,7 @@ stonith__result2rc(const pcmk__action_result_t *result) + */ + case PCMK_EXEC_INVALID: + switch (result->exit_status) { ++ case CRM_EX_INVALID_PARAM: return EINVAL; + case CRM_EX_INSUFFICIENT_PRIV: return EACCES; + case CRM_EX_PROTOCOL: return EPROTO; + +-- +2.27.0 + + +From 27cedca4070328ecac1761f81c2890059af19dcf Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Mon, 29 Nov 2021 17:29:38 -0600 +Subject: [PATCH 05/23] Low: fencer: handle topology level unregistration + errors better + +Rename stonith_level_remove() to fenced_unregister_level() for consistency, and +refactor it to return a full result rather than a legacy return code. + +Return a protocol error for missing information in the request XML, and log +invalid level numbers at warning level. Use PCMK_EXEC_INVALID with +CRM_EX_INVALID_PARAM for invalid levels, so it gets mapped back to the legacy +code -EINVAL (which reverses the recent change in ec60f014b, both for backward +compatibility and because it makes sense -- a missing parameter is a protocol +error, while an invalid parameter is an invalid parameter error). +--- + daemons/fenced/fenced_commands.c | 52 ++++++++++++++++++++++++------- + daemons/fenced/pacemaker-fenced.c | 9 +++--- + daemons/fenced/pacemaker-fenced.h | 4 +-- + 3 files changed, 48 insertions(+), 17 deletions(-) + +diff --git a/daemons/fenced/fenced_commands.c b/daemons/fenced/fenced_commands.c +index 975f8633a4..ef41dc0e52 100644 +--- a/daemons/fenced/fenced_commands.c ++++ b/daemons/fenced/fenced_commands.c +@@ -1693,25 +1693,54 @@ fenced_register_level(xmlNode *msg, char **desc, pcmk__action_result_t *result) + pcmk__set_result(result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); + } + +-int +-stonith_level_remove(xmlNode *msg, char **desc) ++/*! 
++ * \internal ++ * \brief Unregister a fencing topology level for a target ++ * ++ * Given an XML request specifying the target name and level index (or 0 for all ++ * levels), this will remove any corresponding entry for the target from the ++ * global topology table. ++ * ++ * \param[in] msg XML request for STONITH level registration ++ * \param[out] desc If not NULL, set to string representation "TARGET[LEVEL]" ++ * \param[out] result Where to set result of unregistration ++ */ ++void ++fenced_unregister_level(xmlNode *msg, char **desc, ++ pcmk__action_result_t *result) + { + int id = -1; + stonith_topology_t *tp; + char *target; ++ xmlNode *level = NULL; ++ ++ CRM_CHECK(result != NULL, return); + +- /* Unlike additions, removal requests should always have one level tag */ +- xmlNode *level = get_xpath_object("//" XML_TAG_FENCING_LEVEL, msg, LOG_ERR); ++ if (msg == NULL) { ++ fenced_set_protocol_error(result); ++ return; ++ } + +- CRM_CHECK(level != NULL, return -EPROTO); ++ // Unlike additions, removal requests should always have one level tag ++ level = get_xpath_object("//" XML_TAG_FENCING_LEVEL, msg, LOG_WARNING); ++ if (level == NULL) { ++ fenced_set_protocol_error(result); ++ return; ++ } + + target = stonith_level_key(level, -1); + crm_element_value_int(level, XML_ATTR_STONITH_INDEX, &id); + +- CRM_CHECK((id >= 0) && (id < ST_LEVEL_MAX), +- crm_log_xml_warn(msg, "invalid level"); +- free(target); +- return -EPROTO); ++ // Ensure level ID is in allowed range ++ if ((id < 0) || (id >= ST_LEVEL_MAX)) { ++ crm_warn("Ignoring topology unregistration for %s with invalid level %d", ++ target, id); ++ free(target); ++ crm_log_xml_warn(level, "Bad level"); ++ pcmk__set_result(result, CRM_EX_INVALID_PARAM, PCMK_EXEC_INVALID, ++ "Invalid topology level"); ++ return; ++ } + + if (desc) { + *desc = crm_strdup_printf("%s[%d]", target, id); +@@ -1745,7 +1774,7 @@ stonith_level_remove(xmlNode *msg, char **desc) + } + + free(target); +- return pcmk_ok; ++ 
pcmk__set_result(result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); + } + + static char * +@@ -3173,7 +3202,8 @@ handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, + char *device_id = NULL; + + if (is_privileged(client, op)) { +- rc = stonith_level_remove(request, &device_id); ++ fenced_unregister_level(request, &device_id, &result); ++ rc = pcmk_rc2legacy(stonith__result2rc(&result)); + } else { + rc = -EACCES; + } +diff --git a/daemons/fenced/pacemaker-fenced.c b/daemons/fenced/pacemaker-fenced.c +index 469304f67c..56acc93f31 100644 +--- a/daemons/fenced/pacemaker-fenced.c ++++ b/daemons/fenced/pacemaker-fenced.c +@@ -409,17 +409,18 @@ do_stonith_notify_level(const char *op, int rc, const char *desc) + static void + topology_remove_helper(const char *node, int level) + { +- int rc; + char *desc = NULL; ++ pcmk__action_result_t result = PCMK__UNKNOWN_RESULT; + xmlNode *data = create_xml_node(NULL, XML_TAG_FENCING_LEVEL); + + crm_xml_add(data, F_STONITH_ORIGIN, __func__); + crm_xml_add_int(data, XML_ATTR_STONITH_INDEX, level); + crm_xml_add(data, XML_ATTR_STONITH_TARGET, node); + +- rc = stonith_level_remove(data, &desc); +- do_stonith_notify_level(STONITH_OP_LEVEL_DEL, rc, desc); +- ++ fenced_unregister_level(data, &desc, &result); ++ do_stonith_notify_level(STONITH_OP_LEVEL_DEL, ++ pcmk_rc2legacy(stonith__result2rc(&result)), desc); ++ pcmk__reset_result(&result); + free_xml(data); + free(desc); + } +diff --git a/daemons/fenced/pacemaker-fenced.h b/daemons/fenced/pacemaker-fenced.h +index cf114fb979..0006e02e7d 100644 +--- a/daemons/fenced/pacemaker-fenced.h ++++ b/daemons/fenced/pacemaker-fenced.h +@@ -220,8 +220,8 @@ char *stonith_level_key(xmlNode * msg, int mode); + int stonith_level_kind(xmlNode * msg); + void fenced_register_level(xmlNode *msg, char **desc, + pcmk__action_result_t *result); +- +-int stonith_level_remove(xmlNode * msg, char **desc); ++void fenced_unregister_level(xmlNode *msg, char **desc, ++ pcmk__action_result_t *result); + + 
stonith_topology_t *find_topology_for_host(const char *host); + +-- +2.27.0 + + +From 3f603defca78eb2bdd46c51a80ed04a4c773442b Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Fri, 19 Nov 2021 12:22:33 -0600 +Subject: [PATCH 06/23] Log: fencer: track and log full result when handling + requests + +handle_request() now tracks and logs a full result rather than just a +legacy return code. +--- + daemons/fenced/fenced_commands.c | 95 ++++++++++++++++++-------------- + 1 file changed, 53 insertions(+), 42 deletions(-) + +diff --git a/daemons/fenced/fenced_commands.c b/daemons/fenced/fenced_commands.c +index ef41dc0e52..996c18faaa 100644 +--- a/daemons/fenced/fenced_commands.c ++++ b/daemons/fenced/fenced_commands.c +@@ -2981,9 +2981,7 @@ static void + handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, + xmlNode *request, const char *remote_peer) + { +- int call_options = 0; +- int rc = -EOPNOTSUPP; +- ++ int call_options = st_opt_none; + xmlNode *data = NULL; + bool need_reply = true; + pcmk__action_result_t result = PCMK__UNKNOWN_RESULT; +@@ -3006,13 +3004,12 @@ handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, + pcmk__ipc_send_xml(client, id, reply, flags); + client->request_id = 0; + free_xml(reply); +- rc = pcmk_ok; ++ pcmk__set_result(&result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); + need_reply = false; + + } else if (pcmk__str_eq(op, STONITH_OP_EXEC, pcmk__str_none)) { + execute_agent_action(request, &result); + need_reply = (result.execution_status != PCMK_EXEC_PENDING); +- rc = pcmk_rc2legacy(stonith__result2rc(&result)); + + } else if (pcmk__str_eq(op, STONITH_OP_TIMEOUT_UPDATE, pcmk__str_none)) { + const char *call_id = crm_element_value(request, F_STONITH_CALLID); +@@ -3021,7 +3018,7 @@ handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, + + crm_element_value_int(request, F_STONITH_TIMEOUT, &op_timeout); + do_stonith_async_timeout_update(client_id, call_id, op_timeout); +- rc = pcmk_ok; ++ 
pcmk__set_result(&result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); + need_reply = false; + + } else if (pcmk__str_eq(op, STONITH_OP_QUERY, pcmk__str_none)) { +@@ -3033,7 +3030,7 @@ handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, + remove_relay_op(request); + + stonith_query(request, remote_peer, client_id, call_options); +- rc = pcmk_ok; ++ pcmk__set_result(&result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); + need_reply = false; + + } else if (pcmk__str_eq(op, T_STONITH_NOTIFY, pcmk__str_none)) { +@@ -3055,7 +3052,7 @@ handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, + } + + pcmk__ipc_send_ack(client, id, flags, "ack", CRM_EX_OK); +- rc = pcmk_ok; ++ pcmk__set_result(&result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); + need_reply = false; + + } else if (pcmk__str_eq(op, STONITH_OP_RELAY, pcmk__str_none)) { +@@ -3069,27 +3066,27 @@ handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, + crm_element_value(dev, F_STONITH_TARGET)); + + if (initiate_remote_stonith_op(NULL, request, FALSE) == NULL) { +- rc = -EPROTO; ++ fenced_set_protocol_error(&result); + } else { +- rc = -EINPROGRESS; ++ pcmk__set_result(&result, CRM_EX_OK, PCMK_EXEC_PENDING, NULL); + need_reply = false; + } + + } else if (pcmk__str_eq(op, STONITH_OP_FENCE, pcmk__str_none)) { + if ((remote_peer != NULL) || stand_alone) { + fence_locally(request, &result); +- rc = pcmk_rc2legacy(stonith__result2rc(&result)); + + } else if (pcmk_is_set(call_options, st_opt_manual_ack)) { + switch (fenced_handle_manual_confirmation(client, request)) { + case pcmk_rc_ok: +- rc = pcmk_ok; ++ pcmk__set_result(&result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); + break; + case EINPROGRESS: +- rc = -EINPROGRESS; ++ pcmk__set_result(&result, CRM_EX_OK, PCMK_EXEC_PENDING, ++ NULL); + break; + default: +- rc = -EPROTO; ++ fenced_set_protocol_error(&result); + break; + } + +@@ -3100,17 +3097,15 @@ handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, + const char *action = crm_element_value(dev, 
F_STONITH_ACTION); + const char *device = crm_element_value(dev, F_STONITH_DEVICE); + +- if (client) { ++ if (client != NULL) { + int tolerance = 0; + + crm_notice("Client %s wants to fence (%s) %s using %s", + pcmk__client_name(client), action, + target, (device? device : "any device")); +- + crm_element_value_int(dev, F_STONITH_TOLERANCE, &tolerance); +- + if (stonith_check_fence_tolerance(tolerance, target, action)) { +- rc = pcmk_ok; ++ pcmk__set_result(&result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); + goto done; + } + +@@ -3143,24 +3138,24 @@ handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, + crm_xml_add(request, F_STONITH_REMOTE_OP_ID, op->id); + send_cluster_message(crm_get_peer(0, alternate_host), crm_msg_stonith_ng, request, + FALSE); +- rc = -EINPROGRESS; ++ pcmk__set_result(&result, CRM_EX_OK, PCMK_EXEC_PENDING, NULL); + + } else if (initiate_remote_stonith_op(client, request, FALSE) == NULL) { +- rc = -EPROTO; ++ fenced_set_protocol_error(&result); ++ + } else { +- rc = -EINPROGRESS; ++ pcmk__set_result(&result, CRM_EX_OK, PCMK_EXEC_PENDING, NULL); + } + } +- need_reply = (rc != -EINPROGRESS); ++ need_reply = (result.execution_status != PCMK_EXEC_PENDING); + + } else if (pcmk__str_eq(op, STONITH_OP_FENCE_HISTORY, pcmk__str_none)) { + stonith_fence_history(request, &data, remote_peer, call_options); +- rc = pcmk_ok; ++ pcmk__set_result(&result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); + if (pcmk_is_set(call_options, st_opt_discard_reply)) { + /* we don't expect answers to the broadcast + * we might have sent out + */ +- rc = pcmk_ok; + need_reply = false; + } + +@@ -3168,11 +3163,18 @@ handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, + const char *device_id = NULL; + + if (is_privileged(client, op)) { +- rc = stonith_device_register(request, &device_id, FALSE); ++ int rc = stonith_device_register(request, &device_id, FALSE); ++ ++ pcmk__set_result(&result, ++ ((rc == pcmk_ok)? 
CRM_EX_OK : CRM_EX_ERROR), ++ stonith__legacy2status(rc), ++ ((rc == pcmk_ok)? NULL : pcmk_strerror(rc))); + } else { +- rc = -EACCES; ++ pcmk__set_result(&result, CRM_EX_INSUFFICIENT_PRIV, ++ PCMK_EXEC_INVALID, ++ "Unprivileged users must register device via CIB"); + } +- do_stonith_notify_device(op, rc, device_id); ++ do_stonith_notify_device(op, pcmk_rc2legacy(stonith__result2rc(&result)), device_id); + + } else if (pcmk__str_eq(op, STONITH_OP_DEVICE_DEL, pcmk__str_none)) { + xmlNode *dev = get_xpath_object("//" F_STONITH_DEVICE, request, LOG_ERR); +@@ -3180,22 +3182,25 @@ handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, + + if (is_privileged(client, op)) { + stonith_device_remove(device_id, false); +- rc = pcmk_ok; ++ pcmk__set_result(&result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); + } else { +- rc = -EACCES; ++ pcmk__set_result(&result, CRM_EX_INSUFFICIENT_PRIV, ++ PCMK_EXEC_INVALID, ++ "Unprivileged users must delete device via CIB"); + } +- do_stonith_notify_device(op, rc, device_id); ++ do_stonith_notify_device(op, pcmk_rc2legacy(stonith__result2rc(&result)), device_id); + + } else if (pcmk__str_eq(op, STONITH_OP_LEVEL_ADD, pcmk__str_none)) { + char *device_id = NULL; + + if (is_privileged(client, op)) { + fenced_register_level(request, &device_id, &result); +- rc = pcmk_rc2legacy(stonith__result2rc(&result)); + } else { +- rc = -EACCES; ++ pcmk__set_result(&result, CRM_EX_INSUFFICIENT_PRIV, ++ PCMK_EXEC_INVALID, ++ "Unprivileged users must add level via CIB"); + } +- do_stonith_notify_level(op, rc, device_id); ++ do_stonith_notify_level(op, pcmk_rc2legacy(stonith__result2rc(&result)), device_id); + free(device_id); + + } else if (pcmk__str_eq(op, STONITH_OP_LEVEL_DEL, pcmk__str_none)) { +@@ -3203,11 +3208,12 @@ handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, + + if (is_privileged(client, op)) { + fenced_unregister_level(request, &device_id, &result); +- rc = pcmk_rc2legacy(stonith__result2rc(&result)); + } else { +- rc = 
-EACCES; ++ pcmk__set_result(&result, CRM_EX_INSUFFICIENT_PRIV, ++ PCMK_EXEC_INVALID, ++ "Unprivileged users must delete level via CIB"); + } +- do_stonith_notify_level(op, rc, device_id); ++ do_stonith_notify_level(op, pcmk_rc2legacy(stonith__result2rc(&result)), device_id); + + } else if(pcmk__str_eq(op, CRM_OP_RM_NODE_CACHE, pcmk__str_casei)) { + int node_id = 0; +@@ -3216,31 +3222,36 @@ handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, + crm_element_value_int(request, XML_ATTR_ID, &node_id); + name = crm_element_value(request, XML_ATTR_UNAME); + reap_crm_member(node_id, name); +- rc = pcmk_ok; ++ pcmk__set_result(&result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); + need_reply = false; + + } else { + crm_err("Unknown IPC request %s from %s %s", op, + ((client == NULL)? "peer" : "client"), + ((client == NULL)? remote_peer : pcmk__client_name(client))); ++ pcmk__set_result(&result, CRM_EX_PROTOCOL, PCMK_EXEC_INVALID, ++ "Unknown IPC request type (bug?)"); + } + + done: + // Reply if result is known + if (need_reply) { +- xmlNode *reply = stonith_construct_reply(request, result.action_stdout, data, rc); ++ xmlNode *reply = stonith_construct_reply(request, result.action_stdout, data, ++ pcmk_rc2legacy(stonith__result2rc(&result))); + + stonith_send_reply(reply, call_options, remote_peer, client_id); + free_xml(reply); + } + +- free_xml(data); +- +- crm_debug("Processed %s request from %s %s: %s (rc=%d)", ++ crm_debug("Processed %s request from %s %s: %s%s%s%s", + op, ((client == NULL)? "peer" : "client"), + ((client == NULL)? remote_peer : pcmk__client_name(client)), +- ((rc > 0)? "" : pcmk_strerror(rc)), rc); ++ pcmk_exec_status_str(result.execution_status), ++ (result.exit_reason == NULL)? "" : " (", ++ (result.exit_reason == NULL)? "" : result.exit_reason, ++ (result.exit_reason == NULL)? 
"" : ")"); + ++ free_xml(data); + pcmk__reset_result(&result); + } + +-- +2.27.0 + + +From 5e13199699a4e9279520b3668c072e3db49c9782 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Fri, 19 Nov 2021 15:10:36 -0600 +Subject: [PATCH 07/23] Feature: fencer: pass full result in replies to + requests + +Rename stonith_construct_reply() to fenced_construct_reply() for consistency, +make it take a full result as an argument rather than separate arguments for +legacy return code and output, and add the full result to the reply (along with +the legacy return code, for backward compatibility). + +This is used for peer query replies and some request replies (including replies +to local clients who requested fencing). Other replies, such as those built by +construct_async_reply(), are not affected by this commit. +--- + daemons/fenced/fenced_commands.c | 33 ++++++++++++++++++++++--------- + daemons/fenced/fenced_remote.c | 9 ++++++++- + daemons/fenced/pacemaker-fenced.h | 4 ++-- + 3 files changed, 34 insertions(+), 12 deletions(-) + +diff --git a/daemons/fenced/fenced_commands.c b/daemons/fenced/fenced_commands.c +index 996c18faaa..84f89e8daf 100644 +--- a/daemons/fenced/fenced_commands.c ++++ b/daemons/fenced/fenced_commands.c +@@ -2322,6 +2322,7 @@ stonith_query(xmlNode * msg, const char *remote_peer, const char *client_id, int + const char *target = NULL; + int timeout = 0; + xmlNode *dev = get_xpath_object("//@" F_STONITH_ACTION, msg, LOG_NEVER); ++ pcmk__action_result_t result = PCMK__UNKNOWN_RESULT; + + crm_element_value_int(msg, F_STONITH_TIMEOUT, &timeout); + if (dev) { +@@ -2338,7 +2339,8 @@ stonith_query(xmlNode * msg, const char *remote_peer, const char *client_id, int + crm_log_xml_debug(msg, "Query"); + query = calloc(1, sizeof(struct st_query_data)); + +- query->reply = stonith_construct_reply(msg, NULL, NULL, pcmk_ok); ++ pcmk__set_result(&result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); ++ query->reply = fenced_construct_reply(msg, NULL, &result); + 
query->remote_peer = remote_peer ? strdup(remote_peer) : NULL; + query->client_id = client_id ? strdup(client_id) : NULL; + query->target = target ? strdup(target) : NULL; +@@ -2729,8 +2731,23 @@ fence_locally(xmlNode *msg, pcmk__action_result_t *result) + pcmk__set_result(result, CRM_EX_OK, PCMK_EXEC_PENDING, NULL); + } + ++/*! ++ * \internal ++ * \brief Build an XML reply for a fencing operation ++ * ++ * \param[in] request Request that reply is for ++ * \param[in] data If not NULL, add to reply as call data ++ * \param[in] result Full result of fencing operation ++ * ++ * \return Newly created XML reply ++ * \note The caller is responsible for freeing the result. ++ * \note This has some overlap with construct_async_reply(), but that copies ++ * values from an async_command_t, whereas this one copies them from the ++ * request. ++ */ + xmlNode * +-stonith_construct_reply(xmlNode * request, const char *output, xmlNode * data, int rc) ++fenced_construct_reply(xmlNode *request, xmlNode *data, ++ pcmk__action_result_t *result) + { + xmlNode *reply = NULL; + +@@ -2738,8 +2755,7 @@ stonith_construct_reply(xmlNode * request, const char *output, xmlNode * data, i + + crm_xml_add(reply, "st_origin", __func__); + crm_xml_add(reply, F_TYPE, T_STONITH_NG); +- crm_xml_add(reply, F_STONITH_OUTPUT, output); +- crm_xml_add_int(reply, F_STONITH_RC, rc); ++ stonith__xe_set_result(reply, result); + + if (request == NULL) { + /* Most likely, this is the result of a stonith operation that was +@@ -2749,12 +2765,14 @@ stonith_construct_reply(xmlNode * request, const char *output, xmlNode * data, i + * @TODO Maybe synchronize this information at start-up? 
+ */ + crm_warn("Missing request information for client notifications for " +- "operation with result %d (initiated before we came up?)", rc); ++ "operation with result '%s' (initiated before we came up?)", ++ pcmk_exec_status_str(result->execution_status)); + + } else { + const char *name = NULL; + const char *value = NULL; + ++ // Attributes to copy from request to reply + const char *names[] = { + F_STONITH_OPERATION, + F_STONITH_CALLID, +@@ -2764,8 +2782,6 @@ stonith_construct_reply(xmlNode * request, const char *output, xmlNode * data, i + F_STONITH_CALLOPTS + }; + +- crm_trace("Creating a result reply with%s reply output (rc=%d)", +- (data? "" : "out"), rc); + for (int lpc = 0; lpc < PCMK__NELEM(names); lpc++) { + name = names[lpc]; + value = crm_element_value(request, name); +@@ -3236,8 +3252,7 @@ handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, + done: + // Reply if result is known + if (need_reply) { +- xmlNode *reply = stonith_construct_reply(request, result.action_stdout, data, +- pcmk_rc2legacy(stonith__result2rc(&result))); ++ xmlNode *reply = fenced_construct_reply(request, data, &result); + + stonith_send_reply(reply, call_options, remote_peer, client_id); + free_xml(reply); +diff --git a/daemons/fenced/fenced_remote.c b/daemons/fenced/fenced_remote.c +index 8feb401477..baa07d9e78 100644 +--- a/daemons/fenced/fenced_remote.c ++++ b/daemons/fenced/fenced_remote.c +@@ -415,7 +415,14 @@ handle_local_reply_and_notify(remote_fencing_op_t * op, xmlNode * data, int rc) + crm_xml_add(data, F_STONITH_TARGET, op->target); + crm_xml_add(data, F_STONITH_OPERATION, op->action); + +- reply = stonith_construct_reply(op->request, NULL, data, rc); ++ { ++ pcmk__action_result_t result = PCMK__UNKNOWN_RESULT; ++ ++ pcmk__set_result(&result, ++ ((rc == pcmk_ok)? 
CRM_EX_OK : CRM_EX_ERROR), ++ stonith__legacy2status(rc), NULL); ++ reply = fenced_construct_reply(op->request, data, &result); ++ } + crm_xml_add(reply, F_STONITH_DELEGATE, op->delegate); + + /* Send fencing OP reply to local client that initiated fencing */ +diff --git a/daemons/fenced/pacemaker-fenced.h b/daemons/fenced/pacemaker-fenced.h +index 0006e02e7d..d5f4bc79fd 100644 +--- a/daemons/fenced/pacemaker-fenced.h ++++ b/daemons/fenced/pacemaker-fenced.h +@@ -228,8 +228,8 @@ stonith_topology_t *find_topology_for_host(const char *host); + void do_local_reply(xmlNode * notify_src, const char *client_id, gboolean sync_reply, + gboolean from_peer); + +-xmlNode *stonith_construct_reply(xmlNode * request, const char *output, xmlNode * data, +- int rc); ++xmlNode *fenced_construct_reply(xmlNode *request, xmlNode *data, ++ pcmk__action_result_t *result); + + void + do_stonith_async_timeout_update(const char *client, const char *call_id, int timeout); +-- +2.27.0 + + +From b32aa252b321ff40c834d153cb23f8b3be471611 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Fri, 19 Nov 2021 15:43:20 -0600 +Subject: [PATCH 08/23] Log: fencer: grab and log full result when processing + peer fencing replies + +fenced_process_fencing_reply() now checks for the full result, instead of only +a legacy return code, in peer replies, and uses it in log messages. 
+--- + daemons/fenced/fenced_remote.c | 63 ++++++++++++++++++++-------------- + 1 file changed, 37 insertions(+), 26 deletions(-) + +diff --git a/daemons/fenced/fenced_remote.c b/daemons/fenced/fenced_remote.c +index baa07d9e78..c6369f0051 100644 +--- a/daemons/fenced/fenced_remote.c ++++ b/daemons/fenced/fenced_remote.c +@@ -2095,21 +2095,21 @@ process_remote_stonith_query(xmlNode * msg) + void + fenced_process_fencing_reply(xmlNode *msg) + { +- int rc = 0; + const char *id = NULL; + const char *device = NULL; + remote_fencing_op_t *op = NULL; + xmlNode *dev = get_xpath_object("//@" F_STONITH_REMOTE_OP_ID, msg, LOG_ERR); ++ pcmk__action_result_t result = PCMK__UNKNOWN_RESULT; + + CRM_CHECK(dev != NULL, return); + + id = crm_element_value(dev, F_STONITH_REMOTE_OP_ID); + CRM_CHECK(id != NULL, return); + +- dev = get_xpath_object("//@" F_STONITH_RC, msg, LOG_ERR); ++ dev = stonith__find_xe_with_result(msg); + CRM_CHECK(dev != NULL, return); + +- crm_element_value_int(dev, F_STONITH_RC, &rc); ++ stonith__xe_get_result(dev, &result); + + device = crm_element_value(dev, F_STONITH_DEVICE); + +@@ -2117,7 +2117,7 @@ fenced_process_fencing_reply(xmlNode *msg) + op = g_hash_table_lookup(stonith_remote_op_list, id); + } + +- if (op == NULL && rc == pcmk_ok) { ++ if ((op == NULL) && pcmk__result_ok(&result)) { + /* Record successful fencing operations */ + const char *client_id = crm_element_value(dev, F_STONITH_CLIENTID); + +@@ -2139,16 +2139,19 @@ fenced_process_fencing_reply(xmlNode *msg) + } + + if (pcmk__str_eq(crm_element_value(msg, F_SUBTYPE), "broadcast", pcmk__str_casei)) { +- crm_debug("Finalizing action '%s' targeting %s on behalf of %s@%s: %s " ++ crm_debug("Finalizing action '%s' targeting %s on behalf of %s@%s: %s%s%s%s " + CRM_XS " id=%.8s", + op->action, op->target, op->client_name, op->originator, +- pcmk_strerror(rc), op->id); +- if (rc == pcmk_ok) { ++ pcmk_exec_status_str(result.execution_status), ++ (result.exit_reason == NULL)? 
"" : " (", ++ (result.exit_reason == NULL)? "" : result.exit_reason, ++ (result.exit_reason == NULL)? "" : ")", op->id); ++ if (pcmk__result_ok(&result)) { + op->state = st_done; + } else { + op->state = st_failed; + } +- remote_op_done(op, msg, rc, FALSE); ++ remote_op_done(op, msg, pcmk_rc2legacy(stonith__result2rc(&result)), FALSE); + return; + } else if (!pcmk__str_eq(op->originator, stonith_our_uname, pcmk__str_casei)) { + /* If this isn't a remote level broadcast, and we are not the +@@ -2162,28 +2165,35 @@ fenced_process_fencing_reply(xmlNode *msg) + if (pcmk_is_set(op->call_options, st_opt_topology)) { + const char *device = crm_element_value(msg, F_STONITH_DEVICE); + +- crm_notice("Action '%s' targeting %s using %s on behalf of %s@%s: %s " +- CRM_XS " rc=%d", ++ crm_notice("Action '%s' targeting %s using %s on behalf of %s@%s: %s%s%s%s", + op->action, op->target, device, op->client_name, +- op->originator, pcmk_strerror(rc), rc); ++ op->originator, ++ pcmk_exec_status_str(result.execution_status), ++ (result.exit_reason == NULL)? "" : " (", ++ (result.exit_reason == NULL)? "" : result.exit_reason, ++ (result.exit_reason == NULL)? "" : ")"); + + /* We own the op, and it is complete. broadcast the result to all nodes + * and notify our local clients. */ + if (op->state == st_done) { +- remote_op_done(op, msg, rc, FALSE); ++ remote_op_done(op, msg, pcmk_rc2legacy(stonith__result2rc(&result)), FALSE); + return; + } + +- if ((op->phase == 2) && (rc != pcmk_ok)) { ++ if ((op->phase == 2) && !pcmk__result_ok(&result)) { + /* A remapped "on" failed, but the node was already turned off + * successfully, so ignore the error and continue. + */ +- crm_warn("Ignoring %s 'on' failure (exit code %d) targeting %s " +- "after successful 'off'", device, rc, op->target); +- rc = pcmk_ok; ++ crm_warn("Ignoring %s 'on' failure (%s%s%s) targeting %s " ++ "after successful 'off'", ++ device, pcmk_exec_status_str(result.execution_status), ++ (result.exit_reason == NULL)? 
"" : ": ", ++ (result.exit_reason == NULL)? "" : result.exit_reason, ++ op->target); ++ pcmk__set_result(&result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); + } + +- if (rc == pcmk_ok) { ++ if (pcmk__result_ok(&result)) { + /* An operation completed successfully. Try another device if + * necessary, otherwise mark the operation as done. */ + advance_topology_device_in_level(op, device, msg); +@@ -2193,29 +2203,30 @@ fenced_process_fencing_reply(xmlNode *msg) + * levels are available, mark this operation as failed and report results. */ + if (advance_topology_level(op, false) != pcmk_rc_ok) { + op->state = st_failed; +- remote_op_done(op, msg, rc, FALSE); ++ remote_op_done(op, msg, pcmk_rc2legacy(stonith__result2rc(&result)), FALSE); + return; + } + } +- } else if (rc == pcmk_ok && op->devices == NULL) { ++ } else if (pcmk__result_ok(&result) && (op->devices == NULL)) { + crm_trace("All done for %s", op->target); +- + op->state = st_done; +- remote_op_done(op, msg, rc, FALSE); ++ remote_op_done(op, msg, pcmk_rc2legacy(stonith__result2rc(&result)), FALSE); + return; +- } else if (rc == -ETIME && op->devices == NULL) { ++ } else if ((result.execution_status == PCMK_EXEC_TIMEOUT) ++ && (op->devices == NULL)) { + /* If the operation timed out don't bother retrying other peers. 
*/ + op->state = st_failed; +- remote_op_done(op, msg, rc, FALSE); ++ remote_op_done(op, msg, pcmk_rc2legacy(stonith__result2rc(&result)), FALSE); + return; + } else { + /* fall-through and attempt other fencing action using another peer */ + } + + /* Retry on failure */ +- crm_trace("Next for %s on behalf of %s@%s (rc was %d)", op->target, op->originator, +- op->client_name, rc); +- call_remote_stonith(op, NULL, rc); ++ crm_trace("Next for %s on behalf of %s@%s (result was: %s)", ++ op->target, op->originator, op->client_name, ++ pcmk_exec_status_str(result.execution_status)); ++ call_remote_stonith(op, NULL, pcmk_rc2legacy(stonith__result2rc(&result))); + } + + gboolean +-- +2.27.0 + + +From afb5706ac606a8ea883aa1597ee63d9891cc2e13 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Fri, 19 Nov 2021 15:56:30 -0600 +Subject: [PATCH 09/23] Refactor: fencer: pass full result of previous failed + action when initiating peer fencing + +Rename call_remote_stonith() to request_peer_fencing() for readability, and +make it take the full result of the previous failed action, rather than just +its legacy return code, as an argument. + +This does cause one change in behavior: if topology is in use, a previous +attempt failed, and no more peers have the appropriate device, then the +legacy return code returned will be -ENODEV rather than -EHOSTUNREACH. +These are treated similarly internally, and hopefully that will not cause +problems for external code. 
+--- + daemons/fenced/fenced_remote.c | 89 +++++++++++++++++++++++++--------- + 1 file changed, 67 insertions(+), 22 deletions(-) + +diff --git a/daemons/fenced/fenced_remote.c b/daemons/fenced/fenced_remote.c +index c6369f0051..31d5ee6e93 100644 +--- a/daemons/fenced/fenced_remote.c ++++ b/daemons/fenced/fenced_remote.c +@@ -76,12 +76,13 @@ typedef struct { + + GHashTable *stonith_remote_op_list = NULL; + +-void call_remote_stonith(remote_fencing_op_t *op, peer_device_info_t *peer, +- int rc); + static void remote_op_done(remote_fencing_op_t * op, xmlNode * data, int rc, int dup); + extern xmlNode *stonith_create_op(int call_id, const char *token, const char *op, xmlNode * data, + int call_options); + ++static void request_peer_fencing(remote_fencing_op_t *op, ++ peer_device_info_t *peer, ++ pcmk__action_result_t *result); + static void report_timeout_period(remote_fencing_op_t * op, int op_timeout); + static int get_op_total_timeout(const remote_fencing_op_t *op, + const peer_device_info_t *chosen_peer); +@@ -609,12 +610,16 @@ static gboolean + remote_op_timeout_one(gpointer userdata) + { + remote_fencing_op_t *op = userdata; ++ pcmk__action_result_t result = PCMK__UNKNOWN_RESULT; + + op->op_timer_one = 0; + + crm_notice("Peer's '%s' action targeting %s for client %s timed out " CRM_XS + " id=%.8s", op->action, op->target, op->client_name, op->id); +- call_remote_stonith(op, NULL, -ETIME); ++ pcmk__set_result(&result, CRM_EX_ERROR, PCMK_EXEC_TIMEOUT, NULL); ++ ++ // Try another device, if appropriate ++ request_peer_fencing(op, NULL, &result); + return FALSE; + } + +@@ -685,9 +690,13 @@ remote_op_query_timeout(gpointer data) + crm_debug("Operation %.8s targeting %s already in progress", + op->id, op->target); + } else if (op->query_results) { ++ // Result won't be used in this case, but we need to pass something ++ pcmk__action_result_t result = PCMK__UNKNOWN_RESULT; ++ ++ // Query succeeded, so attempt the actual fencing + crm_debug("Query %.8s targeting %s 
complete (state=%s)", + op->id, op->target, stonith_op_state_str(op->state)); +- call_remote_stonith(op, NULL, pcmk_ok); ++ request_peer_fencing(op, NULL, &result); + } else { + crm_debug("Query %.8s targeting %s timed out (state=%s)", + op->id, op->target, stonith_op_state_str(op->state)); +@@ -1533,6 +1542,10 @@ static void + advance_topology_device_in_level(remote_fencing_op_t *op, const char *device, + xmlNode *msg) + { ++ pcmk__action_result_t result = PCMK__UNKNOWN_RESULT; ++ ++ pcmk__set_result(&result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); ++ + /* Advance to the next device at this topology level, if any */ + if (op->devices) { + op->devices = op->devices->next; +@@ -1569,7 +1582,7 @@ advance_topology_device_in_level(remote_fencing_op_t *op, const char *device, + op->delay = 0; + } + +- call_remote_stonith(op, NULL, pcmk_ok); ++ request_peer_fencing(op, NULL, &result); + } else { + /* We're done with all devices and phases, so finalize operation */ + crm_trace("Marking complex fencing op targeting %s as complete", +@@ -1598,15 +1611,30 @@ check_watchdog_fencing_and_wait(remote_fencing_op_t * op) + return FALSE; + } + +-void +-call_remote_stonith(remote_fencing_op_t *op, peer_device_info_t *peer, int rc) ++/*! 
++ * \internal ++ * \brief Ask a peer to execute a fencing operation ++ * ++ * \param[in] op Fencing operation to be executed ++ * \param[in] peer If NULL or topology is in use, choose best peer to execute ++ * the fencing, otherwise use this peer ++ * \param[in] result Full result of previous failed attempt, if any (used as ++ * final result only if a previous attempt failed, topology ++ * is not in use, and no devices remain to be attempted) ++ */ ++static void ++request_peer_fencing(remote_fencing_op_t *op, peer_device_info_t *peer, ++ pcmk__action_result_t *result) + { + const char *device = NULL; +- int timeout = op->base_timeout; ++ int timeout; ++ ++ CRM_CHECK(op != NULL, return); + + crm_trace("Action %.8s targeting %s for %s is %s", + op->id, op->target, op->client_name, + stonith_op_state_str(op->state)); ++ timeout = op->base_timeout; + if ((peer == NULL) && !pcmk_is_set(op->call_options, st_opt_topology)) { + peer = stonith_choose_peer(op); + } +@@ -1623,9 +1651,14 @@ call_remote_stonith(remote_fencing_op_t *op, peer_device_info_t *peer, int rc) + } + + if (pcmk_is_set(op->call_options, st_opt_topology) && op->devices) { +- /* Ignore any peer preference, they might not have the device we need */ +- /* When using topology, stonith_choose_peer() removes the device from +- * further consideration, so be sure to calculate timeout beforehand */ ++ /* Ignore the caller's peer preference if topology is in use, because ++ * that peer might not have access to the required device. With ++ * topology, stonith_choose_peer() removes the device from further ++ * consideration, so the timeout must be calculated beforehand. ++ * ++ * @TODO Basing the total timeout on the caller's preferred peer (above) ++ * is less than ideal. 
++ */ + peer = stonith_choose_peer(op); + + device = op->devices->data; +@@ -1722,8 +1755,6 @@ call_remote_stonith(remote_fencing_op_t *op, peer_device_info_t *peer, int rc) + finalize_timed_out_op(op); + + } else if(op->replies >= op->replies_expected || op->replies >= fencing_active_peers()) { +-// int rc = -EHOSTUNREACH; +- + /* if the operation never left the query state, + * but we have all the expected replies, then no devices + * are available to execute the fencing operation. */ +@@ -1735,17 +1766,28 @@ call_remote_stonith(remote_fencing_op_t *op, peer_device_info_t *peer, int rc) + } + } + ++ // This is the only case in which result will be used ++ CRM_CHECK(result != NULL, return); ++ + if (op->state == st_query) { + crm_info("No peers (out of %d) have devices capable of fencing " + "(%s) %s for client %s " CRM_XS " state=%s", + op->replies, op->action, op->target, op->client_name, + stonith_op_state_str(op->state)); + +- rc = -ENODEV; ++ pcmk__reset_result(result); ++ pcmk__set_result(result, CRM_EX_ERROR, PCMK_EXEC_NO_FENCE_DEVICE, ++ NULL); + } else { + if (pcmk_is_set(op->call_options, st_opt_topology)) { +- rc = -EHOSTUNREACH; +- } ++ pcmk__reset_result(result); ++ pcmk__set_result(result, CRM_EX_ERROR, ++ PCMK_EXEC_NO_FENCE_DEVICE, NULL); ++ } ++ /* ... else use result provided by caller -- overwriting it with ++ PCMK_EXEC_NO_FENCE_DEVICE would prevent remote_op_done() from ++ setting the correct delegate if needed. 
++ */ + + crm_info("No peers (out of %d) are capable of fencing (%s) %s " + "for client %s " CRM_XS " state=%s", +@@ -1754,7 +1796,7 @@ call_remote_stonith(remote_fencing_op_t *op, peer_device_info_t *peer, int rc) + } + + op->state = st_failed; +- remote_op_done(op, NULL, rc, FALSE); ++ remote_op_done(op, NULL, pcmk_rc2legacy(stonith__result2rc(result)), FALSE); + + } else { + crm_info("Waiting for additional peers capable of fencing (%s) %s%s%s " +@@ -2004,6 +2046,7 @@ process_remote_stonith_query(xmlNode * msg) + peer_device_info_t *peer = NULL; + uint32_t replies_expected; + xmlNode *dev = get_xpath_object("//@" F_STONITH_REMOTE_OP_ID, msg, LOG_ERR); ++ pcmk__action_result_t result = PCMK__UNKNOWN_RESULT; + + CRM_CHECK(dev != NULL, return -EPROTO); + +@@ -2038,6 +2081,8 @@ process_remote_stonith_query(xmlNode * msg) + peer = add_result(op, host, ndevices, dev); + } + ++ pcmk__set_result(&result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); ++ + if (pcmk_is_set(op->call_options, st_opt_topology)) { + /* If we start the fencing before all the topology results are in, + * it is possible fencing levels will be skipped because of the missing +@@ -2045,12 +2090,12 @@ process_remote_stonith_query(xmlNode * msg) + if (op->state == st_query && all_topology_devices_found(op)) { + /* All the query results are in for the topology, start the fencing ops. 
*/ + crm_trace("All topology devices found"); +- call_remote_stonith(op, peer, pcmk_ok); ++ request_peer_fencing(op, peer, &result); + + } else if (have_all_replies) { + crm_info("All topology query replies have arrived, continuing (%d expected/%d received) ", + replies_expected, op->replies); +- call_remote_stonith(op, NULL, pcmk_ok); ++ request_peer_fencing(op, NULL, &result); + } + + } else if (op->state == st_query) { +@@ -2062,12 +2107,12 @@ process_remote_stonith_query(xmlNode * msg) + /* we have a verified device living on a peer that is not the target */ + crm_trace("Found %d verified device%s", + nverified, pcmk__plural_s(nverified)); +- call_remote_stonith(op, peer, pcmk_ok); ++ request_peer_fencing(op, peer, &result); + + } else if (have_all_replies) { + crm_info("All query replies have arrived, continuing (%d expected/%d received) ", + replies_expected, op->replies); +- call_remote_stonith(op, NULL, pcmk_ok); ++ request_peer_fencing(op, NULL, &result); + + } else { + crm_trace("Waiting for more peer results before launching fencing operation"); +@@ -2226,7 +2271,7 @@ fenced_process_fencing_reply(xmlNode *msg) + crm_trace("Next for %s on behalf of %s@%s (result was: %s)", + op->target, op->originator, op->client_name, + pcmk_exec_status_str(result.execution_status)); +- call_remote_stonith(op, NULL, pcmk_rc2legacy(stonith__result2rc(&result))); ++ request_peer_fencing(op, NULL, &result); + } + + gboolean +-- +2.27.0 + + +From 43e08ba7ee1635e47bfaf2a57636101c675b89ae Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Fri, 19 Nov 2021 16:02:04 -0600 +Subject: [PATCH 10/23] Feature: fencer: set exit reason for timeouts waiting + for peer replies + +--- + daemons/fenced/fenced_remote.c | 4 +++- + 1 file changed, 3 insertions(+), 1 deletion(-) + +diff --git a/daemons/fenced/fenced_remote.c b/daemons/fenced/fenced_remote.c +index 31d5ee6e93..415a7c1b98 100644 +--- a/daemons/fenced/fenced_remote.c ++++ b/daemons/fenced/fenced_remote.c +@@ -616,7 +616,9 @@ 
remote_op_timeout_one(gpointer userdata) + + crm_notice("Peer's '%s' action targeting %s for client %s timed out " CRM_XS + " id=%.8s", op->action, op->target, op->client_name, op->id); +- pcmk__set_result(&result, CRM_EX_ERROR, PCMK_EXEC_TIMEOUT, NULL); ++ pcmk__set_result(&result, CRM_EX_ERROR, PCMK_EXEC_TIMEOUT, ++ "Peer did not send fence result within timeout"); ++ + + // Try another device, if appropriate + request_peer_fencing(op, NULL, &result); +-- +2.27.0 + + +From 34e5baebac78b7235825b31bebc44e3d65ae45cc Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Fri, 19 Nov 2021 16:10:28 -0600 +Subject: [PATCH 11/23] Refactor: fencer: pass full result when handling + duplicate actions + +Rename handle_duplicates() to finalize_op_duplicates() for readability, and +make it take a full result rather than a legacy return code as an argument. +--- + daemons/fenced/fenced_remote.c | 29 +++++++++++++++++++++-------- + 1 file changed, 21 insertions(+), 8 deletions(-) + +diff --git a/daemons/fenced/fenced_remote.c b/daemons/fenced/fenced_remote.c +index 415a7c1b98..850bfb6eb3 100644 +--- a/daemons/fenced/fenced_remote.c ++++ b/daemons/fenced/fenced_remote.c +@@ -439,12 +439,19 @@ handle_local_reply_and_notify(remote_fencing_op_t * op, xmlNode * data, int rc) + free_xml(notify_data); + } + ++/*! 
++ * \internal ++ * \brief Finalize all duplicates of a given fencer operation ++ * ++ * \param[in] op Fencer operation that completed ++ * \param[in] data Top-level XML to add notification to ++ * \param[in] result Full operation result ++ */ + static void +-handle_duplicates(remote_fencing_op_t * op, xmlNode * data, int rc) ++finalize_op_duplicates(remote_fencing_op_t *op, xmlNode *data, ++ pcmk__action_result_t *result) + { +- GList *iter = NULL; +- +- for (iter = op->duplicates; iter != NULL; iter = iter->next) { ++ for (GList *iter = op->duplicates; iter != NULL; iter = iter->next) { + remote_fencing_op_t *other = iter->data; + + if (other->state == st_duplicate) { +@@ -452,8 +459,9 @@ handle_duplicates(remote_fencing_op_t * op, xmlNode * data, int rc) + crm_debug("Performing duplicate notification for %s@%s: %s " + CRM_XS " id=%.8s", + other->client_name, other->originator, +- pcmk_strerror(rc), other->id); +- remote_op_done(other, data, rc, TRUE); ++ pcmk_exec_status_str(result->execution_status), ++ other->id); ++ remote_op_done(other, data, pcmk_rc2legacy(stonith__result2rc(result)), TRUE); + + } else { + // Possible if (for example) it timed out already +@@ -570,8 +578,13 @@ remote_op_done(remote_fencing_op_t * op, xmlNode * data, int rc, int dup) + + handle_local_reply_and_notify(op, data, rc); + +- if (dup == FALSE) { +- handle_duplicates(op, data, rc); ++ if (!dup) { ++ pcmk__action_result_t result = PCMK__UNKNOWN_RESULT; ++ ++ pcmk__set_result(&result, ++ ((rc == pcmk_ok)? CRM_EX_OK : CRM_EX_ERROR), ++ stonith__legacy2status(rc), NULL); ++ finalize_op_duplicates(op, data, &result); + } + + /* Free non-essential parts of the record +-- +2.27.0 + + +From 939bd6f5f0f79b19d0cc4d869f3c8980fda2e461 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Fri, 19 Nov 2021 16:23:20 -0600 +Subject: [PATCH 12/23] Feature: fencer: set exit reasons for fencing timeouts + +finalize_timed_out_op() now takes an exit reason as an argument. 
+It is called for fencing timeouts, peer query reply timeouts, +and all capable nodes failing to fence. + +At this point, the exit reason is not used, but that is planned. +--- + daemons/fenced/fenced_remote.c | 25 +++++++++++++++---------- + 1 file changed, 15 insertions(+), 10 deletions(-) + +diff --git a/daemons/fenced/fenced_remote.c b/daemons/fenced/fenced_remote.c +index 850bfb6eb3..c10a32442e 100644 +--- a/daemons/fenced/fenced_remote.c ++++ b/daemons/fenced/fenced_remote.c +@@ -643,10 +643,12 @@ remote_op_timeout_one(gpointer userdata) + * \brief Finalize a remote fencer operation that timed out + * + * \param[in] op Fencer operation that timed out ++ * \param[in] reason Readable description of what step timed out + */ + static void +-finalize_timed_out_op(remote_fencing_op_t *op) ++finalize_timed_out_op(remote_fencing_op_t *op, const char *reason) + { ++ pcmk__action_result_t result = PCMK__UNKNOWN_RESULT; + + op->op_timer_total = 0; + +@@ -660,13 +662,13 @@ finalize_timed_out_op(remote_fencing_op_t *op) + * devices, and return success. + */ + op->state = st_done; +- remote_op_done(op, NULL, pcmk_ok, FALSE); +- return; ++ pcmk__set_result(&result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); ++ } else { ++ op->state = st_failed; ++ pcmk__set_result(&result, CRM_EX_ERROR, PCMK_EXEC_TIMEOUT, reason); + } +- +- op->state = st_failed; +- +- remote_op_done(op, NULL, -ETIME, FALSE); ++ remote_op_done(op, NULL, pcmk_rc2legacy(stonith__result2rc(&result)), FALSE); ++ pcmk__reset_result(&result); + } + + /*! 
+@@ -687,7 +689,8 @@ remote_op_timeout(gpointer userdata) + CRM_XS " id=%.8s", + op->action, op->target, op->client_name, op->id); + } else { +- finalize_timed_out_op(userdata); ++ finalize_timed_out_op(userdata, "Fencing could not be completed " ++ "within overall timeout"); + } + return G_SOURCE_REMOVE; + } +@@ -719,7 +722,8 @@ remote_op_query_timeout(gpointer data) + g_source_remove(op->op_timer_total); + op->op_timer_total = 0; + } +- finalize_timed_out_op(op); ++ finalize_timed_out_op(op, "No capable peers replied to device query " ++ "within timeout"); + } + + return FALSE; +@@ -1767,7 +1771,8 @@ request_peer_fencing(remote_fencing_op_t *op, peer_device_info_t *peer, + CRM_XS " state=%s", op->action, op->target, op->client_name, + stonith_op_state_str(op->state)); + CRM_CHECK(op->state < st_done, return); +- finalize_timed_out_op(op); ++ finalize_timed_out_op(op, "All nodes failed, or are unable, to " ++ "fence target"); + + } else if(op->replies >= op->replies_expected || op->replies >= fencing_active_peers()) { + /* if the operation never left the query state, +-- +2.27.0 + + +From b80b02799260feb98723a460f2f8e8ad5cdc467f Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Fri, 19 Nov 2021 16:32:04 -0600 +Subject: [PATCH 13/23] Refactor: fencer: pass full result when finalizing peer + fencing actions + +Rename remote_op_done() to finalize_op() for readability, and make it take a +full result as an argument, rather than a legacy return code. + +This does cause one change in behavior: when all topology levels fail, +the legacy return code returned will be -pcmk_err_generic instead of EINVAL. 
+--- + daemons/fenced/fenced_history.c | 2 +- + daemons/fenced/fenced_remote.c | 177 ++++++++++++++++++-------------- + 2 files changed, 103 insertions(+), 76 deletions(-) + +diff --git a/daemons/fenced/fenced_history.c b/daemons/fenced/fenced_history.c +index bc159383c2..9e38ff0a20 100644 +--- a/daemons/fenced/fenced_history.c ++++ b/daemons/fenced/fenced_history.c +@@ -374,7 +374,7 @@ stonith_local_history_diff_and_merge(GHashTable *remote_history, + set_fencing_completed(op); + /* use -EHOSTUNREACH to not introduce a new return-code that might + trigger unexpected results at other places and to prevent +- remote_op_done from setting the delegate if not present ++ finalize_op from setting the delegate if not present + */ + stonith_bcast_result_to_peers(op, -EHOSTUNREACH, FALSE); + } +diff --git a/daemons/fenced/fenced_remote.c b/daemons/fenced/fenced_remote.c +index c10a32442e..aefc5f311c 100644 +--- a/daemons/fenced/fenced_remote.c ++++ b/daemons/fenced/fenced_remote.c +@@ -76,13 +76,14 @@ typedef struct { + + GHashTable *stonith_remote_op_list = NULL; + +-static void remote_op_done(remote_fencing_op_t * op, xmlNode * data, int rc, int dup); + extern xmlNode *stonith_create_op(int call_id, const char *token, const char *op, xmlNode * data, + int call_options); + + static void request_peer_fencing(remote_fencing_op_t *op, + peer_device_info_t *peer, + pcmk__action_result_t *result); ++static void finalize_op(remote_fencing_op_t *op, xmlNode *data, ++ pcmk__action_result_t *result, bool dup); + static void report_timeout_period(remote_fencing_op_t * op, int op_timeout); + static int get_op_total_timeout(const remote_fencing_op_t *op, + const peer_device_info_t *chosen_peer); +@@ -461,7 +462,7 @@ finalize_op_duplicates(remote_fencing_op_t *op, xmlNode *data, + other->client_name, other->originator, + pcmk_exec_status_str(result->execution_status), + other->id); +- remote_op_done(other, data, pcmk_rc2legacy(stonith__result2rc(result)), TRUE); ++ finalize_op(other, 
data, result, true); + + } else { + // Possible if (for example) it timed out already +@@ -487,104 +488,100 @@ delegate_from_xml(xmlNode *xml) + + /*! + * \internal +- * \brief Finalize a remote operation. ++ * \brief Finalize a peer fencing operation + * +- * \description This function has two code paths. ++ * Clean up after a fencing operation completes. This function has two code ++ * paths: the executioner uses it to broadcast the result to CPG peers, and then ++ * each peer (including the executioner) uses it to process that broadcast and ++ * notify its IPC clients of the result. + * +- * Path 1. This node is the owner of the operation and needs +- * to notify the cpg group via a broadcast as to the operation's +- * results. +- * +- * Path 2. The cpg broadcast is received. All nodes notify their local +- * stonith clients the operation results. +- * +- * So, The owner of the operation first notifies the cluster of the result, +- * and once that cpg notify is received back it notifies all the local clients. +- * +- * Nodes that are passive watchers of the operation will receive the +- * broadcast and only need to notify their local clients the operation finished. +- * +- * \param op, The fencing operation to finalize +- * \param data, The xml msg reply (if present) of the last delegated fencing +- * operation. +- * \param dup, Is this operation a duplicate, if so treat it a little differently +- * making sure the broadcast is not sent out. 
++ * \param[in] op Fencer operation that completed ++ * \param[in] data If not NULL, XML reply of last delegated fencing operation ++ * \param[in] result Full operation result ++ * \param[in] dup Whether this operation is a duplicate of another ++ * (in which case, do not broadcast the result) + */ + static void +-remote_op_done(remote_fencing_op_t * op, xmlNode * data, int rc, int dup) ++finalize_op(remote_fencing_op_t *op, xmlNode *data, ++ pcmk__action_result_t *result, bool dup) + { + int level = LOG_ERR; + const char *subt = NULL; + xmlNode *local_data = NULL; + gboolean op_merged = FALSE; + ++ CRM_CHECK((op != NULL) && (result != NULL), return); ++ ++ if (op->notify_sent) { ++ // Most likely, this is a timed-out action that eventually completed ++ crm_notice("Operation '%s'%s%s by %s for %s@%s%s: " ++ "Result arrived too late " CRM_XS " id=%.8s", ++ op->action, (op->target? " targeting " : ""), ++ (op->target? op->target : ""), ++ (op->delegate? op->delegate : "unknown node"), ++ op->client_name, op->originator, ++ (op_merged? " (merged)" : ""), ++ op->id); ++ return; ++ } ++ + set_fencing_completed(op); + clear_remote_op_timers(op); + undo_op_remap(op); + +- if (op->notify_sent == TRUE) { +- crm_err("Already sent notifications for '%s' targeting %s by %s for " +- "client %s@%s: %s " CRM_XS " rc=%d state=%s id=%.8s", +- op->action, op->target, +- (op->delegate? 
op->delegate : "unknown node"), +- op->client_name, op->originator, pcmk_strerror(rc), +- rc, stonith_op_state_str(op->state), op->id); +- goto remote_op_done_cleanup; +- } +- + if (data == NULL) { + data = create_xml_node(NULL, "remote-op"); + local_data = data; + + } else if (op->delegate == NULL) { +- switch (rc) { +- case -ENODEV: +- case -EHOSTUNREACH: ++ switch (result->execution_status) { ++ case PCMK_EXEC_NO_FENCE_DEVICE: + break; ++ case PCMK_EXEC_INVALID: ++ if (result->exit_status == CRM_EX_EXPIRED) { ++ break; ++ } ++ // else fall through + default: + op->delegate = delegate_from_xml(data); + break; + } + } + +- if(dup) { +- op_merged = TRUE; +- } else if (crm_element_value(data, F_STONITH_MERGED)) { +- op_merged = TRUE; +- } ++ if (dup || (crm_element_value(data, F_STONITH_MERGED) != NULL)) { ++ op_merged = true; ++ } + + /* Tell everyone the operation is done, we will continue + * with doing the local notifications once we receive + * the broadcast back. */ + subt = crm_element_value(data, F_SUBTYPE); +- if (dup == FALSE && !pcmk__str_eq(subt, "broadcast", pcmk__str_casei)) { ++ if (!dup && !pcmk__str_eq(subt, "broadcast", pcmk__str_casei)) { + /* Defer notification until the bcast message arrives */ +- stonith_bcast_result_to_peers(op, rc, (op_merged? TRUE: FALSE)); +- goto remote_op_done_cleanup; ++ stonith_bcast_result_to_peers(op, pcmk_rc2legacy(stonith__result2rc(result)), op_merged); ++ free_xml(local_data); ++ return; + } + +- if (rc == pcmk_ok || dup) { +- level = LOG_NOTICE; +- } else if (!pcmk__str_eq(op->originator, stonith_our_uname, pcmk__str_casei)) { ++ if (pcmk__result_ok(result) || dup ++ || !pcmk__str_eq(op->originator, stonith_our_uname, pcmk__str_casei)) { + level = LOG_NOTICE; + } +- +- do_crm_log(level, "Operation '%s'%s%s by %s for %s@%s%s: %s " ++ do_crm_log(level, "Operation '%s'%s%s by %s for %s@%s%s: %s (%s%s%s) " + CRM_XS " id=%.8s", op->action, (op->target? " targeting " : ""), + (op->target? 
op->target : ""), + (op->delegate? op->delegate : "unknown node"), + op->client_name, op->originator, +- (op_merged? " (merged)" : ""), pcmk_strerror(rc), op->id); ++ (op_merged? " (merged)" : ""), crm_exit_str(result->exit_status), ++ pcmk_exec_status_str(result->execution_status), ++ ((result->exit_reason == NULL)? "" : ": "), ++ ((result->exit_reason == NULL)? "" : result->exit_reason), ++ op->id); + +- handle_local_reply_and_notify(op, data, rc); ++ handle_local_reply_and_notify(op, data, pcmk_rc2legacy(stonith__result2rc(result))); + + if (!dup) { +- pcmk__action_result_t result = PCMK__UNKNOWN_RESULT; +- +- pcmk__set_result(&result, +- ((rc == pcmk_ok)? CRM_EX_OK : CRM_EX_ERROR), +- stonith__legacy2status(rc), NULL); +- finalize_op_duplicates(op, data, &result); ++ finalize_op_duplicates(op, data, result); + } + + /* Free non-essential parts of the record +@@ -594,20 +591,27 @@ remote_op_done(remote_fencing_op_t * op, xmlNode * data, int rc, int dup) + g_list_free_full(op->query_results, free_remote_query); + op->query_results = NULL; + } +- + if (op->request) { + free_xml(op->request); + op->request = NULL; + } + +- remote_op_done_cleanup: + free_xml(local_data); + } + ++/*! 
++ * \internal ++ * \brief Finalize a watchdog fencer op after the waiting time expires ++ * ++ * \param[in] userdata Fencer operation that completed ++ * ++ * \return G_SOURCE_REMOVE (which tells glib not to restart timer) ++ */ + static gboolean + remote_op_watchdog_done(gpointer userdata) + { + remote_fencing_op_t *op = userdata; ++ pcmk__action_result_t result = PCMK__UNKNOWN_RESULT; + + op->op_timer_one = 0; + +@@ -615,8 +619,9 @@ remote_op_watchdog_done(gpointer userdata) + CRM_XS " id=%.8s", + op->action, op->target, op->client_name, op->id); + op->state = st_done; +- remote_op_done(op, NULL, pcmk_ok, FALSE); +- return FALSE; ++ pcmk__set_result(&result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); ++ finalize_op(op, NULL, &result, false); ++ return G_SOURCE_REMOVE; + } + + static gboolean +@@ -667,7 +672,7 @@ finalize_timed_out_op(remote_fencing_op_t *op, const char *reason) + op->state = st_failed; + pcmk__set_result(&result, CRM_EX_ERROR, PCMK_EXEC_TIMEOUT, reason); + } +- remote_op_done(op, NULL, pcmk_rc2legacy(stonith__result2rc(&result)), FALSE); ++ finalize_op(op, NULL, &result, false); + pcmk__reset_result(&result); + } + +@@ -1064,9 +1069,13 @@ fenced_handle_manual_confirmation(pcmk__client_t *client, xmlNode *msg) + set_fencing_completed(op); + op->delegate = strdup("a human"); + +- // For the fencer's purposes, the fencing operation is done ++ { ++ // For the fencer's purposes, the fencing operation is done ++ pcmk__action_result_t result = PCMK__UNKNOWN_RESULT; + +- remote_op_done(op, msg, pcmk_ok, FALSE); ++ pcmk__set_result(&result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); ++ finalize_op(op, msg, &result, false); ++ } + + /* For the requester's purposes, the operation is still pending. The + * actual result will be sent asynchronously via the operation's done_cb(). +@@ -1200,6 +1209,16 @@ create_remote_stonith_op(const char *client, xmlNode * request, gboolean peer) + return op; + } + ++/*! 
++ * \internal ++ * \brief Create a peer fencing operation from a request, and initiate it ++ * ++ * \param[in] client IPC client that made request (NULL to get from request) ++ * \param[in] request Request XML ++ * \param[in] manual_ack Whether this is a manual action confirmation ++ * ++ * \return Newly created operation on success, otherwise NULL ++ */ + remote_fencing_op_t * + initiate_remote_stonith_op(pcmk__client_t *client, xmlNode *request, + gboolean manual_ack) +@@ -1234,9 +1253,17 @@ initiate_remote_stonith_op(pcmk__client_t *client, xmlNode *request, + + switch (op->state) { + case st_failed: +- crm_warn("Could not request peer fencing (%s) targeting %s " +- CRM_XS " id=%.8s", op->action, op->target, op->id); +- remote_op_done(op, NULL, -EINVAL, FALSE); ++ // advance_topology_level() exhausted levels ++ { ++ pcmk__action_result_t result = PCMK__UNKNOWN_RESULT; ++ ++ pcmk__set_result(&result, CRM_EX_ERROR, PCMK_EXEC_ERROR, ++ "All topology levels failed"); ++ crm_warn("Could not request peer fencing (%s) targeting %s " ++ CRM_XS " id=%.8s", op->action, op->target, op->id); ++ finalize_op(op, NULL, &result, false); ++ pcmk__reset_result(&result); ++ } + return op; + + case st_duplicate: +@@ -1607,7 +1634,7 @@ advance_topology_device_in_level(remote_fencing_op_t *op, const char *device, + crm_trace("Marking complex fencing op targeting %s as complete", + op->target); + op->state = st_done; +- remote_op_done(op, msg, pcmk_ok, FALSE); ++ finalize_op(op, msg, &result, false); + } + } + +@@ -1805,7 +1832,7 @@ request_peer_fencing(remote_fencing_op_t *op, peer_device_info_t *peer, + PCMK_EXEC_NO_FENCE_DEVICE, NULL); + } + /* ... else use result provided by caller -- overwriting it with +- PCMK_EXEC_NO_FENCE_DEVICE would prevent remote_op_done() from ++ PCMK_EXEC_NO_FENCE_DEVICE would prevent finalize_op() from + setting the correct delegate if needed. 
+ */ + +@@ -1816,7 +1843,7 @@ request_peer_fencing(remote_fencing_op_t *op, peer_device_info_t *peer, + } + + op->state = st_failed; +- remote_op_done(op, NULL, pcmk_rc2legacy(stonith__result2rc(result)), FALSE); ++ finalize_op(op, NULL, result, false); + + } else { + crm_info("Waiting for additional peers capable of fencing (%s) %s%s%s " +@@ -2216,7 +2243,7 @@ fenced_process_fencing_reply(xmlNode *msg) + } else { + op->state = st_failed; + } +- remote_op_done(op, msg, pcmk_rc2legacy(stonith__result2rc(&result)), FALSE); ++ finalize_op(op, msg, &result, false); + return; + } else if (!pcmk__str_eq(op->originator, stonith_our_uname, pcmk__str_casei)) { + /* If this isn't a remote level broadcast, and we are not the +@@ -2241,7 +2268,7 @@ fenced_process_fencing_reply(xmlNode *msg) + /* We own the op, and it is complete. broadcast the result to all nodes + * and notify our local clients. */ + if (op->state == st_done) { +- remote_op_done(op, msg, pcmk_rc2legacy(stonith__result2rc(&result)), FALSE); ++ finalize_op(op, msg, &result, false); + return; + } + +@@ -2268,20 +2295,20 @@ fenced_process_fencing_reply(xmlNode *msg) + * levels are available, mark this operation as failed and report results. */ + if (advance_topology_level(op, false) != pcmk_rc_ok) { + op->state = st_failed; +- remote_op_done(op, msg, pcmk_rc2legacy(stonith__result2rc(&result)), FALSE); ++ finalize_op(op, msg, &result, false); + return; + } + } + } else if (pcmk__result_ok(&result) && (op->devices == NULL)) { + crm_trace("All done for %s", op->target); + op->state = st_done; +- remote_op_done(op, msg, pcmk_rc2legacy(stonith__result2rc(&result)), FALSE); ++ finalize_op(op, msg, &result, false); + return; + } else if ((result.execution_status == PCMK_EXEC_TIMEOUT) + && (op->devices == NULL)) { + /* If the operation timed out don't bother retrying other peers. 
*/ + op->state = st_failed; +- remote_op_done(op, msg, pcmk_rc2legacy(stonith__result2rc(&result)), FALSE); ++ finalize_op(op, msg, &result, false); + return; + } else { + /* fall-through and attempt other fencing action using another peer */ +-- +2.27.0 + + +From 8f19c09f1b961ba9aa510b7dcd1875bbabcddcdc Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Fri, 19 Nov 2021 16:39:23 -0600 +Subject: [PATCH 14/23] Refactor: fencer: pass full result when broadcasting + replies + +Rename stonith_bcast_result_to_peers() to fenced_broadcast_op_result() for +consistency, and make it take the full result as an argument instead of a +legacy return code. The full result is not yet used, but that is planned. +--- + daemons/fenced/fenced_history.c | 18 ++++++++++++------ + daemons/fenced/fenced_remote.c | 15 ++++++++++++--- + daemons/fenced/pacemaker-fenced.h | 9 ++------- + 3 files changed, 26 insertions(+), 16 deletions(-) + +diff --git a/daemons/fenced/fenced_history.c b/daemons/fenced/fenced_history.c +index 9e38ff0a20..1e07a9815a 100644 +--- a/daemons/fenced/fenced_history.c ++++ b/daemons/fenced/fenced_history.c +@@ -359,24 +359,29 @@ stonith_local_history_diff_and_merge(GHashTable *remote_history, + } + + if (remote_history) { ++ pcmk__action_result_t result = PCMK__UNKNOWN_RESULT; ++ + init_stonith_remote_op_hash_table(&stonith_remote_op_list); + + updated |= g_hash_table_size(remote_history); + + g_hash_table_iter_init(&iter, remote_history); + while (g_hash_table_iter_next(&iter, NULL, (void **)&op)) { +- + if (stonith__op_state_pending(op->state) && + pcmk__str_eq(op->originator, stonith_our_uname, pcmk__str_casei)) { ++ + crm_warn("Failing pending operation %.8s originated by us but " + "known only from peer history", op->id); + op->state = st_failed; + set_fencing_completed(op); +- /* use -EHOSTUNREACH to not introduce a new return-code that might +- trigger unexpected results at other places and to prevent +- finalize_op from setting the delegate if not present +- 
*/ +- stonith_bcast_result_to_peers(op, -EHOSTUNREACH, FALSE); ++ ++ /* CRM_EX_EXPIRED + PCMK_EXEC_INVALID prevents finalize_op() ++ * from setting a delegate ++ */ ++ pcmk__set_result(&result, CRM_EX_EXPIRED, PCMK_EXEC_INVALID, ++ "Initiated by earlier fencer " ++ "process and presumed failed"); ++ fenced_broadcast_op_result(op, &result, false); + } + + g_hash_table_iter_steal(&iter); +@@ -391,6 +396,7 @@ stonith_local_history_diff_and_merge(GHashTable *remote_history, + */ + } + ++ pcmk__reset_result(&result); + g_hash_table_destroy(remote_history); /* remove what is left */ + } + +diff --git a/daemons/fenced/fenced_remote.c b/daemons/fenced/fenced_remote.c +index aefc5f311c..a0f026c790 100644 +--- a/daemons/fenced/fenced_remote.c ++++ b/daemons/fenced/fenced_remote.c +@@ -374,12 +374,21 @@ create_op_done_notify(remote_fencing_op_t * op, int rc) + return notify_data; + } + ++/*! ++ * \internal ++ * \brief Broadcast a fence result notification to all CPG peers ++ * ++ * \param[in] op Fencer operation that completed ++ * \param[in] result Full operation result ++ * \param[in] op_merged Whether this operation is a duplicate of another ++ */ + void +-stonith_bcast_result_to_peers(remote_fencing_op_t * op, int rc, gboolean op_merged) ++fenced_broadcast_op_result(remote_fencing_op_t *op, ++ pcmk__action_result_t *result, bool op_merged) + { + static int count = 0; + xmlNode *bcast = create_xml_node(NULL, T_STONITH_REPLY); +- xmlNode *notify_data = create_op_done_notify(op, rc); ++ xmlNode *notify_data = create_op_done_notify(op, pcmk_rc2legacy(stonith__result2rc(result))); + + count++; + crm_trace("Broadcasting result to peers"); +@@ -558,7 +567,7 @@ finalize_op(remote_fencing_op_t *op, xmlNode *data, + subt = crm_element_value(data, F_SUBTYPE); + if (!dup && !pcmk__str_eq(subt, "broadcast", pcmk__str_casei)) { + /* Defer notification until the bcast message arrives */ +- stonith_bcast_result_to_peers(op, pcmk_rc2legacy(stonith__result2rc(result)), op_merged); ++ 
fenced_broadcast_op_result(op, result, op_merged); + free_xml(local_data); + return; + } +diff --git a/daemons/fenced/pacemaker-fenced.h b/daemons/fenced/pacemaker-fenced.h +index d5f4bc79fd..ed47ab046c 100644 +--- a/daemons/fenced/pacemaker-fenced.h ++++ b/daemons/fenced/pacemaker-fenced.h +@@ -153,13 +153,8 @@ typedef struct remote_fencing_op_s { + + } remote_fencing_op_t; + +-/*! +- * \internal +- * \brief Broadcast the result of an operation to the peers. +- * \param op, Operation whose result should be broadcast +- * \param rc, Result of the operation +- */ +-void stonith_bcast_result_to_peers(remote_fencing_op_t * op, int rc, gboolean op_merged); ++void fenced_broadcast_op_result(remote_fencing_op_t *op, ++ pcmk__action_result_t *result, bool op_merged); + + // Fencer-specific client flags + enum st_client_flags { +-- +2.27.0 + + +From 3396e66b4c9cca895c7412b66159fd2342de1911 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Fri, 19 Nov 2021 16:42:46 -0600 +Subject: [PATCH 15/23] Feature: fencer: add full result to local replies + +handle_local_reply_and_notify() now takes the full result as an argument +instead of a legacy return code, and adds it to the reply to the local +requester. It does not add it to notifications yet, but that is planned. +--- + daemons/fenced/fenced_remote.c | 26 ++++++++++++++------------ + 1 file changed, 14 insertions(+), 12 deletions(-) + +diff --git a/daemons/fenced/fenced_remote.c b/daemons/fenced/fenced_remote.c +index a0f026c790..329e06c444 100644 +--- a/daemons/fenced/fenced_remote.c ++++ b/daemons/fenced/fenced_remote.c +@@ -409,8 +409,17 @@ fenced_broadcast_op_result(remote_fencing_op_t *op, + return; + } + ++/*! 
++ * \internal ++ * \brief Reply to a local request originator and notify all subscribed clients ++ * ++ * \param[in] op Fencer operation that completed ++ * \param[in] data Top-level XML to add notification to ++ * \param[in] result Full operation result ++ */ + static void +-handle_local_reply_and_notify(remote_fencing_op_t * op, xmlNode * data, int rc) ++handle_local_reply_and_notify(remote_fencing_op_t *op, xmlNode *data, ++ pcmk__action_result_t *result) + { + xmlNode *notify_data = NULL; + xmlNode *reply = NULL; +@@ -421,26 +430,19 @@ handle_local_reply_and_notify(remote_fencing_op_t * op, xmlNode * data, int rc) + } + + /* Do notification with a clean data object */ +- notify_data = create_op_done_notify(op, rc); ++ notify_data = create_op_done_notify(op, pcmk_rc2legacy(stonith__result2rc(result))); + crm_xml_add_int(data, "state", op->state); + crm_xml_add(data, F_STONITH_TARGET, op->target); + crm_xml_add(data, F_STONITH_OPERATION, op->action); + +- { +- pcmk__action_result_t result = PCMK__UNKNOWN_RESULT; +- +- pcmk__set_result(&result, +- ((rc == pcmk_ok)? CRM_EX_OK : CRM_EX_ERROR), +- stonith__legacy2status(rc), NULL); +- reply = fenced_construct_reply(op->request, data, &result); +- } ++ reply = fenced_construct_reply(op->request, data, result); + crm_xml_add(reply, F_STONITH_DELEGATE, op->delegate); + + /* Send fencing OP reply to local client that initiated fencing */ + do_local_reply(reply, op->client_id, op->call_options & st_opt_sync_call, FALSE); + + /* bcast to all local clients that the fencing operation happend */ +- do_stonith_notify(T_STONITH_NOTIFY_FENCE, rc, notify_data); ++ do_stonith_notify(T_STONITH_NOTIFY_FENCE, pcmk_rc2legacy(stonith__result2rc(result)), notify_data); + do_stonith_notify(T_STONITH_NOTIFY_HISTORY, pcmk_ok, NULL); + + /* mark this op as having notify's already sent */ +@@ -587,7 +589,7 @@ finalize_op(remote_fencing_op_t *op, xmlNode *data, + ((result->exit_reason == NULL)? 
"" : result->exit_reason), + op->id); + +- handle_local_reply_and_notify(op, data, pcmk_rc2legacy(stonith__result2rc(result))); ++ handle_local_reply_and_notify(op, data, result); + + if (!dup) { + finalize_op_duplicates(op, data, result); +-- +2.27.0 + + +From 004583f3ef908cbd9dc6305597cb55d5ad22882c Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Fri, 19 Nov 2021 16:47:13 -0600 +Subject: [PATCH 16/23] Refactor: fencer: pass full result when sending device + notifications + +Rename do_stonith_notify_device() to fenced_send_device_notification() for +consistency, and make it take the full result as an argument rather than a +legacy return code. The full result is not used yet, but that is planned. +--- + daemons/fenced/fenced_commands.c | 4 ++-- + daemons/fenced/pacemaker-fenced.c | 15 +++++++++++++-- + daemons/fenced/pacemaker-fenced.h | 4 +++- + 3 files changed, 18 insertions(+), 5 deletions(-) + +diff --git a/daemons/fenced/fenced_commands.c b/daemons/fenced/fenced_commands.c +index 84f89e8daf..86a761dfab 100644 +--- a/daemons/fenced/fenced_commands.c ++++ b/daemons/fenced/fenced_commands.c +@@ -3190,7 +3190,7 @@ handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, + PCMK_EXEC_INVALID, + "Unprivileged users must register device via CIB"); + } +- do_stonith_notify_device(op, pcmk_rc2legacy(stonith__result2rc(&result)), device_id); ++ fenced_send_device_notification(op, &result, device_id); + + } else if (pcmk__str_eq(op, STONITH_OP_DEVICE_DEL, pcmk__str_none)) { + xmlNode *dev = get_xpath_object("//" F_STONITH_DEVICE, request, LOG_ERR); +@@ -3204,7 +3204,7 @@ handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, + PCMK_EXEC_INVALID, + "Unprivileged users must delete device via CIB"); + } +- do_stonith_notify_device(op, pcmk_rc2legacy(stonith__result2rc(&result)), device_id); ++ fenced_send_device_notification(op, &result, device_id); + + } else if (pcmk__str_eq(op, STONITH_OP_LEVEL_ADD, pcmk__str_none)) { + char *device_id = NULL; 
+diff --git a/daemons/fenced/pacemaker-fenced.c b/daemons/fenced/pacemaker-fenced.c +index 56acc93f31..42e167ce78 100644 +--- a/daemons/fenced/pacemaker-fenced.c ++++ b/daemons/fenced/pacemaker-fenced.c +@@ -394,10 +394,21 @@ do_stonith_notify_config(const char *op, int rc, + free_xml(notify_data); + } + ++/*! ++ * \internal ++ * \brief Send notifications for a device change to subscribed clients ++ * ++ * \param[in] op Notification type (STONITH_OP_DEVICE_ADD or ++ * STONITH_OP_DEVICE_DEL) ++ * \param[in] result Operation result ++ * \param[in] desc ID of device that changed ++ */ + void +-do_stonith_notify_device(const char *op, int rc, const char *desc) ++fenced_send_device_notification(const char *op, ++ const pcmk__action_result_t *result, ++ const char *desc) + { +- do_stonith_notify_config(op, rc, desc, g_hash_table_size(device_list)); ++ do_stonith_notify_config(op, pcmk_rc2legacy(stonith__result2rc(result)), desc, g_hash_table_size(device_list)); + } + + void +diff --git a/daemons/fenced/pacemaker-fenced.h b/daemons/fenced/pacemaker-fenced.h +index ed47ab046c..0b63680171 100644 +--- a/daemons/fenced/pacemaker-fenced.h ++++ b/daemons/fenced/pacemaker-fenced.h +@@ -230,7 +230,9 @@ void + do_stonith_async_timeout_update(const char *client, const char *call_id, int timeout); + + void do_stonith_notify(const char *type, int result, xmlNode *data); +-void do_stonith_notify_device(const char *op, int rc, const char *desc); ++void fenced_send_device_notification(const char *op, ++ const pcmk__action_result_t *result, ++ const char *desc); + void do_stonith_notify_level(const char *op, int rc, const char *desc); + + remote_fencing_op_t *initiate_remote_stonith_op(pcmk__client_t *client, +-- +2.27.0 + + +From ee0777d5ca99d8d2d7805d4a73241ab696c68751 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Fri, 19 Nov 2021 16:51:55 -0600 +Subject: [PATCH 17/23] Refactor: fencer: pass full result when sending + topology notifications + +Rename do_stonith_notify_level() to 
fenced_send_level_notification() for +consistency, and make it take the full result as an argument rather than a +legacy return code. The full result is not used yet, but that is planned. +--- + daemons/fenced/fenced_commands.c | 4 ++-- + daemons/fenced/pacemaker-fenced.c | 21 +++++++++++++++------ + daemons/fenced/pacemaker-fenced.h | 4 +++- + 3 files changed, 20 insertions(+), 9 deletions(-) + +diff --git a/daemons/fenced/fenced_commands.c b/daemons/fenced/fenced_commands.c +index 86a761dfab..2f3dbb035a 100644 +--- a/daemons/fenced/fenced_commands.c ++++ b/daemons/fenced/fenced_commands.c +@@ -3216,7 +3216,7 @@ handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, + PCMK_EXEC_INVALID, + "Unprivileged users must add level via CIB"); + } +- do_stonith_notify_level(op, pcmk_rc2legacy(stonith__result2rc(&result)), device_id); ++ fenced_send_level_notification(op, &result, device_id); + free(device_id); + + } else if (pcmk__str_eq(op, STONITH_OP_LEVEL_DEL, pcmk__str_none)) { +@@ -3229,7 +3229,7 @@ handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, + PCMK_EXEC_INVALID, + "Unprivileged users must delete level via CIB"); + } +- do_stonith_notify_level(op, pcmk_rc2legacy(stonith__result2rc(&result)), device_id); ++ fenced_send_level_notification(op, &result, device_id); + + } else if(pcmk__str_eq(op, CRM_OP_RM_NODE_CACHE, pcmk__str_casei)) { + int node_id = 0; +diff --git a/daemons/fenced/pacemaker-fenced.c b/daemons/fenced/pacemaker-fenced.c +index 42e167ce78..773cf57f6b 100644 +--- a/daemons/fenced/pacemaker-fenced.c ++++ b/daemons/fenced/pacemaker-fenced.c +@@ -411,10 +411,21 @@ fenced_send_device_notification(const char *op, + do_stonith_notify_config(op, pcmk_rc2legacy(stonith__result2rc(result)), desc, g_hash_table_size(device_list)); + } + ++/*! 
++ * \internal ++ * \brief Send notifications for a topology level change to subscribed clients ++ * ++ * \param[in] op Notification type (STONITH_OP_LEVEL_ADD or ++ * STONITH_OP_LEVEL_DEL) ++ * \param[in] result Operation result ++ * \param[in] desc String representation of level (<target>[<level_index>]) ++ */ + void +-do_stonith_notify_level(const char *op, int rc, const char *desc) ++fenced_send_level_notification(const char *op, ++ const pcmk__action_result_t *result, ++ const char *desc) + { +- do_stonith_notify_config(op, rc, desc, g_hash_table_size(topology)); ++ do_stonith_notify_config(op, pcmk_rc2legacy(stonith__result2rc(result)), desc, g_hash_table_size(topology)); + } + + static void +@@ -429,8 +440,7 @@ topology_remove_helper(const char *node, int level) + crm_xml_add(data, XML_ATTR_STONITH_TARGET, node); + + fenced_unregister_level(data, &desc, &result); +- do_stonith_notify_level(STONITH_OP_LEVEL_DEL, +- pcmk_rc2legacy(stonith__result2rc(&result)), desc); ++ fenced_send_level_notification(STONITH_OP_LEVEL_DEL, &result, desc); + pcmk__reset_result(&result); + free_xml(data); + free(desc); +@@ -480,8 +490,7 @@ handle_topology_change(xmlNode *match, bool remove) + } + + fenced_register_level(match, &desc, &result); +- do_stonith_notify_level(STONITH_OP_LEVEL_ADD, +- pcmk_rc2legacy(stonith__result2rc(&result)), desc); ++ fenced_send_level_notification(STONITH_OP_LEVEL_ADD, &result, desc); + pcmk__reset_result(&result); + free(desc); + } +diff --git a/daemons/fenced/pacemaker-fenced.h b/daemons/fenced/pacemaker-fenced.h +index 0b63680171..8503e813bf 100644 +--- a/daemons/fenced/pacemaker-fenced.h ++++ b/daemons/fenced/pacemaker-fenced.h +@@ -233,7 +233,9 @@ void do_stonith_notify(const char *type, int result, xmlNode *data); + void fenced_send_device_notification(const char *op, + const pcmk__action_result_t *result, + const char *desc); +-void do_stonith_notify_level(const char *op, int rc, const char *desc); ++void fenced_send_level_notification(const char *op, ++ const
pcmk__action_result_t *result, ++ const char *desc); + + remote_fencing_op_t *initiate_remote_stonith_op(pcmk__client_t *client, + xmlNode *request, +-- +2.27.0 + + +From deec1ea9bcd7e0062755aa8b74358bfd12e4b9f0 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Fri, 19 Nov 2021 16:53:26 -0600 +Subject: [PATCH 18/23] Refactor: fencer: pass full result when sending + configuration notifications + +Rename do_stonith_notify_config() to send_config_notification() for +consistency, and make it take the full result as an argument rather than a +legacy return code. The full result is not used yet, but that is planned. +--- + daemons/fenced/pacemaker-fenced.c | 19 +++++++++++++++---- + 1 file changed, 15 insertions(+), 4 deletions(-) + +diff --git a/daemons/fenced/pacemaker-fenced.c b/daemons/fenced/pacemaker-fenced.c +index 773cf57f6b..d64358e07f 100644 +--- a/daemons/fenced/pacemaker-fenced.c ++++ b/daemons/fenced/pacemaker-fenced.c +@@ -379,8 +379,19 @@ do_stonith_notify(const char *type, int result, xmlNode *data) + crm_trace("Notify complete"); + } + ++/*! 
++ * \internal ++ * \brief Send notifications for a configuration change to subscribed clients ++ * ++ * \param[in] op Notification type (STONITH_OP_DEVICE_ADD, ++ * STONITH_OP_DEVICE_DEL, STONITH_OP_LEVEL_ADD, or ++ * STONITH_OP_LEVEL_DEL) ++ * \param[in] result Operation result ++ * \param[in] desc Description of what changed ++ * \param[in] active Current number of devices or topologies in use ++ */ + static void +-do_stonith_notify_config(const char *op, int rc, ++send_config_notification(const char *op, const pcmk__action_result_t *result, + const char *desc, int active) + { + xmlNode *notify_data = create_xml_node(NULL, op); +@@ -390,7 +401,7 @@ do_stonith_notify_config(const char *op, int rc, + crm_xml_add(notify_data, F_STONITH_DEVICE, desc); + crm_xml_add_int(notify_data, F_STONITH_ACTIVE, active); + +- do_stonith_notify(op, rc, notify_data); ++ do_stonith_notify(op, pcmk_rc2legacy(stonith__result2rc(result)), notify_data); + free_xml(notify_data); + } + +@@ -408,7 +419,7 @@ fenced_send_device_notification(const char *op, + const pcmk__action_result_t *result, + const char *desc) + { +- do_stonith_notify_config(op, pcmk_rc2legacy(stonith__result2rc(result)), desc, g_hash_table_size(device_list)); ++ send_config_notification(op, result, desc, g_hash_table_size(device_list)); + } + + /*! 
+@@ -425,7 +436,7 @@ fenced_send_level_notification(const char *op, + const pcmk__action_result_t *result, + const char *desc) + { +- do_stonith_notify_config(op, pcmk_rc2legacy(stonith__result2rc(result)), desc, g_hash_table_size(topology)); ++ send_config_notification(op, result, desc, g_hash_table_size(topology)); + } + + static void +-- +2.27.0 + + +From 432e4445b630fb158482a5f6de1e0e41697a381f Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Fri, 19 Nov 2021 16:56:12 -0600 +Subject: [PATCH 19/23] Feature: fencer: pass full result when sending + notifications + +Rename do_stonith_notify() to fenced_send_notification() for consistency, and +make it take the full result as an argument rather than a legacy return code, +and add the full result to the notifications. +--- + daemons/fenced/fenced_commands.c | 4 ++-- + daemons/fenced/fenced_history.c | 6 +++--- + daemons/fenced/fenced_remote.c | 6 +++--- + daemons/fenced/pacemaker-fenced.c | 15 ++++++++++++--- + daemons/fenced/pacemaker-fenced.h | 4 +++- + 5 files changed, 23 insertions(+), 12 deletions(-) + +diff --git a/daemons/fenced/fenced_commands.c b/daemons/fenced/fenced_commands.c +index 2f3dbb035a..54ebc12947 100644 +--- a/daemons/fenced/fenced_commands.c ++++ b/daemons/fenced/fenced_commands.c +@@ -2489,8 +2489,8 @@ send_async_reply(async_command_t *cmd, const pcmk__action_result_t *result, + crm_xml_add(notify_data, F_STONITH_REMOTE_OP_ID, cmd->remote_op_id); + crm_xml_add(notify_data, F_STONITH_ORIGIN, cmd->client); + +- do_stonith_notify(T_STONITH_NOTIFY_FENCE, pcmk_rc2legacy(stonith__result2rc(result)), notify_data); +- do_stonith_notify(T_STONITH_NOTIFY_HISTORY, pcmk_ok, NULL); ++ fenced_send_notification(T_STONITH_NOTIFY_FENCE, result, notify_data); ++ fenced_send_notification(T_STONITH_NOTIFY_HISTORY, NULL, NULL); + } + } + +diff --git a/daemons/fenced/fenced_history.c b/daemons/fenced/fenced_history.c +index 1e07a9815a..44310ed77b 100644 +--- a/daemons/fenced/fenced_history.c ++++ 
b/daemons/fenced/fenced_history.c +@@ -100,7 +100,7 @@ stonith_fence_history_cleanup(const char *target, + g_hash_table_foreach_remove(stonith_remote_op_list, + stonith_remove_history_entry, + (gpointer) target); +- do_stonith_notify(T_STONITH_NOTIFY_HISTORY, pcmk_ok, NULL); ++ fenced_send_notification(T_STONITH_NOTIFY_HISTORY, NULL, NULL); + } + } + +@@ -402,7 +402,7 @@ stonith_local_history_diff_and_merge(GHashTable *remote_history, + + if (updated) { + stonith_fence_history_trim(); +- do_stonith_notify(T_STONITH_NOTIFY_HISTORY, pcmk_ok, NULL); ++ fenced_send_notification(T_STONITH_NOTIFY_HISTORY, NULL, NULL); + } + + if (cnt == 0) { +@@ -473,7 +473,7 @@ stonith_fence_history(xmlNode *msg, xmlNode **output, + is done so send a notification for anything + that smells like history-sync + */ +- do_stonith_notify(T_STONITH_NOTIFY_HISTORY_SYNCED, pcmk_ok, NULL); ++ fenced_send_notification(T_STONITH_NOTIFY_HISTORY_SYNCED, NULL, NULL); + if (crm_element_value(msg, F_STONITH_CALLID)) { + /* this is coming from the stonith-API + * +diff --git a/daemons/fenced/fenced_remote.c b/daemons/fenced/fenced_remote.c +index 329e06c444..16c181b4b0 100644 +--- a/daemons/fenced/fenced_remote.c ++++ b/daemons/fenced/fenced_remote.c +@@ -442,8 +442,8 @@ handle_local_reply_and_notify(remote_fencing_op_t *op, xmlNode *data, + do_local_reply(reply, op->client_id, op->call_options & st_opt_sync_call, FALSE); + + /* bcast to all local clients that the fencing operation happend */ +- do_stonith_notify(T_STONITH_NOTIFY_FENCE, pcmk_rc2legacy(stonith__result2rc(result)), notify_data); +- do_stonith_notify(T_STONITH_NOTIFY_HISTORY, pcmk_ok, NULL); ++ fenced_send_notification(T_STONITH_NOTIFY_FENCE, result, notify_data); ++ fenced_send_notification(T_STONITH_NOTIFY_HISTORY, NULL, NULL); + + /* mark this op as having notify's already sent */ + op->notify_sent = TRUE; +@@ -1211,7 +1211,7 @@ create_remote_stonith_op(const char *client, xmlNode * request, gboolean peer) + + if (op->state != 
st_duplicate) { + /* kick history readers */ +- do_stonith_notify(T_STONITH_NOTIFY_HISTORY, pcmk_ok, NULL); ++ fenced_send_notification(T_STONITH_NOTIFY_HISTORY, NULL, NULL); + } + + /* safe to trim as long as that doesn't touch pending ops */ +diff --git a/daemons/fenced/pacemaker-fenced.c b/daemons/fenced/pacemaker-fenced.c +index d64358e07f..6b31b814a3 100644 +--- a/daemons/fenced/pacemaker-fenced.c ++++ b/daemons/fenced/pacemaker-fenced.c +@@ -356,8 +356,17 @@ do_stonith_async_timeout_update(const char *client_id, const char *call_id, int + free_xml(notify_data); + } + ++/*! ++ * \internal ++ * \brief Notify relevant IPC clients of a fencing operation result ++ * ++ * \param[in] type Notification type ++ * \param[in] result Result of fencing operation (assume success if NULL) ++ * \param[in] data If not NULL, add to notification as call data ++ */ + void +-do_stonith_notify(const char *type, int result, xmlNode *data) ++fenced_send_notification(const char *type, const pcmk__action_result_t *result, ++ xmlNode *data) + { + /* TODO: Standardize the contents of data */ + xmlNode *update_msg = create_xml_node(NULL, "notify"); +@@ -367,7 +376,7 @@ do_stonith_notify(const char *type, int result, xmlNode *data) + crm_xml_add(update_msg, F_TYPE, T_STONITH_NOTIFY); + crm_xml_add(update_msg, F_SUBTYPE, type); + crm_xml_add(update_msg, F_STONITH_OPERATION, type); +- crm_xml_add_int(update_msg, F_STONITH_RC, result); ++ stonith__xe_set_result(update_msg, result); + + if (data != NULL) { + add_message_xml(update_msg, F_STONITH_CALLDATA, data); +@@ -401,7 +410,7 @@ send_config_notification(const char *op, const pcmk__action_result_t *result, + crm_xml_add(notify_data, F_STONITH_DEVICE, desc); + crm_xml_add_int(notify_data, F_STONITH_ACTIVE, active); + +- do_stonith_notify(op, pcmk_rc2legacy(stonith__result2rc(result)), notify_data); ++ fenced_send_notification(op, result, notify_data); + free_xml(notify_data); + } + +diff --git a/daemons/fenced/pacemaker-fenced.h 
b/daemons/fenced/pacemaker-fenced.h +index 8503e813bf..502fcc9a29 100644 +--- a/daemons/fenced/pacemaker-fenced.h ++++ b/daemons/fenced/pacemaker-fenced.h +@@ -229,7 +229,9 @@ xmlNode *fenced_construct_reply(xmlNode *request, xmlNode *data, + void + do_stonith_async_timeout_update(const char *client, const char *call_id, int timeout); + +-void do_stonith_notify(const char *type, int result, xmlNode *data); ++void fenced_send_notification(const char *type, ++ const pcmk__action_result_t *result, ++ xmlNode *data); + void fenced_send_device_notification(const char *op, + const pcmk__action_result_t *result, + const char *desc); +-- +2.27.0 + + +From 86deababe506c2bb8259538e5380b6a78dc4b770 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Fri, 19 Nov 2021 16:58:03 -0600 +Subject: [PATCH 20/23] Feature: fencer: pass full result when sending + notifications + +Rename create_op_done_notify() to fencing_result2xml() for readability, +make it take the full result as an argument rather than a legacy return code, +and add the full result to broadcasts and notifications. +--- + daemons/fenced/fenced_remote.c | 20 +++++++++++++++----- + 1 file changed, 15 insertions(+), 5 deletions(-) + +diff --git a/daemons/fenced/fenced_remote.c b/daemons/fenced/fenced_remote.c +index 16c181b4b0..4cf723e6df 100644 +--- a/daemons/fenced/fenced_remote.c ++++ b/daemons/fenced/fenced_remote.c +@@ -356,13 +356,22 @@ undo_op_remap(remote_fencing_op_t *op) + } + } + ++/*! ++ * \internal ++ * \brief Create notification data XML for a fencing operation result ++ * ++ * \param[in] op Fencer operation that completed ++ * \param[in] result Full operation result ++ * ++ * \return Newly created XML to add as notification data ++ * \note The caller is responsible for freeing the returned XML.
++ */ + static xmlNode * +-create_op_done_notify(remote_fencing_op_t * op, int rc) ++fencing_result2xml(remote_fencing_op_t *op, pcmk__action_result_t *result) + { + xmlNode *notify_data = create_xml_node(NULL, T_STONITH_NOTIFY_FENCE); + + crm_xml_add_int(notify_data, "state", op->state); +- crm_xml_add_int(notify_data, F_STONITH_RC, rc); + crm_xml_add(notify_data, F_STONITH_TARGET, op->target); + crm_xml_add(notify_data, F_STONITH_ACTION, op->action); + crm_xml_add(notify_data, F_STONITH_DELEGATE, op->delegate); +@@ -371,6 +380,7 @@ create_op_done_notify(remote_fencing_op_t * op, int rc) + crm_xml_add(notify_data, F_STONITH_CLIENTID, op->client_id); + crm_xml_add(notify_data, F_STONITH_CLIENTNAME, op->client_name); + ++ stonith__xe_set_result(notify_data, result); + return notify_data; + } + +@@ -388,7 +398,7 @@ fenced_broadcast_op_result(remote_fencing_op_t *op, + { + static int count = 0; + xmlNode *bcast = create_xml_node(NULL, T_STONITH_REPLY); +- xmlNode *notify_data = create_op_done_notify(op, pcmk_rc2legacy(stonith__result2rc(result))); ++ xmlNode *notify_data = fencing_result2xml(op, result); + + count++; + crm_trace("Broadcasting result to peers"); +@@ -430,7 +440,6 @@ handle_local_reply_and_notify(remote_fencing_op_t *op, xmlNode *data, + } + + /* Do notification with a clean data object */ +- notify_data = create_op_done_notify(op, pcmk_rc2legacy(stonith__result2rc(result))); + crm_xml_add_int(data, "state", op->state); + crm_xml_add(data, F_STONITH_TARGET, op->target); + crm_xml_add(data, F_STONITH_OPERATION, op->action); +@@ -442,13 +451,14 @@ handle_local_reply_and_notify(remote_fencing_op_t *op, xmlNode *data, + do_local_reply(reply, op->client_id, op->call_options & st_opt_sync_call, FALSE); + + /* bcast to all local clients that the fencing operation happend */ ++ notify_data = fencing_result2xml(op, result); + fenced_send_notification(T_STONITH_NOTIFY_FENCE, result, notify_data); ++ free_xml(notify_data); + 
fenced_send_notification(T_STONITH_NOTIFY_HISTORY, NULL, NULL); + + /* mark this op as having notify's already sent */ + op->notify_sent = TRUE; + free_xml(reply); +- free_xml(notify_data); + } + + /*! +-- +2.27.0 + + +From 2814cde97520b63ca5f9baf3df37d73507e89d34 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Wed, 15 Dec 2021 17:40:52 -0600 +Subject: [PATCH 21/23] Low: fencer: restore check for invalid topology level + target + +... per review. b7c7676c mistakenly dropped it +--- + daemons/fenced/fenced_commands.c | 12 +++++++++++- + 1 file changed, 11 insertions(+), 1 deletion(-) + +diff --git a/daemons/fenced/fenced_commands.c b/daemons/fenced/fenced_commands.c +index 54ebc12947..1a4a791385 100644 +--- a/daemons/fenced/fenced_commands.c ++++ b/daemons/fenced/fenced_commands.c +@@ -1636,6 +1636,16 @@ fenced_register_level(xmlNode *msg, char **desc, pcmk__action_result_t *result) + *desc = crm_strdup_printf("%s[%d]", target, id); + } + ++ // Ensure a valid target was specified ++ if ((mode < 0) || (mode > 2)) { ++ crm_warn("Ignoring topology level registration without valid target"); ++ free(target); ++ crm_log_xml_warn(level, "Bad level"); ++ pcmk__set_result(result, CRM_EX_INVALID_PARAM, PCMK_EXEC_INVALID, ++ "Invalid topology level target"); ++ return; ++ } ++ + // Ensure level ID is in allowed range + if ((id <= 0) || (id >= ST_LEVEL_MAX)) { + crm_warn("Ignoring topology registration for %s with invalid level %d", +@@ -1643,7 +1653,7 @@ fenced_register_level(xmlNode *msg, char **desc, pcmk__action_result_t *result) + free(target); + crm_log_xml_warn(level, "Bad level"); + pcmk__set_result(result, CRM_EX_INVALID_PARAM, PCMK_EXEC_INVALID, +- "Invalid topology level"); ++ "Invalid topology level number"); + return; + } + +-- +2.27.0 + + +From c82806f9e16abcea00025fd3a290477aef2d8d83 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Tue, 21 Dec 2021 16:23:29 -0600 +Subject: [PATCH 22/23] Low: fencer: free result memory when processing fencing + replies + 
+found in review +--- + daemons/fenced/fenced_remote.c | 24 +++++++++++++++--------- + 1 file changed, 15 insertions(+), 9 deletions(-) + +diff --git a/daemons/fenced/fenced_remote.c b/daemons/fenced/fenced_remote.c +index 4cf723e6df..9fda9ef060 100644 +--- a/daemons/fenced/fenced_remote.c ++++ b/daemons/fenced/fenced_remote.c +@@ -2241,14 +2241,14 @@ fenced_process_fencing_reply(xmlNode *msg) + /* Could be for an event that began before we started */ + /* TODO: Record the op for later querying */ + crm_info("Received peer result of unknown or expired operation %s", id); +- return; ++ goto done; + } + + if (op->devices && device && !pcmk__str_eq(op->devices->data, device, pcmk__str_casei)) { + crm_err("Received outdated reply for device %s (instead of %s) to " + "fence (%s) %s. Operation already timed out at peer level.", + device, (const char *) op->devices->data, op->action, op->target); +- return; ++ goto done; + } + + if (pcmk__str_eq(crm_element_value(msg, F_SUBTYPE), "broadcast", pcmk__str_casei)) { +@@ -2265,14 +2265,15 @@ fenced_process_fencing_reply(xmlNode *msg) + op->state = st_failed; + } + finalize_op(op, msg, &result, false); +- return; ++ goto done; ++ + } else if (!pcmk__str_eq(op->originator, stonith_our_uname, pcmk__str_casei)) { + /* If this isn't a remote level broadcast, and we are not the + * originator of the operation, we should not be receiving this msg. */ + crm_err("Received non-broadcast fencing result for operation %.8s " + "we do not own (device %s targeting %s)", + op->id, device, op->target); +- return; ++ goto done; + } + + if (pcmk_is_set(op->call_options, st_opt_topology)) { +@@ -2290,7 +2291,7 @@ fenced_process_fencing_reply(xmlNode *msg) + * and notify our local clients. 
*/ + if (op->state == st_done) { + finalize_op(op, msg, &result, false); +- return; ++ goto done; + } + + if ((op->phase == 2) && !pcmk__result_ok(&result)) { +@@ -2310,27 +2311,30 @@ fenced_process_fencing_reply(xmlNode *msg) + /* An operation completed successfully. Try another device if + * necessary, otherwise mark the operation as done. */ + advance_topology_device_in_level(op, device, msg); +- return; ++ goto done; + } else { + /* This device failed, time to try another topology level. If no other + * levels are available, mark this operation as failed and report results. */ + if (advance_topology_level(op, false) != pcmk_rc_ok) { + op->state = st_failed; + finalize_op(op, msg, &result, false); +- return; ++ goto done; + } + } ++ + } else if (pcmk__result_ok(&result) && (op->devices == NULL)) { + crm_trace("All done for %s", op->target); + op->state = st_done; + finalize_op(op, msg, &result, false); +- return; ++ goto done; ++ + } else if ((result.execution_status == PCMK_EXEC_TIMEOUT) + && (op->devices == NULL)) { + /* If the operation timed out don't bother retrying other peers. */ + op->state = st_failed; + finalize_op(op, msg, &result, false); +- return; ++ goto done; ++ + } else { + /* fall-through and attempt other fencing action using another peer */ + } +@@ -2340,6 +2344,8 @@ fenced_process_fencing_reply(xmlNode *msg) + op->target, op->originator, op->client_name, + pcmk_exec_status_str(result.execution_status)); + request_peer_fencing(op, NULL, &result); ++done: ++ pcmk__reset_result(&result); + } + + gboolean +-- +2.27.0 + + +From 137bf97fdb39043eebb02a0d3ebbe47ee8c7044c Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Tue, 21 Dec 2021 16:26:22 -0600 +Subject: [PATCH 23/23] Log: fencer: clarify timeout message + +... 
as suggested by review +--- + daemons/fenced/fenced_remote.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/daemons/fenced/fenced_remote.c b/daemons/fenced/fenced_remote.c +index 9fda9ef060..1e237150c5 100644 +--- a/daemons/fenced/fenced_remote.c ++++ b/daemons/fenced/fenced_remote.c +@@ -656,7 +656,7 @@ remote_op_timeout_one(gpointer userdata) + crm_notice("Peer's '%s' action targeting %s for client %s timed out " CRM_XS + " id=%.8s", op->action, op->target, op->client_name, op->id); + pcmk__set_result(&result, CRM_EX_ERROR, PCMK_EXEC_TIMEOUT, +- "Peer did not send fence result within timeout"); ++ "Peer did not return fence result within timeout"); + + + // Try another device, if appropriate +-- +2.27.0 + diff --git a/SOURCES/010-probe-failures.patch b/SOURCES/010-probe-failures.patch new file mode 100644 index 0000000..d90fc3c --- /dev/null +++ b/SOURCES/010-probe-failures.patch @@ -0,0 +1,4157 @@ +From f2e51898735b5e9990464141fc4aea3dd83f5067 Mon Sep 17 00:00:00 2001 +From: Chris Lumens +Date: Thu, 4 Nov 2021 14:36:41 -0400 +Subject: [PATCH 01/21] Refactor: scheduler: Use bool in unpack_rsc_op. + +Previously, we were using bool but TRUE/FALSE. Instead, use the actual +values. 
+--- + lib/pengine/unpack.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/lib/pengine/unpack.c b/lib/pengine/unpack.c +index b1e84110a2..ecc7275e15 100644 +--- a/lib/pengine/unpack.c ++++ b/lib/pengine/unpack.c +@@ -3671,7 +3671,7 @@ unpack_rsc_op(pe_resource_t *rsc, pe_node_t *node, xmlNode *xml_op, + const char *task = NULL; + const char *task_key = NULL; + const char *exit_reason = NULL; +- bool expired = FALSE; ++ bool expired = false; + pe_resource_t *parent = rsc; + enum action_fail_response failure_strategy = action_fail_recover; + +@@ -3727,7 +3727,7 @@ unpack_rsc_op(pe_resource_t *rsc, pe_node_t *node, xmlNode *xml_op, + + if ((status != PCMK_EXEC_NOT_INSTALLED) + && check_operation_expiry(rsc, node, rc, xml_op, data_set)) { +- expired = TRUE; ++ expired = true; + } + + if (!strcmp(task, CRMD_ACTION_STATUS)) { +-- +2.27.0 + + +From 4c961b8e670d336a368c7fd1535c247e40c6b48e Mon Sep 17 00:00:00 2001 +From: Chris Lumens +Date: Thu, 4 Nov 2021 15:07:01 -0400 +Subject: [PATCH 02/21] Refactor: scheduler: Add functions for determining if + an op is a probe. 
+ +--- + include/crm/common/util.h | 3 + + lib/common/operations.c | 21 +++++++ + lib/common/tests/operations/Makefile.am | 6 +- + .../tests/operations/pcmk_is_probe_test.c | 37 +++++++++++++ + .../tests/operations/pcmk_xe_is_probe_test.c | 55 +++++++++++++++++++ + lib/pengine/unpack.c | 12 ++-- + lib/pengine/utils.c | 5 +- + 7 files changed, 127 insertions(+), 12 deletions(-) + create mode 100644 lib/common/tests/operations/pcmk_is_probe_test.c + create mode 100644 lib/common/tests/operations/pcmk_xe_is_probe_test.c + +diff --git a/include/crm/common/util.h b/include/crm/common/util.h +index 2728b64492..fbea6e560c 100644 +--- a/include/crm/common/util.h ++++ b/include/crm/common/util.h +@@ -72,6 +72,9 @@ xmlNode *crm_create_op_xml(xmlNode *parent, const char *prefix, + const char *timeout); + #define CRM_DEFAULT_OP_TIMEOUT_S "20s" + ++bool pcmk_is_probe(const char *task, guint interval); ++bool pcmk_xe_is_probe(xmlNode *xml_op); ++ + int compare_version(const char *version1, const char *version2); + + /* coverity[+kill] */ +diff --git a/lib/common/operations.c b/lib/common/operations.c +index 366c189702..978df79082 100644 +--- a/lib/common/operations.c ++++ b/lib/common/operations.c +@@ -537,3 +537,24 @@ pcmk__is_fencing_action(const char *action) + { + return pcmk__str_any_of(action, "off", "reboot", "poweroff", NULL); + } ++ ++bool ++pcmk_is_probe(const char *task, guint interval) ++{ ++ if (task == NULL) { ++ return false; ++ } ++ ++ return (interval == 0) && pcmk__str_eq(task, CRMD_ACTION_STATUS, pcmk__str_none); ++} ++ ++bool ++pcmk_xe_is_probe(xmlNode *xml_op) ++{ ++ const char *task = crm_element_value(xml_op, XML_LRM_ATTR_TASK); ++ const char *interval_ms_s = crm_element_value(xml_op, XML_LRM_ATTR_INTERVAL_MS); ++ int interval_ms; ++ ++ pcmk__scan_min_int(interval_ms_s, &interval_ms, 0); ++ return pcmk_is_probe(task, interval_ms); ++} +diff --git a/lib/common/tests/operations/Makefile.am b/lib/common/tests/operations/Makefile.am +index 
c8814ff0a8..2e3d0b0679 100644 +--- a/lib/common/tests/operations/Makefile.am ++++ b/lib/common/tests/operations/Makefile.am +@@ -1,5 +1,5 @@ + # +-# Copyright 2020 the Pacemaker project contributors ++# Copyright 2020-2021 the Pacemaker project contributors + # + # The version control history for this file may have further details. + # +@@ -12,6 +12,8 @@ LDADD = $(top_builddir)/lib/common/libcrmcommon.la -lcmocka + include $(top_srcdir)/mk/tap.mk + + # Add "_test" to the end of all test program names to simplify .gitignore. +-check_PROGRAMS = parse_op_key_test ++check_PROGRAMS = parse_op_key_test \ ++ pcmk_is_probe_test \ ++ pcmk_xe_is_probe_test + + TESTS = $(check_PROGRAMS) +diff --git a/lib/common/tests/operations/pcmk_is_probe_test.c b/lib/common/tests/operations/pcmk_is_probe_test.c +new file mode 100644 +index 0000000000..9b449f1a70 +--- /dev/null ++++ b/lib/common/tests/operations/pcmk_is_probe_test.c +@@ -0,0 +1,37 @@ ++/* ++ * Copyright 2021 the Pacemaker project contributors ++ * ++ * The version control history for this file may have further details. ++ * ++ * This source code is licensed under the GNU Lesser General Public License ++ * version 2.1 or later (LGPLv2.1+) WITHOUT ANY WARRANTY. 
++ */ ++ ++#include ++ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++static void ++is_probe_test(void **state) ++{ ++ assert_false(pcmk_is_probe(NULL, 0)); ++ assert_false(pcmk_is_probe("", 0)); ++ assert_false(pcmk_is_probe("blahblah", 0)); ++ assert_false(pcmk_is_probe("monitor", 1)); ++ assert_true(pcmk_is_probe("monitor", 0)); ++} ++ ++int main(int argc, char **argv) ++{ ++ const struct CMUnitTest tests[] = { ++ cmocka_unit_test(is_probe_test), ++ }; ++ ++ cmocka_set_message_output(CM_OUTPUT_TAP); ++ return cmocka_run_group_tests(tests, NULL, NULL); ++} +diff --git a/lib/common/tests/operations/pcmk_xe_is_probe_test.c b/lib/common/tests/operations/pcmk_xe_is_probe_test.c +new file mode 100644 +index 0000000000..0283d1c145 +--- /dev/null ++++ b/lib/common/tests/operations/pcmk_xe_is_probe_test.c +@@ -0,0 +1,55 @@ ++/* ++ * Copyright 2021 the Pacemaker project contributors ++ * ++ * The version control history for this file may have further details. ++ * ++ * This source code is licensed under the GNU Lesser General Public License ++ * version 2.1 or later (LGPLv2.1+) WITHOUT ANY WARRANTY. 
++ */ ++ ++#include ++ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++static void ++op_is_probe_test(void **state) ++{ ++ xmlNode *node = NULL; ++ ++ assert_false(pcmk_xe_is_probe(NULL)); ++ ++ node = string2xml(""); ++ assert_false(pcmk_xe_is_probe(node)); ++ free_xml(node); ++ ++ node = string2xml(""); ++ assert_false(pcmk_xe_is_probe(node)); ++ free_xml(node); ++ ++ node = string2xml(""); ++ assert_false(pcmk_xe_is_probe(node)); ++ free_xml(node); ++ ++ node = string2xml(""); ++ assert_false(pcmk_xe_is_probe(node)); ++ free_xml(node); ++ ++ node = string2xml(""); ++ assert_true(pcmk_xe_is_probe(node)); ++ free_xml(node); ++} ++ ++int main(int argc, char **argv) ++{ ++ const struct CMUnitTest tests[] = { ++ cmocka_unit_test(op_is_probe_test), ++ }; ++ ++ cmocka_set_message_output(CM_OUTPUT_TAP); ++ return cmocka_run_group_tests(tests, NULL, NULL); ++} +diff --git a/lib/pengine/unpack.c b/lib/pengine/unpack.c +index ecc7275e15..7c0c66e696 100644 +--- a/lib/pengine/unpack.c ++++ b/lib/pengine/unpack.c +@@ -83,7 +83,6 @@ is_dangling_guest_node(pe_node_t *node) + return FALSE; + } + +- + /*! 
+ * \brief Schedule a fence action for a node + * +@@ -2984,7 +2983,6 @@ static void + unpack_rsc_op_failure(pe_resource_t * rsc, pe_node_t * node, int rc, xmlNode * xml_op, xmlNode ** last_failure, + enum action_fail_response * on_fail, pe_working_set_t * data_set) + { +- guint interval_ms = 0; + bool is_probe = false; + pe_action_t *action = NULL; + +@@ -2998,10 +2996,7 @@ unpack_rsc_op_failure(pe_resource_t * rsc, pe_node_t * node, int rc, xmlNode * x + + *last_failure = xml_op; + +- crm_element_value_ms(xml_op, XML_LRM_ATTR_INTERVAL_MS, &interval_ms); +- if ((interval_ms == 0) && !strcmp(task, CRMD_ACTION_STATUS)) { +- is_probe = true; +- } ++ is_probe = pcmk_xe_is_probe(xml_op); + + if (exit_reason == NULL) { + exit_reason = ""; +@@ -3163,8 +3158,9 @@ determine_op_status( + } + + crm_element_value_ms(xml_op, XML_LRM_ATTR_INTERVAL_MS, &interval_ms); +- if ((interval_ms == 0) && !strcmp(task, CRMD_ACTION_STATUS)) { +- is_probe = true; ++ is_probe = pcmk_xe_is_probe(xml_op); ++ ++ if (is_probe) { + task = "probe"; + } + +diff --git a/lib/pengine/utils.c b/lib/pengine/utils.c +index c5eda3898e..07753e173a 100644 +--- a/lib/pengine/utils.c ++++ b/lib/pengine/utils.c +@@ -1066,8 +1066,7 @@ unpack_operation(pe_action_t * action, xmlNode * xml_obj, pe_resource_t * contai + { + int timeout_ms = 0; + const char *value = NULL; +- bool is_probe = pcmk__str_eq(action->task, RSC_STATUS, pcmk__str_casei) +- && (interval_ms == 0); ++ bool is_probe = false; + #if ENABLE_VERSIONED_ATTRS + pe_rsc_action_details_t *rsc_details = NULL; + #endif +@@ -1094,6 +1093,8 @@ unpack_operation(pe_action_t * action, xmlNode * xml_obj, pe_resource_t * contai + + CRM_CHECK(action && action->rsc, return); + ++ is_probe = pcmk_is_probe(action->task, interval_ms); ++ + // Cluster-wide + pe__unpack_dataset_nvpairs(data_set->op_defaults, XML_TAG_META_SETS, &rule_data, + action->meta, NULL, FALSE, data_set); +-- +2.27.0 + + +From 09f32df97ab5064a15ba5a1fb3970d5c64ee7b30 Mon Sep 17 00:00:00 2001 
+From: Chris Lumens +Date: Fri, 19 Nov 2021 14:47:22 -0500 +Subject: [PATCH 03/21] Refactor: scheduler: Move setting interval_ms in + determine_op_status. + +This can now happen in the only place it's being used. +--- + lib/pengine/unpack.c | 9 ++++++--- + 1 file changed, 6 insertions(+), 3 deletions(-) + +diff --git a/lib/pengine/unpack.c b/lib/pengine/unpack.c +index 7c0c66e696..b9986d2462 100644 +--- a/lib/pengine/unpack.c ++++ b/lib/pengine/unpack.c +@@ -3142,7 +3142,6 @@ static int + determine_op_status( + pe_resource_t *rsc, int rc, int target_rc, pe_node_t * node, xmlNode * xml_op, enum action_fail_response * on_fail, pe_working_set_t * data_set) + { +- guint interval_ms = 0; + bool is_probe = false; + int result = PCMK_EXEC_DONE; + const char *key = get_op_key(xml_op); +@@ -3157,7 +3156,6 @@ determine_op_status( + exit_reason = ""; + } + +- crm_element_value_ms(xml_op, XML_LRM_ATTR_INTERVAL_MS, &interval_ms); + is_probe = pcmk_xe_is_probe(xml_op); + + if (is_probe) { +@@ -3230,12 +3228,17 @@ determine_op_status( + result = PCMK_EXEC_ERROR_FATAL; + break; + +- case PCMK_OCF_UNIMPLEMENT_FEATURE: ++ case PCMK_OCF_UNIMPLEMENT_FEATURE: { ++ guint interval_ms = 0; ++ crm_element_value_ms(xml_op, XML_LRM_ATTR_INTERVAL_MS, &interval_ms); ++ + if (interval_ms > 0) { + result = PCMK_EXEC_NOT_SUPPORTED; + break; + } + // fall through ++ } ++ + case PCMK_OCF_NOT_INSTALLED: + case PCMK_OCF_INVALID_PARAM: + case PCMK_OCF_INSUFFICIENT_PRIV: +-- +2.27.0 + + +From 6c8f47453afd6c100fddc45187faff17e15f7bfe Mon Sep 17 00:00:00 2001 +From: Chris Lumens +Date: Fri, 19 Nov 2021 14:57:57 -0500 +Subject: [PATCH 04/21] Refactor: scheduler: Add pcmk_xe_mask_failed_probe. + +Given an xmlNodePtr for a resource operation, this function will +determine whether it is a failed probe operation that should not be +displayed in crm_mon (or other places, I suppose) or not. 
+--- + include/crm/common/util.h | 1 + + lib/common/operations.c | 17 ++ + lib/common/tests/operations/Makefile.am | 3 +- + .../pcmk_xe_mask_probe_failure_test.c | 162 ++++++++++++++++++ + 4 files changed, 182 insertions(+), 1 deletion(-) + create mode 100644 lib/common/tests/operations/pcmk_xe_mask_probe_failure_test.c + +diff --git a/include/crm/common/util.h b/include/crm/common/util.h +index fbea6e560c..784069ba1b 100644 +--- a/include/crm/common/util.h ++++ b/include/crm/common/util.h +@@ -74,6 +74,7 @@ xmlNode *crm_create_op_xml(xmlNode *parent, const char *prefix, + + bool pcmk_is_probe(const char *task, guint interval); + bool pcmk_xe_is_probe(xmlNode *xml_op); ++bool pcmk_xe_mask_probe_failure(xmlNode *xml_op); + + int compare_version(const char *version1, const char *version2); + +diff --git a/lib/common/operations.c b/lib/common/operations.c +index 978df79082..54482b8863 100644 +--- a/lib/common/operations.c ++++ b/lib/common/operations.c +@@ -558,3 +558,20 @@ pcmk_xe_is_probe(xmlNode *xml_op) + pcmk__scan_min_int(interval_ms_s, &interval_ms, 0); + return pcmk_is_probe(task, interval_ms); + } ++ ++bool ++pcmk_xe_mask_probe_failure(xmlNode *xml_op) ++{ ++ int status = PCMK_EXEC_UNKNOWN; ++ int rc = PCMK_OCF_OK; ++ ++ if (!pcmk_xe_is_probe(xml_op)) { ++ return false; ++ } ++ ++ crm_element_value_int(xml_op, XML_LRM_ATTR_OPSTATUS, &status); ++ crm_element_value_int(xml_op, XML_LRM_ATTR_RC, &rc); ++ ++ return rc == PCMK_OCF_NOT_INSTALLED || rc == PCMK_OCF_INVALID_PARAM || ++ status == PCMK_EXEC_NOT_INSTALLED; ++} +diff --git a/lib/common/tests/operations/Makefile.am b/lib/common/tests/operations/Makefile.am +index 2e3d0b0679..457c5f7c7a 100644 +--- a/lib/common/tests/operations/Makefile.am ++++ b/lib/common/tests/operations/Makefile.am +@@ -14,6 +14,7 @@ include $(top_srcdir)/mk/tap.mk + # Add "_test" to the end of all test program names to simplify .gitignore. 
+ check_PROGRAMS = parse_op_key_test \ + pcmk_is_probe_test \ +- pcmk_xe_is_probe_test ++ pcmk_xe_is_probe_test \ ++ pcmk_xe_mask_probe_failure_test + + TESTS = $(check_PROGRAMS) +diff --git a/lib/common/tests/operations/pcmk_xe_mask_probe_failure_test.c b/lib/common/tests/operations/pcmk_xe_mask_probe_failure_test.c +new file mode 100644 +index 0000000000..a13f6d98f4 +--- /dev/null ++++ b/lib/common/tests/operations/pcmk_xe_mask_probe_failure_test.c +@@ -0,0 +1,162 @@ ++/* ++ * Copyright 2021 the Pacemaker project contributors ++ * ++ * The version control history for this file may have further details. ++ * ++ * This source code is licensed under the GNU Lesser General Public License ++ * version 2.1 or later (LGPLv2.1+) WITHOUT ANY WARRANTY. ++ */ ++ ++#include ++ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++static void ++op_is_not_probe_test(void **state) { ++ xmlNode *node = NULL; ++ ++ /* Not worth testing this thoroughly since it's just a duplicate of whether ++ * pcmk_xe_is_probe works or not.
++ */ ++ ++ node = string2xml(""); ++ assert_false(pcmk_xe_mask_probe_failure(node)); ++ free_xml(node); ++} ++ ++static void ++op_does_not_have_right_values_test(void **state) { ++ xmlNode *node = NULL; ++ ++ node = string2xml(""); ++ assert_false(pcmk_xe_mask_probe_failure(node)); ++ free_xml(node); ++ ++ node = string2xml(""); ++ assert_false(pcmk_xe_mask_probe_failure(node)); ++ free_xml(node); ++} ++ ++static void ++check_values_test(void **state) { ++ xmlNode *node = NULL; ++ ++ /* PCMK_EXEC_NOT_SUPPORTED */ ++ node = string2xml(""); ++ assert_false(pcmk_xe_mask_probe_failure(node)); ++ free_xml(node); ++ ++ node = string2xml(""); ++ assert_true(pcmk_xe_mask_probe_failure(node)); ++ free_xml(node); ++ ++ /* PCMK_EXEC_DONE */ ++ node = string2xml(""); ++ assert_false(pcmk_xe_mask_probe_failure(node)); ++ free_xml(node); ++ ++ node = string2xml(""); ++ assert_true(pcmk_xe_mask_probe_failure(node)); ++ free_xml(node); ++ ++ node = string2xml(""); ++ assert_true(pcmk_xe_mask_probe_failure(node)); ++ free_xml(node); ++ ++ node = string2xml(""); ++ assert_false(pcmk_xe_mask_probe_failure(node)); ++ free_xml(node); ++ ++ node = string2xml(""); ++ assert_false(pcmk_xe_mask_probe_failure(node)); ++ free_xml(node); ++ ++ /* PCMK_EXEC_NOT_INSTALLED */ ++ node = string2xml(""); ++ assert_true(pcmk_xe_mask_probe_failure(node)); ++ free_xml(node); ++ ++ node = string2xml(""); ++ assert_true(pcmk_xe_mask_probe_failure(node)); ++ free_xml(node); ++ ++ /* PCMK_EXEC_ERROR */ ++ node = string2xml(""); ++ assert_false(pcmk_xe_mask_probe_failure(node)); ++ free_xml(node); ++ ++ node = string2xml(""); ++ assert_true(pcmk_xe_mask_probe_failure(node)); ++ free_xml(node); ++ ++ node = string2xml(""); ++ assert_true(pcmk_xe_mask_probe_failure(node)); ++ free_xml(node); ++ ++ node = string2xml(""); ++ assert_false(pcmk_xe_mask_probe_failure(node)); ++ free_xml(node); ++ ++ node = string2xml(""); ++ assert_false(pcmk_xe_mask_probe_failure(node)); ++ free_xml(node); ++ ++ /* 
PCMK_EXEC_ERROR_HARD */ ++ node = string2xml(""); ++ assert_false(pcmk_xe_mask_probe_failure(node)); ++ free_xml(node); ++ ++ node = string2xml(""); ++ assert_true(pcmk_xe_mask_probe_failure(node)); ++ free_xml(node); ++ ++ node = string2xml(""); ++ assert_true(pcmk_xe_mask_probe_failure(node)); ++ free_xml(node); ++ ++ node = string2xml(""); ++ assert_false(pcmk_xe_mask_probe_failure(node)); ++ free_xml(node); ++ ++ node = string2xml(""); ++ assert_false(pcmk_xe_mask_probe_failure(node)); ++ free_xml(node); ++ ++ /* PCMK_EXEC_ERROR_FATAL */ ++ node = string2xml(""); ++ assert_false(pcmk_xe_mask_probe_failure(node)); ++ free_xml(node); ++ ++ node = string2xml(""); ++ assert_true(pcmk_xe_mask_probe_failure(node)); ++ free_xml(node); ++ ++ node = string2xml(""); ++ assert_true(pcmk_xe_mask_probe_failure(node)); ++ free_xml(node); ++ ++ node = string2xml(""); ++ assert_false(pcmk_xe_mask_probe_failure(node)); ++ free_xml(node); ++ ++ node = string2xml(""); ++ assert_false(pcmk_xe_mask_probe_failure(node)); ++ free_xml(node); ++} ++ ++int main(int argc, char **argv) ++{ ++ const struct CMUnitTest tests[] = { ++ cmocka_unit_test(op_is_not_probe_test), ++ cmocka_unit_test(op_does_not_have_right_values_test), ++ cmocka_unit_test(check_values_test), ++ }; ++ ++ cmocka_set_message_output(CM_OUTPUT_TAP); ++ return cmocka_run_group_tests(tests, NULL, NULL); ++} +-- +2.27.0 + + +From c9ce1aaf93cd20bb01e80102dda0ffffb07e6472 Mon Sep 17 00:00:00 2001 +From: Chris Lumens +Date: Wed, 1 Dec 2021 14:26:31 -0500 +Subject: [PATCH 05/21] Refactor: scheduler: Combine op status and rc remapping + into one function. + +Well, not quite. Doing the remapping is complicated enough to where it +makes sense to have them in separate functions. However, they can both +be called from a single new function that takes the place of the +previous two calls in unpack_rsc_op. 
+--- + lib/pengine/unpack.c | 157 ++++++++++++++++++++----------------------- + 1 file changed, 72 insertions(+), 85 deletions(-) + +diff --git a/lib/pengine/unpack.c b/lib/pengine/unpack.c +index b9986d2462..b659f319fb 100644 +--- a/lib/pengine/unpack.c ++++ b/lib/pengine/unpack.c +@@ -3121,36 +3121,68 @@ unpack_rsc_op_failure(pe_resource_t * rsc, pe_node_t * node, int rc, xmlNode * x + + /*! + * \internal +- * \brief Remap operation status based on action result ++ * \brief Remap informational monitor results and operation status + * +- * Given an action result, determine an appropriate operation status for the +- * purposes of responding to the action (the status provided by the executor is +- * not directly usable since the executor does not know what was expected). ++ * For the monitor results, certain OCF codes are for providing extended information ++ * to the user about services that aren't yet failed but not entirely healthy either. ++ * These must be treated as the "normal" result by Pacemaker. ++ * ++ * For operation status, the action result can be used to determine an appropriate ++ * status for the purposes of responding to the action. The status provided by the ++ * executor is not directly usable since the executor does not know what was expected. 
+ * ++ * \param[in] xml_op Operation history entry XML from CIB status + * \param[in,out] rsc Resource that operation history entry is for +- * \param[in] rc Actual return code of operation +- * \param[in] target_rc Expected return code of operation + * \param[in] node Node where operation was executed +- * \param[in] xml_op Operation history entry XML from CIB status +- * \param[in,out] on_fail What should be done about the result + * \param[in] data_set Current cluster working set ++ * \param[in,out] on_fail What should be done about the result ++ * \param[in] target_rc Expected return code of operation ++ * \param[in,out] rc Actual return code of operation ++ * \param[in,out] status Operation execution status ++ * ++ * \note If the result is remapped and the node is not shutting down or failed, ++ * the operation will be recorded in the data set's list of failed operations ++ * to highlight it for the user. + * +- * \return Operation status based on return code and action info + * \note This may update the resource's current and next role. 
+ */ +-static int +-determine_op_status( +- pe_resource_t *rsc, int rc, int target_rc, pe_node_t * node, xmlNode * xml_op, enum action_fail_response * on_fail, pe_working_set_t * data_set) +-{ ++static void ++remap_operation(xmlNode *xml_op, pe_resource_t *rsc, pe_node_t *node, ++ pe_working_set_t *data_set, enum action_fail_response *on_fail, ++ int target_rc, int *rc, int *status) { + bool is_probe = false; +- int result = PCMK_EXEC_DONE; +- const char *key = get_op_key(xml_op); + const char *task = crm_element_value(xml_op, XML_LRM_ATTR_TASK); ++ const char *key = get_op_key(xml_op); + const char *exit_reason = crm_element_value(xml_op, + XML_LRM_ATTR_EXIT_REASON); + ++ if (pcmk__str_eq(task, CRMD_ACTION_STATUS, pcmk__str_none)) { ++ int remapped_rc = pcmk__effective_rc(*rc); ++ ++ if (*rc != remapped_rc) { ++ crm_trace("Remapping monitor result %d to %d", *rc, remapped_rc); ++ if (!node->details->shutdown || node->details->online) { ++ record_failed_op(xml_op, node, rsc, data_set); ++ } ++ ++ *rc = remapped_rc; ++ } ++ } ++ ++ /* If the executor reported an operation status of anything but done or ++ * error, consider that final. But for done or error, we know better whether ++ * it should be treated as a failure or not, because we know the expected ++ * result. ++ */ ++ if (*status != PCMK_EXEC_DONE && *status != PCMK_EXEC_ERROR) { ++ return; ++ } ++ + CRM_ASSERT(rsc); +- CRM_CHECK(task != NULL, return PCMK_EXEC_ERROR); ++ CRM_CHECK(task != NULL, ++ *status = PCMK_EXEC_ERROR; return); ++ ++ *status = PCMK_EXEC_DONE; + + if (exit_reason == NULL) { + exit_reason = ""; +@@ -3171,23 +3203,23 @@ determine_op_status( + * those versions or processing of saved CIB files from those versions, + * so we do not need to care much about this case. 
+ */ +- result = PCMK_EXEC_ERROR; ++ *status = PCMK_EXEC_ERROR; + crm_warn("Expected result not found for %s on %s (corrupt or obsolete CIB?)", + key, node->details->uname); + +- } else if (target_rc != rc) { +- result = PCMK_EXEC_ERROR; ++ } else if (target_rc != *rc) { ++ *status = PCMK_EXEC_ERROR; + pe_rsc_debug(rsc, "%s on %s: expected %d (%s), got %d (%s%s%s)", + key, node->details->uname, + target_rc, services_ocf_exitcode_str(target_rc), +- rc, services_ocf_exitcode_str(rc), ++ *rc, services_ocf_exitcode_str(*rc), + (*exit_reason? ": " : ""), exit_reason); + } + +- switch (rc) { ++ switch (*rc) { + case PCMK_OCF_OK: + if (is_probe && (target_rc == PCMK_OCF_NOT_RUNNING)) { +- result = PCMK_EXEC_DONE; ++ *status = PCMK_EXEC_DONE; + pe_rsc_info(rsc, "Probe found %s active on %s at %s", + rsc->id, node->details->uname, + last_change_str(xml_op)); +@@ -3195,10 +3227,10 @@ determine_op_status( + break; + + case PCMK_OCF_NOT_RUNNING: +- if (is_probe || (target_rc == rc) ++ if (is_probe || (target_rc == *rc) + || !pcmk_is_set(rsc->flags, pe_rsc_managed)) { + +- result = PCMK_EXEC_DONE; ++ *status = PCMK_EXEC_DONE; + rsc->role = RSC_ROLE_STOPPED; + + /* clear any previous failure actions */ +@@ -3208,8 +3240,8 @@ determine_op_status( + break; + + case PCMK_OCF_RUNNING_PROMOTED: +- if (is_probe && (rc != target_rc)) { +- result = PCMK_EXEC_DONE; ++ if (is_probe && (*rc != target_rc)) { ++ *status = PCMK_EXEC_DONE; + pe_rsc_info(rsc, + "Probe found %s active and promoted on %s at %s", + rsc->id, node->details->uname, +@@ -3221,11 +3253,11 @@ determine_op_status( + case PCMK_OCF_DEGRADED_PROMOTED: + case PCMK_OCF_FAILED_PROMOTED: + rsc->role = RSC_ROLE_PROMOTED; +- result = PCMK_EXEC_ERROR; ++ *status = PCMK_EXEC_ERROR; + break; + + case PCMK_OCF_NOT_CONFIGURED: +- result = PCMK_EXEC_ERROR_FATAL; ++ *status = PCMK_EXEC_ERROR_FATAL; + break; + + case PCMK_OCF_UNIMPLEMENT_FEATURE: { +@@ -3233,7 +3265,7 @@ determine_op_status( + crm_element_value_ms(xml_op, 
XML_LRM_ATTR_INTERVAL_MS, &interval_ms); + + if (interval_ms > 0) { +- result = PCMK_EXEC_NOT_SUPPORTED; ++ *status = PCMK_EXEC_NOT_SUPPORTED; + break; + } + // fall through +@@ -3248,26 +3280,27 @@ determine_op_status( + pe_proc_err("No further recovery can be attempted for %s " + "because %s on %s failed (%s%s%s) at %s " + CRM_XS " rc=%d id=%s", rsc->id, task, +- node->details->uname, services_ocf_exitcode_str(rc), ++ node->details->uname, services_ocf_exitcode_str(*rc), + (*exit_reason? ": " : ""), exit_reason, +- last_change_str(xml_op), rc, ID(xml_op)); ++ last_change_str(xml_op), *rc, ID(xml_op)); + pe__clear_resource_flags(rsc, pe_rsc_managed); + pe__set_resource_flags(rsc, pe_rsc_block); + } +- result = PCMK_EXEC_ERROR_HARD; ++ *status = PCMK_EXEC_ERROR_HARD; + break; + + default: +- if (result == PCMK_EXEC_DONE) { ++ if (*status == PCMK_EXEC_DONE) { + crm_info("Treating unknown exit status %d from %s of %s " + "on %s at %s as failure", +- rc, task, rsc->id, node->details->uname, ++ *rc, task, rsc->id, node->details->uname, + last_change_str(xml_op)); +- result = PCMK_EXEC_ERROR; ++ *status = PCMK_EXEC_ERROR; + } + break; + } +- return result; ++ ++ pe_rsc_trace(rsc, "Remapped %s status to %d", key, *status); + } + + // return TRUE if start or monitor last failure but parameters changed +@@ -3622,41 +3655,6 @@ update_resource_state(pe_resource_t * rsc, pe_node_t * node, xmlNode * xml_op, c + } + } + +-/*! +- * \internal +- * \brief Remap informational monitor results to usual values +- * +- * Certain OCF result codes are for providing extended information to the +- * user about services that aren't yet failed but not entirely healthy either. +- * These must be treated as the "normal" result by Pacemaker. 
+- * +- * \param[in] rc Actual result of a monitor action +- * \param[in] xml_op Operation history XML +- * \param[in] node Node that operation happened on +- * \param[in] rsc Resource that operation happened to +- * \param[in] data_set Cluster working set +- * +- * \return Result code that pacemaker should use +- * +- * \note If the result is remapped, and the node is not shutting down or failed, +- * the operation will be recorded in the data set's list of failed +- * operations, to highlight it for the user. +- */ +-static int +-remap_monitor_rc(int rc, xmlNode *xml_op, const pe_node_t *node, +- const pe_resource_t *rsc, pe_working_set_t *data_set) +-{ +- int remapped_rc = pcmk__effective_rc(rc); +- +- if (rc != remapped_rc) { +- crm_trace("Remapping monitor result %d to %d", rc, remapped_rc); +- if (!node->details->shutdown || node->details->online) { +- record_failed_op(xml_op, node, rsc, data_set); +- } +- } +- return remapped_rc; +-} +- + static void + unpack_rsc_op(pe_resource_t *rsc, pe_node_t *node, xmlNode *xml_op, + xmlNode **last_failure, enum action_fail_response *on_fail, +@@ -3712,7 +3710,7 @@ unpack_rsc_op(pe_resource_t *rsc, pe_node_t *node, xmlNode *xml_op, + node->details->uname, rsc->id); + } + +- /* It should be possible to call remap_monitor_rc() first then call ++ /* It should be possible to call remap_operation() first then call + * check_operation_expiry() only if rc != target_rc, because there should + * never be a fail count without at least one unexpected result in the + * resource history. 
That would be more efficient by avoiding having to call +@@ -3729,9 +3727,8 @@ unpack_rsc_op(pe_resource_t *rsc, pe_node_t *node, xmlNode *xml_op, + expired = true; + } + +- if (!strcmp(task, CRMD_ACTION_STATUS)) { +- rc = remap_monitor_rc(rc, xml_op, node, rsc, data_set); +- } ++ remap_operation(xml_op, rsc, node, data_set, on_fail, target_rc, ++ &rc, &status); + + if (expired && (rc != target_rc)) { + const char *magic = crm_element_value(xml_op, XML_ATTR_TRANSITION_MAGIC); +@@ -3761,16 +3758,6 @@ unpack_rsc_op(pe_resource_t *rsc, pe_node_t *node, xmlNode *xml_op, + } + } + +- /* If the executor reported an operation status of anything but done or +- * error, consider that final. But for done or error, we know better whether +- * it should be treated as a failure or not, because we know the expected +- * result. +- */ +- if(status == PCMK_EXEC_DONE || status == PCMK_EXEC_ERROR) { +- status = determine_op_status(rsc, rc, target_rc, node, xml_op, on_fail, data_set); +- pe_rsc_trace(rsc, "Remapped %s status to %d", task_key, status); +- } +- + switch (status) { + case PCMK_EXEC_CANCELLED: + // Should never happen +-- +2.27.0 + + +From 9fdca1999872b3930cf18b7d807ddb259f23e8a5 Mon Sep 17 00:00:00 2001 +From: Chris Lumens +Date: Fri, 19 Nov 2021 15:08:16 -0500 +Subject: [PATCH 06/21] Test: cts-cli: Add test output for a native resource + with a failed probe op. + +There are no code changes yet to properly handle displaying these +operations, so the results here just reflect the current handling. 
+--- + cts/cli/crm_mon-partial.xml | 16 +++++++++++ + cts/cli/regression.crm_mon.exp | 50 ++++++++++++++++++++++++++-------- + 2 files changed, 55 insertions(+), 11 deletions(-) + +diff --git a/cts/cli/crm_mon-partial.xml b/cts/cli/crm_mon-partial.xml +index e6c6894b6f..b7817e4775 100644 +--- a/cts/cli/crm_mon-partial.xml ++++ b/cts/cli/crm_mon-partial.xml +@@ -60,6 +60,16 @@ + + + ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ + + + +@@ -94,6 +104,9 @@ + + + ++ ++ ++ + + + +@@ -135,6 +148,9 @@ + + + ++ ++ ++ + + + +diff --git a/cts/cli/regression.crm_mon.exp b/cts/cli/regression.crm_mon.exp +index 8714f917a9..d12dce3ae8 100644 +--- a/cts/cli/regression.crm_mon.exp ++++ b/cts/cli/regression.crm_mon.exp +@@ -3470,7 +3470,7 @@ Cluster Summary: + * Last updated: + * Last change: + * 4 nodes configured +- * 13 resource instances configured (1 DISABLED) ++ * 14 resource instances configured (1 DISABLED) + + Node List: + * Online: [ cluster01 cluster02 ] +@@ -3485,6 +3485,9 @@ Active Resources: + * httpd-bundle-1 (192.168.122.132) (ocf:heartbeat:apache): Stopped cluster01 + * Resource Group: partially-active-group (1 member inactive): + * dummy-1 (ocf:pacemaker:Dummy): Started cluster02 ++ ++Failed Resource Actions: ++ * smart-mon probe on cluster02 returned 'not installed' at Tue Nov 9 15:38:55 2021 after 33ms + =#=#=#= End test: Text output of partially active resources - OK (0) =#=#=#= + * Passed: crm_mon - Text output of partially active resources + =#=#=#= Begin test: XML output of partially active resources =#=#=#= +@@ -3495,7 +3498,7 @@ Active Resources: + + + +- ++ + + + +@@ -3548,6 +3551,7 @@ Active Resources: + + + ++ + + + +@@ -3574,6 +3578,9 @@ Active Resources: + + + ++ ++ ++ + + + +@@ -3603,6 +3610,9 @@ Active Resources: + + + ++ ++ ++ + + + =#=#=#= End test: XML output of partially active resources - OK (0) =#=#=#= +@@ -3614,7 +3624,7 @@ Cluster Summary: + * Last updated: + * Last change: + * 4 nodes configured +- * 13 resource instances configured (1 DISABLED) ++ * 14 
resource instances configured (1 DISABLED) + + Node List: + * Online: [ cluster01 cluster02 ] +@@ -3631,6 +3641,10 @@ Full List of Resources: + * Resource Group: partially-active-group: + * dummy-1 (ocf:pacemaker:Dummy): Started cluster02 + * dummy-2 (ocf:pacemaker:Dummy): Stopped (disabled) ++ * smart-mon (ocf:pacemaker:HealthSMART): Stopped ++ ++Failed Resource Actions: ++ * smart-mon probe on cluster02 returned 'not installed' at Tue Nov 9 15:38:55 2021 after 33ms + =#=#=#= End test: Text output of partially active resources, with inactive resources - OK (0) =#=#=#= + * Passed: crm_mon - Text output of partially active resources, with inactive resources + =#=#=#= Begin test: Complete brief text output, with inactive resources =#=#=#= +@@ -3640,13 +3654,14 @@ Cluster Summary: + * Last updated: + * Last change: + * 4 nodes configured +- * 13 resource instances configured (1 DISABLED) ++ * 14 resource instances configured (1 DISABLED) + + Node List: + * Online: [ cluster01 cluster02 ] + * GuestOnline: [ httpd-bundle-0@cluster02 httpd-bundle-1@cluster01 ] + + Full List of Resources: ++ * 0/1 (ocf:pacemaker:HealthSMART): Active + * 1/1 (stonith:fence_xvm): Active cluster01 + * Clone Set: ping-clone [ping]: + * Started: [ cluster01 ] +@@ -3676,6 +3691,8 @@ Operations: + * (3) monitor: interval="30000ms" + * dummy-1: migration-threshold=1000000: + * (2) start ++ * smart-mon: migration-threshold=1000000: ++ * (9) probe + * Node: cluster01: + * Fencing: migration-threshold=1000000: + * (15) start +@@ -3695,6 +3712,9 @@ Operations: + * Node: httpd-bundle-0@cluster02: + * httpd: migration-threshold=1000000: + * (1) start ++ ++Failed Resource Actions: ++ * smart-mon probe on cluster02 returned 'not installed' at Tue Nov 9 15:38:55 2021 after 33ms + =#=#=#= End test: Complete brief text output, with inactive resources - OK (0) =#=#=#= + * Passed: crm_mon - Complete brief text output, with inactive resources + =#=#=#= Begin test: Text output of partially active group =#=#=#= 
+@@ -3704,7 +3724,7 @@ Cluster Summary: + * Last updated: + * Last change: + * 4 nodes configured +- * 13 resource instances configured (1 DISABLED) ++ * 14 resource instances configured (1 DISABLED) + + Node List: + * Online: [ cluster01 cluster02 ] +@@ -3722,7 +3742,7 @@ Cluster Summary: + * Last updated: + * Last change: + * 4 nodes configured +- * 13 resource instances configured (1 DISABLED) ++ * 14 resource instances configured (1 DISABLED) + + Node List: + * Online: [ cluster01 cluster02 ] +@@ -3741,7 +3761,7 @@ Cluster Summary: + * Last updated: + * Last change: + * 4 nodes configured +- * 13 resource instances configured (1 DISABLED) ++ * 14 resource instances configured (1 DISABLED) + + Node List: + * Online: [ cluster01 cluster02 ] +@@ -3759,7 +3779,7 @@ Cluster Summary: + * Last updated: + * Last change: + * 4 nodes configured +- * 13 resource instances configured (1 DISABLED) ++ * 14 resource instances configured (1 DISABLED) + + Node List: + * Online: [ cluster01 cluster02 ] +@@ -3777,7 +3797,7 @@ Cluster Summary: + * Last updated: + * Last change: + * 4 nodes configured +- * 13 resource instances configured (1 DISABLED) ++ * 14 resource instances configured (1 DISABLED) + + Node List: + * Node cluster01: online: +@@ -3806,6 +3826,7 @@ Inactive Resources: + * httpd-bundle-1 (192.168.122.132) (ocf:heartbeat:apache): Stopped cluster01 + * Resource Group: partially-active-group: + * 1/2 (ocf:pacemaker:Dummy): Active cluster02 ++ * smart-mon (ocf:pacemaker:HealthSMART): Stopped + + Node Attributes: + * Node: cluster01: +@@ -3826,6 +3847,8 @@ Operations: + * (3) monitor: interval="30000ms" + * dummy-1: migration-threshold=1000000: + * (2) start ++ * smart-mon: migration-threshold=1000000: ++ * (9) probe + * Node: cluster01: + * Fencing: migration-threshold=1000000: + * (15) start +@@ -3845,6 +3868,9 @@ Operations: + * Node: httpd-bundle-0@cluster02: + * httpd: migration-threshold=1000000: + * (1) start ++ ++Failed Resource Actions: ++ * smart-mon probe on 
cluster02 returned 'not installed' at Tue Nov 9 15:38:55 2021 after 33ms + =#=#=#= End test: Complete brief text output grouped by node, with inactive resources - OK (0) =#=#=#= + * Passed: crm_mon - Complete brief text output grouped by node, with inactive resources + =#=#=#= Begin test: Text output of partially active resources, with inactive resources, filtered by node =#=#=#= +@@ -3854,7 +3880,7 @@ Cluster Summary: + * Last updated: + * Last change: + * 4 nodes configured +- * 13 resource instances configured (1 DISABLED) ++ * 14 resource instances configured (1 DISABLED) + + Node List: + * Online: [ cluster01 ] +@@ -3865,6 +3891,7 @@ Full List of Resources: + * Fencing (stonith:fence_xvm): Started cluster01 + * Container bundle set: httpd-bundle [pcmk:http]: + * httpd-bundle-1 (192.168.122.132) (ocf:heartbeat:apache): Stopped cluster01 ++ * smart-mon (ocf:pacemaker:HealthSMART): Stopped + =#=#=#= End test: Text output of partially active resources, with inactive resources, filtered by node - OK (0) =#=#=#= + * Passed: crm_mon - Text output of partially active resources, with inactive resources, filtered by node + =#=#=#= Begin test: Text output of partially active resources, filtered by node =#=#=#= +@@ -3875,7 +3902,7 @@ Full List of Resources: + + + +- ++ + + + +@@ -3905,6 +3932,7 @@ Full List of Resources: + + + ++ + + + +-- +2.27.0 + + +From 1c54d0bbb74d066d55a56eae28d1a579b8854604 Mon Sep 17 00:00:00 2001 +From: Chris Lumens +Date: Fri, 19 Nov 2021 15:17:52 -0500 +Subject: [PATCH 07/21] Test: cts-cli: Add test output for a cloned resource + with a failed probe op. + +There are no code changes yet to properly handle displaying these +operations, so the results here just reflect the current handling. 
+--- + cts/cli/crm_mon-partial.xml | 3 +++ + cts/cli/regression.crm_mon.exp | 12 ++++++++++++ + 2 files changed, 15 insertions(+) + +diff --git a/cts/cli/crm_mon-partial.xml b/cts/cli/crm_mon-partial.xml +index b7817e4775..1f9dc156aa 100644 +--- a/cts/cli/crm_mon-partial.xml ++++ b/cts/cli/crm_mon-partial.xml +@@ -107,6 +107,9 @@ + + + ++ ++ ++ + + + +diff --git a/cts/cli/regression.crm_mon.exp b/cts/cli/regression.crm_mon.exp +index d12dce3ae8..d093bd8106 100644 +--- a/cts/cli/regression.crm_mon.exp ++++ b/cts/cli/regression.crm_mon.exp +@@ -3488,6 +3488,7 @@ Active Resources: + + Failed Resource Actions: + * smart-mon probe on cluster02 returned 'not installed' at Tue Nov 9 15:38:55 2021 after 33ms ++ * ping probe on cluster02 returned 'not installed' at Thu Nov 18 13:11:42 2021 + =#=#=#= End test: Text output of partially active resources - OK (0) =#=#=#= + * Passed: crm_mon - Text output of partially active resources + =#=#=#= Begin test: XML output of partially active resources =#=#=#= +@@ -3581,6 +3582,9 @@ Failed Resource Actions: + + + ++ ++ ++ + + + +@@ -3612,6 +3616,7 @@ Failed Resource Actions: + + + ++ + + + +@@ -3645,6 +3650,7 @@ Full List of Resources: + + Failed Resource Actions: + * smart-mon probe on cluster02 returned 'not installed' at Tue Nov 9 15:38:55 2021 after 33ms ++ * ping probe on cluster02 returned 'not installed' at Thu Nov 18 13:11:42 2021 + =#=#=#= End test: Text output of partially active resources, with inactive resources - OK (0) =#=#=#= + * Passed: crm_mon - Text output of partially active resources, with inactive resources + =#=#=#= Begin test: Complete brief text output, with inactive resources =#=#=#= +@@ -3693,6 +3699,8 @@ Operations: + * (2) start + * smart-mon: migration-threshold=1000000: + * (9) probe ++ * ping: migration-threshold=1000000: ++ * (6) probe + * Node: cluster01: + * Fencing: migration-threshold=1000000: + * (15) start +@@ -3715,6 +3723,7 @@ Operations: + + Failed Resource Actions: + * smart-mon probe on 
cluster02 returned 'not installed' at Tue Nov 9 15:38:55 2021 after 33ms ++ * ping probe on cluster02 returned 'not installed' at Thu Nov 18 13:11:42 2021 + =#=#=#= End test: Complete brief text output, with inactive resources - OK (0) =#=#=#= + * Passed: crm_mon - Complete brief text output, with inactive resources + =#=#=#= Begin test: Text output of partially active group =#=#=#= +@@ -3849,6 +3858,8 @@ Operations: + * (2) start + * smart-mon: migration-threshold=1000000: + * (9) probe ++ * ping: migration-threshold=1000000: ++ * (6) probe + * Node: cluster01: + * Fencing: migration-threshold=1000000: + * (15) start +@@ -3871,6 +3882,7 @@ Operations: + + Failed Resource Actions: + * smart-mon probe on cluster02 returned 'not installed' at Tue Nov 9 15:38:55 2021 after 33ms ++ * ping probe on cluster02 returned 'not installed' at Thu Nov 18 13:11:42 2021 + =#=#=#= End test: Complete brief text output grouped by node, with inactive resources - OK (0) =#=#=#= + * Passed: crm_mon - Complete brief text output grouped by node, with inactive resources + =#=#=#= Begin test: Text output of partially active resources, with inactive resources, filtered by node =#=#=#= +-- +2.27.0 + + +From 9408f08c07eb531ff84b07bf959f3d681ebf2b78 Mon Sep 17 00:00:00 2001 +From: Chris Lumens +Date: Fri, 19 Nov 2021 15:48:16 -0500 +Subject: [PATCH 08/21] Test: cts-cli: Change the resources in + partially-active-group. + +dummy-2 is now not running because it failed to start due to an +unimplemented feature. I don't know what could possibly be +unimplemented about a dummy resource, but it's not important. + +There is also a new dummy-3 resource that acts exactly the same as +dummy-2. This preserves checking that the inactive member output can +still be displayed. + +There are no code changes yet to properly handle displaying these +operations, so the results here just reflect the current handling. 
+--- + cts/cli/crm_mon-partial.xml | 6 +++- + cts/cli/regression.crm_mon.exp | 62 +++++++++++++++++++++++----------- + 2 files changed, 47 insertions(+), 21 deletions(-) + +diff --git a/cts/cli/crm_mon-partial.xml b/cts/cli/crm_mon-partial.xml +index 1f9dc156aa..1ce80ea58a 100644 +--- a/cts/cli/crm_mon-partial.xml ++++ b/cts/cli/crm_mon-partial.xml +@@ -54,7 +54,8 @@ + + + +- ++ ++ + + + +@@ -104,6 +105,9 @@ + + + ++ ++ ++ + + + +diff --git a/cts/cli/regression.crm_mon.exp b/cts/cli/regression.crm_mon.exp +index d093bd8106..8cf3a1215e 100644 +--- a/cts/cli/regression.crm_mon.exp ++++ b/cts/cli/regression.crm_mon.exp +@@ -3470,7 +3470,7 @@ Cluster Summary: + * Last updated: + * Last change: + * 4 nodes configured +- * 14 resource instances configured (1 DISABLED) ++ * 15 resource instances configured (1 DISABLED) + + Node List: + * Online: [ cluster01 cluster02 ] +@@ -3485,8 +3485,10 @@ Active Resources: + * httpd-bundle-1 (192.168.122.132) (ocf:heartbeat:apache): Stopped cluster01 + * Resource Group: partially-active-group (1 member inactive): + * dummy-1 (ocf:pacemaker:Dummy): Started cluster02 ++ * dummy-2 (ocf:pacemaker:Dummy): FAILED cluster02 + + Failed Resource Actions: ++ * dummy-2 probe on cluster02 returned 'unimplemented feature' at Wed Sep 2 12:17:38 2020 after 33ms + * smart-mon probe on cluster02 returned 'not installed' at Tue Nov 9 15:38:55 2021 after 33ms + * ping probe on cluster02 returned 'not installed' at Thu Nov 18 13:11:42 2021 + =#=#=#= End test: Text output of partially active resources - OK (0) =#=#=#= +@@ -3499,12 +3501,12 @@ Failed Resource Actions: + + + +- ++ + + + + +- ++ + + + +@@ -3546,11 +3548,14 @@ Failed Resource Actions: + + + +- ++ + + + +- ++ ++ ++ ++ + + + +@@ -3579,6 +3584,9 @@ Failed Resource Actions: + + + ++ ++ ++ + + + +@@ -3615,6 +3623,7 @@ Failed Resource Actions: + + + ++ + + + +@@ -3629,7 +3638,7 @@ Cluster Summary: + * Last updated: + * Last change: + * 4 nodes configured +- * 14 resource instances configured (1 
DISABLED) ++ * 15 resource instances configured (1 DISABLED) + + Node List: + * Online: [ cluster01 cluster02 ] +@@ -3645,10 +3654,12 @@ Full List of Resources: + * httpd-bundle-1 (192.168.122.132) (ocf:heartbeat:apache): Stopped cluster01 + * Resource Group: partially-active-group: + * dummy-1 (ocf:pacemaker:Dummy): Started cluster02 +- * dummy-2 (ocf:pacemaker:Dummy): Stopped (disabled) ++ * dummy-2 (ocf:pacemaker:Dummy): FAILED cluster02 ++ * dummy-3 (ocf:pacemaker:Dummy): Stopped (disabled) + * smart-mon (ocf:pacemaker:HealthSMART): Stopped + + Failed Resource Actions: ++ * dummy-2 probe on cluster02 returned 'unimplemented feature' at Wed Sep 2 12:17:38 2020 after 33ms + * smart-mon probe on cluster02 returned 'not installed' at Tue Nov 9 15:38:55 2021 after 33ms + * ping probe on cluster02 returned 'not installed' at Thu Nov 18 13:11:42 2021 + =#=#=#= End test: Text output of partially active resources, with inactive resources - OK (0) =#=#=#= +@@ -3660,7 +3671,7 @@ Cluster Summary: + * Last updated: + * Last change: + * 4 nodes configured +- * 14 resource instances configured (1 DISABLED) ++ * 15 resource instances configured (1 DISABLED) + + Node List: + * Online: [ cluster01 cluster02 ] +@@ -3676,7 +3687,7 @@ Full List of Resources: + * httpd-bundle-0 (192.168.122.131) (ocf:heartbeat:apache): Started cluster02 + * httpd-bundle-1 (192.168.122.132) (ocf:heartbeat:apache): Stopped cluster01 + * Resource Group: partially-active-group: +- * 1/2 (ocf:pacemaker:Dummy): Active cluster02 ++ * 2/3 (ocf:pacemaker:Dummy): Active cluster02 + + Node Attributes: + * Node: cluster01: +@@ -3697,6 +3708,8 @@ Operations: + * (3) monitor: interval="30000ms" + * dummy-1: migration-threshold=1000000: + * (2) start ++ * dummy-2: migration-threshold=1000000: ++ * (2) probe + * smart-mon: migration-threshold=1000000: + * (9) probe + * ping: migration-threshold=1000000: +@@ -3722,6 +3735,7 @@ Operations: + * (1) start + + Failed Resource Actions: ++ * dummy-2 probe on cluster02 
returned 'unimplemented feature' at Wed Sep 2 12:17:38 2020 after 33ms + * smart-mon probe on cluster02 returned 'not installed' at Tue Nov 9 15:38:55 2021 after 33ms + * ping probe on cluster02 returned 'not installed' at Thu Nov 18 13:11:42 2021 + =#=#=#= End test: Complete brief text output, with inactive resources - OK (0) =#=#=#= +@@ -3733,7 +3747,7 @@ Cluster Summary: + * Last updated: + * Last change: + * 4 nodes configured +- * 14 resource instances configured (1 DISABLED) ++ * 15 resource instances configured (1 DISABLED) + + Node List: + * Online: [ cluster01 cluster02 ] +@@ -3742,6 +3756,7 @@ Node List: + Active Resources: + * Resource Group: partially-active-group (1 member inactive): + * dummy-1 (ocf:pacemaker:Dummy): Started cluster02 ++ * dummy-2 (ocf:pacemaker:Dummy): FAILED cluster02 + =#=#=#= End test: Text output of partially active group - OK (0) =#=#=#= + * Passed: crm_mon - Text output of partially active group + =#=#=#= Begin test: Text output of partially active group, with inactive resources =#=#=#= +@@ -3751,7 +3766,7 @@ Cluster Summary: + * Last updated: + * Last change: + * 4 nodes configured +- * 14 resource instances configured (1 DISABLED) ++ * 15 resource instances configured (1 DISABLED) + + Node List: + * Online: [ cluster01 cluster02 ] +@@ -3760,7 +3775,8 @@ Node List: + Full List of Resources: + * Resource Group: partially-active-group: + * dummy-1 (ocf:pacemaker:Dummy): Started cluster02 +- * dummy-2 (ocf:pacemaker:Dummy): Stopped (disabled) ++ * dummy-2 (ocf:pacemaker:Dummy): FAILED cluster02 ++ * dummy-3 (ocf:pacemaker:Dummy): Stopped (disabled) + =#=#=#= End test: Text output of partially active group, with inactive resources - OK (0) =#=#=#= + * Passed: crm_mon - Text output of partially active group, with inactive resources + =#=#=#= Begin test: Text output of active member of partially active group =#=#=#= +@@ -3770,7 +3786,7 @@ Cluster Summary: + * Last updated: + * Last change: + * 4 nodes configured +- * 14 resource 
instances configured (1 DISABLED) ++ * 15 resource instances configured (1 DISABLED) + + Node List: + * Online: [ cluster01 cluster02 ] +@@ -3788,7 +3804,7 @@ Cluster Summary: + * Last updated: + * Last change: + * 4 nodes configured +- * 14 resource instances configured (1 DISABLED) ++ * 15 resource instances configured (1 DISABLED) + + Node List: + * Online: [ cluster01 cluster02 ] +@@ -3796,7 +3812,10 @@ Node List: + + Active Resources: + * Resource Group: partially-active-group (1 member inactive): +- * dummy-2 (ocf:pacemaker:Dummy): Stopped (disabled) ++ * dummy-2 (ocf:pacemaker:Dummy): FAILED cluster02 ++ ++Failed Resource Actions: ++ * dummy-2 probe on cluster02 returned 'unimplemented feature' at Wed Sep 2 12:17:38 2020 after 33ms + =#=#=#= End test: Text output of inactive member of partially active group - OK (0) =#=#=#= + * Passed: crm_mon - Text output of inactive member of partially active group + =#=#=#= Begin test: Complete brief text output grouped by node, with inactive resources =#=#=#= +@@ -3806,7 +3825,7 @@ Cluster Summary: + * Last updated: + * Last change: + * 4 nodes configured +- * 14 resource instances configured (1 DISABLED) ++ * 15 resource instances configured (1 DISABLED) + + Node List: + * Node cluster01: online: +@@ -3820,7 +3839,7 @@ Node List: + * Resources: + * 1 (ocf:heartbeat:IPaddr2): Active + * 1 (ocf:heartbeat:docker): Active +- * 1 (ocf:pacemaker:Dummy): Active ++ * 2 (ocf:pacemaker:Dummy): Active + * 1 (ocf:pacemaker:remote): Active + * GuestNode httpd-bundle-0@cluster02: online: + * Resources: +@@ -3834,7 +3853,7 @@ Inactive Resources: + * httpd-bundle-0 (192.168.122.131) (ocf:heartbeat:apache): Started cluster02 + * httpd-bundle-1 (192.168.122.132) (ocf:heartbeat:apache): Stopped cluster01 + * Resource Group: partially-active-group: +- * 1/2 (ocf:pacemaker:Dummy): Active cluster02 ++ * 2/3 (ocf:pacemaker:Dummy): Active cluster02 + * smart-mon (ocf:pacemaker:HealthSMART): Stopped + + Node Attributes: +@@ -3856,6 +3875,8 @@ 
Operations: + * (3) monitor: interval="30000ms" + * dummy-1: migration-threshold=1000000: + * (2) start ++ * dummy-2: migration-threshold=1000000: ++ * (2) probe + * smart-mon: migration-threshold=1000000: + * (9) probe + * ping: migration-threshold=1000000: +@@ -3881,6 +3902,7 @@ Operations: + * (1) start + + Failed Resource Actions: ++ * dummy-2 probe on cluster02 returned 'unimplemented feature' at Wed Sep 2 12:17:38 2020 after 33ms + * smart-mon probe on cluster02 returned 'not installed' at Tue Nov 9 15:38:55 2021 after 33ms + * ping probe on cluster02 returned 'not installed' at Thu Nov 18 13:11:42 2021 + =#=#=#= End test: Complete brief text output grouped by node, with inactive resources - OK (0) =#=#=#= +@@ -3892,7 +3914,7 @@ Cluster Summary: + * Last updated: + * Last change: + * 4 nodes configured +- * 14 resource instances configured (1 DISABLED) ++ * 15 resource instances configured (1 DISABLED) + + Node List: + * Online: [ cluster01 ] +@@ -3914,7 +3936,7 @@ Full List of Resources: + + + +- ++ + + + +-- +2.27.0 + + +From 85e76b8bdb4de261a9cb4858eeedd49fba0346a1 Mon Sep 17 00:00:00 2001 +From: Chris Lumens +Date: Fri, 19 Nov 2021 15:55:51 -0500 +Subject: [PATCH 09/21] Test: cts-cli: Add a failed probe on a new dummy-4 + resource. + +This is to verify that these resources which are part of a group are +displayed properly. No code changes will be necessary, since groups are +just several other resources all in the same pile. + +There are no code changes yet to properly handle displaying these +operations, so the results here just reflect the current handling. 
+--- + cts/cli/crm_mon-partial.xml | 4 +++ + cts/cli/regression.crm_mon.exp | 51 ++++++++++++++++++++++------------ + 2 files changed, 37 insertions(+), 18 deletions(-) + +diff --git a/cts/cli/crm_mon-partial.xml b/cts/cli/crm_mon-partial.xml +index 1ce80ea58a..d4d4a70848 100644 +--- a/cts/cli/crm_mon-partial.xml ++++ b/cts/cli/crm_mon-partial.xml +@@ -60,6 +60,7 @@ + + + ++ + + + +@@ -108,6 +109,9 @@ + + + ++ ++ ++ + + + +diff --git a/cts/cli/regression.crm_mon.exp b/cts/cli/regression.crm_mon.exp +index 8cf3a1215e..c524b199e3 100644 +--- a/cts/cli/regression.crm_mon.exp ++++ b/cts/cli/regression.crm_mon.exp +@@ -3470,7 +3470,7 @@ Cluster Summary: + * Last updated: + * Last change: + * 4 nodes configured +- * 15 resource instances configured (1 DISABLED) ++ * 16 resource instances configured (1 DISABLED) + + Node List: + * Online: [ cluster01 cluster02 ] +@@ -3483,12 +3483,13 @@ Active Resources: + * Container bundle set: httpd-bundle [pcmk:http]: + * httpd-bundle-0 (192.168.122.131) (ocf:heartbeat:apache): Started cluster02 + * httpd-bundle-1 (192.168.122.132) (ocf:heartbeat:apache): Stopped cluster01 +- * Resource Group: partially-active-group (1 member inactive): ++ * Resource Group: partially-active-group (2 members inactive): + * dummy-1 (ocf:pacemaker:Dummy): Started cluster02 + * dummy-2 (ocf:pacemaker:Dummy): FAILED cluster02 + + Failed Resource Actions: + * dummy-2 probe on cluster02 returned 'unimplemented feature' at Wed Sep 2 12:17:38 2020 after 33ms ++ * dummy-4 probe on cluster02 returned 'not installed' at Wed Sep 2 12:17:38 2020 + * smart-mon probe on cluster02 returned 'not installed' at Tue Nov 9 15:38:55 2021 after 33ms + * ping probe on cluster02 returned 'not installed' at Thu Nov 18 13:11:42 2021 + =#=#=#= End test: Text output of partially active resources - OK (0) =#=#=#= +@@ -3501,7 +3502,7 @@ Failed Resource Actions: + + + +- ++ + + + +@@ -3548,7 +3549,7 @@ Failed Resource Actions: + + + +- ++ + + + +@@ -3556,6 +3557,7 @@ Failed Resource 
Actions: + + + ++ + + + +@@ -3587,6 +3589,9 @@ Failed Resource Actions: + + + ++ ++ ++ + + + +@@ -3624,6 +3629,7 @@ Failed Resource Actions: + + + ++ + + + +@@ -3638,7 +3644,7 @@ Cluster Summary: + * Last updated: + * Last change: + * 4 nodes configured +- * 15 resource instances configured (1 DISABLED) ++ * 16 resource instances configured (1 DISABLED) + + Node List: + * Online: [ cluster01 cluster02 ] +@@ -3656,10 +3662,12 @@ Full List of Resources: + * dummy-1 (ocf:pacemaker:Dummy): Started cluster02 + * dummy-2 (ocf:pacemaker:Dummy): FAILED cluster02 + * dummy-3 (ocf:pacemaker:Dummy): Stopped (disabled) ++ * dummy-4 (ocf:pacemaker:Dummy): Stopped + * smart-mon (ocf:pacemaker:HealthSMART): Stopped + + Failed Resource Actions: + * dummy-2 probe on cluster02 returned 'unimplemented feature' at Wed Sep 2 12:17:38 2020 after 33ms ++ * dummy-4 probe on cluster02 returned 'not installed' at Wed Sep 2 12:17:38 2020 + * smart-mon probe on cluster02 returned 'not installed' at Tue Nov 9 15:38:55 2021 after 33ms + * ping probe on cluster02 returned 'not installed' at Thu Nov 18 13:11:42 2021 + =#=#=#= End test: Text output of partially active resources, with inactive resources - OK (0) =#=#=#= +@@ -3671,7 +3679,7 @@ Cluster Summary: + * Last updated: + * Last change: + * 4 nodes configured +- * 15 resource instances configured (1 DISABLED) ++ * 16 resource instances configured (1 DISABLED) + + Node List: + * Online: [ cluster01 cluster02 ] +@@ -3687,7 +3695,7 @@ Full List of Resources: + * httpd-bundle-0 (192.168.122.131) (ocf:heartbeat:apache): Started cluster02 + * httpd-bundle-1 (192.168.122.132) (ocf:heartbeat:apache): Stopped cluster01 + * Resource Group: partially-active-group: +- * 2/3 (ocf:pacemaker:Dummy): Active cluster02 ++ * 2/4 (ocf:pacemaker:Dummy): Active cluster02 + + Node Attributes: + * Node: cluster01: +@@ -3710,6 +3718,8 @@ Operations: + * (2) start + * dummy-2: migration-threshold=1000000: + * (2) probe ++ * dummy-4: migration-threshold=1000000: ++ * 
(2) probe + * smart-mon: migration-threshold=1000000: + * (9) probe + * ping: migration-threshold=1000000: +@@ -3736,6 +3746,7 @@ Operations: + + Failed Resource Actions: + * dummy-2 probe on cluster02 returned 'unimplemented feature' at Wed Sep 2 12:17:38 2020 after 33ms ++ * dummy-4 probe on cluster02 returned 'not installed' at Wed Sep 2 12:17:38 2020 + * smart-mon probe on cluster02 returned 'not installed' at Tue Nov 9 15:38:55 2021 after 33ms + * ping probe on cluster02 returned 'not installed' at Thu Nov 18 13:11:42 2021 + =#=#=#= End test: Complete brief text output, with inactive resources - OK (0) =#=#=#= +@@ -3747,14 +3758,14 @@ Cluster Summary: + * Last updated: + * Last change: + * 4 nodes configured +- * 15 resource instances configured (1 DISABLED) ++ * 16 resource instances configured (1 DISABLED) + + Node List: + * Online: [ cluster01 cluster02 ] + * GuestOnline: [ httpd-bundle-0@cluster02 httpd-bundle-1@cluster01 ] + + Active Resources: +- * Resource Group: partially-active-group (1 member inactive): ++ * Resource Group: partially-active-group (2 members inactive): + * dummy-1 (ocf:pacemaker:Dummy): Started cluster02 + * dummy-2 (ocf:pacemaker:Dummy): FAILED cluster02 + =#=#=#= End test: Text output of partially active group - OK (0) =#=#=#= +@@ -3766,7 +3777,7 @@ Cluster Summary: + * Last updated: + * Last change: + * 4 nodes configured +- * 15 resource instances configured (1 DISABLED) ++ * 16 resource instances configured (1 DISABLED) + + Node List: + * Online: [ cluster01 cluster02 ] +@@ -3777,6 +3788,7 @@ Full List of Resources: + * dummy-1 (ocf:pacemaker:Dummy): Started cluster02 + * dummy-2 (ocf:pacemaker:Dummy): FAILED cluster02 + * dummy-3 (ocf:pacemaker:Dummy): Stopped (disabled) ++ * dummy-4 (ocf:pacemaker:Dummy): Stopped + =#=#=#= End test: Text output of partially active group, with inactive resources - OK (0) =#=#=#= + * Passed: crm_mon - Text output of partially active group, with inactive resources + =#=#=#= Begin test: Text output 
of active member of partially active group =#=#=#= +@@ -3786,14 +3798,14 @@ Cluster Summary: + * Last updated: + * Last change: + * 4 nodes configured +- * 15 resource instances configured (1 DISABLED) ++ * 16 resource instances configured (1 DISABLED) + + Node List: + * Online: [ cluster01 cluster02 ] + * GuestOnline: [ httpd-bundle-0@cluster02 httpd-bundle-1@cluster01 ] + + Active Resources: +- * Resource Group: partially-active-group (1 member inactive): ++ * Resource Group: partially-active-group (2 members inactive): + * dummy-1 (ocf:pacemaker:Dummy): Started cluster02 + =#=#=#= End test: Text output of active member of partially active group - OK (0) =#=#=#= + * Passed: crm_mon - Text output of active member of partially active group +@@ -3804,14 +3816,14 @@ Cluster Summary: + * Last updated: + * Last change: + * 4 nodes configured +- * 15 resource instances configured (1 DISABLED) ++ * 16 resource instances configured (1 DISABLED) + + Node List: + * Online: [ cluster01 cluster02 ] + * GuestOnline: [ httpd-bundle-0@cluster02 httpd-bundle-1@cluster01 ] + + Active Resources: +- * Resource Group: partially-active-group (1 member inactive): ++ * Resource Group: partially-active-group (2 members inactive): + * dummy-2 (ocf:pacemaker:Dummy): FAILED cluster02 + + Failed Resource Actions: +@@ -3825,7 +3837,7 @@ Cluster Summary: + * Last updated: + * Last change: + * 4 nodes configured +- * 15 resource instances configured (1 DISABLED) ++ * 16 resource instances configured (1 DISABLED) + + Node List: + * Node cluster01: online: +@@ -3853,7 +3865,7 @@ Inactive Resources: + * httpd-bundle-0 (192.168.122.131) (ocf:heartbeat:apache): Started cluster02 + * httpd-bundle-1 (192.168.122.132) (ocf:heartbeat:apache): Stopped cluster01 + * Resource Group: partially-active-group: +- * 2/3 (ocf:pacemaker:Dummy): Active cluster02 ++ * 2/4 (ocf:pacemaker:Dummy): Active cluster02 + * smart-mon (ocf:pacemaker:HealthSMART): Stopped + + Node Attributes: +@@ -3877,6 +3889,8 @@ 
Operations: + * (2) start + * dummy-2: migration-threshold=1000000: + * (2) probe ++ * dummy-4: migration-threshold=1000000: ++ * (2) probe + * smart-mon: migration-threshold=1000000: + * (9) probe + * ping: migration-threshold=1000000: +@@ -3903,6 +3917,7 @@ Operations: + + Failed Resource Actions: + * dummy-2 probe on cluster02 returned 'unimplemented feature' at Wed Sep 2 12:17:38 2020 after 33ms ++ * dummy-4 probe on cluster02 returned 'not installed' at Wed Sep 2 12:17:38 2020 + * smart-mon probe on cluster02 returned 'not installed' at Tue Nov 9 15:38:55 2021 after 33ms + * ping probe on cluster02 returned 'not installed' at Thu Nov 18 13:11:42 2021 + =#=#=#= End test: Complete brief text output grouped by node, with inactive resources - OK (0) =#=#=#= +@@ -3914,7 +3929,7 @@ Cluster Summary: + * Last updated: + * Last change: + * 4 nodes configured +- * 15 resource instances configured (1 DISABLED) ++ * 16 resource instances configured (1 DISABLED) + + Node List: + * Online: [ cluster01 ] +@@ -3936,7 +3951,7 @@ Full List of Resources: + + + +- ++ + + + +-- +2.27.0 + + +From 206d733b6ce8e0ffcad243d282e8baa8c3ff72b4 Mon Sep 17 00:00:00 2001 +From: Chris Lumens +Date: Tue, 23 Nov 2021 14:33:47 -0500 +Subject: [PATCH 10/21] Test: cts-cli: Add test output for a bundle resource + with a failed probe op. + +This just changes the existing failed bundle resource from not starting +to failing with a reason. + +There are no code changes yet to properly handle displaying these +operations, so the results here just reflect the current handling. 
+--- + cts/cli/crm_mon-partial.xml | 9 ++++++++ + cts/cli/regression.crm_mon.exp | 40 +++++++++++++++++++++++++--------- + 2 files changed, 39 insertions(+), 10 deletions(-) + +diff --git a/cts/cli/crm_mon-partial.xml b/cts/cli/crm_mon-partial.xml +index d4d4a70848..5981fc653c 100644 +--- a/cts/cli/crm_mon-partial.xml ++++ b/cts/cli/crm_mon-partial.xml +@@ -178,5 +178,14 @@ + + + ++ ++ ++ ++ ++ ++ ++ ++ ++ + + +diff --git a/cts/cli/regression.crm_mon.exp b/cts/cli/regression.crm_mon.exp +index c524b199e3..b690a26fb6 100644 +--- a/cts/cli/regression.crm_mon.exp ++++ b/cts/cli/regression.crm_mon.exp +@@ -3482,7 +3482,7 @@ Active Resources: + * Fencing (stonith:fence_xvm): Started cluster01 + * Container bundle set: httpd-bundle [pcmk:http]: + * httpd-bundle-0 (192.168.122.131) (ocf:heartbeat:apache): Started cluster02 +- * httpd-bundle-1 (192.168.122.132) (ocf:heartbeat:apache): Stopped cluster01 ++ * httpd-bundle-1 (192.168.122.132) (ocf:heartbeat:apache): FAILED cluster01 + * Resource Group: partially-active-group (2 members inactive): + * dummy-1 (ocf:pacemaker:Dummy): Started cluster02 + * dummy-2 (ocf:pacemaker:Dummy): FAILED cluster02 +@@ -3492,6 +3492,7 @@ Failed Resource Actions: + * dummy-4 probe on cluster02 returned 'not installed' at Wed Sep 2 12:17:38 2020 + * smart-mon probe on cluster02 returned 'not installed' at Tue Nov 9 15:38:55 2021 after 33ms + * ping probe on cluster02 returned 'not installed' at Thu Nov 18 13:11:42 2021 ++ * httpd probe on httpd-bundle-1 returned 'invalid parameter' at Wed May 27 15:43:09 2020 + =#=#=#= End test: Text output of partially active resources - OK (0) =#=#=#= + * Passed: crm_mon - Text output of partially active resources + =#=#=#= Begin test: XML output of partially active resources =#=#=#= +@@ -3509,7 +3510,7 @@ Failed Resource Actions: + + + +- ++ + + + +@@ -3540,7 +3541,9 @@ Failed Resource Actions: + + + +- ++ ++ ++ + + + +@@ -3626,12 +3629,18 @@ Failed Resource Actions: + + + ++ ++ ++ ++ ++ + + + + + + ++ + + 
+ +@@ -3657,7 +3666,7 @@ Full List of Resources: + * Fencing (stonith:fence_xvm): Started cluster01 + * Container bundle set: httpd-bundle [pcmk:http]: + * httpd-bundle-0 (192.168.122.131) (ocf:heartbeat:apache): Started cluster02 +- * httpd-bundle-1 (192.168.122.132) (ocf:heartbeat:apache): Stopped cluster01 ++ * httpd-bundle-1 (192.168.122.132) (ocf:heartbeat:apache): FAILED cluster01 + * Resource Group: partially-active-group: + * dummy-1 (ocf:pacemaker:Dummy): Started cluster02 + * dummy-2 (ocf:pacemaker:Dummy): FAILED cluster02 +@@ -3670,6 +3679,7 @@ Failed Resource Actions: + * dummy-4 probe on cluster02 returned 'not installed' at Wed Sep 2 12:17:38 2020 + * smart-mon probe on cluster02 returned 'not installed' at Tue Nov 9 15:38:55 2021 after 33ms + * ping probe on cluster02 returned 'not installed' at Thu Nov 18 13:11:42 2021 ++ * httpd probe on httpd-bundle-1 returned 'invalid parameter' at Wed May 27 15:43:09 2020 + =#=#=#= End test: Text output of partially active resources, with inactive resources - OK (0) =#=#=#= + * Passed: crm_mon - Text output of partially active resources, with inactive resources + =#=#=#= Begin test: Complete brief text output, with inactive resources =#=#=#= +@@ -3693,7 +3703,7 @@ Full List of Resources: + * Stopped: [ cluster02 ] + * Container bundle set: httpd-bundle [pcmk:http]: + * httpd-bundle-0 (192.168.122.131) (ocf:heartbeat:apache): Started cluster02 +- * httpd-bundle-1 (192.168.122.132) (ocf:heartbeat:apache): Stopped cluster01 ++ * httpd-bundle-1 (192.168.122.132) (ocf:heartbeat:apache): FAILED cluster01 + * Resource Group: partially-active-group: + * 2/4 (ocf:pacemaker:Dummy): Active cluster02 + +@@ -3743,12 +3753,16 @@ Operations: + * Node: httpd-bundle-0@cluster02: + * httpd: migration-threshold=1000000: + * (1) start ++ * Node: httpd-bundle-1@cluster01: ++ * httpd: migration-threshold=1000000: ++ * (1) probe + + Failed Resource Actions: + * dummy-2 probe on cluster02 returned 'unimplemented feature' at Wed Sep 2 
12:17:38 2020 after 33ms + * dummy-4 probe on cluster02 returned 'not installed' at Wed Sep 2 12:17:38 2020 + * smart-mon probe on cluster02 returned 'not installed' at Tue Nov 9 15:38:55 2021 after 33ms + * ping probe on cluster02 returned 'not installed' at Thu Nov 18 13:11:42 2021 ++ * httpd probe on httpd-bundle-1 returned 'invalid parameter' at Wed May 27 15:43:09 2020 + =#=#=#= End test: Complete brief text output, with inactive resources - OK (0) =#=#=#= + * Passed: crm_mon - Complete brief text output, with inactive resources + =#=#=#= Begin test: Text output of partially active group =#=#=#= +@@ -3856,14 +3870,14 @@ Node List: + * GuestNode httpd-bundle-0@cluster02: online: + * Resources: + * 1 (ocf:heartbeat:apache): Active ++ * GuestNode httpd-bundle-1@cluster01: online: ++ * Resources: ++ * 1 (ocf:heartbeat:apache): Active + + Inactive Resources: + * Clone Set: ping-clone [ping]: + * Started: [ cluster01 ] + * Stopped: [ cluster02 ] +- * Container bundle set: httpd-bundle [pcmk:http]: +- * httpd-bundle-0 (192.168.122.131) (ocf:heartbeat:apache): Started cluster02 +- * httpd-bundle-1 (192.168.122.132) (ocf:heartbeat:apache): Stopped cluster01 + * Resource Group: partially-active-group: + * 2/4 (ocf:pacemaker:Dummy): Active cluster02 + * smart-mon (ocf:pacemaker:HealthSMART): Stopped +@@ -3914,12 +3928,16 @@ Operations: + * Node: httpd-bundle-0@cluster02: + * httpd: migration-threshold=1000000: + * (1) start ++ * Node: httpd-bundle-1@cluster01: ++ * httpd: migration-threshold=1000000: ++ * (1) probe + + Failed Resource Actions: + * dummy-2 probe on cluster02 returned 'unimplemented feature' at Wed Sep 2 12:17:38 2020 after 33ms + * dummy-4 probe on cluster02 returned 'not installed' at Wed Sep 2 12:17:38 2020 + * smart-mon probe on cluster02 returned 'not installed' at Tue Nov 9 15:38:55 2021 after 33ms + * ping probe on cluster02 returned 'not installed' at Thu Nov 18 13:11:42 2021 ++ * httpd probe on httpd-bundle-1 returned 'invalid parameter' at Wed 
May 27 15:43:09 2020 + =#=#=#= End test: Complete brief text output grouped by node, with inactive resources - OK (0) =#=#=#= + * Passed: crm_mon - Complete brief text output grouped by node, with inactive resources + =#=#=#= Begin test: Text output of partially active resources, with inactive resources, filtered by node =#=#=#= +@@ -3939,7 +3957,7 @@ Full List of Resources: + * Started: [ cluster01 ] + * Fencing (stonith:fence_xvm): Started cluster01 + * Container bundle set: httpd-bundle [pcmk:http]: +- * httpd-bundle-1 (192.168.122.132) (ocf:heartbeat:apache): Stopped cluster01 ++ * httpd-bundle-1 (192.168.122.132) (ocf:heartbeat:apache): FAILED cluster01 + * smart-mon (ocf:pacemaker:HealthSMART): Stopped + =#=#=#= End test: Text output of partially active resources, with inactive resources, filtered by node - OK (0) =#=#=#= + * Passed: crm_mon - Text output of partially active resources, with inactive resources, filtered by node +@@ -3972,7 +3990,9 @@ Full List of Resources: + + + +- ++ ++ ++ + + + +-- +2.27.0 + + +From 6240a28d36c0349e3b1d7f52c36106580c53bb01 Mon Sep 17 00:00:00 2001 +From: Chris Lumens +Date: Mon, 22 Nov 2021 10:59:10 -0500 +Subject: [PATCH 11/21] Test: cts: Add --show-detail to a couple of the crm_mon + tests. + +This straightens out a couple differences in output between running +tests locally (where --enable-compat-2.0 is not given, which would +automatically add --show-detail) and running tests under mock (where +that option is given). + +Note that this only really matters for failed resource actions, which +were not previously output as part of any crm_mon regression test. It +is only the patches in this series that have introduced those, and thus +this difference. 
+--- + cts/cli/regression.crm_mon.exp | 131 ++++++++++++++++++++------------- + cts/cts-cli.in | 10 +-- + 2 files changed, 83 insertions(+), 58 deletions(-) + +diff --git a/cts/cli/regression.crm_mon.exp b/cts/cli/regression.crm_mon.exp +index b690a26fb6..d7b9d98e2c 100644 +--- a/cts/cli/regression.crm_mon.exp ++++ b/cts/cli/regression.crm_mon.exp +@@ -3466,33 +3466,42 @@ Operations: + =#=#=#= Begin test: Text output of partially active resources =#=#=#= + Cluster Summary: + * Stack: corosync +- * Current DC: cluster02 (version) - partition with quorum ++ * Current DC: cluster02 (2) (version) - partition with quorum + * Last updated: + * Last change: + * 4 nodes configured + * 16 resource instances configured (1 DISABLED) + + Node List: +- * Online: [ cluster01 cluster02 ] ++ * Online: [ cluster01 (1) cluster02 (2) ] + * GuestOnline: [ httpd-bundle-0@cluster02 httpd-bundle-1@cluster01 ] + + Active Resources: + * Clone Set: ping-clone [ping]: +- * Started: [ cluster01 ] ++ * ping (ocf:pacemaker:ping): Started cluster01 ++ * ping (ocf:pacemaker:ping): Stopped + * Fencing (stonith:fence_xvm): Started cluster01 + * Container bundle set: httpd-bundle [pcmk:http]: +- * httpd-bundle-0 (192.168.122.131) (ocf:heartbeat:apache): Started cluster02 +- * httpd-bundle-1 (192.168.122.132) (ocf:heartbeat:apache): FAILED cluster01 ++ * Replica[0] ++ * httpd-bundle-ip-192.168.122.131 (ocf:heartbeat:IPaddr2): Started cluster02 ++ * httpd (ocf:heartbeat:apache): Started httpd-bundle-0 ++ * httpd-bundle-docker-0 (ocf:heartbeat:docker): Started cluster02 ++ * httpd-bundle-0 (ocf:pacemaker:remote): Started cluster02 ++ * Replica[1] ++ * httpd-bundle-ip-192.168.122.132 (ocf:heartbeat:IPaddr2): Started cluster01 ++ * httpd (ocf:heartbeat:apache): FAILED httpd-bundle-1 ++ * httpd-bundle-docker-1 (ocf:heartbeat:docker): Started cluster01 ++ * httpd-bundle-1 (ocf:pacemaker:remote): Started cluster01 + * Resource Group: partially-active-group (2 members inactive): + * dummy-1 
(ocf:pacemaker:Dummy): Started cluster02 + * dummy-2 (ocf:pacemaker:Dummy): FAILED cluster02 + + Failed Resource Actions: +- * dummy-2 probe on cluster02 returned 'unimplemented feature' at Wed Sep 2 12:17:38 2020 after 33ms +- * dummy-4 probe on cluster02 returned 'not installed' at Wed Sep 2 12:17:38 2020 +- * smart-mon probe on cluster02 returned 'not installed' at Tue Nov 9 15:38:55 2021 after 33ms +- * ping probe on cluster02 returned 'not installed' at Thu Nov 18 13:11:42 2021 +- * httpd probe on httpd-bundle-1 returned 'invalid parameter' at Wed May 27 15:43:09 2020 ++ * dummy-2_monitor_0 on cluster02 'unimplemented feature' (3): call=2, status='complete', last-rc-change='Wed Sep 2 12:17:38 2020', queued=0ms, exec=33ms ++ * dummy-4_monitor_0 on cluster02 'not installed' (5): call=2, status='complete', last-rc-change='Wed Sep 2 12:17:38 2020', queued=0ms, exec=0ms ++ * smart-mon_monitor_0 on cluster02 'not installed' (5): call=9, status='complete', last-rc-change='Tue Nov 9 15:38:55 2021', queued=0ms, exec=33ms ++ * ping_monitor_0 on cluster02 'not installed' (5): call=6, status='complete', last-rc-change='Thu Nov 18 13:11:42 2021', queued=0ms, exec=0ms ++ * httpd_monitor_0 on httpd-bundle-1 'invalid parameter' (2): call=1, status='complete', last-rc-change='Wed May 27 15:43:09 2020', queued=0ms, exec=0ms + =#=#=#= End test: Text output of partially active resources - OK (0) =#=#=#= + * Passed: crm_mon - Text output of partially active resources + =#=#=#= Begin test: XML output of partially active resources =#=#=#= +@@ -3649,24 +3658,32 @@ Failed Resource Actions: + =#=#=#= Begin test: Text output of partially active resources, with inactive resources =#=#=#= + Cluster Summary: + * Stack: corosync +- * Current DC: cluster02 (version) - partition with quorum ++ * Current DC: cluster02 (2) (version) - partition with quorum + * Last updated: + * Last change: + * 4 nodes configured + * 16 resource instances configured (1 DISABLED) + + Node List: +- * Online: [ 
cluster01 cluster02 ] ++ * Online: [ cluster01 (1) cluster02 (2) ] + * GuestOnline: [ httpd-bundle-0@cluster02 httpd-bundle-1@cluster01 ] + + Full List of Resources: + * Clone Set: ping-clone [ping]: +- * Started: [ cluster01 ] +- * Stopped: [ cluster02 ] ++ * ping (ocf:pacemaker:ping): Started cluster01 ++ * ping (ocf:pacemaker:ping): Stopped + * Fencing (stonith:fence_xvm): Started cluster01 + * Container bundle set: httpd-bundle [pcmk:http]: +- * httpd-bundle-0 (192.168.122.131) (ocf:heartbeat:apache): Started cluster02 +- * httpd-bundle-1 (192.168.122.132) (ocf:heartbeat:apache): FAILED cluster01 ++ * Replica[0] ++ * httpd-bundle-ip-192.168.122.131 (ocf:heartbeat:IPaddr2): Started cluster02 ++ * httpd (ocf:heartbeat:apache): Started httpd-bundle-0 ++ * httpd-bundle-docker-0 (ocf:heartbeat:docker): Started cluster02 ++ * httpd-bundle-0 (ocf:pacemaker:remote): Started cluster02 ++ * Replica[1] ++ * httpd-bundle-ip-192.168.122.132 (ocf:heartbeat:IPaddr2): Started cluster01 ++ * httpd (ocf:heartbeat:apache): FAILED httpd-bundle-1 ++ * httpd-bundle-docker-1 (ocf:heartbeat:docker): Started cluster01 ++ * httpd-bundle-1 (ocf:pacemaker:remote): Started cluster01 + * Resource Group: partially-active-group: + * dummy-1 (ocf:pacemaker:Dummy): Started cluster02 + * dummy-2 (ocf:pacemaker:Dummy): FAILED cluster02 +@@ -3675,46 +3692,54 @@ Full List of Resources: + * smart-mon (ocf:pacemaker:HealthSMART): Stopped + + Failed Resource Actions: +- * dummy-2 probe on cluster02 returned 'unimplemented feature' at Wed Sep 2 12:17:38 2020 after 33ms +- * dummy-4 probe on cluster02 returned 'not installed' at Wed Sep 2 12:17:38 2020 +- * smart-mon probe on cluster02 returned 'not installed' at Tue Nov 9 15:38:55 2021 after 33ms +- * ping probe on cluster02 returned 'not installed' at Thu Nov 18 13:11:42 2021 +- * httpd probe on httpd-bundle-1 returned 'invalid parameter' at Wed May 27 15:43:09 2020 ++ * dummy-2_monitor_0 on cluster02 'unimplemented feature' (3): call=2, 
status='complete', last-rc-change='Wed Sep 2 12:17:38 2020', queued=0ms, exec=33ms ++ * dummy-4_monitor_0 on cluster02 'not installed' (5): call=2, status='complete', last-rc-change='Wed Sep 2 12:17:38 2020', queued=0ms, exec=0ms ++ * smart-mon_monitor_0 on cluster02 'not installed' (5): call=9, status='complete', last-rc-change='Tue Nov 9 15:38:55 2021', queued=0ms, exec=33ms ++ * ping_monitor_0 on cluster02 'not installed' (5): call=6, status='complete', last-rc-change='Thu Nov 18 13:11:42 2021', queued=0ms, exec=0ms ++ * httpd_monitor_0 on httpd-bundle-1 'invalid parameter' (2): call=1, status='complete', last-rc-change='Wed May 27 15:43:09 2020', queued=0ms, exec=0ms + =#=#=#= End test: Text output of partially active resources, with inactive resources - OK (0) =#=#=#= + * Passed: crm_mon - Text output of partially active resources, with inactive resources + =#=#=#= Begin test: Complete brief text output, with inactive resources =#=#=#= + Cluster Summary: + * Stack: corosync +- * Current DC: cluster02 (version) - partition with quorum ++ * Current DC: cluster02 (2) (version) - partition with quorum + * Last updated: + * Last change: + * 4 nodes configured + * 16 resource instances configured (1 DISABLED) + + Node List: +- * Online: [ cluster01 cluster02 ] ++ * Online: [ cluster01 (1) cluster02 (2) ] + * GuestOnline: [ httpd-bundle-0@cluster02 httpd-bundle-1@cluster01 ] + + Full List of Resources: + * 0/1 (ocf:pacemaker:HealthSMART): Active + * 1/1 (stonith:fence_xvm): Active cluster01 + * Clone Set: ping-clone [ping]: +- * Started: [ cluster01 ] +- * Stopped: [ cluster02 ] ++ * ping (ocf:pacemaker:ping): Started cluster01 ++ * ping (ocf:pacemaker:ping): Stopped + * Container bundle set: httpd-bundle [pcmk:http]: +- * httpd-bundle-0 (192.168.122.131) (ocf:heartbeat:apache): Started cluster02 +- * httpd-bundle-1 (192.168.122.132) (ocf:heartbeat:apache): FAILED cluster01 ++ * Replica[0] ++ * httpd-bundle-ip-192.168.122.131 (ocf:heartbeat:IPaddr2): Started 
cluster02 ++ * httpd (ocf:heartbeat:apache): Started httpd-bundle-0 ++ * httpd-bundle-docker-0 (ocf:heartbeat:docker): Started cluster02 ++ * httpd-bundle-0 (ocf:pacemaker:remote): Started cluster02 ++ * Replica[1] ++ * httpd-bundle-ip-192.168.122.132 (ocf:heartbeat:IPaddr2): Started cluster01 ++ * httpd (ocf:heartbeat:apache): FAILED httpd-bundle-1 ++ * httpd-bundle-docker-1 (ocf:heartbeat:docker): Started cluster01 ++ * httpd-bundle-1 (ocf:pacemaker:remote): Started cluster01 + * Resource Group: partially-active-group: + * 2/4 (ocf:pacemaker:Dummy): Active cluster02 + + Node Attributes: +- * Node: cluster01: ++ * Node: cluster01 (1): + * pingd : 1000 +- * Node: cluster02: ++ * Node: cluster02 (2): + * pingd : 1000 + + Operations: +- * Node: cluster02: ++ * Node: cluster02 (2): + * httpd-bundle-ip-192.168.122.131: migration-threshold=1000000: + * (2) start + * (3) monitor: interval="60000ms" +@@ -3734,7 +3759,7 @@ Operations: + * (9) probe + * ping: migration-threshold=1000000: + * (6) probe +- * Node: cluster01: ++ * Node: cluster01 (1): + * Fencing: migration-threshold=1000000: + * (15) start + * (20) monitor: interval="60000ms" +@@ -3758,11 +3783,11 @@ Operations: + * (1) probe + + Failed Resource Actions: +- * dummy-2 probe on cluster02 returned 'unimplemented feature' at Wed Sep 2 12:17:38 2020 after 33ms +- * dummy-4 probe on cluster02 returned 'not installed' at Wed Sep 2 12:17:38 2020 +- * smart-mon probe on cluster02 returned 'not installed' at Tue Nov 9 15:38:55 2021 after 33ms +- * ping probe on cluster02 returned 'not installed' at Thu Nov 18 13:11:42 2021 +- * httpd probe on httpd-bundle-1 returned 'invalid parameter' at Wed May 27 15:43:09 2020 ++ * dummy-2_monitor_0 on cluster02 'unimplemented feature' (3): call=2, status='complete', last-rc-change='Wed Sep 2 12:17:38 2020', queued=0ms, exec=33ms ++ * dummy-4_monitor_0 on cluster02 'not installed' (5): call=2, status='complete', last-rc-change='Wed Sep 2 12:17:38 2020', queued=0ms, exec=0ms ++ * 
smart-mon_monitor_0 on cluster02 'not installed' (5): call=9, status='complete', last-rc-change='Tue Nov 9 15:38:55 2021', queued=0ms, exec=33ms ++ * ping_monitor_0 on cluster02 'not installed' (5): call=6, status='complete', last-rc-change='Thu Nov 18 13:11:42 2021', queued=0ms, exec=0ms ++ * httpd_monitor_0 on httpd-bundle-1 'invalid parameter' (2): call=1, status='complete', last-rc-change='Wed May 27 15:43:09 2020', queued=0ms, exec=0ms + =#=#=#= End test: Complete brief text output, with inactive resources - OK (0) =#=#=#= + * Passed: crm_mon - Complete brief text output, with inactive resources + =#=#=#= Begin test: Text output of partially active group =#=#=#= +@@ -3826,14 +3851,14 @@ Active Resources: + =#=#=#= Begin test: Text output of inactive member of partially active group =#=#=#= + Cluster Summary: + * Stack: corosync +- * Current DC: cluster02 (version) - partition with quorum ++ * Current DC: cluster02 (2) (version) - partition with quorum + * Last updated: + * Last change: + * 4 nodes configured + * 16 resource instances configured (1 DISABLED) + + Node List: +- * Online: [ cluster01 cluster02 ] ++ * Online: [ cluster01 (1) cluster02 (2) ] + * GuestOnline: [ httpd-bundle-0@cluster02 httpd-bundle-1@cluster01 ] + + Active Resources: +@@ -3841,27 +3866,27 @@ Active Resources: + * dummy-2 (ocf:pacemaker:Dummy): FAILED cluster02 + + Failed Resource Actions: +- * dummy-2 probe on cluster02 returned 'unimplemented feature' at Wed Sep 2 12:17:38 2020 after 33ms ++ * dummy-2_monitor_0 on cluster02 'unimplemented feature' (3): call=2, status='complete', last-rc-change='Wed Sep 2 12:17:38 2020', queued=0ms, exec=33ms + =#=#=#= End test: Text output of inactive member of partially active group - OK (0) =#=#=#= + * Passed: crm_mon - Text output of inactive member of partially active group + =#=#=#= Begin test: Complete brief text output grouped by node, with inactive resources =#=#=#= + Cluster Summary: + * Stack: corosync +- * Current DC: cluster02 (version) 
- partition with quorum ++ * Current DC: cluster02 (2) (version) - partition with quorum + * Last updated: + * Last change: + * 4 nodes configured + * 16 resource instances configured (1 DISABLED) + + Node List: +- * Node cluster01: online: ++ * Node cluster01 (1): online: + * Resources: + * 1 (ocf:heartbeat:IPaddr2): Active + * 1 (ocf:heartbeat:docker): Active + * 1 (ocf:pacemaker:ping): Active + * 1 (ocf:pacemaker:remote): Active + * 1 (stonith:fence_xvm): Active +- * Node cluster02: online: ++ * Node cluster02 (2): online: + * Resources: + * 1 (ocf:heartbeat:IPaddr2): Active + * 1 (ocf:heartbeat:docker): Active +@@ -3876,20 +3901,20 @@ Node List: + + Inactive Resources: + * Clone Set: ping-clone [ping]: +- * Started: [ cluster01 ] +- * Stopped: [ cluster02 ] ++ * ping (ocf:pacemaker:ping): Started cluster01 ++ * ping (ocf:pacemaker:ping): Stopped + * Resource Group: partially-active-group: + * 2/4 (ocf:pacemaker:Dummy): Active cluster02 + * smart-mon (ocf:pacemaker:HealthSMART): Stopped + + Node Attributes: +- * Node: cluster01: ++ * Node: cluster01 (1): + * pingd : 1000 +- * Node: cluster02: ++ * Node: cluster02 (2): + * pingd : 1000 + + Operations: +- * Node: cluster02: ++ * Node: cluster02 (2): + * httpd-bundle-ip-192.168.122.131: migration-threshold=1000000: + * (2) start + * (3) monitor: interval="60000ms" +@@ -3909,7 +3934,7 @@ Operations: + * (9) probe + * ping: migration-threshold=1000000: + * (6) probe +- * Node: cluster01: ++ * Node: cluster01 (1): + * Fencing: migration-threshold=1000000: + * (15) start + * (20) monitor: interval="60000ms" +@@ -3933,11 +3958,11 @@ Operations: + * (1) probe + + Failed Resource Actions: +- * dummy-2 probe on cluster02 returned 'unimplemented feature' at Wed Sep 2 12:17:38 2020 after 33ms +- * dummy-4 probe on cluster02 returned 'not installed' at Wed Sep 2 12:17:38 2020 +- * smart-mon probe on cluster02 returned 'not installed' at Tue Nov 9 15:38:55 2021 after 33ms +- * ping probe on cluster02 returned 'not installed' 
at Thu Nov 18 13:11:42 2021 +- * httpd probe on httpd-bundle-1 returned 'invalid parameter' at Wed May 27 15:43:09 2020 ++ * dummy-2_monitor_0 on cluster02 'unimplemented feature' (3): call=2, status='complete', last-rc-change='Wed Sep 2 12:17:38 2020', queued=0ms, exec=33ms ++ * dummy-4_monitor_0 on cluster02 'not installed' (5): call=2, status='complete', last-rc-change='Wed Sep 2 12:17:38 2020', queued=0ms, exec=0ms ++ * smart-mon_monitor_0 on cluster02 'not installed' (5): call=9, status='complete', last-rc-change='Tue Nov 9 15:38:55 2021', queued=0ms, exec=33ms ++ * ping_monitor_0 on cluster02 'not installed' (5): call=6, status='complete', last-rc-change='Thu Nov 18 13:11:42 2021', queued=0ms, exec=0ms ++ * httpd_monitor_0 on httpd-bundle-1 'invalid parameter' (2): call=1, status='complete', last-rc-change='Wed May 27 15:43:09 2020', queued=0ms, exec=0ms + =#=#=#= End test: Complete brief text output grouped by node, with inactive resources - OK (0) =#=#=#= + * Passed: crm_mon - Complete brief text output grouped by node, with inactive resources + =#=#=#= Begin test: Text output of partially active resources, with inactive resources, filtered by node =#=#=#= +diff --git a/cts/cts-cli.in b/cts/cts-cli.in +index d32bfb7ed1..457816afab 100755 +--- a/cts/cts-cli.in ++++ b/cts/cts-cli.in +@@ -420,7 +420,7 @@ function test_crm_mon() { + export CIB_file="$test_home/cli/crm_mon-partial.xml" + + desc="Text output of partially active resources" +- cmd="crm_mon -1" ++ cmd="crm_mon -1 --show-detail" + test_assert $CRM_EX_OK 0 + + desc="XML output of partially active resources" +@@ -428,13 +428,13 @@ function test_crm_mon() { + test_assert_validate $CRM_EX_OK 0 + + desc="Text output of partially active resources, with inactive resources" +- cmd="crm_mon -1 -r" ++ cmd="crm_mon -1 -r --show-detail" + test_assert $CRM_EX_OK 0 + + # XML already includes inactive resources + + desc="Complete brief text output, with inactive resources" +- cmd="crm_mon -1 -r --include=all 
--brief" ++ cmd="crm_mon -1 -r --include=all --brief --show-detail" + test_assert $CRM_EX_OK 0 + + # XML does not have a brief output option +@@ -452,11 +452,11 @@ function test_crm_mon() { + test_assert $CRM_EX_OK 0 + + desc="Text output of inactive member of partially active group" +- cmd="crm_mon -1 --resource=dummy-2" ++ cmd="crm_mon -1 --resource=dummy-2 --show-detail" + test_assert $CRM_EX_OK 0 + + desc="Complete brief text output grouped by node, with inactive resources" +- cmd="crm_mon -1 -r --include=all --group-by-node --brief" ++ cmd="crm_mon -1 -r --include=all --group-by-node --brief --show-detail" + test_assert $CRM_EX_OK 0 + + desc="Text output of partially active resources, with inactive resources, filtered by node" +-- +2.27.0 + + +From da14053e5957d84ed0647688d37733adc2f988a3 Mon Sep 17 00:00:00 2001 +From: Chris Lumens +Date: Mon, 29 Nov 2021 15:05:42 -0500 +Subject: [PATCH 12/21] Test: scheduler: Add tests for failed probe operations. + +This adds identical sets of tests for primitive resources and cloned +resources. For the moment, the output reflects the current state of the +code. No changes have been made to properly handle these operations +yet. + +Each set has three resources, and each is set up with a slightly +different configuration of probe failures: + +(1) - Maskable probe failure on each node. +(2) - Maskable probe failure on one node, successful "not running" probe + on the other node. The resource should be started on the node + where "not running" was returned. +(3) - Maskable probe failure on one node, non-maskable probe failure on + the other node. The resource should not be running anywhere, and + should be stopped on the node with the non-maskable failure. 
+--- + cts/cts-scheduler.in | 2 + + cts/scheduler/dot/failed-probe-clone.dot | 30 ++++ + cts/scheduler/dot/failed-probe-primitive.dot | 4 + + cts/scheduler/exp/failed-probe-clone.exp | 141 ++++++++++++++++++ + cts/scheduler/exp/failed-probe-primitive.exp | 20 +++ + .../scores/failed-probe-clone.scores | 33 ++++ + .../scores/failed-probe-primitive.scores | 9 ++ + .../summary/failed-probe-clone.summary | 46 ++++++ + .../summary/failed-probe-primitive.summary | 27 ++++ + cts/scheduler/xml/failed-probe-clone.xml | 110 ++++++++++++++ + cts/scheduler/xml/failed-probe-primitive.xml | 71 +++++++++ + 11 files changed, 493 insertions(+) + create mode 100644 cts/scheduler/dot/failed-probe-clone.dot + create mode 100644 cts/scheduler/dot/failed-probe-primitive.dot + create mode 100644 cts/scheduler/exp/failed-probe-clone.exp + create mode 100644 cts/scheduler/exp/failed-probe-primitive.exp + create mode 100644 cts/scheduler/scores/failed-probe-clone.scores + create mode 100644 cts/scheduler/scores/failed-probe-primitive.scores + create mode 100644 cts/scheduler/summary/failed-probe-clone.summary + create mode 100644 cts/scheduler/summary/failed-probe-primitive.summary + create mode 100644 cts/scheduler/xml/failed-probe-clone.xml + create mode 100644 cts/scheduler/xml/failed-probe-primitive.xml + +diff --git a/cts/cts-scheduler.in b/cts/cts-scheduler.in +index 17fd6cefdf..3abcbc6c9d 100644 +--- a/cts/cts-scheduler.in ++++ b/cts/cts-scheduler.in +@@ -113,6 +113,8 @@ TESTS = [ + [ "probe-3", "Probe (pending node)" ], + [ "probe-4", "Probe (pending node + stopped resource)" ], + [ "probe-pending-node", "Probe (pending node + unmanaged resource)" ], ++ [ "failed-probe-primitive", "Maskable vs. unmaskable probe failures on primitive resources" ], ++ [ "failed-probe-clone", "Maskable vs. 
unmaskable probe failures on cloned resources" ], + [ "standby", "Standby" ], + [ "comments", "Comments" ], + ], +diff --git a/cts/scheduler/dot/failed-probe-clone.dot b/cts/scheduler/dot/failed-probe-clone.dot +new file mode 100644 +index 0000000000..90536b46ed +--- /dev/null ++++ b/cts/scheduler/dot/failed-probe-clone.dot +@@ -0,0 +1,30 @@ ++ digraph "g" { ++"ping-1_clear_failcount_0 cluster01" [ style=bold color="green" fontcolor="black"] ++"ping-1_clear_failcount_0 cluster02" [ style=bold color="green" fontcolor="black"] ++"ping-2-clone_running_0" [ style=bold color="green" fontcolor="orange"] ++"ping-2-clone_start_0" -> "ping-2-clone_running_0" [ style = bold] ++"ping-2-clone_start_0" -> "ping-2_start_0 cluster02" [ style = bold] ++"ping-2-clone_start_0" [ style=bold color="green" fontcolor="orange"] ++"ping-2_clear_failcount_0 cluster01" [ style=bold color="green" fontcolor="black"] ++"ping-2_clear_failcount_0 cluster02" [ style=bold color="green" fontcolor="black"] ++"ping-2_monitor_10000 cluster02" [ style=bold color="green" fontcolor="black"] ++"ping-2_start_0 cluster02" -> "ping-2-clone_running_0" [ style = bold] ++"ping-2_start_0 cluster02" -> "ping-2_monitor_10000 cluster02" [ style = bold] ++"ping-2_start_0 cluster02" [ style=bold color="green" fontcolor="black"] ++"ping-3-clone_running_0" [ style=dashed color="red" fontcolor="orange"] ++"ping-3-clone_start_0" -> "ping-3-clone_running_0" [ style = dashed] ++"ping-3-clone_start_0" -> "ping-3_start_0 " [ style = dashed] ++"ping-3-clone_start_0" [ style=dashed color="red" fontcolor="orange"] ++"ping-3-clone_stop_0" -> "ping-3-clone_stopped_0" [ style = bold] ++"ping-3-clone_stop_0" -> "ping-3_stop_0 cluster01" [ style = bold] ++"ping-3-clone_stop_0" [ style=bold color="green" fontcolor="orange"] ++"ping-3-clone_stopped_0" -> "ping-3-clone_start_0" [ style = dashed] ++"ping-3-clone_stopped_0" [ style=bold color="green" fontcolor="orange"] ++"ping-3_clear_failcount_0 cluster01" [ style=bold color="green" 
fontcolor="black"] ++"ping-3_clear_failcount_0 cluster02" [ style=bold color="green" fontcolor="black"] ++"ping-3_start_0 " -> "ping-3-clone_running_0" [ style = dashed] ++"ping-3_start_0 " [ style=dashed color="red" fontcolor="black"] ++"ping-3_stop_0 cluster01" -> "ping-3-clone_stopped_0" [ style = bold] ++"ping-3_stop_0 cluster01" -> "ping-3_start_0 " [ style = dashed] ++"ping-3_stop_0 cluster01" [ style=bold color="green" fontcolor="black"] ++} +diff --git a/cts/scheduler/dot/failed-probe-primitive.dot b/cts/scheduler/dot/failed-probe-primitive.dot +new file mode 100644 +index 0000000000..6e0c83216a +--- /dev/null ++++ b/cts/scheduler/dot/failed-probe-primitive.dot +@@ -0,0 +1,4 @@ ++ digraph "g" { ++"dummy-2_start_0 cluster02" [ style=bold color="green" fontcolor="black"] ++"dummy-3_stop_0 cluster01" [ style=bold color="green" fontcolor="black"] ++} +diff --git a/cts/scheduler/exp/failed-probe-clone.exp b/cts/scheduler/exp/failed-probe-clone.exp +new file mode 100644 +index 0000000000..6be18935bf +--- /dev/null ++++ b/cts/scheduler/exp/failed-probe-clone.exp +@@ -0,0 +1,141 @@ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ +diff --git a/cts/scheduler/exp/failed-probe-primitive.exp b/cts/scheduler/exp/failed-probe-primitive.exp +new file mode 100644 +index 0000000000..d0d8aa44dc +--- /dev/null ++++ b/cts/scheduler/exp/failed-probe-primitive.exp +@@ -0,0 +1,20 @@ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ +diff --git a/cts/scheduler/scores/failed-probe-clone.scores b/cts/scheduler/scores/failed-probe-clone.scores +new file mode 100644 +index 0000000000..7418b7f153 +--- 
/dev/null ++++ b/cts/scheduler/scores/failed-probe-clone.scores +@@ -0,0 +1,33 @@ ++ ++pcmk__clone_allocate: ping-1-clone allocation score on cluster01: -INFINITY ++pcmk__clone_allocate: ping-1-clone allocation score on cluster02: -INFINITY ++pcmk__clone_allocate: ping-1:0 allocation score on cluster01: -INFINITY ++pcmk__clone_allocate: ping-1:0 allocation score on cluster02: -INFINITY ++pcmk__clone_allocate: ping-1:1 allocation score on cluster01: -INFINITY ++pcmk__clone_allocate: ping-1:1 allocation score on cluster02: -INFINITY ++pcmk__clone_allocate: ping-2-clone allocation score on cluster01: -INFINITY ++pcmk__clone_allocate: ping-2-clone allocation score on cluster02: 0 ++pcmk__clone_allocate: ping-2:0 allocation score on cluster01: -INFINITY ++pcmk__clone_allocate: ping-2:0 allocation score on cluster02: 0 ++pcmk__clone_allocate: ping-2:1 allocation score on cluster01: -INFINITY ++pcmk__clone_allocate: ping-2:1 allocation score on cluster02: 0 ++pcmk__clone_allocate: ping-3-clone allocation score on cluster01: -INFINITY ++pcmk__clone_allocate: ping-3-clone allocation score on cluster02: -INFINITY ++pcmk__clone_allocate: ping-3:0 allocation score on cluster01: -INFINITY ++pcmk__clone_allocate: ping-3:0 allocation score on cluster02: -INFINITY ++pcmk__clone_allocate: ping-3:1 allocation score on cluster01: -INFINITY ++pcmk__clone_allocate: ping-3:1 allocation score on cluster02: -INFINITY ++pcmk__native_allocate: Fencing allocation score on cluster01: 0 ++pcmk__native_allocate: Fencing allocation score on cluster02: 0 ++pcmk__native_allocate: ping-1:0 allocation score on cluster01: -INFINITY ++pcmk__native_allocate: ping-1:0 allocation score on cluster02: -INFINITY ++pcmk__native_allocate: ping-1:1 allocation score on cluster01: -INFINITY ++pcmk__native_allocate: ping-1:1 allocation score on cluster02: -INFINITY ++pcmk__native_allocate: ping-2:0 allocation score on cluster01: -INFINITY ++pcmk__native_allocate: ping-2:0 allocation score on cluster02: 0 
++pcmk__native_allocate: ping-2:1 allocation score on cluster01: -INFINITY ++pcmk__native_allocate: ping-2:1 allocation score on cluster02: -INFINITY ++pcmk__native_allocate: ping-3:0 allocation score on cluster01: -INFINITY ++pcmk__native_allocate: ping-3:0 allocation score on cluster02: -INFINITY ++pcmk__native_allocate: ping-3:1 allocation score on cluster01: -INFINITY ++pcmk__native_allocate: ping-3:1 allocation score on cluster02: -INFINITY +diff --git a/cts/scheduler/scores/failed-probe-primitive.scores b/cts/scheduler/scores/failed-probe-primitive.scores +new file mode 100644 +index 0000000000..f313029451 +--- /dev/null ++++ b/cts/scheduler/scores/failed-probe-primitive.scores +@@ -0,0 +1,9 @@ ++ ++pcmk__native_allocate: Fencing allocation score on cluster01: 0 ++pcmk__native_allocate: Fencing allocation score on cluster02: 0 ++pcmk__native_allocate: dummy-1 allocation score on cluster01: -INFINITY ++pcmk__native_allocate: dummy-1 allocation score on cluster02: -INFINITY ++pcmk__native_allocate: dummy-2 allocation score on cluster01: -INFINITY ++pcmk__native_allocate: dummy-2 allocation score on cluster02: 0 ++pcmk__native_allocate: dummy-3 allocation score on cluster01: -INFINITY ++pcmk__native_allocate: dummy-3 allocation score on cluster02: -INFINITY +diff --git a/cts/scheduler/summary/failed-probe-clone.summary b/cts/scheduler/summary/failed-probe-clone.summary +new file mode 100644 +index 0000000000..ca15c302aa +--- /dev/null ++++ b/cts/scheduler/summary/failed-probe-clone.summary +@@ -0,0 +1,46 @@ ++Current cluster status: ++ * Node List: ++ * Online: [ cluster01 cluster02 ] ++ ++ * Full List of Resources: ++ * Fencing (stonith:fence_xvm): Started cluster01 ++ * Clone Set: ping-1-clone [ping-1]: ++ * Stopped: [ cluster01 cluster02 ] ++ * Clone Set: ping-2-clone [ping-2]: ++ * Stopped: [ cluster01 cluster02 ] ++ * Clone Set: ping-3-clone [ping-3]: ++ * ping-3 (ocf:pacemaker:ping): FAILED cluster01 ++ * Stopped: [ cluster02 ] ++ ++Transition Summary: ++ 
* Start ping-2:0 ( cluster02 ) ++ * Stop ping-3:0 ( cluster01 ) due to node availability ++ ++Executing Cluster Transition: ++ * Cluster action: clear_failcount for ping-1 on cluster02 ++ * Cluster action: clear_failcount for ping-1 on cluster01 ++ * Cluster action: clear_failcount for ping-2 on cluster02 ++ * Cluster action: clear_failcount for ping-2 on cluster01 ++ * Pseudo action: ping-2-clone_start_0 ++ * Cluster action: clear_failcount for ping-3 on cluster01 ++ * Cluster action: clear_failcount for ping-3 on cluster02 ++ * Pseudo action: ping-3-clone_stop_0 ++ * Resource action: ping-2 start on cluster02 ++ * Pseudo action: ping-2-clone_running_0 ++ * Resource action: ping-3 stop on cluster01 ++ * Pseudo action: ping-3-clone_stopped_0 ++ * Resource action: ping-2 monitor=10000 on cluster02 ++ ++Revised Cluster Status: ++ * Node List: ++ * Online: [ cluster01 cluster02 ] ++ ++ * Full List of Resources: ++ * Fencing (stonith:fence_xvm): Started cluster01 ++ * Clone Set: ping-1-clone [ping-1]: ++ * Stopped: [ cluster01 cluster02 ] ++ * Clone Set: ping-2-clone [ping-2]: ++ * Started: [ cluster02 ] ++ * Stopped: [ cluster01 ] ++ * Clone Set: ping-3-clone [ping-3]: ++ * Stopped: [ cluster01 cluster02 ] +diff --git a/cts/scheduler/summary/failed-probe-primitive.summary b/cts/scheduler/summary/failed-probe-primitive.summary +new file mode 100644 +index 0000000000..a634e7f00b +--- /dev/null ++++ b/cts/scheduler/summary/failed-probe-primitive.summary +@@ -0,0 +1,27 @@ ++Current cluster status: ++ * Node List: ++ * Online: [ cluster01 cluster02 ] ++ ++ * Full List of Resources: ++ * Fencing (stonith:fence_xvm): Started cluster01 ++ * dummy-1 (ocf:pacemaker:Dummy): Stopped ++ * dummy-2 (ocf:pacemaker:Dummy): Stopped ++ * dummy-3 (ocf:pacemaker:Dummy): FAILED cluster01 ++ ++Transition Summary: ++ * Start dummy-2 ( cluster02 ) ++ * Stop dummy-3 ( cluster01 ) due to node availability ++ ++Executing Cluster Transition: ++ * Resource action: dummy-2 start on cluster02 ++ * 
Resource action: dummy-3 stop on cluster01 ++ ++Revised Cluster Status: ++ * Node List: ++ * Online: [ cluster01 cluster02 ] ++ ++ * Full List of Resources: ++ * Fencing (stonith:fence_xvm): Started cluster01 ++ * dummy-1 (ocf:pacemaker:Dummy): Stopped ++ * dummy-2 (ocf:pacemaker:Dummy): Started cluster02 ++ * dummy-3 (ocf:pacemaker:Dummy): Stopped +diff --git a/cts/scheduler/xml/failed-probe-clone.xml b/cts/scheduler/xml/failed-probe-clone.xml +new file mode 100644 +index 0000000000..f677585bab +--- /dev/null ++++ b/cts/scheduler/xml/failed-probe-clone.xml +@@ -0,0 +1,110 @@ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ +diff --git a/cts/scheduler/xml/failed-probe-primitive.xml b/cts/scheduler/xml/failed-probe-primitive.xml +new file mode 100644 +index 0000000000..0c2f6416f5 +--- /dev/null ++++ b/cts/scheduler/xml/failed-probe-primitive.xml +@@ -0,0 +1,71 @@ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ +-- +2.27.0 + + +From 271d50e7d6b0ee5ef670b571c6d7aae9272b75ad Mon Sep 17 00:00:00 2001 +From: Chris Lumens +Date: Thu, 11 Nov 2021 13:57:05 -0500 +Subject: [PATCH 13/21] Feature: scheduler: Don't output failed resource + probes... + +in the crm_mon "Failed Resource Actions" section. It is expected that +these one-off probes will fail, in which case displaying them in that +section can just come across as confusing to the user. + +And update the crm_mon test output to account for these changes. 
+ +See: rhbz#1506372 +--- + cts/cli/regression.crm_mon.exp | 20 -------------------- + lib/pengine/pe_output.c | 4 ++++ + 2 files changed, 4 insertions(+), 20 deletions(-) + +diff --git a/cts/cli/regression.crm_mon.exp b/cts/cli/regression.crm_mon.exp +index d7b9d98e2c..b1643f8b29 100644 +--- a/cts/cli/regression.crm_mon.exp ++++ b/cts/cli/regression.crm_mon.exp +@@ -3498,10 +3498,6 @@ Active Resources: + + Failed Resource Actions: + * dummy-2_monitor_0 on cluster02 'unimplemented feature' (3): call=2, status='complete', last-rc-change='Wed Sep 2 12:17:38 2020', queued=0ms, exec=33ms +- * dummy-4_monitor_0 on cluster02 'not installed' (5): call=2, status='complete', last-rc-change='Wed Sep 2 12:17:38 2020', queued=0ms, exec=0ms +- * smart-mon_monitor_0 on cluster02 'not installed' (5): call=9, status='complete', last-rc-change='Tue Nov 9 15:38:55 2021', queued=0ms, exec=33ms +- * ping_monitor_0 on cluster02 'not installed' (5): call=6, status='complete', last-rc-change='Thu Nov 18 13:11:42 2021', queued=0ms, exec=0ms +- * httpd_monitor_0 on httpd-bundle-1 'invalid parameter' (2): call=1, status='complete', last-rc-change='Wed May 27 15:43:09 2020', queued=0ms, exec=0ms + =#=#=#= End test: Text output of partially active resources - OK (0) =#=#=#= + * Passed: crm_mon - Text output of partially active resources + =#=#=#= Begin test: XML output of partially active resources =#=#=#= +@@ -3646,10 +3642,6 @@ Failed Resource Actions: + + + +- +- +- +- + + + +@@ -3693,10 +3685,6 @@ Full List of Resources: + + Failed Resource Actions: + * dummy-2_monitor_0 on cluster02 'unimplemented feature' (3): call=2, status='complete', last-rc-change='Wed Sep 2 12:17:38 2020', queued=0ms, exec=33ms +- * dummy-4_monitor_0 on cluster02 'not installed' (5): call=2, status='complete', last-rc-change='Wed Sep 2 12:17:38 2020', queued=0ms, exec=0ms +- * smart-mon_monitor_0 on cluster02 'not installed' (5): call=9, status='complete', last-rc-change='Tue Nov 9 15:38:55 2021', queued=0ms, 
exec=33ms +- * ping_monitor_0 on cluster02 'not installed' (5): call=6, status='complete', last-rc-change='Thu Nov 18 13:11:42 2021', queued=0ms, exec=0ms +- * httpd_monitor_0 on httpd-bundle-1 'invalid parameter' (2): call=1, status='complete', last-rc-change='Wed May 27 15:43:09 2020', queued=0ms, exec=0ms + =#=#=#= End test: Text output of partially active resources, with inactive resources - OK (0) =#=#=#= + * Passed: crm_mon - Text output of partially active resources, with inactive resources + =#=#=#= Begin test: Complete brief text output, with inactive resources =#=#=#= +@@ -3784,10 +3772,6 @@ Operations: + + Failed Resource Actions: + * dummy-2_monitor_0 on cluster02 'unimplemented feature' (3): call=2, status='complete', last-rc-change='Wed Sep 2 12:17:38 2020', queued=0ms, exec=33ms +- * dummy-4_monitor_0 on cluster02 'not installed' (5): call=2, status='complete', last-rc-change='Wed Sep 2 12:17:38 2020', queued=0ms, exec=0ms +- * smart-mon_monitor_0 on cluster02 'not installed' (5): call=9, status='complete', last-rc-change='Tue Nov 9 15:38:55 2021', queued=0ms, exec=33ms +- * ping_monitor_0 on cluster02 'not installed' (5): call=6, status='complete', last-rc-change='Thu Nov 18 13:11:42 2021', queued=0ms, exec=0ms +- * httpd_monitor_0 on httpd-bundle-1 'invalid parameter' (2): call=1, status='complete', last-rc-change='Wed May 27 15:43:09 2020', queued=0ms, exec=0ms + =#=#=#= End test: Complete brief text output, with inactive resources - OK (0) =#=#=#= + * Passed: crm_mon - Complete brief text output, with inactive resources + =#=#=#= Begin test: Text output of partially active group =#=#=#= +@@ -3959,10 +3943,6 @@ Operations: + + Failed Resource Actions: + * dummy-2_monitor_0 on cluster02 'unimplemented feature' (3): call=2, status='complete', last-rc-change='Wed Sep 2 12:17:38 2020', queued=0ms, exec=33ms +- * dummy-4_monitor_0 on cluster02 'not installed' (5): call=2, status='complete', last-rc-change='Wed Sep 2 12:17:38 2020', queued=0ms, exec=0ms 
+- * smart-mon_monitor_0 on cluster02 'not installed' (5): call=9, status='complete', last-rc-change='Tue Nov 9 15:38:55 2021', queued=0ms, exec=33ms +- * ping_monitor_0 on cluster02 'not installed' (5): call=6, status='complete', last-rc-change='Thu Nov 18 13:11:42 2021', queued=0ms, exec=0ms +- * httpd_monitor_0 on httpd-bundle-1 'invalid parameter' (2): call=1, status='complete', last-rc-change='Wed May 27 15:43:09 2020', queued=0ms, exec=0ms + =#=#=#= End test: Complete brief text output grouped by node, with inactive resources - OK (0) =#=#=#= + * Passed: crm_mon - Complete brief text output grouped by node, with inactive resources + =#=#=#= Begin test: Text output of partially active resources, with inactive resources, filtered by node =#=#=#= +diff --git a/lib/pengine/pe_output.c b/lib/pengine/pe_output.c +index 715e001d51..84684598dd 100644 +--- a/lib/pengine/pe_output.c ++++ b/lib/pengine/pe_output.c +@@ -1370,6 +1370,10 @@ failed_action_list(pcmk__output_t *out, va_list args) { + continue; + } + ++ if (pcmk_xe_mask_probe_failure(xml_op)) { ++ continue; ++ } ++ + id = crm_element_value(xml_op, XML_LRM_ATTR_TASK_KEY); + if (parse_op_key(id ? id : ID(xml_op), &rsc, NULL, NULL) == FALSE) { + continue; +-- +2.27.0 + + +From 90f641b9223c64701d494297ce3dd3382365acb8 Mon Sep 17 00:00:00 2001 +From: Chris Lumens +Date: Tue, 9 Nov 2021 10:11:19 -0500 +Subject: [PATCH 14/21] Feature: scheduler: Add a function for finding a failed + probe action... + +for a given resource ID. Optionally, a node ID can also be given to +restrict the failed probe action to one run on the given node. +Otherwise, just the first failed probe action for the resource ID will +be returned. 
+ +See: rhbz#1506372 +--- + include/crm/pengine/internal.h | 2 ++ + lib/pengine/utils.c | 42 ++++++++++++++++++++++++++++++++++ + 2 files changed, 44 insertions(+) + +diff --git a/include/crm/pengine/internal.h b/include/crm/pengine/internal.h +index 8c8fbaca90..58dd2e8727 100644 +--- a/include/crm/pengine/internal.h ++++ b/include/crm/pengine/internal.h +@@ -574,4 +574,6 @@ gboolean pe__clone_is_filtered(pe_resource_t *rsc, GList *only_rsc, gboolean che + gboolean pe__group_is_filtered(pe_resource_t *rsc, GList *only_rsc, gboolean check_parent); + gboolean pe__native_is_filtered(pe_resource_t *rsc, GList *only_rsc, gboolean check_parent); + ++xmlNode *pe__failed_probe_for_rsc(pe_resource_t *rsc, const char *name); ++ + #endif +diff --git a/lib/pengine/utils.c b/lib/pengine/utils.c +index 07753e173a..3151f0120b 100644 +--- a/lib/pengine/utils.c ++++ b/lib/pengine/utils.c +@@ -2569,3 +2569,45 @@ pe__build_rsc_list(pe_working_set_t *data_set, const char *s) { + + return resources; + } ++ ++xmlNode * ++pe__failed_probe_for_rsc(pe_resource_t *rsc, const char *name) ++{ ++ const char *rsc_id = rsc->id; ++ ++ for (xmlNode *xml_op = pcmk__xml_first_child(rsc->cluster->failed); xml_op != NULL; ++ xml_op = pcmk__xml_next(xml_op)) { ++ const char *value = NULL; ++ char *op_id = NULL; ++ ++ /* This resource operation is not a failed probe. */ ++ if (!pcmk_xe_mask_probe_failure(xml_op)) { ++ continue; ++ } ++ ++ /* This resource operation was not run on the given node. Note that if name is ++ * NULL, this will always succeed. ++ */ ++ value = crm_element_value(xml_op, XML_LRM_ATTR_TARGET); ++ if (value == NULL || !pcmk__str_eq(value, name, pcmk__str_casei|pcmk__str_null_matches)) { ++ continue; ++ } ++ ++ /* This resource operation has no operation_key. */ ++ value = crm_element_value(xml_op, XML_LRM_ATTR_TASK_KEY); ++ if (!parse_op_key(value ? 
value : ID(xml_op), &op_id, NULL, NULL)) { ++ continue; ++ } ++ ++ /* This resource operation's ID does not match the rsc_id we are looking for. */ ++ if (!pcmk__str_eq(op_id, rsc_id, pcmk__str_none)) { ++ free(op_id); ++ continue; ++ } ++ ++ free(op_id); ++ return xml_op; ++ } ++ ++ return NULL; ++} +-- +2.27.0 + + +From 2ad9774fe994554243078b131799fed0d1a6dffd Mon Sep 17 00:00:00 2001 +From: Chris Lumens +Date: Tue, 9 Nov 2021 15:43:24 -0500 +Subject: [PATCH 15/21] Feature: scheduler: Display the reason why a native rsc + probe failed. + +If inactive resources are being shown, add an extra blurb of text to any +stopped resources that have a failed probe action indicating why the +probe failed. + +And then add a new primitive resource to crm_mon-partial.xml with a +failed probe operation and update the expected test output. + +See: rhbz#1506372 +--- + cts/cli/regression.crm_mon.exp | 10 +++++----- + cts/scheduler/summary/failed-probe-primitive.summary | 8 ++++---- + cts/scheduler/summary/multiply-active-stonith.summary | 2 +- + lib/pengine/native.c | 11 +++++++++++ + 4 files changed, 21 insertions(+), 10 deletions(-) + +diff --git a/cts/cli/regression.crm_mon.exp b/cts/cli/regression.crm_mon.exp +index b1643f8b29..4333caa11c 100644 +--- a/cts/cli/regression.crm_mon.exp ++++ b/cts/cli/regression.crm_mon.exp +@@ -3680,8 +3680,8 @@ Full List of Resources: + * dummy-1 (ocf:pacemaker:Dummy): Started cluster02 + * dummy-2 (ocf:pacemaker:Dummy): FAILED cluster02 + * dummy-3 (ocf:pacemaker:Dummy): Stopped (disabled) +- * dummy-4 (ocf:pacemaker:Dummy): Stopped +- * smart-mon (ocf:pacemaker:HealthSMART): Stopped ++ * dummy-4 (ocf:pacemaker:Dummy): Stopped (not installed) ++ * smart-mon (ocf:pacemaker:HealthSMART): Stopped (not installed) + + Failed Resource Actions: + * dummy-2_monitor_0 on cluster02 'unimplemented feature' (3): call=2, status='complete', last-rc-change='Wed Sep 2 12:17:38 2020', queued=0ms, exec=33ms +@@ -3811,7 +3811,7 @@ Full List of Resources: + * 
dummy-1 (ocf:pacemaker:Dummy): Started cluster02 + * dummy-2 (ocf:pacemaker:Dummy): FAILED cluster02 + * dummy-3 (ocf:pacemaker:Dummy): Stopped (disabled) +- * dummy-4 (ocf:pacemaker:Dummy): Stopped ++ * dummy-4 (ocf:pacemaker:Dummy): Stopped (not installed) + =#=#=#= End test: Text output of partially active group, with inactive resources - OK (0) =#=#=#= + * Passed: crm_mon - Text output of partially active group, with inactive resources + =#=#=#= Begin test: Text output of active member of partially active group =#=#=#= +@@ -3889,7 +3889,7 @@ Inactive Resources: + * ping (ocf:pacemaker:ping): Stopped + * Resource Group: partially-active-group: + * 2/4 (ocf:pacemaker:Dummy): Active cluster02 +- * smart-mon (ocf:pacemaker:HealthSMART): Stopped ++ * smart-mon (ocf:pacemaker:HealthSMART): Stopped (not installed) + + Node Attributes: + * Node: cluster01 (1): +@@ -3963,7 +3963,7 @@ Full List of Resources: + * Fencing (stonith:fence_xvm): Started cluster01 + * Container bundle set: httpd-bundle [pcmk:http]: + * httpd-bundle-1 (192.168.122.132) (ocf:heartbeat:apache): FAILED cluster01 +- * smart-mon (ocf:pacemaker:HealthSMART): Stopped ++ * smart-mon (ocf:pacemaker:HealthSMART): Stopped (not installed) + =#=#=#= End test: Text output of partially active resources, with inactive resources, filtered by node - OK (0) =#=#=#= + * Passed: crm_mon - Text output of partially active resources, with inactive resources, filtered by node + =#=#=#= Begin test: Text output of partially active resources, filtered by node =#=#=#= +diff --git a/cts/scheduler/summary/failed-probe-primitive.summary b/cts/scheduler/summary/failed-probe-primitive.summary +index a634e7f00b..ea8edae494 100644 +--- a/cts/scheduler/summary/failed-probe-primitive.summary ++++ b/cts/scheduler/summary/failed-probe-primitive.summary +@@ -4,8 +4,8 @@ Current cluster status: + + * Full List of Resources: + * Fencing (stonith:fence_xvm): Started cluster01 +- * dummy-1 (ocf:pacemaker:Dummy): Stopped +- * dummy-2 
(ocf:pacemaker:Dummy): Stopped ++ * dummy-1 (ocf:pacemaker:Dummy): Stopped (not installed) ++ * dummy-2 (ocf:pacemaker:Dummy): Stopped (not installed) + * dummy-3 (ocf:pacemaker:Dummy): FAILED cluster01 + + Transition Summary: +@@ -22,6 +22,6 @@ Revised Cluster Status: + + * Full List of Resources: + * Fencing (stonith:fence_xvm): Started cluster01 +- * dummy-1 (ocf:pacemaker:Dummy): Stopped ++ * dummy-1 (ocf:pacemaker:Dummy): Stopped (not installed) + * dummy-2 (ocf:pacemaker:Dummy): Started cluster02 +- * dummy-3 (ocf:pacemaker:Dummy): Stopped ++ * dummy-3 (ocf:pacemaker:Dummy): Stopped (not installed) +diff --git a/cts/scheduler/summary/multiply-active-stonith.summary b/cts/scheduler/summary/multiply-active-stonith.summary +index 8ce21d68ee..ec37de03b0 100644 +--- a/cts/scheduler/summary/multiply-active-stonith.summary ++++ b/cts/scheduler/summary/multiply-active-stonith.summary +@@ -25,4 +25,4 @@ Revised Cluster Status: + + * Full List of Resources: + * fencer (stonith:fence_ipmilan): Started node3 +- * rsc1 (lsb:rsc1): Stopped ++ * rsc1 (lsb:rsc1): Stopped (not installed) +diff --git a/lib/pengine/native.c b/lib/pengine/native.c +index 36121c527f..a95c90c09a 100644 +--- a/lib/pengine/native.c ++++ b/lib/pengine/native.c +@@ -599,6 +599,17 @@ pcmk__native_output_string(pe_resource_t *rsc, const char *name, pe_node_t *node + g_string_append_printf(outstr, " %s", node->details->uname); + } + ++ // Failed probe operation ++ if (native_displayable_role(rsc) == RSC_ROLE_STOPPED) { ++ xmlNode *probe_op = pe__failed_probe_for_rsc(rsc, node ? 
node->details->uname : NULL); ++ if (probe_op != NULL) { ++ int rc; ++ ++ pcmk__scan_min_int(crm_element_value(probe_op, XML_LRM_ATTR_RC), &rc, 0); ++ g_string_append_printf(outstr, " (%s) ", services_ocf_exitcode_str(rc)); ++ } ++ } ++ + // Flags, as: ( [...]) + if (node && !(node->details->online) && node->details->unclean) { + have_flags = add_output_flag(outstr, "UNCLEAN", have_flags); +-- +2.27.0 + + +From b9ca2e834ee01b35c03f153438ef8828b609fb38 Mon Sep 17 00:00:00 2001 +From: Chris Lumens +Date: Thu, 18 Nov 2021 10:41:42 -0500 +Subject: [PATCH 16/21] Refactor: scheduler: Rearrange pe__clone_default. + +Instead of the single stopped list, maintain a hash table where the keys +are nodes and the values are the status of the node. For now, this is +just "Stopped" or "Stopped (disabled)" but in the future will be +expanded to cover failed probe operations. +--- + lib/pengine/clone.c | 103 +++++++++++++++++++++++++++++++++++--------- + 1 file changed, 82 insertions(+), 21 deletions(-) + +diff --git a/lib/pengine/clone.c b/lib/pengine/clone.c +index 5569c6b6e9..58fb24d24e 100644 +--- a/lib/pengine/clone.c ++++ b/lib/pengine/clone.c +@@ -28,6 +28,55 @@ + #define UNPROMOTED_INSTANCES RSC_ROLE_UNPROMOTED_S + #endif + ++static GList * ++sorted_hash_table_values(GHashTable *table) ++{ ++ GList *retval = NULL; ++ GHashTableIter iter; ++ gpointer key, value; ++ ++ g_hash_table_iter_init(&iter, table); ++ while (g_hash_table_iter_next(&iter, &key, &value)) { ++ if (!g_list_find_custom(retval, value, (GCompareFunc) strcmp)) { ++ retval = g_list_prepend(retval, (char *) value); ++ } ++ } ++ ++ retval = g_list_sort(retval, (GCompareFunc) strcmp); ++ return retval; ++} ++ ++static GList * ++nodes_with_status(GHashTable *table, const char *status) ++{ ++ GList *retval = NULL; ++ GHashTableIter iter; ++ gpointer key, value; ++ ++ g_hash_table_iter_init(&iter, table); ++ while (g_hash_table_iter_next(&iter, &key, &value)) { ++ if (!strcmp((char *) value, status)) { ++ retval = 
g_list_prepend(retval, key); ++ } ++ } ++ ++ retval = g_list_sort(retval, (GCompareFunc) pcmk__numeric_strcasecmp); ++ return retval; ++} ++ ++static char * ++node_list_to_str(GList *list) ++{ ++ char *retval = NULL; ++ size_t len = 0; ++ ++ for (GList *iter = list; iter != NULL; iter = iter->next) { ++ pcmk__add_word(&retval, &len, (char *) iter->data); ++ } ++ ++ return retval; ++} ++ + static void + clone_header(pcmk__output_t *out, int *rc, pe_resource_t *rsc, clone_variant_data_t *clone_data) + { +@@ -710,10 +759,10 @@ pe__clone_default(pcmk__output_t *out, va_list args) + GList *only_node = va_arg(args, GList *); + GList *only_rsc = va_arg(args, GList *); + ++ GHashTable *stopped = pcmk__strkey_table(free, free); ++ + char *list_text = NULL; +- char *stopped_list = NULL; + size_t list_text_len = 0; +- size_t stopped_list_len = 0; + + GList *promoted_list = NULL; + GList *started_list = NULL; +@@ -768,7 +817,7 @@ pe__clone_default(pcmk__output_t *out, va_list args) + // List stopped instances when requested (except orphans) + if (!pcmk_is_set(child_rsc->flags, pe_rsc_orphan) + && pcmk_is_set(show_opts, pcmk_show_inactive_rscs)) { +- pcmk__add_word(&stopped_list, &stopped_list_len, child_rsc->id); ++ g_hash_table_insert(stopped, strdup(child_rsc->id), strdup("Stopped")); + } + + } else if (is_set_recursive(child_rsc, pe_rsc_orphan, TRUE) +@@ -822,7 +871,7 @@ pe__clone_default(pcmk__output_t *out, va_list args) + } + + if (pcmk_is_set(show_opts, pcmk_show_clone_detail)) { +- free(stopped_list); ++ g_hash_table_destroy(stopped); + PCMK__OUTPUT_LIST_FOOTER(out, rc); + return pcmk_rc_ok; + } +@@ -890,23 +939,15 @@ pe__clone_default(pcmk__output_t *out, va_list args) + } + + if (pcmk_is_set(show_opts, pcmk_show_inactive_rscs)) { +- const char *state = "Stopped"; +- enum rsc_role_e role = configured_role(rsc); +- +- if (role == RSC_ROLE_STOPPED) { +- state = "Stopped (disabled)"; +- } +- + if (!pcmk_is_set(rsc->flags, pe_rsc_unique) + && (clone_data->clone_max > 
active_instances)) { + + GList *nIter; + GList *list = g_hash_table_get_values(rsc->allowed_nodes); + +- /* Custom stopped list for non-unique clones */ +- free(stopped_list); +- stopped_list = NULL; +- stopped_list_len = 0; ++ /* Custom stopped table for non-unique clones */ ++ g_hash_table_destroy(stopped); ++ stopped = pcmk__strkey_table(free, free); + + if (list == NULL) { + /* Clusters with symmetrical=false haven't calculated allowed_nodes yet +@@ -922,19 +963,39 @@ pe__clone_default(pcmk__output_t *out, va_list args) + if (pe_find_node(rsc->running_on, node->details->uname) == NULL && + pcmk__str_in_list(node->details->uname, only_node, + pcmk__str_star_matches|pcmk__str_casei)) { +- pcmk__add_word(&stopped_list, &stopped_list_len, +- node->details->uname); ++ const char *state = "Stopped"; ++ ++ if (configured_role(rsc) == RSC_ROLE_STOPPED) { ++ state = "Stopped (disabled)"; ++ } ++ ++ g_hash_table_insert(stopped, strdup(node->details->uname), ++ strdup(state)); + } + } + g_list_free(list); + } + +- if (stopped_list != NULL) { ++ if (g_hash_table_size(stopped) > 0) { ++ GList *list = sorted_hash_table_values(stopped); ++ + clone_header(out, &rc, rsc, clone_data); + +- out->list_item(out, NULL, "%s: [ %s ]", state, stopped_list); +- free(stopped_list); +- stopped_list_len = 0; ++ for (GList *status_iter = list; status_iter != NULL; status_iter = status_iter->next) { ++ const char *status = status_iter->data; ++ GList *nodes = nodes_with_status(stopped, status); ++ char *str = node_list_to_str(nodes); ++ ++ if (str != NULL) { ++ out->list_item(out, NULL, "%s: [ %s ]", status, str); ++ free(str); ++ } ++ ++ g_list_free(nodes); ++ } ++ ++ g_list_free(list); ++ g_hash_table_destroy(stopped); + + /* If there are no instances of this clone (perhaps because there are no + * nodes configured), simply output the clone header by itself. 
This can +-- +2.27.0 + + +From 0228a64cea412936fb8ee91b0f83f9800048d3ba Mon Sep 17 00:00:00 2001 +From: Chris Lumens +Date: Fri, 19 Nov 2021 10:06:18 -0500 +Subject: [PATCH 17/21] Feature: scheduler: Display the reason why a clone rsc + probe failed. + +This is similar to the previous commit that adds reasons for primitive +resources. + +See: rhbz#1506372 +--- + cts/cli/regression.crm_mon.exp | 8 +++---- + .../summary/failed-probe-clone.summary | 14 +++++++------ + include/crm/pengine/internal.h | 2 ++ + lib/pengine/clone.c | 21 +++++++++++++++++-- + lib/pengine/utils.c | 7 +++++++ + 5 files changed, 40 insertions(+), 12 deletions(-) + +diff --git a/cts/cli/regression.crm_mon.exp b/cts/cli/regression.crm_mon.exp +index 4333caa11c..5688500ce5 100644 +--- a/cts/cli/regression.crm_mon.exp ++++ b/cts/cli/regression.crm_mon.exp +@@ -3479,7 +3479,7 @@ Node List: + Active Resources: + * Clone Set: ping-clone [ping]: + * ping (ocf:pacemaker:ping): Started cluster01 +- * ping (ocf:pacemaker:ping): Stopped ++ * ping (ocf:pacemaker:ping): Stopped (not installed) + * Fencing (stonith:fence_xvm): Started cluster01 + * Container bundle set: httpd-bundle [pcmk:http]: + * Replica[0] +@@ -3663,7 +3663,7 @@ Node List: + Full List of Resources: + * Clone Set: ping-clone [ping]: + * ping (ocf:pacemaker:ping): Started cluster01 +- * ping (ocf:pacemaker:ping): Stopped ++ * ping (ocf:pacemaker:ping): Stopped (not installed) + * Fencing (stonith:fence_xvm): Started cluster01 + * Container bundle set: httpd-bundle [pcmk:http]: + * Replica[0] +@@ -3705,7 +3705,7 @@ Full List of Resources: + * 1/1 (stonith:fence_xvm): Active cluster01 + * Clone Set: ping-clone [ping]: + * ping (ocf:pacemaker:ping): Started cluster01 +- * ping (ocf:pacemaker:ping): Stopped ++ * ping (ocf:pacemaker:ping): Stopped (not installed) + * Container bundle set: httpd-bundle [pcmk:http]: + * Replica[0] + * httpd-bundle-ip-192.168.122.131 (ocf:heartbeat:IPaddr2): Started cluster02 +@@ -3886,7 +3886,7 @@ Node List: + 
Inactive Resources: + * Clone Set: ping-clone [ping]: + * ping (ocf:pacemaker:ping): Started cluster01 +- * ping (ocf:pacemaker:ping): Stopped ++ * ping (ocf:pacemaker:ping): Stopped (not installed) + * Resource Group: partially-active-group: + * 2/4 (ocf:pacemaker:Dummy): Active cluster02 + * smart-mon (ocf:pacemaker:HealthSMART): Stopped (not installed) +diff --git a/cts/scheduler/summary/failed-probe-clone.summary b/cts/scheduler/summary/failed-probe-clone.summary +index ca15c302aa..febee14400 100644 +--- a/cts/scheduler/summary/failed-probe-clone.summary ++++ b/cts/scheduler/summary/failed-probe-clone.summary +@@ -5,12 +5,13 @@ Current cluster status: + * Full List of Resources: + * Fencing (stonith:fence_xvm): Started cluster01 + * Clone Set: ping-1-clone [ping-1]: +- * Stopped: [ cluster01 cluster02 ] ++ * Stopped (not installed): [ cluster01 cluster02 ] + * Clone Set: ping-2-clone [ping-2]: +- * Stopped: [ cluster01 cluster02 ] ++ * Stopped: [ cluster02 ] ++ * Stopped (not installed): [ cluster01 ] + * Clone Set: ping-3-clone [ping-3]: + * ping-3 (ocf:pacemaker:ping): FAILED cluster01 +- * Stopped: [ cluster02 ] ++ * Stopped (not installed): [ cluster02 ] + + Transition Summary: + * Start ping-2:0 ( cluster02 ) +@@ -38,9 +39,10 @@ Revised Cluster Status: + * Full List of Resources: + * Fencing (stonith:fence_xvm): Started cluster01 + * Clone Set: ping-1-clone [ping-1]: +- * Stopped: [ cluster01 cluster02 ] ++ * Stopped (not installed): [ cluster01 cluster02 ] + * Clone Set: ping-2-clone [ping-2]: + * Started: [ cluster02 ] +- * Stopped: [ cluster01 ] ++ * Stopped (not installed): [ cluster01 ] + * Clone Set: ping-3-clone [ping-3]: +- * Stopped: [ cluster01 cluster02 ] ++ * Stopped: [ cluster01 ] ++ * Stopped (not installed): [ cluster02 ] +diff --git a/include/crm/pengine/internal.h b/include/crm/pengine/internal.h +index 58dd2e8727..2b20da6e5f 100644 +--- a/include/crm/pengine/internal.h ++++ b/include/crm/pengine/internal.h +@@ -576,4 +576,6 @@ gboolean 
pe__native_is_filtered(pe_resource_t *rsc, GList *only_rsc, gboolean ch + + xmlNode *pe__failed_probe_for_rsc(pe_resource_t *rsc, const char *name); + ++const char *pe__clone_child_id(pe_resource_t *rsc); ++ + #endif +diff --git a/lib/pengine/clone.c b/lib/pengine/clone.c +index 58fb24d24e..ef4bdc0edf 100644 +--- a/lib/pengine/clone.c ++++ b/lib/pengine/clone.c +@@ -963,14 +963,23 @@ pe__clone_default(pcmk__output_t *out, va_list args) + if (pe_find_node(rsc->running_on, node->details->uname) == NULL && + pcmk__str_in_list(node->details->uname, only_node, + pcmk__str_star_matches|pcmk__str_casei)) { ++ xmlNode *probe_op = pe__failed_probe_for_rsc(rsc, node->details->uname); + const char *state = "Stopped"; + + if (configured_role(rsc) == RSC_ROLE_STOPPED) { + state = "Stopped (disabled)"; + } + +- g_hash_table_insert(stopped, strdup(node->details->uname), +- strdup(state)); ++ if (probe_op != NULL) { ++ int rc; ++ ++ pcmk__scan_min_int(crm_element_value(probe_op, XML_LRM_ATTR_RC), &rc, 0); ++ g_hash_table_insert(stopped, strdup(node->details->uname), ++ crm_strdup_printf("Stopped (%s)", services_ocf_exitcode_str(rc))); ++ } else { ++ g_hash_table_insert(stopped, strdup(node->details->uname), ++ strdup(state)); ++ } + } + } + g_list_free(list); +@@ -1113,3 +1122,11 @@ pe__clone_is_filtered(pe_resource_t *rsc, GList *only_rsc, gboolean check_parent + + return !passes; + } ++ ++const char * ++pe__clone_child_id(pe_resource_t *rsc) ++{ ++ clone_variant_data_t *clone_data = NULL; ++ get_clone_variant_data(clone_data, rsc); ++ return ID(clone_data->xml_obj_child); ++} +diff --git a/lib/pengine/utils.c b/lib/pengine/utils.c +index 3151f0120b..6c4f3b6971 100644 +--- a/lib/pengine/utils.c ++++ b/lib/pengine/utils.c +@@ -2573,8 +2573,15 @@ pe__build_rsc_list(pe_working_set_t *data_set, const char *s) { + xmlNode * + pe__failed_probe_for_rsc(pe_resource_t *rsc, const char *name) + { ++ pe_resource_t *parent = uber_parent(rsc); + const char *rsc_id = rsc->id; + ++ if 
(rsc->variant == pe_clone) { ++ rsc_id = pe__clone_child_id(rsc); ++ } else if (parent->variant == pe_clone) { ++ rsc_id = pe__clone_child_id(parent); ++ } ++ + for (xmlNode *xml_op = pcmk__xml_first_child(rsc->cluster->failed); xml_op != NULL; + xml_op = pcmk__xml_next(xml_op)) { + const char *value = NULL; +-- +2.27.0 + + +From cf8b01da93fce87526617fefdcee6eb9f6ecdbd1 Mon Sep 17 00:00:00 2001 +From: Chris Lumens +Date: Wed, 24 Nov 2021 10:57:05 -0500 +Subject: [PATCH 18/21] Test: cts-cli: Update the last-rc-change sed + expression. + +This can now occur in both the XML output (where it's wrapped in double +quotes) and the text output (where it's wrapped in single quotes and +followed by a comma). In addition, a plus or minus can occur in the +time string. + +The "{0,1}" syntax takes the place of a "?" for marking the optional +comma. In FreeBSD sed, "?" doesn't mean anything special. +--- + cts/cli/regression.crm_mon.exp | 12 ++++++------ + cts/cts-cli.in | 2 +- + 2 files changed, 7 insertions(+), 7 deletions(-) + +diff --git a/cts/cli/regression.crm_mon.exp b/cts/cli/regression.crm_mon.exp +index 5688500ce5..957758832d 100644 +--- a/cts/cli/regression.crm_mon.exp ++++ b/cts/cli/regression.crm_mon.exp +@@ -3497,7 +3497,7 @@ Active Resources: + * dummy-2 (ocf:pacemaker:Dummy): FAILED cluster02 + + Failed Resource Actions: +- * dummy-2_monitor_0 on cluster02 'unimplemented feature' (3): call=2, status='complete', last-rc-change='Wed Sep 2 12:17:38 2020', queued=0ms, exec=33ms ++ * dummy-2_monitor_0 on cluster02 'unimplemented feature' (3): call=2, status='complete', queued=0ms, exec=33ms + =#=#=#= End test: Text output of partially active resources - OK (0) =#=#=#= + * Passed: crm_mon - Text output of partially active resources + =#=#=#= Begin test: XML output of partially active resources =#=#=#= +@@ -3641,7 +3641,7 @@ Failed Resource Actions: + + + +- ++ + + + +@@ -3684,7 +3684,7 @@ Full List of Resources: + * smart-mon (ocf:pacemaker:HealthSMART): Stopped (not 
installed) + + Failed Resource Actions: +- * dummy-2_monitor_0 on cluster02 'unimplemented feature' (3): call=2, status='complete', last-rc-change='Wed Sep 2 12:17:38 2020', queued=0ms, exec=33ms ++ * dummy-2_monitor_0 on cluster02 'unimplemented feature' (3): call=2, status='complete', queued=0ms, exec=33ms + =#=#=#= End test: Text output of partially active resources, with inactive resources - OK (0) =#=#=#= + * Passed: crm_mon - Text output of partially active resources, with inactive resources + =#=#=#= Begin test: Complete brief text output, with inactive resources =#=#=#= +@@ -3771,7 +3771,7 @@ Operations: + * (1) probe + + Failed Resource Actions: +- * dummy-2_monitor_0 on cluster02 'unimplemented feature' (3): call=2, status='complete', last-rc-change='Wed Sep 2 12:17:38 2020', queued=0ms, exec=33ms ++ * dummy-2_monitor_0 on cluster02 'unimplemented feature' (3): call=2, status='complete', queued=0ms, exec=33ms + =#=#=#= End test: Complete brief text output, with inactive resources - OK (0) =#=#=#= + * Passed: crm_mon - Complete brief text output, with inactive resources + =#=#=#= Begin test: Text output of partially active group =#=#=#= +@@ -3850,7 +3850,7 @@ Active Resources: + * dummy-2 (ocf:pacemaker:Dummy): FAILED cluster02 + + Failed Resource Actions: +- * dummy-2_monitor_0 on cluster02 'unimplemented feature' (3): call=2, status='complete', last-rc-change='Wed Sep 2 12:17:38 2020', queued=0ms, exec=33ms ++ * dummy-2_monitor_0 on cluster02 'unimplemented feature' (3): call=2, status='complete', queued=0ms, exec=33ms + =#=#=#= End test: Text output of inactive member of partially active group - OK (0) =#=#=#= + * Passed: crm_mon - Text output of inactive member of partially active group + =#=#=#= Begin test: Complete brief text output grouped by node, with inactive resources =#=#=#= +@@ -3942,7 +3942,7 @@ Operations: + * (1) probe + + Failed Resource Actions: +- * dummy-2_monitor_0 on cluster02 'unimplemented feature' (3): call=2, status='complete', 
last-rc-change='Wed Sep 2 12:17:38 2020', queued=0ms, exec=33ms ++ * dummy-2_monitor_0 on cluster02 'unimplemented feature' (3): call=2, status='complete', queued=0ms, exec=33ms + =#=#=#= End test: Complete brief text output grouped by node, with inactive resources - OK (0) =#=#=#= + * Passed: crm_mon - Complete brief text output grouped by node, with inactive resources + =#=#=#= Begin test: Text output of partially active resources, with inactive resources, filtered by node =#=#=#= +diff --git a/cts/cts-cli.in b/cts/cts-cli.in +index 457816afab..72e9a1e912 100755 +--- a/cts/cts-cli.in ++++ b/cts/cts-cli.in +@@ -1870,7 +1870,7 @@ for t in $tests; do + -e 's/.*\(unpack_.*\)@.*\.c:[0-9][0-9]*)/\1/g' \ + -e 's/.*\(update_validation\)@.*\.c:[0-9][0-9]*)/\1/g' \ + -e 's/.*\(apply_upgrade\)@.*\.c:[0-9][0-9]*)/\1/g' \ +- -e 's/ last-rc-change=\"[A-Za-z0-9: ]*\"//'\ ++ -e "s/ last-rc-change=['\"][-+A-Za-z0-9: ]*['\"],\{0,1\}//" \ + -e 's|^/tmp/cts-cli\.validity\.bad.xml\.[^:]*:|validity.bad.xml:|'\ + -e 's/^Entity: line [0-9][0-9]*: //'\ + -e 's/\(validation ([0-9][0-9]* of \)[0-9][0-9]*\().*\)/\1X\2/' \ +-- +2.27.0 + + +From dea61f1b6507fbc978e040c1555384d8d7ffa9f3 Mon Sep 17 00:00:00 2001 +From: Chris Lumens +Date: Wed, 1 Dec 2021 16:23:14 -0500 +Subject: [PATCH 19/21] Fix: include: Bump feature set to 3.12.0. + +This is for the scheduler handling changing regarding maskable probe +failures. + +See: rhbz#1506372. 
+--- + include/crm/crm.h | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/include/crm/crm.h b/include/crm/crm.h +index 04d2324d75..16b35e9c55 100644 +--- a/include/crm/crm.h ++++ b/include/crm/crm.h +@@ -66,7 +66,7 @@ extern "C" { + * >=3.0.13: Fail counts include operation name and interval + * >=3.2.0: DC supports PCMK_EXEC_INVALID and PCMK_EXEC_NOT_CONNECTED + */ +-# define CRM_FEATURE_SET "3.11.0" ++# define CRM_FEATURE_SET "3.12.0" + + /* Pacemaker's CPG protocols use fixed-width binary fields for the sender and + * recipient of a CPG message. This imposes an arbitrary limit on cluster node +-- +2.27.0 + + +From fef2c61ef462c221809dc91467ea1e96d5478c74 Mon Sep 17 00:00:00 2001 +From: Chris Lumens +Date: Mon, 6 Dec 2021 16:42:15 -0500 +Subject: [PATCH 20/21] Feature: scheduler: Handle masked probes in the + scheduler. + +These probe operations get their rc/status codes mapped to not +running/done, but still ensures they end up in the list of failed +operations so tool output continues to display them properly. + +Note that failures on bundled resources do not get masked. + +There are no test case changes for this patch. + +See: rhbz#1506372. +--- + lib/pengine/unpack.c | 42 +++++++++++++++++++++++++++++++++++++----- + 1 file changed, 37 insertions(+), 5 deletions(-) + +diff --git a/lib/pengine/unpack.c b/lib/pengine/unpack.c +index b659f319fb..f3583e97d8 100644 +--- a/lib/pengine/unpack.c ++++ b/lib/pengine/unpack.c +@@ -3169,6 +3169,11 @@ remap_operation(xmlNode *xml_op, pe_resource_t *rsc, pe_node_t *node, + } + } + ++ if (!pe_rsc_is_bundled(rsc) && pcmk_xe_mask_probe_failure(xml_op)) { ++ *status = PCMK_EXEC_DONE; ++ *rc = PCMK_OCF_NOT_RUNNING; ++ } ++ + /* If the executor reported an operation status of anything but done or + * error, consider that final. 
But for done or error, we know better whether + * it should be treated as a failure or not, because we know the expected +@@ -3567,12 +3572,12 @@ update_resource_state(pe_resource_t * rsc, pe_node_t * node, xmlNode * xml_op, c + CRM_ASSERT(rsc); + CRM_ASSERT(xml_op); + +- if (rc == PCMK_OCF_NOT_RUNNING) { +- clear_past_failure = TRUE; +- +- } else if (rc == PCMK_OCF_NOT_INSTALLED) { ++ if (rc == PCMK_OCF_NOT_INSTALLED || (!pe_rsc_is_bundled(rsc) && pcmk_xe_mask_probe_failure(xml_op))) { + rsc->role = RSC_ROLE_STOPPED; + ++ } else if (rc == PCMK_OCF_NOT_RUNNING) { ++ clear_past_failure = TRUE; ++ + } else if (pcmk__str_eq(task, CRMD_ACTION_STATUS, pcmk__str_casei)) { + if (last_failure) { + const char *op_key = get_op_key(xml_op); +@@ -3661,8 +3666,10 @@ unpack_rsc_op(pe_resource_t *rsc, pe_node_t *node, xmlNode *xml_op, + pe_working_set_t *data_set) + { + int rc = 0; ++ int old_rc = 0; + int task_id = 0; + int target_rc = 0; ++ int old_target_rc = 0; + int status = PCMK_EXEC_UNKNOWN; + guint interval_ms = 0; + const char *task = NULL; +@@ -3671,6 +3678,7 @@ unpack_rsc_op(pe_resource_t *rsc, pe_node_t *node, xmlNode *xml_op, + bool expired = false; + pe_resource_t *parent = rsc; + enum action_fail_response failure_strategy = action_fail_recover; ++ bool maskable_probe_failure = false; + + CRM_CHECK(rsc && node && xml_op, return); + +@@ -3727,10 +3735,22 @@ unpack_rsc_op(pe_resource_t *rsc, pe_node_t *node, xmlNode *xml_op, + expired = true; + } + ++ old_rc = rc; ++ old_target_rc = target_rc; ++ + remap_operation(xml_op, rsc, node, data_set, on_fail, target_rc, + &rc, &status); + +- if (expired && (rc != target_rc)) { ++ maskable_probe_failure = !pe_rsc_is_bundled(rsc) && pcmk_xe_mask_probe_failure(xml_op); ++ ++ if (expired && maskable_probe_failure && old_rc != old_target_rc) { ++ if (rsc->role <= RSC_ROLE_STOPPED) { ++ rsc->role = RSC_ROLE_UNKNOWN; ++ } ++ ++ goto done; ++ ++ } else if (expired && (rc != target_rc)) { + const char *magic = 
crm_element_value(xml_op, XML_ATTR_TRANSITION_MAGIC); + + if (interval_ms == 0) { +@@ -3758,6 +3778,18 @@ unpack_rsc_op(pe_resource_t *rsc, pe_node_t *node, xmlNode *xml_op, + } + } + ++ if (maskable_probe_failure) { ++ crm_notice("Treating probe result '%s' for %s on %s as 'not running'", ++ services_ocf_exitcode_str(rc), rsc->id, node->details->uname); ++ update_resource_state(rsc, node, xml_op, task, target_rc, *last_failure, ++ on_fail, data_set); ++ crm_xml_add(xml_op, XML_ATTR_UNAME, node->details->uname); ++ ++ record_failed_op(xml_op, node, rsc, data_set); ++ resource_location(parent, node, -INFINITY, "masked-probe-failure", data_set); ++ goto done; ++ } ++ + switch (status) { + case PCMK_EXEC_CANCELLED: + // Should never happen +-- +2.27.0 + + +From ccff6eb60598f389008b0621447056457da79671 Mon Sep 17 00:00:00 2001 +From: Chris Lumens +Date: Tue, 4 Jan 2022 10:14:48 -0500 +Subject: [PATCH 21/21] Test: scheduler: Add tests for expired, masked probe + failures. + +dummy-1 is a stopped resource with an expired masked probe failure. +This probe should be rescheduled. dummy-2 is a started resource with an +expired masked probe failure. This probe should not be rescheduled. 
+--- + cts/cts-scheduler.in | 1 + + .../dot/expired-failed-probe-primitive.dot | 8 ++ + .../exp/expired-failed-probe-primitive.exp | 45 ++++++++++++ + .../expired-failed-probe-primitive.scores | 7 ++ + .../expired-failed-probe-primitive.summary | 26 +++++++ + .../xml/expired-failed-probe-primitive.xml | 73 +++++++++++++++++++ + 6 files changed, 160 insertions(+) + create mode 100644 cts/scheduler/dot/expired-failed-probe-primitive.dot + create mode 100644 cts/scheduler/exp/expired-failed-probe-primitive.exp + create mode 100644 cts/scheduler/scores/expired-failed-probe-primitive.scores + create mode 100644 cts/scheduler/summary/expired-failed-probe-primitive.summary + create mode 100644 cts/scheduler/xml/expired-failed-probe-primitive.xml + +diff --git a/cts/cts-scheduler.in b/cts/cts-scheduler.in +index 3abcbc6c9d..7bc41a0936 100644 +--- a/cts/cts-scheduler.in ++++ b/cts/cts-scheduler.in +@@ -115,6 +115,7 @@ TESTS = [ + [ "probe-pending-node", "Probe (pending node + unmanaged resource)" ], + [ "failed-probe-primitive", "Maskable vs. unmaskable probe failures on primitive resources" ], + [ "failed-probe-clone", "Maskable vs. 
unmaskable probe failures on cloned resources" ], ++ [ "expired-failed-probe-primitive", "Maskable, expired probe failure on primitive resources" ], + [ "standby", "Standby" ], + [ "comments", "Comments" ], + ], +diff --git a/cts/scheduler/dot/expired-failed-probe-primitive.dot b/cts/scheduler/dot/expired-failed-probe-primitive.dot +new file mode 100644 +index 0000000000..610c2b8047 +--- /dev/null ++++ b/cts/scheduler/dot/expired-failed-probe-primitive.dot +@@ -0,0 +1,8 @@ ++ digraph "g" { ++"dummy-1_monitor_0 cluster01" -> "dummy-1_start_0 cluster02" [ style = bold] ++"dummy-1_monitor_0 cluster01" [ style=bold color="green" fontcolor="black"] ++"dummy-1_monitor_0 cluster02" -> "dummy-1_start_0 cluster02" [ style = bold] ++"dummy-1_monitor_0 cluster02" [ style=bold color="green" fontcolor="black"] ++"dummy-1_start_0 cluster02" [ style=bold color="green" fontcolor="black"] ++"dummy-2_monitor_0 cluster01" [ style=bold color="green" fontcolor="black"] ++} +diff --git a/cts/scheduler/exp/expired-failed-probe-primitive.exp b/cts/scheduler/exp/expired-failed-probe-primitive.exp +new file mode 100644 +index 0000000000..3c2cbfe411 +--- /dev/null ++++ b/cts/scheduler/exp/expired-failed-probe-primitive.exp +@@ -0,0 +1,45 @@ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ +diff --git a/cts/scheduler/scores/expired-failed-probe-primitive.scores b/cts/scheduler/scores/expired-failed-probe-primitive.scores +new file mode 100644 +index 0000000000..51ae5510e6 +--- /dev/null ++++ b/cts/scheduler/scores/expired-failed-probe-primitive.scores +@@ -0,0 +1,7 @@ ++ ++pcmk__native_allocate: Fencing allocation score on cluster01: 0 ++pcmk__native_allocate: Fencing allocation score on cluster02: 0 ++pcmk__native_allocate: dummy-1 allocation score on cluster01: 0 ++pcmk__native_allocate: dummy-1 allocation score on cluster02: 0 ++pcmk__native_allocate: dummy-2 allocation score on cluster01: 0 
++pcmk__native_allocate: dummy-2 allocation score on cluster02: 0 +diff --git a/cts/scheduler/summary/expired-failed-probe-primitive.summary b/cts/scheduler/summary/expired-failed-probe-primitive.summary +new file mode 100644 +index 0000000000..ac0604e84f +--- /dev/null ++++ b/cts/scheduler/summary/expired-failed-probe-primitive.summary +@@ -0,0 +1,26 @@ ++Current cluster status: ++ * Node List: ++ * Online: [ cluster01 cluster02 ] ++ ++ * Full List of Resources: ++ * Fencing (stonith:fence_xvm): Started cluster01 ++ * dummy-1 (ocf:pacemaker:Dummy): Stopped ++ * dummy-2 (ocf:pacemaker:Dummy): Started cluster02 ++ ++Transition Summary: ++ * Start dummy-1 ( cluster02 ) ++ ++Executing Cluster Transition: ++ * Resource action: dummy-1 monitor on cluster02 ++ * Resource action: dummy-1 monitor on cluster01 ++ * Resource action: dummy-2 monitor on cluster01 ++ * Resource action: dummy-1 start on cluster02 ++ ++Revised Cluster Status: ++ * Node List: ++ * Online: [ cluster01 cluster02 ] ++ ++ * Full List of Resources: ++ * Fencing (stonith:fence_xvm): Started cluster01 ++ * dummy-1 (ocf:pacemaker:Dummy): Started cluster02 ++ * dummy-2 (ocf:pacemaker:Dummy): Started cluster02 +diff --git a/cts/scheduler/xml/expired-failed-probe-primitive.xml b/cts/scheduler/xml/expired-failed-probe-primitive.xml +new file mode 100644 +index 0000000000..684aa73f92 +--- /dev/null ++++ b/cts/scheduler/xml/expired-failed-probe-primitive.xml +@@ -0,0 +1,73 @@ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ +-- +2.27.0 + diff --git a/SOURCES/011-fencing-reasons.patch b/SOURCES/011-fencing-reasons.patch new file mode 100644 index 0000000..4422ca0 --- /dev/null +++ b/SOURCES/011-fencing-reasons.patch @@ -0,0 +1,1450 @@ +From 6db8e3adef0441953ec18dd0339c0a67c5c26bdf Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Tue, 14 Dec 
2021 16:25:21 -0600 +Subject: [PATCH 01/17] Doc: Pacemaker Development: update for recent function + renames + +--- + doc/sphinx/Pacemaker_Development/components.rst | 16 ++++++++-------- + 1 file changed, 8 insertions(+), 8 deletions(-) + +diff --git a/doc/sphinx/Pacemaker_Development/components.rst b/doc/sphinx/Pacemaker_Development/components.rst +index a51220cac9..68158484ce 100644 +--- a/doc/sphinx/Pacemaker_Development/components.rst ++++ b/doc/sphinx/Pacemaker_Development/components.rst +@@ -106,7 +106,7 @@ or messaging layer callback, which calls: + the number of active peers), and if this is the last expected reply, + calls + +- * ``call_remote_stonith()``, which calculates the timeout and sends ++ * ``request_peer_fencing()``, which calculates the timeout and sends + ``STONITH_OP_FENCE`` request(s) to carry out the fencing. If the target + node has a fencing "topology" (which allows specifications such as + "this node can be fenced either with device A, or devices B and C in +@@ -156,7 +156,7 @@ returns, and calls + * done callback (``st_child_done()``), which calls ``schedule_stonith_command()`` + for a new device if there are further required actions to execute or if the + original action failed, then builds and sends an XML reply to the original +- fencer (via ``stonith_send_async_reply()``), then checks whether any ++ fencer (via ``send_async_reply()``), then checks whether any + pending actions are the same as the one just executed and merges them if so. 
+ + Fencing replies +@@ -169,18 +169,18 @@ messaging layer callback, which calls: + + * ``handle_reply()``, which calls + +- * ``process_remote_stonith_exec()``, which calls either +- ``call_remote_stonith()`` (to retry a failed operation, or try the next +- device in a topology is appropriate, which issues a new ++ * ``fenced_process_fencing_reply()``, which calls either ++ ``request_peer_fencing()`` (to retry a failed operation, or try the next ++ device in a topology is appropriate, which issues a new + ``STONITH_OP_FENCE`` request, proceeding as before) or +- ``remote_op_done()`` (if the operation is definitively failed or ++ ``finalize_op()`` (if the operation is definitively failed or + successful). + +- * remote_op_done() broadcasts the result to all peers. ++ * ``finalize_op()`` broadcasts the result to all peers. + + Finally, all peers receive the broadcast result and call + +-* ``remote_op_done()``, which sends the result to all local clients. ++* ``finalize_op()``, which sends the result to all local clients. + + + .. index:: +-- +2.27.0 + + +From 47db9e5fb410b1e911710727d646eb7180a70c90 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Fri, 12 Nov 2021 09:58:16 -0600 +Subject: [PATCH 02/17] Refactor: fencing: add full result to fence action + callback data + +stonith_callback_data_t previously only contained the legacy return code for +the action. Use its new opaque member to store the full result, along with +accessors (available only internally for now). 
+--- + include/crm/fencing/internal.h | 3 ++ + lib/fencing/st_client.c | 99 ++++++++++++++++++++++++++-------- + 2 files changed, 81 insertions(+), 21 deletions(-) + +diff --git a/include/crm/fencing/internal.h b/include/crm/fencing/internal.h +index f0d294a0b3..eff689e59b 100644 +--- a/include/crm/fencing/internal.h ++++ b/include/crm/fencing/internal.h +@@ -187,6 +187,9 @@ bool stonith__event_state_eq(stonith_history_t *history, void *user_data); + bool stonith__event_state_neq(stonith_history_t *history, void *user_data); + + int stonith__legacy2status(int rc); ++int stonith__exit_status(stonith_callback_data_t *data); ++int stonith__execution_status(stonith_callback_data_t *data); ++const char *stonith__exit_reason(stonith_callback_data_t *data); + + /*! + * \internal +diff --git a/lib/fencing/st_client.c b/lib/fencing/st_client.c +index 2ca094566b..9d93ffd481 100644 +--- a/lib/fencing/st_client.c ++++ b/lib/fencing/st_client.c +@@ -854,20 +854,23 @@ stonith_api_del_callback(stonith_t * stonith, int call_id, bool all_callbacks) + * \param[in] st Fencer API connection + * \param[in] call_id If positive, call ID of completed fence action, otherwise + * legacy return code for early action failure +- * \param[in] rc Legacy return code for action result ++ * \param[in] result Full result for action + * \param[in] userdata User data to pass to callback + * \param[in] callback Fence action callback to invoke + */ + static void +-invoke_fence_action_callback(stonith_t *st, int call_id, int rc, void *userdata, ++invoke_fence_action_callback(stonith_t *st, int call_id, ++ pcmk__action_result_t *result, ++ void *userdata, + void (*callback) (stonith_t *st, + stonith_callback_data_t *data)) + { + stonith_callback_data_t data = { 0, }; + + data.call_id = call_id; +- data.rc = rc; ++ data.rc = pcmk_rc2legacy(stonith__result2rc(result)); + data.userdata = userdata; ++ data.opaque = (void *) result; + + callback(st, &data); + } +@@ -888,7 +891,7 @@ 
invoke_registered_callbacks(stonith_t *stonith, xmlNode *msg, int call_id) + { + stonith_private_t *private = NULL; + stonith_callback_client_t *cb_info = NULL; +- int rc = pcmk_ok; ++ pcmk__action_result_t result = PCMK__UNKNOWN_RESULT; + + CRM_CHECK(stonith != NULL, return); + CRM_CHECK(stonith->st_private != NULL, return); +@@ -897,20 +900,17 @@ invoke_registered_callbacks(stonith_t *stonith, xmlNode *msg, int call_id) + + if (msg == NULL) { + // Fencer didn't reply in time +- rc = -ETIME; ++ pcmk__set_result(&result, CRM_EX_ERROR, PCMK_EXEC_TIMEOUT, ++ "Timeout waiting for reply from fencer"); + CRM_LOG_ASSERT(call_id > 0); + + } else { + // We have the fencer reply +- +- if (crm_element_value_int(msg, F_STONITH_RC, &rc) != 0) { +- rc = -pcmk_err_generic; +- } +- + if ((crm_element_value_int(msg, F_STONITH_CALLID, &call_id) != 0) + || (call_id <= 0)) { + crm_log_xml_warn(msg, "Bad fencer reply"); + } ++ stonith__xe_get_result(msg, &result); + } + + if (call_id > 0) { +@@ -919,27 +919,29 @@ invoke_registered_callbacks(stonith_t *stonith, xmlNode *msg, int call_id) + } + + if ((cb_info != NULL) && (cb_info->callback != NULL) +- && (rc == pcmk_ok || !(cb_info->only_success))) { ++ && (pcmk__result_ok(&result) || !(cb_info->only_success))) { + crm_trace("Invoking callback %s for call %d", + crm_str(cb_info->id), call_id); +- invoke_fence_action_callback(stonith, call_id, rc, cb_info->user_data, +- cb_info->callback); ++ invoke_fence_action_callback(stonith, call_id, &result, ++ cb_info->user_data, cb_info->callback); + +- } else if ((private->op_callback == NULL) && (rc != pcmk_ok)) { +- crm_warn("Fencing action without registered callback failed: %s", +- pcmk_strerror(rc)); ++ } else if ((private->op_callback == NULL) && !pcmk__result_ok(&result)) { ++ crm_warn("Fencing action without registered callback failed: %d (%s)", ++ result.exit_status, ++ pcmk_exec_status_str(result.execution_status)); + crm_log_xml_debug(msg, "Failed fence update"); + } + + if 
(private->op_callback != NULL) { + crm_trace("Invoking global callback for call %d", call_id); +- invoke_fence_action_callback(stonith, call_id, rc, NULL, ++ invoke_fence_action_callback(stonith, call_id, &result, NULL, + private->op_callback); + } + + if (cb_info != NULL) { + stonith_api_del_callback(stonith, call_id, FALSE); + } ++ pcmk__reset_result(&result); + } + + static gboolean +@@ -1252,14 +1254,18 @@ stonith_api_add_callback(stonith_t * stonith, int call_id, int timeout, int opti + CRM_CHECK(stonith->st_private != NULL, return -EINVAL); + private = stonith->st_private; + +- if (call_id == 0) { ++ if (call_id == 0) { // Add global callback + private->op_callback = callback; + +- } else if (call_id < 0) { ++ } else if (call_id < 0) { // Call failed immediately, so call callback now + if (!(options & st_opt_report_only_success)) { ++ pcmk__action_result_t result = PCMK__UNKNOWN_RESULT; ++ + crm_trace("Call failed, calling %s: %s", callback_name, pcmk_strerror(call_id)); +- invoke_fence_action_callback(stonith, call_id, call_id, user_data, +- callback); ++ pcmk__set_result(&result, CRM_EX_ERROR, ++ stonith__legacy2status(call_id), NULL); ++ invoke_fence_action_callback(stonith, call_id, &result, ++ user_data, callback); + } else { + crm_warn("Fencer call failed: %s", pcmk_strerror(call_id)); + } +@@ -2293,6 +2299,57 @@ stonith__device_parameter_flags(uint32_t *device_flags, const char *device_name, + freeXpathObject(xpath); + } + ++/*! ++ * \internal ++ * \brief Return the exit status from an async action callback ++ * ++ * \param[in] data Callback data ++ * ++ * \return Exit status from callback data ++ */ ++int ++stonith__exit_status(stonith_callback_data_t *data) ++{ ++ if ((data == NULL) || (data->opaque == NULL)) { ++ return CRM_EX_ERROR; ++ } ++ return ((pcmk__action_result_t *) data->opaque)->exit_status; ++} ++ ++/*! 
++ * \internal ++ * \brief Return the execution status from an async action callback ++ * ++ * \param[in] data Callback data ++ * ++ * \return Execution status from callback data ++ */ ++int ++stonith__execution_status(stonith_callback_data_t *data) ++{ ++ if ((data == NULL) || (data->opaque == NULL)) { ++ return PCMK_EXEC_UNKNOWN; ++ } ++ return ((pcmk__action_result_t *) data->opaque)->execution_status; ++} ++ ++/*! ++ * \internal ++ * \brief Return the exit reason from an async action callback ++ * ++ * \param[in] data Callback data ++ * ++ * \return Exit reason from callback data ++ */ ++const char * ++stonith__exit_reason(stonith_callback_data_t *data) ++{ ++ if ((data == NULL) || (data->opaque == NULL)) { ++ return NULL; ++ } ++ return ((pcmk__action_result_t *) data->opaque)->exit_reason; ++} ++ + // Deprecated functions kept only for backward API compatibility + // LCOV_EXCL_START + +-- +2.27.0 + + +From 1e076370ef4ac7993b5ff21ed1cdfb3c4a494cf0 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Tue, 9 Nov 2021 16:16:03 -0600 +Subject: [PATCH 03/17] Log: controller: improve fencing result messages + +Now that fence callbacks get the full result, we can log a better message. +Also check for error conditions better, improve message wording, and ensure +only a single message is logged per result. 
+--- + daemons/controld/controld_fencing.c | 83 +++++++++++++++++++---------- + 1 file changed, 56 insertions(+), 27 deletions(-) + +diff --git a/daemons/controld/controld_fencing.c b/daemons/controld/controld_fencing.c +index f5a252c813..f8d2fc13f4 100644 +--- a/daemons/controld/controld_fencing.c ++++ b/daemons/controld/controld_fencing.c +@@ -714,45 +714,64 @@ tengine_stonith_callback(stonith_t *stonith, stonith_callback_data_t *data) + int stonith_id = -1; + int transition_id = -1; + crm_action_t *action = NULL; +- int call_id = data->call_id; +- int rc = data->rc; +- char *userdata = data->userdata; +- +- CRM_CHECK(userdata != NULL, return); +- crm_notice("Stonith operation %d/%s: %s (%d)", call_id, (char *)userdata, +- pcmk_strerror(rc), rc); ++ const char *target = NULL; + +- if (AM_I_DC == FALSE) { ++ if ((data == NULL) || (data->userdata == NULL)) { ++ crm_err("Ignoring fence operation %d result: " ++ "No transition key given (bug?)", ++ ((data == NULL)? -1 : data->call_id)); + return; + } + +- /* crm_info("call=%d, optype=%d, node_name=%s, result=%d, node_list=%s, action=%s", */ +- /* op->call_id, op->optype, op->node_name, op->op_result, */ +- /* (char *)op->node_list, op->private_data); */ ++ if (!AM_I_DC) { ++ const char *reason = stonith__exit_reason(data); ++ ++ if (reason == NULL) { ++ reason = pcmk_exec_status_str(stonith__execution_status(data)); ++ } ++ crm_notice("Result of fence operation %d: %d (%s) " CRM_XS " key=%s", ++ data->call_id, stonith__exit_status(data), reason, ++ (const char *) data->userdata); ++ return; ++ } + +- /* filter out old STONITH actions */ +- CRM_CHECK(decode_transition_key(userdata, &uuid, &transition_id, &stonith_id, NULL), ++ CRM_CHECK(decode_transition_key(data->userdata, &uuid, &transition_id, ++ &stonith_id, NULL), + goto bail); + +- if (transition_graph->complete || stonith_id < 0 || !pcmk__str_eq(uuid, te_uuid, pcmk__str_casei) +- || transition_graph->id != transition_id) { +- crm_info("Ignoring STONITH action 
initiated outside of the current transition"); ++ if (transition_graph->complete || (stonith_id < 0) ++ || !pcmk__str_eq(uuid, te_uuid, pcmk__str_none) ++ || (transition_graph->id != transition_id)) { ++ crm_info("Ignoring fence operation %d result: " ++ "Not from current transition " CRM_XS ++ " complete=%s action=%d uuid=%s (vs %s) transition=%d (vs %d)", ++ data->call_id, pcmk__btoa(transition_graph->complete), ++ stonith_id, uuid, te_uuid, transition_id, transition_graph->id); + goto bail; + } + + action = controld_get_action(stonith_id); + if (action == NULL) { +- crm_err("Stonith action not matched"); ++ crm_err("Ignoring fence operation %d result: " ++ "Action %d not found in transition graph (bug?) " ++ CRM_XS " uuid=%s transition=%d", ++ data->call_id, stonith_id, uuid, transition_id); ++ goto bail; ++ } ++ ++ target = crm_element_value(action->xml, XML_LRM_ATTR_TARGET); ++ if (target == NULL) { ++ crm_err("Ignoring fence operation %d result: No target given (bug?)", ++ data->call_id); + goto bail; + } + + stop_te_timer(action->timer); +- if (rc == pcmk_ok) { +- const char *target = crm_element_value(action->xml, XML_LRM_ATTR_TARGET); ++ if (stonith__exit_status(data) == CRM_EX_OK) { + const char *uuid = crm_element_value(action->xml, XML_LRM_ATTR_TARGET_UUID); + const char *op = crm_meta_value(action->params, "stonith_action"); + +- crm_info("Stonith operation %d for %s passed", call_id, target); ++ crm_notice("Fence operation %d for %s passed", data->call_id, target); + if (!(pcmk_is_set(action->flags, pcmk__graph_action_confirmed))) { + te_action_confirmed(action, NULL); + if (pcmk__str_eq("on", op, pcmk__str_casei)) { +@@ -791,20 +810,30 @@ tengine_stonith_callback(stonith_t *stonith, stonith_callback_data_t *data) + st_fail_count_reset(target); + + } else { +- const char *target = crm_element_value(action->xml, XML_LRM_ATTR_TARGET); + enum transition_action abort_action = tg_restart; ++ int status = stonith__execution_status(data); ++ const char 
*reason = stonith__exit_reason(data); + ++ if (reason == NULL) { ++ if (status == PCMK_EXEC_DONE) { ++ reason = "Agent returned error"; ++ } else { ++ reason = pcmk_exec_status_str(status); ++ } ++ } + crm__set_graph_action_flags(action, pcmk__graph_action_failed); +- crm_notice("Stonith operation %d for %s failed (%s): aborting transition.", +- call_id, target, pcmk_strerror(rc)); + + /* If no fence devices were available, there's no use in immediately + * checking again, so don't start a new transition in that case. + */ +- if (rc == -ENODEV) { +- crm_warn("No devices found in cluster to fence %s, giving up", +- target); ++ if (status == PCMK_EXEC_NO_FENCE_DEVICE) { ++ crm_warn("Fence operation %d for %s failed: %s " ++ "(aborting transition and giving up for now)", ++ data->call_id, target, reason); + abort_action = tg_stop; ++ } else { ++ crm_notice("Fence operation %d for %s failed: %s " ++ "(aborting transition)", data->call_id, target, reason); + } + + /* Increment the fail count now, so abort_for_stonith_failure() can +@@ -818,7 +847,7 @@ tengine_stonith_callback(stonith_t *stonith, stonith_callback_data_t *data) + trigger_graph(); + + bail: +- free(userdata); ++ free(data->userdata); + free(uuid); + return; + } +-- +2.27.0 + + +From 25547e3b7e6eb23efad1c359388d6e8d0df62363 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Mon, 22 Nov 2021 12:37:16 -0600 +Subject: [PATCH 04/17] Refactor: executor: drop action_get_uniform_rc() + function + +action_get_uniform_rc() called stonith2uniform_rc() or services_result2ocf() as +appropriate to the action standard. However, it was called only from a place +that did not process stonith actions, so that place can just call +services_result2ocf() directly. + +This will simplify planned changes. 
+--- + daemons/execd/execd_commands.c | 24 ++++++------------------ + 1 file changed, 6 insertions(+), 18 deletions(-) + +diff --git a/daemons/execd/execd_commands.c b/daemons/execd/execd_commands.c +index 5bb2aab692..5e123e322e 100644 +--- a/daemons/execd/execd_commands.c ++++ b/daemons/execd/execd_commands.c +@@ -780,23 +780,6 @@ stonith2uniform_rc(const char *action, int rc) + return rc; + } + +-static int +-action_get_uniform_rc(svc_action_t *action) +-{ +- lrmd_cmd_t *cmd = action->cb_data; +- +- if (pcmk__str_eq(action->standard, PCMK_RESOURCE_CLASS_STONITH, +- pcmk__str_casei)) { +- return stonith2uniform_rc(cmd->action, action->rc); +- } else { +- enum ocf_exitcode code = services_result2ocf(action->standard, +- cmd->action, action->rc); +- +- // Cast variable instead of function return to keep compilers happy +- return (int) code; +- } +-} +- + struct notify_new_client_data { + xmlNode *notify; + pcmk__client_t *new_client; +@@ -848,6 +831,7 @@ action_complete(svc_action_t * action) + { + lrmd_rsc_t *rsc; + lrmd_cmd_t *cmd = action->cb_data; ++ enum ocf_exitcode code; + + #ifdef PCMK__TIME_USE_CGT + const char *rclass = NULL; +@@ -867,8 +851,12 @@ action_complete(svc_action_t * action) + #endif + + cmd->last_pid = action->pid; +- pcmk__set_result(&(cmd->result), action_get_uniform_rc(action), ++ ++ // Cast variable instead of function return to keep compilers happy ++ code = services_result2ocf(action->standard, cmd->action, action->rc); ++ pcmk__set_result(&(cmd->result), (int) code, + action->status, services__exit_reason(action)); ++ + rsc = cmd->rsc_id ? 
g_hash_table_lookup(rsc_list, cmd->rsc_id) : NULL; + + #ifdef PCMK__TIME_USE_CGT +-- +2.27.0 + + +From b5e31ba2539da4e94c124c3f0c8c72f7039f9a7a Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Mon, 22 Nov 2021 12:39:30 -0600 +Subject: [PATCH 05/17] Feature: executor: use full result from fencer for + fence actions + +Now that fence callbacks get the full result, we can improve the executor +command result for fence actions. stonith_action_complete() now takes a +full result, allowing the executor to use that directly rather than map a +legacy return code. +--- + daemons/execd/execd_commands.c | 140 +++++++++++++++++++-------------- + 1 file changed, 80 insertions(+), 60 deletions(-) + +diff --git a/daemons/execd/execd_commands.c b/daemons/execd/execd_commands.c +index 5e123e322e..e722994012 100644 +--- a/daemons/execd/execd_commands.c ++++ b/daemons/execd/execd_commands.c +@@ -8,6 +8,7 @@ + */ + + #include ++#include + + #include + +@@ -748,38 +749,6 @@ cmd_finalize(lrmd_cmd_t * cmd, lrmd_rsc_t * rsc) + } + } + +-static int +-stonith2uniform_rc(const char *action, int rc) +-{ +- switch (rc) { +- case pcmk_ok: +- rc = PCMK_OCF_OK; +- break; +- +- case -ENODEV: +- /* This should be possible only for probes in practice, but +- * interpret for all actions to be safe. +- */ +- if (pcmk__str_eq(action, "monitor", pcmk__str_casei)) { +- rc = PCMK_OCF_NOT_RUNNING; +- } else if (pcmk__str_eq(action, "stop", pcmk__str_casei)) { +- rc = PCMK_OCF_OK; +- } else { +- rc = PCMK_OCF_NOT_INSTALLED; +- } +- break; +- +- case -EOPNOTSUPP: +- rc = PCMK_OCF_UNIMPLEMENT_FEATURE; +- break; +- +- default: +- rc = PCMK_OCF_UNKNOWN_ERROR; +- break; +- } +- return rc; +-} +- + struct notify_new_client_data { + xmlNode *notify; + pcmk__client_t *new_client; +@@ -988,46 +957,84 @@ action_complete(svc_action_t * action) + cmd_finalize(cmd, rsc); + } + ++/*! 
++ * \internal ++ * \brief Process the result of a fence device action (start, stop, or monitor) ++ * ++ * \param[in] cmd Fence device action that completed ++ * \param[in] exit_status Fencer API exit status for action ++ * \param[in] execution_status Fencer API execution status for action ++ * \param[in] exit_reason Human-friendly detail, if action failed ++ */ + static void +-stonith_action_complete(lrmd_cmd_t * cmd, int rc) ++stonith_action_complete(lrmd_cmd_t *cmd, int exit_status, ++ enum pcmk_exec_status execution_status, ++ const char *exit_reason) + { + // This can be NULL if resource was removed before command completed + lrmd_rsc_t *rsc = g_hash_table_lookup(rsc_list, cmd->rsc_id); + +- cmd->result.exit_status = stonith2uniform_rc(cmd->action, rc); ++ // Simplify fencer exit status to uniform exit status ++ if (exit_status != CRM_EX_OK) { ++ exit_status = PCMK_OCF_UNKNOWN_ERROR; ++ } + +- /* This function may be called with status already set to cancelled, if a +- * pending action was aborted. Otherwise, we need to determine status from +- * the fencer return code. +- */ +- if (cmd->result.execution_status != PCMK_EXEC_CANCELLED) { +- cmd->result.execution_status = stonith__legacy2status(rc); ++ if (cmd->result.execution_status == PCMK_EXEC_CANCELLED) { ++ /* An in-flight fence action was cancelled. The execution status is ++ * already correct, so don't overwrite it. ++ */ ++ execution_status = PCMK_EXEC_CANCELLED; + +- // Simplify status codes from fencer +- switch (cmd->result.execution_status) { ++ } else { ++ /* Some execution status codes have specific meanings for the fencer ++ * that executor clients may not expect, so map them to a simple error ++ * status. 
++ */ ++ switch (execution_status) { + case PCMK_EXEC_NOT_CONNECTED: + case PCMK_EXEC_INVALID: +- case PCMK_EXEC_NO_FENCE_DEVICE: + case PCMK_EXEC_NO_SECRETS: +- cmd->result.execution_status = PCMK_EXEC_ERROR; ++ execution_status = PCMK_EXEC_ERROR; + break; +- default: ++ ++ case PCMK_EXEC_NO_FENCE_DEVICE: ++ /* This should be possible only for probes in practice, but ++ * interpret for all actions to be safe. ++ */ ++ if (pcmk__str_eq(cmd->action, CRMD_ACTION_STATUS, ++ pcmk__str_none)) { ++ exit_status = PCMK_OCF_NOT_RUNNING; ++ ++ } else if (pcmk__str_eq(cmd->action, CRMD_ACTION_STOP, ++ pcmk__str_none)) { ++ exit_status = PCMK_OCF_OK; ++ ++ } else { ++ exit_status = PCMK_OCF_NOT_INSTALLED; ++ } ++ execution_status = PCMK_EXEC_ERROR; + break; +- } + +- // Certain successful actions change the known state of the resource +- if ((rsc != NULL) && pcmk__result_ok(&(cmd->result))) { +- if (pcmk__str_eq(cmd->action, "start", pcmk__str_casei)) { +- rsc->st_probe_rc = pcmk_ok; // maps to PCMK_OCF_OK +- } else if (pcmk__str_eq(cmd->action, "stop", pcmk__str_casei)) { +- rsc->st_probe_rc = -ENODEV; // maps to PCMK_OCF_NOT_RUNNING +- } ++ case PCMK_EXEC_NOT_SUPPORTED: ++ exit_status = PCMK_OCF_UNIMPLEMENT_FEATURE; ++ break; ++ ++ default: ++ break; + } + } + +- // Give the user more detail than an OCF code +- if (rc != -pcmk_err_generic) { +- cmd->result.exit_reason = strdup(pcmk_strerror(rc)); ++ pcmk__set_result(&cmd->result, exit_status, execution_status, exit_reason); ++ ++ // Certain successful actions change the known state of the resource ++ if ((rsc != NULL) && pcmk__result_ok(&(cmd->result))) { ++ ++ if (pcmk__str_eq(cmd->action, "start", pcmk__str_casei)) { ++ rsc->st_probe_rc = pcmk_ok; // maps to PCMK_OCF_OK ++ ++ } else if (pcmk__str_eq(cmd->action, "stop", pcmk__str_casei)) { ++ rsc->st_probe_rc = -ENODEV; // maps to PCMK_OCF_NOT_RUNNING ++ } + } + + /* The recurring timer should not be running at this point in any case, but +@@ -1050,7 +1057,15 @@ 
stonith_action_complete(lrmd_cmd_t * cmd, int rc) + static void + lrmd_stonith_callback(stonith_t * stonith, stonith_callback_data_t * data) + { +- stonith_action_complete(data->userdata, data->rc); ++ if ((data == NULL) || (data->userdata == NULL)) { ++ crm_err("Ignoring fence action result: " ++ "Invalid callback arguments (bug?)"); ++ } else { ++ stonith_action_complete((lrmd_cmd_t *) data->userdata, ++ stonith__exit_status(data), ++ stonith__execution_status(data), ++ stonith__exit_reason(data)); ++ } + } + + void +@@ -1097,7 +1112,9 @@ stonith_connection_failed(void) + crm_err("Connection to fencer failed, finalizing %d pending operations", + g_list_length(cmd_list)); + for (cmd_iter = cmd_list; cmd_iter; cmd_iter = cmd_iter->next) { +- stonith_action_complete(cmd_iter->data, -ENOTCONN); ++ stonith_action_complete((lrmd_cmd_t *) cmd_iter->data, ++ CRM_EX_ERROR, PCMK_EXEC_NOT_CONNECTED, ++ "Lost connection to fencer"); + } + g_list_free(cmd_list); + } +@@ -1210,7 +1227,7 @@ lrmd_rsc_execute_stonith(lrmd_rsc_t * rsc, lrmd_cmd_t * cmd) + + } else if (pcmk__str_eq(cmd->action, "start", pcmk__str_casei)) { + rc = execd_stonith_start(stonith_api, rsc, cmd); +- if (rc == 0) { ++ if (rc == pcmk_ok) { + do_monitor = TRUE; + } + +@@ -1233,7 +1250,10 @@ lrmd_rsc_execute_stonith(lrmd_rsc_t * rsc, lrmd_cmd_t * cmd) + } + } + +- stonith_action_complete(cmd, rc); ++ stonith_action_complete(cmd, ++ ((rc == pcmk_ok)? CRM_EX_OK : CRM_EX_ERROR), ++ stonith__legacy2status(rc), ++ rc == -pcmk_err_generic? NULL : pcmk_strerror(rc)); + } + + static int +-- +2.27.0 + + +From 0cdc8506c2383cf05c2f62ab1ac9438958daf210 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Mon, 22 Nov 2021 16:15:05 -0600 +Subject: [PATCH 06/17] Fix: executor,scheduler: treat "no secrets" fence + results as a hard error + +Previously, the executor mapped the fencer's PCMK_EXEC_NO_SECRETS status to +PCMK_EXEC_ERROR to keep handling of that situation the same as before the new +code was added. 
+ +However, the earlier handling was less than ideal -- a resource action that +failed due to missing secrets would be retried on the same node, and almost +certainly fail again for the same reason. Now, the executor passes along +PCMK_EXEC_NO_SECRETS to clients; the controller will record the result in the +CIB status, and the scheduler will treat it as a hard error (i.e. not retrying +on the same node). + +Backward compatibility isn't a problem because the scheduler treats unknown +status codes the same as PCMK_EXEC_ERROR, so an older DC will continue to +handle it as before. The CRM feature set has been bumped so the handling can't +flip back and forth in a mixed-version cluster. +--- + daemons/execd/execd_commands.c | 1 - + include/crm/crm.h | 4 ++-- + lib/pengine/unpack.c | 3 --- + 3 files changed, 2 insertions(+), 6 deletions(-) + +diff --git a/daemons/execd/execd_commands.c b/daemons/execd/execd_commands.c +index e722994012..4ced6d1d5c 100644 +--- a/daemons/execd/execd_commands.c ++++ b/daemons/execd/execd_commands.c +@@ -993,7 +993,6 @@ stonith_action_complete(lrmd_cmd_t *cmd, int exit_status, + switch (execution_status) { + case PCMK_EXEC_NOT_CONNECTED: + case PCMK_EXEC_INVALID: +- case PCMK_EXEC_NO_SECRETS: + execution_status = PCMK_EXEC_ERROR; + break; + +diff --git a/include/crm/crm.h b/include/crm/crm.h +index 16b35e9c55..56b07cb12a 100644 +--- a/include/crm/crm.h ++++ b/include/crm/crm.h +@@ -1,5 +1,5 @@ + /* +- * Copyright 2004-2021 the Pacemaker project contributors ++ * Copyright 2004-2022 the Pacemaker project contributors + * + * The version control history for this file may have further details. 
+ * +@@ -66,7 +66,7 @@ extern "C" { + * >=3.0.13: Fail counts include operation name and interval + * >=3.2.0: DC supports PCMK_EXEC_INVALID and PCMK_EXEC_NOT_CONNECTED + */ +-# define CRM_FEATURE_SET "3.12.0" ++# define CRM_FEATURE_SET "3.13.0" + + /* Pacemaker's CPG protocols use fixed-width binary fields for the sender and + * recipient of a CPG message. This imposes an arbitrary limit on cluster node +diff --git a/lib/pengine/unpack.c b/lib/pengine/unpack.c +index 3e0384cd2a..8a2d2a6d6d 100644 +--- a/lib/pengine/unpack.c ++++ b/lib/pengine/unpack.c +@@ -3879,9 +3879,6 @@ unpack_rsc_op(pe_resource_t *rsc, pe_node_t *node, xmlNode *xml_op, + case PCMK_EXEC_INVALID: + break; // Not done, do error handling + +- /* These should only be possible in fence action results, not operation +- * history, but have some handling in place as a fail-safe. +- */ + case PCMK_EXEC_NO_FENCE_DEVICE: + case PCMK_EXEC_NO_SECRETS: + status = PCMK_EXEC_ERROR_HARD; +-- +2.27.0 + + +From 75c1bdcf3ffc406e6fa286fd5fcff83e1e65591a Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Wed, 10 Nov 2021 12:05:20 -0600 +Subject: [PATCH 07/17] Low: executor: improve result for fence device probes + +Now that lrmd_rsc_execute_stonith() sets a full result instead of just a legacy +return code, refactor lrmd_rsc_t's st_probe_rc as an execution status (and +rename to fence_probe_result). Set an appropriate exit reason when available. 
+--- + daemons/execd/execd_commands.c | 57 ++++++++++++++++++++++++++------- + daemons/execd/pacemaker-execd.h | 9 +++++- + 2 files changed, 54 insertions(+), 12 deletions(-) + +diff --git a/daemons/execd/execd_commands.c b/daemons/execd/execd_commands.c +index 4ced6d1d5c..6e5505e973 100644 +--- a/daemons/execd/execd_commands.c ++++ b/daemons/execd/execd_commands.c +@@ -285,7 +285,9 @@ build_rsc_from_xml(xmlNode * msg) + rsc->provider = crm_element_value_copy(rsc_xml, F_LRMD_PROVIDER); + rsc->type = crm_element_value_copy(rsc_xml, F_LRMD_TYPE); + rsc->work = mainloop_add_trigger(G_PRIORITY_HIGH, lrmd_rsc_dispatch, rsc); +- rsc->st_probe_rc = -ENODEV; // if stonith, initialize to "not running" ++ ++ // Initialize fence device probes (to return "not running") ++ rsc->fence_probe_result = PCMK_EXEC_NO_FENCE_DEVICE; + return rsc; + } + +@@ -1029,10 +1031,10 @@ stonith_action_complete(lrmd_cmd_t *cmd, int exit_status, + if ((rsc != NULL) && pcmk__result_ok(&(cmd->result))) { + + if (pcmk__str_eq(cmd->action, "start", pcmk__str_casei)) { +- rsc->st_probe_rc = pcmk_ok; // maps to PCMK_OCF_OK ++ rsc->fence_probe_result = PCMK_EXEC_DONE; // "running" + + } else if (pcmk__str_eq(cmd->action, "stop", pcmk__str_casei)) { +- rsc->st_probe_rc = -ENODEV; // maps to PCMK_OCF_NOT_RUNNING ++ rsc->fence_probe_result = PCMK_EXEC_NO_FENCE_DEVICE; // "not running" + } + } + +@@ -1081,14 +1083,13 @@ stonith_connection_failed(void) + if (pcmk__str_eq(rsc->class, PCMK_RESOURCE_CLASS_STONITH, pcmk__str_casei)) { + /* If we registered this fence device, we don't know whether the + * fencer still has the registration or not. Cause future probes to +- * return PCMK_OCF_UNKNOWN_ERROR until the resource is stopped or +- * started successfully. This is especially important if the +- * controller also went away (possibly due to a cluster layer +- * restart) and won't receive our client notification of any +- * monitors finalized below. 
++ * return an error until the resource is stopped or started ++ * successfully. This is especially important if the controller also ++ * went away (possibly due to a cluster layer restart) and won't ++ * receive our client notification of any monitors finalized below. + */ +- if (rsc->st_probe_rc == pcmk_ok) { +- rsc->st_probe_rc = pcmk_err_generic; ++ if (rsc->fence_probe_result == PCMK_EXEC_DONE) { ++ rsc->fence_probe_result = PCMK_EXEC_NOT_CONNECTED; + } + + if (rsc->active) { +@@ -1213,6 +1214,39 @@ execd_stonith_monitor(stonith_t *stonith_api, lrmd_rsc_t *rsc, lrmd_cmd_t *cmd) + return rc; + } + ++/*! ++ * \internal ++ * \brief Finalize the result of a fence device probe ++ * ++ * \param[in] cmd Probe action ++ * \param[in] probe_result Probe result ++ */ ++static void ++finalize_fence_device_probe(lrmd_cmd_t *cmd, enum pcmk_exec_status probe_result) ++{ ++ int exit_status = CRM_EX_ERROR; ++ const char *reason = NULL; ++ ++ switch (probe_result) { ++ case PCMK_EXEC_DONE: // Device is "running" ++ exit_status = CRM_EX_OK; ++ break; ++ ++ case PCMK_EXEC_NO_FENCE_DEVICE: // Device is "not running" ++ break; ++ ++ case PCMK_EXEC_NOT_CONNECTED: // stonith_connection_failed() ++ reason = "Lost connection to fencer"; ++ break; ++ ++ default: // Shouldn't be possible ++ probe_result = PCMK_EXEC_ERROR; ++ reason = "Invalid fence device probe result (bug?)"; ++ break; ++ } ++ stonith_action_complete(cmd, exit_status, probe_result, reason); ++} ++ + static void + lrmd_rsc_execute_stonith(lrmd_rsc_t * rsc, lrmd_cmd_t * cmd) + { +@@ -1237,7 +1271,8 @@ lrmd_rsc_execute_stonith(lrmd_rsc_t * rsc, lrmd_cmd_t * cmd) + if (cmd->interval_ms > 0) { + do_monitor = TRUE; + } else { +- rc = rsc->st_probe_rc; ++ finalize_fence_device_probe(cmd, rsc->fence_probe_result); ++ return; + } + } + +diff --git a/daemons/execd/pacemaker-execd.h b/daemons/execd/pacemaker-execd.h +index 51ef8d22e6..057d889584 100644 +--- a/daemons/execd/pacemaker-execd.h ++++ b/daemons/execd/pacemaker-execd.h 
+@@ -41,7 +41,14 @@ typedef struct lrmd_rsc_s { + * that have been handed off from the pending ops list. */ + GList *recurring_ops; + +- int st_probe_rc; // What value should be returned for a probe if stonith ++ /* If this resource is a fence device, probes are handled internally by the ++ * executor, and this value indicates the result that should currently be ++ * returned for probes. It should be one of: ++ * PCMK_EXEC_DONE (to indicate "running"), ++ * PCMK_EXEC_NO_FENCE_DEVICE ("not running"), or ++ * PCMK_EXEC_NOT_CONNECTED ("unknown because fencer connection was lost"). ++ */ ++ enum pcmk_exec_status fence_probe_result; + + crm_trigger_t *work; + } lrmd_rsc_t; +-- +2.27.0 + + +From 1ab799d945171ab8d91bd0aada64e70a71193e5c Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Wed, 10 Nov 2021 12:14:48 -0600 +Subject: [PATCH 08/17] Low: executor: don't require a fencer connection for + probes + +For fence devices, probe results are based on earlier state determinations, +so handle them before requiring an active fencer connection. The effect may be +negligible, but it would allow probes to proceed while waiting for a +reconnection. 
+--- + daemons/execd/execd_commands.c | 15 ++++++++------- + 1 file changed, 8 insertions(+), 7 deletions(-) + +diff --git a/daemons/execd/execd_commands.c b/daemons/execd/execd_commands.c +index 6e5505e973..5999ba19c9 100644 +--- a/daemons/execd/execd_commands.c ++++ b/daemons/execd/execd_commands.c +@@ -1255,7 +1255,13 @@ lrmd_rsc_execute_stonith(lrmd_rsc_t * rsc, lrmd_cmd_t * cmd) + + stonith_t *stonith_api = get_stonith_connection(); + +- if (!stonith_api) { ++ if (pcmk__str_eq(cmd->action, "monitor", pcmk__str_casei) ++ && (cmd->interval_ms == 0)) { ++ // Probes don't require a fencer connection ++ finalize_fence_device_probe(cmd, rsc->fence_probe_result); ++ return; ++ ++ } else if (stonith_api == NULL) { + rc = -ENOTCONN; + + } else if (pcmk__str_eq(cmd->action, "start", pcmk__str_casei)) { +@@ -1268,12 +1274,7 @@ lrmd_rsc_execute_stonith(lrmd_rsc_t * rsc, lrmd_cmd_t * cmd) + rc = execd_stonith_stop(stonith_api, rsc); + + } else if (pcmk__str_eq(cmd->action, "monitor", pcmk__str_casei)) { +- if (cmd->interval_ms > 0) { +- do_monitor = TRUE; +- } else { +- finalize_fence_device_probe(cmd, rsc->fence_probe_result); +- return; +- } ++ do_monitor = TRUE; + } + + if (do_monitor) { +-- +2.27.0 + + +From adf41fb1637bcc9a6e057be52d61a0b26e4535cc Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Wed, 10 Nov 2021 12:20:34 -0600 +Subject: [PATCH 09/17] Low: executor: return an error for unsupported fence + device actions + +... and set an exit reason. Previously, it would return success for unsupported +actions. It shouldn't be possible, but it would be nice to have an indication +of what is wrong if a bug is introduced. 
+--- + daemons/execd/execd_commands.c | 6 ++++++ + 1 file changed, 6 insertions(+) + +diff --git a/daemons/execd/execd_commands.c b/daemons/execd/execd_commands.c +index 5999ba19c9..772d6446dc 100644 +--- a/daemons/execd/execd_commands.c ++++ b/daemons/execd/execd_commands.c +@@ -1275,6 +1275,12 @@ lrmd_rsc_execute_stonith(lrmd_rsc_t * rsc, lrmd_cmd_t * cmd) + + } else if (pcmk__str_eq(cmd->action, "monitor", pcmk__str_casei)) { + do_monitor = TRUE; ++ ++ } else { ++ stonith_action_complete(cmd, PCMK_OCF_UNIMPLEMENT_FEATURE, ++ PCMK_EXEC_ERROR, ++ "Invalid fence device action (bug?)"); ++ return; + } + + if (do_monitor) { +-- +2.27.0 + + +From af59dfe85bc83f5609d0a3b3b7939271549cb76f Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Wed, 10 Nov 2021 12:24:07 -0600 +Subject: [PATCH 10/17] Low: executor: set exit reason if no fencer connection + +--- + daemons/execd/execd_commands.c | 5 ++++- + 1 file changed, 4 insertions(+), 1 deletion(-) + +diff --git a/daemons/execd/execd_commands.c b/daemons/execd/execd_commands.c +index 772d6446dc..7ae309d94c 100644 +--- a/daemons/execd/execd_commands.c ++++ b/daemons/execd/execd_commands.c +@@ -1262,7 +1262,10 @@ lrmd_rsc_execute_stonith(lrmd_rsc_t * rsc, lrmd_cmd_t * cmd) + return; + + } else if (stonith_api == NULL) { +- rc = -ENOTCONN; ++ stonith_action_complete(cmd, PCMK_OCF_UNKNOWN_ERROR, ++ PCMK_EXEC_NOT_CONNECTED, ++ "No connection to fencer"); ++ return; + + } else if (pcmk__str_eq(cmd->action, "start", pcmk__str_casei)) { + rc = execd_stonith_start(stonith_api, rsc, cmd); +-- +2.27.0 + + +From ad0930b75d5617490c3a0dc3c6b83411b3c4536d Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Wed, 10 Nov 2021 14:42:26 -0600 +Subject: [PATCH 11/17] Test: cts-fence-helper: log full result in fence + callback + +--- + daemons/fenced/cts-fence-helper.c | 7 +++++-- + 1 file changed, 5 insertions(+), 2 deletions(-) + +diff --git a/daemons/fenced/cts-fence-helper.c b/daemons/fenced/cts-fence-helper.c +index 2adb032f24..c2b55d73b9 
100644 +--- a/daemons/fenced/cts-fence-helper.c ++++ b/daemons/fenced/cts-fence-helper.c +@@ -1,5 +1,5 @@ + /* +- * Copyright 2009-2020 the Pacemaker project contributors ++ * Copyright 2009-2021 the Pacemaker project contributors + * + * This source code is licensed under the GNU General Public License version 2 + * or later (GPLv2+) WITHOUT ANY WARRANTY. +@@ -132,7 +132,10 @@ st_callback(stonith_t * st, stonith_event_t * e) + static void + st_global_callback(stonith_t * stonith, stonith_callback_data_t * data) + { +- crm_notice("Call id %d completed with rc %d", data->call_id, data->rc); ++ crm_notice("Call %d exited %d: %s (%s)", ++ data->call_id, stonith__exit_status(data), ++ pcmk_exec_status_str(stonith__execution_status(data)), ++ crm_str(stonith__exit_reason(data))); + } + + static void +-- +2.27.0 + + +From 1b50ff4d83b7a96cd70389891b7b6568812f66f6 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Wed, 10 Nov 2021 15:10:14 -0600 +Subject: [PATCH 12/17] Test: cts-fence-helper: track full result instead of + legacy return code + +--- + daemons/fenced/cts-fence-helper.c | 77 +++++++++++++++---------------- + 1 file changed, 37 insertions(+), 40 deletions(-) + +diff --git a/daemons/fenced/cts-fence-helper.c b/daemons/fenced/cts-fence-helper.c +index c2b55d73b9..2739f57804 100644 +--- a/daemons/fenced/cts-fence-helper.c ++++ b/daemons/fenced/cts-fence-helper.c +@@ -34,23 +34,12 @@ + static GMainLoop *mainloop = NULL; + static crm_trigger_t *trig = NULL; + static int mainloop_iter = 0; +-static int callback_rc = 0; ++static pcmk__action_result_t result = PCMK__UNKNOWN_RESULT; ++ + typedef void (*mainloop_test_iteration_cb) (int check_event); + + #define MAINLOOP_DEFAULT_TIMEOUT 2 + +-#define mainloop_test_done(pass) \ +- if (pass) { \ +- crm_info("SUCCESS - %s", __func__); \ +- mainloop_iter++; \ +- mainloop_set_trigger(trig); \ +- } else { \ +- crm_err("FAILURE = %s async_callback %d", __func__, callback_rc); \ +- crm_exit(CRM_EX_ERROR); \ +- } \ +- callback_rc = 0; \ +- +- + enum 
test_modes { + test_standard = 0, // test using a specific developer environment + test_passive, // watch notifications only +@@ -93,6 +82,23 @@ static const int st_opts = st_opt_sync_call; + static int expected_notifications = 0; + static int verbose = 0; + ++static void ++mainloop_test_done(const char *origin, bool pass) ++{ ++ if (pass) { ++ crm_info("SUCCESS - %s", origin); ++ mainloop_iter++; ++ mainloop_set_trigger(trig); ++ result.execution_status = PCMK_EXEC_UNKNOWN; ++ result.exit_status = CRM_EX_OK; ++ } else { ++ crm_err("FAILURE - %s (%d: %s)", origin, result.exit_status, ++ pcmk_exec_status_str(result.execution_status)); ++ crm_exit(CRM_EX_ERROR); ++ } ++} ++ ++ + static void + dispatch_helper(int timeout) + { +@@ -385,7 +391,9 @@ static void + static void + mainloop_callback(stonith_t * stonith, stonith_callback_data_t * data) + { +- callback_rc = data->rc; ++ pcmk__set_result(&result, stonith__exit_status(data), ++ stonith__execution_status(data), ++ stonith__exit_reason(data)); + iterate_mainloop_tests(TRUE); + } + +@@ -404,18 +412,14 @@ test_async_fence_pass(int check_event) + int rc = 0; + + if (check_event) { +- if (callback_rc != 0) { +- mainloop_test_done(FALSE); +- } else { +- mainloop_test_done(TRUE); +- } ++ mainloop_test_done(__func__, (result.exit_status == CRM_EX_OK)); + return; + } + + rc = st->cmds->fence(st, 0, "true_1_node1", "off", MAINLOOP_DEFAULT_TIMEOUT, 0); + if (rc < 0) { + crm_err("fence failed with rc %d", rc); +- mainloop_test_done(FALSE); ++ mainloop_test_done(__func__, false); + } + register_callback_helper(rc); + /* wait for event */ +@@ -431,15 +435,15 @@ test_async_fence_custom_timeout(int check_event) + if (check_event) { + uint32_t diff = (time(NULL) - begin); + +- if (callback_rc != -ETIME) { +- mainloop_test_done(FALSE); ++ if (result.execution_status != PCMK_EXEC_TIMEOUT) { ++ mainloop_test_done(__func__, false); + } else if (diff < CUSTOM_TIMEOUT_ADDITION + MAINLOOP_DEFAULT_TIMEOUT) { + crm_err + ("Custom timeout 
test failed, callback expiration should be updated to %d, actual timeout was %d", + CUSTOM_TIMEOUT_ADDITION + MAINLOOP_DEFAULT_TIMEOUT, diff); +- mainloop_test_done(FALSE); ++ mainloop_test_done(__func__, false); + } else { +- mainloop_test_done(TRUE); ++ mainloop_test_done(__func__, true); + } + return; + } +@@ -448,7 +452,7 @@ test_async_fence_custom_timeout(int check_event) + rc = st->cmds->fence(st, 0, "custom_timeout_node1", "off", MAINLOOP_DEFAULT_TIMEOUT, 0); + if (rc < 0) { + crm_err("fence failed with rc %d", rc); +- mainloop_test_done(FALSE); ++ mainloop_test_done(__func__, false); + } + register_callback_helper(rc); + /* wait for event */ +@@ -460,18 +464,15 @@ test_async_fence_timeout(int check_event) + int rc = 0; + + if (check_event) { +- if (callback_rc != -ENODEV) { +- mainloop_test_done(FALSE); +- } else { +- mainloop_test_done(TRUE); +- } ++ mainloop_test_done(__func__, ++ (result.execution_status == PCMK_EXEC_NO_FENCE_DEVICE)); + return; + } + + rc = st->cmds->fence(st, 0, "false_1_node2", "off", MAINLOOP_DEFAULT_TIMEOUT, 0); + if (rc < 0) { + crm_err("fence failed with rc %d", rc); +- mainloop_test_done(FALSE); ++ mainloop_test_done(__func__, false); + } + register_callback_helper(rc); + /* wait for event */ +@@ -483,18 +484,14 @@ test_async_monitor(int check_event) + int rc = 0; + + if (check_event) { +- if (callback_rc) { +- mainloop_test_done(FALSE); +- } else { +- mainloop_test_done(TRUE); +- } ++ mainloop_test_done(__func__, (result.exit_status == CRM_EX_OK)); + return; + } + + rc = st->cmds->monitor(st, 0, "false_1", MAINLOOP_DEFAULT_TIMEOUT); + if (rc < 0) { + crm_err("monitor failed with rc %d", rc); +- mainloop_test_done(FALSE); ++ mainloop_test_done(__func__, false); + } + + register_callback_helper(rc); +@@ -531,7 +528,7 @@ test_register_async_devices(int check_event) + params); + stonith_key_value_freeall(params, 1, 1); + +- mainloop_test_done(TRUE); ++ mainloop_test_done(__func__, true); + } + + static void +@@ -540,11 +537,11 @@ 
try_mainloop_connect(int check_event) + int rc = stonith_api_connect_retry(st, crm_system_name, 10); + + if (rc == pcmk_ok) { +- mainloop_test_done(TRUE); ++ mainloop_test_done(__func__, true); + return; + } + crm_err("API CONNECTION FAILURE"); +- mainloop_test_done(FALSE); ++ mainloop_test_done(__func__, false); + } + + static void +-- +2.27.0 + + +From 8ff4b384a34828a4a9eebe896324ba8c89e5d66c Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Mon, 10 Jan 2022 10:27:45 -0600 +Subject: [PATCH 13/17] Doc: Pacemaker Development: correct typo + +caught in review +--- + doc/sphinx/Pacemaker_Development/components.rst | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/doc/sphinx/Pacemaker_Development/components.rst b/doc/sphinx/Pacemaker_Development/components.rst +index 68158484ce..c4d10fc9f5 100644 +--- a/doc/sphinx/Pacemaker_Development/components.rst ++++ b/doc/sphinx/Pacemaker_Development/components.rst +@@ -171,7 +171,7 @@ messaging layer callback, which calls: + + * ``fenced_process_fencing_reply()``, which calls either + ``request_peer_fencing()`` (to retry a failed operation, or try the next +- device in a topology is appropriate, which issues a new ++ device in a topology if appropriate, which issues a new + ``STONITH_OP_FENCE`` request, proceeding as before) or + ``finalize_op()`` (if the operation is definitively failed or + successful). 
+-- +2.27.0 + + +From 822ee6fbd8583a2939c636b3bccceffcc338c567 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Mon, 10 Jan 2022 11:05:40 -0600 +Subject: [PATCH 14/17] Doc: Pacemaker Development: add a placeholder for how + fencing history works + +--- + doc/sphinx/Pacemaker_Development/components.rst | 15 +++++++++++++++ + 1 file changed, 15 insertions(+) + +diff --git a/doc/sphinx/Pacemaker_Development/components.rst b/doc/sphinx/Pacemaker_Development/components.rst +index c4d10fc9f5..760da77c9b 100644 +--- a/doc/sphinx/Pacemaker_Development/components.rst ++++ b/doc/sphinx/Pacemaker_Development/components.rst +@@ -183,6 +183,21 @@ Finally, all peers receive the broadcast result and call + * ``finalize_op()``, which sends the result to all local clients. + + ++.. index:: ++ single: fence history ++ ++Fencing History ++_______________ ++ ++The fencer keeps a running history of all fencing operations. The bulk of the ++relevant code is in ``fenced_history.c`` and ensures the history is synchronized ++across all nodes even if a node leaves and rejoins the cluster. ++ ++In libstonithd, this information is represented by ``stonith_history_t`` and is ++queryable by the ``stonith_api_operations_t:history()`` method. ``crm_mon`` and ++``stonith_admin`` use this API to display the history. ++ ++ + .. 
index:: + single: scheduler + single: pacemaker-schedulerd +-- +2.27.0 + + +From d9b4060f2dadb40d5ee7535e0b2890a83d216c1e Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Mon, 10 Jan 2022 11:25:31 -0600 +Subject: [PATCH 15/17] Log: fencing: add exit reason for results without a + callback + +--- + lib/fencing/st_client.c | 6 ++++-- + 1 file changed, 4 insertions(+), 2 deletions(-) + +diff --git a/lib/fencing/st_client.c b/lib/fencing/st_client.c +index 9d93ffd481..4823751267 100644 +--- a/lib/fencing/st_client.c ++++ b/lib/fencing/st_client.c +@@ -926,9 +926,11 @@ invoke_registered_callbacks(stonith_t *stonith, xmlNode *msg, int call_id) + cb_info->user_data, cb_info->callback); + + } else if ((private->op_callback == NULL) && !pcmk__result_ok(&result)) { +- crm_warn("Fencing action without registered callback failed: %d (%s)", ++ crm_warn("Fencing action without registered callback failed: %d (%s%s%s)", + result.exit_status, +- pcmk_exec_status_str(result.execution_status)); ++ pcmk_exec_status_str(result.execution_status), ++ ((result.exit_reason == NULL)? "" : ": "), ++ ((result.exit_reason == NULL)? "" : result.exit_reason)); + crm_log_xml_debug(msg, "Failed fence update"); + } + +-- +2.27.0 + + +From 9956b3ad2f1c6fba305252616ad0b35a38ab96da Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Tue, 11 Jan 2022 09:28:27 -0600 +Subject: [PATCH 16/17] Refactor: executor: keep formatting consistent + +... even if the line runs a little long +--- + daemons/execd/execd_commands.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/daemons/execd/execd_commands.c b/daemons/execd/execd_commands.c +index 7ae309d94c..bc3b392b2c 100644 +--- a/daemons/execd/execd_commands.c ++++ b/daemons/execd/execd_commands.c +@@ -1,5 +1,5 @@ + /* +- * Copyright 2012-2021 the Pacemaker project contributors ++ * Copyright 2012-2022 the Pacemaker project contributors + * + * The version control history for this file may have further details. 
+ * +@@ -1297,7 +1297,7 @@ lrmd_rsc_execute_stonith(lrmd_rsc_t * rsc, lrmd_cmd_t * cmd) + stonith_action_complete(cmd, + ((rc == pcmk_ok)? CRM_EX_OK : CRM_EX_ERROR), + stonith__legacy2status(rc), +- rc == -pcmk_err_generic? NULL : pcmk_strerror(rc)); ++ ((rc == -pcmk_err_generic)? NULL : pcmk_strerror(rc))); + } + + static int +-- +2.27.0 + + +From 69d8ecb17568d6c3ecad0e5735756f58a4bce5a1 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Tue, 11 Jan 2022 09:29:03 -0600 +Subject: [PATCH 17/17] Test: cts-fence-helper: use more intuitive execution + status for completed tests + +It doesn't matter since the value is only checked against a couple of specific +failure values, but this is less confusing. +--- + daemons/fenced/cts-fence-helper.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/daemons/fenced/cts-fence-helper.c b/daemons/fenced/cts-fence-helper.c +index 2739f57804..e222a59f9f 100644 +--- a/daemons/fenced/cts-fence-helper.c ++++ b/daemons/fenced/cts-fence-helper.c +@@ -1,5 +1,5 @@ + /* +- * Copyright 2009-2021 the Pacemaker project contributors ++ * Copyright 2009-2022 the Pacemaker project contributors + * + * This source code is licensed under the GNU General Public License version 2 + * or later (GPLv2+) WITHOUT ANY WARRANTY. 
+@@ -89,7 +89,7 @@ mainloop_test_done(const char *origin, bool pass) + crm_info("SUCCESS - %s", origin); + mainloop_iter++; + mainloop_set_trigger(trig); +- result.execution_status = PCMK_EXEC_UNKNOWN; ++ result.execution_status = PCMK_EXEC_DONE; + result.exit_status = CRM_EX_OK; + } else { + crm_err("FAILURE - %s (%d: %s)", origin, result.exit_status, +-- +2.27.0 + diff --git a/SOURCES/012-notify-crash.patch b/SOURCES/012-notify-crash.patch new file mode 100644 index 0000000..c18e4f5 --- /dev/null +++ b/SOURCES/012-notify-crash.patch @@ -0,0 +1,65 @@ +From ed8b2c86ab77aaa3d7fd688c049ad5e1b922a9c6 Mon Sep 17 00:00:00 2001 +From: Reid Wahl +Date: Thu, 13 Jan 2022 02:56:55 -0800 +Subject: [PATCH] Fix: liblrmd: Avoid double-free during notify operation + +This commit fixes a regression introduced by 31c7fa8a, causing a +double-free in notify operations. lrmd_dispatch_internal() assigns the +exit_reason string directly from an XML node to a new lrmd_event_data_t +object (without duplicating), and this string gets freed twice. + +Free #1: pcmk__create_history_xml() (reached via callback) calls +lrmd__set_result(), which frees event.exit_reason and sets it to NULL. +Free #2: lrmd_ipc_dispatch() frees the XML node, which contains a +pointer to the exit_reason string just freed, after +lrmd_dispatch_internal() returns. + +Prior to 31c7fa8a, pcmk__create_history_xml reset event.rc and +event.op_status but **not** event.exit_reason. + +In this commit we simply make a copy of event.exit_reason in +lrmd_dispatch_internal() before the callback. This way we don't have to +worry about whatever happens in the callback, and we can continue to +unset the exit_reason alongside the rc and op_status. The added overhead +should be minimal. + +This commit also makes a copy of output. That's not strictly necessary +but adds some futureproofing and allows us to call lrmd__reset_result() +at the end of lrmd_dispatch_internal(). 
+ +Resolves: RHBZ#2039675 + +Signed-off-by: Reid Wahl +--- + lib/lrmd/lrmd_client.c | 8 +++++--- + 1 file changed, 5 insertions(+), 3 deletions(-) + +diff --git a/lib/lrmd/lrmd_client.c b/lib/lrmd/lrmd_client.c +index ee31bb5ae9..5131a648b7 100644 +--- a/lib/lrmd/lrmd_client.c ++++ b/lib/lrmd/lrmd_client.c +@@ -305,9 +305,10 @@ lrmd_dispatch_internal(lrmd_t * lrmd, xmlNode * msg) + event.user_data = crm_element_value(msg, F_LRMD_RSC_USERDATA_STR); + event.type = lrmd_event_exec_complete; + +- // No need to duplicate the memory, so don't use setter functions +- event.output = crm_element_value(msg, F_LRMD_RSC_OUTPUT); +- event.exit_reason = crm_element_value(msg, F_LRMD_RSC_EXIT_REASON); ++ /* output and exit_reason may be freed by a callback */ ++ event.output = crm_element_value_copy(msg, F_LRMD_RSC_OUTPUT); ++ lrmd__set_result(&event, event.rc, event.op_status, ++ crm_element_value(msg, F_LRMD_RSC_EXIT_REASON)); + + event.params = xml2list(msg); + } else if (pcmk__str_eq(type, LRMD_OP_NEW_CLIENT, pcmk__str_none)) { +@@ -324,6 +325,7 @@ lrmd_dispatch_internal(lrmd_t * lrmd, xmlNode * msg) + if (event.params) { + g_hash_table_destroy(event.params); + } ++ lrmd__reset_result(&event); + } + + // \return Always 0, to indicate that IPC mainloop source should be kept +-- +2.27.0 + diff --git a/SOURCES/013-probe-failures.patch b/SOURCES/013-probe-failures.patch new file mode 100644 index 0000000..c13867e --- /dev/null +++ b/SOURCES/013-probe-failures.patch @@ -0,0 +1,26 @@ +From 186d5a02fba919c455fd6eeb050b4be107f82159 Mon Sep 17 00:00:00 2001 +From: Chris Lumens +Date: Thu, 13 Jan 2022 17:02:47 -0500 +Subject: [PATCH] Low: scheduler: Use the old RC code to log maskable probe + failures. 
+ +--- + lib/pengine/unpack.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/lib/pengine/unpack.c b/lib/pengine/unpack.c +index 8a2d2a6d6d..b01f86257a 100644 +--- a/lib/pengine/unpack.c ++++ b/lib/pengine/unpack.c +@@ -3780,7 +3780,7 @@ unpack_rsc_op(pe_resource_t *rsc, pe_node_t *node, xmlNode *xml_op, + + if (maskable_probe_failure) { + crm_notice("Treating probe result '%s' for %s on %s as 'not running'", +- services_ocf_exitcode_str(rc), rsc->id, node->details->uname); ++ services_ocf_exitcode_str(old_rc), rsc->id, node->details->uname); + update_resource_state(rsc, node, xml_op, task, target_rc, *last_failure, + on_fail, data_set); + crm_xml_add(xml_op, XML_ATTR_UNAME, node->details->uname); +-- +2.27.0 + diff --git a/SOURCES/014-pcmk_delay_base.patch b/SOURCES/014-pcmk_delay_base.patch new file mode 100644 index 0000000..8aba265 --- /dev/null +++ b/SOURCES/014-pcmk_delay_base.patch @@ -0,0 +1,43 @@ +From 9d812b0401d4cedef53a3cc3653ec782a5c49e37 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Thu, 13 Jan 2022 10:42:02 -0600 +Subject: [PATCH] Doc: fencer: improve pcmk_delay_base meta-data + +Update its type, since its value can now be a node map as well as a string, +and add more detail to its description. 
+--- + daemons/fenced/pacemaker-fenced.c | 18 +++++++++++------- + 1 file changed, 11 insertions(+), 7 deletions(-) + +diff --git a/daemons/fenced/pacemaker-fenced.c b/daemons/fenced/pacemaker-fenced.c +index 1b954be5a4..12f331496c 100644 +--- a/daemons/fenced/pacemaker-fenced.c ++++ b/daemons/fenced/pacemaker-fenced.c +@@ -1548,13 +1548,17 @@ main(int argc, char **argv) + PCMK_STONITH_DELAY_BASE); + printf(" Enable a base delay for " + "fencing actions and specify base delay value.\n"); +- printf(" This prevents double fencing when " +- "different delays are configured on the nodes.\nUse this to " +- "enable a static delay for fencing actions.\nThe overall delay " +- "is derived from a random delay value adding this static delay " +- "so that the sum is kept below the maximum delay.\nSet to eg. " +- "node1:1s;node2:5 to set different value per node.\n"); +- printf(" \n"); ++ printf(" This enables a static delay for " ++ "fencing actions, which can help avoid \"death matches\" where " ++ "two nodes try to fence each other at the same time. If " ++ PCMK_STONITH_DELAY_MAX " is also used, a random delay will be " ++ "added such that the total delay is kept below that value.\n" ++ "This can be set to a single time value to apply to any node " ++ "targeted by this device (useful if a separate device is " ++ "configured for each target), or to a node map (for example, " ++ "\"node1:1s;node2:5\") to set a different value per target.\n" ++ " \n"); ++ printf(" \n"); + printf(" \n"); + + printf(" \n", +-- +2.27.0 + diff --git a/SOURCES/015-fencing-reasons.patch b/SOURCES/015-fencing-reasons.patch new file mode 100644 index 0000000..c53b6c9 --- /dev/null +++ b/SOURCES/015-fencing-reasons.patch @@ -0,0 +1,1093 @@ +From 87365f49b1bee0baa536783865fbd835a9cacc97 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Thu, 2 Dec 2021 16:12:24 -0600 +Subject: [PATCH 01/11] Refactor: libstonithd: functionize getting notification + data XML + +Also, only get the data when needed. 
+--- + lib/fencing/st_client.c | 32 +++++++++++++++++++++++--------- + 1 file changed, 23 insertions(+), 9 deletions(-) + +diff --git a/lib/fencing/st_client.c b/lib/fencing/st_client.c +index 4823751267..72a0a49408 100644 +--- a/lib/fencing/st_client.c ++++ b/lib/fencing/st_client.c +@@ -1312,6 +1312,23 @@ stonith_dump_pending_callbacks(stonith_t * stonith) + return g_hash_table_foreach(private->stonith_op_callback_table, stonith_dump_pending_op, NULL); + } + ++/*! ++ * \internal ++ * \brief Get the data section of a fencer notification ++ * ++ * \param[in] msg Notification XML ++ * \param[in] ntype Notification type ++ */ ++static xmlNode * ++get_event_data_xml(xmlNode *msg, const char *ntype) ++{ ++ char *data_addr = crm_strdup_printf("//%s", ntype); ++ xmlNode *data = get_xpath_object(data_addr, msg, LOG_DEBUG); ++ ++ free(data_addr); ++ return data; ++} ++ + /* + + +@@ -1336,17 +1353,18 @@ xml_to_event(xmlNode * msg) + { + stonith_event_t *event = calloc(1, sizeof(stonith_event_t)); + const char *ntype = crm_element_value(msg, F_SUBTYPE); +- char *data_addr = crm_strdup_printf("//%s", ntype); +- xmlNode *data = get_xpath_object(data_addr, msg, LOG_DEBUG); + + crm_log_xml_trace(msg, "stonith_notify"); + + crm_element_value_int(msg, F_STONITH_RC, &(event->result)); + + if (pcmk__str_eq(ntype, T_STONITH_NOTIFY_FENCE, pcmk__str_casei)) { +- event->operation = crm_element_value_copy(msg, F_STONITH_OPERATION); ++ xmlNode *data = get_event_data_xml(msg, ntype); + +- if (data) { ++ if (data == NULL) { ++ crm_err("No data for %s event", ntype); ++ crm_log_xml_notice(msg, "BadEvent"); ++ } else { + event->origin = crm_element_value_copy(data, F_STONITH_ORIGIN); + event->action = crm_element_value_copy(data, F_STONITH_ACTION); + event->target = crm_element_value_copy(data, F_STONITH_TARGET); +@@ -1354,14 +1372,10 @@ xml_to_event(xmlNode * msg) + event->id = crm_element_value_copy(data, F_STONITH_REMOTE_OP_ID); + event->client_origin = crm_element_value_copy(data, 
F_STONITH_CLIENTNAME); + event->device = crm_element_value_copy(data, F_STONITH_DEVICE); +- +- } else { +- crm_err("No data for %s event", ntype); +- crm_log_xml_notice(msg, "BadEvent"); + } ++ event->operation = crm_element_value_copy(msg, F_STONITH_OPERATION); + } + +- free(data_addr); + return event; + } + +-- +2.27.0 + + +From 448f86a029d5d7e3c255d813929003a8cc2cffba Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Fri, 19 Nov 2021 17:01:23 -0600 +Subject: [PATCH 02/11] Refactor: fencing: parse full result from fencer + notifications + +stonith_event_t previously contained only the legacy return code for the +notification event. Use its new opaque member to store the full result, along +with accessors (available only internally for now). Nothing uses them yet. +--- + include/crm/fencing/internal.h | 5 +++ + lib/fencing/st_client.c | 68 ++++++++++++++++++++++++++++++++-- + 2 files changed, 70 insertions(+), 3 deletions(-) + +diff --git a/include/crm/fencing/internal.h b/include/crm/fencing/internal.h +index eff689e59b..acc16d05e9 100644 +--- a/include/crm/fencing/internal.h ++++ b/include/crm/fencing/internal.h +@@ -187,10 +187,15 @@ bool stonith__event_state_eq(stonith_history_t *history, void *user_data); + bool stonith__event_state_neq(stonith_history_t *history, void *user_data); + + int stonith__legacy2status(int rc); ++ + int stonith__exit_status(stonith_callback_data_t *data); + int stonith__execution_status(stonith_callback_data_t *data); + const char *stonith__exit_reason(stonith_callback_data_t *data); + ++int stonith__event_exit_status(stonith_event_t *event); ++int stonith__event_execution_status(stonith_event_t *event); ++const char *stonith__event_exit_reason(stonith_event_t *event); ++ + /*! + * \internal + * \brief Is a fencing operation in pending state? 
+diff --git a/lib/fencing/st_client.c b/lib/fencing/st_client.c +index 72a0a49408..f58b3a6745 100644 +--- a/lib/fencing/st_client.c ++++ b/lib/fencing/st_client.c +@@ -1349,15 +1349,23 @@ get_event_data_xml(xmlNode *msg, const char *ntype) + + */ + static stonith_event_t * +-xml_to_event(xmlNode * msg) ++xml_to_event(xmlNode *msg, pcmk__action_result_t *result) + { + stonith_event_t *event = calloc(1, sizeof(stonith_event_t)); + const char *ntype = crm_element_value(msg, F_SUBTYPE); + ++ CRM_ASSERT((event != NULL) && (result != NULL)); ++ + crm_log_xml_trace(msg, "stonith_notify"); + +- crm_element_value_int(msg, F_STONITH_RC, &(event->result)); ++ // All notification types have the operation result ++ event->opaque = result; ++ stonith__xe_get_result(msg, result); ++ ++ // @COMPAT The API originally provided the result as a legacy return code ++ event->result = pcmk_rc2legacy(stonith__result2rc(result)); + ++ // Fence notifications have additional information + if (pcmk__str_eq(ntype, T_STONITH_NOTIFY_FENCE, pcmk__str_casei)) { + xmlNode *data = get_event_data_xml(msg, ntype); + +@@ -1392,6 +1400,7 @@ event_free(stonith_event_t * event) + free(event->executioner); + free(event->device); + free(event->client_origin); ++ pcmk__reset_result((pcmk__action_result_t *) (event->opaque)); + free(event); + } + +@@ -1402,6 +1411,7 @@ stonith_send_notification(gpointer data, gpointer user_data) + stonith_notify_client_t *entry = data; + stonith_event_t *st_event = NULL; + const char *event = NULL; ++ pcmk__action_result_t result = PCMK__UNKNOWN_RESULT; + + if (blob->xml == NULL) { + crm_warn("Skipping callback - NULL message"); +@@ -1427,7 +1437,7 @@ stonith_send_notification(gpointer data, gpointer user_data) + return; + } + +- st_event = xml_to_event(blob->xml); ++ st_event = xml_to_event(blob->xml, &result); + + crm_trace("Invoking callback for %p/%s event...", entry, event); + entry->notify(blob->stonith, st_event); +@@ -2366,6 +2376,58 @@ 
stonith__exit_reason(stonith_callback_data_t *data) + return ((pcmk__action_result_t *) data->opaque)->exit_reason; + } + ++/*! ++ * \internal ++ * \brief Return the exit status from an event notification ++ * ++ * \param[in] event Event ++ * ++ * \return Exit status from event ++ */ ++int ++stonith__event_exit_status(stonith_event_t *event) ++{ ++ if ((event == NULL) || (event->opaque == NULL)) { ++ return CRM_EX_ERROR; ++ } ++ return ((pcmk__action_result_t *) event->opaque)->exit_status; ++} ++ ++/*! ++ * \internal ++ * \brief Return the execution status from an event notification ++ * ++ * \param[in] event Event ++ * ++ * \return Execution status from event ++ */ ++int ++stonith__event_execution_status(stonith_event_t *event) ++{ ++ if ((event == NULL) || (event->opaque == NULL)) { ++ return PCMK_EXEC_UNKNOWN; ++ } ++ return ((pcmk__action_result_t *) event->opaque)->execution_status; ++} ++ ++/*! ++ * \internal ++ * \brief Return the exit reason from an event notification ++ * ++ * \param[in] event Event ++ * ++ * \return Exit reason from event ++ */ ++const char * ++stonith__event_exit_reason(stonith_event_t *event) ++{ ++ if ((event == NULL) || (event->opaque == NULL)) { ++ return NULL; ++ } ++ return ((pcmk__action_result_t *) event->opaque)->exit_reason; ++} ++ ++ + // Deprecated functions kept only for backward API compatibility + // LCOV_EXCL_START + +-- +2.27.0 + + +From 8dab65e65fe760052d1151749a7bfb2203445813 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Fri, 19 Nov 2021 17:02:28 -0600 +Subject: [PATCH 03/11] Refactor: fencing: parse full result from synchronous + fencer replies + +stonith_send_command() now parses the full result from synchronous fencer +replies, and maps that to a legacy return code, rather than parse the legacy +return code directly. + +The full result is not used yet, and won't be until we can break backward API +compatibility, since the API functions that call stonith_send_command() +currently return a legacy code. 
+--- + lib/fencing/st_client.c | 8 +++++--- + 1 file changed, 5 insertions(+), 3 deletions(-) + +diff --git a/lib/fencing/st_client.c b/lib/fencing/st_client.c +index f58b3a6745..5fec7529e3 100644 +--- a/lib/fencing/st_client.c ++++ b/lib/fencing/st_client.c +@@ -1537,11 +1537,13 @@ stonith_send_command(stonith_t * stonith, const char *op, xmlNode * data, xmlNod + crm_element_value_int(op_reply, F_STONITH_CALLID, &reply_id); + + if (reply_id == stonith->call_id) { ++ pcmk__action_result_t result = PCMK__UNKNOWN_RESULT; ++ + crm_trace("Synchronous reply %d received", reply_id); + +- if (crm_element_value_int(op_reply, F_STONITH_RC, &rc) != 0) { +- rc = -ENOMSG; +- } ++ stonith__xe_get_result(op_reply, &result); ++ rc = pcmk_rc2legacy(stonith__result2rc(&result)); ++ pcmk__reset_result(&result); + + if ((call_options & st_opt_discard_reply) || output_data == NULL) { + crm_trace("Discarding reply"); +-- +2.27.0 + + +From 1beb319d8c62ab93b4c08b26a4e03151906c6189 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Mon, 6 Dec 2021 17:13:44 -0600 +Subject: [PATCH 04/11] Log: fencing: improve cts-fence-helper result logs + +Use the full result from the fencing event +--- + daemons/fenced/cts-fence-helper.c | 12 ++++++++---- + 1 file changed, 8 insertions(+), 4 deletions(-) + +diff --git a/daemons/fenced/cts-fence-helper.c b/daemons/fenced/cts-fence-helper.c +index e222a59f9f..858cddc9de 100644 +--- a/daemons/fenced/cts-fence-helper.c ++++ b/daemons/fenced/cts-fence-helper.c +@@ -125,10 +125,14 @@ st_callback(stonith_t * st, stonith_event_t * e) + crm_exit(CRM_EX_DISCONNECT); + } + +- crm_notice("Operation %s requested by %s %s for peer %s. %s reported: %s (ref=%s)", +- e->operation, e->origin, e->result == pcmk_ok ? "completed" : "failed", +- e->target, e->executioner ? e->executioner : "", +- pcmk_strerror(e->result), e->id); ++ crm_notice("Operation '%s' targeting %s by %s for %s: %s (exit=%d, ref=%s)", ++ ((e->operation == NULL)? 
"unknown" : e->operation), ++ ((e->target == NULL)? "no node" : e->target), ++ ((e->executioner == NULL)? "any node" : e->executioner), ++ ((e->origin == NULL)? "unknown client" : e->origin), ++ pcmk_exec_status_str(stonith__event_execution_status(e)), ++ stonith__event_exit_status(e), ++ ((e->id == NULL)? "none" : e->id)); + + if (expected_notifications) { + expected_notifications--; +-- +2.27.0 + + +From b26f701833ade5d7441fba317832d6e827bd16d0 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Tue, 14 Dec 2021 16:52:09 -0600 +Subject: [PATCH 05/11] Test: cts-fence-helper: update expected return code + +Before recent changes, libstonithd obtained the fence API's legacy result code +directly from the fencer's XML reply, meaning that the legacy code was the +result of the fencer's mapping of the full result (including the action stderr). + +After those changes, libstonithd now ignores the legacy code in the fencer's +reply, and instead maps the legacy code itself from the full result in the +fencer's reply. + +However, the fencer's reply does not have the action stderr, so failures that +mapped to -pcmk_err_generic on the server side now map to -ENODATA on the +client side. Update cts-fence-helper's expected return code to match (neither +code is particularly useful, so there wouldn't be much benefit from having the +fencer pass the action stderr with replies, which would be considerable +additional work). 
+--- + daemons/fenced/cts-fence-helper.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/daemons/fenced/cts-fence-helper.c b/daemons/fenced/cts-fence-helper.c +index 858cddc9de..e3113452ef 100644 +--- a/daemons/fenced/cts-fence-helper.c ++++ b/daemons/fenced/cts-fence-helper.c +@@ -207,10 +207,10 @@ run_fence_failure_test(void) + "Register device1 for failure test", 1, 0); + + single_test(st->cmds->fence(st, st_opts, "false_1_node2", "off", 3, 0), +- "Fence failure results off", 1, -pcmk_err_generic); ++ "Fence failure results off", 1, -ENODATA); + + single_test(st->cmds->fence(st, st_opts, "false_1_node2", "reboot", 3, 0), +- "Fence failure results reboot", 1, -pcmk_err_generic); ++ "Fence failure results reboot", 1, -ENODATA); + + single_test(st->cmds->remove_device(st, st_opts, "test-id1"), + "Remove device1 for failure test", 1, 0); +-- +2.27.0 + + +From 123429de229c2148e320c76530b95e6ba458b9f6 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Tue, 7 Dec 2021 10:28:48 -0600 +Subject: [PATCH 06/11] Low: controller: compare fencing targets + case-insensitively + +... since they are node names +--- + daemons/controld/controld_fencing.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/daemons/controld/controld_fencing.c b/daemons/controld/controld_fencing.c +index f8d2fc13f4..70e141dc28 100644 +--- a/daemons/controld/controld_fencing.c ++++ b/daemons/controld/controld_fencing.c +@@ -466,7 +466,7 @@ tengine_stonith_notify(stonith_t *st, stonith_event_t *st_event) + return; + + } else if ((st_event->result == pcmk_ok) +- && pcmk__str_eq(st_event->target, fsa_our_uname, pcmk__str_none)) { ++ && pcmk__str_eq(st_event->target, fsa_our_uname, pcmk__str_casei)) { + + /* We were notified of our own fencing. 
Most likely, either fencing was + * misconfigured, or fabric fencing that doesn't cut cluster +-- +2.27.0 + + +From 3a067b8e58b3aefb49b2af1c35d0ad28b2de8784 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Tue, 7 Dec 2021 10:37:56 -0600 +Subject: [PATCH 07/11] Refactor: controller: best practices for handling + fencing notifications + +Rename tengine_stonith_notify() to handle_fence_notification(), rename its +st_event argument to event, add a doxygen block, and use some new variables and +reformatting to make it easier to follow (and change later). +--- + daemons/controld/controld_fencing.c | 131 ++++++++++++++++------------ + 1 file changed, 75 insertions(+), 56 deletions(-) + +diff --git a/daemons/controld/controld_fencing.c b/daemons/controld/controld_fencing.c +index 70e141dc28..00626444da 100644 +--- a/daemons/controld/controld_fencing.c ++++ b/daemons/controld/controld_fencing.c +@@ -435,39 +435,59 @@ tengine_stonith_connection_destroy(stonith_t *st, stonith_event_t *e) + } + } + ++/*! 
++ * \internal ++ * \brief Handle an event notification from the fencing API ++ * ++ * \param[in] st Fencing API connection ++ * \param[in] event Fencing API event notification ++ */ + static void +-tengine_stonith_notify(stonith_t *st, stonith_event_t *st_event) ++handle_fence_notification(stonith_t *st, stonith_event_t *event) + { ++ bool succeeded = true; ++ const char *executioner = "the cluster"; ++ const char *client = "a client"; ++ + if (te_client_id == NULL) { + te_client_id = crm_strdup_printf("%s.%lu", crm_system_name, + (unsigned long) getpid()); + } + +- if (st_event == NULL) { ++ if (event == NULL) { + crm_err("Notify data not found"); + return; + } + +- crmd_alert_fencing_op(st_event); ++ if (event->executioner != NULL) { ++ executioner = event->executioner; ++ } ++ if (event->client_origin != NULL) { ++ client = event->client_origin; ++ } + +- if ((st_event->result == pcmk_ok) && pcmk__str_eq("on", st_event->action, pcmk__str_casei)) { +- crm_notice("%s was successfully unfenced by %s (at the request of %s)", +- st_event->target, +- st_event->executioner? st_event->executioner : "", +- st_event->origin); +- /* TODO: Hook up st_event->device */ +- return; ++ if (event->result != pcmk_ok) { ++ succeeded = false; ++ } + +- } else if (pcmk__str_eq("on", st_event->action, pcmk__str_casei)) { +- crm_err("Unfencing of %s by %s failed: %s (%d)", +- st_event->target, +- st_event->executioner? 
st_event->executioner : "", +- pcmk_strerror(st_event->result), st_event->result); +- return; ++ crmd_alert_fencing_op(event); + +- } else if ((st_event->result == pcmk_ok) +- && pcmk__str_eq(st_event->target, fsa_our_uname, pcmk__str_casei)) { ++ if (pcmk__str_eq("on", event->action, pcmk__str_none)) { ++ // Unfencing doesn't need special handling, just a log message ++ if (succeeded) { ++ crm_notice("%s was successfully unfenced by %s (at the request of %s)", ++ event->target, executioner, event->origin); ++ /* TODO: Hook up event->device */ ++ } else { ++ crm_err("Unfencing of %s by %s failed: %s (%d)", ++ event->target, executioner, ++ pcmk_strerror(st_event->result), st_event->result); ++ } ++ return; ++ } + ++ if (succeeded ++ && pcmk__str_eq(event->target, fsa_our_uname, pcmk__str_casei)) { + /* We were notified of our own fencing. Most likely, either fencing was + * misconfigured, or fabric fencing that doesn't cut cluster + * communication is in use. +@@ -478,44 +498,41 @@ tengine_stonith_notify(stonith_t *st, stonith_event_t *st_event) + * our subsequent election votes as "not part of our cluster". + */ + crm_crit("We were allegedly just fenced by %s for %s!", +- st_event->executioner? st_event->executioner : "the cluster", +- st_event->origin); /* Dumps blackbox if enabled */ ++ executioner, event->origin); // Dumps blackbox if enabled + if (fence_reaction_panic) { + pcmk__panic(__func__); + } else { + crm_exit(CRM_EX_FATAL); + } +- return; ++ return; // Should never get here + } + +- /* Update the count of stonith failures for this target, in case we become ++ /* Update the count of fencing failures for this target, in case we become + * DC later. The current DC has already updated its fail count in + * tengine_stonith_callback(). 
+ */ +- if (!AM_I_DC && pcmk__str_eq(st_event->operation, T_STONITH_NOTIFY_FENCE, pcmk__str_casei)) { +- if (st_event->result == pcmk_ok) { +- st_fail_count_reset(st_event->target); ++ if (!AM_I_DC ++ && pcmk__str_eq(event->operation, T_STONITH_NOTIFY_FENCE, ++ pcmk__str_casei)) { ++ ++ if (succeeded) { ++ st_fail_count_reset(event->target); + } else { +- st_fail_count_increment(st_event->target); ++ st_fail_count_increment(event->target); + } + } + + crm_notice("Peer %s was%s terminated (%s) by %s on behalf of %s: %s " + CRM_XS " initiator=%s ref=%s", +- st_event->target, st_event->result == pcmk_ok ? "" : " not", +- st_event->action, +- st_event->executioner ? st_event->executioner : "", +- (st_event->client_origin? st_event->client_origin : ""), +- pcmk_strerror(st_event->result), +- st_event->origin, st_event->id); +- +- if (st_event->result == pcmk_ok) { +- crm_node_t *peer = pcmk__search_known_node_cache(0, st_event->target, ++ event->target, (succeeded? "" : " not"), ++ event->action, executioner, client, ++ pcmk_strerror(event->result), ++ event->origin, event->id); ++ ++ if (succeeded) { ++ crm_node_t *peer = pcmk__search_known_node_cache(0, event->target, + CRM_GET_PEER_ANY); + const char *uuid = NULL; +- gboolean we_are_executioner = pcmk__str_eq(st_event->executioner, +- fsa_our_uname, +- pcmk__str_casei); + + if (peer == NULL) { + return; +@@ -523,10 +540,9 @@ tengine_stonith_notify(stonith_t *st, stonith_event_t *st_event) + + uuid = crm_peer_uuid(peer); + +- crm_trace("target=%s dc=%s", st_event->target, fsa_our_dc); +- if(AM_I_DC) { ++ if (AM_I_DC) { + /* The DC always sends updates */ +- send_stonith_update(NULL, st_event->target, uuid); ++ send_stonith_update(NULL, event->target, uuid); + + /* @TODO Ideally, at this point, we'd check whether the fenced node + * hosted any guest nodes, and call remote_node_down() for them. 
+@@ -536,31 +552,33 @@ tengine_stonith_notify(stonith_t *st, stonith_event_t *st_event) + * on the scheduler creating fence pseudo-events for the guests. + */ + +- if (st_event->client_origin +- && !pcmk__str_eq(st_event->client_origin, te_client_id, pcmk__str_casei)) { +- +- /* Abort the current transition graph if it wasn't us +- * that invoked stonith to fence someone ++ if (!pcmk__str_eq(client, te_client_id, pcmk__str_casei)) { ++ /* Abort the current transition if it wasn't the cluster that ++ * initiated fencing. + */ +- crm_info("External fencing operation from %s fenced %s", st_event->client_origin, st_event->target); +- abort_transition(INFINITY, tg_restart, "External Fencing Operation", NULL); ++ crm_info("External fencing operation from %s fenced %s", ++ client, event->target); ++ abort_transition(INFINITY, tg_restart, ++ "External Fencing Operation", NULL); + } + + /* Assume it was our leader if we don't currently have one */ +- } else if (pcmk__str_eq(fsa_our_dc, st_event->target, pcmk__str_null_matches | pcmk__str_casei) ++ } else if (pcmk__str_eq(fsa_our_dc, event->target, ++ pcmk__str_null_matches|pcmk__str_casei) + && !pcmk_is_set(peer->flags, crm_remote_node)) { + + crm_notice("Fencing target %s %s our leader", +- st_event->target, (fsa_our_dc? "was" : "may have been")); ++ event->target, (fsa_our_dc? 
"was" : "may have been")); + + /* Given the CIB resyncing that occurs around elections, + * have one node update the CIB now and, if the new DC is different, + * have them do so too after the election + */ +- if (we_are_executioner) { +- send_stonith_update(NULL, st_event->target, uuid); ++ if (pcmk__str_eq(event->executioner, fsa_our_uname, ++ pcmk__str_casei)) { ++ send_stonith_update(NULL, event->target, uuid); + } +- add_stonith_cleanup(st_event->target); ++ add_stonith_cleanup(event->target); + } + + /* If the target is a remote node, and we host its connection, +@@ -569,7 +587,7 @@ tengine_stonith_notify(stonith_t *st, stonith_event_t *st_event) + * so the failure might not otherwise be detected until the next poke. + */ + if (pcmk_is_set(peer->flags, crm_remote_node)) { +- remote_ra_fail(st_event->target); ++ remote_ra_fail(event->target); + } + + crmd_peer_down(peer, TRUE); +@@ -632,7 +650,7 @@ te_connect_stonith(gpointer user_data) + tengine_stonith_connection_destroy); + stonith_api->cmds->register_notification(stonith_api, + T_STONITH_NOTIFY_FENCE, +- tengine_stonith_notify); ++ handle_fence_notification); + stonith_api->cmds->register_notification(stonith_api, + T_STONITH_NOTIFY_HISTORY_SYNCED, + tengine_stonith_history_synced); +@@ -837,7 +855,8 @@ tengine_stonith_callback(stonith_t *stonith, stonith_callback_data_t *data) + } + + /* Increment the fail count now, so abort_for_stonith_failure() can +- * check it. Non-DC nodes will increment it in tengine_stonith_notify(). ++ * check it. Non-DC nodes will increment it in ++ * handle_fence_notification(). 
+ */ + st_fail_count_increment(target); + abort_for_stonith_failure(abort_action, target, NULL); +-- +2.27.0 + + +From 5ec9dcbbe1ee7f6252968f87d7df5a5ea17244fb Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Tue, 7 Dec 2021 10:40:21 -0600 +Subject: [PATCH 08/11] Log: controller: improve messages when handling fencing + notifications + +Now that the fencing API provides a full result including exit reasons with +fencing event notifications, make the controller logs more useful and +consistent. +--- + daemons/controld/controld_fencing.c | 34 ++++++++++++++++++++--------- + 1 file changed, 24 insertions(+), 10 deletions(-) + +diff --git a/daemons/controld/controld_fencing.c b/daemons/controld/controld_fencing.c +index 00626444da..0aa9ef083c 100644 +--- a/daemons/controld/controld_fencing.c ++++ b/daemons/controld/controld_fencing.c +@@ -448,6 +448,8 @@ handle_fence_notification(stonith_t *st, stonith_event_t *event) + bool succeeded = true; + const char *executioner = "the cluster"; + const char *client = "a client"; ++ const char *reason = NULL; ++ int exec_status; + + if (te_client_id == NULL) { + te_client_id = crm_strdup_printf("%s.%lu", crm_system_name, +@@ -466,22 +468,31 @@ handle_fence_notification(stonith_t *st, stonith_event_t *event) + client = event->client_origin; + } + +- if (event->result != pcmk_ok) { ++ exec_status = stonith__event_execution_status(event); ++ if ((stonith__event_exit_status(event) != CRM_EX_OK) ++ || (exec_status != PCMK_EXEC_DONE)) { + succeeded = false; ++ if (exec_status == PCMK_EXEC_DONE) { ++ exec_status = PCMK_EXEC_ERROR; ++ } + } ++ reason = stonith__event_exit_reason(event); + + crmd_alert_fencing_op(event); + + if (pcmk__str_eq("on", event->action, pcmk__str_none)) { + // Unfencing doesn't need special handling, just a log message + if (succeeded) { +- crm_notice("%s was successfully unfenced by %s (at the request of %s)", +- event->target, executioner, event->origin); ++ crm_notice("%s was unfenced by %s at the request of 
%s@%s", ++ event->target, executioner, client, event->origin); + /* TODO: Hook up event->device */ + } else { +- crm_err("Unfencing of %s by %s failed: %s (%d)", ++ crm_err("Unfencing of %s by %s failed (%s%s%s) with exit status %d", + event->target, executioner, +- pcmk_strerror(st_event->result), st_event->result); ++ pcmk_exec_status_str(exec_status), ++ ((reason == NULL)? "" : ": "), ++ ((reason == NULL)? "" : reason), ++ stonith__event_exit_status(event)); + } + return; + } +@@ -522,12 +533,15 @@ handle_fence_notification(stonith_t *st, stonith_event_t *event) + } + } + +- crm_notice("Peer %s was%s terminated (%s) by %s on behalf of %s: %s " +- CRM_XS " initiator=%s ref=%s", ++ crm_notice("Peer %s was%s terminated (%s) by %s on behalf of %s@%s: " ++ "%s%s%s%s " CRM_XS " event=%s", + event->target, (succeeded? "" : " not"), +- event->action, executioner, client, +- pcmk_strerror(event->result), +- event->origin, event->id); ++ event->action, executioner, client, event->origin, ++ (succeeded? "OK" : pcmk_exec_status_str(exec_status)), ++ ((reason == NULL)? "" : " ("), ++ ((reason == NULL)? "" : reason), ++ ((reason == NULL)? 
"" : ")"), ++ event->id); + + if (succeeded) { + crm_node_t *peer = pcmk__search_known_node_cache(0, event->target, +-- +2.27.0 + + +From fb484933ce7c8f3325300a9e01a114db1bbb5b70 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Tue, 7 Dec 2021 11:33:15 -0600 +Subject: [PATCH 09/11] Refactor: controller: move alert functions into own + source file + +--- + daemons/controld/Makefile.am | 1 + + daemons/controld/controld_alerts.c | 92 +++++++++++++++++++++++++ + daemons/controld/controld_execd_state.c | 75 -------------------- + 3 files changed, 93 insertions(+), 75 deletions(-) + create mode 100644 daemons/controld/controld_alerts.c + +diff --git a/daemons/controld/Makefile.am b/daemons/controld/Makefile.am +index db45bcba4a..0a29925c0b 100644 +--- a/daemons/controld/Makefile.am ++++ b/daemons/controld/Makefile.am +@@ -43,6 +43,7 @@ pacemaker_controld_LDADD = $(top_builddir)/lib/fencing/libstonithd.la \ + $(CLUSTERLIBS) + + pacemaker_controld_SOURCES = pacemaker-controld.c \ ++ controld_alerts.c \ + controld_attrd.c \ + controld_callbacks.c \ + controld_based.c \ +diff --git a/daemons/controld/controld_alerts.c b/daemons/controld/controld_alerts.c +new file mode 100644 +index 0000000000..bd92795cf0 +--- /dev/null ++++ b/daemons/controld/controld_alerts.c +@@ -0,0 +1,92 @@ ++/* ++ * Copyright 2012-2021 the Pacemaker project contributors ++ * ++ * The version control history for this file may have further details. ++ * ++ * This source code is licensed under the GNU General Public License version 2 ++ * or later (GPLv2+) WITHOUT ANY WARRANTY. 
++ */ ++ ++#include ++ ++#include ++#include ++ ++#include ++#include ++#include ++#include ++#include ++ ++#include ++ ++static GList *crmd_alert_list = NULL; ++ ++void ++crmd_unpack_alerts(xmlNode *alerts) ++{ ++ pe_free_alert_list(crmd_alert_list); ++ crmd_alert_list = pe_unpack_alerts(alerts); ++} ++ ++void ++crmd_alert_node_event(crm_node_t *node) ++{ ++ lrm_state_t *lrm_state; ++ ++ if (crmd_alert_list == NULL) { ++ return; ++ } ++ ++ lrm_state = lrm_state_find(fsa_our_uname); ++ if (lrm_state == NULL) { ++ return; ++ } ++ ++ lrmd_send_node_alert((lrmd_t *) lrm_state->conn, crmd_alert_list, ++ node->uname, node->id, node->state); ++} ++ ++void ++crmd_alert_fencing_op(stonith_event_t * e) ++{ ++ char *desc; ++ lrm_state_t *lrm_state; ++ ++ if (crmd_alert_list == NULL) { ++ return; ++ } ++ ++ lrm_state = lrm_state_find(fsa_our_uname); ++ if (lrm_state == NULL) { ++ return; ++ } ++ ++ desc = crm_strdup_printf("Operation %s of %s by %s for %s@%s: %s (ref=%s)", ++ e->action, e->target, ++ (e->executioner? 
e->executioner : ""), ++ e->client_origin, e->origin, ++ pcmk_strerror(e->result), e->id); ++ ++ lrmd_send_fencing_alert((lrmd_t *) lrm_state->conn, crmd_alert_list, ++ e->target, e->operation, desc, e->result); ++ free(desc); ++} ++ ++void ++crmd_alert_resource_op(const char *node, lrmd_event_data_t * op) ++{ ++ lrm_state_t *lrm_state; ++ ++ if (crmd_alert_list == NULL) { ++ return; ++ } ++ ++ lrm_state = lrm_state_find(fsa_our_uname); ++ if (lrm_state == NULL) { ++ return; ++ } ++ ++ lrmd_send_resource_alert((lrmd_t *) lrm_state->conn, crmd_alert_list, node, ++ op); ++} +diff --git a/daemons/controld/controld_execd_state.c b/daemons/controld/controld_execd_state.c +index 67c376a426..5dce6c6d59 100644 +--- a/daemons/controld/controld_execd_state.c ++++ b/daemons/controld/controld_execd_state.c +@@ -777,78 +777,3 @@ lrm_state_unregister_rsc(lrm_state_t * lrm_state, + */ + return ((lrmd_t *) lrm_state->conn)->cmds->unregister_rsc(lrm_state->conn, rsc_id, options); + } +- +-/* +- * Functions for sending alerts via local executor connection +- */ +- +-static GList *crmd_alert_list = NULL; +- +-void +-crmd_unpack_alerts(xmlNode *alerts) +-{ +- pe_free_alert_list(crmd_alert_list); +- crmd_alert_list = pe_unpack_alerts(alerts); +-} +- +-void +-crmd_alert_node_event(crm_node_t *node) +-{ +- lrm_state_t *lrm_state; +- +- if (crmd_alert_list == NULL) { +- return; +- } +- +- lrm_state = lrm_state_find(fsa_our_uname); +- if (lrm_state == NULL) { +- return; +- } +- +- lrmd_send_node_alert((lrmd_t *) lrm_state->conn, crmd_alert_list, +- node->uname, node->id, node->state); +-} +- +-void +-crmd_alert_fencing_op(stonith_event_t * e) +-{ +- char *desc; +- lrm_state_t *lrm_state; +- +- if (crmd_alert_list == NULL) { +- return; +- } +- +- lrm_state = lrm_state_find(fsa_our_uname); +- if (lrm_state == NULL) { +- return; +- } +- +- desc = crm_strdup_printf("Operation %s of %s by %s for %s@%s: %s (ref=%s)", +- e->action, e->target, +- (e->executioner? 
e->executioner : ""), +- e->client_origin, e->origin, +- pcmk_strerror(e->result), e->id); +- +- lrmd_send_fencing_alert((lrmd_t *) lrm_state->conn, crmd_alert_list, +- e->target, e->operation, desc, e->result); +- free(desc); +-} +- +-void +-crmd_alert_resource_op(const char *node, lrmd_event_data_t * op) +-{ +- lrm_state_t *lrm_state; +- +- if (crmd_alert_list == NULL) { +- return; +- } +- +- lrm_state = lrm_state_find(fsa_our_uname); +- if (lrm_state == NULL) { +- return; +- } +- +- lrmd_send_resource_alert((lrmd_t *) lrm_state->conn, crmd_alert_list, node, +- op); +-} +-- +2.27.0 + + +From 3d0b57406bcde6682623e9d62c8ee95878345eb1 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Tue, 7 Dec 2021 11:25:41 -0600 +Subject: [PATCH 10/11] Feature: controller,tools: improve description for + fencing alerts/traps + +This functionizes creating a description for fencing events, so it can be used +by both the controller for alerts and crm_mon for traps, for consistency. + +Now that we have the full result including exit reason, we can improve the +description, but the format is kept similar to before to minimize the change. + +The alert/trap also includes the legacy return code for the event, but we can't +change that now because lrmd_send_fencing_alert() and the alert/trap +environment variables are public API. 
+--- + daemons/controld/controld_alerts.c | 8 ++----- + include/crm/fencing/internal.h | 1 + + lib/fencing/st_client.c | 38 ++++++++++++++++++++++++++++++ + tools/crm_mon.c | 5 ++-- + 4 files changed, 43 insertions(+), 9 deletions(-) + +diff --git a/daemons/controld/controld_alerts.c b/daemons/controld/controld_alerts.c +index bd92795cf0..2e0a67dba2 100644 +--- a/daemons/controld/controld_alerts.c ++++ b/daemons/controld/controld_alerts.c +@@ -12,6 +12,7 @@ + #include + #include + ++#include + #include + #include + #include +@@ -62,12 +63,7 @@ crmd_alert_fencing_op(stonith_event_t * e) + return; + } + +- desc = crm_strdup_printf("Operation %s of %s by %s for %s@%s: %s (ref=%s)", +- e->action, e->target, +- (e->executioner? e->executioner : ""), +- e->client_origin, e->origin, +- pcmk_strerror(e->result), e->id); +- ++ desc = stonith__event_description(e); + lrmd_send_fencing_alert((lrmd_t *) lrm_state->conn, crmd_alert_list, + e->target, e->operation, desc, e->result); + free(desc); +diff --git a/include/crm/fencing/internal.h b/include/crm/fencing/internal.h +index acc16d05e9..d2b49f831a 100644 +--- a/include/crm/fencing/internal.h ++++ b/include/crm/fencing/internal.h +@@ -195,6 +195,7 @@ const char *stonith__exit_reason(stonith_callback_data_t *data); + int stonith__event_exit_status(stonith_event_t *event); + int stonith__event_execution_status(stonith_event_t *event); + const char *stonith__event_exit_reason(stonith_event_t *event); ++char *stonith__event_description(stonith_event_t *event); + + /*! + * \internal +diff --git a/lib/fencing/st_client.c b/lib/fencing/st_client.c +index 5fec7529e3..b1de912b2a 100644 +--- a/lib/fencing/st_client.c ++++ b/lib/fencing/st_client.c +@@ -2429,6 +2429,44 @@ stonith__event_exit_reason(stonith_event_t *event) + return ((pcmk__action_result_t *) event->opaque)->exit_reason; + } + ++/*! 
++ * \internal ++ * \brief Return a human-friendly description of a fencing event ++ * ++ * \param[in] event Event to describe ++ * ++ * \return Newly allocated string with description of \p event ++ * \note The caller is responsible for freeing the return value. ++ * This function asserts on memory errors and never returns NULL. ++ * \note This currently is useful only for events of type ++ * T_STONITH_NOTIFY_FENCE. ++ */ ++char * ++stonith__event_description(stonith_event_t *event) ++{ ++ const char *reason; ++ const char *status; ++ ++ if (stonith__event_execution_status(event) != PCMK_EXEC_DONE) { ++ status = pcmk_exec_status_str(stonith__event_execution_status(event)); ++ } else if (stonith__event_exit_status(event) != CRM_EX_OK) { ++ status = pcmk_exec_status_str(PCMK_EXEC_ERROR); ++ } else { ++ status = crm_exit_str(CRM_EX_OK); ++ } ++ reason = stonith__event_exit_reason(event); ++ ++ return crm_strdup_printf("Operation %s of %s by %s for %s@%s: %s%s%s%s (ref=%s)", ++ event->action, event->target, ++ (event->executioner? event->executioner : "the cluster"), ++ (event->client_origin? event->client_origin : "a client"), ++ event->origin, status, ++ ((reason == NULL)? "" : " ("), ++ ((reason == NULL)? "" : reason), ++ ((reason == NULL)? 
"" : ")"), ++ event->id); ++} ++ + + // Deprecated functions kept only for backward API compatibility + // LCOV_EXCL_START +diff --git a/tools/crm_mon.c b/tools/crm_mon.c +index a6c459aaf7..e7b4fe2847 100644 +--- a/tools/crm_mon.c ++++ b/tools/crm_mon.c +@@ -2237,9 +2237,8 @@ mon_st_callback_event(stonith_t * st, stonith_event_t * e) + /* disconnect cib as well and have everything reconnect */ + mon_cib_connection_destroy(NULL); + } else if (options.external_agent) { +- char *desc = crm_strdup_printf("Operation %s requested by %s for peer %s: %s (ref=%s)", +- e->operation, e->origin, e->target, pcmk_strerror(e->result), +- e->id); ++ char *desc = stonith__event_description(e); ++ + send_custom_trap(e->target, NULL, e->operation, pcmk_ok, e->result, 0, desc); + free(desc); + } +-- +2.27.0 + + +From 2fe03c2165680c717a1f6106c5150be7d117f1a5 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Fri, 14 Jan 2022 10:45:03 -0600 +Subject: [PATCH 11/11] Low: controller: compare case-sensitively where + appropriate + +--- + daemons/controld/controld_fencing.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/daemons/controld/controld_fencing.c b/daemons/controld/controld_fencing.c +index 0aa9ef083c..15954b2358 100644 +--- a/daemons/controld/controld_fencing.c ++++ b/daemons/controld/controld_fencing.c +@@ -1,5 +1,5 @@ + /* +- * Copyright 2004-2021 the Pacemaker project contributors ++ * Copyright 2004-2022 the Pacemaker project contributors + * + * The version control history for this file may have further details. 
+ * +@@ -524,7 +524,7 @@ handle_fence_notification(stonith_t *st, stonith_event_t *event) + */ + if (!AM_I_DC + && pcmk__str_eq(event->operation, T_STONITH_NOTIFY_FENCE, +- pcmk__str_casei)) { ++ pcmk__str_none)) { + + if (succeeded) { + st_fail_count_reset(event->target); +-- +2.27.0 + diff --git a/SOURCES/016-fencing-crash.patch b/SOURCES/016-fencing-crash.patch new file mode 100644 index 0000000..c514c64 --- /dev/null +++ b/SOURCES/016-fencing-crash.patch @@ -0,0 +1,56 @@ +From e330568504ec379ea42460d21a2e20b1652d9445 Mon Sep 17 00:00:00 2001 +From: Reid Wahl +Date: Fri, 14 Jan 2022 01:35:35 -0800 +Subject: [PATCH] Fix: fencing: Don't set stonith action to pending if fork + fails + +Currently, we set a stonith action to pending if +services_action_async_fork_notify() returns true. However, "true" means +that the svc_action should not be freed. This might be because the +svc_action forked successfully and is pending, or it might be because +the svc_action has already been freed. + +In the case of stonith actions, if we fail to fork, the stonith_action_t +object stored in svc_action->cb_data gets freed by the done callback, +and services_action_async_fork_notify() returns true. If we try to set +the action to pending, it causes a segfault. + +This commit moves the "set to pending" step to the +stonith_action_async_forked() callback. We avoid the segfault and only +set it to pending if it's actually pending. + +A slight difference in ordering was required to achieve this. Now, the +action gets set to pending immediately before being added to the +mainloop, instead of immediately after. 
+ +Signed-off-by: Reid Wahl +--- + lib/fencing/st_actions.c | 5 +++-- + 1 file changed, 3 insertions(+), 2 deletions(-) + +diff --git a/lib/fencing/st_actions.c b/lib/fencing/st_actions.c +index e4e43225cd..306001af69 100644 +--- a/lib/fencing/st_actions.c ++++ b/lib/fencing/st_actions.c +@@ -550,6 +550,9 @@ stonith_action_async_forked(svc_action_t *svc_action) + (action->fork_cb) (svc_action->pid, action->userdata); + } + ++ pcmk__set_result(&(action->result), PCMK_OCF_UNKNOWN, PCMK_EXEC_PENDING, ++ NULL); ++ + crm_trace("Child process %d performing action '%s' successfully forked", + action->pid, action->action); + } +@@ -619,8 +622,6 @@ internal_stonith_action_execute(stonith_action_t * action) + if (services_action_async_fork_notify(svc_action, + &stonith_action_async_done, + &stonith_action_async_forked)) { +- pcmk__set_result(&(action->result), PCMK_OCF_UNKNOWN, +- PCMK_EXEC_PENDING, NULL); + return pcmk_ok; + } + +-- +2.27.0 + diff --git a/SOURCES/017-fencing-reasons.patch b/SOURCES/017-fencing-reasons.patch new file mode 100644 index 0000000..1e100ec --- /dev/null +++ b/SOURCES/017-fencing-reasons.patch @@ -0,0 +1,875 @@ +From 523f62eb235836a01ea039c23ada261a494f7b32 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Wed, 10 Nov 2021 15:22:47 -0600 +Subject: [PATCH 01/11] Feature: libpacemaker: improve result for high-level + fencing API + +Previously, pcmk__fencing_action()'s helpers for asynchronous fencing actions +initialized the result to a generic error, and then overrode that only on +success. + +Now, set a detailed result for early failures, and use the full result when +available from the fencing API. + +A standard return code is still returned to callers at this point. 
+--- + lib/pacemaker/pcmk_fence.c | 31 ++++++++++++++++++------------- + 1 file changed, 18 insertions(+), 13 deletions(-) + +diff --git a/lib/pacemaker/pcmk_fence.c b/lib/pacemaker/pcmk_fence.c +index 7d6acd0de6..125e1b268b 100644 +--- a/lib/pacemaker/pcmk_fence.c ++++ b/lib/pacemaker/pcmk_fence.c +@@ -32,8 +32,8 @@ static struct { + unsigned int timeout; + unsigned int tolerance; + int delay; +- int rc; +-} async_fence_data; ++ pcmk__action_result_t result; ++} async_fence_data = { NULL, }; + + static int + handle_level(stonith_t *st, char *target, int fence_level, +@@ -76,14 +76,13 @@ handle_level(stonith_t *st, char *target, int fence_level, + static void + notify_callback(stonith_t * st, stonith_event_t * e) + { +- if (e->result != pcmk_ok) { +- return; +- } ++ if (pcmk__str_eq(async_fence_data.target, e->target, pcmk__str_casei) ++ && pcmk__str_eq(async_fence_data.action, e->action, pcmk__str_casei)) { + +- if (pcmk__str_eq(async_fence_data.target, e->target, pcmk__str_casei) && +- pcmk__str_eq(async_fence_data.action, e->action, pcmk__str_casei)) { +- +- async_fence_data.rc = e->result; ++ pcmk__set_result(&async_fence_data.result, ++ stonith__event_exit_status(e), ++ stonith__event_execution_status(e), ++ stonith__event_exit_reason(e)); + g_main_loop_quit(mainloop); + } + } +@@ -91,8 +90,9 @@ notify_callback(stonith_t * st, stonith_event_t * e) + static void + fence_callback(stonith_t * stonith, stonith_callback_data_t * data) + { +- async_fence_data.rc = data->rc; +- ++ pcmk__set_result(&async_fence_data.result, stonith__exit_status(data), ++ stonith__execution_status(data), ++ stonith__exit_reason(data)); + g_main_loop_quit(mainloop); + } + +@@ -106,6 +106,8 @@ async_fence_helper(gpointer user_data) + if (rc != pcmk_ok) { + fprintf(stderr, "Could not connect to fencer: %s\n", pcmk_strerror(rc)); + g_main_loop_quit(mainloop); ++ pcmk__set_result(&async_fence_data.result, CRM_EX_ERROR, ++ PCMK_EXEC_NOT_CONNECTED, NULL); + return TRUE; + } + +@@ -121,6 
+123,8 @@ async_fence_helper(gpointer user_data) + + if (call_id < 0) { + g_main_loop_quit(mainloop); ++ pcmk__set_result(&async_fence_data.result, CRM_EX_ERROR, ++ PCMK_EXEC_ERROR, pcmk_strerror(call_id)); + return TRUE; + } + +@@ -146,7 +150,8 @@ pcmk__fence_action(stonith_t *st, const char *target, const char *action, + async_fence_data.timeout = timeout; + async_fence_data.tolerance = tolerance; + async_fence_data.delay = delay; +- async_fence_data.rc = pcmk_err_generic; ++ pcmk__set_result(&async_fence_data.result, CRM_EX_ERROR, PCMK_EXEC_UNKNOWN, ++ NULL); + + trig = mainloop_add_trigger(G_PRIORITY_HIGH, async_fence_helper, NULL); + mainloop_set_trigger(trig); +@@ -156,7 +161,7 @@ pcmk__fence_action(stonith_t *st, const char *target, const char *action, + + free(async_fence_data.name); + +- return pcmk_legacy2rc(async_fence_data.rc); ++ return stonith__result2rc(&async_fence_data.result); + } + + #ifdef BUILD_PUBLIC_LIBPACEMAKER +-- +2.27.0 + + +From 008868fae5d1b0d6d8dc61f7acfb3856801ddd52 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Fri, 10 Dec 2021 15:36:10 -0600 +Subject: [PATCH 02/11] Refactor: libpacemaker: add exit reason to high-level + fencing API + +Nothing uses it as of this commit +--- + include/pacemaker.h | 5 ++++- + include/pcmki/pcmki_fence.h | 5 ++++- + lib/pacemaker/pcmk_fence.c | 10 +++++++--- + tools/stonith_admin.c | 6 +++--- + 4 files changed, 18 insertions(+), 8 deletions(-) + +diff --git a/include/pacemaker.h b/include/pacemaker.h +index a8523c969e..0daa4c5945 100644 +--- a/include/pacemaker.h ++++ b/include/pacemaker.h +@@ -189,12 +189,15 @@ int pcmk_list_nodes(xmlNodePtr *xml, char *node_types); + * again. + * \param[in] delay Apply a fencing delay. Value -1 means disable also any + * static/random fencing delays from pcmk_delay_base/max. 
++ * \param[out] reason If not NULL, where to put descriptive failure reason + * + * \return Standard Pacemaker return code ++ * \note If \p reason is not NULL, the caller is responsible for freeing its ++ * returned value. + */ + int pcmk_fence_action(stonith_t *st, const char *target, const char *action, + const char *name, unsigned int timeout, unsigned int tolerance, +- int delay); ++ int delay, char **reason); + + /*! + * \brief List the fencing operations that have occurred for a specific node. +diff --git a/include/pcmki/pcmki_fence.h b/include/pcmki/pcmki_fence.h +index d4cef68f5c..c3da0361d7 100644 +--- a/include/pcmki/pcmki_fence.h ++++ b/include/pcmki/pcmki_fence.h +@@ -28,12 +28,15 @@ + * again. + * \param[in] delay Apply a fencing delay. Value -1 means disable also any + * static/random fencing delays from pcmk_delay_base/max ++ * \param[out] reason If not NULL, where to put descriptive failure reason + * + * \return Standard Pacemaker return code ++ * \note If \p reason is not NULL, the caller is responsible for freeing its ++ * returned value. + */ + int pcmk__fence_action(stonith_t *st, const char *target, const char *action, + const char *name, unsigned int timeout, unsigned int tolerance, +- int delay); ++ int delay, char **reason); + + /*! + * \brief List the fencing operations that have occurred for a specific node. 
+diff --git a/lib/pacemaker/pcmk_fence.c b/lib/pacemaker/pcmk_fence.c +index 125e1b268b..dbf084fb6b 100644 +--- a/lib/pacemaker/pcmk_fence.c ++++ b/lib/pacemaker/pcmk_fence.c +@@ -139,7 +139,7 @@ async_fence_helper(gpointer user_data) + int + pcmk__fence_action(stonith_t *st, const char *target, const char *action, + const char *name, unsigned int timeout, unsigned int tolerance, +- int delay) ++ int delay, char **reason) + { + crm_trigger_t *trig; + +@@ -161,6 +161,9 @@ pcmk__fence_action(stonith_t *st, const char *target, const char *action, + + free(async_fence_data.name); + ++ if ((reason != NULL) && (async_fence_data.result.exit_reason != NULL)) { ++ *reason = strdup(async_fence_data.result.exit_reason); ++ } + return stonith__result2rc(&async_fence_data.result); + } + +@@ -168,9 +171,10 @@ pcmk__fence_action(stonith_t *st, const char *target, const char *action, + int + pcmk_fence_action(stonith_t *st, const char *target, const char *action, + const char *name, unsigned int timeout, unsigned int tolerance, +- int delay) ++ int delay, char **reason) + { +- return pcmk__fence_action(st, target, action, name, timeout, tolerance, delay); ++ return pcmk__fence_action(st, target, action, name, timeout, tolerance, ++ delay, reason); + } + #endif + +diff --git a/tools/stonith_admin.c b/tools/stonith_admin.c +index 2d48326e1b..fdc7c46d49 100644 +--- a/tools/stonith_admin.c ++++ b/tools/stonith_admin.c +@@ -571,17 +571,17 @@ main(int argc, char **argv) + + case 'B': + rc = pcmk__fence_action(st, target, "reboot", name, options.timeout*1000, +- options.tolerance*1000, options.delay); ++ options.tolerance*1000, options.delay, NULL); + break; + + case 'F': + rc = pcmk__fence_action(st, target, "off", name, options.timeout*1000, +- options.tolerance*1000, options.delay); ++ options.tolerance*1000, options.delay, NULL); + break; + + case 'U': + rc = pcmk__fence_action(st, target, "on", name, options.timeout*1000, +- options.tolerance*1000, options.delay); ++ 
options.tolerance*1000, options.delay, NULL); + break; + + case 'h': +-- +2.27.0 + + +From 7570510f9985ba75ef73fb824f28109e135ace0a Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Fri, 10 Dec 2021 15:40:48 -0600 +Subject: [PATCH 03/11] Refactor: libpacemaker: rename high-level fencing API + +Rename pcmk_fence_action() to pcmk_request_fencing(), and its internal +equivalent pcmk__fence_action() to pcmk__request_fencing(). The change is +backward-compatible because pcmk_fence_action() has not been exposed publicly +yet. + +"Fence action" can be easily confused with libcrmservice actions, liblrmd +actions, libstonithd actions, scheduler actions, and so forth. + +Also, the new name makes it clearer that the caller is requesting that the +cluster perform fencing, and not directly performing fencing. +--- + include/pacemaker.h | 20 ++++++++++---------- + include/pcmki/pcmki_fence.h | 16 ++++++++-------- + lib/pacemaker/pcmk_fence.c | 16 ++++++++-------- + tools/stonith_admin.c | 18 ++++++++++++------ + 4 files changed, 38 insertions(+), 32 deletions(-) + +diff --git a/include/pacemaker.h b/include/pacemaker.h +index 0daa4c5945..e581f975a9 100644 +--- a/include/pacemaker.h ++++ b/include/pacemaker.h +@@ -177,27 +177,27 @@ int pcmk_list_nodes(xmlNodePtr *xml, char *node_types); + #ifdef BUILD_PUBLIC_LIBPACEMAKER + + /*! +- * \brief Perform a STONITH action. ++ * \brief Ask the cluster to perform fencing + * +- * \param[in] st A connection to the STONITH API. +- * \param[in] target The node receiving the action. +- * \param[in] action The action to perform. ++ * \param[in] st A connection to the fencer API ++ * \param[in] target The node that should be fenced ++ * \param[in] action The fencing action (on, off, reboot) to perform + * \param[in] name Who requested the fence action? +- * \param[in] timeout How long to wait for the operation to complete (in ms). 
++ * \param[in] timeout How long to wait for the operation to complete (in ms) + * \param[in] tolerance If a successful action for \p target happened within + * this many ms, return 0 without performing the action +- * again. ++ * again + * \param[in] delay Apply a fencing delay. Value -1 means disable also any +- * static/random fencing delays from pcmk_delay_base/max. ++ * static/random fencing delays from pcmk_delay_base/max + * \param[out] reason If not NULL, where to put descriptive failure reason + * + * \return Standard Pacemaker return code + * \note If \p reason is not NULL, the caller is responsible for freeing its + * returned value. + */ +-int pcmk_fence_action(stonith_t *st, const char *target, const char *action, +- const char *name, unsigned int timeout, unsigned int tolerance, +- int delay, char **reason); ++int pcmk_request_fencing(stonith_t *st, const char *target, const char *action, ++ const char *name, unsigned int timeout, ++ unsigned int tolerance, int delay, char **reason); + + /*! + * \brief List the fencing operations that have occurred for a specific node. +diff --git a/include/pcmki/pcmki_fence.h b/include/pcmki/pcmki_fence.h +index c3da0361d7..e3a7e27264 100644 +--- a/include/pcmki/pcmki_fence.h ++++ b/include/pcmki/pcmki_fence.h +@@ -13,14 +13,14 @@ + # include + + /*! +- * \brief Perform a STONITH action. ++ * \brief Ask the cluster to perform fencing + * +- * \note This is the internal version of pcmk_fence_action(). External users ++ * \note This is the internal version of pcmk_request_fencing(). External users + * of the pacemaker API should use that function instead. + * +- * \param[in] st A connection to the STONITH API. +- * \param[in] target The node receiving the action. +- * \param[in] action The action to perform. 
++ * \param[in] st A connection to the fencer API ++ * \param[in] target The node that should be fenced ++ * \param[in] action The fencing action (on, off, reboot) to perform + * \param[in] name Who requested the fence action? + * \param[in] timeout How long to wait for the operation to complete (in ms). + * \param[in] tolerance If a successful action for \p target happened within +@@ -34,9 +34,9 @@ + * \note If \p reason is not NULL, the caller is responsible for freeing its + * returned value. + */ +-int pcmk__fence_action(stonith_t *st, const char *target, const char *action, +- const char *name, unsigned int timeout, unsigned int tolerance, +- int delay, char **reason); ++int pcmk__request_fencing(stonith_t *st, const char *target, const char *action, ++ const char *name, unsigned int timeout, ++ unsigned int tolerance, int delay, char **reason); + + /*! + * \brief List the fencing operations that have occurred for a specific node. +diff --git a/lib/pacemaker/pcmk_fence.c b/lib/pacemaker/pcmk_fence.c +index dbf084fb6b..1b7feb54b2 100644 +--- a/lib/pacemaker/pcmk_fence.c ++++ b/lib/pacemaker/pcmk_fence.c +@@ -137,9 +137,9 @@ async_fence_helper(gpointer user_data) + } + + int +-pcmk__fence_action(stonith_t *st, const char *target, const char *action, +- const char *name, unsigned int timeout, unsigned int tolerance, +- int delay, char **reason) ++pcmk__request_fencing(stonith_t *st, const char *target, const char *action, ++ const char *name, unsigned int timeout, ++ unsigned int tolerance, int delay, char **reason) + { + crm_trigger_t *trig; + +@@ -169,12 +169,12 @@ pcmk__fence_action(stonith_t *st, const char *target, const char *action, + + #ifdef BUILD_PUBLIC_LIBPACEMAKER + int +-pcmk_fence_action(stonith_t *st, const char *target, const char *action, +- const char *name, unsigned int timeout, unsigned int tolerance, +- int delay, char **reason) ++pcmk_request_fencing(stonith_t *st, const char *target, const char *action, ++ const char *name, unsigned int 
timeout, ++ unsigned int tolerance, int delay, char **reason) + { +- return pcmk__fence_action(st, target, action, name, timeout, tolerance, +- delay, reason); ++ return pcmk__request_fencing(st, target, action, name, timeout, tolerance, ++ delay, reason); + } + #endif + +diff --git a/tools/stonith_admin.c b/tools/stonith_admin.c +index fdc7c46d49..56948b3875 100644 +--- a/tools/stonith_admin.c ++++ b/tools/stonith_admin.c +@@ -570,18 +570,24 @@ main(int argc, char **argv) + break; + + case 'B': +- rc = pcmk__fence_action(st, target, "reboot", name, options.timeout*1000, +- options.tolerance*1000, options.delay, NULL); ++ rc = pcmk__request_fencing(st, target, "reboot", name, ++ options.timeout * 1000, ++ options.tolerance * 1000, ++ options.delay, NULL); + break; + + case 'F': +- rc = pcmk__fence_action(st, target, "off", name, options.timeout*1000, +- options.tolerance*1000, options.delay, NULL); ++ rc = pcmk__request_fencing(st, target, "off", name, ++ options.timeout * 1000, ++ options.tolerance * 1000, ++ options.delay, NULL); + break; + + case 'U': +- rc = pcmk__fence_action(st, target, "on", name, options.timeout*1000, +- options.tolerance*1000, options.delay, NULL); ++ rc = pcmk__request_fencing(st, target, "on", name, ++ options.timeout * 1000, ++ options.tolerance * 1000, ++ options.delay, NULL); + break; + + case 'h': +-- +2.27.0 + + +From 247eb303df934944c0b72b162bb661cee6e0ed8b Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Fri, 10 Dec 2021 15:52:37 -0600 +Subject: [PATCH 04/11] Refactor: tools: drop unnecessary string duplication in + stonith_admin + +--- + tools/stonith_admin.c | 11 ++++------- + 1 file changed, 4 insertions(+), 7 deletions(-) + +diff --git a/tools/stonith_admin.c b/tools/stonith_admin.c +index 56948b3875..c11e302e76 100644 +--- a/tools/stonith_admin.c ++++ b/tools/stonith_admin.c +@@ -360,8 +360,6 @@ main(int argc, char **argv) + + pcmk__cli_init_logging("stonith_admin", args->verbosity); + +- name = strdup(crm_system_name); +- 
+ rc = pcmk__output_new(&out, args->output_ty, args->output_dest, argv); + if (rc != pcmk_rc_ok) { + exit_code = CRM_EX_ERROR; +@@ -496,7 +494,7 @@ main(int argc, char **argv) + if (st == NULL) { + rc = -ENOMEM; + } else if (!no_connect) { +- rc = st->cmds->connect(st, name, NULL); ++ rc = st->cmds->connect(st, crm_system_name, NULL); + } + if (rc < 0) { + out->err(out, "Could not connect to fencer: %s", pcmk_strerror(rc)); +@@ -570,21 +568,21 @@ main(int argc, char **argv) + break; + + case 'B': +- rc = pcmk__request_fencing(st, target, "reboot", name, ++ rc = pcmk__request_fencing(st, target, "reboot", crm_system_name, + options.timeout * 1000, + options.tolerance * 1000, + options.delay, NULL); + break; + + case 'F': +- rc = pcmk__request_fencing(st, target, "off", name, ++ rc = pcmk__request_fencing(st, target, "off", crm_system_name, + options.timeout * 1000, + options.tolerance * 1000, + options.delay, NULL); + break; + + case 'U': +- rc = pcmk__request_fencing(st, target, "on", name, ++ rc = pcmk__request_fencing(st, target, "on", crm_system_name, + options.timeout * 1000, + options.tolerance * 1000, + options.delay, NULL); +@@ -619,7 +617,6 @@ main(int argc, char **argv) + out->finish(out, exit_code, true, NULL); + pcmk__output_free(out); + } +- free(name); + stonith_key_value_freeall(options.params, 1, 1); + + if (st != NULL) { +-- +2.27.0 + + +From a7888bf6868d8d9d9c77f65ae9983cf748bb0548 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Fri, 10 Dec 2021 15:56:34 -0600 +Subject: [PATCH 05/11] Refactor: tools: functionize requesting fencing in + stonith_admin + +... 
to reduce code duplication and improve readability +--- + tools/stonith_admin.c | 27 +++++++++++++++------------ + 1 file changed, 15 insertions(+), 12 deletions(-) + +diff --git a/tools/stonith_admin.c b/tools/stonith_admin.c +index c11e302e76..f738a9c888 100644 +--- a/tools/stonith_admin.c ++++ b/tools/stonith_admin.c +@@ -331,6 +331,18 @@ build_arg_context(pcmk__common_args_t *args, GOptionGroup **group) { + return context; + } + ++// \return Standard Pacemaker return code ++static int ++request_fencing(stonith_t *st, const char *target, const char *command) ++{ ++ int rc = pcmk__request_fencing(st, target, command, crm_system_name, ++ options.timeout * 1000, ++ options.tolerance * 1000, ++ options.delay, NULL); ++ ++ return rc; ++} ++ + int + main(int argc, char **argv) + { +@@ -568,24 +580,15 @@ main(int argc, char **argv) + break; + + case 'B': +- rc = pcmk__request_fencing(st, target, "reboot", crm_system_name, +- options.timeout * 1000, +- options.tolerance * 1000, +- options.delay, NULL); ++ rc = request_fencing(st, target, "reboot"); + break; + + case 'F': +- rc = pcmk__request_fencing(st, target, "off", crm_system_name, +- options.timeout * 1000, +- options.tolerance * 1000, +- options.delay, NULL); ++ rc = request_fencing(st, target, "off"); + break; + + case 'U': +- rc = pcmk__request_fencing(st, target, "on", crm_system_name, +- options.timeout * 1000, +- options.tolerance * 1000, +- options.delay, NULL); ++ rc = request_fencing(st, target, "on"); + break; + + case 'h': +-- +2.27.0 + + +From 2da32df780983ec1197e857eed5eeb5bf1101889 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Fri, 10 Dec 2021 16:05:19 -0600 +Subject: [PATCH 06/11] Feature: tools: display failure reasons for + stonith_admin fencing commands + +Previously, stonith_admin's --fence/--unfence/--reboot options did not output +any error message on failure. Now, they do, including the exit reason, if +available. 
+--- + tools/stonith_admin.c | 30 +++++++++++++++++++++++++----- + 1 file changed, 25 insertions(+), 5 deletions(-) + +diff --git a/tools/stonith_admin.c b/tools/stonith_admin.c +index f738a9c888..5590faf11e 100644 +--- a/tools/stonith_admin.c ++++ b/tools/stonith_admin.c +@@ -333,13 +333,33 @@ build_arg_context(pcmk__common_args_t *args, GOptionGroup **group) { + + // \return Standard Pacemaker return code + static int +-request_fencing(stonith_t *st, const char *target, const char *command) ++request_fencing(stonith_t *st, const char *target, const char *command, ++ GError **error) + { ++ char *reason = NULL; + int rc = pcmk__request_fencing(st, target, command, crm_system_name, + options.timeout * 1000, + options.tolerance * 1000, +- options.delay, NULL); ++ options.delay, &reason); + ++ if (rc != pcmk_rc_ok) { ++ const char *rc_str = pcmk_rc_str(rc); ++ ++ // If reason is identical to return code string, don't display it twice ++ if (pcmk__str_eq(rc_str, reason, pcmk__str_none)) { ++ free(reason); ++ reason = NULL; ++ } ++ ++ g_set_error(error, PCMK__RC_ERROR, rc, ++ "Couldn't %sfence %s: %s%s%s%s", ++ ((strcmp(command, "on") == 0)? "un" : ""), ++ target, pcmk_rc_str(rc), ++ ((reason == NULL)? "" : " ("), ++ ((reason == NULL)? "" : reason), ++ ((reason == NULL)? 
"" : ")")); ++ } ++ free(reason); + return rc; + } + +@@ -580,15 +600,15 @@ main(int argc, char **argv) + break; + + case 'B': +- rc = request_fencing(st, target, "reboot"); ++ rc = request_fencing(st, target, "reboot", &error); + break; + + case 'F': +- rc = request_fencing(st, target, "off"); ++ rc = request_fencing(st, target, "off", &error); + break; + + case 'U': +- rc = request_fencing(st, target, "on"); ++ rc = request_fencing(st, target, "on", &error); + break; + + case 'h': +-- +2.27.0 + + +From 2d99eba4c326d3b13dbbe446971ea5febd5d05be Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Fri, 10 Dec 2021 16:08:49 -0600 +Subject: [PATCH 07/11] Feature: libpacemaker: return exit reason for fencer + connection failures + +... instead of outputting to stderr directly, so that the caller (i.e. +stonith_admin) can output the error in the correct output format. +--- + lib/pacemaker/pcmk_fence.c | 3 +-- + 1 file changed, 1 insertion(+), 2 deletions(-) + +diff --git a/lib/pacemaker/pcmk_fence.c b/lib/pacemaker/pcmk_fence.c +index 1b7feb54b2..d17b07cda2 100644 +--- a/lib/pacemaker/pcmk_fence.c ++++ b/lib/pacemaker/pcmk_fence.c +@@ -104,10 +104,9 @@ async_fence_helper(gpointer user_data) + int rc = stonith_api_connect_retry(st, async_fence_data.name, 10); + + if (rc != pcmk_ok) { +- fprintf(stderr, "Could not connect to fencer: %s\n", pcmk_strerror(rc)); + g_main_loop_quit(mainloop); + pcmk__set_result(&async_fence_data.result, CRM_EX_ERROR, +- PCMK_EXEC_NOT_CONNECTED, NULL); ++ PCMK_EXEC_NOT_CONNECTED, pcmk_strerror(rc)); + return TRUE; + } + +-- +2.27.0 + + +From 4480ef0602f47450bdddfbde360a6a8327710927 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Mon, 17 Jan 2022 09:39:39 -0600 +Subject: [PATCH 08/11] Low: libpacemaker: compare fence action names + case-sensitively + +--- + lib/pacemaker/pcmk_fence.c | 6 +++--- + 1 file changed, 3 insertions(+), 3 deletions(-) + +diff --git a/lib/pacemaker/pcmk_fence.c b/lib/pacemaker/pcmk_fence.c +index d17b07cda2..2a8f50a555 
100644 +--- a/lib/pacemaker/pcmk_fence.c ++++ b/lib/pacemaker/pcmk_fence.c +@@ -1,5 +1,5 @@ + /* +- * Copyright 2009-2021 the Pacemaker project contributors ++ * Copyright 2009-2022 the Pacemaker project contributors + * + * The version control history for this file may have further details. + * +@@ -77,7 +77,7 @@ static void + notify_callback(stonith_t * st, stonith_event_t * e) + { + if (pcmk__str_eq(async_fence_data.target, e->target, pcmk__str_casei) +- && pcmk__str_eq(async_fence_data.action, e->action, pcmk__str_casei)) { ++ && pcmk__str_eq(async_fence_data.action, e->action, pcmk__str_none)) { + + pcmk__set_result(&async_fence_data.result, + stonith__event_exit_status(e), +@@ -549,7 +549,7 @@ pcmk__reduce_fence_history(stonith_history_t *history) + if ((hp->state == st_done) || (hp->state == st_failed)) { + /* action not in progress */ + if (pcmk__str_eq(hp->target, np->target, pcmk__str_casei) && +- pcmk__str_eq(hp->action, np->action, pcmk__str_casei) && ++ pcmk__str_eq(hp->action, np->action, pcmk__str_none) && + (hp->state == np->state) && + ((hp->state == st_done) || + pcmk__str_eq(hp->delegate, np->delegate, pcmk__str_casei))) { +-- +2.27.0 + + +From fe4c65a3b9e715c2b535709f989f2369d3637b78 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Mon, 17 Jan 2022 09:45:24 -0600 +Subject: [PATCH 09/11] Refactor: libpacemaker: avoid unnecessary string + duplication + +... 
and don't leave any dynamic memory hanging around +--- + lib/pacemaker/pcmk_fence.c | 11 ++++++++--- + 1 file changed, 8 insertions(+), 3 deletions(-) + +diff --git a/lib/pacemaker/pcmk_fence.c b/lib/pacemaker/pcmk_fence.c +index 2a8f50a555..260fa5ab8e 100644 +--- a/lib/pacemaker/pcmk_fence.c ++++ b/lib/pacemaker/pcmk_fence.c +@@ -141,6 +141,7 @@ pcmk__request_fencing(stonith_t *st, const char *target, const char *action, + unsigned int tolerance, int delay, char **reason) + { + crm_trigger_t *trig; ++ int rc = pcmk_rc_ok; + + async_fence_data.st = st; + async_fence_data.name = strdup(name); +@@ -160,10 +161,14 @@ pcmk__request_fencing(stonith_t *st, const char *target, const char *action, + + free(async_fence_data.name); + +- if ((reason != NULL) && (async_fence_data.result.exit_reason != NULL)) { +- *reason = strdup(async_fence_data.result.exit_reason); ++ if (reason != NULL) { ++ // Give the caller ownership of the exit reason ++ *reason = async_fence_data.result.exit_reason; ++ async_fence_data.result.exit_reason = NULL; + } +- return stonith__result2rc(&async_fence_data.result); ++ rc = stonith__result2rc(&async_fence_data.result); ++ pcmk__reset_result(&async_fence_data.result); ++ return rc; + } + + #ifdef BUILD_PUBLIC_LIBPACEMAKER +-- +2.27.0 + + +From 7b7af07796f05a1adabdac655582be2e17106f81 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Mon, 17 Jan 2022 10:07:10 -0600 +Subject: [PATCH 10/11] Doc: libpacemaker: improve pcmk__request_fencing() + doxygen block + +--- + include/pacemaker.h | 6 ++++-- + include/pcmki/pcmki_fence.h | 15 +++++++++------ + 2 files changed, 13 insertions(+), 8 deletions(-) + +diff --git a/include/pacemaker.h b/include/pacemaker.h +index e581f975a9..266a844892 100644 +--- a/include/pacemaker.h ++++ b/include/pacemaker.h +@@ -187,8 +187,10 @@ int pcmk_list_nodes(xmlNodePtr *xml, char *node_types); + * \param[in] tolerance If a successful action for \p target happened within + * this many ms, return 0 without performing the 
action + * again +- * \param[in] delay Apply a fencing delay. Value -1 means disable also any +- * static/random fencing delays from pcmk_delay_base/max ++ * \param[in] delay Apply this delay (in milliseconds) before initiating the ++ * fencing action (a value of -1 applies no delay and also ++ * disables any fencing delay from pcmk_delay_base and ++ * pcmk_delay_max) + * \param[out] reason If not NULL, where to put descriptive failure reason + * + * \return Standard Pacemaker return code +diff --git a/include/pcmki/pcmki_fence.h b/include/pcmki/pcmki_fence.h +index e3a7e27264..4a2fe3c481 100644 +--- a/include/pcmki/pcmki_fence.h ++++ b/include/pcmki/pcmki_fence.h +@@ -1,5 +1,5 @@ + /* +- * Copyright 2019-2021 the Pacemaker project contributors ++ * Copyright 2019-2022 the Pacemaker project contributors + * + * The version control history for this file may have further details. + * +@@ -22,17 +22,20 @@ + * \param[in] target The node that should be fenced + * \param[in] action The fencing action (on, off, reboot) to perform + * \param[in] name Who requested the fence action? +- * \param[in] timeout How long to wait for the operation to complete (in ms). ++ * \param[in] timeout How long to wait for the operation to complete (in ms) + * \param[in] tolerance If a successful action for \p target happened within +- * this many ms, return 0 without performing the action +- * again. +- * \param[in] delay Apply a fencing delay. 
Value -1 means disable also any +- * static/random fencing delays from pcmk_delay_base/max ++ * this many milliseconds, return success without ++ * performing the action again ++ * \param[in] delay Apply this delay (in milliseconds) before initiating the ++ * fencing action (a value of -1 applies no delay and also ++ * disables any fencing delay from pcmk_delay_base and ++ * pcmk_delay_max) + * \param[out] reason If not NULL, where to put descriptive failure reason + * + * \return Standard Pacemaker return code + * \note If \p reason is not NULL, the caller is responsible for freeing its + * returned value. ++ * \todo delay is eventually used with g_timeout_add() and should be guint + */ + int pcmk__request_fencing(stonith_t *st, const char *target, const char *action, + const char *name, unsigned int timeout, +-- +2.27.0 + + +From 61fb7271712e1246eb6d9472dc1afc7cd10e0a79 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Mon, 17 Jan 2022 10:18:02 -0600 +Subject: [PATCH 11/11] Fix: tools: get stonith_admin -T option working again + +Regression introduced in 2.0.3 by 3910b6fec + +This reverts commit 247eb303df934944c0b72b162bb661cee6e0ed8b +("Refactor: tools: drop unnecessary string duplication in stonith_admin") +and fixes a regression introduced when stonith_admin was converted to use +GOption. + +The -T option is intended to override the client name passed to the fencer API, +but the client name was set to the default (crm_system_name) after option +processing had already been done, so any value for -T was overwritten by the +default, and its memory was leaked. + +This commit sets the default only if -T was not used. 
+--- + tools/stonith_admin.c | 15 ++++++++++----- + 1 file changed, 10 insertions(+), 5 deletions(-) + +diff --git a/tools/stonith_admin.c b/tools/stonith_admin.c +index 5590faf11e..54774b6fee 100644 +--- a/tools/stonith_admin.c ++++ b/tools/stonith_admin.c +@@ -337,10 +337,10 @@ request_fencing(stonith_t *st, const char *target, const char *command, + GError **error) + { + char *reason = NULL; +- int rc = pcmk__request_fencing(st, target, command, crm_system_name, +- options.timeout * 1000, +- options.tolerance * 1000, +- options.delay, &reason); ++ int rc = pcmk__request_fencing(st, target, command, name, ++ options.timeout * 1000, ++ options.tolerance * 1000, ++ options.delay, &reason); + + if (rc != pcmk_rc_ok) { + const char *rc_str = pcmk_rc_str(rc); +@@ -392,6 +392,10 @@ main(int argc, char **argv) + + pcmk__cli_init_logging("stonith_admin", args->verbosity); + ++ if (name == NULL) { ++ name = strdup(crm_system_name); ++ } ++ + rc = pcmk__output_new(&out, args->output_ty, args->output_dest, argv); + if (rc != pcmk_rc_ok) { + exit_code = CRM_EX_ERROR; +@@ -526,7 +530,7 @@ main(int argc, char **argv) + if (st == NULL) { + rc = -ENOMEM; + } else if (!no_connect) { +- rc = st->cmds->connect(st, crm_system_name, NULL); ++ rc = st->cmds->connect(st, name, NULL); + } + if (rc < 0) { + out->err(out, "Could not connect to fencer: %s", pcmk_strerror(rc)); +@@ -640,6 +644,7 @@ main(int argc, char **argv) + out->finish(out, exit_code, true, NULL); + pcmk__output_free(out); + } ++ free(name); + stonith_key_value_freeall(options.params, 1, 1); + + if (st != NULL) { +-- +2.27.0 + diff --git a/SOURCES/018-failure-messages.patch b/SOURCES/018-failure-messages.patch new file mode 100644 index 0000000..3a2f249 --- /dev/null +++ b/SOURCES/018-failure-messages.patch @@ -0,0 +1,796 @@ +From 08c3420f2c857e7b27cd960f355d787af534da7d Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Tue, 18 Jan 2022 16:04:49 -0600 +Subject: [PATCH 01/12] Log: libcrmcommon: improve description for 
"not + connected" status + +PCMK_EXEC_NOT_CONNECTED was originally added to represent "No executor +connection", but it can also now mean no fencer connection, so change it to +"Internal communication failure" which is probably less mysterious to end users +anyway (especially since it should be accompanied by a more descriptive exit +reason). +--- + include/crm/common/results.h | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/include/crm/common/results.h b/include/crm/common/results.h +index 873faf5c43..3d322a7ce6 100644 +--- a/include/crm/common/results.h ++++ b/include/crm/common/results.h +@@ -349,7 +349,7 @@ pcmk_exec_status_str(enum pcmk_exec_status status) + case PCMK_EXEC_ERROR_HARD: return "Hard error"; + case PCMK_EXEC_ERROR_FATAL: return "Fatal error"; + case PCMK_EXEC_NOT_INSTALLED: return "Not installed"; +- case PCMK_EXEC_NOT_CONNECTED: return "No executor connection"; ++ case PCMK_EXEC_NOT_CONNECTED: return "Internal communication failure"; + case PCMK_EXEC_INVALID: return "Cannot execute now"; + case PCMK_EXEC_NO_FENCE_DEVICE: return "No fence device"; + case PCMK_EXEC_NO_SECRETS: return "CIB secrets unavailable"; +-- +2.27.0 + + +From 7c345cf8cf0cb054f5634206880df035bfef7311 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Mon, 20 Dec 2021 15:12:36 -0600 +Subject: [PATCH 02/12] Refactor: libcrmcommon: drop unnecessary system error + redefinitions + +portability.h defines some system error codes that might not be present on +non-Linux systems. + +This was a bad idea, since there's no way to ensure the defined values don't +conflict with existing system codes. However, we use a number of them, so it's +probably best to keep them, at least until we can make a backward compatibility +break. + +However, we don't use EUNATCH, ENOSR, or ENOSTR, so we can delete those. 
+--- + include/portability.h | 12 ------------ + lib/common/results.c | 9 ++++++--- + 2 files changed, 6 insertions(+), 15 deletions(-) + +diff --git a/include/portability.h b/include/portability.h +index 9a60c583a7..ee065a376d 100644 +--- a/include/portability.h ++++ b/include/portability.h +@@ -131,10 +131,6 @@ typedef union + # define EREMOTEIO 193 + # endif + +-# ifndef EUNATCH +-# define EUNATCH 194 +-# endif +- + # ifndef ENOKEY + # define ENOKEY 195 + # endif +@@ -147,14 +143,6 @@ typedef union + # define ETIME 197 + # endif + +-# ifndef ENOSR +-# define ENOSR 198 +-# endif +- +-# ifndef ENOSTR +-# define ENOSTR 199 +-# endif +- + # ifndef EKEYREJECTED + # define EKEYREJECTED 200 + # endif +diff --git a/lib/common/results.c b/lib/common/results.c +index 6d120694cd..96cd4e5659 100644 +--- a/lib/common/results.c ++++ b/lib/common/results.c +@@ -118,9 +118,6 @@ pcmk_strerror(int rc) + case EREMOTEIO: + return "Remote I/O error"; + /* coverity[dead_error_condition] False positive on non-Linux */ +- case EUNATCH: +- return "Protocol driver not attached"; +- /* coverity[dead_error_condition] False positive on non-Linux */ + case ENOKEY: + return "Required key not available"; + } +@@ -342,8 +339,12 @@ pcmk_rc_name(int rc) + case ENOMSG: return "ENOMSG"; + case ENOPROTOOPT: return "ENOPROTOOPT"; + case ENOSPC: return "ENOSPC"; ++#ifdef ENOSR + case ENOSR: return "ENOSR"; ++#endif ++#ifdef ENOSTR + case ENOSTR: return "ENOSTR"; ++#endif + case ENOSYS: return "ENOSYS"; + case ENOTBLK: return "ENOTBLK"; + case ENOTCONN: return "ENOTCONN"; +@@ -376,7 +377,9 @@ pcmk_rc_name(int rc) + case ETIME: return "ETIME"; + case ETIMEDOUT: return "ETIMEDOUT"; + case ETXTBSY: return "ETXTBSY"; ++#ifdef EUNATCH + case EUNATCH: return "EUNATCH"; ++#endif + case EUSERS: return "EUSERS"; + /* case EWOULDBLOCK: return "EWOULDBLOCK"; */ + case EXDEV: return "EXDEV"; +-- +2.27.0 + + +From eac8d1ca51eac3f437e18584f7e013d976ecee2c Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Mon, 20 
Dec 2021 15:33:12 -0600 +Subject: [PATCH 03/12] Log: libcrmcommon: improve handling of portability.h + error codes + +portability.h defines some system error codes that might not be present on +non-Linux systems. + +Define a constant for each one (for example, PCMK__ECOMM for ECOMM) when +the system doesn't have the value, so we can detect that when relevant. + +Also, make sure pcmk_rc_name() and pcmk_rc_str() handle all of these values. +--- + include/portability.h | 8 ++++++++ + lib/common/results.c | 32 ++++++++++++++++++++++++++++++-- + 2 files changed, 38 insertions(+), 2 deletions(-) + +diff --git a/include/portability.h b/include/portability.h +index ee065a376d..5d5fbf21cb 100644 +--- a/include/portability.h ++++ b/include/portability.h +@@ -116,34 +116,42 @@ typedef union + # include + + # ifndef ENOTUNIQ ++# define PCMK__ENOTUNIQ + # define ENOTUNIQ 190 + # endif + + # ifndef ECOMM ++# define PCMK__ECOMM + # define ECOMM 191 + # endif + + # ifndef ELIBACC ++# define PCMK__ELIBACC + # define ELIBACC 192 + # endif + + # ifndef EREMOTEIO ++# define PCMK__EREMOTEIO + # define EREMOTEIO 193 + # endif + + # ifndef ENOKEY ++# define PCMK__ENOKEY + # define ENOKEY 195 + # endif + + # ifndef ENODATA ++# define PCMK__ENODATA + # define ENODATA 196 + # endif + + # ifndef ETIME ++# define PCMK__ETIME + # define ETIME 197 + # endif + + # ifndef EKEYREJECTED ++# define PCMK__EKEYREJECTED + # define EKEYREJECTED 200 + # endif + +diff --git a/lib/common/results.c b/lib/common/results.c +index 96cd4e5659..bcf289d0d6 100644 +--- a/lib/common/results.c ++++ b/lib/common/results.c +@@ -395,9 +395,9 @@ pcmk_rc_name(int rc) + #ifdef EISNAM // Not available on OS X, Illumos, Solaris + case EISNAM: return "EISNAM"; + case EKEYEXPIRED: return "EKEYEXPIRED"; +- case EKEYREJECTED: return "EKEYREJECTED"; + case EKEYREVOKED: return "EKEYREVOKED"; + #endif ++ case EKEYREJECTED: return "EKEYREJECTED"; + case EL2HLT: return "EL2HLT"; + case EL2NSYNC: return "EL2NSYNC"; + case EL3HLT: 
return "EL3HLT"; +@@ -443,7 +443,35 @@ pcmk_rc_str(int rc) + if (rc < 0) { + return "Unknown error"; + } +- return strerror(rc); ++ ++ // Handle values that could be defined by system or by portability.h ++ switch (rc) { ++#ifdef PCMK__ENOTUNIQ ++ case ENOTUNIQ: return "Name not unique on network"; ++#endif ++#ifdef PCMK__ECOMM ++ case ECOMM: return "Communication error on send"; ++#endif ++#ifdef PCMK__ELIBACC ++ case ELIBACC: return "Can not access a needed shared library"; ++#endif ++#ifdef PCMK__EREMOTEIO ++ case EREMOTEIO: return "Remote I/O error"; ++#endif ++#ifdef PCMK__ENOKEY ++ case ENOKEY: return "Required key not available"; ++#endif ++#ifdef PCMK__ENODATA ++ case ENODATA: return "No data available"; ++#endif ++#ifdef PCMK__ETIME ++ case ETIME: return "Timer expired"; ++#endif ++#ifdef PCMK__EKEYREJECTED ++ case EKEYREJECTED: return "Key was rejected by service"; ++#endif ++ default: return strerror(rc); ++ } + } + + // This returns negative values for errors +-- +2.27.0 + + +From 32a38ac6374f85c43e7f4051f5e519822cc481e6 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Mon, 20 Dec 2021 15:39:19 -0600 +Subject: [PATCH 04/12] Log: libcrmcommon: redefine pcmk_strerror() in terms of + pcmk_rc_str() + +... to reduce code duplication. This causes minor differences in the string for +a few values. +--- + lib/common/results.c | 67 +------------------------------------------- + 1 file changed, 1 insertion(+), 66 deletions(-) + +diff --git a/lib/common/results.c b/lib/common/results.c +index bcf289d0d6..b2c6e8d553 100644 +--- a/lib/common/results.c ++++ b/lib/common/results.c +@@ -57,72 +57,7 @@ pcmk_errorname(int rc) + const char * + pcmk_strerror(int rc) + { +- if (rc == 0) { +- return "OK"; +- } +- +- rc = abs(rc); +- +- // Of course rc > 0 ... 
unless someone passed INT_MIN as rc +- if ((rc > 0) && (rc < PCMK_ERROR_OFFSET)) { +- return strerror(rc); +- } +- +- switch (rc) { +- case pcmk_err_generic: +- return "Generic Pacemaker error"; +- case pcmk_err_no_quorum: +- return "Operation requires quorum"; +- case pcmk_err_schema_validation: +- return "Update does not conform to the configured schema"; +- case pcmk_err_transform_failed: +- return "Schema transform failed"; +- case pcmk_err_old_data: +- return "Update was older than existing configuration"; +- case pcmk_err_diff_failed: +- return "Application of an update diff failed"; +- case pcmk_err_diff_resync: +- return "Application of an update diff failed, requesting a full refresh"; +- case pcmk_err_cib_modified: +- return "The on-disk configuration was manually modified"; +- case pcmk_err_cib_backup: +- return "Could not archive the previous configuration"; +- case pcmk_err_cib_save: +- return "Could not save the new configuration to disk"; +- case pcmk_err_cib_corrupt: +- return "Could not parse on-disk configuration"; +- case pcmk_err_multiple: +- return "Resource active on multiple nodes"; +- case pcmk_err_node_unknown: +- return "Node not found"; +- case pcmk_err_already: +- return "Situation already as requested"; +- case pcmk_err_bad_nvpair: +- return "Bad name/value pair given"; +- case pcmk_err_schema_unchanged: +- return "Schema is already the latest available"; +- case pcmk_err_unknown_format: +- return "Unknown output format"; +- +- /* The following cases will only be hit on systems for which they are non-standard */ +- /* coverity[dead_error_condition] False positive on non-Linux */ +- case ENOTUNIQ: +- return "Name not unique on network"; +- /* coverity[dead_error_condition] False positive on non-Linux */ +- case ECOMM: +- return "Communication error on send"; +- /* coverity[dead_error_condition] False positive on non-Linux */ +- case ELIBACC: +- return "Can not access a needed shared library"; +- /* coverity[dead_error_condition] False 
positive on non-Linux */ +- case EREMOTEIO: +- return "Remote I/O error"; +- /* coverity[dead_error_condition] False positive on non-Linux */ +- case ENOKEY: +- return "Required key not available"; +- } +- crm_err("Unknown error code: %d", rc); +- return "Unknown error"; ++ return pcmk_rc_str(pcmk_legacy2rc(rc)); + } + + // Standard Pacemaker API return codes +-- +2.27.0 + + +From 7c331d7e2275ffebbfd5e2f6432a6137a66ee5db Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Mon, 20 Dec 2021 15:41:24 -0600 +Subject: [PATCH 05/12] Log: libcrmcommon: don't say "Unknown error" + +... which is unhelpful and annoying to users +--- + lib/common/results.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/lib/common/results.c b/lib/common/results.c +index b2c6e8d553..5ffac76549 100644 +--- a/lib/common/results.c ++++ b/lib/common/results.c +@@ -376,7 +376,7 @@ pcmk_rc_str(int rc) + return pcmk__rcs[pcmk_rc_error - rc].desc; + } + if (rc < 0) { +- return "Unknown error"; ++ return "Error"; + } + + // Handle values that could be defined by system or by portability.h +@@ -768,7 +768,7 @@ bz2_strerror(int rc) + case BZ_OUTBUFF_FULL: + return "output data will not fit into the buffer provided"; + } +- return "Unknown error"; ++ return "Data compression error"; + } + + crm_exit_t +-- +2.27.0 + + +From 26883b4edda7d81bfcb79bd7b33bb3210beff110 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Mon, 20 Dec 2021 16:01:39 -0600 +Subject: [PATCH 06/12] Log: fencing: don't warn if cluster has no watchdog + device + +--- + lib/fencing/st_client.c | 7 ++++++- + 1 file changed, 6 insertions(+), 1 deletion(-) + +diff --git a/lib/fencing/st_client.c b/lib/fencing/st_client.c +index b1de912b2a..a0f3119f3b 100644 +--- a/lib/fencing/st_client.c ++++ b/lib/fencing/st_client.c +@@ -187,7 +187,12 @@ stonith__watchdog_fencing_enabled_for_node_api(stonith_t *st, const char *node) + * we drop in here - so as not to make remote nodes + * panic on that answer + */ +- 
crm_warn("watchdog-fencing-query failed"); ++ if (rc == -ENODEV) { ++ crm_notice("Cluster does not have watchdog fencing device"); ++ } else { ++ crm_warn("Could not check for watchdog fencing device: %s", ++ pcmk_strerror(rc)); ++ } + } else if (list[0] == '\0') { + rv = TRUE; + } else { +-- +2.27.0 + + +From 72b3c42232deaca64ffba9582598c59331203761 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Mon, 20 Dec 2021 16:22:49 -0600 +Subject: [PATCH 07/12] Test: libcrmcommon: update pcmk_rc_str() unit test for + recent change + +--- + lib/common/tests/results/pcmk__results_test.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/lib/common/tests/results/pcmk__results_test.c b/lib/common/tests/results/pcmk__results_test.c +index 57a520c501..e08d4b6261 100644 +--- a/lib/common/tests/results/pcmk__results_test.c ++++ b/lib/common/tests/results/pcmk__results_test.c +@@ -30,7 +30,7 @@ static void + test_for_pcmk_rc_str(void **state) { + assert_string_equal(pcmk_rc_str(pcmk_rc_error-1), "Unknown output format"); + assert_string_equal(pcmk_rc_str(pcmk_rc_ok), "OK"); +- assert_string_equal(pcmk_rc_str(-1), "Unknown error"); ++ assert_string_equal(pcmk_rc_str(-1), "Error"); + } + + static void +-- +2.27.0 + + +From c1ad3d6640f695321a83183c95fae2f105adc429 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Tue, 21 Dec 2021 10:20:38 -0600 +Subject: [PATCH 08/12] Test: cts-lab: update expected patterns for recent + changes + +--- + cts/lab/CTStests.py | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/cts/lab/CTStests.py b/cts/lab/CTStests.py +index 62c832eb45..f4be998cfb 100644 +--- a/cts/lab/CTStests.py ++++ b/cts/lab/CTStests.py +@@ -3055,7 +3055,7 @@ class RemoteStonithd(RemoteDriver): + r"pacemaker-controld.*:\s+error.*: Operation remote-.*_monitor", + r"pacemaker-controld.*:\s+error.*: Result of monitor operation for remote-.*", + r"schedulerd.*:\s+Recover remote-.*\s*\(.*\)", +- r"error: Result of monitor operation for .* on 
remote-.*: No executor connection", ++ r"error: Result of monitor operation for .* on remote-.*: Internal communication failure", + ] + + ignore_pats.extend(RemoteDriver.errorstoignore(self)) +-- +2.27.0 + + +From f272e2f526633c707e894b39c7c7bce3c14de898 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Tue, 21 Dec 2021 15:40:49 -0600 +Subject: [PATCH 09/12] Log: controller,libpacemaker: make history XML creation + less chatty + +Other messages with the same info will already be logged at higher severity +--- + daemons/controld/controld_execd.c | 3 +-- + daemons/controld/controld_te_actions.c | 7 ++----- + include/pcmki/pcmki_sched_utils.h | 3 +-- + lib/pacemaker/pcmk_injections.c | 3 +-- + lib/pacemaker/pcmk_sched_actions.c | 12 +++++------- + 5 files changed, 10 insertions(+), 18 deletions(-) + +diff --git a/daemons/controld/controld_execd.c b/daemons/controld/controld_execd.c +index 15784e7687..52157fa5d4 100644 +--- a/daemons/controld/controld_execd.c ++++ b/daemons/controld/controld_execd.c +@@ -693,9 +693,8 @@ build_operation_update(xmlNode * parent, lrmd_rsc_info_t * rsc, lrmd_event_data_ + caller_version = CRM_FEATURE_SET; + } + +- crm_trace("Building %s operation update with originator version: %s", op->rsc_id, caller_version); + xml_op = pcmk__create_history_xml(parent, op, caller_version, target_rc, +- fsa_our_uname, src, LOG_DEBUG); ++ fsa_our_uname, src); + if (xml_op == NULL) { + return TRUE; + } +diff --git a/daemons/controld/controld_te_actions.c b/daemons/controld/controld_te_actions.c +index 63b7c72359..b0bcb8b2e4 100644 +--- a/daemons/controld/controld_te_actions.c ++++ b/daemons/controld/controld_te_actions.c +@@ -181,7 +181,6 @@ controld_record_action_timeout(crm_action_t *action) + lrmd_event_data_t *op = NULL; + xmlNode *state = NULL; + xmlNode *rsc = NULL; +- xmlNode *xml_op = NULL; + xmlNode *action_rsc = NULL; + + int rc = pcmk_ok; +@@ -245,12 +244,10 @@ controld_record_action_timeout(crm_action_t *action) + op->user_data = 
pcmk__transition_key(transition_graph->id, action->id, + target_rc, te_uuid); + +- xml_op = pcmk__create_history_xml(rsc, op, CRM_FEATURE_SET, target_rc, +- target, __func__, LOG_INFO); ++ pcmk__create_history_xml(rsc, op, CRM_FEATURE_SET, target_rc, target, ++ __func__); + lrmd_free_event(op); + +- crm_log_xml_trace(xml_op, "Action timeout"); +- + rc = fsa_cib_conn->cmds->update(fsa_cib_conn, XML_CIB_TAG_STATUS, state, call_options); + fsa_register_cib_callback(rc, FALSE, NULL, cib_action_updated); + free_xml(state); +diff --git a/include/pcmki/pcmki_sched_utils.h b/include/pcmki/pcmki_sched_utils.h +index 68d60fc7db..144424a609 100644 +--- a/include/pcmki/pcmki_sched_utils.h ++++ b/include/pcmki/pcmki_sched_utils.h +@@ -52,8 +52,7 @@ extern void process_utilization(pe_resource_t * rsc, pe_node_t ** prefer, pe_wor + + xmlNode *pcmk__create_history_xml(xmlNode *parent, lrmd_event_data_t *event, + const char *caller_version, int target_rc, +- const char *node, const char *origin, +- int level); ++ const char *node, const char *origin); + + # define LOAD_STOPPED "load_stopped" + +diff --git a/lib/pacemaker/pcmk_sched_transition.c b/lib/pacemaker/pcmk_sched_transition.c +index 678c3f5dd2..1aa90a5a0b 100644 +--- a/lib/pacemaker/pcmk_sched_transition.c ++++ b/lib/pacemaker/pcmk_sched_transition.c +@@ -201,8 +201,7 @@ inject_op(xmlNode * cib_resource, lrmd_event_data_t * op, int target_rc) + inject_op(xmlNode * cib_resource, lrmd_event_data_t * op, int target_rc) + { + return pcmk__create_history_xml(cib_resource, op, CRM_FEATURE_SET, +- target_rc, NULL, crm_system_name, +- LOG_TRACE); ++ target_rc, NULL, crm_system_name); + } + + static xmlNode * +diff --git a/lib/pacemaker/pcmk_sched_actions.c b/lib/pacemaker/pcmk_sched_actions.c +index f8200b0efc..4f63d3374d 100644 +--- a/lib/pacemaker/pcmk_sched_utils.c ++++ b/lib/pacemaker/pcmk_sched_utils.c +@@ -892,14 +892,13 @@ add_op_digest_to_xml(lrmd_event_data_t *op, xmlNode *update) + * \param[in] target_rc Expected result 
of operation + * \param[in] node Name of node on which operation was performed + * \param[in] origin Arbitrary description of update source +- * \param[in] level A log message will be logged at this level + * + * \return Newly created XML node for history update + */ + xmlNode * + pcmk__create_history_xml(xmlNode *parent, lrmd_event_data_t *op, + const char *caller_version, int target_rc, +- const char *node, const char *origin, int level) ++ const char *node, const char *origin) + { + char *key = NULL; + char *magic = NULL; +@@ -912,11 +911,10 @@ pcmk__create_history_xml(xmlNode *parent, lrmd_event_data_t *op, + const char *task = NULL; + + CRM_CHECK(op != NULL, return NULL); +- do_crm_log(level, "%s: Updating resource %s after %s op %s (interval=%u)", +- origin, op->rsc_id, op->op_type, +- pcmk_exec_status_str(op->op_status), op->interval_ms); +- +- crm_trace("DC version: %s", caller_version); ++ crm_trace("Creating history XML for %s-interval %s action for %s on %s " ++ "(DC version: %s, origin: %s)", ++ pcmk__readable_interval(op->interval_ms), op->op_type, op->rsc_id, ++ ((node == NULL)? "no node" : node), caller_version, origin); + + task = op->op_type; + +-- +2.27.0 + + +From 06b1da9e5345e0d1571042c11646fd7157961279 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Tue, 21 Dec 2021 17:09:44 -0600 +Subject: [PATCH 10/12] Feature: controller: improve exit reason for internal + timeouts + +Functionize the part of controld_record_action_timeout() that creates a fake +executor event, into a new function synthesize_timeout_event(), and have it set +a more detailed exit reason describing what timed out. 
+--- + daemons/controld/controld_te_actions.c | 61 ++++++++++++++++++++------ + 1 file changed, 48 insertions(+), 13 deletions(-) + +diff --git a/daemons/controld/controld_te_actions.c b/daemons/controld/controld_te_actions.c +index b0bcb8b2e4..de2fbb82bf 100644 +--- a/daemons/controld/controld_te_actions.c ++++ b/daemons/controld/controld_te_actions.c +@@ -175,6 +175,53 @@ te_crm_command(crm_graph_t * graph, crm_action_t * action) + return TRUE; + } + ++/*! ++ * \internal ++ * \brief Synthesize an executor event for a resource action timeout ++ * ++ * \param[in] action Resource action that timed out ++ * \param[in] target_rc Expected result of action that timed out ++ * ++ * Synthesize an executor event for a resource action timeout. (If the executor ++ * gets a timeout while waiting for a resource action to complete, that will be ++ * reported via the usual callback. This timeout means we didn't hear from the ++ * executor itself or the controller that relayed the action to the executor.) ++ * ++ * \return Newly created executor event for result of \p action ++ * \note The caller is responsible for freeing the return value using ++ * lrmd_free_event(). 
++ */ ++static lrmd_event_data_t * ++synthesize_timeout_event(crm_action_t *action, int target_rc) ++{ ++ lrmd_event_data_t *op = NULL; ++ const char *target = crm_element_value(action->xml, XML_LRM_ATTR_TARGET); ++ const char *reason = NULL; ++ char *dynamic_reason = NULL; ++ ++ if (pcmk__str_eq(target, get_local_node_name(), pcmk__str_casei)) { ++ reason = "Local executor did not return result in time"; ++ } else { ++ const char *router_node = NULL; ++ ++ router_node = crm_element_value(action->xml, XML_LRM_ATTR_ROUTER_NODE); ++ if (router_node == NULL) { ++ router_node = target; ++ } ++ dynamic_reason = crm_strdup_printf("Controller on %s did not return " ++ "result in time", router_node); ++ reason = dynamic_reason; ++ } ++ ++ op = pcmk__event_from_graph_action(NULL, action, PCMK_EXEC_TIMEOUT, ++ PCMK_OCF_UNKNOWN_ERROR, reason); ++ op->call_id = -1; ++ op->user_data = pcmk__transition_key(transition_graph->id, action->id, ++ target_rc, te_uuid); ++ free(dynamic_reason); ++ return op; ++} ++ + void + controld_record_action_timeout(crm_action_t *action) + { +@@ -231,19 +278,7 @@ controld_record_action_timeout(crm_action_t *action) + crm_copy_xml_element(action_rsc, rsc, XML_AGENT_ATTR_CLASS); + crm_copy_xml_element(action_rsc, rsc, XML_AGENT_ATTR_PROVIDER); + +- /* If the executor gets a timeout while waiting for the action to complete, +- * that will be reported via the usual callback. This timeout means that we +- * didn't hear from the executor or the controller that relayed the action +- * to the executor. 
+- */ +- op = pcmk__event_from_graph_action(NULL, action, PCMK_EXEC_TIMEOUT, +- PCMK_OCF_UNKNOWN_ERROR, +- "Cluster communication timeout " +- "(no response from executor)"); +- op->call_id = -1; +- op->user_data = pcmk__transition_key(transition_graph->id, action->id, +- target_rc, te_uuid); +- ++ op = synthesize_timeout_event(action, target_rc); + pcmk__create_history_xml(rsc, op, CRM_FEATURE_SET, target_rc, target, + __func__); + lrmd_free_event(op); +-- +2.27.0 + + +From be620d206faefab967d4c8567d6554d10c9e72ba Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Wed, 22 Dec 2021 16:35:06 -0600 +Subject: [PATCH 11/12] Feature: fencing: improve exit reason for fencing + timeouts + +Troubleshooting timeouts is one of the more difficult aspects of cluster +maintenance. We want to give as much of a hint as possible, but for fencing in +particular it is difficult because an operation might involve multiple retries +of multiple devices. + +Barring another major project to track exactly which devices, retries, etc., +were used in a given operation, these changes in wording are probably the best +we can do. +--- + daemons/fenced/fenced_remote.c | 8 +++++--- + lib/fencing/st_client.c | 2 +- + 2 files changed, 6 insertions(+), 4 deletions(-) + +diff --git a/daemons/fenced/fenced_remote.c b/daemons/fenced/fenced_remote.c +index 1e237150c5..6eebb7381e 100644 +--- a/daemons/fenced/fenced_remote.c ++++ b/daemons/fenced/fenced_remote.c +@@ -1,5 +1,5 @@ + /* +- * Copyright 2009-2021 the Pacemaker project contributors ++ * Copyright 2009-2022 the Pacemaker project contributors + * + * The version control history for this file may have further details. 
+ * +@@ -715,8 +715,10 @@ remote_op_timeout(gpointer userdata) + CRM_XS " id=%.8s", + op->action, op->target, op->client_name, op->id); + } else { +- finalize_timed_out_op(userdata, "Fencing could not be completed " +- "within overall timeout"); ++ finalize_timed_out_op(userdata, "Fencing did not complete within a " ++ "total timeout based on the " ++ "configured timeout and retries for " ++ "any devices attempted"); + } + return G_SOURCE_REMOVE; + } +diff --git a/lib/fencing/st_client.c b/lib/fencing/st_client.c +index a0f3119f3b..718739b321 100644 +--- a/lib/fencing/st_client.c ++++ b/lib/fencing/st_client.c +@@ -906,7 +906,7 @@ invoke_registered_callbacks(stonith_t *stonith, xmlNode *msg, int call_id) + if (msg == NULL) { + // Fencer didn't reply in time + pcmk__set_result(&result, CRM_EX_ERROR, PCMK_EXEC_TIMEOUT, +- "Timeout waiting for reply from fencer"); ++ "Fencer accepted request but did not reply in time"); + CRM_LOG_ASSERT(call_id > 0); + + } else { +-- +2.27.0 + + +From 0fe8ede2f8e838e335fe42846bdf147111ce9955 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Wed, 22 Dec 2021 17:09:09 -0600 +Subject: [PATCH 12/12] Feature: libcrmservice: improve exit reason for + timeouts + +The services library doesn't have enough information about an action to say +(for example) what configuration parameters might be relevant, but we can at +least distinguish what kind of agent timed out. 
+--- + lib/services/services_linux.c | 12 +++++++++++- + lib/services/systemd.c | 2 +- + 2 files changed, 12 insertions(+), 2 deletions(-) + +diff --git a/lib/services/services_linux.c b/lib/services/services_linux.c +index f15eee860e..d6aafcfe46 100644 +--- a/lib/services/services_linux.c ++++ b/lib/services/services_linux.c +@@ -677,9 +677,19 @@ async_action_complete(mainloop_child_t *p, pid_t pid, int core, int signo, + parse_exit_reason_from_stderr(op); + + } else if (mainloop_child_timeout(p)) { ++ const char *reason = NULL; ++ ++ if (op->rsc != NULL) { ++ reason = "Resource agent did not complete in time"; ++ } else if (pcmk__str_eq(op->standard, PCMK_RESOURCE_CLASS_STONITH, ++ pcmk__str_none)) { ++ reason = "Fence agent did not complete in time"; ++ } else { ++ reason = "Process did not complete in time"; ++ } + crm_info("%s[%d] timed out after %dms", op->id, op->pid, op->timeout); + services__set_result(op, services__generic_error(op), PCMK_EXEC_TIMEOUT, +- "Process did not exit within specified timeout"); ++ reason); + + } else if (op->cancel) { + /* If an in-flight recurring operation was killed because it was +diff --git a/lib/services/systemd.c b/lib/services/systemd.c +index 27a3b376db..d87b287424 100644 +--- a/lib/services/systemd.c ++++ b/lib/services/systemd.c +@@ -995,7 +995,7 @@ systemd_timeout_callback(gpointer p) + crm_info("%s action for systemd unit %s named '%s' timed out", + op->action, op->agent, op->rsc); + services__set_result(op, PCMK_OCF_UNKNOWN_ERROR, PCMK_EXEC_TIMEOUT, +- "Systemd action did not complete within specified timeout"); ++ "Systemd unit action did not complete in time"); + services__finalize_async_op(op); + return FALSE; + } +-- +2.27.0 + diff --git a/SOURCES/019-corosync-tracking.patch b/SOURCES/019-corosync-tracking.patch new file mode 100644 index 0000000..ac3ca96 --- /dev/null +++ b/SOURCES/019-corosync-tracking.patch @@ -0,0 +1,29 @@ +From e8bf0161b872267f1bb7143a9866fdc15ec218f2 Mon Sep 17 00:00:00 2001 +From: Jan 
Friesse +Date: Tue, 18 Jan 2022 16:35:24 +0100 +Subject: [PATCH] Fix: corosync: Repeat corosync_cfg_trackstart + +corosync_cfg_trackstart can fail with CS_ERR_TRY_AGAIN failure so +(similarly as for corosync_cfg_local_get, ...) handle failure with +using cs_repeat macro. +--- + daemons/pacemakerd/pcmkd_corosync.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/daemons/pacemakerd/pcmkd_corosync.c b/daemons/pacemakerd/pcmkd_corosync.c +index 7990bc43c5..cd7a40321d 100644 +--- a/daemons/pacemakerd/pcmkd_corosync.c ++++ b/daemons/pacemakerd/pcmkd_corosync.c +@@ -186,7 +186,8 @@ cluster_connect_cfg(void) + crm_debug("Corosync reports local node ID is %lu", (unsigned long) nodeid); + + #ifdef HAVE_COROSYNC_CFG_TRACKSTART +- rc = corosync_cfg_trackstart(cfg_handle, 0); ++ retries = 0; ++ cs_repeat(retries, 30, rc = corosync_cfg_trackstart(cfg_handle, 0)); + if (rc != CS_OK) { + crm_crit("Could not enable Corosync CFG shutdown tracker: %s " CRM_XS " rc=%d", + cs_strerror(rc), rc); +-- +2.27.0 + diff --git a/SOURCES/020-systemd-unit.patch b/SOURCES/020-systemd-unit.patch new file mode 100644 index 0000000..a425ae3 --- /dev/null +++ b/SOURCES/020-systemd-unit.patch @@ -0,0 +1,41 @@ +From e316840a7e1d2a72e3089ee194334244c959905a Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Wed, 19 Jan 2022 09:53:53 -0600 +Subject: [PATCH] Fix: pacemakerd: tweak systemd unit respawn settings + +If pacemaker exits immediately after starting, wait 1 second before trying to +respawn, since the default of 100ms is a bit aggressive for a Pacemaker +cluster. + +Also, allow 5 attempts in 25 seconds before giving up. 
+--- + daemons/pacemakerd/pacemaker.service.in | 6 ++++++ + 1 file changed, 6 insertions(+) + +diff --git a/daemons/pacemakerd/pacemaker.service.in b/daemons/pacemakerd/pacemaker.service.in +index 0363a2259c..3fd53d9ffb 100644 +--- a/daemons/pacemakerd/pacemaker.service.in ++++ b/daemons/pacemakerd/pacemaker.service.in +@@ -31,6 +31,9 @@ After=rsyslog.service + After=corosync.service + Requires=corosync.service + ++# If Pacemaker respawns repeatedly, give up after this many tries in this time ++StartLimitBurst=5 ++StartLimitIntervalSec=25s + + [Install] + WantedBy=multi-user.target +@@ -57,6 +60,9 @@ TasksMax=infinity + # resource. Sending -KILL will just get the node fenced + SendSIGKILL=no + ++# Systemd's default of respawning a failed service after 100ms is too aggressive ++RestartSec=1s ++ + # If we ever hit the StartLimitInterval/StartLimitBurst limit, and the + # admin wants to stop the cluster while pacemakerd is not running, it + # might be a good idea to enable the ExecStopPost directive below. 
+-- +2.27.0 + diff --git a/SOURCES/021-daemon-tracking.patch b/SOURCES/021-daemon-tracking.patch new file mode 100644 index 0000000..8259921 --- /dev/null +++ b/SOURCES/021-daemon-tracking.patch @@ -0,0 +1,354 @@ +From 9ee9fd6b98d8a5ff5eac57a14cbc0ce1009b10e4 Mon Sep 17 00:00:00 2001 +From: Klaus Wenninger +Date: Thu, 18 Nov 2021 13:23:34 +0100 +Subject: [PATCH 1/2] Feature: pacemakerd: keep tracking pacemakerd for + liveness + +--- + daemons/pacemakerd/pacemakerd.c | 2 + + daemons/pacemakerd/pacemakerd.h | 3 +- + daemons/pacemakerd/pcmkd_messages.c | 6 +- + daemons/pacemakerd/pcmkd_subdaemons.c | 139 +++++++++++++++++--------- + 4 files changed, 98 insertions(+), 52 deletions(-) + +diff --git a/daemons/pacemakerd/pacemakerd.c b/daemons/pacemakerd/pacemakerd.c +index 34d64c4053..062c2d5326 100644 +--- a/daemons/pacemakerd/pacemakerd.c ++++ b/daemons/pacemakerd/pacemakerd.c +@@ -259,6 +259,8 @@ main(int argc, char **argv) + pcmk_ipc_api_t *old_instance = NULL; + qb_ipcs_service_t *ipcs = NULL; + ++ subdaemon_check_progress = time(NULL); ++ + crm_log_preinit(NULL, argc, argv); + mainloop_add_signal(SIGHUP, pcmk_ignore); + mainloop_add_signal(SIGQUIT, pcmk_sigquit); +diff --git a/daemons/pacemakerd/pacemakerd.h b/daemons/pacemakerd/pacemakerd.h +index 7c541bbf9e..424dbbcc5d 100644 +--- a/daemons/pacemakerd/pacemakerd.h ++++ b/daemons/pacemakerd/pacemakerd.h +@@ -1,5 +1,5 @@ + /* +- * Copyright 2010-2021 the Pacemaker project contributors ++ * Copyright 2010-2022 the Pacemaker project contributors + * + * The version control history for this file may have further details. 
+ * +@@ -21,6 +21,7 @@ extern unsigned int shutdown_complete_state_reported_to; + extern gboolean shutdown_complete_state_reported_client_closed; + extern crm_trigger_t *shutdown_trigger; + extern crm_trigger_t *startup_trigger; ++extern time_t subdaemon_check_progress; + + gboolean mcp_read_config(void); + +diff --git a/daemons/pacemakerd/pcmkd_messages.c b/daemons/pacemakerd/pcmkd_messages.c +index 0439986ecf..f2cddc353e 100644 +--- a/daemons/pacemakerd/pcmkd_messages.c ++++ b/daemons/pacemakerd/pcmkd_messages.c +@@ -1,5 +1,5 @@ + /* +- * Copyright 2010-2021 the Pacemaker project contributors ++ * Copyright 2010-2022 the Pacemaker project contributors + * + * The version control history for this file may have further details. + * +@@ -25,7 +25,6 @@ pcmk_handle_ping_request(pcmk__client_t *c, xmlNode *msg, uint32_t id) + const char *value = NULL; + xmlNode *ping = NULL; + xmlNode *reply = NULL; +- time_t pinged = time(NULL); + const char *from = crm_element_value(msg, F_CRM_SYS_FROM); + + /* Pinged for status */ +@@ -36,7 +35,8 @@ pcmk_handle_ping_request(pcmk__client_t *c, xmlNode *msg, uint32_t id) + value = crm_element_value(msg, F_CRM_SYS_TO); + crm_xml_add(ping, XML_PING_ATTR_SYSFROM, value); + crm_xml_add(ping, XML_PING_ATTR_PACEMAKERDSTATE, pacemakerd_state); +- crm_xml_add_ll(ping, XML_ATTR_TSTAMP, (long long) pinged); ++ crm_xml_add_ll(ping, XML_ATTR_TSTAMP, ++ (long long) subdaemon_check_progress); + crm_xml_add(ping, XML_PING_ATTR_STATUS, "ok"); + reply = create_reply(msg, ping); + free_xml(ping); +diff --git a/daemons/pacemakerd/pcmkd_subdaemons.c b/daemons/pacemakerd/pcmkd_subdaemons.c +index a54fcce1ba..c03903c99e 100644 +--- a/daemons/pacemakerd/pcmkd_subdaemons.c ++++ b/daemons/pacemakerd/pcmkd_subdaemons.c +@@ -32,14 +32,16 @@ typedef struct pcmk_child_s { + const char *command; + const char *endpoint; /* IPC server name */ + bool needs_cluster; ++ int check_count; + + /* Anything below here will be dynamically initialized */ + bool needs_retry; + 
bool active_before_startup; + } pcmk_child_t; + +-#define PCMK_PROCESS_CHECK_INTERVAL 5 +-#define SHUTDOWN_ESCALATION_PERIOD 180000 /* 3m */ ++#define PCMK_PROCESS_CHECK_INTERVAL 1 ++#define PCMK_PROCESS_CHECK_RETRIES 5 ++#define SHUTDOWN_ESCALATION_PERIOD 180000 /* 3m */ + + /* Index into the array below */ + #define PCMK_CHILD_CONTROLD 5 +@@ -82,6 +84,7 @@ static char *opts_vgrind[] = { NULL, NULL, NULL, NULL, NULL }; + + crm_trigger_t *shutdown_trigger = NULL; + crm_trigger_t *startup_trigger = NULL; ++time_t subdaemon_check_progress = 0; + + /* When contacted via pacemakerd-api by a client having sbd in + * the name we assume it is sbd-daemon which wants to know +@@ -103,7 +106,6 @@ gboolean running_with_sbd = FALSE; /* local copy */ + GMainLoop *mainloop = NULL; + + static gboolean fatal_error = FALSE; +-static bool global_keep_tracking = false; + + static gboolean check_active_before_startup_processes(gpointer user_data); + static int child_liveness(pcmk_child_t *child); +@@ -127,44 +129,94 @@ pcmkd_cluster_connected(void) + static gboolean + check_active_before_startup_processes(gpointer user_data) + { +- gboolean keep_tracking = FALSE; +- +- for (int i = 0; i < PCMK__NELEM(pcmk_children); i++) { +- if (!pcmk_children[i].active_before_startup) { +- /* we are already tracking it as a child process. */ +- continue; +- } else { +- int rc = child_liveness(&pcmk_children[i]); +- +- switch (rc) { +- case pcmk_rc_ok: +- break; +- case pcmk_rc_ipc_unresponsive: +- case pcmk_rc_ipc_pid_only: // This case: it was previously OK +- if (pcmk_children[i].respawn) { +- crm_err("%s[%lld] terminated%s", pcmk_children[i].name, +- (long long) PCMK__SPECIAL_PID_AS_0(pcmk_children[i].pid), +- (rc == pcmk_rc_ipc_pid_only)? " as IPC server" : ""); +- } else { +- /* orderly shutdown */ +- crm_notice("%s[%lld] terminated%s", pcmk_children[i].name, +- (long long) PCMK__SPECIAL_PID_AS_0(pcmk_children[i].pid), +- (rc == pcmk_rc_ipc_pid_only)? 
" as IPC server" : ""); +- } +- pcmk_process_exit(&(pcmk_children[i])); +- continue; +- default: +- crm_exit(CRM_EX_FATAL); +- break; /* static analysis/noreturn */ ++ static int next_child = 0; ++ int rc = child_liveness(&pcmk_children[next_child]); ++ ++ crm_trace("%s[%lld] checked as %d", ++ pcmk_children[next_child].name, ++ (long long) PCMK__SPECIAL_PID_AS_0( ++ pcmk_children[next_child].pid), ++ rc); ++ ++ switch (rc) { ++ case pcmk_rc_ok: ++ pcmk_children[next_child].check_count = 0; ++ next_child++; ++ subdaemon_check_progress = time(NULL); ++ break; ++ case pcmk_rc_ipc_pid_only: // This case: it was previously OK ++ pcmk_children[next_child].check_count++; ++ if (pcmk_children[next_child].check_count >= PCMK_PROCESS_CHECK_RETRIES) { ++ crm_err("%s[%lld] is unresponsive to ipc after %d tries but " ++ "we found the pid so have it killed that we can restart", ++ pcmk_children[next_child].name, ++ (long long) PCMK__SPECIAL_PID_AS_0( ++ pcmk_children[next_child].pid), ++ pcmk_children[next_child].check_count); ++ stop_child(&pcmk_children[next_child], SIGKILL); ++ if (pcmk_children[next_child].respawn) { ++ /* as long as the respawn-limit isn't reached ++ give it another round of check retries ++ */ ++ pcmk_children[next_child].check_count = 0; ++ } ++ } else { ++ crm_notice("%s[%lld] is unresponsive to ipc after %d tries", ++ pcmk_children[next_child].name, ++ (long long) PCMK__SPECIAL_PID_AS_0( ++ pcmk_children[next_child].pid), ++ pcmk_children[next_child].check_count); ++ if (pcmk_children[next_child].respawn) { ++ /* as long as the respawn-limit isn't reached ++ and we haven't run out of connect retries ++ we account this as progress we are willing ++ to tell to sbd ++ */ ++ subdaemon_check_progress = time(NULL); ++ } + } +- } +- /* at least one of the processes found at startup +- * is still going, so keep this recurring timer around */ +- keep_tracking = TRUE; ++ /* go to the next child and see if ++ we can make progress there ++ */ ++ next_child++; ++ 
break; ++ case pcmk_rc_ipc_unresponsive: ++ if (pcmk_children[next_child].respawn) { ++ crm_err("%s[%lld] terminated", ++ pcmk_children[next_child].name, ++ (long long) PCMK__SPECIAL_PID_AS_0( ++ pcmk_children[next_child].pid)); ++ } else { ++ /* orderly shutdown */ ++ crm_notice("%s[%lld] terminated", ++ pcmk_children[next_child].name, ++ (long long) PCMK__SPECIAL_PID_AS_0( ++ pcmk_children[next_child].pid)); ++ } ++ pcmk_process_exit(&(pcmk_children[next_child])); ++ if (!pcmk_children[next_child].respawn) { ++ /* if a subdaemon is down and we don't want it ++ to be restarted this is a success during ++ shutdown. if it isn't restarted anymore ++ due to MAX_RESPAWN it is ++ rather no success. ++ */ ++ if (pcmk_children[next_child].respawn_count <= MAX_RESPAWN) { ++ subdaemon_check_progress = time(NULL); ++ } ++ next_child++; ++ } ++ break; ++ default: ++ crm_exit(CRM_EX_FATAL); ++ break; /* static analysis/noreturn */ + } + +- global_keep_tracking = keep_tracking; +- return keep_tracking; ++ if (next_child >= PCMK__NELEM(pcmk_children)) { ++ next_child = 0; ++ } ++ ++ return G_SOURCE_CONTINUE; + } + + static gboolean +@@ -257,11 +309,6 @@ pcmk_process_exit(pcmk_child_t * child) + child->name, child->endpoint); + /* need to monitor how it evolves, and start new process if badly */ + child->active_before_startup = true; +- if (!global_keep_tracking) { +- global_keep_tracking = true; +- g_timeout_add_seconds(PCMK_PROCESS_CHECK_INTERVAL, +- check_active_before_startup_processes, NULL); +- } + + } else { + if (child->needs_cluster && !pcmkd_cluster_connected()) { +@@ -648,7 +695,6 @@ child_liveness(pcmk_child_t *child) + int + find_and_track_existing_processes(void) + { +- bool tracking = false; + bool wait_in_progress; + int rc; + size_t i, rounds; +@@ -716,7 +762,6 @@ find_and_track_existing_processes(void) + pcmk_children[i].pid)); + pcmk_children[i].respawn_count = -1; /* 0~keep watching */ + pcmk_children[i].active_before_startup = true; +- tracking = true; + 
break; + case pcmk_rc_ipc_pid_only: + if (pcmk_children[i].respawn_count == WAIT_TRIES) { +@@ -751,10 +796,8 @@ find_and_track_existing_processes(void) + pcmk_children[i].respawn_count = 0; /* restore pristine state */ + } + +- if (tracking) { +- g_timeout_add_seconds(PCMK_PROCESS_CHECK_INTERVAL, ++ g_timeout_add_seconds(PCMK_PROCESS_CHECK_INTERVAL, + check_active_before_startup_processes, NULL); +- } + return pcmk_rc_ok; + } + +-- +2.27.0 + + +From 4b60aa100669ff494dd3f1303ca9586dc52e95e4 Mon Sep 17 00:00:00 2001 +From: Klaus Wenninger +Date: Thu, 9 Dec 2021 11:25:22 +0100 +Subject: [PATCH 2/2] Fix: ipc_client: use libqb async API for connect + +--- + configure.ac | 3 +++ + lib/common/ipc_client.c | 22 ++++++++++++++++++++++ + 2 files changed, 25 insertions(+) + +diff --git a/configure.ac b/configure.ac +index f43fb724c7..c747fe1193 100644 +--- a/configure.ac ++++ b/configure.ac +@@ -1309,6 +1309,9 @@ PKG_CHECK_MODULES(libqb, libqb >= 0.17) + CPPFLAGS="$libqb_CFLAGS $CPPFLAGS" + LIBS="$libqb_LIBS $LIBS" + ++dnl libqb libqb-2.0.3 + ipc-connect-async-API (2022-01) ++AC_CHECK_FUNCS([qb_ipcc_connect_async]) ++ + dnl libqb 2.0.2+ (2020-10) + AC_CHECK_FUNCS(qb_ipcc_auth_get, + AC_DEFINE(HAVE_IPCC_AUTH_GET, 1, +diff --git a/lib/common/ipc_client.c b/lib/common/ipc_client.c +index c5afdf3a3d..417b9ef175 100644 +--- a/lib/common/ipc_client.c ++++ b/lib/common/ipc_client.c +@@ -1407,13 +1407,35 @@ pcmk__ipc_is_authentic_process_active(const char *name, uid_t refuid, + int32_t qb_rc; + pid_t found_pid = 0; uid_t found_uid = 0; gid_t found_gid = 0; + qb_ipcc_connection_t *c; ++#ifdef HAVE_QB_IPCC_CONNECT_ASYNC ++ struct pollfd pollfd = { 0, }; ++ int poll_rc; + ++ c = qb_ipcc_connect_async(name, 0, ++ &(pollfd.fd)); ++#else + c = qb_ipcc_connect(name, 0); ++#endif + if (c == NULL) { + crm_info("Could not connect to %s IPC: %s", name, strerror(errno)); + rc = pcmk_rc_ipc_unresponsive; + goto bail; + } ++#ifdef HAVE_QB_IPCC_CONNECT_ASYNC ++ pollfd.events = POLLIN; ++ do { ++ 
poll_rc = poll(&pollfd, 1, 2000); ++ } while ((poll_rc == -1) && (errno == EINTR)); ++ if ((poll_rc <= 0) || (qb_ipcc_connect_continue(c) != 0)) { ++ crm_info("Could not connect to %s IPC: %s", name, ++ (poll_rc == 0)?"timeout":strerror(errno)); ++ rc = pcmk_rc_ipc_unresponsive; ++ if (poll_rc > 0) { ++ c = NULL; // qb_ipcc_connect_continue cleaned up for us ++ } ++ goto bail; ++ } ++#endif + + qb_rc = qb_ipcc_fd_get(c, &fd); + if (qb_rc != 0) { +-- +2.27.0 + diff --git a/SOURCES/022-failure-messages.patch b/SOURCES/022-failure-messages.patch new file mode 100644 index 0000000..fab1013 --- /dev/null +++ b/SOURCES/022-failure-messages.patch @@ -0,0 +1,1338 @@ +From 9ee3d6c9b0aba6aae022cc152a3b3472fe388fa3 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Thu, 6 Jan 2022 16:44:32 -0600 +Subject: [PATCH 01/15] Refactor: fencer: add exit reason to fencing operation + object + +In order to pass a fencing action's exit reason with the action history, +we need the exit reason in remote_fencing_op_t. Nothing sets or uses it as of +this commit. 
+--- + daemons/fenced/fenced_remote.c | 2 ++ + daemons/fenced/pacemaker-fenced.h | 4 +++- + 2 files changed, 5 insertions(+), 1 deletion(-) + +diff --git a/daemons/fenced/fenced_remote.c b/daemons/fenced/fenced_remote.c +index 6eebb7381e..0fa9706140 100644 +--- a/daemons/fenced/fenced_remote.c ++++ b/daemons/fenced/fenced_remote.c +@@ -260,6 +260,8 @@ free_remote_op(gpointer data) + } + g_list_free_full(op->automatic_list, free); + g_list_free(op->duplicates); ++ ++ pcmk__reset_result(&op->result); + free(op); + } + +diff --git a/daemons/fenced/pacemaker-fenced.h b/daemons/fenced/pacemaker-fenced.h +index 502fcc9a29..1a5c933ea7 100644 +--- a/daemons/fenced/pacemaker-fenced.h ++++ b/daemons/fenced/pacemaker-fenced.h +@@ -1,5 +1,5 @@ + /* +- * Copyright 2009-2021 the Pacemaker project contributors ++ * Copyright 2009-2022 the Pacemaker project contributors + * + * This source code is licensed under the GNU General Public License version 2 + * or later (GPLv2+) WITHOUT ANY WARRANTY. +@@ -151,6 +151,8 @@ typedef struct remote_fencing_op_s { + /*! The point at which the remote operation completed(nsec) */ + long long completed_nsec; + ++ /*! The (potentially intermediate) result of the operation */ ++ pcmk__action_result_t result; + } remote_fencing_op_t; + + void fenced_broadcast_op_result(remote_fencing_op_t *op, +-- +2.27.0 + + +From 97a2c318866adc5ef5e426c5c3b753df1fa3ab66 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Thu, 6 Jan 2022 17:08:42 -0600 +Subject: [PATCH 02/15] Refactor: fencer: track full result in + remote_fencing_op_t + +Now that remote_fencing_op_t has a place for the full result, +set it before calling finalize_op(), instead of passing a separate result +object to finalize_op(). + +As a bonus, this simplifies the memory management, reducing the chance of +mistakes. 
+--- + daemons/fenced/fenced_remote.c | 161 ++++++++++++++++----------------- + 1 file changed, 77 insertions(+), 84 deletions(-) + +diff --git a/daemons/fenced/fenced_remote.c b/daemons/fenced/fenced_remote.c +index 0fa9706140..30edbff890 100644 +--- a/daemons/fenced/fenced_remote.c ++++ b/daemons/fenced/fenced_remote.c +@@ -82,8 +82,7 @@ extern xmlNode *stonith_create_op(int call_id, const char *token, const char *op + static void request_peer_fencing(remote_fencing_op_t *op, + peer_device_info_t *peer, + pcmk__action_result_t *result); +-static void finalize_op(remote_fencing_op_t *op, xmlNode *data, +- pcmk__action_result_t *result, bool dup); ++static void finalize_op(remote_fencing_op_t *op, xmlNode *data, bool dup); + static void report_timeout_period(remote_fencing_op_t * op, int op_timeout); + static int get_op_total_timeout(const remote_fencing_op_t *op, + const peer_device_info_t *chosen_peer); +@@ -485,7 +484,9 @@ finalize_op_duplicates(remote_fencing_op_t *op, xmlNode *data, + other->client_name, other->originator, + pcmk_exec_status_str(result->execution_status), + other->id); +- finalize_op(other, data, result, true); ++ pcmk__set_result(&other->result, result->exit_status, ++ result->execution_status, result->exit_reason); ++ finalize_op(other, data, true); + + } else { + // Possible if (for example) it timed out already +@@ -520,20 +521,20 @@ delegate_from_xml(xmlNode *xml) + * + * \param[in] op Fencer operation that completed + * \param[in] data If not NULL, XML reply of last delegated fencing operation +- * \param[in] result Full operation result + * \param[in] dup Whether this operation is a duplicate of another + * (in which case, do not broadcast the result) ++ * ++ * \note The operation result should be set before calling this function. 
+ */ + static void +-finalize_op(remote_fencing_op_t *op, xmlNode *data, +- pcmk__action_result_t *result, bool dup) ++finalize_op(remote_fencing_op_t *op, xmlNode *data, bool dup) + { + int level = LOG_ERR; + const char *subt = NULL; + xmlNode *local_data = NULL; + gboolean op_merged = FALSE; + +- CRM_CHECK((op != NULL) && (result != NULL), return); ++ CRM_CHECK((op != NULL), return); + + if (op->notify_sent) { + // Most likely, this is a timed-out action that eventually completed +@@ -557,11 +558,11 @@ finalize_op(remote_fencing_op_t *op, xmlNode *data, + local_data = data; + + } else if (op->delegate == NULL) { +- switch (result->execution_status) { ++ switch (op->result.execution_status) { + case PCMK_EXEC_NO_FENCE_DEVICE: + break; + case PCMK_EXEC_INVALID: +- if (result->exit_status == CRM_EX_EXPIRED) { ++ if (op->result.exit_status == CRM_EX_EXPIRED) { + break; + } + // else fall through +@@ -581,12 +582,12 @@ finalize_op(remote_fencing_op_t *op, xmlNode *data, + subt = crm_element_value(data, F_SUBTYPE); + if (!dup && !pcmk__str_eq(subt, "broadcast", pcmk__str_casei)) { + /* Defer notification until the bcast message arrives */ +- fenced_broadcast_op_result(op, result, op_merged); ++ fenced_broadcast_op_result(op, &op->result, op_merged); + free_xml(local_data); + return; + } + +- if (pcmk__result_ok(result) || dup ++ if (pcmk__result_ok(&op->result) || dup + || !pcmk__str_eq(op->originator, stonith_our_uname, pcmk__str_casei)) { + level = LOG_NOTICE; + } +@@ -595,16 +596,17 @@ finalize_op(remote_fencing_op_t *op, xmlNode *data, + (op->target? op->target : ""), + (op->delegate? op->delegate : "unknown node"), + op->client_name, op->originator, +- (op_merged? " (merged)" : ""), crm_exit_str(result->exit_status), +- pcmk_exec_status_str(result->execution_status), +- ((result->exit_reason == NULL)? "" : ": "), +- ((result->exit_reason == NULL)? "" : result->exit_reason), ++ (op_merged? 
" (merged)" : ""), ++ crm_exit_str(op->result.exit_status), ++ pcmk_exec_status_str(op->result.execution_status), ++ ((op->result.exit_reason == NULL)? "" : ": "), ++ ((op->result.exit_reason == NULL)? "" : op->result.exit_reason), + op->id); + +- handle_local_reply_and_notify(op, data, result); ++ handle_local_reply_and_notify(op, data, &op->result); + + if (!dup) { +- finalize_op_duplicates(op, data, result); ++ finalize_op_duplicates(op, data, &op->result); + } + + /* Free non-essential parts of the record +@@ -634,7 +636,6 @@ static gboolean + remote_op_watchdog_done(gpointer userdata) + { + remote_fencing_op_t *op = userdata; +- pcmk__action_result_t result = PCMK__UNKNOWN_RESULT; + + op->op_timer_one = 0; + +@@ -642,8 +643,8 @@ remote_op_watchdog_done(gpointer userdata) + CRM_XS " id=%.8s", + op->action, op->target, op->client_name, op->id); + op->state = st_done; +- pcmk__set_result(&result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); +- finalize_op(op, NULL, &result, false); ++ pcmk__set_result(&op->result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); ++ finalize_op(op, NULL, false); + return G_SOURCE_REMOVE; + } + +@@ -676,8 +677,6 @@ remote_op_timeout_one(gpointer userdata) + static void + finalize_timed_out_op(remote_fencing_op_t *op, const char *reason) + { +- pcmk__action_result_t result = PCMK__UNKNOWN_RESULT; +- + op->op_timer_total = 0; + + crm_debug("Action '%s' targeting %s for client %s timed out " +@@ -690,13 +689,12 @@ finalize_timed_out_op(remote_fencing_op_t *op, const char *reason) + * devices, and return success. 
+ */ + op->state = st_done; +- pcmk__set_result(&result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); ++ pcmk__set_result(&op->result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); + } else { + op->state = st_failed; +- pcmk__set_result(&result, CRM_EX_ERROR, PCMK_EXEC_TIMEOUT, reason); ++ pcmk__set_result(&op->result, CRM_EX_ERROR, PCMK_EXEC_TIMEOUT, reason); + } +- finalize_op(op, NULL, &result, false); +- pcmk__reset_result(&result); ++ finalize_op(op, NULL, false); + } + + /*! +@@ -1094,13 +1092,9 @@ fenced_handle_manual_confirmation(pcmk__client_t *client, xmlNode *msg) + set_fencing_completed(op); + op->delegate = strdup("a human"); + +- { +- // For the fencer's purposes, the fencing operation is done +- pcmk__action_result_t result = PCMK__UNKNOWN_RESULT; +- +- pcmk__set_result(&result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); +- finalize_op(op, msg, &result, false); +- } ++ // For the fencer's purposes, the fencing operation is done ++ pcmk__set_result(&op->result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); ++ finalize_op(op, msg, false); + + /* For the requester's purposes, the operation is still pending. The + * actual result will be sent asynchronously via the operation's done_cb(). 
+@@ -1279,16 +1273,11 @@ initiate_remote_stonith_op(pcmk__client_t *client, xmlNode *request, + switch (op->state) { + case st_failed: + // advance_topology_level() exhausted levels +- { +- pcmk__action_result_t result = PCMK__UNKNOWN_RESULT; +- +- pcmk__set_result(&result, CRM_EX_ERROR, PCMK_EXEC_ERROR, +- "All topology levels failed"); +- crm_warn("Could not request peer fencing (%s) targeting %s " +- CRM_XS " id=%.8s", op->action, op->target, op->id); +- finalize_op(op, NULL, &result, false); +- pcmk__reset_result(&result); +- } ++ pcmk__set_result(&op->result, CRM_EX_ERROR, PCMK_EXEC_ERROR, ++ "All topology levels failed"); ++ crm_warn("Could not request peer fencing (%s) targeting %s " ++ CRM_XS " id=%.8s", op->action, op->target, op->id); ++ finalize_op(op, NULL, false); + return op; + + case st_duplicate: +@@ -1613,10 +1602,6 @@ static void + advance_topology_device_in_level(remote_fencing_op_t *op, const char *device, + xmlNode *msg) + { +- pcmk__action_result_t result = PCMK__UNKNOWN_RESULT; +- +- pcmk__set_result(&result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); +- + /* Advance to the next device at this topology level, if any */ + if (op->devices) { + op->devices = op->devices->next; +@@ -1644,6 +1629,10 @@ advance_topology_device_in_level(remote_fencing_op_t *op, const char *device, + } + + if (op->devices) { ++ pcmk__action_result_t result = PCMK__UNKNOWN_RESULT; ++ ++ pcmk__set_result(&result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); ++ + /* Necessary devices remain, so execute the next one */ + crm_trace("Next targeting %s on behalf of %s@%s", + op->target, op->client_name, op->originator); +@@ -1659,7 +1648,8 @@ advance_topology_device_in_level(remote_fencing_op_t *op, const char *device, + crm_trace("Marking complex fencing op targeting %s as complete", + op->target); + op->state = st_done; +- finalize_op(op, msg, &result, false); ++ pcmk__set_result(&op->result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); ++ finalize_op(op, msg, false); + } + } + +@@ -1868,7 +1858,9 @@ 
request_peer_fencing(remote_fencing_op_t *op, peer_device_info_t *peer, + } + + op->state = st_failed; +- finalize_op(op, NULL, result, false); ++ pcmk__set_result(&op->result, result->exit_status, ++ result->execution_status, result->exit_reason); ++ finalize_op(op, NULL, false); + + } else { + crm_info("Waiting for additional peers capable of fencing (%s) %s%s%s " +@@ -2245,31 +2237,34 @@ fenced_process_fencing_reply(xmlNode *msg) + /* Could be for an event that began before we started */ + /* TODO: Record the op for later querying */ + crm_info("Received peer result of unknown or expired operation %s", id); +- goto done; ++ pcmk__reset_result(&result); ++ return; + } + ++ op->result = result; // The operation takes ownership of the result ++ + if (op->devices && device && !pcmk__str_eq(op->devices->data, device, pcmk__str_casei)) { + crm_err("Received outdated reply for device %s (instead of %s) to " + "fence (%s) %s. Operation already timed out at peer level.", + device, (const char *) op->devices->data, op->action, op->target); +- goto done; ++ return; + } + + if (pcmk__str_eq(crm_element_value(msg, F_SUBTYPE), "broadcast", pcmk__str_casei)) { + crm_debug("Finalizing action '%s' targeting %s on behalf of %s@%s: %s%s%s%s " + CRM_XS " id=%.8s", + op->action, op->target, op->client_name, op->originator, +- pcmk_exec_status_str(result.execution_status), +- (result.exit_reason == NULL)? "" : " (", +- (result.exit_reason == NULL)? "" : result.exit_reason, +- (result.exit_reason == NULL)? "" : ")", op->id); +- if (pcmk__result_ok(&result)) { ++ pcmk_exec_status_str(op->result.execution_status), ++ (op->result.exit_reason == NULL)? "" : " (", ++ (op->result.exit_reason == NULL)? "" : op->result.exit_reason, ++ (op->result.exit_reason == NULL)? 
"" : ")", op->id); ++ if (pcmk__result_ok(&op->result)) { + op->state = st_done; + } else { + op->state = st_failed; + } +- finalize_op(op, msg, &result, false); +- goto done; ++ finalize_op(op, msg, false); ++ return; + + } else if (!pcmk__str_eq(op->originator, stonith_our_uname, pcmk__str_casei)) { + /* If this isn't a remote level broadcast, and we are not the +@@ -2277,7 +2272,7 @@ fenced_process_fencing_reply(xmlNode *msg) + crm_err("Received non-broadcast fencing result for operation %.8s " + "we do not own (device %s targeting %s)", + op->id, device, op->target); +- goto done; ++ return; + } + + if (pcmk_is_set(op->call_options, st_opt_topology)) { +@@ -2286,58 +2281,58 @@ fenced_process_fencing_reply(xmlNode *msg) + crm_notice("Action '%s' targeting %s using %s on behalf of %s@%s: %s%s%s%s", + op->action, op->target, device, op->client_name, + op->originator, +- pcmk_exec_status_str(result.execution_status), +- (result.exit_reason == NULL)? "" : " (", +- (result.exit_reason == NULL)? "" : result.exit_reason, +- (result.exit_reason == NULL)? "" : ")"); ++ pcmk_exec_status_str(op->result.execution_status), ++ (op->result.exit_reason == NULL)? "" : " (", ++ (op->result.exit_reason == NULL)? "" : op->result.exit_reason, ++ (op->result.exit_reason == NULL)? "" : ")"); + + /* We own the op, and it is complete. broadcast the result to all nodes + * and notify our local clients. */ + if (op->state == st_done) { +- finalize_op(op, msg, &result, false); +- goto done; ++ finalize_op(op, msg, false); ++ return; + } + +- if ((op->phase == 2) && !pcmk__result_ok(&result)) { ++ if ((op->phase == 2) && !pcmk__result_ok(&op->result)) { + /* A remapped "on" failed, but the node was already turned off + * successfully, so ignore the error and continue. + */ + crm_warn("Ignoring %s 'on' failure (%s%s%s) targeting %s " + "after successful 'off'", +- device, pcmk_exec_status_str(result.execution_status), +- (result.exit_reason == NULL)? 
"" : ": ", +- (result.exit_reason == NULL)? "" : result.exit_reason, ++ device, pcmk_exec_status_str(op->result.execution_status), ++ (op->result.exit_reason == NULL)? "" : ": ", ++ (op->result.exit_reason == NULL)? "" : op->result.exit_reason, + op->target); +- pcmk__set_result(&result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); ++ pcmk__set_result(&op->result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); + } + +- if (pcmk__result_ok(&result)) { ++ if (pcmk__result_ok(&op->result)) { + /* An operation completed successfully. Try another device if + * necessary, otherwise mark the operation as done. */ + advance_topology_device_in_level(op, device, msg); +- goto done; ++ return; + } else { + /* This device failed, time to try another topology level. If no other + * levels are available, mark this operation as failed and report results. */ + if (advance_topology_level(op, false) != pcmk_rc_ok) { + op->state = st_failed; +- finalize_op(op, msg, &result, false); +- goto done; ++ finalize_op(op, msg, false); ++ return; + } + } + +- } else if (pcmk__result_ok(&result) && (op->devices == NULL)) { ++ } else if (pcmk__result_ok(&op->result) && (op->devices == NULL)) { + crm_trace("All done for %s", op->target); + op->state = st_done; +- finalize_op(op, msg, &result, false); +- goto done; ++ finalize_op(op, msg, false); ++ return; + +- } else if ((result.execution_status == PCMK_EXEC_TIMEOUT) ++ } else if ((op->result.execution_status == PCMK_EXEC_TIMEOUT) + && (op->devices == NULL)) { + /* If the operation timed out don't bother retrying other peers. 
*/ + op->state = st_failed; +- finalize_op(op, msg, &result, false); +- goto done; ++ finalize_op(op, msg, false); ++ return; + + } else { + /* fall-through and attempt other fencing action using another peer */ +@@ -2346,10 +2341,8 @@ fenced_process_fencing_reply(xmlNode *msg) + /* Retry on failure */ + crm_trace("Next for %s on behalf of %s@%s (result was: %s)", + op->target, op->originator, op->client_name, +- pcmk_exec_status_str(result.execution_status)); +- request_peer_fencing(op, NULL, &result); +-done: +- pcmk__reset_result(&result); ++ pcmk_exec_status_str(op->result.execution_status)); ++ request_peer_fencing(op, NULL, &op->result); + } + + gboolean +-- +2.27.0 + + +From c59d062154f7c9e15e90929a20ea244d7efd7247 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Thu, 6 Jan 2022 17:11:12 -0600 +Subject: [PATCH 03/15] Refactor: fencer: drop redundant argument from + finalize_op_duplicates() + +... now that the result is in the op +--- + daemons/fenced/fenced_remote.c | 13 ++++++------- + 1 file changed, 6 insertions(+), 7 deletions(-) + +diff --git a/daemons/fenced/fenced_remote.c b/daemons/fenced/fenced_remote.c +index 30edbff890..8b496e1042 100644 +--- a/daemons/fenced/fenced_remote.c ++++ b/daemons/fenced/fenced_remote.c +@@ -468,11 +468,9 @@ handle_local_reply_and_notify(remote_fencing_op_t *op, xmlNode *data, + * + * \param[in] op Fencer operation that completed + * \param[in] data Top-level XML to add notification to +- * \param[in] result Full operation result + */ + static void +-finalize_op_duplicates(remote_fencing_op_t *op, xmlNode *data, +- pcmk__action_result_t *result) ++finalize_op_duplicates(remote_fencing_op_t *op, xmlNode *data) + { + for (GList *iter = op->duplicates; iter != NULL; iter = iter->next) { + remote_fencing_op_t *other = iter->data; +@@ -482,10 +480,11 @@ finalize_op_duplicates(remote_fencing_op_t *op, xmlNode *data, + crm_debug("Performing duplicate notification for %s@%s: %s " + CRM_XS " id=%.8s", + other->client_name, 
other->originator, +- pcmk_exec_status_str(result->execution_status), ++ pcmk_exec_status_str(op->result.execution_status), + other->id); +- pcmk__set_result(&other->result, result->exit_status, +- result->execution_status, result->exit_reason); ++ pcmk__set_result(&other->result, op->result.exit_status, ++ op->result.execution_status, ++ op->result.exit_reason); + finalize_op(other, data, true); + + } else { +@@ -606,7 +605,7 @@ finalize_op(remote_fencing_op_t *op, xmlNode *data, bool dup) + handle_local_reply_and_notify(op, data, &op->result); + + if (!dup) { +- finalize_op_duplicates(op, data, &op->result); ++ finalize_op_duplicates(op, data); + } + + /* Free non-essential parts of the record +-- +2.27.0 + + +From 6c49675855323a52a534afa112a0861ba2e3b1ad Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Thu, 6 Jan 2022 17:15:17 -0600 +Subject: [PATCH 04/15] Refactor: fencer: drop redundant argument from + fenced_broadcast_op_result() + +... now that the op includes the result +--- + daemons/fenced/fenced_history.c | 9 +++------ + daemons/fenced/fenced_remote.c | 8 +++----- + daemons/fenced/pacemaker-fenced.h | 3 +-- + 3 files changed, 7 insertions(+), 13 deletions(-) + +diff --git a/daemons/fenced/fenced_history.c b/daemons/fenced/fenced_history.c +index 0157deadb3..5cacf36ca8 100644 +--- a/daemons/fenced/fenced_history.c ++++ b/daemons/fenced/fenced_history.c +@@ -1,5 +1,5 @@ + /* +- * Copyright 2009-2021 the Pacemaker project contributors ++ * Copyright 2009-2022 the Pacemaker project contributors + * + * The version control history for this file may have further details. 
+ * +@@ -359,8 +359,6 @@ stonith_local_history_diff_and_merge(GHashTable *remote_history, + } + + if (remote_history) { +- pcmk__action_result_t result = PCMK__UNKNOWN_RESULT; +- + init_stonith_remote_op_hash_table(&stonith_remote_op_list); + + updated |= g_hash_table_size(remote_history); +@@ -378,10 +376,10 @@ stonith_local_history_diff_and_merge(GHashTable *remote_history, + /* CRM_EX_EXPIRED + PCMK_EXEC_INVALID prevents finalize_op() + * from setting a delegate + */ +- pcmk__set_result(&result, CRM_EX_EXPIRED, PCMK_EXEC_INVALID, ++ pcmk__set_result(&op->result, CRM_EX_EXPIRED, PCMK_EXEC_INVALID, + "Initiated by earlier fencer " + "process and presumed failed"); +- fenced_broadcast_op_result(op, &result, false); ++ fenced_broadcast_op_result(op, false); + } + + g_hash_table_iter_steal(&iter); +@@ -396,7 +394,6 @@ stonith_local_history_diff_and_merge(GHashTable *remote_history, + */ + } + +- pcmk__reset_result(&result); + g_hash_table_destroy(remote_history); /* remove what is left */ + } + +diff --git a/daemons/fenced/fenced_remote.c b/daemons/fenced/fenced_remote.c +index 8b496e1042..fb5a5e980e 100644 +--- a/daemons/fenced/fenced_remote.c ++++ b/daemons/fenced/fenced_remote.c +@@ -390,16 +390,14 @@ fencing_result2xml(remote_fencing_op_t *op, pcmk__action_result_t *result) + * \brief Broadcast a fence result notification to all CPG peers + * + * \param[in] op Fencer operation that completed +- * \param[in] result Full operation result + * \param[in] op_merged Whether this operation is a duplicate of another + */ + void +-fenced_broadcast_op_result(remote_fencing_op_t *op, +- pcmk__action_result_t *result, bool op_merged) ++fenced_broadcast_op_result(remote_fencing_op_t *op, bool op_merged) + { + static int count = 0; + xmlNode *bcast = create_xml_node(NULL, T_STONITH_REPLY); +- xmlNode *notify_data = fencing_result2xml(op, result); ++ xmlNode *notify_data = fencing_result2xml(op, &op->result); + + count++; + crm_trace("Broadcasting result to peers"); +@@ -581,7 
+579,7 @@ finalize_op(remote_fencing_op_t *op, xmlNode *data, bool dup) + subt = crm_element_value(data, F_SUBTYPE); + if (!dup && !pcmk__str_eq(subt, "broadcast", pcmk__str_casei)) { + /* Defer notification until the bcast message arrives */ +- fenced_broadcast_op_result(op, &op->result, op_merged); ++ fenced_broadcast_op_result(op, op_merged); + free_xml(local_data); + return; + } +diff --git a/daemons/fenced/pacemaker-fenced.h b/daemons/fenced/pacemaker-fenced.h +index 1a5c933ea7..6213407da3 100644 +--- a/daemons/fenced/pacemaker-fenced.h ++++ b/daemons/fenced/pacemaker-fenced.h +@@ -155,8 +155,7 @@ typedef struct remote_fencing_op_s { + pcmk__action_result_t result; + } remote_fencing_op_t; + +-void fenced_broadcast_op_result(remote_fencing_op_t *op, +- pcmk__action_result_t *result, bool op_merged); ++void fenced_broadcast_op_result(remote_fencing_op_t *op, bool op_merged); + + // Fencer-specific client flags + enum st_client_flags { +-- +2.27.0 + + +From 73994fc740b8833457b130368db479502d49f285 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Thu, 6 Jan 2022 17:17:33 -0600 +Subject: [PATCH 05/15] Refactor: fencer: drop redundant argument from + handle_local_reply_and_notify() + +... 
now that the op includes the result +--- + daemons/fenced/fenced_remote.c | 12 +++++------- + 1 file changed, 5 insertions(+), 7 deletions(-) + +diff --git a/daemons/fenced/fenced_remote.c b/daemons/fenced/fenced_remote.c +index fb5a5e980e..2621cb2f19 100644 +--- a/daemons/fenced/fenced_remote.c ++++ b/daemons/fenced/fenced_remote.c +@@ -424,11 +424,9 @@ fenced_broadcast_op_result(remote_fencing_op_t *op, bool op_merged) + * + * \param[in] op Fencer operation that completed + * \param[in] data Top-level XML to add notification to +- * \param[in] result Full operation result + */ + static void +-handle_local_reply_and_notify(remote_fencing_op_t *op, xmlNode *data, +- pcmk__action_result_t *result) ++handle_local_reply_and_notify(remote_fencing_op_t *op, xmlNode *data) + { + xmlNode *notify_data = NULL; + xmlNode *reply = NULL; +@@ -443,15 +441,15 @@ handle_local_reply_and_notify(remote_fencing_op_t *op, xmlNode *data, + crm_xml_add(data, F_STONITH_TARGET, op->target); + crm_xml_add(data, F_STONITH_OPERATION, op->action); + +- reply = fenced_construct_reply(op->request, data, result); ++ reply = fenced_construct_reply(op->request, data, &op->result); + crm_xml_add(reply, F_STONITH_DELEGATE, op->delegate); + + /* Send fencing OP reply to local client that initiated fencing */ + do_local_reply(reply, op->client_id, op->call_options & st_opt_sync_call, FALSE); + + /* bcast to all local clients that the fencing operation happend */ +- notify_data = fencing_result2xml(op, result); +- fenced_send_notification(T_STONITH_NOTIFY_FENCE, result, notify_data); ++ notify_data = fencing_result2xml(op, &op->result); ++ fenced_send_notification(T_STONITH_NOTIFY_FENCE, &op->result, notify_data); + free_xml(notify_data); + fenced_send_notification(T_STONITH_NOTIFY_HISTORY, NULL, NULL); + +@@ -600,7 +598,7 @@ finalize_op(remote_fencing_op_t *op, xmlNode *data, bool dup) + ((op->result.exit_reason == NULL)? 
"" : op->result.exit_reason), + op->id); + +- handle_local_reply_and_notify(op, data, &op->result); ++ handle_local_reply_and_notify(op, data); + + if (!dup) { + finalize_op_duplicates(op, data); +-- +2.27.0 + + +From 194056d18d3b550d3a53b94d558ceed03b5e5442 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Thu, 6 Jan 2022 17:18:27 -0600 +Subject: [PATCH 06/15] Refactor: fencer: drop redundant argument from + fencing_result2xml() + +... now that the op includes the result +--- + daemons/fenced/fenced_remote.c | 9 ++++----- + 1 file changed, 4 insertions(+), 5 deletions(-) + +diff --git a/daemons/fenced/fenced_remote.c b/daemons/fenced/fenced_remote.c +index 2621cb2f19..8d4f53eef6 100644 +--- a/daemons/fenced/fenced_remote.c ++++ b/daemons/fenced/fenced_remote.c +@@ -362,13 +362,12 @@ undo_op_remap(remote_fencing_op_t *op) + * \brief Create notification data XML for a fencing operation result + * + * \param[in] op Fencer operation that completed +- * \param[in] result Full operation result + * + * \return Newly created XML to add as notification data + * \note The caller is responsible for freeing the result. 
+ */ + static xmlNode * +-fencing_result2xml(remote_fencing_op_t *op, pcmk__action_result_t *result) ++fencing_result2xml(remote_fencing_op_t *op) + { + xmlNode *notify_data = create_xml_node(NULL, T_STONITH_NOTIFY_FENCE); + +@@ -381,7 +380,7 @@ fencing_result2xml(remote_fencing_op_t *op, pcmk__action_result_t *result) + crm_xml_add(notify_data, F_STONITH_CLIENTID, op->client_id); + crm_xml_add(notify_data, F_STONITH_CLIENTNAME, op->client_name); + +- stonith__xe_set_result(notify_data, result); ++ stonith__xe_set_result(notify_data, &op->result); + return notify_data; + } + +@@ -397,7 +396,7 @@ fenced_broadcast_op_result(remote_fencing_op_t *op, bool op_merged) + { + static int count = 0; + xmlNode *bcast = create_xml_node(NULL, T_STONITH_REPLY); +- xmlNode *notify_data = fencing_result2xml(op, &op->result); ++ xmlNode *notify_data = fencing_result2xml(op); + + count++; + crm_trace("Broadcasting result to peers"); +@@ -448,7 +447,7 @@ handle_local_reply_and_notify(remote_fencing_op_t *op, xmlNode *data) + do_local_reply(reply, op->client_id, op->call_options & st_opt_sync_call, FALSE); + + /* bcast to all local clients that the fencing operation happend */ +- notify_data = fencing_result2xml(op, &op->result); ++ notify_data = fencing_result2xml(op); + fenced_send_notification(T_STONITH_NOTIFY_FENCE, &op->result, notify_data); + free_xml(notify_data); + fenced_send_notification(T_STONITH_NOTIFY_HISTORY, NULL, NULL); +-- +2.27.0 + + +From c5d38cb201a1219ca95127cba9c3a778e31966a2 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Thu, 6 Jan 2022 17:35:43 -0600 +Subject: [PATCH 07/15] Refactor: fencer: drop redundant argument from + request_peer_fencing() + +... 
now that the op includes the result +--- + daemons/fenced/fenced_remote.c | 66 +++++++++++++--------------------- + 1 file changed, 25 insertions(+), 41 deletions(-) + +diff --git a/daemons/fenced/fenced_remote.c b/daemons/fenced/fenced_remote.c +index 8d4f53eef6..7fb7695fba 100644 +--- a/daemons/fenced/fenced_remote.c ++++ b/daemons/fenced/fenced_remote.c +@@ -80,8 +80,7 @@ extern xmlNode *stonith_create_op(int call_id, const char *token, const char *op + int call_options); + + static void request_peer_fencing(remote_fencing_op_t *op, +- peer_device_info_t *peer, +- pcmk__action_result_t *result); ++ peer_device_info_t *peer); + static void finalize_op(remote_fencing_op_t *op, xmlNode *data, bool dup); + static void report_timeout_period(remote_fencing_op_t * op, int op_timeout); + static int get_op_total_timeout(const remote_fencing_op_t *op, +@@ -646,18 +645,16 @@ static gboolean + remote_op_timeout_one(gpointer userdata) + { + remote_fencing_op_t *op = userdata; +- pcmk__action_result_t result = PCMK__UNKNOWN_RESULT; + + op->op_timer_one = 0; + + crm_notice("Peer's '%s' action targeting %s for client %s timed out " CRM_XS + " id=%.8s", op->action, op->target, op->client_name, op->id); +- pcmk__set_result(&result, CRM_EX_ERROR, PCMK_EXEC_TIMEOUT, ++ pcmk__set_result(&op->result, CRM_EX_ERROR, PCMK_EXEC_TIMEOUT, + "Peer did not return fence result within timeout"); + +- + // Try another device, if appropriate +- request_peer_fencing(op, NULL, &result); ++ request_peer_fencing(op, NULL); + return FALSE; + } + +@@ -730,13 +727,10 @@ remote_op_query_timeout(gpointer data) + crm_debug("Operation %.8s targeting %s already in progress", + op->id, op->target); + } else if (op->query_results) { +- // Result won't be used in this case, but we need to pass something +- pcmk__action_result_t result = PCMK__UNKNOWN_RESULT; +- + // Query succeeded, so attempt the actual fencing + crm_debug("Query %.8s targeting %s complete (state=%s)", + op->id, op->target, 
stonith_op_state_str(op->state)); +- request_peer_fencing(op, NULL, &result); ++ request_peer_fencing(op, NULL); + } else { + crm_debug("Query %.8s targeting %s timed out (state=%s)", + op->id, op->target, stonith_op_state_str(op->state)); +@@ -1622,11 +1616,10 @@ advance_topology_device_in_level(remote_fencing_op_t *op, const char *device, + op_phase_on(op); + } + +- if (op->devices) { +- pcmk__action_result_t result = PCMK__UNKNOWN_RESULT; +- +- pcmk__set_result(&result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); ++ // This function is only called if the previous device succeeded ++ pcmk__set_result(&op->result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); + ++ if (op->devices) { + /* Necessary devices remain, so execute the next one */ + crm_trace("Next targeting %s on behalf of %s@%s", + op->target, op->client_name, op->originator); +@@ -1636,13 +1629,12 @@ advance_topology_device_in_level(remote_fencing_op_t *op, const char *device, + op->delay = 0; + } + +- request_peer_fencing(op, NULL, &result); ++ request_peer_fencing(op, NULL); + } else { + /* We're done with all devices and phases, so finalize operation */ + crm_trace("Marking complex fencing op targeting %s as complete", + op->target); + op->state = st_done; +- pcmk__set_result(&op->result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); + finalize_op(op, msg, false); + } + } +@@ -1673,13 +1665,9 @@ check_watchdog_fencing_and_wait(remote_fencing_op_t * op) + * \param[in] op Fencing operation to be executed + * \param[in] peer If NULL or topology is in use, choose best peer to execute + * the fencing, otherwise use this peer +- * \param[in] result Full result of previous failed attempt, if any (used as +- * final result only if a previous attempt failed, topology +- * is not in use, and no devices remain to be attempted) + */ + static void +-request_peer_fencing(remote_fencing_op_t *op, peer_device_info_t *peer, +- pcmk__action_result_t *result) ++request_peer_fencing(remote_fencing_op_t *op, peer_device_info_t *peer) + { + const char 
*device = NULL; + int timeout; +@@ -1822,27 +1810,26 @@ request_peer_fencing(remote_fencing_op_t *op, peer_device_info_t *peer, + } + } + +- // This is the only case in which result will be used +- CRM_CHECK(result != NULL, return); +- + if (op->state == st_query) { + crm_info("No peers (out of %d) have devices capable of fencing " + "(%s) %s for client %s " CRM_XS " state=%s", + op->replies, op->action, op->target, op->client_name, + stonith_op_state_str(op->state)); + +- pcmk__reset_result(result); +- pcmk__set_result(result, CRM_EX_ERROR, PCMK_EXEC_NO_FENCE_DEVICE, +- NULL); ++ pcmk__reset_result(&op->result); ++ pcmk__set_result(&op->result, CRM_EX_ERROR, ++ PCMK_EXEC_NO_FENCE_DEVICE, NULL); + } else { + if (pcmk_is_set(op->call_options, st_opt_topology)) { +- pcmk__reset_result(result); +- pcmk__set_result(result, CRM_EX_ERROR, ++ pcmk__reset_result(&op->result); ++ pcmk__set_result(&op->result, CRM_EX_ERROR, + PCMK_EXEC_NO_FENCE_DEVICE, NULL); + } +- /* ... else use result provided by caller -- overwriting it with +- PCMK_EXEC_NO_FENCE_DEVICE would prevent finalize_op() from +- setting the correct delegate if needed. ++ /* ... else use existing result from previous failed attempt ++ * (topology is not in use, and no devices remain to be attempted). ++ * Overwriting the result with PCMK_EXEC_NO_FENCE_DEVICE would ++ * prevent finalize_op() from setting the correct delegate if ++ * needed. 
+ */ + + crm_info("No peers (out of %d) are capable of fencing (%s) %s " +@@ -1852,8 +1839,6 @@ request_peer_fencing(remote_fencing_op_t *op, peer_device_info_t *peer, + } + + op->state = st_failed; +- pcmk__set_result(&op->result, result->exit_status, +- result->execution_status, result->exit_reason); + finalize_op(op, NULL, false); + + } else { +@@ -2104,7 +2089,6 @@ process_remote_stonith_query(xmlNode * msg) + peer_device_info_t *peer = NULL; + uint32_t replies_expected; + xmlNode *dev = get_xpath_object("//@" F_STONITH_REMOTE_OP_ID, msg, LOG_ERR); +- pcmk__action_result_t result = PCMK__UNKNOWN_RESULT; + + CRM_CHECK(dev != NULL, return -EPROTO); + +@@ -2139,7 +2123,7 @@ process_remote_stonith_query(xmlNode * msg) + peer = add_result(op, host, ndevices, dev); + } + +- pcmk__set_result(&result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); ++ pcmk__set_result(&op->result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); + + if (pcmk_is_set(op->call_options, st_opt_topology)) { + /* If we start the fencing before all the topology results are in, +@@ -2148,12 +2132,12 @@ process_remote_stonith_query(xmlNode * msg) + if (op->state == st_query && all_topology_devices_found(op)) { + /* All the query results are in for the topology, start the fencing ops. 
*/ + crm_trace("All topology devices found"); +- request_peer_fencing(op, peer, &result); ++ request_peer_fencing(op, peer); + + } else if (have_all_replies) { + crm_info("All topology query replies have arrived, continuing (%d expected/%d received) ", + replies_expected, op->replies); +- request_peer_fencing(op, NULL, &result); ++ request_peer_fencing(op, NULL); + } + + } else if (op->state == st_query) { +@@ -2165,12 +2149,12 @@ process_remote_stonith_query(xmlNode * msg) + /* we have a verified device living on a peer that is not the target */ + crm_trace("Found %d verified device%s", + nverified, pcmk__plural_s(nverified)); +- request_peer_fencing(op, peer, &result); ++ request_peer_fencing(op, peer); + + } else if (have_all_replies) { + crm_info("All query replies have arrived, continuing (%d expected/%d received) ", + replies_expected, op->replies); +- request_peer_fencing(op, NULL, &result); ++ request_peer_fencing(op, NULL); + + } else { + crm_trace("Waiting for more peer results before launching fencing operation"); +@@ -2336,7 +2320,7 @@ fenced_process_fencing_reply(xmlNode *msg) + crm_trace("Next for %s on behalf of %s@%s (result was: %s)", + op->target, op->originator, op->client_name, + pcmk_exec_status_str(op->result.execution_status)); +- request_peer_fencing(op, NULL, &op->result); ++ request_peer_fencing(op, NULL); + } + + gboolean +-- +2.27.0 + + +From be0a0b652c13161a82b05d3104449b7bfc06e8ac Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Thu, 6 Jan 2022 17:56:24 -0600 +Subject: [PATCH 08/15] Feature: fencer: track full result in fencing history + +Add fencing operation results when creating XML in +stonith_local_history_diff_and_merge(), and parse the results from the received +XML in stonith_xml_history_to_list(). + +With this, the fencer now always has full results in its op list, and returns +them in the reply for STONITH_OP_FENCE_HISTORY requests (though nothing uses +that as of this commit). 
+--- + daemons/fenced/fenced_history.c | 2 ++ + 1 file changed, 2 insertions(+) + +diff --git a/daemons/fenced/fenced_history.c b/daemons/fenced/fenced_history.c +index 5cacf36ca8..3ebf016e67 100644 +--- a/daemons/fenced/fenced_history.c ++++ b/daemons/fenced/fenced_history.c +@@ -257,6 +257,7 @@ stonith_xml_history_to_list(xmlNode *history) + op->completed_nsec = completed_nsec; + crm_element_value_int(xml_op, F_STONITH_STATE, &state); + op->state = (enum op_state) state; ++ stonith__xe_get_result(xml_op, &op->result); + + g_hash_table_replace(rv, id, op); + CRM_LOG_ASSERT(g_hash_table_lookup(rv, id) != NULL); +@@ -355,6 +356,7 @@ stonith_local_history_diff_and_merge(GHashTable *remote_history, + crm_xml_add_ll(entry, F_STONITH_DATE, op->completed); + crm_xml_add_ll(entry, F_STONITH_DATE_NSEC, op->completed_nsec); + crm_xml_add_int(entry, F_STONITH_STATE, op->state); ++ stonith__xe_set_result(entry, &op->result); + } + } + +-- +2.27.0 + + +From afc5292036e212bcfc7475893e0b326b2a69ac58 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Wed, 22 Dec 2021 17:17:21 -0600 +Subject: [PATCH 09/15] API: libstonithd: add exit_reason member to + stonith_history_t + +not yet used, but will be +--- + include/crm/stonith-ng.h | 3 ++- + lib/fencing/st_client.c | 3 ++- + 2 files changed, 4 insertions(+), 2 deletions(-) + +diff --git a/include/crm/stonith-ng.h b/include/crm/stonith-ng.h +index 3fe9cf54f8..2c79bfa579 100644 +--- a/include/crm/stonith-ng.h ++++ b/include/crm/stonith-ng.h +@@ -1,5 +1,5 @@ + /* +- * Copyright 2004-2021 the Pacemaker project contributors ++ * Copyright 2004-2022 the Pacemaker project contributors + * + * The version control history for this file may have further details. 
+ * +@@ -111,6 +111,7 @@ typedef struct stonith_history_s { + time_t completed; + struct stonith_history_s *next; + long completed_nsec; ++ char *exit_reason; + } stonith_history_t; + + typedef struct stonith_s stonith_t; +diff --git a/lib/fencing/st_client.c b/lib/fencing/st_client.c +index 718739b321..57a2e03361 100644 +--- a/lib/fencing/st_client.c ++++ b/lib/fencing/st_client.c +@@ -1,5 +1,5 @@ + /* +- * Copyright 2004-2021 the Pacemaker project contributors ++ * Copyright 2004-2022 the Pacemaker project contributors + * + * The version control history for this file may have further details. + * +@@ -735,6 +735,7 @@ void stonith_history_free(stonith_history_t *history) + free(hp->origin); + free(hp->delegate); + free(hp->client); ++ free(hp->exit_reason); + } + } + +-- +2.27.0 + + +From 1b9e2896322849002a5c0a3a34c9375ea32571d6 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Thu, 6 Jan 2022 18:04:15 -0600 +Subject: [PATCH 10/15] Feature: fencing: return exit reason with fencing + history + +libstonithd's stonith_t:cmds->history() method now parses exit reasons from the +fencer reply, and returns them in the stonith_history_t results. 
+--- + lib/fencing/st_client.c | 6 ++++++ + 1 file changed, 6 insertions(+) + +diff --git a/lib/fencing/st_client.c b/lib/fencing/st_client.c +index 57a2e03361..d229b34805 100644 +--- a/lib/fencing/st_client.c ++++ b/lib/fencing/st_client.c +@@ -698,6 +698,7 @@ stonith_api_history(stonith_t * stonith, int call_options, const char *node, + stonith_history_t *kvp; + long long completed; + long long completed_nsec = 0L; ++ pcmk__action_result_t result = PCMK__UNKNOWN_RESULT; + + kvp = calloc(1, sizeof(stonith_history_t)); + kvp->target = crm_element_value_copy(op, F_STONITH_TARGET); +@@ -711,6 +712,11 @@ stonith_api_history(stonith_t * stonith, int call_options, const char *node, + kvp->completed_nsec = completed_nsec; + crm_element_value_int(op, F_STONITH_STATE, &kvp->state); + ++ stonith__xe_get_result(op, &result); ++ kvp->exit_reason = result.exit_reason; ++ result.exit_reason = NULL; ++ pcmk__reset_result(&result); ++ + if (last) { + last->next = kvp; + } else { +-- +2.27.0 + + +From ba4e77242e9be4ebeb2843b444ee4afad43c29f3 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Fri, 7 Jan 2022 09:44:39 -0600 +Subject: [PATCH 11/15] Feature: fencing: display exit reasons with failed + fencing events + +... when available +--- + lib/fencing/st_output.c | 20 ++++++++++++++++---- + tools/crm_mon_curses.c | 9 +++++++-- + 2 files changed, 23 insertions(+), 6 deletions(-) + +diff --git a/lib/fencing/st_output.c b/lib/fencing/st_output.c +index e484278867..18924d795d 100644 +--- a/lib/fencing/st_output.c ++++ b/lib/fencing/st_output.c +@@ -1,5 +1,5 @@ + /* +- * Copyright 2019-2021 the Pacemaker project contributors ++ * Copyright 2019-2022 the Pacemaker project contributors + * + * The version control history for this file may have further details. 
+ * +@@ -11,6 +11,7 @@ + #include + + #include ++#include + #include + #include + #include +@@ -263,8 +264,12 @@ stonith_event_html(pcmk__output_t *out, va_list args) { + char *failed_s = time_t_string(event->completed); + + out->list_item(out, "failed-stonith-event", +- "%s of %s failed : delegate=%s, client=%s, origin=%s, %s='%s' %s", ++ "%s of %s failed%s%s%s: " ++ "delegate=%s, client=%s, origin=%s, %s='%s' %s", + stonith_action_str(event->action), event->target, ++ (event->exit_reason == NULL)? "" : " (", ++ (event->exit_reason == NULL)? "" : event->exit_reason, ++ (event->exit_reason == NULL)? "" : ")", + event->delegate ? event->delegate : "", + event->client, event->origin, + full_history ? "completed" : "last-failed", +@@ -296,8 +301,13 @@ stonith_event_text(pcmk__output_t *out, va_list args) { + + switch (event->state) { + case st_failed: +- pcmk__indented_printf(out, "%s of %s failed: delegate=%s, client=%s, origin=%s, %s='%s' %s\n", ++ pcmk__indented_printf(out, ++ "%s of %s failed%s%s%s: " ++ "delegate=%s, client=%s, origin=%s, %s='%s' %s\n", + stonith_action_str(event->action), event->target, ++ (event->exit_reason == NULL)? "" : " (", ++ (event->exit_reason == NULL)? "" : event->exit_reason, ++ (event->exit_reason == NULL)? "" : ")", + event->delegate ? event->delegate : "", + event->client, event->origin, + full_history ? 
"completed" : "last-failed", buf, +@@ -341,7 +351,9 @@ stonith_event_xml(pcmk__output_t *out, va_list args) { + + switch (event->state) { + case st_failed: +- crm_xml_add(node, "status", "failed"); ++ pcmk__xe_set_props(node, "status", "failed", ++ XML_LRM_ATTR_EXIT_REASON, event->exit_reason, ++ NULL); + break; + + case st_done: +diff --git a/tools/crm_mon_curses.c b/tools/crm_mon_curses.c +index bae3710c44..73c8516a8c 100644 +--- a/tools/crm_mon_curses.c ++++ b/tools/crm_mon_curses.c +@@ -1,5 +1,5 @@ + /* +- * Copyright 2019-2021 the Pacemaker project contributors ++ * Copyright 2019-2022 the Pacemaker project contributors + * + * The version control history for this file may have further details. + * +@@ -463,8 +463,13 @@ stonith_event_console(pcmk__output_t *out, va_list args) { + + switch (event->state) { + case st_failed: +- curses_indented_printf(out, "%s of %s failed: delegate=%s, client=%s, origin=%s, %s='%s'%s\n", ++ curses_indented_printf(out, ++ "%s of %s failed%s%s%s: " ++ "delegate=%s, client=%s, origin=%s, %s='%s' %s\n", + stonith_action_str(event->action), event->target, ++ (event->exit_reason == NULL)? "" : " (", ++ (event->exit_reason == NULL)? "" : event->exit_reason, ++ (event->exit_reason == NULL)? "" : ")", + event->delegate ? event->delegate : "", + event->client, event->origin, + full_history ? 
"completed" : "last-failed", buf, +-- +2.27.0 + + +From 8105fb4a3a786780fdf85b3d0308eaf6df1ea434 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Fri, 7 Jan 2022 09:45:22 -0600 +Subject: [PATCH 12/15] Low: schemas: copy fence-event API schema in + preparation for changes + +--- + include/crm/common/output_internal.h | 2 +- + xml/api/fence-event-2.15.rng | 33 ++++++++++++++++++++++++++++ + 2 files changed, 34 insertions(+), 1 deletion(-) + create mode 100644 xml/api/fence-event-2.15.rng + +diff --git a/include/crm/common/output_internal.h b/include/crm/common/output_internal.h +index 479f0e4b43..8c5dcee17c 100644 +--- a/include/crm/common/output_internal.h ++++ b/include/crm/common/output_internal.h +@@ -27,7 +27,7 @@ extern "C" { + # include + # include + +-# define PCMK__API_VERSION "2.14" ++# define PCMK__API_VERSION "2.15" + + #if defined(PCMK__WITH_ATTRIBUTE_OUTPUT_ARGS) + # define PCMK__OUTPUT_ARGS(ARGS...) __attribute__((output_args(ARGS))) +diff --git a/xml/api/fence-event-2.15.rng b/xml/api/fence-event-2.15.rng +new file mode 100644 +index 0000000000..e54687cd25 +--- /dev/null ++++ b/xml/api/fence-event-2.15.rng +@@ -0,0 +1,33 @@ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ failed ++ success ++ pending ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ +-- +2.27.0 + + +From 46dd9b74d2ee8f7ab70a0c7fe3a998954d4029e8 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Fri, 7 Jan 2022 09:47:16 -0600 +Subject: [PATCH 13/15] Low: schemas: update fence-event API schema for recent + change + +--- + xml/api/fence-event-2.15.rng | 3 +++ + 1 file changed, 3 insertions(+) + +diff --git a/xml/api/fence-event-2.15.rng b/xml/api/fence-event-2.15.rng +index e54687cd25..8e000cafa5 100644 +--- a/xml/api/fence-event-2.15.rng ++++ b/xml/api/fence-event-2.15.rng +@@ -18,6 +18,9 @@ + + + ++ ++ ++ + + + +-- +2.27.0 + + +From 350e71772f67f28af6b67f864cbabc481730035c Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Fri, 7 Jan 2022 11:32:09 -0600 +Subject: [PATCH 14/15] Build: 
libstonithd: bump shared library version + +... for stonith_history_t change since 2.1.2. + +The struct should only ever be returned by the library as a pointer, so the +changes can be considered backward-compatible. Normally we wouldn't bump shared +library versions mid-cycle, but this will simplify expected backports of this +change. +--- + lib/fencing/Makefile.am | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/lib/fencing/Makefile.am b/lib/fencing/Makefile.am +index 1ffa3e051b..a10ddb88ec 100644 +--- a/lib/fencing/Makefile.am ++++ b/lib/fencing/Makefile.am +@@ -2,7 +2,7 @@ + # Original Author: Sun Jiang Dong + # Copyright 2004 International Business Machines + # +-# with later changes copyright 2004-2021 the Pacemaker project contributors. ++# with later changes copyright 2004-2022 the Pacemaker project contributors. + # The version control history for this file may have further details. + # + # This source code is licensed under the GNU General Public License version 2 +@@ -14,7 +14,7 @@ noinst_HEADERS = fencing_private.h + + lib_LTLIBRARIES = libstonithd.la + +-libstonithd_la_LDFLAGS = -version-info 33:0:7 ++libstonithd_la_LDFLAGS = -version-info 34:0:8 + + libstonithd_la_CFLAGS = $(CFLAGS_HARDENED_LIB) + libstonithd_la_LDFLAGS += $(LDFLAGS_HARDENED_LIB) +-- +2.27.0 + + +From 63ea88620a62ff0759560a02bb5e284ebdd03eb6 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Wed, 19 Jan 2022 16:53:45 -0600 +Subject: [PATCH 15/15] Low: fencer: reset op result before grabbing new one + +just in case +--- + daemons/fenced/fenced_remote.c | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/daemons/fenced/fenced_remote.c b/daemons/fenced/fenced_remote.c +index 7fb7695fba..dc4649e0fc 100644 +--- a/daemons/fenced/fenced_remote.c ++++ b/daemons/fenced/fenced_remote.c +@@ -2219,6 +2219,7 @@ fenced_process_fencing_reply(xmlNode *msg) + return; + } + ++ pcmk__reset_result(&op->result); + op->result = result; // The operation takes ownership of the 
result + + if (op->devices && device && !pcmk__str_eq(op->devices->data, device, pcmk__str_casei)) { +-- +2.27.0 + diff --git a/SOURCES/023-memory-leak.patch b/SOURCES/023-memory-leak.patch new file mode 100644 index 0000000..3970dd3 --- /dev/null +++ b/SOURCES/023-memory-leak.patch @@ -0,0 +1,82 @@ +From 8034a203bbff0aa3b53f2946dc58e409bd7246c9 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Thu, 20 Jan 2022 15:03:31 -0600 +Subject: [PATCH] Fix: scheduler: avoid memory leak when displaying clones + +Previously, pe__clone_default() unconditionally created a hash table for +stopped instances, but didn't free it in every code path. + +Now, only create the table when we have something to put in it and might +actually use it, and ensure it always gets freed. +--- + lib/pengine/clone.c | 18 +++++++++++++----- + 1 file changed, 13 insertions(+), 5 deletions(-) + +diff --git a/lib/pengine/clone.c b/lib/pengine/clone.c +index 742e2920b0..920a04c32c 100644 +--- a/lib/pengine/clone.c ++++ b/lib/pengine/clone.c +@@ -761,7 +761,7 @@ pe__clone_default(pcmk__output_t *out, va_list args) + GList *only_node = va_arg(args, GList *); + GList *only_rsc = va_arg(args, GList *); + +- GHashTable *stopped = pcmk__strkey_table(free, free); ++ GHashTable *stopped = NULL; + + char *list_text = NULL; + size_t list_text_len = 0; +@@ -818,7 +818,11 @@ pe__clone_default(pcmk__output_t *out, va_list args) + } else if (partially_active == FALSE) { + // List stopped instances when requested (except orphans) + if (!pcmk_is_set(child_rsc->flags, pe_rsc_orphan) ++ && !pcmk_is_set(show_opts, pcmk_show_clone_detail) + && pcmk_is_set(show_opts, pcmk_show_inactive_rscs)) { ++ if (stopped == NULL) { ++ stopped = pcmk__strkey_table(free, free); ++ } + g_hash_table_insert(stopped, strdup(child_rsc->id), strdup("Stopped")); + } + +@@ -873,7 +877,6 @@ pe__clone_default(pcmk__output_t *out, va_list args) + } + + if (pcmk_is_set(show_opts, pcmk_show_clone_detail)) { +- g_hash_table_destroy(stopped); + 
PCMK__OUTPUT_LIST_FOOTER(out, rc); + return pcmk_rc_ok; + } +@@ -948,8 +951,10 @@ pe__clone_default(pcmk__output_t *out, va_list args) + GList *list = g_hash_table_get_values(rsc->allowed_nodes); + + /* Custom stopped table for non-unique clones */ +- g_hash_table_destroy(stopped); +- stopped = pcmk__strkey_table(free, free); ++ if (stopped != NULL) { ++ g_hash_table_destroy(stopped); ++ stopped = NULL; ++ } + + if (list == NULL) { + /* Clusters with symmetrical=false haven't calculated allowed_nodes yet +@@ -972,6 +977,9 @@ pe__clone_default(pcmk__output_t *out, va_list args) + state = "Stopped (disabled)"; + } + ++ if (stopped == NULL) { ++ stopped = pcmk__strkey_table(free, free); ++ } + if (probe_op != NULL) { + int rc; + +@@ -987,7 +995,7 @@ pe__clone_default(pcmk__output_t *out, va_list args) + g_list_free(list); + } + +- if (g_hash_table_size(stopped) > 0) { ++ if (stopped != NULL) { + GList *list = sorted_hash_table_values(stopped); + + clone_header(out, &rc, rsc, clone_data); +-- +2.27.0 + diff --git a/SOURCES/024-daemon-tracking.patch b/SOURCES/024-daemon-tracking.patch new file mode 100644 index 0000000..d9e15e2 --- /dev/null +++ b/SOURCES/024-daemon-tracking.patch @@ -0,0 +1,108 @@ +From ac92690d8426ec4d1c8be1e0eb4b9289411afe75 Mon Sep 17 00:00:00 2001 +From: Klaus Wenninger +Date: Mon, 24 Jan 2022 12:18:42 +0100 +Subject: [PATCH] Fix: pacemakerd: have signal-handler take care of lost + processes + +regression from introduction of periodic subdaemon checking +in cases they are pacemakerd children - previously it was either +periodic checking or signal-handler per process. 
+--- + daemons/pacemakerd/pcmkd_subdaemons.c | 38 ++++++++++++++++----------- + 1 file changed, 22 insertions(+), 16 deletions(-) + +diff --git a/daemons/pacemakerd/pcmkd_subdaemons.c b/daemons/pacemakerd/pcmkd_subdaemons.c +index c03903c99e..84ecdc1ee8 100644 +--- a/daemons/pacemakerd/pcmkd_subdaemons.c ++++ b/daemons/pacemakerd/pcmkd_subdaemons.c +@@ -141,7 +141,6 @@ check_active_before_startup_processes(gpointer user_data) + switch (rc) { + case pcmk_rc_ok: + pcmk_children[next_child].check_count = 0; +- next_child++; + subdaemon_check_progress = time(NULL); + break; + case pcmk_rc_ipc_pid_only: // This case: it was previously OK +@@ -178,9 +177,27 @@ check_active_before_startup_processes(gpointer user_data) + /* go to the next child and see if + we can make progress there + */ +- next_child++; + break; + case pcmk_rc_ipc_unresponsive: ++ if (!pcmk_children[next_child].respawn) { ++ /* if a subdaemon is down and we don't want it ++ to be restarted this is a success during ++ shutdown. if it isn't restarted anymore ++ due to MAX_RESPAWN it is ++ rather no success. ++ */ ++ if (pcmk_children[next_child].respawn_count <= MAX_RESPAWN) { ++ subdaemon_check_progress = time(NULL); ++ } ++ } ++ if (!pcmk_children[next_child].active_before_startup) { ++ crm_trace("found %s[%lld] missing - signal-handler " ++ "will take care of it", ++ pcmk_children[next_child].name, ++ (long long) PCMK__SPECIAL_PID_AS_0( ++ pcmk_children[next_child].pid)); ++ break; ++ } + if (pcmk_children[next_child].respawn) { + crm_err("%s[%lld] terminated", + pcmk_children[next_child].name, +@@ -194,24 +211,13 @@ check_active_before_startup_processes(gpointer user_data) + pcmk_children[next_child].pid)); + } + pcmk_process_exit(&(pcmk_children[next_child])); +- if (!pcmk_children[next_child].respawn) { +- /* if a subdaemon is down and we don't want it +- to be restarted this is a success during +- shutdown. if it isn't restarted anymore +- due to MAX_RESPAWN it is +- rather no success. 
+- */ +- if (pcmk_children[next_child].respawn_count <= MAX_RESPAWN) { +- subdaemon_check_progress = time(NULL); +- } +- next_child++; +- } + break; + default: + crm_exit(CRM_EX_FATAL); + break; /* static analysis/noreturn */ + } + ++ next_child++; + if (next_child >= PCMK__NELEM(pcmk_children)) { + next_child = 0; + } +@@ -285,6 +291,7 @@ pcmk_process_exit(pcmk_child_t * child) + { + child->pid = 0; + child->active_before_startup = false; ++ child->check_count = 0; + + child->respawn_count += 1; + if (child->respawn_count > MAX_RESPAWN) { +@@ -307,8 +314,6 @@ pcmk_process_exit(pcmk_child_t * child) + crm_warn("One-off suppressing strict respawning of a child process %s," + " appears alright per %s IPC end-point", + child->name, child->endpoint); +- /* need to monitor how it evolves, and start new process if badly */ +- child->active_before_startup = true; + + } else { + if (child->needs_cluster && !pcmkd_cluster_connected()) { +@@ -422,6 +427,7 @@ start_child(pcmk_child_t * child) + const char *env_callgrind = getenv("PCMK_callgrind_enabled"); + + child->active_before_startup = false; ++ child->check_count = 0; + + if (child->command == NULL) { + crm_info("Nothing to do for child \"%s\"", child->name); +-- +2.27.0 + diff --git a/SOURCES/025-regression.patch b/SOURCES/025-regression.patch new file mode 100644 index 0000000..62d2a46 --- /dev/null +++ b/SOURCES/025-regression.patch @@ -0,0 +1,30 @@ +From 16928cfc69136bc56b1574bee9966e0d5de73abd Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Wed, 26 Jan 2022 09:15:43 -0600 +Subject: [PATCH] Fix: controller: correctly match "node down" events + +regression introduced in 2.1.2 by 03ce7376e + +The symptom that led to this was that removing a remote node connection +resource would lead to the remote node getting fenced when the connection stop +was not recognized as an expected down event. 
+--- + daemons/controld/controld_te_events.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/daemons/controld/controld_te_events.c b/daemons/controld/controld_te_events.c +index 36fd832ba0..1fd7129922 100644 +--- a/daemons/controld/controld_te_events.c ++++ b/daemons/controld/controld_te_events.c +@@ -304,7 +304,7 @@ match_down_event(const char *target) + gIter2 = gIter2->next) { + + match = (crm_action_t*)gIter2->data; +- if (pcmk_is_set(match->flags, pcmk__graph_action_confirmed)) { ++ if (pcmk_is_set(match->flags, pcmk__graph_action_executed)) { + xpath_ret = xpath_search(match->xml, xpath); + if (numXpathResults(xpath_ret) < 1) { + match = NULL; +-- +2.27.0 + diff --git a/SPECS/pacemaker.spec b/SPECS/pacemaker.spec index d4c1b5b..26b7bf4 100644 --- a/SPECS/pacemaker.spec +++ b/SPECS/pacemaker.spec @@ -36,7 +36,7 @@ ## can be incremented to build packages reliably considered "newer" ## than previously built packages with the same pcmkversion) %global pcmkversion 2.1.2 -%global specversion 2 +%global specversion 4 ## Upstream commit (full commit ID, abbreviated commit ID, or tag) to build %global commit ada5c3b36e2adf1703d54d39f40a4b8628eca175 @@ -250,6 +250,23 @@ Patch5: 005-fencing-reasons.patch Patch6: 006-stateful-metadata.patch Patch7: 007-memory-leak.patch Patch8: 008-fencing-history.patch +Patch9: 009-fencing-reasons.patch +Patch10: 010-probe-failures.patch +Patch11: 011-fencing-reasons.patch +Patch12: 012-notify-crash.patch +Patch13: 013-probe-failures.patch +Patch14: 014-pcmk_delay_base.patch +Patch15: 015-fencing-reasons.patch +Patch16: 016-fencing-crash.patch +Patch17: 017-fencing-reasons.patch +Patch18: 018-failure-messages.patch +Patch19: 019-corosync-tracking.patch +Patch20: 020-systemd-unit.patch +Patch21: 021-daemon-tracking.patch +Patch22: 022-failure-messages.patch +Patch23: 023-memory-leak.patch +Patch24: 024-daemon-tracking.patch +Patch25: 025-regression.patch Requires: resource-agents Requires: 
%{pkgname_pcmk_libs}%{?_isa} = %{version}-%{release} @@ -269,8 +286,9 @@ Requires: %{python_path} BuildRequires: %{python_name}-devel # Pacemaker requires a minimum libqb functionality -Requires: libqb >= 0.17.0 -BuildRequires: libqb-devel >= 0.17.0 +# RHEL requires a higher version than upstream, for qb_ipcc_connect_async() +Requires: libqb >= 2.0.3-7 +BuildRequires: libqb-devel >= 2.0.3-7 # Required basic build tools BuildRequires: autoconf @@ -855,6 +873,24 @@ exit 0 %license %{nagios_name}-%{nagios_hash}/COPYING %changelog +* Wed Jan 26 2022 Ken Gaillot - 2.1.2-4 +- Fix regression in down event detection that affects remote nodes +- Resolves: rhbz2039399 + +* Mon Jan 24 2022 Ken Gaillot - 2.1.2-3 +- Detect an unresponsive subdaemon +- Handle certain probe failures as stopped instead of failed +- Update pcmk_delay_base option meta-data +- Avoid crash when using clone notifications +- Retry Corosync shutdown tracking if first attempt fails +- Improve display of failed actions +- Resolves: rhbz1707851 +- Resolves: rhbz2039982 +- Resolves: rhbz2032032 +- Resolves: rhbz2040443 +- Resolves: rhbz2042367 +- Resolves: rhbz2042546 + * Thu Dec 16 2021 Ken Gaillot - 2.1.2-2 - Correctly get metadata for systemd agent names that end in '@' - Use correct OCF 1.1 syntax in ocf:pacemaker:Stateful meta-data