diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..5d39cf7 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +SOURCES/nagios-agents-metadata-105ab8a7b2c16b9a29cf1c1596b80136eeef332b.tar.gz +SOURCES/pacemaker-ada5c3b36.tar.gz diff --git a/.pacemaker.metadata b/.pacemaker.metadata new file mode 100644 index 0000000..56942a1 --- /dev/null +++ b/.pacemaker.metadata @@ -0,0 +1,2 @@ +2cbec94ad67dfbeba75e38d2c3c5c44961b3cd16 SOURCES/nagios-agents-metadata-105ab8a7b2c16b9a29cf1c1596b80136eeef332b.tar.gz +1dec5b062ad8e9a89b4953e17a59e4597797a1e6 SOURCES/pacemaker-ada5c3b36.tar.gz diff --git a/SOURCES/001-acl-group-schema.patch b/SOURCES/001-acl-group-schema.patch new file mode 100644 index 0000000..4835e3e --- /dev/null +++ b/SOURCES/001-acl-group-schema.patch @@ -0,0 +1,230 @@ +From f5ffbaf1f537d3d5b00e594211cd322f97df51ac Mon Sep 17 00:00:00 2001 +From: Grace Chin +Date: Fri, 5 Nov 2021 11:39:39 -0400 +Subject: [PATCH 1/3] Low: xml: clone acls schema in preparation for changes + +--- + xml/acls-3.8.rng | 80 ++++++++++++++++++++++++++++++++++++++++++++++++ + 1 file changed, 80 insertions(+) + create mode 100644 xml/acls-3.8.rng + +diff --git a/xml/acls-3.8.rng b/xml/acls-3.8.rng +new file mode 100644 +index 000000000..0fe6eed96 +--- /dev/null ++++ b/xml/acls-3.8.rng +@@ -0,0 +1,80 @@ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ read ++ write ++ deny ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ +-- +2.27.0 + + +From 7838213fc639236bdedf5f15320152d973f1bdad Mon Sep 17 00:00:00 2001 +From: Grace Chin +Date: Fri, 5 Nov 2021 11:40:48 -0400 +Subject: [PATCH 2/3] Add a 'name' attribute to acl_target and acl_group + elements + +--- + xml/acls-3.8.rng | 6 ++++++ + 1 file changed, 6 insertions(+) + +diff --git a/xml/acls-3.8.rng b/xml/acls-3.8.rng +index 0fe6eed96..48bcdffe3 100644 +--- a/xml/acls-3.8.rng ++++ b/xml/acls-3.8.rng +@@ 
-13,6 +13,9 @@ + + + ++ ++ ++ + + + +@@ -22,6 +25,9 @@ + + + ++ ++ ++ + + + +-- +2.27.0 + + +From c3c498f4636f57e29670f8e385b625024ed222d7 Mon Sep 17 00:00:00 2001 +From: Grace Chin +Date: Fri, 5 Nov 2021 11:42:48 -0400 +Subject: [PATCH 3/3] Changes made by run of 'cts/cts-cli -s' + +--- + cts/cli/regression.upgrade.exp | 7 +++++-- + cts/cli/regression.validity.exp | 22 ++++++++++++++++++---- + 2 files changed, 23 insertions(+), 6 deletions(-) + +diff --git a/cts/cli/regression.upgrade.exp b/cts/cli/regression.upgrade.exp +index e38adebdd..7ce7ec13b 100644 +--- a/cts/cli/regression.upgrade.exp ++++ b/cts/cli/regression.upgrade.exp +@@ -91,8 +91,11 @@ update_validation debug: Configuration valid for schema: pacemaker-3.6 + update_validation debug: pacemaker-3.6-style configuration is also valid for pacemaker-3.7 + update_validation debug: Testing 'pacemaker-3.7' validation (21 of X) + update_validation debug: Configuration valid for schema: pacemaker-3.7 +-update_validation trace: Stopping at pacemaker-3.7 +-update_validation info: Transformed the configuration from pacemaker-2.10 to pacemaker-3.7 ++update_validation debug: pacemaker-3.7-style configuration is also valid for pacemaker-3.8 ++update_validation debug: Testing 'pacemaker-3.8' validation (22 of X) ++update_validation debug: Configuration valid for schema: pacemaker-3.8 ++update_validation trace: Stopping at pacemaker-3.8 ++update_validation info: Transformed the configuration from pacemaker-2.10 to pacemaker-3.8 + =#=#=#= Current cib after: Upgrade to latest CIB schema (trigger 2.10.xsl + the wrapping) =#=#=#= + + +diff --git a/cts/cli/regression.validity.exp b/cts/cli/regression.validity.exp +index 5ace430e7..125035a47 100644 +--- a/cts/cli/regression.validity.exp ++++ b/cts/cli/regression.validity.exp +@@ -121,7 +121,11 @@ update_validation debug: Testing 'pacemaker-3.7' validation (21 of X) + element rsc_order: Relax-NG validity error : Invalid attribute first-action for element rsc_order + element 
rsc_order: Relax-NG validity error : Element constraints has extra content: rsc_order + update_validation trace: pacemaker-3.7 validation failed +-Cannot upgrade configuration (claiming schema pacemaker-1.2) to at least pacemaker-3.0 because it does not validate with any schema from pacemaker-1.2 to pacemaker-3.7 ++update_validation debug: Testing 'pacemaker-3.8' validation (22 of X) ++element rsc_order: Relax-NG validity error : Invalid attribute first-action for element rsc_order ++element rsc_order: Relax-NG validity error : Element constraints has extra content: rsc_order ++update_validation trace: pacemaker-3.8 validation failed ++Cannot upgrade configuration (claiming schema pacemaker-1.2) to at least pacemaker-3.0 because it does not validate with any schema from pacemaker-1.2 to pacemaker-3.8 + =#=#=#= End test: Run crm_simulate with invalid CIB (enum violation) - Invalid configuration (78) =#=#=#= + * Passed: crm_simulate - Run crm_simulate with invalid CIB (enum violation) + =#=#=#= Begin test: Try to make resulting CIB invalid (unrecognized validate-with) =#=#=#= +@@ -226,7 +230,10 @@ update_validation trace: pacemaker-3.6 validation failed + update_validation debug: Testing 'pacemaker-3.7' validation (21 of X) + element cib: Relax-NG validity error : Invalid attribute validate-with for element cib + update_validation trace: pacemaker-3.7 validation failed +-Cannot upgrade configuration (claiming schema pacemaker-9999.0) to at least pacemaker-3.0 because it does not validate with any schema from unknown to pacemaker-3.7 ++update_validation debug: Testing 'pacemaker-3.8' validation (22 of X) ++element cib: Relax-NG validity error : Invalid attribute validate-with for element cib ++update_validation trace: pacemaker-3.8 validation failed ++Cannot upgrade configuration (claiming schema pacemaker-9999.0) to at least pacemaker-3.0 because it does not validate with any schema from unknown to pacemaker-3.8 + =#=#=#= End test: Run crm_simulate with invalid CIB 
(unrecognized validate-with) - Invalid configuration (78) =#=#=#= + * Passed: crm_simulate - Run crm_simulate with invalid CIB (unrecognized validate-with) + =#=#=#= Begin test: Try to make resulting CIB invalid, but possibly recoverable (valid with X.Y+1) =#=#=#= +@@ -326,8 +333,11 @@ update_validation debug: Configuration valid for schema: pacemaker-3.6 + update_validation debug: pacemaker-3.6-style configuration is also valid for pacemaker-3.7 + update_validation debug: Testing 'pacemaker-3.7' validation (21 of X) + update_validation debug: Configuration valid for schema: pacemaker-3.7 +-update_validation trace: Stopping at pacemaker-3.7 +-update_validation info: Transformed the configuration from pacemaker-1.2 to pacemaker-3.7 ++update_validation debug: pacemaker-3.7-style configuration is also valid for pacemaker-3.8 ++update_validation debug: Testing 'pacemaker-3.8' validation (22 of X) ++update_validation debug: Configuration valid for schema: pacemaker-3.8 ++update_validation trace: Stopping at pacemaker-3.8 ++update_validation info: Transformed the configuration from pacemaker-1.2 to pacemaker-3.8 + unpack_resources error: Resource start-up disabled since no STONITH resources have been defined + unpack_resources error: Either configure some or disable STONITH with the stonith-enabled option + unpack_resources error: NOTE: Clusters with shared data need STONITH to ensure data integrity +@@ -437,6 +447,8 @@ element rsc_order: Relax-NG validity error : Invalid attribute first-action for + element rsc_order: Relax-NG validity error : Element constraints has extra content: rsc_order + element rsc_order: Relax-NG validity error : Invalid attribute first-action for element rsc_order + element rsc_order: Relax-NG validity error : Element constraints has extra content: rsc_order ++element rsc_order: Relax-NG validity error : Invalid attribute first-action for element rsc_order ++element rsc_order: Relax-NG validity error : Element constraints has extra content: 
rsc_order + =#=#=#= Current cib after: Make resulting CIB invalid, and without validate-with attribute =#=#=#= + + +@@ -502,6 +514,8 @@ validity.bad.xml:10: element rsc_order: Relax-NG validity error : Invalid attrib + validity.bad.xml:10: element rsc_order: Relax-NG validity error : Element constraints has extra content: rsc_order + validity.bad.xml:10: element rsc_order: Relax-NG validity error : Invalid attribute first-action for element rsc_order + validity.bad.xml:10: element rsc_order: Relax-NG validity error : Element constraints has extra content: rsc_order ++validity.bad.xml:10: element rsc_order: Relax-NG validity error : Invalid attribute first-action for element rsc_order ++validity.bad.xml:10: element rsc_order: Relax-NG validity error : Element constraints has extra content: rsc_order + unpack_resources error: Resource start-up disabled since no STONITH resources have been defined + unpack_resources error: Either configure some or disable STONITH with the stonith-enabled option + unpack_resources error: NOTE: Clusters with shared data need STONITH to ensure data integrity +-- +2.27.0 + diff --git a/SOURCES/002-fencing-reasons.patch b/SOURCES/002-fencing-reasons.patch new file mode 100644 index 0000000..f89cbec --- /dev/null +++ b/SOURCES/002-fencing-reasons.patch @@ -0,0 +1,2100 @@ +From 95b4f87aae5fb2cf771cf9a8f8e5420b65fb213f Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Tue, 21 Sep 2021 10:47:51 -0500 +Subject: [PATCH 01/12] Refactor: fencing: use pcmk__action_result_t in + stonith_action_t + +stonith_action_t previously had an rc member for a legacy return code, along +with output and error members for action stdout/stderr. When setting rc based +on the svc_action_t result, it used a mapping function svc_action_to_errno(). + +This replaces those with a pcmk__action_result_t member, which means we now +track the exit status and execution status as originally set by libcrmservice, +rather than the mapped rc. 
The library now calls the mapping function, now +returning standard codes and called result2rc(), when calling the client +callback. + +The exit_reason member is unused as of this commit. + +The behavior should be identical, with the small exception of +services_action_async() failure leaving the exit status as set by the services +library, which means callers will get the result2rc() mapping of the actual +result instead of the former -ECONNABORTED. +--- + lib/fencing/st_client.c | 118 +++++++++++++++++++++++----------------- + 1 file changed, 68 insertions(+), 50 deletions(-) + +diff --git a/lib/fencing/st_client.c b/lib/fencing/st_client.c +index 08adb51c6..6c607b010 100644 +--- a/lib/fencing/st_client.c ++++ b/lib/fencing/st_client.c +@@ -29,6 +29,7 @@ + #include + #include + #include ++#include + + #include + +@@ -57,9 +58,7 @@ struct stonith_action_s { + int max_retries; + + int pid; +- int rc; +- char *output; +- char *error; ++ pcmk__action_result_t result; + }; + + typedef struct stonith_private_s { +@@ -120,6 +119,7 @@ static void stonith_connection_destroy(gpointer user_data); + static void stonith_send_notification(gpointer data, gpointer user_data); + static int internal_stonith_action_execute(stonith_action_t * action); + static void log_action(stonith_action_t *action, pid_t pid); ++static int result2rc(const pcmk__action_result_t *result); + + /*! + * \brief Get agent namespace by name +@@ -196,6 +196,23 @@ stonith_get_namespace(const char *agent, const char *namespace_s) + return st_namespace_invalid; + } + ++/*! 
++ * \internal ++ * \brief Set an action's result based on services library result ++ * ++ * \param[in] action Fence action to set result for ++ * \param[in] svc_action Service action to get result from ++ */ ++static void ++set_result_from_svc_action(stonith_action_t *action, svc_action_t *svc_action) ++{ ++ pcmk__set_result(&(action->result), svc_action->rc, svc_action->status, ++ NULL); ++ pcmk__set_result_output(&(action->result), ++ services__grab_stdout(svc_action), ++ services__grab_stderr(svc_action)); ++} ++ + gboolean + stonith__watchdog_fencing_enabled_for_node_api(stonith_t *st, const char *node) + { +@@ -259,19 +276,19 @@ stonith__watchdog_fencing_enabled_for_node(const char *node) + static void + log_action(stonith_action_t *action, pid_t pid) + { +- if (action->output) { ++ if (action->result.action_stdout != NULL) { + /* Logging the whole string confuses syslog when the string is xml */ + char *prefix = crm_strdup_printf("%s[%d] stdout:", action->agent, pid); + +- crm_log_output(LOG_TRACE, prefix, action->output); ++ crm_log_output(LOG_TRACE, prefix, action->result.action_stdout); + free(prefix); + } + +- if (action->error) { ++ if (action->result.action_stderr != NULL) { + /* Logging the whole string confuses syslog when the string is xml */ + char *prefix = crm_strdup_printf("%s[%d] stderr:", action->agent, pid); + +- crm_log_output(LOG_WARNING, prefix, action->error); ++ crm_log_output(LOG_WARNING, prefix, action->result.action_stderr); + free(prefix); + } + } +@@ -645,8 +662,7 @@ stonith__destroy_action(stonith_action_t *action) + if (action->svc_action) { + services_action_free(action->svc_action); + } +- free(action->output); +- free(action->error); ++ pcmk__reset_result(&(action->result)); + free(action); + } + } +@@ -678,15 +694,15 @@ stonith__action_result(stonith_action_t *action, int *rc, char **output, + } + if (action != NULL) { + if (rc) { +- *rc = action->rc; ++ *rc = pcmk_rc2legacy(result2rc(&(action->result))); + } +- if (output && 
action->output) { +- *output = action->output; +- action->output = NULL; // hand off memory management to caller ++ if ((output != NULL) && (action->result.action_stdout != NULL)) { ++ *output = action->result.action_stdout; ++ action->result.action_stdout = NULL; // hand off ownership to caller + } +- if (error_output && action->error) { +- *error_output = action->error; +- action->error = NULL; // hand off memory management to caller ++ if ((error_output != NULL) && (action->result.action_stderr != NULL)) { ++ *error_output = action->result.action_stderr; ++ action->result.action_stderr = NULL; // hand off ownership to caller + } + } + } +@@ -715,6 +731,9 @@ stonith_action_create(const char *agent, + action->timeout = action->remaining_timeout = timeout; + action->max_retries = FAILURE_MAX_RETRIES; + ++ pcmk__set_result(&(action->result), PCMK_OCF_UNKNOWN, PCMK_EXEC_UNKNOWN, ++ NULL); ++ + if (device_args) { + char buffer[512]; + const char *value = NULL; +@@ -739,7 +758,8 @@ update_remaining_timeout(stonith_action_t * action) + crm_info("Attempted to execute agent %s (%s) the maximum number of times (%d) allowed", + action->agent, action->action, action->max_retries); + action->remaining_timeout = 0; +- } else if ((action->rc != -ETIME) && diff < (action->timeout * 0.7)) { ++ } else if ((action->result.execution_status != PCMK_EXEC_TIMEOUT) ++ && (diff < (action->timeout * 0.7))) { + /* only set remaining timeout period if there is 30% + * or greater of the original timeout period left */ + action->remaining_timeout = action->timeout - diff; +@@ -750,31 +770,31 @@ update_remaining_timeout(stonith_action_t * action) + } + + static int +-svc_action_to_errno(svc_action_t *svc_action) { +- int rv = pcmk_ok; ++result2rc(const pcmk__action_result_t *result) { ++ int rc = pcmk_rc_ok; + +- if (svc_action->status == PCMK_EXEC_TIMEOUT) { +- rv = -ETIME; ++ if (result->execution_status == PCMK_EXEC_TIMEOUT) { ++ rc = ETIME; + +- } else if (svc_action->rc != PCMK_OCF_OK) { 
++ } else if (result->exit_status != CRM_EX_OK) { + /* Try to provide a useful error code based on the fence agent's + * error output. + */ +- if (svc_action->stderr_data == NULL) { +- rv = -ENODATA; ++ if (result->action_stderr == NULL) { ++ rc = ENODATA; + +- } else if (strstr(svc_action->stderr_data, "imed out")) { ++ } else if (strstr(result->action_stderr, "imed out")) { + /* Some agents have their own internal timeouts */ +- rv = -ETIME; ++ rc = ETIME; + +- } else if (strstr(svc_action->stderr_data, "Unrecognised action")) { +- rv = -EOPNOTSUPP; ++ } else if (strstr(result->action_stderr, "Unrecognised action")) { ++ rc = EOPNOTSUPP; + + } else { +- rv = -pcmk_err_generic; ++ rc = pcmk_rc_error; + } + } +- return rv; ++ return rc; + } + + static void +@@ -782,11 +802,7 @@ stonith_action_async_done(svc_action_t *svc_action) + { + stonith_action_t *action = (stonith_action_t *) svc_action->cb_data; + +- action->rc = svc_action_to_errno(svc_action); +- action->output = svc_action->stdout_data; +- svc_action->stdout_data = NULL; +- action->error = svc_action->stderr_data; +- svc_action->stderr_data = NULL; ++ set_result_from_svc_action(action, svc_action); + + svc_action->params = NULL; + +@@ -795,7 +811,9 @@ stonith_action_async_done(svc_action_t *svc_action) + + log_action(action, action->pid); + +- if (action->rc != pcmk_ok && update_remaining_timeout(action)) { ++ if ((action->result.exit_status != CRM_EX_OK) ++ && update_remaining_timeout(action)) { ++ + int rc = internal_stonith_action_execute(action); + if (rc == pcmk_ok) { + return; +@@ -803,7 +821,8 @@ stonith_action_async_done(svc_action_t *svc_action) + } + + if (action->done_cb) { +- action->done_cb(action->pid, action->rc, action->output, action->userdata); ++ action->done_cb(action->pid, pcmk_rc2legacy(result2rc(&(action->result))), ++ action->result.action_stdout, action->userdata); + } + + action->svc_action = NULL; // don't remove our caller +@@ -835,9 +854,13 @@ 
internal_stonith_action_execute(stonith_action_t * action) + static int stonith_sequence = 0; + char *buffer = NULL; + +- if ((action == NULL) || (action->action == NULL) || (action->args == NULL) ++ CRM_CHECK(action != NULL, return -EINVAL); ++ ++ if ((action->action == NULL) || (action->args == NULL) + || (action->agent == NULL)) { +- return -EPROTO; ++ pcmk__set_result(&(action->result), PCMK_OCF_UNKNOWN_ERROR, ++ PCMK_EXEC_ERROR_FATAL, NULL); ++ return -EINVAL; + } + + if (!action->tries) { +@@ -857,6 +880,7 @@ internal_stonith_action_execute(stonith_action_t * action) + free(buffer); + + if (svc_action->rc != PCMK_OCF_UNKNOWN) { ++ set_result_from_svc_action(action, svc_action); + services_action_free(svc_action); + return -E2BIG; + } +@@ -877,10 +901,7 @@ internal_stonith_action_execute(stonith_action_t * action) + + /* keep retries from executing out of control and free previous results */ + if (is_retry) { +- free(action->output); +- action->output = NULL; +- free(action->error); +- action->error = NULL; ++ pcmk__reset_result(&(action->result)); + sleep(1); + } + +@@ -889,22 +910,19 @@ internal_stonith_action_execute(stonith_action_t * action) + if (services_action_async_fork_notify(svc_action, + &stonith_action_async_done, + &stonith_action_async_forked)) { ++ pcmk__set_result(&(action->result), PCMK_OCF_UNKNOWN, ++ PCMK_EXEC_PENDING, NULL); + return pcmk_ok; + } + + } else if (services_action_sync(svc_action)) { // sync success + rc = pcmk_ok; +- action->rc = svc_action_to_errno(svc_action); +- action->output = svc_action->stdout_data; +- svc_action->stdout_data = NULL; +- action->error = svc_action->stderr_data; +- svc_action->stderr_data = NULL; + + } else { // sync failure +- action->rc = -ECONNABORTED; +- rc = action->rc; ++ rc = -ECONNABORTED; + } + ++ set_result_from_svc_action(action, svc_action); + svc_action->params = NULL; + services_action_free(svc_action); + return rc; +-- +2.27.0 + + +From 4c8e0b0ecc53cb3883f0da0eede20b900fff48d1 Mon Sep 17 
00:00:00 2001 +From: Ken Gaillot +Date: Tue, 21 Sep 2021 11:14:31 -0500 +Subject: [PATCH 02/12] Low: fencing: improve return code given back to library + callers + +Expose result2rc() internally for future reuse, and expand it to handle more +cases. In theory, this can give us better log messages and status output for +failures. +--- + include/crm/fencing/internal.h | 1 + + lib/fencing/st_client.c | 63 +++++++++++++++++++++------------- + 2 files changed, 41 insertions(+), 23 deletions(-) + +diff --git a/include/crm/fencing/internal.h b/include/crm/fencing/internal.h +index fa9059e6f..0d23967bb 100644 +--- a/include/crm/fencing/internal.h ++++ b/include/crm/fencing/internal.h +@@ -60,6 +60,7 @@ stonith_action_t *stonith_action_create(const char *agent, + void stonith__destroy_action(stonith_action_t *action); + void stonith__action_result(stonith_action_t *action, int *rc, char **output, + char **error_output); ++int stonith__result2rc(const pcmk__action_result_t *result); + + int + stonith_action_execute_async(stonith_action_t * action, +diff --git a/lib/fencing/st_client.c b/lib/fencing/st_client.c +index 6c607b010..809be1640 100644 +--- a/lib/fencing/st_client.c ++++ b/lib/fencing/st_client.c +@@ -119,7 +119,6 @@ static void stonith_connection_destroy(gpointer user_data); + static void stonith_send_notification(gpointer data, gpointer user_data); + static int internal_stonith_action_execute(stonith_action_t * action); + static void log_action(stonith_action_t *action, pid_t pid); +-static int result2rc(const pcmk__action_result_t *result); + + /*! 
+ * \brief Get agent namespace by name +@@ -694,7 +693,7 @@ stonith__action_result(stonith_action_t *action, int *rc, char **output, + } + if (action != NULL) { + if (rc) { +- *rc = pcmk_rc2legacy(result2rc(&(action->result))); ++ *rc = pcmk_rc2legacy(stonith__result2rc(&(action->result))); + } + if ((output != NULL) && (action->result.action_stdout != NULL)) { + *output = action->result.action_stdout; +@@ -769,32 +768,49 @@ update_remaining_timeout(stonith_action_t * action) + return action->remaining_timeout ? TRUE : FALSE; + } + +-static int +-result2rc(const pcmk__action_result_t *result) { +- int rc = pcmk_rc_ok; ++/*! ++ * \internal ++ * \brief Map a fencing action result to a standard return code ++ * ++ * \param[in] result Fencing action result to map ++ * ++ * \return Standard Pacemaker return code that best corresponds to \p result ++ */ ++int ++stonith__result2rc(const pcmk__action_result_t *result) ++{ ++ switch (result->execution_status) { ++ case PCMK_EXEC_CANCELLED: return ECANCELED; ++ case PCMK_EXEC_TIMEOUT: return ETIME; ++ case PCMK_EXEC_NOT_INSTALLED: return ENOENT; ++ case PCMK_EXEC_NOT_SUPPORTED: return EOPNOTSUPP; ++ case PCMK_EXEC_NOT_CONNECTED: return ENOTCONN; ++ case PCMK_EXEC_NO_FENCE_DEVICE: return ENODEV; ++ case PCMK_EXEC_NO_SECRETS: return EACCES; ++ default: break; ++ } + +- if (result->execution_status == PCMK_EXEC_TIMEOUT) { +- rc = ETIME; ++ if (result->exit_status == CRM_EX_OK) { ++ return pcmk_rc_ok; ++ } + +- } else if (result->exit_status != CRM_EX_OK) { +- /* Try to provide a useful error code based on the fence agent's +- * error output. 
+- */ +- if (result->action_stderr == NULL) { +- rc = ENODATA; ++ // Try to provide useful error code based on result's error output + +- } else if (strstr(result->action_stderr, "imed out")) { +- /* Some agents have their own internal timeouts */ +- rc = ETIME; ++ if (result->action_stderr == NULL) { ++ return ENODATA; + +- } else if (strstr(result->action_stderr, "Unrecognised action")) { +- rc = EOPNOTSUPP; ++ } else if (strcasestr(result->action_stderr, "timed out") ++ || strcasestr(result->action_stderr, "timeout")) { ++ return ETIME; + +- } else { +- rc = pcmk_rc_error; +- } ++ } else if (strcasestr(result->action_stderr, "unrecognised action") ++ || strcasestr(result->action_stderr, "unrecognized action") ++ || strcasestr(result->action_stderr, "unsupported action")) { ++ return EOPNOTSUPP; + } +- return rc; ++ ++ // Oh well, we tried ++ return pcmk_rc_error; + } + + static void +@@ -821,7 +837,8 @@ stonith_action_async_done(svc_action_t *svc_action) + } + + if (action->done_cb) { +- action->done_cb(action->pid, pcmk_rc2legacy(result2rc(&(action->result))), ++ action->done_cb(action->pid, ++ pcmk_rc2legacy(stonith__result2rc(&(action->result))), + action->result.action_stdout, action->userdata); + } + +-- +2.27.0 + + +From 153c9b552a5bad9dd36e8635fa478ed9cad1f240 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Thu, 7 Oct 2021 11:35:44 -0500 +Subject: [PATCH 03/12] Refactor: fencing: return full result from + stonith__action_result() + +Previously, stonith__action_result() grabbed an action's legacy rc, stdout, and +stderr separately. Now, directly return a pointer to the action's result +object, and map that to a legacy rc in the callers when needed. 
+--- + include/crm/fencing/internal.h | 3 +-- + lib/fencing/st_client.c | 36 ++++--------------------- + lib/fencing/st_rhcs.c | 48 ++++++++++++++++++++++++---------- + 3 files changed, 40 insertions(+), 47 deletions(-) + +diff --git a/include/crm/fencing/internal.h b/include/crm/fencing/internal.h +index 0d23967bb..4e9f50fe8 100644 +--- a/include/crm/fencing/internal.h ++++ b/include/crm/fencing/internal.h +@@ -58,8 +58,7 @@ stonith_action_t *stonith_action_create(const char *agent, + GHashTable * port_map, + const char * host_arg); + void stonith__destroy_action(stonith_action_t *action); +-void stonith__action_result(stonith_action_t *action, int *rc, char **output, +- char **error_output); ++pcmk__action_result_t *stonith__action_result(stonith_action_t *action); + int stonith__result2rc(const pcmk__action_result_t *result); + + int +diff --git a/lib/fencing/st_client.c b/lib/fencing/st_client.c +index 809be1640..b9df18465 100644 +--- a/lib/fencing/st_client.c ++++ b/lib/fencing/st_client.c +@@ -670,40 +670,14 @@ stonith__destroy_action(stonith_action_t *action) + * \internal + * \brief Get the result of an executed stonith action + * +- * \param[in,out] action Executed action +- * \param[out] rc Where to store result code (or NULL) +- * \param[out] output Where to store standard output (or NULL) +- * \param[out] error_output Where to store standard error output (or NULL) ++ * \param[in] action Executed action + * +- * \note If output or error_output is not NULL, the caller is responsible for +- * freeing the memory. 
++ * \return Pointer to action's result (or NULL if \p action is NULL) + */ +-void +-stonith__action_result(stonith_action_t *action, int *rc, char **output, +- char **error_output) ++pcmk__action_result_t * ++stonith__action_result(stonith_action_t *action) + { +- if (rc) { +- *rc = pcmk_ok; +- } +- if (output) { +- *output = NULL; +- } +- if (error_output) { +- *error_output = NULL; +- } +- if (action != NULL) { +- if (rc) { +- *rc = pcmk_rc2legacy(stonith__result2rc(&(action->result))); +- } +- if ((output != NULL) && (action->result.action_stdout != NULL)) { +- *output = action->result.action_stdout; +- action->result.action_stdout = NULL; // hand off ownership to caller +- } +- if ((error_output != NULL) && (action->result.action_stderr != NULL)) { +- *error_output = action->result.action_stderr; +- action->result.action_stderr = NULL; // hand off ownership to caller +- } +- } ++ return (action == NULL)? NULL : &(action->result); + } + + #define FAILURE_MAX_RETRIES 2 +diff --git a/lib/fencing/st_rhcs.c b/lib/fencing/st_rhcs.c +index 89a2625bd..23e694975 100644 +--- a/lib/fencing/st_rhcs.c ++++ b/lib/fencing/st_rhcs.c +@@ -1,5 +1,5 @@ + /* +- * Copyright 2004-2020 the Pacemaker project contributors ++ * Copyright 2004-2021 the Pacemaker project contributors + * + * The version control history for this file may have further details. 
+ * +@@ -123,10 +123,10 @@ stonith_rhcs_parameter_not_required(xmlNode *metadata, const char *parameter) + static int + stonith__rhcs_get_metadata(const char *agent, int timeout, xmlNode **metadata) + { +- char *buffer = NULL; + xmlNode *xml = NULL; + xmlNode *actions = NULL; + xmlXPathObject *xpathObj = NULL; ++ pcmk__action_result_t *result = NULL; + stonith_action_t *action = stonith_action_create(agent, "metadata", NULL, 0, + 5, NULL, NULL, NULL); + int rc = stonith__execute(action); +@@ -138,23 +138,31 @@ stonith__rhcs_get_metadata(const char *agent, int timeout, xmlNode **metadata) + return rc; + } + +- stonith__action_result(action, &rc, &buffer, NULL); +- stonith__destroy_action(action); +- if (rc < 0) { +- crm_warn("Metadata action for %s failed: %s " CRM_XS "rc=%d", +- agent, pcmk_strerror(rc), rc); +- free(buffer); +- return rc; ++ result = stonith__action_result(action); ++ ++ if (result->execution_status != PCMK_EXEC_DONE) { ++ crm_warn("Could not execute metadata action for %s: %s", ++ agent, pcmk_exec_status_str(result->execution_status)); ++ stonith__destroy_action(action); ++ return pcmk_rc2legacy(stonith__result2rc(result)); + } + +- if (buffer == NULL) { ++ if (result->exit_status != CRM_EX_OK) { ++ crm_warn("Metadata action for %s returned error code %d", ++ agent, result->exit_status); ++ stonith__destroy_action(action); ++ return pcmk_rc2legacy(stonith__result2rc(result)); ++ } ++ ++ if (result->action_stdout == NULL) { + crm_warn("Metadata action for %s returned no data", agent); ++ stonith__destroy_action(action); + return -ENODATA; + } + +- xml = string2xml(buffer); +- free(buffer); +- buffer = NULL; ++ xml = string2xml(result->action_stdout); ++ stonith__destroy_action(action); ++ + if (xml == NULL) { + crm_warn("Metadata for %s is invalid", agent); + return -pcmk_err_schema_validation; +@@ -289,7 +297,19 @@ stonith__rhcs_validate(stonith_t *st, int call_options, const char *target, + + rc = stonith__execute(action); + if (rc == pcmk_ok) { 
+- stonith__action_result(action, &rc, output, error_output); ++ pcmk__action_result_t *result = stonith__action_result(action); ++ ++ rc = pcmk_rc2legacy(stonith__result2rc(result)); ++ ++ // Take ownership of output so stonith__destroy_action() doesn't free it ++ if (output != NULL) { ++ *output = result->action_stdout; ++ result->action_stdout = NULL; ++ } ++ if (error_output != NULL) { ++ *error_output = result->action_stderr; ++ result->action_stderr = NULL; ++ } + } + stonith__destroy_action(action); + return rc; +-- +2.27.0 + + +From 7f7067014357cccb229a0bef091e234eb3765f7a Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Tue, 21 Sep 2021 13:05:54 -0500 +Subject: [PATCH 04/12] Refactor: fencing: pass full result to async action + callback + +When executing an asynchronous fence agent command, the fencing library gets +the full result (exit status, execution status, and exit reason) from the +services library, then maps that to a legacy return code. + +Now, pass the full result object to the fencing async callback, rather than +separate arguments for legacy code and stdout. The mapping to a legacy code now +happens in the fencer rather than the fencing library. + +The goal of this and following commits is to push the full result object +further down the code path, so that ultimately the full result is always +available internally, and the legacy code mapping is only done for backward +compatibility when sending the result back to a client. + +This commit focuses on the async callback (done_cb() in both the fencer's +async_command_t and the fencing library's stonith_action_t). 
Later commits will +follow the chain: + + st_child_done() and stonith_fence_get_devices_cb() + -> stonith_send_async_reply() + -> stonith_construct_async_reply() and log_async_result() +--- + daemons/fenced/fenced_commands.c | 78 +++++++++++++++++++++----------- + include/crm/fencing/internal.h | 3 +- + lib/fencing/st_client.c | 10 ++-- + 3 files changed, 58 insertions(+), 33 deletions(-) + +diff --git a/daemons/fenced/fenced_commands.c b/daemons/fenced/fenced_commands.c +index b5ae28d90..d5d04ae69 100644 +--- a/daemons/fenced/fenced_commands.c ++++ b/daemons/fenced/fenced_commands.c +@@ -62,7 +62,8 @@ struct device_search_s { + }; + + static gboolean stonith_device_dispatch(gpointer user_data); +-static void st_child_done(int pid, int rc, const char *output, void *user_data); ++static void st_child_done(int pid, const pcmk__action_result_t *result, ++ void *user_data); + static void stonith_send_reply(xmlNode * reply, int call_options, const char *remote_peer, + const char *client_id); + +@@ -99,7 +100,8 @@ typedef struct async_command_s { + GList *device_next; + + void *internal_user_data; +- void (*done_cb) (int pid, int rc, const char *output, void *user_data); ++ void (*done_cb) (int pid, const pcmk__action_result_t *result, ++ void *user_data); + guint timer_sigterm; + guint timer_sigkill; + /*! 
If the operation timed out, this is the last signal +@@ -377,13 +379,25 @@ get_agent_metadata_cb(gpointer data) { + * \internal + * \brief Call a command's action callback for an internal (not library) result + * +- * \param[in] cmd Command to report result for +- * \param[in] rc Legacy return code to pass to callback ++ * \param[in] cmd Command to report result for ++ * \param[in] execution_status Execution status to use for result ++ * \param[in] exit_status Exit status to use for result ++ * \param[in] exit_reason Exit reason to use for result + */ + static void +-report_internal_result(async_command_t *cmd, int rc) ++report_internal_result(async_command_t *cmd, int exit_status, ++ int execution_status, const char *exit_reason) + { +- cmd->done_cb(0, rc, NULL, cmd); ++ pcmk__action_result_t result = { ++ // Ensure we don't pass garbage to free() ++ .exit_reason = NULL, ++ .action_stdout = NULL, ++ .action_stderr = NULL ++ }; ++ ++ pcmk__set_result(&result, exit_status, execution_status, exit_reason); ++ cmd->done_cb(0, &result, cmd); ++ pcmk__reset_result(&result); + } + + static gboolean +@@ -446,7 +460,7 @@ stonith_device_execute(stonith_device_t * device) + } + } else { + crm_info("Faking success for %s watchdog operation", cmd->action); +- report_internal_result(cmd, pcmk_ok); ++ report_internal_result(cmd, CRM_EX_OK, PCMK_EXEC_DONE, NULL); + goto done; + } + } +@@ -462,7 +476,8 @@ stonith_device_execute(stonith_device_t * device) + crm_err("Considering %s unconfigured " + "because unable to load CIB secrets: %s", + device->id, pcmk_rc_str(exec_rc)); +- report_internal_result(cmd, -EACCES); ++ report_internal_result(cmd, CRM_EX_ERROR, PCMK_EXEC_NO_SECRETS, ++ NULL); + goto done; + } + } +@@ -501,7 +516,7 @@ stonith_device_execute(stonith_device_t * device) + cmd->done_cb, fork_cb); + if (exec_rc < 0) { + cmd->activating_on = NULL; +- report_internal_result(cmd, exec_rc); ++ cmd->done_cb(0, stonith__action_result(action), cmd); + 
stonith__destroy_action(action); + } + +@@ -625,7 +640,8 @@ free_device(gpointer data) + async_command_t *cmd = gIter->data; + + crm_warn("Removal of device '%s' purged operation '%s'", device->id, cmd->action); +- report_internal_result(cmd, -ENODEV); ++ report_internal_result(cmd, CRM_EX_ERROR, PCMK_EXEC_NO_FENCE_DEVICE, ++ NULL); + } + g_list_free(device->pending_ops); + +@@ -1079,7 +1095,8 @@ schedule_internal_command(const char *origin, + const char *victim, + int timeout, + void *internal_user_data, +- void (*done_cb) (int pid, int rc, const char *output, ++ void (*done_cb) (int pid, ++ const pcmk__action_result_t *result, + void *user_data)) + { + async_command_t *cmd = NULL; +@@ -1111,7 +1128,7 @@ enum fence_status_code { + }; + + static void +-status_search_cb(int pid, int rc, const char *output, void *user_data) ++status_search_cb(int pid, const pcmk__action_result_t *result, void *user_data) + { + async_command_t *cmd = user_data; + struct device_search_s *search = cmd->internal_user_data; +@@ -1127,7 +1144,7 @@ status_search_cb(int pid, int rc, const char *output, void *user_data) + + mainloop_set_trigger(dev->work); + +- switch (rc) { ++ switch (result->exit_status) { + case fence_status_unknown: + crm_trace("%s reported it cannot fence %s", dev->id, search->host); + break; +@@ -1141,14 +1158,15 @@ status_search_cb(int pid, int rc, const char *output, void *user_data) + default: + crm_warn("Assuming %s cannot fence %s " + "(status returned unknown code %d)", +- dev->id, search->host, rc); ++ dev->id, search->host, result->exit_status); + break; + } + search_devices_record_result(search, dev->id, can); + } + + static void +-dynamic_list_search_cb(int pid, int rc, const char *output, void *user_data) ++dynamic_list_search_cb(int pid, const pcmk__action_result_t *result, ++ void *user_data) + { + async_command_t *cmd = user_data; + struct device_search_s *search = cmd->internal_user_data; +@@ -1169,21 +1187,21 @@ dynamic_list_search_cb(int pid, int rc, 
const char *output, void *user_data) + + mainloop_set_trigger(dev->work); + +- if (rc == CRM_EX_OK) { ++ if (result->exit_status == CRM_EX_OK) { + crm_info("Refreshing target list for %s", dev->id); + g_list_free_full(dev->targets, free); +- dev->targets = stonith__parse_targets(output); ++ dev->targets = stonith__parse_targets(result->action_stdout); + dev->targets_age = time(NULL); + + } else if (dev->targets != NULL) { + crm_info("Reusing most recent target list for %s " + "because list returned error code %d", +- dev->id, rc); ++ dev->id, result->exit_status); + + } else { // We have never successfully executed list + crm_warn("Assuming %s cannot fence %s " + "because list returned error code %d", +- dev->id, search->host, rc); ++ dev->id, search->host, result->exit_status); + + /* Fall back to pcmk_host_check="status" if the user didn't explicitly + * specify "dynamic-list". +@@ -2407,7 +2425,7 @@ cancel_stonith_command(async_command_t * cmd) + } + + static void +-st_child_done(int pid, int rc, const char *output, void *user_data) ++st_child_done(int pid, const pcmk__action_result_t *result, void *user_data) + { + stonith_device_t *device = NULL; + stonith_device_t *next_device = NULL; +@@ -2423,7 +2441,7 @@ st_child_done(int pid, int rc, const char *output, void *user_data) + /* The device is ready to do something else now */ + device = g_hash_table_lookup(device_list, cmd->device); + if (device) { +- if (!device->verified && (rc == pcmk_ok) && ++ if (!device->verified && (result->exit_status == CRM_EX_OK) && + (pcmk__strcase_any_of(cmd->action, "list", "monitor", "status", NULL))) { + + device->verified = TRUE; +@@ -2432,7 +2450,7 @@ st_child_done(int pid, int rc, const char *output, void *user_data) + mainloop_set_trigger(device->work); + } + +- if (rc == 0) { ++ if (result->exit_status == CRM_EX_OK) { + GList *iter; + /* see if there are any required devices left to execute for this op */ + for (iter = cmd->device_next; iter != NULL; iter = iter->next) { 
+@@ -2445,7 +2463,8 @@ st_child_done(int pid, int rc, const char *output, void *user_data) + next_device = NULL; + } + +- } else if (rc != 0 && cmd->device_next && (is_action_required(cmd->action, device) == FALSE)) { ++ } else if ((cmd->device_next != NULL) ++ && !is_action_required(cmd->action, device)) { + /* if this device didn't work out, see if there are any others we can try. + * if the failed device was 'required', we can't pick another device. */ + next_device = g_hash_table_lookup(device_list, cmd->device_next->data); +@@ -2454,16 +2473,19 @@ st_child_done(int pid, int rc, const char *output, void *user_data) + + /* this operation requires more fencing, hooray! */ + if (next_device) { +- log_async_result(cmd, rc, pid, next_device->id, output, FALSE); ++ log_async_result(cmd, pcmk_rc2legacy(stonith__result2rc(result)), pid, ++ next_device->id, result->action_stdout, FALSE); + schedule_stonith_command(cmd, next_device); + /* Prevent cmd from being freed */ + cmd = NULL; + goto done; + } + +- stonith_send_async_reply(cmd, output, rc, pid, false); ++ stonith_send_async_reply(cmd, result->action_stdout, ++ pcmk_rc2legacy(stonith__result2rc(result)), pid, ++ false); + +- if (rc != 0) { ++ if (result->exit_status != CRM_EX_OK) { + goto done; + } + +@@ -2509,7 +2531,9 @@ st_child_done(int pid, int rc, const char *output, void *user_data) + + cmd_list = g_list_remove_link(cmd_list, gIter); + +- stonith_send_async_reply(cmd_other, output, rc, pid, true); ++ stonith_send_async_reply(cmd_other, result->action_stdout, ++ pcmk_rc2legacy(stonith__result2rc(result)), ++ pid, true); + cancel_stonith_command(cmd_other); + + free_async_command(cmd_other); +diff --git a/include/crm/fencing/internal.h b/include/crm/fencing/internal.h +index 4e9f50fe8..6a7e4232c 100644 +--- a/include/crm/fencing/internal.h ++++ b/include/crm/fencing/internal.h +@@ -64,7 +64,8 @@ int stonith__result2rc(const pcmk__action_result_t *result); + int + stonith_action_execute_async(stonith_action_t * 
action, + void *userdata, +- void (*done) (int pid, int rc, const char *output, ++ void (*done) (int pid, ++ const pcmk__action_result_t *result, + void *user_data), + void (*fork_cb) (int pid, void *user_data)); + +diff --git a/lib/fencing/st_client.c b/lib/fencing/st_client.c +index b9df18465..59dcab9a3 100644 +--- a/lib/fencing/st_client.c ++++ b/lib/fencing/st_client.c +@@ -46,7 +46,8 @@ struct stonith_action_s { + int timeout; + int async; + void *userdata; +- void (*done_cb) (int pid, int status, const char *output, void *user_data); ++ void (*done_cb) (int pid, const pcmk__action_result_t *result, ++ void *user_data); + void (*fork_cb) (int pid, void *user_data); + + svc_action_t *svc_action; +@@ -811,9 +812,7 @@ stonith_action_async_done(svc_action_t *svc_action) + } + + if (action->done_cb) { +- action->done_cb(action->pid, +- pcmk_rc2legacy(stonith__result2rc(&(action->result))), +- action->result.action_stdout, action->userdata); ++ action->done_cb(action->pid, &(action->result), action->userdata); + } + + action->svc_action = NULL; // don't remove our caller +@@ -933,7 +932,8 @@ internal_stonith_action_execute(stonith_action_t * action) + int + stonith_action_execute_async(stonith_action_t * action, + void *userdata, +- void (*done) (int pid, int rc, const char *output, ++ void (*done) (int pid, ++ const pcmk__action_result_t *result, + void *user_data), + void (*fork_cb) (int pid, void *user_data)) + { +-- +2.27.0 + + +From bbd022306df7a873c0ecb2be2d33c56fbf327b8c Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Tue, 21 Sep 2021 11:51:28 -0500 +Subject: [PATCH 05/12] Feature: fencing: set exit reason for internal + execution errors + +... most importantly, copying any exit reason set by the services library. +This ensures that the stonith_action_t exit reason is set when appropriate. +However, nothing uses it as of this commit. 
+--- + daemons/fenced/fenced_commands.c | 4 ++-- + lib/fencing/st_client.c | 6 +++--- + 2 files changed, 5 insertions(+), 5 deletions(-) + +diff --git a/daemons/fenced/fenced_commands.c b/daemons/fenced/fenced_commands.c +index d5d04ae69..f55a32649 100644 +--- a/daemons/fenced/fenced_commands.c ++++ b/daemons/fenced/fenced_commands.c +@@ -477,7 +477,7 @@ stonith_device_execute(stonith_device_t * device) + "because unable to load CIB secrets: %s", + device->id, pcmk_rc_str(exec_rc)); + report_internal_result(cmd, CRM_EX_ERROR, PCMK_EXEC_NO_SECRETS, +- NULL); ++ "Failed to get CIB secrets"); + goto done; + } + } +@@ -641,7 +641,7 @@ free_device(gpointer data) + + crm_warn("Removal of device '%s' purged operation '%s'", device->id, cmd->action); + report_internal_result(cmd, CRM_EX_ERROR, PCMK_EXEC_NO_FENCE_DEVICE, +- NULL); ++ "Device was removed before action could be executed"); + } + g_list_free(device->pending_ops); + +diff --git a/lib/fencing/st_client.c b/lib/fencing/st_client.c +index 59dcab9a3..3d4127eff 100644 +--- a/lib/fencing/st_client.c ++++ b/lib/fencing/st_client.c +@@ -207,7 +207,7 @@ static void + set_result_from_svc_action(stonith_action_t *action, svc_action_t *svc_action) + { + pcmk__set_result(&(action->result), svc_action->rc, svc_action->status, +- NULL); ++ services__exit_reason(svc_action)); + pcmk__set_result_output(&(action->result), + services__grab_stdout(svc_action), + services__grab_stderr(svc_action)); +@@ -706,7 +706,7 @@ stonith_action_create(const char *agent, + action->max_retries = FAILURE_MAX_RETRIES; + + pcmk__set_result(&(action->result), PCMK_OCF_UNKNOWN, PCMK_EXEC_UNKNOWN, +- NULL); ++ "Initialization bug in fencing library"); + + if (device_args) { + char buffer[512]; +@@ -849,7 +849,7 @@ internal_stonith_action_execute(stonith_action_t * action) + if ((action->action == NULL) || (action->args == NULL) + || (action->agent == NULL)) { + pcmk__set_result(&(action->result), PCMK_OCF_UNKNOWN_ERROR, +- PCMK_EXEC_ERROR_FATAL, 
NULL); ++ PCMK_EXEC_ERROR_FATAL, "Bug in fencing library"); + return -EINVAL; + } + +-- +2.27.0 + + +From ed08f600688af1d25412d2427502ba5d4a55c0d6 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Thu, 7 Oct 2021 12:06:10 -0500 +Subject: [PATCH 06/12] Fix: fencer: handle dynamic target query failures + better + +Previously, the callbacks for list and status queries checked only the result's +exit status. However, the services library will use PCMK_OCF_UNKNOWN_ERROR (1) +as the exit status for internal failures, and that value signifies a recognized +node (not an error) for fence list actions. + +Now, the callbacks check the execution status as well. +--- + daemons/fenced/fenced_commands.c | 46 +++++++++++++++++++++++++++----- + 1 file changed, 39 insertions(+), 7 deletions(-) + +diff --git a/daemons/fenced/fenced_commands.c b/daemons/fenced/fenced_commands.c +index f55a32649..7b3fb25a1 100644 +--- a/daemons/fenced/fenced_commands.c ++++ b/daemons/fenced/fenced_commands.c +@@ -1144,6 +1144,18 @@ status_search_cb(int pid, const pcmk__action_result_t *result, void *user_data) + + mainloop_set_trigger(dev->work); + ++ if (result->execution_status != PCMK_EXEC_DONE) { ++ crm_warn("Assuming %s cannot fence %s " ++ "because status could not be executed: %s%s%s%s", ++ dev->id, search->host, ++ pcmk_exec_status_str(result->execution_status), ++ ((result->exit_reason == NULL)? "" : " ("), ++ ((result->exit_reason == NULL)? "" : result->exit_reason), ++ ((result->exit_reason == NULL)? 
"" : ")")); ++ search_devices_record_result(search, dev->id, FALSE); ++ return; ++ } ++ + switch (result->exit_status) { + case fence_status_unknown: + crm_trace("%s reported it cannot fence %s", dev->id, search->host); +@@ -1187,21 +1199,41 @@ dynamic_list_search_cb(int pid, const pcmk__action_result_t *result, + + mainloop_set_trigger(dev->work); + +- if (result->exit_status == CRM_EX_OK) { ++ if ((result->execution_status == PCMK_EXEC_DONE) ++ && (result->exit_status == CRM_EX_OK)) { + crm_info("Refreshing target list for %s", dev->id); + g_list_free_full(dev->targets, free); + dev->targets = stonith__parse_targets(result->action_stdout); + dev->targets_age = time(NULL); + + } else if (dev->targets != NULL) { +- crm_info("Reusing most recent target list for %s " +- "because list returned error code %d", +- dev->id, result->exit_status); ++ if (result->execution_status == PCMK_EXEC_DONE) { ++ crm_info("Reusing most recent target list for %s " ++ "because list returned error code %d", ++ dev->id, result->exit_status); ++ } else { ++ crm_info("Reusing most recent target list for %s " ++ "because list could not be executed: %s%s%s%s", ++ dev->id, pcmk_exec_status_str(result->execution_status), ++ ((result->exit_reason == NULL)? "" : " ("), ++ ((result->exit_reason == NULL)? "" : result->exit_reason), ++ ((result->exit_reason == NULL)? "" : ")")); ++ } + + } else { // We have never successfully executed list +- crm_warn("Assuming %s cannot fence %s " +- "because list returned error code %d", +- dev->id, search->host, result->exit_status); ++ if (result->execution_status == PCMK_EXEC_DONE) { ++ crm_warn("Assuming %s cannot fence %s " ++ "because list returned error code %d", ++ dev->id, search->host, result->exit_status); ++ } else { ++ crm_warn("Assuming %s cannot fence %s " ++ "because list could not be executed: %s%s%s%s", ++ dev->id, search->host, ++ pcmk_exec_status_str(result->execution_status), ++ ((result->exit_reason == NULL)? 
"" : " ("), ++ ((result->exit_reason == NULL)? "" : result->exit_reason), ++ ((result->exit_reason == NULL)? "" : ")")); ++ } + + /* Fall back to pcmk_host_check="status" if the user didn't explicitly + * specify "dynamic-list". +-- +2.27.0 + + +From 5a30238a3b8691a5fc20f53906c0efcc50193306 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Tue, 21 Sep 2021 15:57:50 -0500 +Subject: [PATCH 07/12] Refactor: fencer: pass result object when sending an + async reply + +... via stonith_send_async_reply(), instead of sending the mapped legacy code +and action stdout separately. Also, drop the "stonith_" prefix since the +function is static. + +This moves the mapping from the stonith_send_async_reply() callers to the +function itself, so we use the result object and standard codes as long as +possible, and map to a legacy code only where needed. +--- + daemons/fenced/fenced_commands.c | 62 +++++++++++++++++++------------- + daemons/fenced/fenced_remote.c | 2 +- + 2 files changed, 39 insertions(+), 25 deletions(-) + +diff --git a/daemons/fenced/fenced_commands.c b/daemons/fenced/fenced_commands.c +index 7b3fb25a1..e5f8162ce 100644 +--- a/daemons/fenced/fenced_commands.c ++++ b/daemons/fenced/fenced_commands.c +@@ -2376,12 +2376,28 @@ log_async_result(async_command_t *cmd, int rc, int pid, const char *next, + } + } + ++/*! 
++ * \internal ++ * \brief Reply to requester after asynchronous command completion ++ * ++ * \param[in] cmd Command that completed ++ * \param[in] result Result of command ++ * \param[in] pid Process ID of command, if available ++ * \param[in] merged If true, command was merged with another, not executed ++ */ + static void +-stonith_send_async_reply(async_command_t *cmd, const char *output, int rc, +- int pid, bool merged) ++send_async_reply(async_command_t *cmd, const pcmk__action_result_t *result, ++ int pid, bool merged) + { + xmlNode *reply = NULL; + gboolean bcast = FALSE; ++ const char *output = NULL; ++ int rc = pcmk_ok; ++ ++ CRM_CHECK((cmd != NULL) && (result != NULL), return); ++ ++ output = result->action_stdout; ++ rc = pcmk_rc2legacy(stonith__result2rc(result)); + + reply = stonith_construct_async_reply(cmd, output, NULL, rc); + +@@ -2513,9 +2529,7 @@ st_child_done(int pid, const pcmk__action_result_t *result, void *user_data) + goto done; + } + +- stonith_send_async_reply(cmd, result->action_stdout, +- pcmk_rc2legacy(stonith__result2rc(result)), pid, +- false); ++ send_async_reply(cmd, result, pid, false); + + if (result->exit_status != CRM_EX_OK) { + goto done; +@@ -2563,9 +2577,7 @@ st_child_done(int pid, const pcmk__action_result_t *result, void *user_data) + + cmd_list = g_list_remove_link(cmd_list, gIter); + +- stonith_send_async_reply(cmd_other, result->action_stdout, +- pcmk_rc2legacy(stonith__result2rc(result)), +- pid, true); ++ send_async_reply(cmd_other, result, pid, true); + cancel_stonith_command(cmd_other); + + free_async_command(cmd_other); +@@ -2604,26 +2616,28 @@ stonith_fence_get_devices_cb(GList * devices, void *user_data) + /* Order based on priority */ + devices = g_list_sort(devices, sort_device_priority); + device = g_hash_table_lookup(device_list, devices->data); +- +- if (device) { +- cmd->device_list = devices; +- cmd->device_next = devices->next; +- devices = NULL; /* list owned by cmd now */ +- } + } + +- /* we have a 
device, schedule it for fencing. */ +- if (device) { +- schedule_stonith_command(cmd, device); +- /* in progress */ +- return; +- } ++ if (device == NULL) { // No device found ++ pcmk__action_result_t result = { ++ // Ensure we don't pass garbage to free() ++ .exit_reason = NULL, ++ .action_stdout = NULL, ++ .action_stderr = NULL ++ }; + +- /* no device found! */ +- stonith_send_async_reply(cmd, NULL, -ENODEV, 0, false); ++ pcmk__set_result(&result, CRM_EX_ERROR, PCMK_EXEC_NO_FENCE_DEVICE, ++ "No fence device configured for target"); ++ send_async_reply(cmd, &result, 0, false); ++ pcmk__reset_result(&result); ++ free_async_command(cmd); ++ g_list_free_full(devices, free); + +- free_async_command(cmd); +- g_list_free_full(devices, free); ++ } else { // Device found, schedule it for fencing ++ cmd->device_list = devices; ++ cmd->device_next = devices->next; ++ schedule_stonith_command(cmd, device); ++ } + } + + static int +diff --git a/daemons/fenced/fenced_remote.c b/daemons/fenced/fenced_remote.c +index ffaf60018..b09d2865e 100644 +--- a/daemons/fenced/fenced_remote.c ++++ b/daemons/fenced/fenced_remote.c +@@ -996,7 +996,7 @@ stonith_manual_ack(xmlNode * msg, remote_fencing_op_t * op) + + remote_op_done(op, msg, pcmk_ok, FALSE); + +- /* Replies are sent via done_cb->stonith_send_async_reply()->do_local_reply() */ ++ // Replies are sent via done_cb -> send_async_reply() -> do_local_reply() + return -EINPROGRESS; + } + +-- +2.27.0 + + +From c67b6bfbe0baa1253058417ddfb9bc4cf0844e27 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Thu, 7 Oct 2021 17:25:38 -0500 +Subject: [PATCH 08/12] Refactor: fencer: pass result object when building + async reply + +... via stonith_construct_async_reply(), instead of passing a mapped legacy rc +and action output separately, which will be helpful when we add the exit reason +to the reply. Also, drop the "stonith_" prefix since the function is static, and +drop an unused argument. 
+--- + daemons/fenced/fenced_commands.c | 33 +++++++++++++++----------------- + 1 file changed, 15 insertions(+), 18 deletions(-) + +diff --git a/daemons/fenced/fenced_commands.c b/daemons/fenced/fenced_commands.c +index e5f8162ce..6bc12e6c4 100644 +--- a/daemons/fenced/fenced_commands.c ++++ b/daemons/fenced/fenced_commands.c +@@ -112,8 +112,8 @@ typedef struct async_command_s { + stonith_device_t *activating_on; + } async_command_t; + +-static xmlNode *stonith_construct_async_reply(async_command_t * cmd, const char *output, +- xmlNode * data, int rc); ++static xmlNode *construct_async_reply(async_command_t *cmd, ++ const pcmk__action_result_t *result); + + static gboolean + is_action_required(const char *action, stonith_device_t *device) +@@ -2399,7 +2399,7 @@ send_async_reply(async_command_t *cmd, const pcmk__action_result_t *result, + output = result->action_stdout; + rc = pcmk_rc2legacy(stonith__result2rc(result)); + +- reply = stonith_construct_async_reply(cmd, output, NULL, rc); ++ reply = construct_async_reply(cmd, result); + + // Only replies for certain actions are broadcast + if (pcmk__str_any_of(cmd->action, "metadata", "monitor", "list", "status", +@@ -2732,17 +2732,20 @@ stonith_construct_reply(xmlNode * request, const char *output, xmlNode * data, i + return reply; + } + ++/*! 
++ * \internal ++ * \brief Build an XML reply to an asynchronous fencing command ++ * ++ * \param[in] cmd Fencing command that reply is for ++ * \param[in] result Command result ++ */ + static xmlNode * +-stonith_construct_async_reply(async_command_t * cmd, const char *output, xmlNode * data, int rc) ++construct_async_reply(async_command_t *cmd, const pcmk__action_result_t *result) + { +- xmlNode *reply = NULL; +- +- crm_trace("Creating a basic reply"); +- reply = create_xml_node(NULL, T_STONITH_REPLY); ++ xmlNode *reply = create_xml_node(NULL, T_STONITH_REPLY); + + crm_xml_add(reply, "st_origin", __func__); + crm_xml_add(reply, F_TYPE, T_STONITH_NG); +- + crm_xml_add(reply, F_STONITH_OPERATION, cmd->op); + crm_xml_add(reply, F_STONITH_DEVICE, cmd->device); + crm_xml_add(reply, F_STONITH_REMOTE_OP_ID, cmd->remote_op_id); +@@ -2753,15 +2756,9 @@ stonith_construct_async_reply(async_command_t * cmd, const char *output, xmlNode + crm_xml_add(reply, F_STONITH_ORIGIN, cmd->origin); + crm_xml_add_int(reply, F_STONITH_CALLID, cmd->id); + crm_xml_add_int(reply, F_STONITH_CALLOPTS, cmd->options); +- +- crm_xml_add_int(reply, F_STONITH_RC, rc); +- +- crm_xml_add(reply, "st_output", output); +- +- if (data != NULL) { +- crm_info("Attaching reply output"); +- add_message_xml(reply, F_STONITH_CALLDATA, data); +- } ++ crm_xml_add_int(reply, F_STONITH_RC, ++ pcmk_rc2legacy(stonith__result2rc(result))); ++ crm_xml_add(reply, "st_output", result->action_stdout); + return reply; + } + +-- +2.27.0 + + +From 2686caeb3b74f687ddd86a4e483250ca8096ba7c Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Tue, 19 Oct 2021 18:27:31 -0500 +Subject: [PATCH 09/12] Log: fencer: improve messages for asynchronous results + +Now that we have the full result object, pass it to log_async_result(). +Instead of logging a mapped legacy rc, log the execution status or exit status +as appropriate, along with the exit reason. 
+--- + daemons/fenced/fenced_commands.c | 43 +++++++++++++++++--------------- + 1 file changed, 23 insertions(+), 20 deletions(-) + +diff --git a/daemons/fenced/fenced_commands.c b/daemons/fenced/fenced_commands.c +index 6bc12e6c4..9d06c68dc 100644 +--- a/daemons/fenced/fenced_commands.c ++++ b/daemons/fenced/fenced_commands.c +@@ -2305,15 +2305,14 @@ stonith_query(xmlNode * msg, const char *remote_peer, const char *client_id, int + * \brief Log the result of an asynchronous command + * + * \param[in] cmd Command the result is for +- * \param[in] rc Legacy return code corresponding to result ++ * \param[in] result Result of command + * \param[in] pid Process ID of command, if available + * \param[in] next Alternate device that will be tried if command failed +- * \param[in] output Command output, if any + * \param[in] op_merged Whether this command was merged with an earlier one + */ + static void +-log_async_result(async_command_t *cmd, int rc, int pid, const char *next, +- const char *output, gboolean op_merged) ++log_async_result(async_command_t *cmd, const pcmk__action_result_t *result, ++ int pid, const char *next, bool op_merged) + { + int log_level = LOG_ERR; + int output_log_level = LOG_NEVER; +@@ -2321,17 +2320,18 @@ log_async_result(async_command_t *cmd, int rc, int pid, const char *next, + + GString *msg = g_string_sized_new(80); // Reasonable starting size + +- // Choose log levels appropriately +- if (rc == 0) { // Success ++ // Choose log levels appropriately if we have a result ++ if ((result->execution_status == PCMK_EXEC_DONE) ++ && (result->exit_status == CRM_EX_OK)) { // Success + log_level = (cmd->victim == NULL)? LOG_DEBUG : LOG_NOTICE; +- if ((output != NULL) ++ if ((result->action_stdout != NULL) + && !pcmk__str_eq(cmd->action, "metadata", pcmk__str_casei)) { + output_log_level = LOG_DEBUG; + } + next = NULL; + } else { // Failure + log_level = (cmd->victim == NULL)? 
LOG_NOTICE : LOG_ERR; +- if ((output != NULL) ++ if ((result->action_stdout != NULL) + && !pcmk__str_eq(cmd->action, "metadata", pcmk__str_casei)) { + output_log_level = LOG_WARNING; + } +@@ -2347,10 +2347,18 @@ log_async_result(async_command_t *cmd, int rc, int pid, const char *next, + } + g_string_append_printf(msg, "using %s ", cmd->device); + +- // Add result +- g_string_append_printf(msg, "returned %d (%s)", rc, pcmk_strerror(rc)); ++ // Add exit status or execution status as appropriate ++ if (result->execution_status == PCMK_EXEC_DONE) { ++ g_string_append_printf(msg, "returned %d", result->exit_status); ++ } else { ++ g_string_append_printf(msg, "could not be executed: %s", ++ pcmk_exec_status_str(result->execution_status)); ++ } + +- // Add next device if appropriate ++ // Add exit reason and next device if appropriate ++ if (result->exit_reason != NULL) { ++ g_string_append_printf(msg, " (%s)", result->exit_reason); ++ } + if (next != NULL) { + g_string_append_printf(msg, ", retrying with %s", next); + } +@@ -2371,7 +2379,7 @@ log_async_result(async_command_t *cmd, int rc, int pid, const char *next, + if (output_log_level != LOG_NEVER) { + char *prefix = crm_strdup_printf("%s[%d]", cmd->device, pid); + +- crm_log_output(output_log_level, prefix, output); ++ crm_log_output(output_log_level, prefix, result->action_stdout); + free(prefix); + } + } +@@ -2391,14 +2399,9 @@ send_async_reply(async_command_t *cmd, const pcmk__action_result_t *result, + { + xmlNode *reply = NULL; + gboolean bcast = FALSE; +- const char *output = NULL; +- int rc = pcmk_ok; + + CRM_CHECK((cmd != NULL) && (result != NULL), return); + +- output = result->action_stdout; +- rc = pcmk_rc2legacy(stonith__result2rc(result)); +- + reply = construct_async_reply(cmd, result); + + // Only replies for certain actions are broadcast +@@ -2412,7 +2415,7 @@ send_async_reply(async_command_t *cmd, const pcmk__action_result_t *result, + bcast = TRUE; + } + +- log_async_result(cmd, rc, pid, NULL, 
output, merged); ++ log_async_result(cmd, result, pid, NULL, merged); + crm_log_xml_trace(reply, "Reply"); + + if (merged) { +@@ -2436,6 +2439,7 @@ send_async_reply(async_command_t *cmd, const pcmk__action_result_t *result, + if (stand_alone) { + /* Do notification with a clean data object */ + xmlNode *notify_data = create_xml_node(NULL, T_STONITH_NOTIFY_FENCE); ++ int rc = pcmk_rc2legacy(stonith__result2rc(result)); + + crm_xml_add_int(notify_data, F_STONITH_RC, rc); + crm_xml_add(notify_data, F_STONITH_TARGET, cmd->victim); +@@ -2521,8 +2525,7 @@ st_child_done(int pid, const pcmk__action_result_t *result, void *user_data) + + /* this operation requires more fencing, hooray! */ + if (next_device) { +- log_async_result(cmd, pcmk_rc2legacy(stonith__result2rc(result)), pid, +- next_device->id, result->action_stdout, FALSE); ++ log_async_result(cmd, result, pid, next_device->id, false); + schedule_stonith_command(cmd, next_device); + /* Prevent cmd from being freed */ + cmd = NULL; +-- +2.27.0 + + +From 9f9dea518da50f629589d505ea0f330a47111d76 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Thu, 28 Oct 2021 13:29:31 -0500 +Subject: [PATCH 10/12] Test: cts-fencing: update expected log messages + +... which now log the original exit status rather than a mapped legacy rc +--- + cts/cts-fencing.in | 28 ++++++++++++++-------------- + 1 file changed, 14 insertions(+), 14 deletions(-) + +diff --git a/cts/cts-fencing.in b/cts/cts-fencing.in +index babfb6351..5cd9f7b8f 100644 +--- a/cts/cts-fencing.in ++++ b/cts/cts-fencing.in +@@ -886,7 +886,7 @@ class Tests(object): + test.add_cmd("stonith_admin", "--output-as=xml -F node3 -t 20") + + test.add_stonith_log_pattern("Total timeout set to 40") +- test.add_stonith_log_pattern("targeting node3 using false returned -201") ++ test.add_stonith_log_pattern("targeting node3 using false returned 1") + test.add_stonith_log_pattern("targeting node3 using true returned 0") + + # test what happens when the first fencing level fails. 
+@@ -920,8 +920,8 @@ class Tests(object): + test.add_cmd("stonith_admin", "--output-as=xml -F node3 -t 3") + + test.add_stonith_log_pattern("Total timeout set to 18") +- test.add_stonith_log_pattern("targeting node3 using false1 returned -201") +- test.add_stonith_log_pattern("targeting node3 using false2 returned -201") ++ test.add_stonith_log_pattern("targeting node3 using false1 returned 1") ++ test.add_stonith_log_pattern("targeting node3 using false2 returned 1") + test.add_stonith_log_pattern("targeting node3 using true3 returned 0") + test.add_stonith_log_pattern("targeting node3 using true4 returned 0") + +@@ -987,7 +987,7 @@ class Tests(object): + test.add_cmd("stonith_admin", "--output-as=xml -F node3 -t 20") + + test.add_stonith_log_pattern("Total timeout set to 8") +- test.add_stonith_log_pattern("targeting node3 using false1 returned -201") ++ test.add_stonith_log_pattern("targeting node3 using false1 returned 1") + test.add_stonith_neg_log_pattern("targeting node3 using false2 returned ") + test.add_stonith_log_pattern("targeting node3 using true3 returned 0") + test.add_stonith_log_pattern("targeting node3 using true4 returned 0") +@@ -1147,7 +1147,7 @@ class Tests(object): + "--output-as=xml -R true1 -a fence_dummy_no_reboot -o \"mode=pass\" -o \"pcmk_host_list=node1 node2 node3\"") + test.add_cmd("stonith_admin", "--output-as=xml -B node1 -t 5 -V") + test.add_stonith_log_pattern("does not support reboot") +- test.add_stonith_log_pattern("using true1 returned 0 (OK)") ++ test.add_stonith_log_pattern("using true1 returned 0") + + # make sure reboot is used when reboot action is advertised + for test_type in test_types: +@@ -1158,7 +1158,7 @@ class Tests(object): + "--output-as=xml -R true1 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node1 node2 node3\"") + test.add_cmd("stonith_admin", "--output-as=xml -B node1 -t 5 -V") + test.add_stonith_neg_log_pattern("does not advertise support for 'reboot', performing 'off'") +- 
test.add_stonith_log_pattern("using true1 returned 0 (OK)") ++ test.add_stonith_log_pattern("using true1 returned 0") + + # make sure requested fencing delay is applied only for the first device in the first level + # make sure static delay from pcmk_delay_base is added +@@ -1240,8 +1240,8 @@ class Tests(object): + '--output-as=xml -R true2 -a fence_dummy_auto_unfence -o "mode=pass" -o "pcmk_host_list=%s"' % (our_uname)) + test.add_cmd("stonith_admin", "--output-as=xml -U %s -t 3" % (our_uname)) + # both devices should be executed +- test.add_stonith_log_pattern("using true1 returned 0 (OK)") +- test.add_stonith_log_pattern("using true2 returned 0 (OK)") ++ test.add_stonith_log_pattern("using true1 returned 0") ++ test.add_stonith_log_pattern("using true2 returned 0") + + ### verify unfencing using automatic unfencing fails if any of the required agents fail + test = self.new_test("cpg_unfence_required_2", +@@ -1264,8 +1264,8 @@ class Tests(object): + test.add_cmd("stonith_admin", "--output-as=xml -r %s -i 1 -v true1" % (our_uname)) + test.add_cmd("stonith_admin", "--output-as=xml -r %s -i 2 -v true2" % (our_uname)) + test.add_cmd("stonith_admin", "--output-as=xml -U %s -t 3" % (our_uname)) +- test.add_stonith_log_pattern("using true1 returned 0 (OK)") +- test.add_stonith_log_pattern("using true2 returned 0 (OK)") ++ test.add_stonith_log_pattern("using true1 returned 0") ++ test.add_stonith_log_pattern("using true2 returned 0") + + ### verify unfencing using automatic devices with topology + test = self.new_test("cpg_unfence_required_4", +@@ -1296,10 +1296,10 @@ class Tests(object): + test.add_cmd("stonith_admin", "--output-as=xml -r %s -i 3 -v false4" % (our_uname)) + test.add_cmd("stonith_admin", "--output-as=xml -r %s -i 4 -v true4" % (our_uname)) + test.add_cmd("stonith_admin", "--output-as=xml -U %s -t 3" % (our_uname)) +- test.add_stonith_log_pattern("using true1 returned 0 (OK)") +- test.add_stonith_log_pattern("using true2 returned 0 (OK)") +- 
test.add_stonith_log_pattern("using true3 returned 0 (OK)") +- test.add_stonith_log_pattern("using true4 returned 0 (OK)") ++ test.add_stonith_log_pattern("using true1 returned 0") ++ test.add_stonith_log_pattern("using true2 returned 0") ++ test.add_stonith_log_pattern("using true3 returned 0") ++ test.add_stonith_log_pattern("using true4 returned 0") + + def build_unfence_on_target_tests(self): + """ Register tests that verify unfencing that runs on the target """ +-- +2.27.0 + + +From be72166ed9ccb53c218529783660503df95da719 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Thu, 16 Sep 2021 16:50:23 -0500 +Subject: [PATCH 11/12] Log: libcrmservice: downgrade failed action messages + +Previously, we would often get duplicate log messages for failed actions, +from the service library and again from its callers. + +Now that the service library tracks and provides exit reasons, callers can log +sufficient detail with better context, so downgrade the library's messages to +info level or lower. Similarly, avoid duplicate logs of process output. + +Certain messages (such as out-of-memory) remain at higher severity. 
+--- + daemons/controld/controld_execd.c | 15 +++--- + lib/fencing/st_client.c | 11 ++--- + lib/services/services.c | 14 +++--- + lib/services/services_linux.c | 80 ++++++++++++++++--------------- + lib/services/systemd.c | 20 ++++---- + lib/services/upstart.c | 19 ++++---- + 6 files changed, 80 insertions(+), 79 deletions(-) + +diff --git a/daemons/controld/controld_execd.c b/daemons/controld/controld_execd.c +index bded6e6b6..3ddff6e13 100644 +--- a/daemons/controld/controld_execd.c ++++ b/daemons/controld/controld_execd.c +@@ -2684,16 +2684,15 @@ log_executor_event(lrmd_event_data_t *op, const char *op_key, + do_crm_log(log_level, "%s", str->str); + g_string_free(str, TRUE); + +- if (op->output != NULL) { +- char *prefix = crm_strdup_printf("%s-" PCMK__OP_FMT ":%d", node_name, ++ /* The services library has already logged the output at info or debug ++ * level, so just raise to notice if it looks like a failure. ++ */ ++ if ((op->output != NULL) && (op->rc != PCMK_OCF_OK)) { ++ char *prefix = crm_strdup_printf(PCMK__OP_FMT "@%s output", + op->rsc_id, op->op_type, +- op->interval_ms, op->call_id); ++ op->interval_ms, node_name); + +- if (op->rc) { +- crm_log_output(LOG_NOTICE, prefix, op->output); +- } else { +- crm_log_output(LOG_DEBUG, prefix, op->output); +- } ++ crm_log_output(LOG_NOTICE, prefix, op->output); + free(prefix); + } + } +diff --git a/lib/fencing/st_client.c b/lib/fencing/st_client.c +index 3d4127eff..2fbff7f24 100644 +--- a/lib/fencing/st_client.c ++++ b/lib/fencing/st_client.c +@@ -276,14 +276,9 @@ stonith__watchdog_fencing_enabled_for_node(const char *node) + static void + log_action(stonith_action_t *action, pid_t pid) + { +- if (action->result.action_stdout != NULL) { +- /* Logging the whole string confuses syslog when the string is xml */ +- char *prefix = crm_strdup_printf("%s[%d] stdout:", action->agent, pid); +- +- crm_log_output(LOG_TRACE, prefix, action->result.action_stdout); +- free(prefix); +- } +- ++ /* The services library has 
already logged the output at info or debug ++ * level, so just raise to warning for stderr. ++ */ + if (action->result.action_stderr != NULL) { + /* Logging the whole string confuses syslog when the string is xml */ + char *prefix = crm_strdup_printf("%s[%d] stderr:", action->agent, pid); +diff --git a/lib/services/services.c b/lib/services/services.c +index 86a0a213c..cf8bbc70e 100644 +--- a/lib/services/services.c ++++ b/lib/services/services.c +@@ -319,13 +319,13 @@ services__create_resource_action(const char *name, const char *standard, + rc = services__nagios_prepare(op); + #endif + } else { +- crm_err("Unknown resource standard: %s", op->standard); ++ crm_info("Unknown resource standard: %s", op->standard); + rc = ENOENT; + } + + if (rc != pcmk_rc_ok) { +- crm_err("Cannot prepare %s operation for %s: %s", +- action, name, strerror(rc)); ++ crm_info("Cannot prepare %s operation for %s: %s", ++ action, name, strerror(rc)); + services__handle_exec_error(op, rc); + } + return op; +@@ -967,14 +967,14 @@ execute_metadata_action(svc_action_t *op) + const char *class = op->standard; + + if (op->agent == NULL) { +- crm_err("meta-data requested without specifying agent"); ++ crm_info("Meta-data requested without specifying agent"); + services__set_result(op, services__generic_error(op), + PCMK_EXEC_ERROR_FATAL, "Agent not specified"); + return EINVAL; + } + + if (class == NULL) { +- crm_err("meta-data requested for agent %s without specifying class", ++ crm_info("Meta-data requested for agent %s without specifying class", + op->agent); + services__set_result(op, services__generic_error(op), + PCMK_EXEC_ERROR_FATAL, +@@ -986,8 +986,8 @@ execute_metadata_action(svc_action_t *op) + class = resources_find_service_class(op->agent); + } + if (class == NULL) { +- crm_err("meta-data requested for %s, but could not determine class", +- op->agent); ++ crm_info("Meta-data requested for %s, but could not determine class", ++ op->agent); + services__set_result(op, 
services__generic_error(op), + PCMK_EXEC_ERROR_HARD, + "Agent standard could not be determined"); +diff --git a/lib/services/services_linux.c b/lib/services/services_linux.c +index b2ff27a0d..9a4c6cf80 100644 +--- a/lib/services/services_linux.c ++++ b/lib/services/services_linux.c +@@ -64,8 +64,8 @@ sigchld_setup(struct sigchld_data_s *data) + + // Block SIGCHLD (saving previous set of blocked signals to restore later) + if (sigprocmask(SIG_BLOCK, &(data->mask), &(data->old_mask)) < 0) { +- crm_err("Wait for child process completion failed: %s " +- CRM_XS " source=sigprocmask", pcmk_strerror(errno)); ++ crm_info("Wait for child process completion failed: %s " ++ CRM_XS " source=sigprocmask", pcmk_strerror(errno)); + return false; + } + return true; +@@ -81,8 +81,8 @@ sigchld_open(struct sigchld_data_s *data) + + fd = signalfd(-1, &(data->mask), SFD_NONBLOCK); + if (fd < 0) { +- crm_err("Wait for child process completion failed: %s " +- CRM_XS " source=signalfd", pcmk_strerror(errno)); ++ crm_info("Wait for child process completion failed: %s " ++ CRM_XS " source=signalfd", pcmk_strerror(errno)); + } + return fd; + } +@@ -108,8 +108,8 @@ sigchld_received(int fd) + } + s = read(fd, &fdsi, sizeof(struct signalfd_siginfo)); + if (s != sizeof(struct signalfd_siginfo)) { +- crm_err("Wait for child process completion failed: %s " +- CRM_XS " source=read", pcmk_strerror(errno)); ++ crm_info("Wait for child process completion failed: %s " ++ CRM_XS " source=read", pcmk_strerror(errno)); + + } else if (fdsi.ssi_signo == SIGCHLD) { + return true; +@@ -149,8 +149,8 @@ sigchld_handler() + if ((last_sigchld_data != NULL) + && (last_sigchld_data->pipe_fd[1] >= 0) + && (write(last_sigchld_data->pipe_fd[1], "", 1) == -1)) { +- crm_err("Wait for child process completion failed: %s " +- CRM_XS " source=write", pcmk_strerror(errno)); ++ crm_info("Wait for child process completion failed: %s " ++ CRM_XS " source=write", pcmk_strerror(errno)); + } + } + +@@ -162,19 +162,19 @@ 
sigchld_setup(struct sigchld_data_s *data) + data->pipe_fd[0] = data->pipe_fd[1] = -1; + + if (pipe(data->pipe_fd) == -1) { +- crm_err("Wait for child process completion failed: %s " +- CRM_XS " source=pipe", pcmk_strerror(errno)); ++ crm_info("Wait for child process completion failed: %s " ++ CRM_XS " source=pipe", pcmk_strerror(errno)); + return false; + } + + rc = pcmk__set_nonblocking(data->pipe_fd[0]); + if (rc != pcmk_rc_ok) { +- crm_warn("Could not set pipe input non-blocking: %s " CRM_XS " rc=%d", ++ crm_info("Could not set pipe input non-blocking: %s " CRM_XS " rc=%d", + pcmk_rc_str(rc), rc); + } + rc = pcmk__set_nonblocking(data->pipe_fd[1]); + if (rc != pcmk_rc_ok) { +- crm_warn("Could not set pipe output non-blocking: %s " CRM_XS " rc=%d", ++ crm_info("Could not set pipe output non-blocking: %s " CRM_XS " rc=%d", + pcmk_rc_str(rc), rc); + } + +@@ -183,8 +183,8 @@ sigchld_setup(struct sigchld_data_s *data) + data->sa.sa_flags = 0; + sigemptyset(&(data->sa.sa_mask)); + if (sigaction(SIGCHLD, &(data->sa), &(data->old_sa)) < 0) { +- crm_err("Wait for child process completion failed: %s " +- CRM_XS " source=sigaction", pcmk_strerror(errno)); ++ crm_info("Wait for child process completion failed: %s " ++ CRM_XS " source=sigaction", pcmk_strerror(errno)); + } + + // Remember data for use in signal handler +@@ -585,7 +585,11 @@ log_op_output(svc_action_t *op) + { + char *prefix = crm_strdup_printf("%s[%d] error output", op->id, op->pid); + +- crm_log_output(LOG_NOTICE, prefix, op->stderr_data); ++ /* The library caller has better context to know how important the output ++ * is, so log it at info and debug severity here. They can log it again at ++ * higher severity if appropriate. 
++ */ ++ crm_log_output(LOG_INFO, prefix, op->stderr_data); + strcpy(prefix + strlen(prefix) - strlen("error output"), "output"); + crm_log_output(LOG_DEBUG, prefix, op->stdout_data); + free(prefix); +@@ -673,7 +677,7 @@ async_action_complete(mainloop_child_t *p, pid_t pid, int core, int signo, + parse_exit_reason_from_stderr(op); + + } else if (mainloop_child_timeout(p)) { +- crm_warn("%s[%d] timed out after %dms", op->id, op->pid, op->timeout); ++ crm_info("%s[%d] timed out after %dms", op->id, op->pid, op->timeout); + services__set_result(op, services__generic_error(op), PCMK_EXEC_TIMEOUT, + "Process did not exit within specified timeout"); + +@@ -686,7 +690,7 @@ async_action_complete(mainloop_child_t *p, pid_t pid, int core, int signo, + services__set_result(op, PCMK_OCF_OK, PCMK_EXEC_CANCELLED, NULL); + + } else { +- crm_warn("%s[%d] terminated with signal %d (%s)", ++ crm_info("%s[%d] terminated with signal %d (%s)", + op->id, op->pid, signo, strsignal(signo)); + services__set_result(op, PCMK_OCF_UNKNOWN_ERROR, PCMK_EXEC_ERROR, + "Process interrupted by signal"); +@@ -908,12 +912,12 @@ action_launch_child(svc_action_t *op) + sp.sched_priority = 0; + + if (sched_setscheduler(0, SCHED_OTHER, &sp) == -1) { +- crm_warn("Could not reset scheduling policy for %s", op->id); ++ crm_info("Could not reset scheduling policy for %s", op->id); + } + } + #endif + if (setpriority(PRIO_PROCESS, 0, 0) == -1) { +- crm_warn("Could not reset process priority for %s", op->id); ++ crm_info("Could not reset process priority for %s", op->id); + } + + /* Man: The call setpgrp() is equivalent to setpgid(0,0) +@@ -941,7 +945,7 @@ action_launch_child(svc_action_t *op) + } else { + crm_err("Considering %s unconfigured " + "because unable to load CIB secrets: %s", +- op->rsc, pcmk_rc_str(rc)); ++ op->rsc, pcmk_rc_str(rc)); + exit_child(op, services__configuration_error(op, false), + "Unable to load CIB secrets"); + } +@@ -1043,7 +1047,7 @@ wait_for_sync_result(svc_action_t *op, struct 
sigchld_data_s *data) + + } else if (wait_rc < 0) { + wait_reason = pcmk_rc_str(errno); +- crm_warn("Wait for completion of %s[%d] failed: %s " ++ crm_info("Wait for completion of %s[%d] failed: %s " + CRM_XS " source=waitpid", + op->id, op->pid, wait_reason); + wait_rc = 0; // Act as if process is still running +@@ -1057,8 +1061,8 @@ wait_for_sync_result(svc_action_t *op, struct sigchld_data_s *data) + + } else if ((poll_rc < 0) && (errno != EINTR)) { + wait_reason = pcmk_rc_str(errno); +- crm_err("Wait for completion of %s[%d] failed: %s " +- CRM_XS " source=poll", op->id, op->pid, wait_reason); ++ crm_info("Wait for completion of %s[%d] failed: %s " ++ CRM_XS " source=poll", op->id, op->pid, wait_reason); + break; + } + +@@ -1078,7 +1082,7 @@ wait_for_sync_result(svc_action_t *op, struct sigchld_data_s *data) + services__set_result(op, services__generic_error(op), + PCMK_EXEC_TIMEOUT, + "Process did not exit within specified timeout"); +- crm_warn("%s[%d] timed out after %dms", ++ crm_info("%s[%d] timed out after %dms", + op->id, op->pid, op->timeout); + + } else { +@@ -1110,8 +1114,8 @@ wait_for_sync_result(svc_action_t *op, struct sigchld_data_s *data) + + services__set_result(op, services__generic_error(op), PCMK_EXEC_ERROR, + "Process interrupted by signal"); +- crm_err("%s[%d] terminated with signal %d (%s)", +- op->id, op->pid, signo, strsignal(signo)); ++ crm_info("%s[%d] terminated with signal %d (%s)", ++ op->id, op->pid, signo, strsignal(signo)); + + #ifdef WCOREDUMP + if (WCOREDUMP(status)) { +@@ -1155,7 +1159,7 @@ services__execute_file(svc_action_t *op) + // Catch common failure conditions early + if (stat(op->opaque->exec, &st) != 0) { + rc = errno; +- crm_warn("Cannot execute '%s': %s " CRM_XS " stat rc=%d", ++ crm_info("Cannot execute '%s': %s " CRM_XS " stat rc=%d", + op->opaque->exec, pcmk_strerror(rc), rc); + services__handle_exec_error(op, rc); + goto done; +@@ -1163,8 +1167,8 @@ services__execute_file(svc_action_t *op) + + if 
(pipe(stdout_fd) < 0) { + rc = errno; +- crm_err("Cannot execute '%s': %s " CRM_XS " pipe(stdout) rc=%d", +- op->opaque->exec, pcmk_strerror(rc), rc); ++ crm_info("Cannot execute '%s': %s " CRM_XS " pipe(stdout) rc=%d", ++ op->opaque->exec, pcmk_strerror(rc), rc); + services__handle_exec_error(op, rc); + goto done; + } +@@ -1174,8 +1178,8 @@ services__execute_file(svc_action_t *op) + + close_pipe(stdout_fd); + +- crm_err("Cannot execute '%s': %s " CRM_XS " pipe(stderr) rc=%d", +- op->opaque->exec, pcmk_strerror(rc), rc); ++ crm_info("Cannot execute '%s': %s " CRM_XS " pipe(stderr) rc=%d", ++ op->opaque->exec, pcmk_strerror(rc), rc); + services__handle_exec_error(op, rc); + goto done; + } +@@ -1187,8 +1191,8 @@ services__execute_file(svc_action_t *op) + close_pipe(stdout_fd); + close_pipe(stderr_fd); + +- crm_err("Cannot execute '%s': %s " CRM_XS " pipe(stdin) rc=%d", +- op->opaque->exec, pcmk_strerror(rc), rc); ++ crm_info("Cannot execute '%s': %s " CRM_XS " pipe(stdin) rc=%d", ++ op->opaque->exec, pcmk_strerror(rc), rc); + services__handle_exec_error(op, rc); + goto done; + } +@@ -1212,8 +1216,8 @@ services__execute_file(svc_action_t *op) + close_pipe(stdout_fd); + close_pipe(stderr_fd); + +- crm_err("Cannot execute '%s': %s " CRM_XS " fork rc=%d", +- op->opaque->exec, pcmk_strerror(rc), rc); ++ crm_info("Cannot execute '%s': %s " CRM_XS " fork rc=%d", ++ op->opaque->exec, pcmk_strerror(rc), rc); + services__handle_exec_error(op, rc); + if (op->synchronous) { + sigchld_cleanup(&data); +@@ -1271,7 +1275,7 @@ services__execute_file(svc_action_t *op) + op->opaque->stdout_fd = stdout_fd[0]; + rc = pcmk__set_nonblocking(op->opaque->stdout_fd); + if (rc != pcmk_rc_ok) { +- crm_warn("Could not set '%s' output non-blocking: %s " ++ crm_info("Could not set '%s' output non-blocking: %s " + CRM_XS " rc=%d", + op->opaque->exec, pcmk_rc_str(rc), rc); + } +@@ -1279,7 +1283,7 @@ services__execute_file(svc_action_t *op) + op->opaque->stderr_fd = stderr_fd[0]; + rc = 
pcmk__set_nonblocking(op->opaque->stderr_fd); + if (rc != pcmk_rc_ok) { +- crm_warn("Could not set '%s' error output non-blocking: %s " ++ crm_info("Could not set '%s' error output non-blocking: %s " + CRM_XS " rc=%d", + op->opaque->exec, pcmk_rc_str(rc), rc); + } +@@ -1290,7 +1294,7 @@ services__execute_file(svc_action_t *op) + // as long as no other standard uses stdin_fd assume stonith + rc = pcmk__set_nonblocking(op->opaque->stdin_fd); + if (rc != pcmk_rc_ok) { +- crm_warn("Could not set '%s' input non-blocking: %s " ++ crm_info("Could not set '%s' input non-blocking: %s " + CRM_XS " fd=%d,rc=%d", op->opaque->exec, + pcmk_rc_str(rc), op->opaque->stdin_fd, rc); + } +diff --git a/lib/services/systemd.c b/lib/services/systemd.c +index 6f5bef960..8e9fff484 100644 +--- a/lib/services/systemd.c ++++ b/lib/services/systemd.c +@@ -232,7 +232,8 @@ systemd_daemon_reload_complete(DBusPendingCall *pending, void *user_data) + } + + if (pcmk_dbus_find_error(pending, reply, &error)) { +- crm_err("Could not issue systemd reload %d: %s", reload_count, error.message); ++ crm_warn("Could not issue systemd reload %d: %s", ++ reload_count, error.message); + dbus_error_free(&error); + + } else { +@@ -291,8 +292,8 @@ set_result_from_method_error(svc_action_t *op, const DBusError *error) + PCMK_EXEC_NOT_INSTALLED, "systemd unit not found"); + } + +- crm_err("DBus request for %s of systemd unit %s for resource %s failed: %s", +- op->action, op->agent, crm_str(op->rsc), error->message); ++ crm_info("DBus request for %s of systemd unit %s for resource %s failed: %s", ++ op->action, op->agent, crm_str(op->rsc), error->message); + } + + /*! 
+@@ -325,11 +326,11 @@ execute_after_loadunit(DBusMessage *reply, svc_action_t *op) + if (op != NULL) { + services__set_result(op, PCMK_OCF_UNKNOWN_ERROR, PCMK_EXEC_ERROR, + "systemd DBus method had unexpected reply"); +- crm_err("Could not load systemd unit %s for %s: " +- "DBus reply has unexpected type", op->agent, op->id); ++ crm_info("Could not load systemd unit %s for %s: " ++ "DBus reply has unexpected type", op->agent, op->id); + } else { +- crm_err("Could not load systemd unit: " +- "DBus reply has unexpected type"); ++ crm_info("Could not load systemd unit: " ++ "DBus reply has unexpected type"); + } + + } else { +@@ -688,7 +689,7 @@ process_unit_method_reply(DBusMessage *reply, svc_action_t *op) + + } else if (!pcmk_dbus_type_check(reply, NULL, DBUS_TYPE_OBJECT_PATH, + __func__, __LINE__)) { +- crm_warn("DBus request for %s of %s succeeded but " ++ crm_info("DBus request for %s of %s succeeded but " + "return type was unexpected", op->action, crm_str(op->rsc)); + services__set_result(op, PCMK_OCF_OK, PCMK_EXEC_DONE, + "systemd DBus method had unexpected reply"); +@@ -981,7 +982,8 @@ systemd_timeout_callback(gpointer p) + svc_action_t * op = p; + + op->opaque->timerid = 0; +- crm_warn("%s operation on systemd unit %s named '%s' timed out", op->action, op->agent, op->rsc); ++ crm_info("%s action for systemd unit %s named '%s' timed out", ++ op->action, op->agent, op->rsc); + services__set_result(op, PCMK_OCF_UNKNOWN_ERROR, PCMK_EXEC_TIMEOUT, + "Systemd action did not complete within specified timeout"); + services__finalize_async_op(op); +diff --git a/lib/services/upstart.c b/lib/services/upstart.c +index 2fdc229ad..2ece803e1 100644 +--- a/lib/services/upstart.c ++++ b/lib/services/upstart.c +@@ -308,21 +308,21 @@ get_first_instance(const gchar * job, int timeout) + dbus_message_unref(msg); + + if (dbus_error_is_set(&error)) { +- crm_err("Call to %s failed: %s", method, error.message); ++ crm_info("Call to %s failed: %s", method, error.message); + 
dbus_error_free(&error); + goto done; + + } else if(reply == NULL) { +- crm_err("Call to %s failed: no reply", method); ++ crm_info("Call to %s failed: no reply", method); + goto done; + + } else if (!dbus_message_iter_init(reply, &args)) { +- crm_err("Call to %s failed: Message has no arguments", method); ++ crm_info("Call to %s failed: Message has no arguments", method); + goto done; + } + + if(!pcmk_dbus_type_check(reply, &args, DBUS_TYPE_ARRAY, __func__, __LINE__)) { +- crm_err("Call to %s failed: Message has invalid arguments", method); ++ crm_info("Call to %s failed: Message has invalid arguments", method); + goto done; + } + +@@ -432,8 +432,8 @@ set_result_from_method_error(svc_action_t *op, const DBusError *error) + return; + } + +- crm_err("DBus request for %s of Upstart job %s for resource %s failed: %s", +- op->action, op->agent, crm_str(op->rsc), error->message); ++ crm_info("DBus request for %s of Upstart job %s for resource %s failed: %s", ++ op->action, op->agent, crm_str(op->rsc), error->message); + } + + /*! 
+@@ -468,7 +468,7 @@ job_method_complete(DBusPendingCall *pending, void *user_data) + + } else if (!pcmk_dbus_type_check(reply, NULL, DBUS_TYPE_OBJECT_PATH, + __func__, __LINE__)) { +- crm_warn("DBus request for %s of %s succeeded but " ++ crm_info("DBus request for %s of %s succeeded but " + "return type was unexpected", op->action, crm_str(op->rsc)); + services__set_result(op, PCMK_OCF_OK, PCMK_EXEC_DONE, NULL); + +@@ -667,7 +667,8 @@ services__execute_upstart(svc_action_t *op) + + } else if (!pcmk_dbus_type_check(reply, NULL, DBUS_TYPE_OBJECT_PATH, + __func__, __LINE__)) { +- crm_warn("Call to %s passed but return type was unexpected", op->action); ++ crm_info("Call to %s passed but return type was unexpected", ++ op->action); + services__set_result(op, PCMK_OCF_OK, PCMK_EXEC_DONE, NULL); + + } else { +@@ -675,7 +676,7 @@ services__execute_upstart(svc_action_t *op) + + dbus_message_get_args(reply, NULL, DBUS_TYPE_OBJECT_PATH, &path, + DBUS_TYPE_INVALID); +- crm_info("Call to %s passed: %s", op->action, path); ++ crm_debug("Call to %s passed: %s", op->action, path); + services__set_result(op, PCMK_OCF_OK, PCMK_EXEC_DONE, NULL); + } + +-- +2.27.0 + + +From 39f6861c72eb9dd76d2cf3da287fe7485615631b Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Mon, 8 Nov 2021 09:43:38 -0600 +Subject: [PATCH 12/12] Low: fencing: avoid use-after-free with new result + object + +introduced by 153c9b552 (not released) +--- + lib/fencing/st_rhcs.c | 6 ++++-- + 1 file changed, 4 insertions(+), 2 deletions(-) + +diff --git a/lib/fencing/st_rhcs.c b/lib/fencing/st_rhcs.c +index 23e694975..6c8cbedc7 100644 +--- a/lib/fencing/st_rhcs.c ++++ b/lib/fencing/st_rhcs.c +@@ -143,15 +143,17 @@ stonith__rhcs_get_metadata(const char *agent, int timeout, xmlNode **metadata) + if (result->execution_status != PCMK_EXEC_DONE) { + crm_warn("Could not execute metadata action for %s: %s", + agent, pcmk_exec_status_str(result->execution_status)); ++ rc = pcmk_rc2legacy(stonith__result2rc(result)); + 
stonith__destroy_action(action); +- return pcmk_rc2legacy(stonith__result2rc(result)); ++ return rc; + } + + if (result->exit_status != CRM_EX_OK) { + crm_warn("Metadata action for %s returned error code %d", + agent, result->exit_status); ++ rc = pcmk_rc2legacy(stonith__result2rc(result)); + stonith__destroy_action(action); +- return pcmk_rc2legacy(stonith__result2rc(result)); ++ return rc; + } + + if (result->action_stdout == NULL) { +-- +2.27.0 + diff --git a/SOURCES/003-fencing-reasons.patch b/SOURCES/003-fencing-reasons.patch new file mode 100644 index 0000000..666a12a --- /dev/null +++ b/SOURCES/003-fencing-reasons.patch @@ -0,0 +1,2476 @@ +From 8e6362cb2129bd56f817d449a195f3da87a545fa Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Fri, 12 Nov 2021 14:28:56 -0600 +Subject: [PATCH 01/13] Refactor: libcrmcommon,fencer: convenience macro for + initializing results + +for future reuse +--- + daemons/fenced/fenced_commands.c | 14 ++------------ + include/crm/common/results_internal.h | 15 +++++++++++++++ + 2 files changed, 17 insertions(+), 12 deletions(-) + +diff --git a/daemons/fenced/fenced_commands.c b/daemons/fenced/fenced_commands.c +index 87600573e..9f2f1cc40 100644 +--- a/daemons/fenced/fenced_commands.c ++++ b/daemons/fenced/fenced_commands.c +@@ -388,12 +388,7 @@ static void + report_internal_result(async_command_t *cmd, int exit_status, + int execution_status, const char *exit_reason) + { +- pcmk__action_result_t result = { +- // Ensure we don't pass garbage to free() +- .exit_reason = NULL, +- .action_stdout = NULL, +- .action_stderr = NULL +- }; ++ pcmk__action_result_t result = PCMK__UNKNOWN_RESULT; + + pcmk__set_result(&result, exit_status, execution_status, exit_reason); + cmd->done_cb(0, &result, cmd); +@@ -2616,12 +2611,7 @@ stonith_fence_get_devices_cb(GList * devices, void *user_data) + } + + if (device == NULL) { // No device found +- pcmk__action_result_t result = { +- // Ensure we don't pass garbage to free() +- .exit_reason = NULL, +- 
.action_stdout = NULL, +- .action_stderr = NULL +- }; ++ pcmk__action_result_t result = PCMK__UNKNOWN_RESULT; + + pcmk__set_result(&result, CRM_EX_ERROR, PCMK_EXEC_NO_FENCE_DEVICE, + "No fence device configured for target"); +diff --git a/include/crm/common/results_internal.h b/include/crm/common/results_internal.h +index 804bf2a7a..6befaa0ed 100644 +--- a/include/crm/common/results_internal.h ++++ b/include/crm/common/results_internal.h +@@ -30,6 +30,21 @@ typedef struct { + char *action_stderr; // Action error output + } pcmk__action_result_t; + ++/*! ++ * \internal ++ * \brief Static initialization for an action result ++ * ++ * \note Importantly, this ensures pcmk__reset_result() won't try to free ++ * garbage. ++ */ ++#define PCMK__UNKNOWN_RESULT { \ ++ .exit_status = CRM_EX_OK, \ ++ .execution_status = PCMK_EXEC_UNKNOWN, \ ++ .exit_reason = NULL, \ ++ .action_stdout = NULL, \ ++ .action_stderr = NULL, \ ++ } ++ + void pcmk__set_result(pcmk__action_result_t *result, int exit_status, + enum pcmk_exec_status exec_status, + const char *exit_reason); +-- +2.27.0 + + +From 0937c92476ac737a5f5146932824bde8bdd7db98 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Fri, 12 Nov 2021 16:02:27 -0600 +Subject: [PATCH 02/13] Refactor: various: add convenience function for + checking result success + +A successful pcmk__action_result_t has both exit status CRM_EX_OK (a.k.a +PCMK_OCF_OK) and execution status PCMK_EXEC_DONE. Since checking that is +clunky, we sometimes just check exit status, which is less than ideal. + +The convenience function makes it easy to check both, and improves readability. 
+--- + daemons/controld/controld_remote_ra.c | 4 ++-- + daemons/execd/execd_commands.c | 12 ++++++------ + daemons/fenced/fenced_commands.c | 14 ++++++-------- + include/crm/common/results_internal.h | 16 ++++++++++++++++ + lib/fencing/st_client.c | 4 ++-- + lib/fencing/st_rhcs.c | 2 +- + 6 files changed, 33 insertions(+), 19 deletions(-) + +diff --git a/daemons/controld/controld_remote_ra.c b/daemons/controld/controld_remote_ra.c +index 74cbfd673..55ac162c7 100644 +--- a/daemons/controld/controld_remote_ra.c ++++ b/daemons/controld/controld_remote_ra.c +@@ -297,7 +297,7 @@ static void + check_remote_node_state(remote_ra_cmd_t *cmd) + { + /* Only successful actions can change node state */ +- if (cmd->result.exit_status != PCMK_OCF_OK) { ++ if (!pcmk__result_ok(&(cmd->result))) { + return; + } + +@@ -365,7 +365,7 @@ report_remote_ra_result(remote_ra_cmd_t * cmd) + lrmd__set_result(&op, cmd->result.exit_status, cmd->result.execution_status, + cmd->result.exit_reason); + +- if (cmd->reported_success && (cmd->result.exit_status != PCMK_OCF_OK)) { ++ if (cmd->reported_success && !pcmk__result_ok(&(cmd->result))) { + op.t_rcchange = (unsigned int) time(NULL); + /* This edge case will likely never ever occur, but if it does the + * result is that a failure will not be processed correctly. This is only +diff --git a/daemons/execd/execd_commands.c b/daemons/execd/execd_commands.c +index 667525039..02070bf11 100644 +--- a/daemons/execd/execd_commands.c ++++ b/daemons/execd/execd_commands.c +@@ -878,7 +878,7 @@ action_complete(svc_action_t * action) + } + + if (pcmk__str_eq(rclass, PCMK_RESOURCE_CLASS_SYSTEMD, pcmk__str_casei)) { +- if ((cmd->result.exit_status == PCMK_OCF_OK) ++ if (pcmk__result_ok(&(cmd->result)) + && pcmk__strcase_any_of(cmd->action, "start", "stop", NULL)) { + /* systemd returns from start and stop actions after the action + * begins, not after it completes. 
We have to jump through a few +@@ -894,7 +894,7 @@ action_complete(svc_action_t * action) + if (cmd->result.execution_status == PCMK_EXEC_PENDING) { + goagain = true; + +- } else if ((cmd->result.exit_status == PCMK_OCF_OK) ++ } else if (pcmk__result_ok(&(cmd->result)) + && pcmk__str_eq(cmd->real_action, "stop", pcmk__str_casei)) { + goagain = true; + +@@ -927,12 +927,12 @@ action_complete(svc_action_t * action) + #if SUPPORT_NAGIOS + if (rsc && pcmk__str_eq(rsc->class, PCMK_RESOURCE_CLASS_NAGIOS, pcmk__str_casei)) { + if (action_matches(cmd, "monitor", 0) +- && (cmd->result.exit_status == PCMK_OCF_OK)) { ++ && pcmk__result_ok(&(cmd->result))) { + /* Successfully executed --version for the nagios plugin */ + cmd->result.exit_status = PCMK_OCF_NOT_RUNNING; + + } else if (pcmk__str_eq(cmd->action, "start", pcmk__str_casei) +- && (cmd->result.exit_status != PCMK_OCF_OK)) { ++ && !pcmk__result_ok(&(cmd->result))) { + #ifdef PCMK__TIME_USE_CGT + goagain = true; + #endif +@@ -955,7 +955,7 @@ action_complete(svc_action_t * action) + cmd->start_delay = delay; + cmd->timeout = timeout_left; + +- if (cmd->result.exit_status == PCMK_OCF_OK) { ++ if (pcmk__result_ok(&(cmd->result))) { + crm_debug("%s %s may still be in progress: re-scheduling (elapsed=%dms, remaining=%dms, start_delay=%dms)", + cmd->rsc_id, cmd->real_action, time_sum, timeout_left, delay); + +@@ -1066,7 +1066,7 @@ stonith_action_complete(lrmd_cmd_t * cmd, int rc) + cmd->interval_ms, rc); + + // Certain successful actions change the known state of the resource +- if ((rsc != NULL) && (cmd->result.exit_status == PCMK_OCF_OK)) { ++ if ((rsc != NULL) && pcmk__result_ok(&(cmd->result))) { + if (pcmk__str_eq(cmd->action, "start", pcmk__str_casei)) { + rsc->st_probe_rc = pcmk_ok; // maps to PCMK_OCF_OK + } else if (pcmk__str_eq(cmd->action, "stop", pcmk__str_casei)) { +diff --git a/daemons/fenced/fenced_commands.c b/daemons/fenced/fenced_commands.c +index 9f2f1cc40..26501a4b3 100644 +--- 
a/daemons/fenced/fenced_commands.c ++++ b/daemons/fenced/fenced_commands.c +@@ -1188,8 +1188,7 @@ dynamic_list_search_cb(int pid, const pcmk__action_result_t *result, + + mainloop_set_trigger(dev->work); + +- if ((result->execution_status == PCMK_EXEC_DONE) +- && (result->exit_status == CRM_EX_OK)) { ++ if (pcmk__result_ok(result)) { + crm_info("Refreshing target list for %s", dev->id); + g_list_free_full(dev->targets, free); + dev->targets = stonith__parse_targets(result->action_stdout); +@@ -2310,15 +2309,14 @@ log_async_result(async_command_t *cmd, const pcmk__action_result_t *result, + GString *msg = g_string_sized_new(80); // Reasonable starting size + + // Choose log levels appropriately if we have a result +- if ((result->execution_status == PCMK_EXEC_DONE) +- && (result->exit_status == CRM_EX_OK)) { // Success ++ if (pcmk__result_ok(result)) { + log_level = (cmd->victim == NULL)? LOG_DEBUG : LOG_NOTICE; + if ((result->action_stdout != NULL) + && !pcmk__str_eq(cmd->action, "metadata", pcmk__str_casei)) { + output_log_level = LOG_DEBUG; + } + next = NULL; +- } else { // Failure ++ } else { + log_level = (cmd->victim == NULL)? 
LOG_NOTICE : LOG_ERR; + if ((result->action_stdout != NULL) + && !pcmk__str_eq(cmd->action, "metadata", pcmk__str_casei)) { +@@ -2482,7 +2480,7 @@ st_child_done(int pid, const pcmk__action_result_t *result, void *user_data) + /* The device is ready to do something else now */ + device = g_hash_table_lookup(device_list, cmd->device); + if (device) { +- if (!device->verified && (result->exit_status == CRM_EX_OK) && ++ if (!device->verified && pcmk__result_ok(result) && + (pcmk__strcase_any_of(cmd->action, "list", "monitor", "status", NULL))) { + + device->verified = TRUE; +@@ -2491,7 +2489,7 @@ st_child_done(int pid, const pcmk__action_result_t *result, void *user_data) + mainloop_set_trigger(device->work); + } + +- if (result->exit_status == CRM_EX_OK) { ++ if (pcmk__result_ok(result)) { + GList *iter; + /* see if there are any required devices left to execute for this op */ + for (iter = cmd->device_next; iter != NULL; iter = iter->next) { +@@ -2523,7 +2521,7 @@ st_child_done(int pid, const pcmk__action_result_t *result, void *user_data) + + send_async_reply(cmd, result, pid, false); + +- if (result->exit_status != CRM_EX_OK) { ++ if (!pcmk__result_ok(result)) { + goto done; + } + +diff --git a/include/crm/common/results_internal.h b/include/crm/common/results_internal.h +index 6befaa0ed..0c5833937 100644 +--- a/include/crm/common/results_internal.h ++++ b/include/crm/common/results_internal.h +@@ -54,4 +54,20 @@ void pcmk__set_result_output(pcmk__action_result_t *result, + + void pcmk__reset_result(pcmk__action_result_t *result); + ++/*! 
++ * \internal ++ * \brief Check whether a result is OK ++ * ++ * \param[in] result ++ * ++ * \return true if the result's exit status is CRM_EX_OK and its ++ * execution status is PCMK_EXEC_DONE, otherwise false ++ */ ++static inline bool ++pcmk__result_ok(const pcmk__action_result_t *result) ++{ ++ return (result != NULL) && (result->exit_status == CRM_EX_OK) ++ && (result->execution_status == PCMK_EXEC_DONE); ++} ++ + #endif // PCMK__COMMON_RESULTS_INTERNAL__H +diff --git a/lib/fencing/st_client.c b/lib/fencing/st_client.c +index 2fbff7f24..af461d0d4 100644 +--- a/lib/fencing/st_client.c ++++ b/lib/fencing/st_client.c +@@ -760,7 +760,7 @@ stonith__result2rc(const pcmk__action_result_t *result) + default: break; + } + +- if (result->exit_status == CRM_EX_OK) { ++ if (pcmk__result_ok(result)) { + return pcmk_rc_ok; + } + +@@ -797,7 +797,7 @@ stonith_action_async_done(svc_action_t *svc_action) + + log_action(action, action->pid); + +- if ((action->result.exit_status != CRM_EX_OK) ++ if (!pcmk__result_ok(&(action->result)) + && update_remaining_timeout(action)) { + + int rc = internal_stonith_action_execute(action); +diff --git a/lib/fencing/st_rhcs.c b/lib/fencing/st_rhcs.c +index 6c8cbedc7..865e04bc2 100644 +--- a/lib/fencing/st_rhcs.c ++++ b/lib/fencing/st_rhcs.c +@@ -148,7 +148,7 @@ stonith__rhcs_get_metadata(const char *agent, int timeout, xmlNode **metadata) + return rc; + } + +- if (result->exit_status != CRM_EX_OK) { ++ if (!pcmk__result_ok(result)) { + crm_warn("Metadata action for %s returned error code %d", + agent, result->exit_status); + rc = pcmk_rc2legacy(stonith__result2rc(result)); +-- +2.27.0 + + +From 4c39ff00a0c028354a9da7f80986f7e34b05ba08 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Fri, 12 Nov 2021 16:07:01 -0600 +Subject: [PATCH 03/13] Low: fencing: improve mapping of execution status to + legacy return code + +PCMK_EXEC_PENDING is likely not possible with the current code, but map it to +EINPROGRESS for completeness. 
+ +PCMK_EXEC_INVALID is not yet used by the fencer but will be. +--- + lib/fencing/st_client.c | 30 ++++++++++++++++++++++++++---- + 1 file changed, 26 insertions(+), 4 deletions(-) + +diff --git a/lib/fencing/st_client.c b/lib/fencing/st_client.c +index af461d0d4..93513e9f3 100644 +--- a/lib/fencing/st_client.c ++++ b/lib/fencing/st_client.c +@@ -749,7 +749,12 @@ update_remaining_timeout(stonith_action_t * action) + int + stonith__result2rc(const pcmk__action_result_t *result) + { ++ if (pcmk__result_ok(result)) { ++ return pcmk_rc_ok; ++ } ++ + switch (result->execution_status) { ++ case PCMK_EXEC_PENDING: return EINPROGRESS; + case PCMK_EXEC_CANCELLED: return ECANCELED; + case PCMK_EXEC_TIMEOUT: return ETIME; + case PCMK_EXEC_NOT_INSTALLED: return ENOENT; +@@ -757,11 +762,28 @@ stonith__result2rc(const pcmk__action_result_t *result) + case PCMK_EXEC_NOT_CONNECTED: return ENOTCONN; + case PCMK_EXEC_NO_FENCE_DEVICE: return ENODEV; + case PCMK_EXEC_NO_SECRETS: return EACCES; +- default: break; +- } + +- if (pcmk__result_ok(result)) { +- return pcmk_rc_ok; ++ /* For the fencing API, PCMK_EXEC_INVALID is used with fencer API ++ * operations that don't involve executing an agent (for example, ++ * registering devices). This allows us to use the CRM_EX_* codes in the ++ * exit status for finer-grained responses. ++ */ ++ case PCMK_EXEC_INVALID: ++ switch (result->exit_status) { ++ case CRM_EX_INSUFFICIENT_PRIV: return EACCES; ++ case CRM_EX_PROTOCOL: return EPROTO; ++ ++ /* CRM_EX_EXPIRED is used for orphaned fencing operations left ++ * over from a previous instance of the fencer. For API backward ++ * compatibility, this is mapped to the previously used code for ++ * this case, EHOSTUNREACH. 
++ */ ++ case CRM_EX_EXPIRED: return EHOSTUNREACH; ++ default: break; ++ } ++ ++ default: ++ break; + } + + // Try to provide useful error code based on result's error output +-- +2.27.0 + + +From 4e638783d1cd7c9398a603fc6df7e9d868262b16 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Thu, 18 Nov 2021 11:41:12 -0600 +Subject: [PATCH 04/13] Refactor: libstonithd: separate action-related code + into own source file + +Everything related to stonith_action_t has been moved from st_client.c to a new +st_actions.c, since st_client.c was ridiculously large, and the action stuff +isn't all client-related. No code was changed. + +Before: + 2804 st_client.c + +After: + 545 lib/fencing/st_actions.c + 2278 lib/fencing/st_client.c +--- + lib/fencing/Makefile.am | 2 +- + lib/fencing/st_actions.c | 545 +++++++++++++++++++++++++++++++++++++++ + lib/fencing/st_client.c | 528 +------------------------------------ + 3 files changed, 547 insertions(+), 528 deletions(-) + create mode 100644 lib/fencing/st_actions.c + +diff --git a/lib/fencing/Makefile.am b/lib/fencing/Makefile.am +index 205c4873d..dac215c16 100644 +--- a/lib/fencing/Makefile.am ++++ b/lib/fencing/Makefile.am +@@ -22,7 +22,7 @@ libstonithd_la_LDFLAGS += $(LDFLAGS_HARDENED_LIB) + libstonithd_la_LIBADD = $(top_builddir)/lib/common/libcrmcommon.la + libstonithd_la_LIBADD += $(top_builddir)/lib/services/libcrmservice.la + +-libstonithd_la_SOURCES = st_client.c st_output.c st_rhcs.c ++libstonithd_la_SOURCES = st_actions.c st_client.c st_output.c st_rhcs.c + if BUILD_LHA_SUPPORT + libstonithd_la_SOURCES += st_lha.c + endif +diff --git a/lib/fencing/st_actions.c b/lib/fencing/st_actions.c +new file mode 100644 +index 000000000..64d3afd5d +--- /dev/null ++++ b/lib/fencing/st_actions.c +@@ -0,0 +1,545 @@ ++/* ++ * Copyright 2004-2021 the Pacemaker project contributors ++ * ++ * The version control history for this file may have further details. 
++ * ++ * This source code is licensed under the GNU Lesser General Public License ++ * version 2.1 or later (LGPLv2.1+) WITHOUT ANY WARRANTY. ++ */ ++ ++#include ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++#include ++#include ++#include ++ ++#include "fencing_private.h" ++ ++struct stonith_action_s { ++ /*! user defined data */ ++ char *agent; ++ char *action; ++ char *victim; ++ GHashTable *args; ++ int timeout; ++ int async; ++ void *userdata; ++ void (*done_cb) (int pid, const pcmk__action_result_t *result, ++ void *user_data); ++ void (*fork_cb) (int pid, void *user_data); ++ ++ svc_action_t *svc_action; ++ ++ /*! internal timing information */ ++ time_t initial_start_time; ++ int tries; ++ int remaining_timeout; ++ int max_retries; ++ ++ int pid; ++ pcmk__action_result_t result; ++}; ++ ++static int internal_stonith_action_execute(stonith_action_t *action); ++static void log_action(stonith_action_t *action, pid_t pid); ++ ++/*! ++ * \internal ++ * \brief Set an action's result based on services library result ++ * ++ * \param[in] action Fence action to set result for ++ * \param[in] svc_action Service action to get result from ++ */ ++static void ++set_result_from_svc_action(stonith_action_t *action, svc_action_t *svc_action) ++{ ++ pcmk__set_result(&(action->result), svc_action->rc, svc_action->status, ++ services__exit_reason(svc_action)); ++ pcmk__set_result_output(&(action->result), ++ services__grab_stdout(svc_action), ++ services__grab_stderr(svc_action)); ++} ++ ++static void ++log_action(stonith_action_t *action, pid_t pid) ++{ ++ /* The services library has already logged the output at info or debug ++ * level, so just raise to warning for stderr. 
++ */ ++ if (action->result.action_stderr != NULL) { ++ /* Logging the whole string confuses syslog when the string is xml */ ++ char *prefix = crm_strdup_printf("%s[%d] stderr:", action->agent, pid); ++ ++ crm_log_output(LOG_WARNING, prefix, action->result.action_stderr); ++ free(prefix); ++ } ++} ++ ++static void ++append_config_arg(gpointer key, gpointer value, gpointer user_data) ++{ ++ /* The fencer will filter "action" out when it registers the device, ++ * but ignore it here in case any external API users don't. ++ * ++ * Also filter out parameters handled directly by Pacemaker. ++ */ ++ if (!pcmk__str_eq(key, STONITH_ATTR_ACTION_OP, pcmk__str_casei) ++ && !pcmk_stonith_param(key) ++ && (strstr(key, CRM_META) == NULL) ++ && !pcmk__str_eq(key, "crm_feature_set", pcmk__str_casei)) { ++ ++ crm_trace("Passing %s=%s with fence action", ++ (const char *) key, (const char *) (value? value : "")); ++ g_hash_table_insert((GHashTable *) user_data, ++ strdup(key), strdup(value? value : "")); ++ } ++} ++ ++static GHashTable * ++make_args(const char *agent, const char *action, const char *victim, ++ uint32_t victim_nodeid, GHashTable * device_args, ++ GHashTable * port_map, const char *host_arg) ++{ ++ GHashTable *arg_list = NULL; ++ const char *value = NULL; ++ ++ CRM_CHECK(action != NULL, return NULL); ++ ++ arg_list = pcmk__strkey_table(free, free); ++ ++ // Add action to arguments (using an alias if requested) ++ if (device_args) { ++ char buffer[512]; ++ ++ snprintf(buffer, sizeof(buffer), "pcmk_%s_action", action); ++ value = g_hash_table_lookup(device_args, buffer); ++ if (value) { ++ crm_debug("Substituting '%s' for fence action %s targeting %s", ++ value, action, victim); ++ action = value; ++ } ++ } ++ g_hash_table_insert(arg_list, strdup(STONITH_ATTR_ACTION_OP), ++ strdup(action)); ++ ++ /* If this is a fencing operation against another node, add more standard ++ * arguments. 
++ */ ++ if (victim && device_args) { ++ const char *param = NULL; ++ ++ /* Always pass the target's name, per ++ * https://github.com/ClusterLabs/fence-agents/blob/master/doc/FenceAgentAPI.md ++ */ ++ g_hash_table_insert(arg_list, strdup("nodename"), strdup(victim)); ++ ++ // If the target's node ID was specified, pass it, too ++ if (victim_nodeid) { ++ char *nodeid = crm_strdup_printf("%" PRIu32, victim_nodeid); ++ ++ // cts-fencing looks for this log message ++ crm_info("Passing '%s' as nodeid with fence action '%s' targeting %s", ++ nodeid, action, victim); ++ g_hash_table_insert(arg_list, strdup("nodeid"), nodeid); ++ } ++ ++ // Check whether target must be specified in some other way ++ param = g_hash_table_lookup(device_args, PCMK_STONITH_HOST_ARGUMENT); ++ if (!pcmk__str_eq(agent, "fence_legacy", pcmk__str_none) ++ && !pcmk__str_eq(param, "none", pcmk__str_casei)) { ++ ++ if (param == NULL) { ++ /* Use the caller's default for pcmk_host_argument, or "port" if ++ * none was given ++ */ ++ param = (host_arg == NULL)? "port" : host_arg; ++ } ++ value = g_hash_table_lookup(device_args, param); ++ ++ if (pcmk__str_eq(value, "dynamic", ++ pcmk__str_casei|pcmk__str_null_matches)) { ++ /* If the host argument was "dynamic" or not explicitly specified, ++ * add it with the target ++ */ ++ const char *alias = NULL; ++ ++ if (port_map) { ++ alias = g_hash_table_lookup(port_map, victim); ++ } ++ if (alias == NULL) { ++ alias = victim; ++ } ++ crm_debug("Passing %s='%s' with fence action %s targeting %s", ++ param, alias, action, victim); ++ g_hash_table_insert(arg_list, strdup(param), strdup(alias)); ++ } ++ } ++ } ++ ++ if (device_args) { ++ g_hash_table_foreach(device_args, append_config_arg, arg_list); ++ } ++ ++ return arg_list; ++} ++ ++/*! 
++ * \internal ++ * \brief Free all memory used by a stonith action ++ * ++ * \param[in,out] action Action to free ++ */ ++void ++stonith__destroy_action(stonith_action_t *action) ++{ ++ if (action) { ++ free(action->agent); ++ if (action->args) { ++ g_hash_table_destroy(action->args); ++ } ++ free(action->action); ++ free(action->victim); ++ if (action->svc_action) { ++ services_action_free(action->svc_action); ++ } ++ pcmk__reset_result(&(action->result)); ++ free(action); ++ } ++} ++ ++/*! ++ * \internal ++ * \brief Get the result of an executed stonith action ++ * ++ * \param[in] action Executed action ++ * ++ * \return Pointer to action's result (or NULL if \p action is NULL) ++ */ ++pcmk__action_result_t * ++stonith__action_result(stonith_action_t *action) ++{ ++ return (action == NULL)? NULL : &(action->result); ++} ++ ++#define FAILURE_MAX_RETRIES 2 ++stonith_action_t * ++stonith_action_create(const char *agent, ++ const char *_action, ++ const char *victim, ++ uint32_t victim_nodeid, ++ int timeout, GHashTable * device_args, ++ GHashTable * port_map, const char *host_arg) ++{ ++ stonith_action_t *action; ++ ++ action = calloc(1, sizeof(stonith_action_t)); ++ action->args = make_args(agent, _action, victim, victim_nodeid, ++ device_args, port_map, host_arg); ++ crm_debug("Preparing '%s' action for %s using agent %s", ++ _action, (victim? 
victim : "no target"), agent); ++ action->agent = strdup(agent); ++ action->action = strdup(_action); ++ if (victim) { ++ action->victim = strdup(victim); ++ } ++ action->timeout = action->remaining_timeout = timeout; ++ action->max_retries = FAILURE_MAX_RETRIES; ++ ++ pcmk__set_result(&(action->result), PCMK_OCF_UNKNOWN, PCMK_EXEC_UNKNOWN, ++ "Initialization bug in fencing library"); ++ ++ if (device_args) { ++ char buffer[512]; ++ const char *value = NULL; ++ ++ snprintf(buffer, sizeof(buffer), "pcmk_%s_retries", _action); ++ value = g_hash_table_lookup(device_args, buffer); ++ ++ if (value) { ++ action->max_retries = atoi(value); ++ } ++ } ++ ++ return action; ++} ++ ++static gboolean ++update_remaining_timeout(stonith_action_t * action) ++{ ++ int diff = time(NULL) - action->initial_start_time; ++ ++ if (action->tries >= action->max_retries) { ++ crm_info("Attempted to execute agent %s (%s) the maximum number of times (%d) allowed", ++ action->agent, action->action, action->max_retries); ++ action->remaining_timeout = 0; ++ } else if ((action->result.execution_status != PCMK_EXEC_TIMEOUT) ++ && (diff < (action->timeout * 0.7))) { ++ /* only set remaining timeout period if there is 30% ++ * or greater of the original timeout period left */ ++ action->remaining_timeout = action->timeout - diff; ++ } else { ++ action->remaining_timeout = 0; ++ } ++ return action->remaining_timeout ? TRUE : FALSE; ++} ++ ++/*! 
++ * \internal ++ * \brief Map a fencing action result to a standard return code ++ * ++ * \param[in] result Fencing action result to map ++ * ++ * \return Standard Pacemaker return code that best corresponds to \p result ++ */ ++int ++stonith__result2rc(const pcmk__action_result_t *result) ++{ ++ if (pcmk__result_ok(result)) { ++ return pcmk_rc_ok; ++ } ++ ++ switch (result->execution_status) { ++ case PCMK_EXEC_PENDING: return EINPROGRESS; ++ case PCMK_EXEC_CANCELLED: return ECANCELED; ++ case PCMK_EXEC_TIMEOUT: return ETIME; ++ case PCMK_EXEC_NOT_INSTALLED: return ENOENT; ++ case PCMK_EXEC_NOT_SUPPORTED: return EOPNOTSUPP; ++ case PCMK_EXEC_NOT_CONNECTED: return ENOTCONN; ++ case PCMK_EXEC_NO_FENCE_DEVICE: return ENODEV; ++ case PCMK_EXEC_NO_SECRETS: return EACCES; ++ ++ /* For the fencing API, PCMK_EXEC_INVALID is used with fencer API ++ * operations that don't involve executing an agent (for example, ++ * registering devices). This allows us to use the CRM_EX_* codes in the ++ * exit status for finer-grained responses. ++ */ ++ case PCMK_EXEC_INVALID: ++ switch (result->exit_status) { ++ case CRM_EX_INSUFFICIENT_PRIV: return EACCES; ++ case CRM_EX_PROTOCOL: return EPROTO; ++ ++ /* CRM_EX_EXPIRED is used for orphaned fencing operations left ++ * over from a previous instance of the fencer. For API backward ++ * compatibility, this is mapped to the previously used code for ++ * this case, EHOSTUNREACH. 
++ */ ++ case CRM_EX_EXPIRED: return EHOSTUNREACH; ++ default: break; ++ } ++ ++ default: ++ break; ++ } ++ ++ // Try to provide useful error code based on result's error output ++ ++ if (result->action_stderr == NULL) { ++ return ENODATA; ++ ++ } else if (strcasestr(result->action_stderr, "timed out") ++ || strcasestr(result->action_stderr, "timeout")) { ++ return ETIME; ++ ++ } else if (strcasestr(result->action_stderr, "unrecognised action") ++ || strcasestr(result->action_stderr, "unrecognized action") ++ || strcasestr(result->action_stderr, "unsupported action")) { ++ return EOPNOTSUPP; ++ } ++ ++ // Oh well, we tried ++ return pcmk_rc_error; ++} ++ ++static void ++stonith_action_async_done(svc_action_t *svc_action) ++{ ++ stonith_action_t *action = (stonith_action_t *) svc_action->cb_data; ++ ++ set_result_from_svc_action(action, svc_action); ++ ++ svc_action->params = NULL; ++ ++ crm_debug("Child process %d performing action '%s' exited with rc %d", ++ action->pid, action->action, svc_action->rc); ++ ++ log_action(action, action->pid); ++ ++ if (!pcmk__result_ok(&(action->result)) ++ && update_remaining_timeout(action)) { ++ ++ int rc = internal_stonith_action_execute(action); ++ if (rc == pcmk_ok) { ++ return; ++ } ++ } ++ ++ if (action->done_cb) { ++ action->done_cb(action->pid, &(action->result), action->userdata); ++ } ++ ++ action->svc_action = NULL; // don't remove our caller ++ stonith__destroy_action(action); ++} ++ ++static void ++stonith_action_async_forked(svc_action_t *svc_action) ++{ ++ stonith_action_t *action = (stonith_action_t *) svc_action->cb_data; ++ ++ action->pid = svc_action->pid; ++ action->svc_action = svc_action; ++ ++ if (action->fork_cb) { ++ (action->fork_cb) (svc_action->pid, action->userdata); ++ } ++ ++ crm_trace("Child process %d performing action '%s' successfully forked", ++ action->pid, action->action); ++} ++ ++static int ++internal_stonith_action_execute(stonith_action_t * action) ++{ ++ int rc = -EPROTO; ++ int is_retry 
= 0; ++ svc_action_t *svc_action = NULL; ++ static int stonith_sequence = 0; ++ char *buffer = NULL; ++ ++ CRM_CHECK(action != NULL, return -EINVAL); ++ ++ if ((action->action == NULL) || (action->args == NULL) ++ || (action->agent == NULL)) { ++ pcmk__set_result(&(action->result), PCMK_OCF_UNKNOWN_ERROR, ++ PCMK_EXEC_ERROR_FATAL, "Bug in fencing library"); ++ return -EINVAL; ++ } ++ ++ if (!action->tries) { ++ action->initial_start_time = time(NULL); ++ } ++ action->tries++; ++ ++ if (action->tries > 1) { ++ crm_info("Attempt %d to execute %s (%s). remaining timeout is %d", ++ action->tries, action->agent, action->action, action->remaining_timeout); ++ is_retry = 1; ++ } ++ ++ buffer = crm_strdup_printf(PCMK__FENCE_BINDIR "/%s", ++ basename(action->agent)); ++ svc_action = services_action_create_generic(buffer, NULL); ++ free(buffer); ++ ++ if (svc_action->rc != PCMK_OCF_UNKNOWN) { ++ set_result_from_svc_action(action, svc_action); ++ services_action_free(svc_action); ++ return -E2BIG; ++ } ++ ++ svc_action->timeout = 1000 * action->remaining_timeout; ++ svc_action->standard = strdup(PCMK_RESOURCE_CLASS_STONITH); ++ svc_action->id = crm_strdup_printf("%s_%s_%d", basename(action->agent), ++ action->action, action->tries); ++ svc_action->agent = strdup(action->agent); ++ svc_action->sequence = stonith_sequence++; ++ svc_action->params = action->args; ++ svc_action->cb_data = (void *) action; ++ svc_action->flags = pcmk__set_flags_as(__func__, __LINE__, ++ LOG_TRACE, "Action", ++ svc_action->id, svc_action->flags, ++ SVC_ACTION_NON_BLOCKED, ++ "SVC_ACTION_NON_BLOCKED"); ++ ++ /* keep retries from executing out of control and free previous results */ ++ if (is_retry) { ++ pcmk__reset_result(&(action->result)); ++ sleep(1); ++ } ++ ++ if (action->async) { ++ /* async */ ++ if (services_action_async_fork_notify(svc_action, ++ &stonith_action_async_done, ++ &stonith_action_async_forked)) { ++ pcmk__set_result(&(action->result), PCMK_OCF_UNKNOWN, ++ PCMK_EXEC_PENDING, 
NULL); ++ return pcmk_ok; ++ } ++ ++ } else if (services_action_sync(svc_action)) { // sync success ++ rc = pcmk_ok; ++ ++ } else { // sync failure ++ rc = -ECONNABORTED; ++ } ++ ++ set_result_from_svc_action(action, svc_action); ++ svc_action->params = NULL; ++ services_action_free(svc_action); ++ return rc; ++} ++ ++/*! ++ * \internal ++ * \brief Kick off execution of an async stonith action ++ * ++ * \param[in,out] action Action to be executed ++ * \param[in,out] userdata Datapointer to be passed to callbacks ++ * \param[in] done Callback to notify action has failed/succeeded ++ * \param[in] fork_callback Callback to notify successful fork of child ++ * ++ * \return pcmk_ok if ownership of action has been taken, -errno otherwise ++ */ ++int ++stonith_action_execute_async(stonith_action_t * action, ++ void *userdata, ++ void (*done) (int pid, ++ const pcmk__action_result_t *result, ++ void *user_data), ++ void (*fork_cb) (int pid, void *user_data)) ++{ ++ if (!action) { ++ return -EINVAL; ++ } ++ ++ action->userdata = userdata; ++ action->done_cb = done; ++ action->fork_cb = fork_cb; ++ action->async = 1; ++ ++ return internal_stonith_action_execute(action); ++} ++ ++/*! 
++ * \internal ++ * \brief Execute a stonith action ++ * ++ * \param[in,out] action Action to execute ++ * ++ * \return pcmk_ok on success, -errno otherwise ++ */ ++int ++stonith__execute(stonith_action_t *action) ++{ ++ int rc = pcmk_ok; ++ ++ CRM_CHECK(action != NULL, return -EINVAL); ++ ++ // Keep trying until success, max retries, or timeout ++ do { ++ rc = internal_stonith_action_execute(action); ++ } while ((rc != pcmk_ok) && update_remaining_timeout(action)); ++ ++ return rc; ++} +diff --git a/lib/fencing/st_client.c b/lib/fencing/st_client.c +index 93513e9f3..944cd1863 100644 +--- a/lib/fencing/st_client.c ++++ b/lib/fencing/st_client.c +@@ -8,28 +8,20 @@ + */ + + #include +-#include ++ + #include + #include + #include + #include + #include +-#include + #include +- +-#include + #include +-#include +- + #include + + #include + #include + #include + #include +-#include +-#include +-#include + + #include + +@@ -37,31 +29,6 @@ + + CRM_TRACE_INIT_DATA(stonith); + +-struct stonith_action_s { +- /*! user defined data */ +- char *agent; +- char *action; +- char *victim; +- GHashTable *args; +- int timeout; +- int async; +- void *userdata; +- void (*done_cb) (int pid, const pcmk__action_result_t *result, +- void *user_data); +- void (*fork_cb) (int pid, void *user_data); +- +- svc_action_t *svc_action; +- +- /*! internal timing information */ +- time_t initial_start_time; +- int tries; +- int remaining_timeout; +- int max_retries; +- +- int pid; +- pcmk__action_result_t result; +-}; +- + typedef struct stonith_private_s { + char *token; + crm_ipc_t *ipc; +@@ -118,8 +85,6 @@ static int stonith_send_command(stonith_t *stonith, const char *op, + + static void stonith_connection_destroy(gpointer user_data); + static void stonith_send_notification(gpointer data, gpointer user_data); +-static int internal_stonith_action_execute(stonith_action_t * action); +-static void log_action(stonith_action_t *action, pid_t pid); + + /*! 
+ * \brief Get agent namespace by name +@@ -196,23 +161,6 @@ stonith_get_namespace(const char *agent, const char *namespace_s) + return st_namespace_invalid; + } + +-/*! +- * \internal +- * \brief Set an action's result based on services library result +- * +- * \param[in] action Fence action to set result for +- * \param[in] svc_action Service action to get result from +- */ +-static void +-set_result_from_svc_action(stonith_action_t *action, svc_action_t *svc_action) +-{ +- pcmk__set_result(&(action->result), svc_action->rc, svc_action->status, +- services__exit_reason(svc_action)); +- pcmk__set_result_output(&(action->result), +- services__grab_stdout(svc_action), +- services__grab_stderr(svc_action)); +-} +- + gboolean + stonith__watchdog_fencing_enabled_for_node_api(stonith_t *st, const char *node) + { +@@ -273,21 +221,6 @@ stonith__watchdog_fencing_enabled_for_node(const char *node) + return stonith__watchdog_fencing_enabled_for_node_api(NULL, node); + } + +-static void +-log_action(stonith_action_t *action, pid_t pid) +-{ +- /* The services library has already logged the output at info or debug +- * level, so just raise to warning for stderr. +- */ +- if (action->result.action_stderr != NULL) { +- /* Logging the whole string confuses syslog when the string is xml */ +- char *prefix = crm_strdup_printf("%s[%d] stderr:", action->agent, pid); +- +- crm_log_output(LOG_WARNING, prefix, action->result.action_stderr); +- free(prefix); +- } +-} +- + /* when cycling through the list we don't want to delete items + so just mark them and when we know nobody is using the list + loop over it to remove the marked items +@@ -530,465 +463,6 @@ stonith_api_register_level(stonith_t * st, int options, const char *node, int le + level, device_list); + } + +-static void +-append_config_arg(gpointer key, gpointer value, gpointer user_data) +-{ +- /* The fencer will filter "action" out when it registers the device, +- * but ignore it here in case any external API users don't. 
+- * +- * Also filter out parameters handled directly by Pacemaker. +- */ +- if (!pcmk__str_eq(key, STONITH_ATTR_ACTION_OP, pcmk__str_casei) +- && !pcmk_stonith_param(key) +- && (strstr(key, CRM_META) == NULL) +- && !pcmk__str_eq(key, "crm_feature_set", pcmk__str_casei)) { +- +- crm_trace("Passing %s=%s with fence action", +- (const char *) key, (const char *) (value? value : "")); +- g_hash_table_insert((GHashTable *) user_data, +- strdup(key), strdup(value? value : "")); +- } +-} +- +-static GHashTable * +-make_args(const char *agent, const char *action, const char *victim, +- uint32_t victim_nodeid, GHashTable * device_args, +- GHashTable * port_map, const char *host_arg) +-{ +- GHashTable *arg_list = NULL; +- const char *value = NULL; +- +- CRM_CHECK(action != NULL, return NULL); +- +- arg_list = pcmk__strkey_table(free, free); +- +- // Add action to arguments (using an alias if requested) +- if (device_args) { +- char buffer[512]; +- +- snprintf(buffer, sizeof(buffer), "pcmk_%s_action", action); +- value = g_hash_table_lookup(device_args, buffer); +- if (value) { +- crm_debug("Substituting '%s' for fence action %s targeting %s", +- value, action, victim); +- action = value; +- } +- } +- g_hash_table_insert(arg_list, strdup(STONITH_ATTR_ACTION_OP), +- strdup(action)); +- +- /* If this is a fencing operation against another node, add more standard +- * arguments. 
+- */ +- if (victim && device_args) { +- const char *param = NULL; +- +- /* Always pass the target's name, per +- * https://github.com/ClusterLabs/fence-agents/blob/master/doc/FenceAgentAPI.md +- */ +- g_hash_table_insert(arg_list, strdup("nodename"), strdup(victim)); +- +- // If the target's node ID was specified, pass it, too +- if (victim_nodeid) { +- char *nodeid = crm_strdup_printf("%" PRIu32, victim_nodeid); +- +- // cts-fencing looks for this log message +- crm_info("Passing '%s' as nodeid with fence action '%s' targeting %s", +- nodeid, action, victim); +- g_hash_table_insert(arg_list, strdup("nodeid"), nodeid); +- } +- +- // Check whether target must be specified in some other way +- param = g_hash_table_lookup(device_args, PCMK_STONITH_HOST_ARGUMENT); +- if (!pcmk__str_eq(agent, "fence_legacy", pcmk__str_none) +- && !pcmk__str_eq(param, "none", pcmk__str_casei)) { +- +- if (param == NULL) { +- /* Use the caller's default for pcmk_host_argument, or "port" if +- * none was given +- */ +- param = (host_arg == NULL)? "port" : host_arg; +- } +- value = g_hash_table_lookup(device_args, param); +- +- if (pcmk__str_eq(value, "dynamic", +- pcmk__str_casei|pcmk__str_null_matches)) { +- /* If the host argument was "dynamic" or not explicitly specified, +- * add it with the target +- */ +- const char *alias = NULL; +- +- if (port_map) { +- alias = g_hash_table_lookup(port_map, victim); +- } +- if (alias == NULL) { +- alias = victim; +- } +- crm_debug("Passing %s='%s' with fence action %s targeting %s", +- param, alias, action, victim); +- g_hash_table_insert(arg_list, strdup(param), strdup(alias)); +- } +- } +- } +- +- if (device_args) { +- g_hash_table_foreach(device_args, append_config_arg, arg_list); +- } +- +- return arg_list; +-} +- +-/*! 
+- * \internal +- * \brief Free all memory used by a stonith action +- * +- * \param[in,out] action Action to free +- */ +-void +-stonith__destroy_action(stonith_action_t *action) +-{ +- if (action) { +- free(action->agent); +- if (action->args) { +- g_hash_table_destroy(action->args); +- } +- free(action->action); +- free(action->victim); +- if (action->svc_action) { +- services_action_free(action->svc_action); +- } +- pcmk__reset_result(&(action->result)); +- free(action); +- } +-} +- +-/*! +- * \internal +- * \brief Get the result of an executed stonith action +- * +- * \param[in] action Executed action +- * +- * \return Pointer to action's result (or NULL if \p action is NULL) +- */ +-pcmk__action_result_t * +-stonith__action_result(stonith_action_t *action) +-{ +- return (action == NULL)? NULL : &(action->result); +-} +- +-#define FAILURE_MAX_RETRIES 2 +-stonith_action_t * +-stonith_action_create(const char *agent, +- const char *_action, +- const char *victim, +- uint32_t victim_nodeid, +- int timeout, GHashTable * device_args, +- GHashTable * port_map, const char *host_arg) +-{ +- stonith_action_t *action; +- +- action = calloc(1, sizeof(stonith_action_t)); +- action->args = make_args(agent, _action, victim, victim_nodeid, +- device_args, port_map, host_arg); +- crm_debug("Preparing '%s' action for %s using agent %s", +- _action, (victim? 
victim : "no target"), agent); +- action->agent = strdup(agent); +- action->action = strdup(_action); +- if (victim) { +- action->victim = strdup(victim); +- } +- action->timeout = action->remaining_timeout = timeout; +- action->max_retries = FAILURE_MAX_RETRIES; +- +- pcmk__set_result(&(action->result), PCMK_OCF_UNKNOWN, PCMK_EXEC_UNKNOWN, +- "Initialization bug in fencing library"); +- +- if (device_args) { +- char buffer[512]; +- const char *value = NULL; +- +- snprintf(buffer, sizeof(buffer), "pcmk_%s_retries", _action); +- value = g_hash_table_lookup(device_args, buffer); +- +- if (value) { +- action->max_retries = atoi(value); +- } +- } +- +- return action; +-} +- +-static gboolean +-update_remaining_timeout(stonith_action_t * action) +-{ +- int diff = time(NULL) - action->initial_start_time; +- +- if (action->tries >= action->max_retries) { +- crm_info("Attempted to execute agent %s (%s) the maximum number of times (%d) allowed", +- action->agent, action->action, action->max_retries); +- action->remaining_timeout = 0; +- } else if ((action->result.execution_status != PCMK_EXEC_TIMEOUT) +- && (diff < (action->timeout * 0.7))) { +- /* only set remaining timeout period if there is 30% +- * or greater of the original timeout period left */ +- action->remaining_timeout = action->timeout - diff; +- } else { +- action->remaining_timeout = 0; +- } +- return action->remaining_timeout ? TRUE : FALSE; +-} +- +-/*! 
+- * \internal +- * \brief Map a fencing action result to a standard return code +- * +- * \param[in] result Fencing action result to map +- * +- * \return Standard Pacemaker return code that best corresponds to \p result +- */ +-int +-stonith__result2rc(const pcmk__action_result_t *result) +-{ +- if (pcmk__result_ok(result)) { +- return pcmk_rc_ok; +- } +- +- switch (result->execution_status) { +- case PCMK_EXEC_PENDING: return EINPROGRESS; +- case PCMK_EXEC_CANCELLED: return ECANCELED; +- case PCMK_EXEC_TIMEOUT: return ETIME; +- case PCMK_EXEC_NOT_INSTALLED: return ENOENT; +- case PCMK_EXEC_NOT_SUPPORTED: return EOPNOTSUPP; +- case PCMK_EXEC_NOT_CONNECTED: return ENOTCONN; +- case PCMK_EXEC_NO_FENCE_DEVICE: return ENODEV; +- case PCMK_EXEC_NO_SECRETS: return EACCES; +- +- /* For the fencing API, PCMK_EXEC_INVALID is used with fencer API +- * operations that don't involve executing an agent (for example, +- * registering devices). This allows us to use the CRM_EX_* codes in the +- * exit status for finer-grained responses. +- */ +- case PCMK_EXEC_INVALID: +- switch (result->exit_status) { +- case CRM_EX_INSUFFICIENT_PRIV: return EACCES; +- case CRM_EX_PROTOCOL: return EPROTO; +- +- /* CRM_EX_EXPIRED is used for orphaned fencing operations left +- * over from a previous instance of the fencer. For API backward +- * compatibility, this is mapped to the previously used code for +- * this case, EHOSTUNREACH. 
+- */ +- case CRM_EX_EXPIRED: return EHOSTUNREACH; +- default: break; +- } +- +- default: +- break; +- } +- +- // Try to provide useful error code based on result's error output +- +- if (result->action_stderr == NULL) { +- return ENODATA; +- +- } else if (strcasestr(result->action_stderr, "timed out") +- || strcasestr(result->action_stderr, "timeout")) { +- return ETIME; +- +- } else if (strcasestr(result->action_stderr, "unrecognised action") +- || strcasestr(result->action_stderr, "unrecognized action") +- || strcasestr(result->action_stderr, "unsupported action")) { +- return EOPNOTSUPP; +- } +- +- // Oh well, we tried +- return pcmk_rc_error; +-} +- +-static void +-stonith_action_async_done(svc_action_t *svc_action) +-{ +- stonith_action_t *action = (stonith_action_t *) svc_action->cb_data; +- +- set_result_from_svc_action(action, svc_action); +- +- svc_action->params = NULL; +- +- crm_debug("Child process %d performing action '%s' exited with rc %d", +- action->pid, action->action, svc_action->rc); +- +- log_action(action, action->pid); +- +- if (!pcmk__result_ok(&(action->result)) +- && update_remaining_timeout(action)) { +- +- int rc = internal_stonith_action_execute(action); +- if (rc == pcmk_ok) { +- return; +- } +- } +- +- if (action->done_cb) { +- action->done_cb(action->pid, &(action->result), action->userdata); +- } +- +- action->svc_action = NULL; // don't remove our caller +- stonith__destroy_action(action); +-} +- +-static void +-stonith_action_async_forked(svc_action_t *svc_action) +-{ +- stonith_action_t *action = (stonith_action_t *) svc_action->cb_data; +- +- action->pid = svc_action->pid; +- action->svc_action = svc_action; +- +- if (action->fork_cb) { +- (action->fork_cb) (svc_action->pid, action->userdata); +- } +- +- crm_trace("Child process %d performing action '%s' successfully forked", +- action->pid, action->action); +-} +- +-static int +-internal_stonith_action_execute(stonith_action_t * action) +-{ +- int rc = -EPROTO; +- int is_retry 
= 0; +- svc_action_t *svc_action = NULL; +- static int stonith_sequence = 0; +- char *buffer = NULL; +- +- CRM_CHECK(action != NULL, return -EINVAL); +- +- if ((action->action == NULL) || (action->args == NULL) +- || (action->agent == NULL)) { +- pcmk__set_result(&(action->result), PCMK_OCF_UNKNOWN_ERROR, +- PCMK_EXEC_ERROR_FATAL, "Bug in fencing library"); +- return -EINVAL; +- } +- +- if (!action->tries) { +- action->initial_start_time = time(NULL); +- } +- action->tries++; +- +- if (action->tries > 1) { +- crm_info("Attempt %d to execute %s (%s). remaining timeout is %d", +- action->tries, action->agent, action->action, action->remaining_timeout); +- is_retry = 1; +- } +- +- buffer = crm_strdup_printf(PCMK__FENCE_BINDIR "/%s", +- basename(action->agent)); +- svc_action = services_action_create_generic(buffer, NULL); +- free(buffer); +- +- if (svc_action->rc != PCMK_OCF_UNKNOWN) { +- set_result_from_svc_action(action, svc_action); +- services_action_free(svc_action); +- return -E2BIG; +- } +- +- svc_action->timeout = 1000 * action->remaining_timeout; +- svc_action->standard = strdup(PCMK_RESOURCE_CLASS_STONITH); +- svc_action->id = crm_strdup_printf("%s_%s_%d", basename(action->agent), +- action->action, action->tries); +- svc_action->agent = strdup(action->agent); +- svc_action->sequence = stonith_sequence++; +- svc_action->params = action->args; +- svc_action->cb_data = (void *) action; +- svc_action->flags = pcmk__set_flags_as(__func__, __LINE__, +- LOG_TRACE, "Action", +- svc_action->id, svc_action->flags, +- SVC_ACTION_NON_BLOCKED, +- "SVC_ACTION_NON_BLOCKED"); +- +- /* keep retries from executing out of control and free previous results */ +- if (is_retry) { +- pcmk__reset_result(&(action->result)); +- sleep(1); +- } +- +- if (action->async) { +- /* async */ +- if (services_action_async_fork_notify(svc_action, +- &stonith_action_async_done, +- &stonith_action_async_forked)) { +- pcmk__set_result(&(action->result), PCMK_OCF_UNKNOWN, +- PCMK_EXEC_PENDING, 
NULL); +- return pcmk_ok; +- } +- +- } else if (services_action_sync(svc_action)) { // sync success +- rc = pcmk_ok; +- +- } else { // sync failure +- rc = -ECONNABORTED; +- } +- +- set_result_from_svc_action(action, svc_action); +- svc_action->params = NULL; +- services_action_free(svc_action); +- return rc; +-} +- +-/*! +- * \internal +- * \brief Kick off execution of an async stonith action +- * +- * \param[in,out] action Action to be executed +- * \param[in,out] userdata Datapointer to be passed to callbacks +- * \param[in] done Callback to notify action has failed/succeeded +- * \param[in] fork_callback Callback to notify successful fork of child +- * +- * \return pcmk_ok if ownership of action has been taken, -errno otherwise +- */ +-int +-stonith_action_execute_async(stonith_action_t * action, +- void *userdata, +- void (*done) (int pid, +- const pcmk__action_result_t *result, +- void *user_data), +- void (*fork_cb) (int pid, void *user_data)) +-{ +- if (!action) { +- return -EINVAL; +- } +- +- action->userdata = userdata; +- action->done_cb = done; +- action->fork_cb = fork_cb; +- action->async = 1; +- +- return internal_stonith_action_execute(action); +-} +- +-/*! 
+- * \internal +- * \brief Execute a stonith action +- * +- * \param[in,out] action Action to execute +- * +- * \return pcmk_ok on success, -errno otherwise +- */ +-int +-stonith__execute(stonith_action_t *action) +-{ +- int rc = pcmk_ok; +- +- CRM_CHECK(action != NULL, return -EINVAL); +- +- // Keep trying until success, max retries, or timeout +- do { +- rc = internal_stonith_action_execute(action); +- } while ((rc != pcmk_ok) && update_remaining_timeout(action)); +- +- return rc; +-} +- + static int + stonith_api_device_list(stonith_t * stonith, int call_options, const char *namespace, + stonith_key_value_t ** devices, int timeout) +-- +2.27.0 + + +From 883a3cf7d3f73d02417d3997a7885dd5a7bebac7 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Wed, 10 Nov 2021 15:39:17 -0600 +Subject: [PATCH 05/13] Low: fencing,executor: improve mapping of legacy return + code to execution status + +Move stonith_rc2status() from the executor to the fencing library for future +reuse, exposing it internally as stonith__legacy2status(). Update it to use +recently added execution status codes. +--- + daemons/execd/execd_commands.c | 66 ++++++++-------------------------- + include/crm/fencing/internal.h | 2 ++ + lib/fencing/st_actions.c | 36 +++++++++++++++++++ + 3 files changed, 52 insertions(+), 52 deletions(-) + +diff --git a/daemons/execd/execd_commands.c b/daemons/execd/execd_commands.c +index 02070bf11..0ccaa1ced 100644 +--- a/daemons/execd/execd_commands.c ++++ b/daemons/execd/execd_commands.c +@@ -21,6 +21,7 @@ + #include + + #include ++#include + #include + #include + #include +@@ -999,56 +1000,6 @@ action_complete(svc_action_t * action) + cmd_finalize(cmd, rsc); + } + +-/*! 
+- * \internal +- * \brief Determine operation status of a stonith operation +- * +- * Non-stonith resource operations get their operation status directly from the +- * service library, but the fencer does not have an equivalent, so we must infer +- * an operation status from the fencer API's return code. +- * +- * \param[in] action Name of action performed on stonith resource +- * \param[in] interval_ms Action interval +- * \param[in] rc Action result from fencer +- * +- * \return Operation status corresponding to fencer API return code +- */ +-static int +-stonith_rc2status(const char *action, guint interval_ms, int rc) +-{ +- int status = PCMK_EXEC_DONE; +- +- switch (rc) { +- case pcmk_ok: +- break; +- +- case -EOPNOTSUPP: +- case -EPROTONOSUPPORT: +- status = PCMK_EXEC_NOT_SUPPORTED; +- break; +- +- case -ETIME: +- case -ETIMEDOUT: +- status = PCMK_EXEC_TIMEOUT; +- break; +- +- case -ENOTCONN: +- case -ECOMM: +- // Couldn't talk to fencer +- status = PCMK_EXEC_ERROR; +- break; +- +- case -ENODEV: +- // The device is not registered with the fencer +- status = PCMK_EXEC_ERROR; +- break; +- +- default: +- break; +- } +- return status; +-} +- + static void + stonith_action_complete(lrmd_cmd_t * cmd, int rc) + { +@@ -1062,8 +1013,19 @@ stonith_action_complete(lrmd_cmd_t * cmd, int rc) + * the fencer return code. 
+ */ + if (cmd->result.execution_status != PCMK_EXEC_CANCELLED) { +- cmd->result.execution_status = stonith_rc2status(cmd->action, +- cmd->interval_ms, rc); ++ cmd->result.execution_status = stonith__legacy2status(rc); ++ ++ // Simplify status codes from fencer ++ switch (cmd->result.execution_status) { ++ case PCMK_EXEC_NOT_CONNECTED: ++ case PCMK_EXEC_INVALID: ++ case PCMK_EXEC_NO_FENCE_DEVICE: ++ case PCMK_EXEC_NO_SECRETS: ++ cmd->result.execution_status = PCMK_EXEC_ERROR; ++ break; ++ default: ++ break; ++ } + + // Certain successful actions change the known state of the resource + if ((rsc != NULL) && pcmk__result_ok(&(cmd->result))) { +diff --git a/include/crm/fencing/internal.h b/include/crm/fencing/internal.h +index 6a7e4232c..80f6443be 100644 +--- a/include/crm/fencing/internal.h ++++ b/include/crm/fencing/internal.h +@@ -182,6 +182,8 @@ bool stonith__event_state_pending(stonith_history_t *history, void *user_data); + bool stonith__event_state_eq(stonith_history_t *history, void *user_data); + bool stonith__event_state_neq(stonith_history_t *history, void *user_data); + ++int stonith__legacy2status(int rc); ++ + /*! + * \internal + * \brief Is a fencing operation in pending state? +diff --git a/lib/fencing/st_actions.c b/lib/fencing/st_actions.c +index 64d3afd5d..9e785595a 100644 +--- a/lib/fencing/st_actions.c ++++ b/lib/fencing/st_actions.c +@@ -360,6 +360,42 @@ stonith__result2rc(const pcmk__action_result_t *result) + return pcmk_rc_error; + } + ++/*! ++ * \internal ++ * \brief Determine execution status equivalent of legacy fencer return code ++ * ++ * Fence action notifications, and fence action callbacks from older fencers ++ * (<=2.1.2) in a rolling upgrade, will have only a legacy return code. Map this ++ * to an execution status as best as possible (essentially, the inverse of ++ * stonith__result2rc()). 
++ * ++ * \param[in] rc Legacy return code from fencer ++ * ++ * \return Execution status best corresponding to \p rc ++ */ ++int ++stonith__legacy2status(int rc) ++{ ++ if (rc >= 0) { ++ return PCMK_EXEC_DONE; ++ } ++ switch (-rc) { ++ case EACCES: return PCMK_EXEC_NO_SECRETS; ++ case ECANCELED: return PCMK_EXEC_CANCELLED; ++ case EHOSTUNREACH: return PCMK_EXEC_INVALID; ++ case EINPROGRESS: return PCMK_EXEC_PENDING; ++ case ENODEV: return PCMK_EXEC_NO_FENCE_DEVICE; ++ case ENOENT: return PCMK_EXEC_NOT_INSTALLED; ++ case ENOTCONN: return PCMK_EXEC_NOT_CONNECTED; ++ case EOPNOTSUPP: return PCMK_EXEC_NOT_SUPPORTED; ++ case EPROTO: return PCMK_EXEC_INVALID; ++ case EPROTONOSUPPORT: return PCMK_EXEC_NOT_SUPPORTED; ++ case ETIME: return PCMK_EXEC_TIMEOUT; ++ case ETIMEDOUT: return PCMK_EXEC_TIMEOUT; ++ default: return PCMK_EXEC_ERROR; ++ } ++} ++ + static void + stonith_action_async_done(svc_action_t *svc_action) + { +-- +2.27.0 + + +From 639a9f4a2cbeb6cc41b754a1dcb1f360a9500e03 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Thu, 11 Nov 2021 16:54:32 -0600 +Subject: [PATCH 06/13] Refactor: fencing: add functions for getting/setting + result via XML + +These will come in handy as we update the various fencer messages to include a +full result rather than just a legacy return code. The functions are in a new +source file fenced_messages.c which can have other stuff moved to it later. 
+--- + include/crm/fencing/internal.h | 3 + + lib/fencing/st_actions.c | 107 +++++++++++++++++++++++++++++++++ + 2 files changed, 110 insertions(+) + +diff --git a/include/crm/fencing/internal.h b/include/crm/fencing/internal.h +index 80f6443be..4b5fd3959 100644 +--- a/include/crm/fencing/internal.h ++++ b/include/crm/fencing/internal.h +@@ -60,6 +60,9 @@ stonith_action_t *stonith_action_create(const char *agent, + void stonith__destroy_action(stonith_action_t *action); + pcmk__action_result_t *stonith__action_result(stonith_action_t *action); + int stonith__result2rc(const pcmk__action_result_t *result); ++void stonith__xe_set_result(xmlNode *xml, const pcmk__action_result_t *result); ++void stonith__xe_get_result(xmlNode *xml, pcmk__action_result_t *result); ++xmlNode *stonith__find_xe_with_result(xmlNode *xml); + + int + stonith_action_execute_async(stonith_action_t * action, +diff --git a/lib/fencing/st_actions.c b/lib/fencing/st_actions.c +index 9e785595a..d4fc3f5ed 100644 +--- a/lib/fencing/st_actions.c ++++ b/lib/fencing/st_actions.c +@@ -396,6 +396,113 @@ stonith__legacy2status(int rc) + } + } + ++/*! 
++ * \internal ++ * \brief Add a fencing result to an XML element as attributes ++ * ++ * \param[in] xml XML element to add result to ++ * \param[in] result Fencing result to add (assume success if NULL) ++ */ ++void ++stonith__xe_set_result(xmlNode *xml, const pcmk__action_result_t *result) ++{ ++ int exit_status = CRM_EX_OK; ++ enum pcmk_exec_status execution_status = PCMK_EXEC_DONE; ++ const char *exit_reason = NULL; ++ const char *action_stdout = NULL; ++ int rc = pcmk_ok; ++ ++ CRM_CHECK(xml != NULL, return); ++ ++ if (result != NULL) { ++ exit_status = result->exit_status; ++ execution_status = result->execution_status; ++ exit_reason = result->exit_reason; ++ action_stdout = result->action_stdout; ++ rc = pcmk_rc2legacy(stonith__result2rc(result)); ++ } ++ ++ crm_xml_add_int(xml, XML_LRM_ATTR_OPSTATUS, (int) execution_status); ++ crm_xml_add_int(xml, XML_LRM_ATTR_RC, exit_status); ++ crm_xml_add(xml, XML_LRM_ATTR_EXIT_REASON, exit_reason); ++ crm_xml_add(xml, "st_output", action_stdout); ++ ++ /* @COMPAT Peers in rolling upgrades, Pacemaker Remote nodes, and external ++ * code that use libstonithd <=2.1.2 don't check for the full result, and ++ * need a legacy return code instead. ++ */ ++ crm_xml_add_int(xml, F_STONITH_RC, rc); ++} ++ ++/*! ++ * \internal ++ * \brief Find a fencing result beneath an XML element ++ * ++ * \param[in] xml XML element to search ++ * ++ * \return \p xml or descendent of it that contains a fencing result, else NULL ++ */ ++xmlNode * ++stonith__find_xe_with_result(xmlNode *xml) ++{ ++ xmlNode *match = get_xpath_object("//@" XML_LRM_ATTR_RC, xml, LOG_NEVER); ++ ++ if (match == NULL) { ++ /* @COMPAT Peers <=2.1.2 in a rolling upgrade provide only a legacy ++ * return code, not a full result, so check for that. ++ */ ++ match = get_xpath_object("//@" F_STONITH_RC, xml, LOG_ERR); ++ } ++ return match; ++} ++ ++/*! 
++ * \internal ++ * \brief Get a fencing result from an XML element's attributes ++ * ++ * \param[in] xml XML element with fencing result ++ * \param[out] result Where to store fencing result ++ */ ++void ++stonith__xe_get_result(xmlNode *xml, pcmk__action_result_t *result) ++{ ++ int exit_status = CRM_EX_OK; ++ int execution_status = PCMK_EXEC_DONE; ++ const char *exit_reason = NULL; ++ char *action_stdout = NULL; ++ ++ CRM_CHECK((xml != NULL) && (result != NULL), return); ++ ++ exit_reason = crm_element_value(xml, XML_LRM_ATTR_EXIT_REASON); ++ action_stdout = crm_element_value_copy(xml, "st_output"); ++ ++ // A result must include an exit status and execution status ++ if ((crm_element_value_int(xml, XML_LRM_ATTR_RC, &exit_status) < 0) ++ || (crm_element_value_int(xml, XML_LRM_ATTR_OPSTATUS, ++ &execution_status) < 0)) { ++ int rc = pcmk_ok; ++ exit_status = CRM_EX_ERROR; ++ ++ /* @COMPAT Peers <=2.1.2 in rolling upgrades provide only a legacy ++ * return code, not a full result, so check for that. ++ */ ++ if (crm_element_value_int(xml, F_STONITH_RC, &rc) == 0) { ++ if ((rc == pcmk_ok) || (rc == -EINPROGRESS)) { ++ exit_status = CRM_EX_OK; ++ } ++ execution_status = stonith__legacy2status(rc); ++ exit_reason = pcmk_strerror(rc); ++ ++ } else { ++ execution_status = PCMK_EXEC_ERROR; ++ exit_reason = "Fencer reply contained neither a full result " ++ "nor a legacy return code (bug?)"; ++ } ++ } ++ pcmk__set_result(result, exit_status, execution_status, exit_reason); ++ pcmk__set_result_output(result, action_stdout, NULL); ++} ++ + static void + stonith_action_async_done(svc_action_t *svc_action) + { +-- +2.27.0 + + +From 1f0121c6ad0d0235bcf01c8b60f9153592b3db83 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Thu, 11 Nov 2021 10:10:53 -0600 +Subject: [PATCH 07/13] Refactor: fencing: rename functions for invoking fence + callbacks + +... 
to make it clearer what the difference between them is +--- + lib/fencing/st_client.c | 44 +++++++++++++++++++++++++++++++++-------- + 1 file changed, 36 insertions(+), 8 deletions(-) + +diff --git a/lib/fencing/st_client.c b/lib/fencing/st_client.c +index 944cd1863..dfc5860fc 100644 +--- a/lib/fencing/st_client.c ++++ b/lib/fencing/st_client.c +@@ -847,9 +847,21 @@ stonith_api_del_callback(stonith_t * stonith, int call_id, bool all_callbacks) + return pcmk_ok; + } + ++/*! ++ * \internal ++ * \brief Invoke a (single) specified fence action callback ++ * ++ * \param[in] st Fencer API connection ++ * \param[in] call_id If positive, call ID of completed fence action, otherwise ++ * legacy return code for early action failure ++ * \param[in] rc Legacy return code for action result ++ * \param[in] userdata User data to pass to callback ++ * \param[in] callback Fence action callback to invoke ++ */ + static void +-invoke_callback(stonith_t * st, int call_id, int rc, void *userdata, +- void (*callback) (stonith_t * st, stonith_callback_data_t * data)) ++invoke_fence_action_callback(stonith_t *st, int call_id, int rc, void *userdata, ++ void (*callback) (stonith_t *st, ++ stonith_callback_data_t *data)) + { + stonith_callback_data_t data = { 0, }; + +@@ -860,8 +872,21 @@ invoke_callback(stonith_t * st, int call_id, int rc, void *userdata, + callback(st, &data); + } + ++/*! ++ * \internal ++ * \brief Invoke any callbacks registered for a specified fence action result ++ * ++ * Given a fence action result from the fencer, invoke any callback registered ++ * for that action, as well as any global callback registered. 
++ * ++ * \param[in] st Fencer API connection ++ * \param[in] msg If non-NULL, fencer reply ++ * \param[in] call_id If \p msg is NULL, call ID of action that timed out ++ * \param[in] rc Legacy return code for result of action ++ */ + static void +-stonith_perform_callback(stonith_t * stonith, xmlNode * msg, int call_id, int rc) ++invoke_registered_callbacks(stonith_t *stonith, xmlNode *msg, int call_id, ++ int rc) + { + stonith_private_t *private = NULL; + stonith_callback_client_t *blob = NULL; +@@ -899,7 +924,8 @@ stonith_perform_callback(stonith_t * stonith, xmlNode * msg, int call_id, int rc + + if (local_blob.callback != NULL && (rc == pcmk_ok || local_blob.only_success == FALSE)) { + crm_trace("Invoking callback %s for call %d", crm_str(local_blob.id), call_id); +- invoke_callback(stonith, call_id, rc, local_blob.user_data, local_blob.callback); ++ invoke_fence_action_callback(stonith, call_id, rc, local_blob.user_data, ++ local_blob.callback); + + } else if (private->op_callback == NULL && rc != pcmk_ok) { + crm_warn("Fencing command failed: %s", pcmk_strerror(rc)); +@@ -908,7 +934,8 @@ stonith_perform_callback(stonith_t * stonith, xmlNode * msg, int call_id, int rc + + if (private->op_callback != NULL) { + crm_trace("Invoking global callback for call %d", call_id); +- invoke_callback(stonith, call_id, rc, NULL, private->op_callback); ++ invoke_fence_action_callback(stonith, call_id, rc, NULL, ++ private->op_callback); + } + crm_trace("OP callback activated."); + } +@@ -919,7 +946,7 @@ stonith_async_timeout_handler(gpointer data) + struct timer_rec_s *timer = data; + + crm_err("Async call %d timed out after %dms", timer->call_id, timer->timeout); +- stonith_perform_callback(timer->stonith, NULL, timer->call_id, -ETIME); ++ invoke_registered_callbacks(timer->stonith, NULL, timer->call_id, -ETIME); + + /* Always return TRUE, never remove the handler + * We do that in stonith_del_callback() +@@ -994,7 +1021,7 @@ stonith_dispatch_internal(const char *buffer, 
ssize_t length, gpointer userdata) + crm_trace("Activating %s callbacks...", type); + + if (pcmk__str_eq(type, T_STONITH_NG, pcmk__str_casei)) { +- stonith_perform_callback(st, blob.xml, 0, 0); ++ invoke_registered_callbacks(st, blob.xml, 0, 0); + + } else if (pcmk__str_eq(type, T_STONITH_NOTIFY, pcmk__str_casei)) { + foreach_notify_entry(private, stonith_send_notification, &blob); +@@ -1229,7 +1256,8 @@ stonith_api_add_callback(stonith_t * stonith, int call_id, int timeout, int opti + } else if (call_id < 0) { + if (!(options & st_opt_report_only_success)) { + crm_trace("Call failed, calling %s: %s", callback_name, pcmk_strerror(call_id)); +- invoke_callback(stonith, call_id, call_id, user_data, callback); ++ invoke_fence_action_callback(stonith, call_id, call_id, user_data, ++ callback); + } else { + crm_warn("Fencer call failed: %s", pcmk_strerror(call_id)); + } +-- +2.27.0 + + +From c32f11e70a88244f5a3217608055a4eaf8d28231 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Thu, 11 Nov 2021 10:21:00 -0600 +Subject: [PATCH 08/13] Refactor: fencing: drop unnecessary argument when + invoking callbacks + +Refactor invoke_registered_callbacks() to treat a NULL message as a timeout, so +we can drop the rc argument. 
+--- + lib/fencing/st_client.c | 17 +++++++++++------ + 1 file changed, 11 insertions(+), 6 deletions(-) + +diff --git a/lib/fencing/st_client.c b/lib/fencing/st_client.c +index dfc5860fc..9f2b0c1c1 100644 +--- a/lib/fencing/st_client.c ++++ b/lib/fencing/st_client.c +@@ -882,15 +882,14 @@ invoke_fence_action_callback(stonith_t *st, int call_id, int rc, void *userdata, + * \param[in] st Fencer API connection + * \param[in] msg If non-NULL, fencer reply + * \param[in] call_id If \p msg is NULL, call ID of action that timed out +- * \param[in] rc Legacy return code for result of action + */ + static void +-invoke_registered_callbacks(stonith_t *stonith, xmlNode *msg, int call_id, +- int rc) ++invoke_registered_callbacks(stonith_t *stonith, xmlNode *msg, int call_id) + { + stonith_private_t *private = NULL; + stonith_callback_client_t *blob = NULL; + stonith_callback_client_t local_blob; ++ int rc = pcmk_ok; + + CRM_CHECK(stonith != NULL, return); + CRM_CHECK(stonith->st_private != NULL, return); +@@ -902,7 +901,13 @@ invoke_registered_callbacks(stonith_t *stonith, xmlNode *msg, int call_id, + local_blob.user_data = NULL; + local_blob.only_success = FALSE; + +- if (msg != NULL) { ++ if (msg == NULL) { ++ // Fencer didn't reply in time ++ rc = -ETIME; ++ ++ } else { ++ // We have the fencer reply ++ + crm_element_value_int(msg, F_STONITH_RC, &rc); + crm_element_value_int(msg, F_STONITH_CALLID, &call_id); + } +@@ -946,7 +951,7 @@ stonith_async_timeout_handler(gpointer data) + struct timer_rec_s *timer = data; + + crm_err("Async call %d timed out after %dms", timer->call_id, timer->timeout); +- invoke_registered_callbacks(timer->stonith, NULL, timer->call_id, -ETIME); ++ invoke_registered_callbacks(timer->stonith, NULL, timer->call_id); + + /* Always return TRUE, never remove the handler + * We do that in stonith_del_callback() +@@ -1021,7 +1026,7 @@ stonith_dispatch_internal(const char *buffer, ssize_t length, gpointer userdata) + crm_trace("Activating %s callbacks...", 
type); + + if (pcmk__str_eq(type, T_STONITH_NG, pcmk__str_casei)) { +- invoke_registered_callbacks(st, blob.xml, 0, 0); ++ invoke_registered_callbacks(st, blob.xml, 0); + + } else if (pcmk__str_eq(type, T_STONITH_NOTIFY, pcmk__str_casei)) { + foreach_notify_entry(private, stonith_send_notification, &blob); +-- +2.27.0 + + +From 5d8279b51ea9df738354649e4065663f2c16f1e6 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Thu, 11 Nov 2021 10:21:57 -0600 +Subject: [PATCH 09/13] Log: fencing: improve message for callback errors + +Improve checking of fencer replies, which also allows us to distinguish an +internal bug from a bad fencer reply in logs. Lower the bad reply message to +warning. +--- + lib/fencing/st_client.c | 13 +++++++++---- + 1 file changed, 9 insertions(+), 4 deletions(-) + +diff --git a/lib/fencing/st_client.c b/lib/fencing/st_client.c +index 9f2b0c1c1..170b9d450 100644 +--- a/lib/fencing/st_client.c ++++ b/lib/fencing/st_client.c +@@ -904,15 +904,20 @@ invoke_registered_callbacks(stonith_t *stonith, xmlNode *msg, int call_id) + if (msg == NULL) { + // Fencer didn't reply in time + rc = -ETIME; ++ CRM_LOG_ASSERT(call_id > 0); + + } else { + // We have the fencer reply + +- crm_element_value_int(msg, F_STONITH_RC, &rc); +- crm_element_value_int(msg, F_STONITH_CALLID, &call_id); +- } ++ if (crm_element_value_int(msg, F_STONITH_RC, &rc) != 0) { ++ rc = -pcmk_err_generic; ++ } + +- CRM_CHECK(call_id > 0, crm_log_xml_err(msg, "Bad result")); ++ if ((crm_element_value_int(msg, F_STONITH_CALLID, &call_id) != 0) ++ || (call_id <= 0)) { ++ crm_log_xml_warn(msg, "Bad fencer reply"); ++ } ++ } + + blob = pcmk__intkey_table_lookup(private->stonith_op_callback_table, + call_id); +-- +2.27.0 + + +From e03c14d24e8cb011e870b9460930d139705bf0a2 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Tue, 9 Nov 2021 14:59:12 -0600 +Subject: [PATCH 10/13] Doc: fencing: correct stonith_api_operations_t method + descriptions + +Many of the methods return a positive call ID on 
success +--- + include/crm/stonith-ng.h | 60 ++++++++++++++++++++++------------------ + 1 file changed, 33 insertions(+), 27 deletions(-) + +diff --git a/include/crm/stonith-ng.h b/include/crm/stonith-ng.h +index 8d6ad477d..9643820e9 100644 +--- a/include/crm/stonith-ng.h ++++ b/include/crm/stonith-ng.h +@@ -164,39 +164,38 @@ typedef struct stonith_api_operations_s + int (*disconnect)(stonith_t *st); + + /*! +- * \brief Remove a registered stonith device with the local stonith daemon. ++ * \brief Unregister a fence device with the local fencer + * +- * \note Synchronous, guaranteed to occur in daemon before function returns. +- * +- * \return Legacy Pacemaker return code ++ * \return pcmk_ok (if synchronous) or positive call ID (if asynchronous) ++ * on success, otherwise a negative legacy Pacemaker return code + */ + int (*remove_device)( + stonith_t *st, int options, const char *name); + + /*! +- * \brief Register a stonith device with the local stonith daemon. ++ * \brief Register a fence device with the local fencer + * +- * \note Synchronous, guaranteed to occur in daemon before function returns. +- * +- * \return Legacy Pacemaker return code ++ * \return pcmk_ok (if synchronous) or positive call ID (if asynchronous) ++ * on success, otherwise a negative legacy Pacemaker return code + */ + int (*register_device)( + stonith_t *st, int options, const char *id, + const char *provider, const char *agent, stonith_key_value_t *params); + + /*! +- * \brief Remove a fencing level for a specific node. ++ * \brief Unregister a fencing level for specified node with local fencer + * +- * \return Legacy Pacemaker return code ++ * \return pcmk_ok (if synchronous) or positive call ID (if asynchronous) ++ * on success, otherwise a negative legacy Pacemaker return code + */ + int (*remove_level)( + stonith_t *st, int options, const char *node, int level); + + /*! 
+- * \brief Register a fencing level containing the fencing devices to be used +- * at that level for a specific node. ++ * \brief Register a fencing level for specified node with local fencer + * +- * \return Legacy Pacemaker return code ++ * \return pcmk_ok (if synchronous) or positive call ID (if asynchronous) ++ * on success, otherwise a negative legacy Pacemaker return code + */ + int (*register_level)( + stonith_t *st, int options, const char *node, int level, stonith_key_value_t *device_list); +@@ -226,21 +225,24 @@ typedef struct stonith_api_operations_s + /*! + * \brief Retrieve string listing hosts and port assignments from a local stonith device. + * +- * \return Legacy Pacemaker return code ++ * \return pcmk_ok (if synchronous) or positive call ID (if asynchronous) ++ * on success, otherwise a negative legacy Pacemaker return code + */ + int (*list)(stonith_t *st, int options, const char *id, char **list_output, int timeout); + + /*! + * \brief Check to see if a local stonith device is reachable + * +- * \return Legacy Pacemaker return code ++ * \return pcmk_ok (if synchronous) or positive call ID (if asynchronous) ++ * on success, otherwise a negative legacy Pacemaker return code + */ + int (*monitor)(stonith_t *st, int options, const char *id, int timeout); + + /*! + * \brief Check to see if a local stonith device's port is reachable + * +- * \return Legacy Pacemaker return code ++ * \return pcmk_ok (if synchronous) or positive call ID (if asynchronous) ++ * on success, otherwise a negative legacy Pacemaker return code + */ + int (*status)(stonith_t *st, int options, const char *id, const char *port, int timeout); + +@@ -267,7 +269,8 @@ typedef struct stonith_api_operations_s + * \param timeout, The default per device timeout to use with each device + * capable of fencing the target. 
+ * +- * \return Legacy Pacemaker return code ++ * \return pcmk_ok (if synchronous) or positive call ID (if asynchronous) ++ * on success, otherwise a negative legacy Pacemaker return code + */ + int (*fence)(stonith_t *st, int options, const char *node, const char *action, + int timeout, int tolerance); +@@ -275,7 +278,8 @@ typedef struct stonith_api_operations_s + /*! + * \brief Manually confirm that a node is down. + * +- * \return Legacy Pacemaker return code ++ * \return pcmk_ok (if synchronous) or positive call ID (if asynchronous) ++ * on success, otherwise a negative legacy Pacemaker return code + */ + int (*confirm)(stonith_t *st, int options, const char *node); + +@@ -304,9 +308,6 @@ typedef struct stonith_api_operations_s + * \param[in] callback The callback function to register + * + * \return \c TRUE on success, \c FALSE if call_id is negative, -errno otherwise +- * +- * \todo This function should return \c pcmk_ok on success, and \c call_id +- * when negative, but that would break backward compatibility. + */ + int (*register_callback)(stonith_t *st, + int call_id, +@@ -317,12 +318,14 @@ typedef struct stonith_api_operations_s + void (*callback)(stonith_t *st, stonith_callback_data_t *data)); + + /*! +- * \brief Remove a registered callback for a given call id. ++ * \brief Remove a registered callback for a given call id ++ * ++ * \return pcmk_ok + */ + int (*remove_callback)(stonith_t *st, int call_id, bool all_callbacks); + + /*! 
+- * \brief Remove fencing level for specific node, node regex or attribute ++ * \brief Unregister fencing level for specified node, pattern or attribute + * + * \param[in] st Fencer connection to use + * \param[in] options Bitmask of stonith_call_options to pass to the fencer +@@ -332,7 +335,8 @@ typedef struct stonith_api_operations_s + * \param[in] value If not NULL, target by this node attribute value + * \param[in] level Index number of level to remove + * +- * \return 0 on success, negative error code otherwise ++ * \return pcmk_ok (if synchronous) or positive call ID (if asynchronous) ++ * on success, otherwise a negative legacy Pacemaker return code + * + * \note The caller should set only one of node, pattern or attr/value. + */ +@@ -341,7 +345,7 @@ typedef struct stonith_api_operations_s + const char *attr, const char *value, int level); + + /*! +- * \brief Register fencing level for specific node, node regex or attribute ++ * \brief Register fencing level for specified node, pattern or attribute + * + * \param[in] st Fencer connection to use + * \param[in] options Bitmask of stonith_call_options to pass to fencer +@@ -352,7 +356,8 @@ typedef struct stonith_api_operations_s + * \param[in] level Index number of level to add + * \param[in] device_list Devices to use in level + * +- * \return 0 on success, negative error code otherwise ++ * \return pcmk_ok (if synchronous) or positive call ID (if asynchronous) ++ * on success, otherwise a negative legacy Pacemaker return code + * + * \note The caller should set only one of node, pattern or attr/value. + */ +@@ -398,7 +403,8 @@ typedef struct stonith_api_operations_s + * \param delay, Apply a fencing delay. 
Value -1 means disable also any + * static/random fencing delays from pcmk_delay_base/max + * +- * \return Legacy Pacemaker return code ++ * \return pcmk_ok (if synchronous) or positive call ID (if asynchronous) ++ * on success, otherwise a negative legacy Pacemaker return code + */ + int (*fence_with_delay)(stonith_t *st, int options, const char *node, const char *action, + int timeout, int tolerance, int delay); +-- +2.27.0 + + +From 18c382731889b626b21ba6a14f9213ef1e45a524 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Tue, 23 Nov 2021 11:14:24 -0600 +Subject: [PATCH 11/13] Refactor: fencing: define constant for XML attribute + for action output + +--- + daemons/fenced/fenced_commands.c | 4 ++-- + include/crm/fencing/internal.h | 1 + + lib/fencing/st_actions.c | 4 ++-- + lib/fencing/st_client.c | 2 +- + 4 files changed, 6 insertions(+), 5 deletions(-) + +diff --git a/daemons/fenced/fenced_commands.c b/daemons/fenced/fenced_commands.c +index 26501a4b3..aa14c52af 100644 +--- a/daemons/fenced/fenced_commands.c ++++ b/daemons/fenced/fenced_commands.c +@@ -2677,7 +2677,7 @@ stonith_construct_reply(xmlNode * request, const char *output, xmlNode * data, i + + crm_xml_add(reply, "st_origin", __func__); + crm_xml_add(reply, F_TYPE, T_STONITH_NG); +- crm_xml_add(reply, "st_output", output); ++ crm_xml_add(reply, F_STONITH_OUTPUT, output); + crm_xml_add_int(reply, F_STONITH_RC, rc); + + if (request == NULL) { +@@ -2743,7 +2743,7 @@ construct_async_reply(async_command_t *cmd, const pcmk__action_result_t *result) + crm_xml_add_int(reply, F_STONITH_CALLOPTS, cmd->options); + crm_xml_add_int(reply, F_STONITH_RC, + pcmk_rc2legacy(stonith__result2rc(result))); +- crm_xml_add(reply, "st_output", result->action_stdout); ++ crm_xml_add(reply, F_STONITH_OUTPUT, result->action_stdout); + return reply; + } + +diff --git a/include/crm/fencing/internal.h b/include/crm/fencing/internal.h +index 4b5fd3959..f0d294a0b 100644 +--- a/include/crm/fencing/internal.h ++++ 
b/include/crm/fencing/internal.h +@@ -105,6 +105,7 @@ void stonith__device_parameter_flags(uint32_t *device_flags, + # define F_STONITH_REMOTE_OP_ID "st_remote_op" + # define F_STONITH_REMOTE_OP_ID_RELAY "st_remote_op_relay" + # define F_STONITH_RC "st_rc" ++# define F_STONITH_OUTPUT "st_output" + /*! Timeout period per a device execution */ + # define F_STONITH_TIMEOUT "st_timeout" + # define F_STONITH_TOLERANCE "st_tolerance" +diff --git a/lib/fencing/st_actions.c b/lib/fencing/st_actions.c +index d4fc3f5ed..5636810a5 100644 +--- a/lib/fencing/st_actions.c ++++ b/lib/fencing/st_actions.c +@@ -425,7 +425,7 @@ stonith__xe_set_result(xmlNode *xml, const pcmk__action_result_t *result) + crm_xml_add_int(xml, XML_LRM_ATTR_OPSTATUS, (int) execution_status); + crm_xml_add_int(xml, XML_LRM_ATTR_RC, exit_status); + crm_xml_add(xml, XML_LRM_ATTR_EXIT_REASON, exit_reason); +- crm_xml_add(xml, "st_output", action_stdout); ++ crm_xml_add(xml, F_STONITH_OUTPUT, action_stdout); + + /* @COMPAT Peers in rolling upgrades, Pacemaker Remote nodes, and external + * code that use libstonithd <=2.1.2 don't check for the full result, and +@@ -474,7 +474,7 @@ stonith__xe_get_result(xmlNode *xml, pcmk__action_result_t *result) + CRM_CHECK((xml != NULL) && (result != NULL), return); + + exit_reason = crm_element_value(xml, XML_LRM_ATTR_EXIT_REASON); +- action_stdout = crm_element_value_copy(xml, "st_output"); ++ action_stdout = crm_element_value_copy(xml, F_STONITH_OUTPUT); + + // A result must include an exit status and execution status + if ((crm_element_value_int(xml, XML_LRM_ATTR_RC, &exit_status) < 0) +diff --git a/lib/fencing/st_client.c b/lib/fencing/st_client.c +index 170b9d450..2dfadf922 100644 +--- a/lib/fencing/st_client.c ++++ b/lib/fencing/st_client.c +@@ -600,7 +600,7 @@ stonith_api_list(stonith_t * stonith, int call_options, const char *id, char **l + if (output && list_info) { + const char *list_str; + +- list_str = crm_element_value(output, "st_output"); ++ list_str = 
crm_element_value(output, F_STONITH_OUTPUT); + + if (list_str) { + *list_info = strdup(list_str); +-- +2.27.0 + + +From 9fe9ed5d46c810cb9c12eb07271373ab92d271cd Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Tue, 23 Nov 2021 11:39:32 -0600 +Subject: [PATCH 12/13] Refactor: fencing: simplify invoking callbacks + +--- + lib/fencing/st_client.c | 42 +++++++++++++++++------------------------ + 1 file changed, 17 insertions(+), 25 deletions(-) + +diff --git a/lib/fencing/st_client.c b/lib/fencing/st_client.c +index 2dfadf922..2ca094566 100644 +--- a/lib/fencing/st_client.c ++++ b/lib/fencing/st_client.c +@@ -887,8 +887,7 @@ static void + invoke_registered_callbacks(stonith_t *stonith, xmlNode *msg, int call_id) + { + stonith_private_t *private = NULL; +- stonith_callback_client_t *blob = NULL; +- stonith_callback_client_t local_blob; ++ stonith_callback_client_t *cb_info = NULL; + int rc = pcmk_ok; + + CRM_CHECK(stonith != NULL, return); +@@ -896,11 +895,6 @@ invoke_registered_callbacks(stonith_t *stonith, xmlNode *msg, int call_id) + + private = stonith->st_private; + +- local_blob.id = NULL; +- local_blob.callback = NULL; +- local_blob.user_data = NULL; +- local_blob.only_success = FALSE; +- + if (msg == NULL) { + // Fencer didn't reply in time + rc = -ETIME; +@@ -919,26 +913,21 @@ invoke_registered_callbacks(stonith_t *stonith, xmlNode *msg, int call_id) + } + } + +- blob = pcmk__intkey_table_lookup(private->stonith_op_callback_table, +- call_id); +- if (blob != NULL) { +- local_blob = *blob; +- blob = NULL; +- +- stonith_api_del_callback(stonith, call_id, FALSE); +- +- } else { +- crm_trace("No callback found for call %d", call_id); +- local_blob.callback = NULL; ++ if (call_id > 0) { ++ cb_info = pcmk__intkey_table_lookup(private->stonith_op_callback_table, ++ call_id); + } + +- if (local_blob.callback != NULL && (rc == pcmk_ok || local_blob.only_success == FALSE)) { +- crm_trace("Invoking callback %s for call %d", crm_str(local_blob.id), call_id); +- 
invoke_fence_action_callback(stonith, call_id, rc, local_blob.user_data, +- local_blob.callback); ++ if ((cb_info != NULL) && (cb_info->callback != NULL) ++ && (rc == pcmk_ok || !(cb_info->only_success))) { ++ crm_trace("Invoking callback %s for call %d", ++ crm_str(cb_info->id), call_id); ++ invoke_fence_action_callback(stonith, call_id, rc, cb_info->user_data, ++ cb_info->callback); + +- } else if (private->op_callback == NULL && rc != pcmk_ok) { +- crm_warn("Fencing command failed: %s", pcmk_strerror(rc)); ++ } else if ((private->op_callback == NULL) && (rc != pcmk_ok)) { ++ crm_warn("Fencing action without registered callback failed: %s", ++ pcmk_strerror(rc)); + crm_log_xml_debug(msg, "Failed fence update"); + } + +@@ -947,7 +936,10 @@ invoke_registered_callbacks(stonith_t *stonith, xmlNode *msg, int call_id) + invoke_fence_action_callback(stonith, call_id, rc, NULL, + private->op_callback); + } +- crm_trace("OP callback activated."); ++ ++ if (cb_info != NULL) { ++ stonith_api_del_callback(stonith, call_id, FALSE); ++ } + } + + static gboolean +-- +2.27.0 + + +From 8113b800ce677ba17a16ca176e8f6f9b4a042316 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Tue, 23 Nov 2021 18:14:48 -0600 +Subject: [PATCH 13/13] Refactor: fencing: add a missing "break" statement + +No effect, but more correct +--- + lib/fencing/st_actions.c | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/lib/fencing/st_actions.c b/lib/fencing/st_actions.c +index 5636810a5..7eaa8b0f2 100644 +--- a/lib/fencing/st_actions.c ++++ b/lib/fencing/st_actions.c +@@ -336,6 +336,7 @@ stonith__result2rc(const pcmk__action_result_t *result) + case CRM_EX_EXPIRED: return EHOSTUNREACH; + default: break; + } ++ break; + + default: + break; +-- +2.27.0 + diff --git a/SOURCES/004-systemd-metadata.patch b/SOURCES/004-systemd-metadata.patch new file mode 100644 index 0000000..142ef6a --- /dev/null +++ b/SOURCES/004-systemd-metadata.patch @@ -0,0 +1,73 @@ +From 09ef95a2eed48b4eb7488788a1b655d67eafe783 Mon 
Sep 17 00:00:00 2001 +From: Chris Lumens +Date: Tue, 30 Nov 2021 14:47:12 -0500 +Subject: [PATCH] Low: libcrmservice: Handle systemd service templates. + +These unit files (which have an @ sign at the end) expect to be +parameterized by an instance name. Not providing an instance name +causes the dbus lookup to fail, and we fall back to assume this is an +LSB service. If the user doesn't provide an instance name, just add a +fake one. It doesn't seem to matter what name is given for the lookup. + +See: rhbz#2003151 +--- + lib/services/systemd.c | 22 ++++++++++++++++------ + 1 file changed, 16 insertions(+), 6 deletions(-) + +diff --git a/lib/services/systemd.c b/lib/services/systemd.c +index 8e9fff484..27a3b376d 100644 +--- a/lib/services/systemd.c ++++ b/lib/services/systemd.c +@@ -206,17 +206,27 @@ systemd_unit_extension(const char *name) + } + + static char * +-systemd_service_name(const char *name) ++systemd_service_name(const char *name, bool add_instance_name) + { +- if (name == NULL) { ++ if (pcmk__str_empty(name)) { + return NULL; + } + + if (systemd_unit_extension(name)) { + return strdup(name); +- } + +- return crm_strdup_printf("%s.service", name); ++ /* Services that end with an @ sign are systemd templates. They expect an ++ * instance name to follow the service name. If no instance name was ++ * provided, just add "x" to the string as the instance name. It doesn't ++ * seem to matter for purposes of looking up whether a service exists or ++ * not. 
++ */ ++ } else if (add_instance_name && *(name+strlen(name)-1) == '@') { ++ return crm_strdup_printf("%sx.service", name); ++ ++ } else { ++ return crm_strdup_printf("%s.service", name); ++ } + } + + static void +@@ -427,7 +437,7 @@ invoke_unit_by_name(const char *arg_name, svc_action_t *op, char **path) + CRM_ASSERT(msg != NULL); + + // Add the (expanded) unit name as the argument +- name = systemd_service_name(arg_name); ++ name = systemd_service_name(arg_name, op == NULL || pcmk__str_eq(op->action, "meta-data", pcmk__str_none)); + CRM_LOG_ASSERT(dbus_message_append_args(msg, DBUS_TYPE_STRING, &name, + DBUS_TYPE_INVALID)); + free(name); +@@ -944,7 +954,7 @@ invoke_unit_by_path(svc_action_t *op, const char *unit) + /* (ss) */ + { + const char *replace_s = "replace"; +- char *name = systemd_service_name(op->agent); ++ char *name = systemd_service_name(op->agent, pcmk__str_eq(op->action, "meta-data", pcmk__str_none)); + + CRM_LOG_ASSERT(dbus_message_append_args(msg, DBUS_TYPE_STRING, &name, DBUS_TYPE_INVALID)); + CRM_LOG_ASSERT(dbus_message_append_args(msg, DBUS_TYPE_STRING, &replace_s, DBUS_TYPE_INVALID)); +-- +2.27.0 + diff --git a/SOURCES/005-fencing-reasons.patch b/SOURCES/005-fencing-reasons.patch new file mode 100644 index 0000000..e0772c6 --- /dev/null +++ b/SOURCES/005-fencing-reasons.patch @@ -0,0 +1,2200 @@ +From 3d10dad9a555aae040d8473edfe31a4e4279c066 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Thu, 11 Nov 2021 12:34:03 -0600 +Subject: [PATCH 01/19] Refactor: libcrmcommon: add internal API for checking + for fencing action + +The naming is a little awkward -- "fencing action" has multiple meanings +depending on the context. It can refer to fencer API requests, fence device +actions, fence agent actions, or just those actions that fence a node (off and +reboot). + +This new function pcmk__is_fencing_action() uses the last meaning, so it does +*not* return true for unfencing ("on" actions). 
+--- + include/crm/common/internal.h | 1 + + lib/common/operations.c | 14 ++++++++++++++ + 2 files changed, 15 insertions(+) + +diff --git a/include/crm/common/internal.h b/include/crm/common/internal.h +index a35c5769a..694fc6cd4 100644 +--- a/include/crm/common/internal.h ++++ b/include/crm/common/internal.h +@@ -218,6 +218,7 @@ char *pcmk__notify_key(const char *rsc_id, const char *notify_type, + char *pcmk__transition_key(int transition_id, int action_id, int target_rc, + const char *node); + void pcmk__filter_op_for_digest(xmlNode *param_set); ++bool pcmk__is_fencing_action(const char *action); + + + // bitwise arithmetic utilities +diff --git a/lib/common/operations.c b/lib/common/operations.c +index aa7106ce6..366c18970 100644 +--- a/lib/common/operations.c ++++ b/lib/common/operations.c +@@ -523,3 +523,17 @@ crm_op_needs_metadata(const char *rsc_class, const char *op) + CRMD_ACTION_MIGRATE, CRMD_ACTION_MIGRATED, + CRMD_ACTION_NOTIFY, NULL); + } ++ ++/*! ++ * \internal ++ * \brief Check whether an action name is for a fencing action ++ * ++ * \param[in] action Action name to check ++ * ++ * \return true if \p action is "off", "reboot", or "poweroff", otherwise false ++ */ ++bool ++pcmk__is_fencing_action(const char *action) ++{ ++ return pcmk__str_any_of(action, "off", "reboot", "poweroff", NULL); ++} +-- +2.27.0 + + +From 86ac00fb3e99d79ca2c442ae1670fe850146f734 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Thu, 11 Nov 2021 12:38:58 -0600 +Subject: [PATCH 02/19] Low: fencer,scheduler: compare fence action names + case-sensitively + +Use the new convenience function pcmk__is_fencing_action() to check whether +an action name is a fencing action ("off", "reboot", or "poweroff"). This +changes the behavior from case-insensitive to case-sensitive, which is more +appropriate (the case-insensitivity was inherited from lazy use of the old +safe_str_eq() function which was always case-insensitive). 
+--- + daemons/fenced/fenced_commands.c | 6 +++--- + daemons/fenced/fenced_remote.c | 2 +- + lib/pacemaker/pcmk_graph_producer.c | 2 +- + lib/pengine/common.c | 8 +------- + 4 files changed, 6 insertions(+), 12 deletions(-) + +diff --git a/daemons/fenced/fenced_commands.c b/daemons/fenced/fenced_commands.c +index 63bfad3a9..46c840f2a 100644 +--- a/daemons/fenced/fenced_commands.c ++++ b/daemons/fenced/fenced_commands.c +@@ -128,7 +128,7 @@ get_action_delay_max(stonith_device_t * device, const char * action) + const char *value = NULL; + int delay_max = 0; + +- if (!pcmk__strcase_any_of(action, "off", "reboot", NULL)) { ++ if (!pcmk__is_fencing_action(action)) { + return 0; + } + +@@ -146,7 +146,7 @@ get_action_delay_base(stonith_device_t *device, const char *action, const char * + char *hash_value = NULL; + int delay_base = 0; + +- if (!pcmk__strcase_any_of(action, "off", "reboot", NULL)) { ++ if (!pcmk__is_fencing_action(action)) { + return 0; + } + +@@ -448,7 +448,7 @@ stonith_device_execute(stonith_device_t * device) + + if (pcmk__str_any_of(device->agent, STONITH_WATCHDOG_AGENT, + STONITH_WATCHDOG_AGENT_INTERNAL, NULL)) { +- if (pcmk__strcase_any_of(cmd->action, "reboot", "off", NULL)) { ++ if (pcmk__is_fencing_action(cmd->action)) { + if (node_does_watchdog_fencing(stonith_our_uname)) { + pcmk__panic(__func__); + goto done; +diff --git a/daemons/fenced/fenced_remote.c b/daemons/fenced/fenced_remote.c +index 963433bf3..358ea3aa7 100644 +--- a/daemons/fenced/fenced_remote.c ++++ b/daemons/fenced/fenced_remote.c +@@ -1758,7 +1758,7 @@ all_topology_devices_found(remote_fencing_op_t * op) + if (!tp) { + return FALSE; + } +- if (pcmk__strcase_any_of(op->action, "off", "reboot", NULL)) { ++ if (pcmk__is_fencing_action(op->action)) { + /* Don't count the devices on the target node if we are killing + * the target node. 
*/ + skip_target = TRUE; +diff --git a/lib/pacemaker/pcmk_graph_producer.c b/lib/pacemaker/pcmk_graph_producer.c +index ffcbd1274..5bec9d8ce 100644 +--- a/lib/pacemaker/pcmk_graph_producer.c ++++ b/lib/pacemaker/pcmk_graph_producer.c +@@ -721,7 +721,7 @@ add_downed_nodes(xmlNode *xml, const pe_action_t *action, + /* Fencing makes the action's node and any hosted guest nodes down */ + const char *fence = g_hash_table_lookup(action->meta, "stonith_action"); + +- if (pcmk__strcase_any_of(fence, "off", "reboot", NULL)) { ++ if (pcmk__is_fencing_action(fence)) { + xmlNode *downed = create_xml_node(xml, XML_GRAPH_TAG_DOWNED); + add_node_to_xml_by_id(action->node->details->id, downed); + pe_foreach_guest_node(data_set, action->node, add_node_to_xml, downed); +diff --git a/lib/pengine/common.c b/lib/pengine/common.c +index 236fc26b1..fe4223816 100644 +--- a/lib/pengine/common.c ++++ b/lib/pengine/common.c +@@ -27,12 +27,6 @@ check_health(const char *value) + "migrate-on-red", NULL); + } + +-static bool +-check_stonith_action(const char *value) +-{ +- return pcmk__strcase_any_of(value, "reboot", "poweroff", "off", NULL); +-} +- + static bool + check_placement_strategy(const char *value) + { +@@ -114,7 +108,7 @@ static pcmk__cluster_option_t pe_opts[] = { + }, + { + "stonith-action", NULL, "select", "reboot, off, poweroff", +- "reboot", check_stonith_action, ++ "reboot", pcmk__is_fencing_action, + "Action to send to fence device when a node needs to be fenced " + "(\"poweroff\" is a deprecated alias for \"off\")", + NULL +-- +2.27.0 + + +From c8f6e8a04c4fa4271db817af0a23aa941c9d7689 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Fri, 12 Nov 2021 17:42:21 -0600 +Subject: [PATCH 03/19] Refactor: fencing: rename type for peer query replies + +st_query_result_t contains the device information parsed from a peer's query +reply, but the name could easily be confused with the actual success/failure +result of the query action itself. Rename it to peer_device_info_t. 
+--- + daemons/fenced/fenced_remote.c | 103 +++++++++++++++++---------------- + 1 file changed, 52 insertions(+), 51 deletions(-) + +diff --git a/daemons/fenced/fenced_remote.c b/daemons/fenced/fenced_remote.c +index 358ea3aa7..9e2f62804 100644 +--- a/daemons/fenced/fenced_remote.c ++++ b/daemons/fenced/fenced_remote.c +@@ -41,7 +41,7 @@ + + /* When one fencer queries its peers for devices able to handle a fencing + * request, each peer will reply with a list of such devices available to it. +- * Each reply will be parsed into a st_query_result_t, with each device's ++ * Each reply will be parsed into a peer_device_info_t, with each device's + * information kept in a device_properties_t. + */ + +@@ -72,18 +72,19 @@ typedef struct st_query_result_s { + int ndevices; + /* Devices available to this host that are capable of fencing the target */ + GHashTable *devices; +-} st_query_result_t; ++} peer_device_info_t; + + GHashTable *stonith_remote_op_list = NULL; + +-void call_remote_stonith(remote_fencing_op_t * op, st_query_result_t * peer, int rc); ++void call_remote_stonith(remote_fencing_op_t *op, peer_device_info_t *peer, ++ int rc); + static void remote_op_done(remote_fencing_op_t * op, xmlNode * data, int rc, int dup); + extern xmlNode *stonith_create_op(int call_id, const char *token, const char *op, xmlNode * data, + int call_options); + + static void report_timeout_period(remote_fencing_op_t * op, int op_timeout); + static int get_op_total_timeout(const remote_fencing_op_t *op, +- const st_query_result_t *chosen_peer); ++ const peer_device_info_t *chosen_peer); + + static gint + sort_strings(gconstpointer a, gconstpointer b) +@@ -95,7 +96,7 @@ static void + free_remote_query(gpointer data) + { + if (data) { +- st_query_result_t *query = data; ++ peer_device_info_t *query = data; + + crm_trace("Free'ing query result from %s", query->host); + g_hash_table_destroy(query->devices); +@@ -150,8 +151,8 @@ count_peer_device(gpointer key, gpointer value, gpointer 
user_data) + * \return Number of devices available to peer that were not already executed + */ + static int +-count_peer_devices(const remote_fencing_op_t *op, const st_query_result_t *peer, +- gboolean verified_only) ++count_peer_devices(const remote_fencing_op_t *op, ++ const peer_device_info_t *peer, gboolean verified_only) + { + struct peer_count_data data; + +@@ -175,7 +176,7 @@ count_peer_devices(const remote_fencing_op_t *op, const st_query_result_t *peer, + * \return Device properties if found, NULL otherwise + */ + static device_properties_t * +-find_peer_device(const remote_fencing_op_t *op, const st_query_result_t *peer, ++find_peer_device(const remote_fencing_op_t *op, const peer_device_info_t *peer, + const char *device) + { + device_properties_t *props = g_hash_table_lookup(peer->devices, device); +@@ -196,7 +197,7 @@ find_peer_device(const remote_fencing_op_t *op, const st_query_result_t *peer, + * \return TRUE if device was found and marked, FALSE otherwise + */ + static gboolean +-grab_peer_device(const remote_fencing_op_t *op, st_query_result_t *peer, ++grab_peer_device(const remote_fencing_op_t *op, peer_device_info_t *peer, + const char *device, gboolean verified_devices_only) + { + device_properties_t *props = find_peer_device(op, peer, device); +@@ -1216,7 +1217,7 @@ enum find_best_peer_options { + FIND_PEER_VERIFIED_ONLY = 0x0004, + }; + +-static st_query_result_t * ++static peer_device_info_t * + find_best_peer(const char *device, remote_fencing_op_t * op, enum find_best_peer_options options) + { + GList *iter = NULL; +@@ -1227,7 +1228,7 @@ find_best_peer(const char *device, remote_fencing_op_t * op, enum find_best_peer + } + + for (iter = op->query_results; iter != NULL; iter = iter->next) { +- st_query_result_t *peer = iter->data; ++ peer_device_info_t *peer = iter->data; + + crm_trace("Testing result from %s targeting %s with %d device%s: %d %x", + peer->host, op->target, peer->ndevices, +@@ -1257,11 +1258,11 @@ find_best_peer(const char 
*device, remote_fencing_op_t * op, enum find_best_peer + return NULL; + } + +-static st_query_result_t * ++static peer_device_info_t * + stonith_choose_peer(remote_fencing_op_t * op) + { + const char *device = NULL; +- st_query_result_t *peer = NULL; ++ peer_device_info_t *peer = NULL; + uint32_t active = fencing_active_peers(); + + do { +@@ -1317,8 +1318,8 @@ stonith_choose_peer(remote_fencing_op_t * op) + } + + static int +-get_device_timeout(const remote_fencing_op_t *op, const st_query_result_t *peer, +- const char *device) ++get_device_timeout(const remote_fencing_op_t *op, ++ const peer_device_info_t *peer, const char *device) + { + device_properties_t *props; + +@@ -1338,7 +1339,7 @@ get_device_timeout(const remote_fencing_op_t *op, const st_query_result_t *peer, + + struct timeout_data { + const remote_fencing_op_t *op; +- const st_query_result_t *peer; ++ const peer_device_info_t *peer; + int total_timeout; + }; + +@@ -1365,7 +1366,7 @@ add_device_timeout(gpointer key, gpointer value, gpointer user_data) + } + + static int +-get_peer_timeout(const remote_fencing_op_t *op, const st_query_result_t *peer) ++get_peer_timeout(const remote_fencing_op_t *op, const peer_device_info_t *peer) + { + struct timeout_data timeout; + +@@ -1380,7 +1381,7 @@ get_peer_timeout(const remote_fencing_op_t *op, const st_query_result_t *peer) + + static int + get_op_total_timeout(const remote_fencing_op_t *op, +- const st_query_result_t *chosen_peer) ++ const peer_device_info_t *chosen_peer) + { + int total_timeout = 0; + stonith_topology_t *tp = find_topology_for_host(op->target); +@@ -1403,7 +1404,7 @@ get_op_total_timeout(const remote_fencing_op_t *op, + } + for (device_list = tp->levels[i]; device_list; device_list = device_list->next) { + for (iter = op->query_results; iter != NULL; iter = iter->next) { +- const st_query_result_t *peer = iter->data; ++ const peer_device_info_t *peer = iter->data; + + if (find_peer_device(op, peer, device_list->data)) { + total_timeout += 
get_device_timeout(op, peer, +@@ -1555,7 +1556,7 @@ check_watchdog_fencing_and_wait(remote_fencing_op_t * op) + } + + void +-call_remote_stonith(remote_fencing_op_t * op, st_query_result_t * peer, int rc) ++call_remote_stonith(remote_fencing_op_t *op, peer_device_info_t *peer, int rc) + { + const char *device = NULL; + int timeout = op->base_timeout; +@@ -1734,8 +1735,8 @@ call_remote_stonith(remote_fencing_op_t * op, st_query_result_t * peer, int rc) + static gint + sort_peers(gconstpointer a, gconstpointer b) + { +- const st_query_result_t *peer_a = a; +- const st_query_result_t *peer_b = b; ++ const peer_device_info_t *peer_a = a; ++ const peer_device_info_t *peer_b = b; + + return (peer_b->ndevices - peer_a->ndevices); + } +@@ -1768,7 +1769,7 @@ all_topology_devices_found(remote_fencing_op_t * op) + for (device = tp->levels[i]; device; device = device->next) { + match = NULL; + for (iter = op->query_results; iter && !match; iter = iter->next) { +- st_query_result_t *peer = iter->data; ++ peer_device_info_t *peer = iter->data; + + if (skip_target && pcmk__str_eq(peer->host, op->target, pcmk__str_casei)) { + continue; +@@ -1850,31 +1851,31 @@ parse_action_specific(xmlNode *xml, const char *peer, const char *device, + * + * \param[in] xml XML node containing device properties + * \param[in,out] op Operation that query and reply relate to +- * \param[in,out] result Peer's results ++ * \param[in,out] peer Peer's device information + * \param[in] device ID of device being parsed + */ + static void + add_device_properties(xmlNode *xml, remote_fencing_op_t *op, +- st_query_result_t *result, const char *device) ++ peer_device_info_t *peer, const char *device) + { + xmlNode *child; + int verified = 0; + device_properties_t *props = calloc(1, sizeof(device_properties_t)); + +- /* Add a new entry to this result's devices list */ ++ /* Add a new entry to this peer's devices list */ + CRM_ASSERT(props != NULL); +- g_hash_table_insert(result->devices, strdup(device), props); 
++ g_hash_table_insert(peer->devices, strdup(device), props); + + /* Peers with verified (monitored) access will be preferred */ + crm_element_value_int(xml, F_STONITH_DEVICE_VERIFIED, &verified); + if (verified) { + crm_trace("Peer %s has confirmed a verified device %s", +- result->host, device); ++ peer->host, device); + props->verified = TRUE; + } + + /* Parse action-specific device properties */ +- parse_action_specific(xml, result->host, device, op_requested_action(op), ++ parse_action_specific(xml, peer->host, device, op_requested_action(op), + op, st_phase_requested, props); + for (child = pcmk__xml_first_child(xml); child != NULL; + child = pcmk__xml_next(child)) { +@@ -1883,10 +1884,10 @@ add_device_properties(xmlNode *xml, remote_fencing_op_t *op, + * winds up getting remapped. + */ + if (pcmk__str_eq(ID(child), "off", pcmk__str_casei)) { +- parse_action_specific(child, result->host, device, "off", ++ parse_action_specific(child, peer->host, device, "off", + op, st_phase_off, props); + } else if (pcmk__str_eq(ID(child), "on", pcmk__str_casei)) { +- parse_action_specific(child, result->host, device, "on", ++ parse_action_specific(child, peer->host, device, "on", + op, st_phase_on, props); + } + } +@@ -1903,17 +1904,17 @@ add_device_properties(xmlNode *xml, remote_fencing_op_t *op, + * + * \return Newly allocated result structure with parsed reply + */ +-static st_query_result_t * ++static peer_device_info_t * + add_result(remote_fencing_op_t *op, const char *host, int ndevices, xmlNode *xml) + { +- st_query_result_t *result = calloc(1, sizeof(st_query_result_t)); ++ peer_device_info_t *peer = calloc(1, sizeof(peer_device_info_t)); + xmlNode *child; + + // cppcheck seems not to understand the abort logic in CRM_CHECK + // cppcheck-suppress memleak +- CRM_CHECK(result != NULL, return NULL); +- result->host = strdup(host); +- result->devices = pcmk__strkey_table(free, free); ++ CRM_CHECK(peer != NULL, return NULL); ++ peer->host = strdup(host); ++ 
peer->devices = pcmk__strkey_table(free, free); + + /* Each child element describes one capable device available to the peer */ + for (child = pcmk__xml_first_child(xml); child != NULL; +@@ -1921,17 +1922,17 @@ add_result(remote_fencing_op_t *op, const char *host, int ndevices, xmlNode *xml + const char *device = ID(child); + + if (device) { +- add_device_properties(child, op, result, device); ++ add_device_properties(child, op, peer, device); + } + } + +- result->ndevices = g_hash_table_size(result->devices); +- CRM_CHECK(ndevices == result->ndevices, ++ peer->ndevices = g_hash_table_size(peer->devices); ++ CRM_CHECK(ndevices == peer->ndevices, + crm_err("Query claimed to have %d device%s but %d found", +- ndevices, pcmk__plural_s(ndevices), result->ndevices)); ++ ndevices, pcmk__plural_s(ndevices), peer->ndevices)); + +- op->query_results = g_list_insert_sorted(op->query_results, result, sort_peers); +- return result; ++ op->query_results = g_list_insert_sorted(op->query_results, peer, sort_peers); ++ return peer; + } + + /*! +@@ -1957,7 +1958,7 @@ process_remote_stonith_query(xmlNode * msg) + const char *id = NULL; + const char *host = NULL; + remote_fencing_op_t *op = NULL; +- st_query_result_t *result = NULL; ++ peer_device_info_t *peer = NULL; + uint32_t replies_expected; + xmlNode *dev = get_xpath_object("//@" F_STONITH_REMOTE_OP_ID, msg, LOG_ERR); + +@@ -1991,7 +1992,7 @@ process_remote_stonith_query(xmlNode * msg) + op->replies, replies_expected, host, + op->target, op->action, ndevices, pcmk__plural_s(ndevices), id); + if (ndevices > 0) { +- result = add_result(op, host, ndevices, dev); ++ peer = add_result(op, host, ndevices, dev); + } + + if (pcmk_is_set(op->call_options, st_opt_topology)) { +@@ -2001,7 +2002,7 @@ process_remote_stonith_query(xmlNode * msg) + if (op->state == st_query && all_topology_devices_found(op)) { + /* All the query results are in for the topology, start the fencing ops. 
*/ + crm_trace("All topology devices found"); +- call_remote_stonith(op, result, pcmk_ok); ++ call_remote_stonith(op, peer, pcmk_ok); + + } else if (have_all_replies) { + crm_info("All topology query replies have arrived, continuing (%d expected/%d received) ", +@@ -2010,15 +2011,15 @@ process_remote_stonith_query(xmlNode * msg) + } + + } else if (op->state == st_query) { +- int nverified = count_peer_devices(op, result, TRUE); ++ int nverified = count_peer_devices(op, peer, TRUE); + + /* We have a result for a non-topology fencing op that looks promising, + * go ahead and start fencing before query timeout */ +- if (result && (host_is_target == FALSE) && nverified) { ++ if ((peer != NULL) && !host_is_target && nverified) { + /* we have a verified device living on a peer that is not the target */ + crm_trace("Found %d verified device%s", + nverified, pcmk__plural_s(nverified)); +- call_remote_stonith(op, result, pcmk_ok); ++ call_remote_stonith(op, peer, pcmk_ok); + + } else if (have_all_replies) { + crm_info("All query replies have arrived, continuing (%d expected/%d received) ", +@@ -2029,10 +2030,10 @@ process_remote_stonith_query(xmlNode * msg) + crm_trace("Waiting for more peer results before launching fencing operation"); + } + +- } else if (result && (op->state == st_done)) { ++ } else if ((peer != NULL) && (op->state == st_done)) { + crm_info("Discarding query result from %s (%d device%s): " +- "Operation is %s", result->host, +- result->ndevices, pcmk__plural_s(result->ndevices), ++ "Operation is %s", peer->host, ++ peer->ndevices, pcmk__plural_s(peer->ndevices), + stonith_op_state_str(op->state)); + } + +-- +2.27.0 + + +From 913e0620310089d2250e9ecde383df757f8e8063 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Thu, 11 Nov 2021 12:46:37 -0600 +Subject: [PATCH 04/19] Low: fencer: improve broadcasting replies for fenced + originators + +If the target of a fencing action was also the originator, the executioner +broadcasts the result on their behalf. 
+ +Previously, it would check if the action was not in a list of actions that are +never broadcasted. However we really only want to broadcast off/reboot results +so just check for that instead. + +This also rearranges reply creation slightly so we don't trace-log the reply +until it is fully created. +--- + daemons/fenced/fenced_commands.c | 19 +++++++++---------- + 1 file changed, 9 insertions(+), 10 deletions(-) + +diff --git a/daemons/fenced/fenced_commands.c b/daemons/fenced/fenced_commands.c +index 46c840f2a..e4185f6e1 100644 +--- a/daemons/fenced/fenced_commands.c ++++ b/daemons/fenced/fenced_commands.c +@@ -2385,32 +2385,31 @@ send_async_reply(async_command_t *cmd, const pcmk__action_result_t *result, + int pid, bool merged) + { + xmlNode *reply = NULL; +- gboolean bcast = FALSE; ++ bool bcast = false; + + CRM_CHECK((cmd != NULL) && (result != NULL), return); + + reply = construct_async_reply(cmd, result); + +- // Only replies for certain actions are broadcast +- if (pcmk__str_any_of(cmd->action, "metadata", "monitor", "list", "status", +- NULL)) { +- crm_trace("Never broadcast '%s' replies", cmd->action); ++ // If target was also the originator, broadcast fencing results for it ++ if (!stand_alone && pcmk__is_fencing_action(cmd->action) ++ && pcmk__str_eq(cmd->origin, cmd->victim, pcmk__str_casei)) { + +- } else if (!stand_alone && pcmk__str_eq(cmd->origin, cmd->victim, pcmk__str_casei) && !pcmk__str_eq(cmd->action, "on", pcmk__str_casei)) { +- crm_trace("Broadcast '%s' reply for %s", cmd->action, cmd->victim); ++ crm_trace("Broadcast '%s' result for %s (target was also originator)", ++ cmd->action, cmd->victim); + crm_xml_add(reply, F_SUBTYPE, "broadcast"); +- bcast = TRUE; ++ crm_xml_add(reply, F_STONITH_OPERATION, T_STONITH_NOTIFY); ++ bcast = true; + } + + log_async_result(cmd, result, pid, NULL, merged); +- crm_log_xml_trace(reply, "Reply"); + + if (merged) { + crm_xml_add(reply, F_STONITH_MERGED, "true"); + } ++ crm_log_xml_trace(reply, "Reply"); + + 
if (bcast) { +- crm_xml_add(reply, F_STONITH_OPERATION, T_STONITH_NOTIFY); + send_cluster_message(NULL, crm_msg_stonith_ng, reply, FALSE); + + } else if (cmd->origin) { +-- +2.27.0 + + +From 8b8f94fd9ca5e61922cb81e32c8a3d0f1d75fb0b Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Thu, 11 Nov 2021 14:40:49 -0600 +Subject: [PATCH 05/19] Refactor: fencer: avoid code duplication when sending + async reply + +... and clean up reply function +--- + daemons/fenced/fenced_commands.c | 33 ++++++++++++++++++-------------- + 1 file changed, 19 insertions(+), 14 deletions(-) + +diff --git a/daemons/fenced/fenced_commands.c b/daemons/fenced/fenced_commands.c +index e4185f6e1..4ea0a337a 100644 +--- a/daemons/fenced/fenced_commands.c ++++ b/daemons/fenced/fenced_commands.c +@@ -2411,15 +2411,8 @@ send_async_reply(async_command_t *cmd, const pcmk__action_result_t *result, + + if (bcast) { + send_cluster_message(NULL, crm_msg_stonith_ng, reply, FALSE); +- +- } else if (cmd->origin) { +- crm_trace("Directed reply to %s", cmd->origin); +- send_cluster_message(crm_get_peer(0, cmd->origin), crm_msg_stonith_ng, reply, FALSE); +- + } else { +- crm_trace("Directed local %ssync reply to %s", +- (cmd->options & st_opt_sync_call) ? "" : "a-", cmd->client_name); +- do_local_reply(reply, cmd->client, cmd->options & st_opt_sync_call, FALSE); ++ stonith_send_reply(reply, cmd->options, cmd->origin, cmd->client); + } + + if (stand_alone) { +@@ -2814,16 +2807,28 @@ check_alternate_host(const char *target) + return alternate_host; + } + ++/*! 
++ * \internal ++ * \brief Send a reply to a CPG peer or IPC client ++ * ++ * \param[in] reply XML reply to send ++ * \param[in] call_options Send synchronously if st_opt_sync_call is set here ++ * \param[in] remote_peer If not NULL, name of peer node to send CPG reply ++ * \param[in] client_id If not NULL, name of client to send IPC reply ++ */ + static void +-stonith_send_reply(xmlNode * reply, int call_options, const char *remote_peer, ++stonith_send_reply(xmlNode *reply, int call_options, const char *remote_peer, + const char *client_id) + { +- if (remote_peer) { +- send_cluster_message(crm_get_peer(0, remote_peer), crm_msg_stonith_ng, reply, FALSE); +- } else { ++ CRM_CHECK((reply != NULL) && ((remote_peer != NULL) || (client_id != NULL)), ++ return); ++ ++ if (remote_peer == NULL) { + do_local_reply(reply, client_id, +- pcmk_is_set(call_options, st_opt_sync_call), +- (remote_peer != NULL)); ++ pcmk_is_set(call_options, st_opt_sync_call), FALSE); ++ } else { ++ send_cluster_message(crm_get_peer(0, remote_peer), crm_msg_stonith_ng, ++ reply, FALSE); + } + } + +-- +2.27.0 + + +From 2cdbda58f0e9f38a0e302506107fd933cb415144 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Tue, 23 Nov 2021 17:24:09 -0600 +Subject: [PATCH 06/19] Refactor: fencer: ensure all requests get clean-up + +handle_request() has if-else blocks for each type of request. Previously, if a +request didn't need a reply, the function would do any clean-up needed and +return immediately. Now, we track whether a reply is needed, and all request +types flow to the end of the function for consistent clean-up. + +This doesn't change any behavior at this point, but allows us to do more at the +end of request handling. 
+--- + daemons/fenced/fenced_commands.c | 46 ++++++++++++++++++-------------- + 1 file changed, 26 insertions(+), 20 deletions(-) + +diff --git a/daemons/fenced/fenced_commands.c b/daemons/fenced/fenced_commands.c +index 4ea0a337a..19477b49b 100644 +--- a/daemons/fenced/fenced_commands.c ++++ b/daemons/fenced/fenced_commands.c +@@ -2892,6 +2892,7 @@ handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, + + xmlNode *data = NULL; + xmlNode *reply = NULL; ++ bool need_reply = true; + + char *output = NULL; + const char *op = crm_element_value(request, F_STONITH_OPERATION); +@@ -2921,10 +2922,12 @@ handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, + pcmk__ipc_send_xml(client, id, reply, flags); + client->request_id = 0; + free_xml(reply); +- return 0; ++ rc = pcmk_ok; ++ need_reply = false; + + } else if (pcmk__str_eq(op, STONITH_OP_EXEC, pcmk__str_none)) { + rc = stonith_device_action(request, &output); ++ need_reply = (rc != -EINPROGRESS); + + } else if (pcmk__str_eq(op, STONITH_OP_TIMEOUT_UPDATE, pcmk__str_none)) { + const char *call_id = crm_element_value(request, F_STONITH_CALLID); +@@ -2933,7 +2936,8 @@ handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, + + crm_element_value_int(request, F_STONITH_TIMEOUT, &op_timeout); + do_stonith_async_timeout_update(client_id, call_id, op_timeout); +- return 0; ++ rc = pcmk_ok; ++ need_reply = false; + + } else if (pcmk__str_eq(op, STONITH_OP_QUERY, pcmk__str_none)) { + if (remote_peer) { +@@ -2944,7 +2948,8 @@ handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, + remove_relay_op(request); + + stonith_query(request, remote_peer, client_id, call_options); +- return 0; ++ rc = pcmk_ok; ++ need_reply = false; + + } else if (pcmk__str_eq(op, T_STONITH_NOTIFY, pcmk__str_none)) { + const char *flag_name = NULL; +@@ -2965,7 +2970,8 @@ handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, + } + + pcmk__ipc_send_ack(client, id, flags, "ack", CRM_EX_OK); +- 
return 0; ++ rc = pcmk_ok; ++ need_reply = false; + + } else if (pcmk__str_eq(op, STONITH_OP_RELAY, pcmk__str_none)) { + xmlNode *dev = get_xpath_object("//@" F_STONITH_TARGET, request, LOG_TRACE); +@@ -2977,8 +2983,11 @@ handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, + crm_element_value(dev, F_STONITH_ACTION), + crm_element_value(dev, F_STONITH_TARGET)); + +- if (initiate_remote_stonith_op(NULL, request, FALSE) != NULL) { ++ if (initiate_remote_stonith_op(NULL, request, FALSE) == NULL) { ++ rc = -EPROTO; ++ } else { + rc = -EINPROGRESS; ++ need_reply = false; + } + + } else if (pcmk__str_eq(op, STONITH_OP_FENCE, pcmk__str_none)) { +@@ -3012,7 +3021,7 @@ handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, + crm_element_value_int(dev, F_STONITH_TOLERANCE, &tolerance); + + if (stonith_check_fence_tolerance(tolerance, target, action)) { +- rc = 0; ++ rc = pcmk_ok; + goto done; + } + +@@ -3047,10 +3056,13 @@ handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, + FALSE); + rc = -EINPROGRESS; + +- } else if (initiate_remote_stonith_op(client, request, FALSE) != NULL) { ++ } else if (initiate_remote_stonith_op(client, request, FALSE) == NULL) { ++ rc = -EPROTO; ++ } else { + rc = -EINPROGRESS; + } + } ++ need_reply = (rc != -EINPROGRESS); + + } else if (pcmk__str_eq(op, STONITH_OP_FENCE_HISTORY, pcmk__str_none)) { + rc = stonith_fence_history(request, &data, remote_peer, call_options); +@@ -3058,8 +3070,8 @@ handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, + /* we don't expect answers to the broadcast + * we might have sent out + */ +- free_xml(data); +- return pcmk_ok; ++ rc = pcmk_ok; ++ need_reply = false; + } + + } else if (pcmk__str_eq(op, STONITH_OP_DEVICE_ADD, pcmk__str_none)) { +@@ -3111,8 +3123,8 @@ handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, + crm_element_value_int(request, XML_ATTR_ID, &node_id); + name = crm_element_value(request, XML_ATTR_UNAME); + 
reap_crm_member(node_id, name); +- +- return pcmk_ok; ++ rc = pcmk_ok; ++ need_reply = false; + + } else { + crm_err("Unknown IPC request %s from %s %s", op, +@@ -3120,20 +3132,14 @@ handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, + ((client == NULL)? remote_peer : pcmk__client_name(client))); + } + +- done: +- ++done: + if (rc == -EACCES) { + crm_warn("Rejecting IPC request '%s' from unprivileged client %s", + crm_str(op), pcmk__client_name(client)); + } + +- /* Always reply unless the request is in process still. +- * If in progress, a reply will happen async after the request +- * processing is finished */ +- if (rc != -EINPROGRESS) { +- crm_trace("Reply handling: %p %u %u %d %d %s", client, client?client->request_id:0, +- id, pcmk_is_set(call_options, st_opt_sync_call), call_options, +- crm_element_value(request, F_STONITH_CALLOPTS)); ++ // Reply if result is known ++ if (need_reply) { + + if (pcmk_is_set(call_options, st_opt_sync_call)) { + CRM_ASSERT(client == NULL || client->request_id == id); +-- +2.27.0 + + +From 067d655ebd3fbb0ed27f4e7426db4c3b661ba777 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Tue, 23 Nov 2021 17:26:32 -0600 +Subject: [PATCH 07/19] Log: fencer: improve debug logs when processing CPG/IPC + messages + +By moving the result log messages from stonith_command() to handle_reply() and +handle_request(), we can simplify stonith_command() and give slightly better +messages. 
+--- + daemons/fenced/fenced_commands.c | 80 +++++++++++++++----------------- + 1 file changed, 38 insertions(+), 42 deletions(-) + +diff --git a/daemons/fenced/fenced_commands.c b/daemons/fenced/fenced_commands.c +index 19477b49b..98af0e04f 100644 +--- a/daemons/fenced/fenced_commands.c ++++ b/daemons/fenced/fenced_commands.c +@@ -2883,7 +2883,7 @@ remove_relay_op(xmlNode * request) + } + } + +-static int ++static void + handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, + xmlNode *request, const char *remote_peer) + { +@@ -3152,73 +3152,69 @@ done: + free_xml(data); + free_xml(reply); + +- return rc; ++ crm_debug("Processed %s request from %s %s: %s (rc=%d)", ++ op, ((client == NULL)? "peer" : "client"), ++ ((client == NULL)? remote_peer : pcmk__client_name(client)), ++ ((rc > 0)? "" : pcmk_strerror(rc)), rc); + } + + static void + handle_reply(pcmk__client_t *client, xmlNode *request, const char *remote_peer) + { +- const char *op = crm_element_value(request, F_STONITH_OPERATION); ++ // Copy, because request might be freed before we want to log this ++ char *op = crm_element_value_copy(request, F_STONITH_OPERATION); + + if (pcmk__str_eq(op, STONITH_OP_QUERY, pcmk__str_none)) { + process_remote_stonith_query(request); +- } else if (pcmk__str_eq(op, T_STONITH_NOTIFY, pcmk__str_none)) { +- process_remote_stonith_exec(request); +- } else if (pcmk__str_eq(op, STONITH_OP_FENCE, pcmk__str_none)) { +- /* Reply to a complex fencing op */ ++ } else if (pcmk__str_any_of(op, T_STONITH_NOTIFY, STONITH_OP_FENCE, NULL)) { + process_remote_stonith_exec(request); + } else { +- crm_err("Unknown %s reply from %s %s", op, +- ((client == NULL)? "peer" : "client"), ++ crm_err("Ignoring unknown %s reply from %s %s", ++ crm_str(op), ((client == NULL)? "peer" : "client"), + ((client == NULL)? 
remote_peer : pcmk__client_name(client))); + crm_log_xml_warn(request, "UnknownOp"); ++ free(op); ++ return; + } ++ crm_debug("Processed %s reply from %s %s", ++ op, ((client == NULL)? "peer" : "client"), ++ ((client == NULL)? remote_peer : pcmk__client_name(client))); ++ free(op); + } + ++/*! ++ * \internal ++ * \brief Handle a message from an IPC client or CPG peer ++ * ++ * \param[in] client If not NULL, IPC client that sent message ++ * \param[in] id If from IPC client, IPC message ID ++ * \param[in] flags Message flags ++ * \param[in] message Message XML ++ * \param[in] remote_peer If not NULL, CPG peer that sent message ++ */ + void + stonith_command(pcmk__client_t *client, uint32_t id, uint32_t flags, +- xmlNode *request, const char *remote_peer) ++ xmlNode *message, const char *remote_peer) + { +- int call_options = 0; +- int rc = 0; +- gboolean is_reply = FALSE; +- +- /* Copy op for reporting. The original might get freed by handle_reply() +- * before we use it in crm_debug(): +- * handle_reply() +- * |- process_remote_stonith_exec() +- * |-- remote_op_done() +- * |--- handle_local_reply_and_notify() +- * |---- crm_xml_add(...F_STONITH_OPERATION...) +- * |--- free_xml(op->request) +- */ +- char *op = crm_element_value_copy(request, F_STONITH_OPERATION); +- +- if (get_xpath_object("//" T_STONITH_REPLY, request, LOG_NEVER)) { +- is_reply = TRUE; +- } ++ int call_options = st_opt_none; ++ bool is_reply = get_xpath_object("//" T_STONITH_REPLY, message, ++ LOG_NEVER) != NULL; + +- crm_element_value_int(request, F_STONITH_CALLOPTS, &call_options); +- crm_debug("Processing %s%s %u from %s %s with call options 0x%08x", +- op, (is_reply? " reply" : ""), id, ++ crm_element_value_int(message, F_STONITH_CALLOPTS, &call_options); ++ crm_debug("Processing %ssynchronous %s %s %u from %s %s", ++ pcmk_is_set(call_options, st_opt_sync_call)? "" : "a", ++ crm_element_value(message, F_STONITH_OPERATION), ++ (is_reply? "reply" : "request"), id, + ((client == NULL)? 
"peer" : "client"), +- ((client == NULL)? remote_peer : pcmk__client_name(client)), +- call_options); ++ ((client == NULL)? remote_peer : pcmk__client_name(client))); + + if (pcmk_is_set(call_options, st_opt_sync_call)) { + CRM_ASSERT(client == NULL || client->request_id == id); + } + + if (is_reply) { +- handle_reply(client, request, remote_peer); ++ handle_reply(client, message, remote_peer); + } else { +- rc = handle_request(client, id, flags, request, remote_peer); ++ handle_request(client, id, flags, message, remote_peer); + } +- +- crm_debug("Processed %s%s from %s %s: %s (rc=%d)", +- op, (is_reply? " reply" : ""), +- ((client == NULL)? "peer" : "client"), +- ((client == NULL)? remote_peer : pcmk__client_name(client)), +- ((rc > 0)? "" : pcmk_strerror(rc)), rc); +- free(op); + } +-- +2.27.0 + + +From 44cb340c11b4652f452a47eb2b0050b4a459382b Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Mon, 15 Nov 2021 16:29:09 -0600 +Subject: [PATCH 08/19] Refactor: fencer: drop unused argument from + notification functions + +--- + daemons/fenced/fenced_commands.c | 12 ++++++------ + daemons/fenced/fenced_history.c | 6 +++--- + daemons/fenced/fenced_remote.c | 6 +++--- + daemons/fenced/pacemaker-fenced.c | 18 +++++++++--------- + daemons/fenced/pacemaker-fenced.h | 6 +++--- + 5 files changed, 24 insertions(+), 24 deletions(-) + +diff --git a/daemons/fenced/fenced_commands.c b/daemons/fenced/fenced_commands.c +index 98af0e04f..946ce4042 100644 +--- a/daemons/fenced/fenced_commands.c ++++ b/daemons/fenced/fenced_commands.c +@@ -2428,8 +2428,8 @@ send_async_reply(async_command_t *cmd, const pcmk__action_result_t *result, + crm_xml_add(notify_data, F_STONITH_REMOTE_OP_ID, cmd->remote_op_id); + crm_xml_add(notify_data, F_STONITH_ORIGIN, cmd->client); + +- do_stonith_notify(0, T_STONITH_NOTIFY_FENCE, rc, notify_data); +- do_stonith_notify(0, T_STONITH_NOTIFY_HISTORY, 0, NULL); ++ do_stonith_notify(T_STONITH_NOTIFY_FENCE, rc, notify_data); ++ 
do_stonith_notify(T_STONITH_NOTIFY_HISTORY, pcmk_ok, NULL); + } + + free_xml(reply); +@@ -3082,7 +3082,7 @@ handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, + } else { + rc = -EACCES; + } +- do_stonith_notify_device(call_options, op, rc, device_id); ++ do_stonith_notify_device(op, rc, device_id); + + } else if (pcmk__str_eq(op, STONITH_OP_DEVICE_DEL, pcmk__str_none)) { + xmlNode *dev = get_xpath_object("//" F_STONITH_DEVICE, request, LOG_ERR); +@@ -3093,7 +3093,7 @@ handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, + } else { + rc = -EACCES; + } +- do_stonith_notify_device(call_options, op, rc, device_id); ++ do_stonith_notify_device(op, rc, device_id); + + } else if (pcmk__str_eq(op, STONITH_OP_LEVEL_ADD, pcmk__str_none)) { + char *device_id = NULL; +@@ -3103,7 +3103,7 @@ handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, + } else { + rc = -EACCES; + } +- do_stonith_notify_level(call_options, op, rc, device_id); ++ do_stonith_notify_level(op, rc, device_id); + free(device_id); + + } else if (pcmk__str_eq(op, STONITH_OP_LEVEL_DEL, pcmk__str_none)) { +@@ -3114,7 +3114,7 @@ handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, + } else { + rc = -EACCES; + } +- do_stonith_notify_level(call_options, op, rc, device_id); ++ do_stonith_notify_level(op, rc, device_id); + + } else if(pcmk__str_eq(op, CRM_OP_RM_NODE_CACHE, pcmk__str_casei)) { + int node_id = 0; +diff --git a/daemons/fenced/fenced_history.c b/daemons/fenced/fenced_history.c +index 1ba034ba9..7127593b6 100644 +--- a/daemons/fenced/fenced_history.c ++++ b/daemons/fenced/fenced_history.c +@@ -100,7 +100,7 @@ stonith_fence_history_cleanup(const char *target, + g_hash_table_foreach_remove(stonith_remote_op_list, + stonith_remove_history_entry, + (gpointer) target); +- do_stonith_notify(0, T_STONITH_NOTIFY_HISTORY, 0, NULL); ++ do_stonith_notify(T_STONITH_NOTIFY_HISTORY, pcmk_ok, NULL); + } + } + +@@ -396,7 +396,7 @@ 
stonith_local_history_diff_and_merge(GHashTable *remote_history, + + if (updated) { + stonith_fence_history_trim(); +- do_stonith_notify(0, T_STONITH_NOTIFY_HISTORY, 0, NULL); ++ do_stonith_notify(T_STONITH_NOTIFY_HISTORY, pcmk_ok, NULL); + } + + if (cnt == 0) { +@@ -470,7 +470,7 @@ stonith_fence_history(xmlNode *msg, xmlNode **output, + is done so send a notification for anything + that smells like history-sync + */ +- do_stonith_notify(0, T_STONITH_NOTIFY_HISTORY_SYNCED, 0, NULL); ++ do_stonith_notify(T_STONITH_NOTIFY_HISTORY_SYNCED, pcmk_ok, NULL); + if (crm_element_value(msg, F_STONITH_CALLID)) { + /* this is coming from the stonith-API + * +diff --git a/daemons/fenced/fenced_remote.c b/daemons/fenced/fenced_remote.c +index 9e2f62804..c907cd120 100644 +--- a/daemons/fenced/fenced_remote.c ++++ b/daemons/fenced/fenced_remote.c +@@ -423,8 +423,8 @@ handle_local_reply_and_notify(remote_fencing_op_t * op, xmlNode * data, int rc) + do_local_reply(reply, op->client_id, op->call_options & st_opt_sync_call, FALSE); + + /* bcast to all local clients that the fencing operation happend */ +- do_stonith_notify(0, T_STONITH_NOTIFY_FENCE, rc, notify_data); +- do_stonith_notify(0, T_STONITH_NOTIFY_HISTORY, 0, NULL); ++ do_stonith_notify(T_STONITH_NOTIFY_FENCE, rc, notify_data); ++ do_stonith_notify(T_STONITH_NOTIFY_HISTORY, pcmk_ok, NULL); + + /* mark this op as having notify's already sent */ + op->notify_sent = TRUE; +@@ -1119,7 +1119,7 @@ create_remote_stonith_op(const char *client, xmlNode * request, gboolean peer) + + if (op->state != st_duplicate) { + /* kick history readers */ +- do_stonith_notify(0, T_STONITH_NOTIFY_HISTORY, 0, NULL); ++ do_stonith_notify(T_STONITH_NOTIFY_HISTORY, pcmk_ok, NULL); + } + + /* safe to trim as long as that doesn't touch pending ops */ +diff --git a/daemons/fenced/pacemaker-fenced.c b/daemons/fenced/pacemaker-fenced.c +index a64004ce1..a290e1670 100644 +--- a/daemons/fenced/pacemaker-fenced.c ++++ b/daemons/fenced/pacemaker-fenced.c +@@ 
-357,7 +357,7 @@ do_stonith_async_timeout_update(const char *client_id, const char *call_id, int + } + + void +-do_stonith_notify(int options, const char *type, int result, xmlNode * data) ++do_stonith_notify(const char *type, int result, xmlNode *data) + { + /* TODO: Standardize the contents of data */ + xmlNode *update_msg = create_xml_node(NULL, "notify"); +@@ -380,7 +380,7 @@ do_stonith_notify(int options, const char *type, int result, xmlNode * data) + } + + static void +-do_stonith_notify_config(int options, const char *op, int rc, ++do_stonith_notify_config(const char *op, int rc, + const char *desc, int active) + { + xmlNode *notify_data = create_xml_node(NULL, op); +@@ -390,20 +390,20 @@ do_stonith_notify_config(int options, const char *op, int rc, + crm_xml_add(notify_data, F_STONITH_DEVICE, desc); + crm_xml_add_int(notify_data, F_STONITH_ACTIVE, active); + +- do_stonith_notify(options, op, rc, notify_data); ++ do_stonith_notify(op, rc, notify_data); + free_xml(notify_data); + } + + void +-do_stonith_notify_device(int options, const char *op, int rc, const char *desc) ++do_stonith_notify_device(const char *op, int rc, const char *desc) + { +- do_stonith_notify_config(options, op, rc, desc, g_hash_table_size(device_list)); ++ do_stonith_notify_config(op, rc, desc, g_hash_table_size(device_list)); + } + + void +-do_stonith_notify_level(int options, const char *op, int rc, const char *desc) ++do_stonith_notify_level(const char *op, int rc, const char *desc) + { +- do_stonith_notify_config(options, op, rc, desc, g_hash_table_size(topology)); ++ do_stonith_notify_config(op, rc, desc, g_hash_table_size(topology)); + } + + static void +@@ -418,7 +418,7 @@ topology_remove_helper(const char *node, int level) + crm_xml_add(data, XML_ATTR_STONITH_TARGET, node); + + rc = stonith_level_remove(data, &desc); +- do_stonith_notify_level(0, STONITH_OP_LEVEL_DEL, rc, desc); ++ do_stonith_notify_level(STONITH_OP_LEVEL_DEL, rc, desc); + + free_xml(data); + free(desc); +@@ 
-468,7 +468,7 @@ handle_topology_change(xmlNode *match, bool remove) + } + + rc = stonith_level_register(match, &desc); +- do_stonith_notify_level(0, STONITH_OP_LEVEL_ADD, rc, desc); ++ do_stonith_notify_level(STONITH_OP_LEVEL_ADD, rc, desc); + + free(desc); + } +diff --git a/daemons/fenced/pacemaker-fenced.h b/daemons/fenced/pacemaker-fenced.h +index a64b57693..3e41d867e 100644 +--- a/daemons/fenced/pacemaker-fenced.h ++++ b/daemons/fenced/pacemaker-fenced.h +@@ -233,9 +233,9 @@ xmlNode *stonith_construct_reply(xmlNode * request, const char *output, xmlNode + void + do_stonith_async_timeout_update(const char *client, const char *call_id, int timeout); + +-void do_stonith_notify(int options, const char *type, int result, xmlNode * data); +-void do_stonith_notify_device(int options, const char *op, int rc, const char *desc); +-void do_stonith_notify_level(int options, const char *op, int rc, const char *desc); ++void do_stonith_notify(const char *type, int result, xmlNode *data); ++void do_stonith_notify_device(const char *op, int rc, const char *desc); ++void do_stonith_notify_level(const char *op, int rc, const char *desc); + + remote_fencing_op_t *initiate_remote_stonith_op(pcmk__client_t *client, + xmlNode *request, +-- +2.27.0 + + +From a49df4901b663b3366634c1d58f04625ecba4005 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Tue, 16 Nov 2021 11:57:14 -0600 +Subject: [PATCH 09/19] Refactor: fencer: functionize checking for privileged + client + +... for readability and to make planned changes easier +--- + daemons/fenced/fenced_commands.c | 49 +++++++++++++++++++------------- + 1 file changed, 30 insertions(+), 19 deletions(-) + +diff --git a/daemons/fenced/fenced_commands.c b/daemons/fenced/fenced_commands.c +index 946ce4042..34c956f5c 100644 +--- a/daemons/fenced/fenced_commands.c ++++ b/daemons/fenced/fenced_commands.c +@@ -2883,6 +2883,32 @@ remove_relay_op(xmlNode * request) + } + } + ++/*! 
++ * \internal ++ * \brief Check whether an API request was sent by a privileged user ++ * ++ * API commands related to fencing configuration may be done only by privileged ++ * IPC users (i.e. root or hacluster), because all other users should go through ++ * the CIB to have ACLs applied. If no client was given, this is a peer request, ++ * which is always allowed. ++ * ++ * \param[in] c IPC client that sent request (or NULL if sent by CPG peer) ++ * \param[in] op Requested API operation (for logging only) ++ * ++ * \return true if sender is peer or privileged client, otherwise false ++ */ ++static inline bool ++is_privileged(pcmk__client_t *c, const char *op) ++{ ++ if ((c == NULL) || pcmk_is_set(c->flags, pcmk__client_privileged)) { ++ return true; ++ } else { ++ crm_warn("Rejecting IPC request '%s' from unprivileged client %s", ++ crm_str(op), pcmk__client_name(c)); ++ return false; ++ } ++} ++ + static void + handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, + xmlNode *request, const char *remote_peer) +@@ -2898,15 +2924,6 @@ handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, + const char *op = crm_element_value(request, F_STONITH_OPERATION); + const char *client_id = crm_element_value(request, F_STONITH_CLIENTID); + +- /* IPC commands related to fencing configuration may be done only by +- * privileged users (i.e. root or hacluster), because all other users should +- * go through the CIB to have ACLs applied. +- * +- * If no client was given, this is a peer request, which is always allowed. 
+- */ +- bool allowed = (client == NULL) +- || pcmk_is_set(client->flags, pcmk__client_privileged); +- + crm_element_value_int(request, F_STONITH_CALLOPTS, &call_options); + + if (pcmk_is_set(call_options, st_opt_sync_call)) { +@@ -3077,7 +3094,7 @@ handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, + } else if (pcmk__str_eq(op, STONITH_OP_DEVICE_ADD, pcmk__str_none)) { + const char *device_id = NULL; + +- if (allowed) { ++ if (is_privileged(client, op)) { + rc = stonith_device_register(request, &device_id, FALSE); + } else { + rc = -EACCES; +@@ -3088,7 +3105,7 @@ handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, + xmlNode *dev = get_xpath_object("//" F_STONITH_DEVICE, request, LOG_ERR); + const char *device_id = crm_element_value(dev, XML_ATTR_ID); + +- if (allowed) { ++ if (is_privileged(client, op)) { + rc = stonith_device_remove(device_id, FALSE); + } else { + rc = -EACCES; +@@ -3098,7 +3115,7 @@ handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, + } else if (pcmk__str_eq(op, STONITH_OP_LEVEL_ADD, pcmk__str_none)) { + char *device_id = NULL; + +- if (allowed) { ++ if (is_privileged(client, op)) { + rc = stonith_level_register(request, &device_id); + } else { + rc = -EACCES; +@@ -3109,7 +3126,7 @@ handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, + } else if (pcmk__str_eq(op, STONITH_OP_LEVEL_DEL, pcmk__str_none)) { + char *device_id = NULL; + +- if (allowed) { ++ if (is_privileged(client, op)) { + rc = stonith_level_remove(request, &device_id); + } else { + rc = -EACCES; +@@ -3133,14 +3150,8 @@ handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, + } + + done: +- if (rc == -EACCES) { +- crm_warn("Rejecting IPC request '%s' from unprivileged client %s", +- crm_str(op), pcmk__client_name(client)); +- } +- + // Reply if result is known + if (need_reply) { +- + if (pcmk_is_set(call_options, st_opt_sync_call)) { + CRM_ASSERT(client == NULL || client->request_id == id); + } +-- 
+2.27.0 + + +From 10ca8a5ef5266159bc3f993802aeae6537ceeb11 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Tue, 16 Nov 2021 16:59:03 -0600 +Subject: [PATCH 10/19] Low: fencer: return -ETIME for peer fencing timeouts + +94c55684 set the result as pcmk_ok, but it appears that the intent was just to +keep the delegate from being set, and -ETIME should still do that, while being +more appropriate. +--- + daemons/fenced/fenced_remote.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/daemons/fenced/fenced_remote.c b/daemons/fenced/fenced_remote.c +index c907cd120..dc7b802da 100644 +--- a/daemons/fenced/fenced_remote.c ++++ b/daemons/fenced/fenced_remote.c +@@ -608,7 +608,7 @@ remote_op_timeout_one(gpointer userdata) + + crm_notice("Peer's '%s' action targeting %s for client %s timed out " CRM_XS + " id=%.8s", op->action, op->target, op->client_name, op->id); +- call_remote_stonith(op, NULL, pcmk_ok); ++ call_remote_stonith(op, NULL, -ETIME); + return FALSE; + } + +-- +2.27.0 + + +From fb2eefeb695cc92e1a2aed6f1f1d2b900d4fb83e Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Tue, 16 Nov 2021 17:54:56 -0600 +Subject: [PATCH 11/19] Refactor: fencer: functionize common part of timeout + handling + +Previously, remote_op_timeout() was called from multiple places, but only one +of those places needed the full processing. The common part is now in a new +function finalize_timed_out_op() called from all the places, and +remote_op_timeout() now has just the additional processing needed by the one +place plus a call to the new function. + +This will allow a future change to set a different exit reason depending on +which step timed out. 
+--- + daemons/fenced/fenced_remote.c | 49 +++++++++++++++++++++++----------- + 1 file changed, 34 insertions(+), 15 deletions(-) + +diff --git a/daemons/fenced/fenced_remote.c b/daemons/fenced/fenced_remote.c +index dc7b802da..22c4b0772 100644 +--- a/daemons/fenced/fenced_remote.c ++++ b/daemons/fenced/fenced_remote.c +@@ -612,20 +612,18 @@ remote_op_timeout_one(gpointer userdata) + return FALSE; + } + +-static gboolean +-remote_op_timeout(gpointer userdata) ++/*! ++ * \internal ++ * \brief Finalize a remote fencer operation that timed out ++ * ++ * \param[in] op Fencer operation that timed out ++ */ ++static void ++finalize_timed_out_op(remote_fencing_op_t *op) + { +- remote_fencing_op_t *op = userdata; + + op->op_timer_total = 0; + +- if (op->state == st_done) { +- crm_debug("Action '%s' targeting %s for client %s already completed " +- CRM_XS " id=%.8s", +- op->action, op->target, op->client_name, op->id); +- return FALSE; +- } +- + crm_debug("Action '%s' targeting %s for client %s timed out " + CRM_XS " id=%.8s", + op->action, op->target, op->client_name, op->id); +@@ -637,14 +635,35 @@ remote_op_timeout(gpointer userdata) + */ + op->state = st_done; + remote_op_done(op, NULL, pcmk_ok, FALSE); +- return FALSE; ++ return; + } + + op->state = st_failed; + + remote_op_done(op, NULL, -ETIME, FALSE); ++} + +- return FALSE; ++/*! 
++ * \internal ++ * \brief Finalize a remote fencer operation that timed out ++ * ++ * \param[in] userdata Fencer operation that timed out ++ * ++ * \return G_SOURCE_REMOVE (which tells glib not to restart timer) ++ */ ++static gboolean ++remote_op_timeout(gpointer userdata) ++{ ++ remote_fencing_op_t *op = userdata; ++ ++ if (op->state == st_done) { ++ crm_debug("Action '%s' targeting %s for client %s already completed " ++ CRM_XS " id=%.8s", ++ op->action, op->target, op->client_name, op->id); ++ } else { ++ finalize_timed_out_op(userdata); ++ } ++ return G_SOURCE_REMOVE; + } + + static gboolean +@@ -670,7 +689,7 @@ remote_op_query_timeout(gpointer data) + g_source_remove(op->op_timer_total); + op->op_timer_total = 0; + } +- remote_op_timeout(op); ++ finalize_timed_out_op(op); + } + + return FALSE; +@@ -1675,8 +1694,8 @@ call_remote_stonith(remote_fencing_op_t *op, peer_device_info_t *peer, int rc) + crm_info("No remaining peers capable of fencing (%s) %s for client %s " + CRM_XS " state=%s", op->action, op->target, op->client_name, + stonith_op_state_str(op->state)); +- CRM_LOG_ASSERT(op->state < st_done); +- remote_op_timeout(op); ++ CRM_CHECK(op->state < st_done, return); ++ finalize_timed_out_op(op); + + } else if(op->replies >= op->replies_expected || op->replies >= fencing_active_peers()) { + // int rc = -EHOSTUNREACH; +-- +2.27.0 + + +From c047005a112ac7da5ba62084e39c79db739f0923 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Thu, 18 Nov 2021 10:05:18 -0600 +Subject: [PATCH 12/19] Low: fencer: handle malformed manual confirmation + requests better + +Rename stonith_manual_ack() to fenced_handle_manual_confirmation(), and move +more of the manual confirmation handling in handle_request() into it, for +better code isolation. This will also make planned changes easier. + +The one behavioral difference is that a failure of initiate_remote_stonith_op() +will now be ignored rather than segmentation fault trying to dereference NULL. 
+--- + daemons/fenced/fenced_commands.c | 20 ++++++++++++-------- + daemons/fenced/fenced_remote.c | 29 ++++++++++++++++++++++++----- + daemons/fenced/pacemaker-fenced.h | 2 +- + 3 files changed, 37 insertions(+), 14 deletions(-) + +diff --git a/daemons/fenced/fenced_commands.c b/daemons/fenced/fenced_commands.c +index 34c956f5c..6f325b9e8 100644 +--- a/daemons/fenced/fenced_commands.c ++++ b/daemons/fenced/fenced_commands.c +@@ -3012,14 +3012,18 @@ handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, + if (remote_peer || stand_alone) { + rc = stonith_fence(request); + +- } else if (call_options & st_opt_manual_ack) { +- remote_fencing_op_t *rop = NULL; +- xmlNode *dev = get_xpath_object("//@" F_STONITH_TARGET, request, LOG_TRACE); +- const char *target = crm_element_value(dev, F_STONITH_TARGET); +- +- crm_notice("Received manual confirmation that %s is fenced", target); +- rop = initiate_remote_stonith_op(client, request, TRUE); +- rc = stonith_manual_ack(request, rop); ++ } else if (pcmk_is_set(call_options, st_opt_manual_ack)) { ++ switch (fenced_handle_manual_confirmation(client, request)) { ++ case pcmk_rc_ok: ++ rc = pcmk_ok; ++ break; ++ case EINPROGRESS: ++ rc = -EINPROGRESS; ++ break; ++ default: ++ rc = -EPROTO; ++ break; ++ } + + } else { + const char *alternate_host = NULL; +diff --git a/daemons/fenced/fenced_remote.c b/daemons/fenced/fenced_remote.c +index 22c4b0772..60ee5e32e 100644 +--- a/daemons/fenced/fenced_remote.c ++++ b/daemons/fenced/fenced_remote.c +@@ -1003,22 +1003,41 @@ static uint32_t fencing_active_peers(void) + return count; + } + ++/*! 
++ * \internal ++ * \brief Process a manual confirmation of a pending fence action ++ * ++ * \param[in] client IPC client that sent confirmation ++ * \param[in] msg Request XML with manual confirmation ++ * ++ * \return Standard Pacemaker return code ++ */ + int +-stonith_manual_ack(xmlNode * msg, remote_fencing_op_t * op) ++fenced_handle_manual_confirmation(pcmk__client_t *client, xmlNode *msg) + { ++ remote_fencing_op_t *op = NULL; + xmlNode *dev = get_xpath_object("//@" F_STONITH_TARGET, msg, LOG_ERR); + ++ CRM_CHECK(dev != NULL, return EPROTO); ++ ++ crm_notice("Received manual confirmation that %s has been fenced", ++ crm_str(crm_element_value(dev, F_STONITH_TARGET))); ++ op = initiate_remote_stonith_op(client, msg, TRUE); ++ if (op == NULL) { ++ return EPROTO; ++ } + op->state = st_done; + set_fencing_completed(op); + op->delegate = strdup("a human"); + +- crm_notice("Injecting manual confirmation that %s is safely off/down", +- crm_element_value(dev, F_STONITH_TARGET)); ++ // For the fencer's purposes, the fencing operation is done + + remote_op_done(op, msg, pcmk_ok, FALSE); + +- // Replies are sent via done_cb -> send_async_reply() -> do_local_reply() +- return -EINPROGRESS; ++ /* For the requester's purposes, the operation is still pending. The ++ * actual result will be sent asynchronously via the operation's done_cb(). ++ */ ++ return EINPROGRESS; + } + + /*! 
+diff --git a/daemons/fenced/pacemaker-fenced.h b/daemons/fenced/pacemaker-fenced.h +index 3e41d867e..cf88644f1 100644 +--- a/daemons/fenced/pacemaker-fenced.h ++++ b/daemons/fenced/pacemaker-fenced.h +@@ -256,7 +256,7 @@ bool fencing_peer_active(crm_node_t *peer); + + void set_fencing_completed(remote_fencing_op_t * op); + +-int stonith_manual_ack(xmlNode * msg, remote_fencing_op_t * op); ++int fenced_handle_manual_confirmation(pcmk__client_t *client, xmlNode *msg); + + gboolean node_has_attr(const char *node, const char *name, const char *value); + +-- +2.27.0 + + +From ec60f014b5a8f774aa57a26e40a2b1b94a7e3d3a Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Thu, 18 Nov 2021 10:35:31 -0600 +Subject: [PATCH 13/19] Low: fencer: handle malformed topology level removal + requests better + +Log the malformed request, and return -EPROTO instead of -EINVAL. If a request +is missing a level number, treat it as malformed instead of as a request to +remove all. +--- + daemons/fenced/fenced_commands.c | 18 +++++++++--------- + 1 file changed, 9 insertions(+), 9 deletions(-) + +diff --git a/daemons/fenced/fenced_commands.c b/daemons/fenced/fenced_commands.c +index 6f325b9e8..358844203 100644 +--- a/daemons/fenced/fenced_commands.c ++++ b/daemons/fenced/fenced_commands.c +@@ -1678,27 +1678,27 @@ stonith_level_register(xmlNode *msg, char **desc) + int + stonith_level_remove(xmlNode *msg, char **desc) + { +- int id = 0; ++ int id = -1; + stonith_topology_t *tp; + char *target; + + /* Unlike additions, removal requests should always have one level tag */ + xmlNode *level = get_xpath_object("//" XML_TAG_FENCING_LEVEL, msg, LOG_ERR); + +- CRM_CHECK(level != NULL, return -EINVAL); ++ CRM_CHECK(level != NULL, return -EPROTO); + + target = stonith_level_key(level, -1); + crm_element_value_int(level, XML_ATTR_STONITH_INDEX, &id); ++ ++ CRM_CHECK((id >= 0) && (id < ST_LEVEL_MAX), ++ crm_log_xml_warn(msg, "invalid level"); ++ free(target); ++ return -EPROTO); ++ + if (desc) { + *desc 
= crm_strdup_printf("%s[%d]", target, id); + } + +- /* Sanity-check arguments */ +- if (id >= ST_LEVEL_MAX) { +- free(target); +- return -EINVAL; +- } +- + tp = g_hash_table_lookup(topology, target); + if (tp == NULL) { + guint nentries = g_hash_table_size(topology); +@@ -1714,7 +1714,7 @@ stonith_level_remove(xmlNode *msg, char **desc) + "(%d active %s remaining)", target, nentries, + pcmk__plural_alt(nentries, "entry", "entries")); + +- } else if (id > 0 && tp->levels[id] != NULL) { ++ } else if (tp->levels[id] != NULL) { + guint nlevels; + + g_list_free_full(tp->levels[id], free); +-- +2.27.0 + + +From ee0cfb6b284c2d6d21f8e77bf6ff286b1364235d Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Thu, 18 Nov 2021 12:33:05 -0600 +Subject: [PATCH 14/19] Refactor: fencer: avoid obscuring a variable + +handle_request() declared a xmlNode *reply variable, and then one of its "if" +blocks defined another one, obscuring the first. Drop the first declaration, +and instead move it to the one other place that needed it. + +Also remove a redundant assertion. 
+--- + daemons/fenced/fenced_commands.c | 13 +++++-------- + 1 file changed, 5 insertions(+), 8 deletions(-) + +diff --git a/daemons/fenced/fenced_commands.c b/daemons/fenced/fenced_commands.c +index 358844203..af0a92450 100644 +--- a/daemons/fenced/fenced_commands.c ++++ b/daemons/fenced/fenced_commands.c +@@ -2917,7 +2917,6 @@ handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, + int rc = -EOPNOTSUPP; + + xmlNode *data = NULL; +- xmlNode *reply = NULL; + bool need_reply = true; + + char *output = NULL; +@@ -2926,8 +2925,8 @@ handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, + + crm_element_value_int(request, F_STONITH_CALLOPTS, &call_options); + +- if (pcmk_is_set(call_options, st_opt_sync_call)) { +- CRM_ASSERT(client == NULL || client->request_id == id); ++ if (pcmk_is_set(call_options, st_opt_sync_call) && (client != NULL)) { ++ CRM_ASSERT(client->request_id == id); + } + + if (pcmk__str_eq(op, CRM_OP_REGISTER, pcmk__str_none)) { +@@ -3156,16 +3155,14 @@ handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, + done: + // Reply if result is known + if (need_reply) { +- if (pcmk_is_set(call_options, st_opt_sync_call)) { +- CRM_ASSERT(client == NULL || client->request_id == id); +- } +- reply = stonith_construct_reply(request, output, data, rc); ++ xmlNode *reply = stonith_construct_reply(request, output, data, rc); ++ + stonith_send_reply(reply, call_options, remote_peer, client_id); ++ free_xml(reply); + } + + free(output); + free_xml(data); +- free_xml(reply); + + crm_debug("Processed %s request from %s %s: %s (rc=%d)", + op, ((client == NULL)? 
"peer" : "client"), +-- +2.27.0 + + +From a5fef7b95b7541860e29c1ff33be38db327208fb Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Thu, 18 Nov 2021 12:37:10 -0600 +Subject: [PATCH 15/19] Refactor: fencer: add convenience function for setting + protocol error result + +The fencer will soon track and return the full result (rather than just a +legacy return code) for fencing actions, for callbacks and notifications. +To simplify that process as well as move away from the legacy codes in general, +all fencer API operations will be modified to return a full result. + +This convenience function will come in handy for that. +--- + daemons/fenced/pacemaker-fenced.h | 7 +++++++ + 1 file changed, 7 insertions(+) + +diff --git a/daemons/fenced/pacemaker-fenced.h b/daemons/fenced/pacemaker-fenced.h +index cf88644f1..3bc5dc3d1 100644 +--- a/daemons/fenced/pacemaker-fenced.h ++++ b/daemons/fenced/pacemaker-fenced.h +@@ -262,6 +262,13 @@ gboolean node_has_attr(const char *node, const char *name, const char *value); + + gboolean node_does_watchdog_fencing(const char *node); + ++static inline void ++fenced_set_protocol_error(pcmk__action_result_t *result) ++{ ++ pcmk__set_result(result, CRM_EX_PROTOCOL, PCMK_EXEC_INVALID, ++ "Fencer API request missing required information (bug?)"); ++} ++ + extern char *stonith_our_uname; + extern gboolean stand_alone; + extern GHashTable *device_list; +-- +2.27.0 + + +From ed770d36fb34dc7b3344cd326830a6c06cc789ce Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Fri, 19 Nov 2021 09:59:51 -0600 +Subject: [PATCH 16/19] Refactor: fencer: make a few functions return void + +... to make planned changes easier. The return values were previously ignored. 
+--- + daemons/fenced/fenced_commands.c | 17 ++++++++------- + daemons/fenced/fenced_history.c | 6 +----- + daemons/fenced/fenced_remote.c | 35 ++++++++++++++----------------- + daemons/fenced/pacemaker-fenced.c | 6 +++--- + daemons/fenced/pacemaker-fenced.h | 8 +++---- + 5 files changed, 33 insertions(+), 39 deletions(-) + +diff --git a/daemons/fenced/fenced_commands.c b/daemons/fenced/fenced_commands.c +index af0a92450..ea7d281ce 100644 +--- a/daemons/fenced/fenced_commands.c ++++ b/daemons/fenced/fenced_commands.c +@@ -1411,8 +1411,8 @@ stonith_device_register(xmlNode * msg, const char **desc, gboolean from_cib) + return pcmk_ok; + } + +-int +-stonith_device_remove(const char *id, gboolean from_cib) ++void ++stonith_device_remove(const char *id, bool from_cib) + { + stonith_device_t *device = g_hash_table_lookup(device_list, id); + guint ndevices = 0; +@@ -1421,7 +1421,7 @@ stonith_device_remove(const char *id, gboolean from_cib) + ndevices = g_hash_table_size(device_list); + crm_info("Device '%s' not found (%d active device%s)", + id, ndevices, pcmk__plural_s(ndevices)); +- return pcmk_ok; ++ return; + } + + if (from_cib) { +@@ -1443,7 +1443,6 @@ stonith_device_remove(const char *id, gboolean from_cib) + (device->cib_registered? " cib" : ""), + (device->api_registered? " api" : "")); + } +- return pcmk_ok; + } + + /*! 
+@@ -3085,8 +3084,9 @@ handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, + need_reply = (rc != -EINPROGRESS); + + } else if (pcmk__str_eq(op, STONITH_OP_FENCE_HISTORY, pcmk__str_none)) { +- rc = stonith_fence_history(request, &data, remote_peer, call_options); +- if (call_options & st_opt_discard_reply) { ++ stonith_fence_history(request, &data, remote_peer, call_options); ++ rc = pcmk_ok; ++ if (pcmk_is_set(call_options, st_opt_discard_reply)) { + /* we don't expect answers to the broadcast + * we might have sent out + */ +@@ -3109,7 +3109,8 @@ handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, + const char *device_id = crm_element_value(dev, XML_ATTR_ID); + + if (is_privileged(client, op)) { +- rc = stonith_device_remove(device_id, FALSE); ++ stonith_device_remove(device_id, false); ++ rc = pcmk_ok; + } else { + rc = -EACCES; + } +@@ -3179,7 +3180,7 @@ handle_reply(pcmk__client_t *client, xmlNode *request, const char *remote_peer) + if (pcmk__str_eq(op, STONITH_OP_QUERY, pcmk__str_none)) { + process_remote_stonith_query(request); + } else if (pcmk__str_any_of(op, T_STONITH_NOTIFY, STONITH_OP_FENCE, NULL)) { +- process_remote_stonith_exec(request); ++ fenced_process_fencing_reply(request); + } else { + crm_err("Ignoring unknown %s reply from %s %s", + crm_str(op), ((client == NULL)? 
"peer" : "client"), +diff --git a/daemons/fenced/fenced_history.c b/daemons/fenced/fenced_history.c +index 7127593b6..bc159383c 100644 +--- a/daemons/fenced/fenced_history.c ++++ b/daemons/fenced/fenced_history.c +@@ -433,14 +433,11 @@ stonith_local_history(gboolean add_id, const char *target) + * a reply from + * \param[in] remote_peer + * \param[in] options call-options from the request +- * +- * \return always success as there is actully nothing that can go really wrong + */ +-int ++void + stonith_fence_history(xmlNode *msg, xmlNode **output, + const char *remote_peer, int options) + { +- int rc = 0; + const char *target = NULL; + xmlNode *dev = get_xpath_object("//@" F_STONITH_TARGET, msg, LOG_NEVER); + xmlNode *out_history = NULL; +@@ -525,5 +522,4 @@ stonith_fence_history(xmlNode *msg, xmlNode **output, + *output = stonith_local_history(FALSE, target); + } + free_xml(out_history); +- return rc; + } +diff --git a/daemons/fenced/fenced_remote.c b/daemons/fenced/fenced_remote.c +index 60ee5e32e..6338aebde 100644 +--- a/daemons/fenced/fenced_remote.c ++++ b/daemons/fenced/fenced_remote.c +@@ -2086,11 +2086,9 @@ process_remote_stonith_query(xmlNode * msg) + * or attempt another device as appropriate. 
+ * + * \param[in] msg XML reply received +- * +- * \return pcmk_ok on success, -errno on error + */ +-int +-process_remote_stonith_exec(xmlNode * msg) ++void ++fenced_process_fencing_reply(xmlNode *msg) + { + int rc = 0; + const char *id = NULL; +@@ -2098,13 +2096,13 @@ process_remote_stonith_exec(xmlNode * msg) + remote_fencing_op_t *op = NULL; + xmlNode *dev = get_xpath_object("//@" F_STONITH_REMOTE_OP_ID, msg, LOG_ERR); + +- CRM_CHECK(dev != NULL, return -EPROTO); ++ CRM_CHECK(dev != NULL, return); + + id = crm_element_value(dev, F_STONITH_REMOTE_OP_ID); +- CRM_CHECK(id != NULL, return -EPROTO); ++ CRM_CHECK(id != NULL, return); + + dev = get_xpath_object("//@" F_STONITH_RC, msg, LOG_ERR); +- CRM_CHECK(dev != NULL, return -EPROTO); ++ CRM_CHECK(dev != NULL, return); + + crm_element_value_int(dev, F_STONITH_RC, &rc); + +@@ -2125,35 +2123,35 @@ process_remote_stonith_exec(xmlNode * msg) + /* Could be for an event that began before we started */ + /* TODO: Record the op for later querying */ + crm_info("Received peer result of unknown or expired operation %s", id); +- return -EOPNOTSUPP; ++ return; + } + + if (op->devices && device && !pcmk__str_eq(op->devices->data, device, pcmk__str_casei)) { + crm_err("Received outdated reply for device %s (instead of %s) to " + "fence (%s) %s. 
Operation already timed out at peer level.", + device, (const char *) op->devices->data, op->action, op->target); +- return rc; ++ return; + } + + if (pcmk__str_eq(crm_element_value(msg, F_SUBTYPE), "broadcast", pcmk__str_casei)) { + crm_debug("Finalizing action '%s' targeting %s on behalf of %s@%s: %s " +- CRM_XS " rc=%d id=%.8s", ++ CRM_XS " id=%.8s", + op->action, op->target, op->client_name, op->originator, +- pcmk_strerror(rc), rc, op->id); ++ pcmk_strerror(rc), op->id); + if (rc == pcmk_ok) { + op->state = st_done; + } else { + op->state = st_failed; + } + remote_op_done(op, msg, rc, FALSE); +- return pcmk_ok; ++ return; + } else if (!pcmk__str_eq(op->originator, stonith_our_uname, pcmk__str_casei)) { + /* If this isn't a remote level broadcast, and we are not the + * originator of the operation, we should not be receiving this msg. */ + crm_err("Received non-broadcast fencing result for operation %.8s " + "we do not own (device %s targeting %s)", + op->id, device, op->target); +- return rc; ++ return; + } + + if (pcmk_is_set(op->call_options, st_opt_topology)) { +@@ -2168,7 +2166,7 @@ process_remote_stonith_exec(xmlNode * msg) + * and notify our local clients. */ + if (op->state == st_done) { + remote_op_done(op, msg, rc, FALSE); +- return rc; ++ return; + } + + if ((op->phase == 2) && (rc != pcmk_ok)) { +@@ -2184,14 +2182,14 @@ process_remote_stonith_exec(xmlNode * msg) + /* An operation completed successfully. Try another device if + * necessary, otherwise mark the operation as done. */ + advance_topology_device_in_level(op, device, msg, rc); +- return rc; ++ return; + } else { + /* This device failed, time to try another topology level. If no other + * levels are available, mark this operation as failed and report results. 
*/ + if (advance_topology_level(op, false) != pcmk_rc_ok) { + op->state = st_failed; + remote_op_done(op, msg, rc, FALSE); +- return rc; ++ return; + } + } + } else if (rc == pcmk_ok && op->devices == NULL) { +@@ -2199,12 +2197,12 @@ process_remote_stonith_exec(xmlNode * msg) + + op->state = st_done; + remote_op_done(op, msg, rc, FALSE); +- return rc; ++ return; + } else if (rc == -ETIME && op->devices == NULL) { + /* If the operation timed out don't bother retrying other peers. */ + op->state = st_failed; + remote_op_done(op, msg, rc, FALSE); +- return rc; ++ return; + } else { + /* fall-through and attempt other fencing action using another peer */ + } +@@ -2213,7 +2211,6 @@ process_remote_stonith_exec(xmlNode * msg) + crm_trace("Next for %s on behalf of %s@%s (rc was %d)", op->target, op->originator, + op->client_name, rc); + call_remote_stonith(op, NULL, rc); +- return rc; + } + + gboolean +diff --git a/daemons/fenced/pacemaker-fenced.c b/daemons/fenced/pacemaker-fenced.c +index a290e1670..0a8b3bf6f 100644 +--- a/daemons/fenced/pacemaker-fenced.c ++++ b/daemons/fenced/pacemaker-fenced.c +@@ -445,7 +445,7 @@ remove_cib_device(xmlXPathObjectPtr xpathObj) + + rsc_id = crm_element_value(match, XML_ATTR_ID); + +- stonith_device_remove(rsc_id, TRUE); ++ stonith_device_remove(rsc_id, true); + } + } + +@@ -610,7 +610,7 @@ watchdog_device_update(void) + } else { + /* be silent if no device - todo parameter to stonith_device_remove */ + if (g_hash_table_lookup(device_list, STONITH_WATCHDOG_ID)) { +- stonith_device_remove(STONITH_WATCHDOG_ID, TRUE); ++ stonith_device_remove(STONITH_WATCHDOG_ID, true); + } + } + } +@@ -847,7 +847,7 @@ update_cib_stonith_devices_v2(const char *event, xmlNode * msg) + } + if (search != NULL) { + *search = 0; +- stonith_device_remove(rsc_id, TRUE); ++ stonith_device_remove(rsc_id, true); + /* watchdog_device_update called afterwards + to fall back to implicit definition if needed */ + } else { +diff --git a/daemons/fenced/pacemaker-fenced.h 
b/daemons/fenced/pacemaker-fenced.h +index 3bc5dc3d1..5162ada75 100644 +--- a/daemons/fenced/pacemaker-fenced.h ++++ b/daemons/fenced/pacemaker-fenced.h +@@ -214,7 +214,7 @@ void stonith_command(pcmk__client_t *client, uint32_t id, uint32_t flags, + + int stonith_device_register(xmlNode * msg, const char **desc, gboolean from_cib); + +-int stonith_device_remove(const char *id, gboolean from_cib); ++void stonith_device_remove(const char *id, bool from_cib); + + char *stonith_level_key(xmlNode * msg, int mode); + int stonith_level_kind(xmlNode * msg); +@@ -241,14 +241,14 @@ remote_fencing_op_t *initiate_remote_stonith_op(pcmk__client_t *client, + xmlNode *request, + gboolean manual_ack); + +-int process_remote_stonith_exec(xmlNode * msg); ++void fenced_process_fencing_reply(xmlNode *msg); + + int process_remote_stonith_query(xmlNode * msg); + + void *create_remote_stonith_op(const char *client, xmlNode * request, gboolean peer); + +-int stonith_fence_history(xmlNode *msg, xmlNode **output, +- const char *remote_peer, int options); ++void stonith_fence_history(xmlNode *msg, xmlNode **output, ++ const char *remote_peer, int options); + + void stonith_fence_history_trim(void); + +-- +2.27.0 + + +From 27df49460930738e77f5ca42536aff1d3bdfcae7 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Fri, 19 Nov 2021 10:06:43 -0600 +Subject: [PATCH 17/19] Refactor: fencer: drop unnecessary argument when + advancing topology device + +If we're advancing to the next device in a topology level, by necessity that +means any previous device succeeded. 
+--- + daemons/fenced/fenced_remote.c | 19 +++++++++---------- + 1 file changed, 9 insertions(+), 10 deletions(-) + +diff --git a/daemons/fenced/fenced_remote.c b/daemons/fenced/fenced_remote.c +index 6338aebde..d54e6a4ef 100644 +--- a/daemons/fenced/fenced_remote.c ++++ b/daemons/fenced/fenced_remote.c +@@ -1519,14 +1519,13 @@ report_timeout_period(remote_fencing_op_t * op, int op_timeout) + * \internal + * \brief Advance an operation to the next device in its topology + * +- * \param[in,out] op Operation to advance +- * \param[in] device ID of device just completed +- * \param[in] msg XML reply that contained device result (if available) +- * \param[in] rc Return code of device's execution ++ * \param[in] op Fencer operation to advance ++ * \param[in] device ID of device that just completed ++ * \param[in] msg If not NULL, XML reply of last delegated fencing operation + */ + static void + advance_topology_device_in_level(remote_fencing_op_t *op, const char *device, +- xmlNode *msg, int rc) ++ xmlNode *msg) + { + /* Advance to the next device at this topology level, if any */ + if (op->devices) { +@@ -1556,8 +1555,8 @@ advance_topology_device_in_level(remote_fencing_op_t *op, const char *device, + + if (op->devices) { + /* Necessary devices remain, so execute the next one */ +- crm_trace("Next targeting %s on behalf of %s@%s (rc was %d)", +- op->target, op->client_name, op->originator, rc); ++ crm_trace("Next targeting %s on behalf of %s@%s", ++ op->target, op->client_name, op->originator); + + // The requested delay has been applied for the first device + if (op->delay > 0) { +@@ -1570,7 +1569,7 @@ advance_topology_device_in_level(remote_fencing_op_t *op, const char *device, + crm_trace("Marking complex fencing op targeting %s as complete", + op->target); + op->state = st_done; +- remote_op_done(op, msg, rc, FALSE); ++ remote_op_done(op, msg, pcmk_ok, FALSE); + } + } + +@@ -1701,7 +1700,7 @@ call_remote_stonith(remote_fencing_op_t *op, peer_device_info_t *peer, 
int rc) + */ + crm_warn("Ignoring %s 'on' failure (no capable peers) targeting %s " + "after successful 'off'", device, op->target); +- advance_topology_device_in_level(op, device, NULL, pcmk_ok); ++ advance_topology_device_in_level(op, device, NULL); + return; + + } else if (op->owner == FALSE) { +@@ -2181,7 +2180,7 @@ fenced_process_fencing_reply(xmlNode *msg) + if (rc == pcmk_ok) { + /* An operation completed successfully. Try another device if + * necessary, otherwise mark the operation as done. */ +- advance_topology_device_in_level(op, device, msg, rc); ++ advance_topology_device_in_level(op, device, msg); + return; + } else { + /* This device failed, time to try another topology level. If no other +-- +2.27.0 + + +From 05437e1339bc1f9071b43e97d5846a939687951d Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Mon, 29 Nov 2021 11:59:17 -0600 +Subject: [PATCH 18/19] Refactor: fencer: minor renames for consistency + +... per review +--- + daemons/fenced/fenced_remote.c | 13 ++++++------- + 1 file changed, 6 insertions(+), 7 deletions(-) + +diff --git a/daemons/fenced/fenced_remote.c b/daemons/fenced/fenced_remote.c +index d54e6a4ef..8feb40147 100644 +--- a/daemons/fenced/fenced_remote.c ++++ b/daemons/fenced/fenced_remote.c +@@ -63,7 +63,7 @@ typedef struct device_properties_s { + int delay_base[st_phase_max]; + } device_properties_t; + +-typedef struct st_query_result_s { ++typedef struct { + /* Name of peer that sent this result */ + char *host; + /* Only try peers for non-topology based operations once */ +@@ -95,13 +95,12 @@ sort_strings(gconstpointer a, gconstpointer b) + static void + free_remote_query(gpointer data) + { +- if (data) { +- peer_device_info_t *query = data; ++ if (data != NULL) { ++ peer_device_info_t *peer = data; + +- crm_trace("Free'ing query result from %s", query->host); +- g_hash_table_destroy(query->devices); +- free(query->host); +- free(query); ++ g_hash_table_destroy(peer->devices); ++ free(peer->host); ++ free(peer); + } + } + 
+-- +2.27.0 + + +From 86974d7cef05bafbed540d02e59514292581ae65 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Tue, 30 Nov 2021 08:33:41 -0600 +Subject: [PATCH 19/19] Refactor: fencer: simplify send_async_reply() + +... as suggested in review +--- + daemons/fenced/fenced_commands.c | 28 ++++++++++++---------------- + 1 file changed, 12 insertions(+), 16 deletions(-) + +diff --git a/daemons/fenced/fenced_commands.c b/daemons/fenced/fenced_commands.c +index ea7d281ce..f34cb4f13 100644 +--- a/daemons/fenced/fenced_commands.c ++++ b/daemons/fenced/fenced_commands.c +@@ -2384,36 +2384,34 @@ send_async_reply(async_command_t *cmd, const pcmk__action_result_t *result, + int pid, bool merged) + { + xmlNode *reply = NULL; +- bool bcast = false; + + CRM_CHECK((cmd != NULL) && (result != NULL), return); + ++ log_async_result(cmd, result, pid, NULL, merged); ++ + reply = construct_async_reply(cmd, result); ++ if (merged) { ++ crm_xml_add(reply, F_STONITH_MERGED, "true"); ++ } + +- // If target was also the originator, broadcast fencing results for it + if (!stand_alone && pcmk__is_fencing_action(cmd->action) + && pcmk__str_eq(cmd->origin, cmd->victim, pcmk__str_casei)) { +- ++ /* The target was also the originator, so broadcast the result on its ++ * behalf (since it will be unable to). 
++ */ + crm_trace("Broadcast '%s' result for %s (target was also originator)", + cmd->action, cmd->victim); + crm_xml_add(reply, F_SUBTYPE, "broadcast"); + crm_xml_add(reply, F_STONITH_OPERATION, T_STONITH_NOTIFY); +- bcast = true; +- } +- +- log_async_result(cmd, result, pid, NULL, merged); +- +- if (merged) { +- crm_xml_add(reply, F_STONITH_MERGED, "true"); +- } +- crm_log_xml_trace(reply, "Reply"); +- +- if (bcast) { + send_cluster_message(NULL, crm_msg_stonith_ng, reply, FALSE); + } else { ++ // Reply only to the originator + stonith_send_reply(reply, cmd->options, cmd->origin, cmd->client); + } + ++ crm_log_xml_trace(reply, "Reply"); ++ free_xml(reply); ++ + if (stand_alone) { + /* Do notification with a clean data object */ + xmlNode *notify_data = create_xml_node(NULL, T_STONITH_NOTIFY_FENCE); +@@ -2430,8 +2428,6 @@ send_async_reply(async_command_t *cmd, const pcmk__action_result_t *result, + do_stonith_notify(T_STONITH_NOTIFY_FENCE, rc, notify_data); + do_stonith_notify(T_STONITH_NOTIFY_HISTORY, pcmk_ok, NULL); + } +- +- free_xml(reply); + } + + static void +-- +2.27.0 + diff --git a/SOURCES/006-stateful-metadata.patch b/SOURCES/006-stateful-metadata.patch new file mode 100644 index 0000000..a9ea6f4 --- /dev/null +++ b/SOURCES/006-stateful-metadata.patch @@ -0,0 +1,143 @@ +From b52fe799c89637e2a761a5725c2376db5c05f2d1 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Tue, 30 Nov 2021 15:51:54 -0600 +Subject: [PATCH 1/2] Low: resources: remove DOCTYPE from OCF 1.1-compliant + agents + +OCF 1.1 replaced the DTD schema with RNG, but DOCTYPE still refers to the DTD. +There's no DOCTYPE for RNG, and DOCTYPE is optional, so just remove it. 
+--- + extra/resources/Dummy | 3 +-- + extra/resources/HealthIOWait | 3 +-- + extra/resources/Stateful | 3 +-- + extra/resources/attribute | 3 +-- + extra/resources/ping | 3 +-- + extra/resources/remote | 3 +-- + 6 files changed, 6 insertions(+), 12 deletions(-) + +diff --git a/extra/resources/Dummy b/extra/resources/Dummy +index a344deac0..56584e564 100755 +--- a/extra/resources/Dummy ++++ b/extra/resources/Dummy +@@ -58,8 +58,7 @@ + meta_data() { + cat < +- +- ++ + 1.1 + + +diff --git a/extra/resources/HealthIOWait b/extra/resources/HealthIOWait +index 43a8b70c4..5f1483ef7 100755 +--- a/extra/resources/HealthIOWait ++++ b/extra/resources/HealthIOWait +@@ -25,8 +25,7 @@ + meta_data() { + cat < +- +- ++ + 1.1 + + +diff --git a/extra/resources/Stateful b/extra/resources/Stateful +index ae3424bbf..0d2062d51 100755 +--- a/extra/resources/Stateful ++++ b/extra/resources/Stateful +@@ -39,8 +39,7 @@ SCORE_PROMOTED=10 + meta_data() { + cat < +- +- ++ + 1.1 + + +diff --git a/extra/resources/attribute b/extra/resources/attribute +index 1800dff8f..a2bd353e0 100755 +--- a/extra/resources/attribute ++++ b/extra/resources/attribute +@@ -57,8 +57,7 @@ END + meta_data() { + cat < +- +- ++ + 1.1 + Manages a node attribute + +diff --git a/extra/resources/ping b/extra/resources/ping +index 6e296979f..7cc6b802d 100755 +--- a/extra/resources/ping ++++ b/extra/resources/ping +@@ -36,8 +36,7 @@ + meta_data() { + cat < +- +- ++ + 1.1 + + +diff --git a/extra/resources/remote b/extra/resources/remote +index a53262bb6..f7e40dc81 100755 +--- a/extra/resources/remote ++++ b/extra/resources/remote +@@ -24,8 +24,7 @@ + meta_data() { + cat < +- +- ++ + 1.1 + Pacemaker Remote connection + +-- +2.27.0 + + +From 70f469120f8db6a024c786466ee74a6c7fbd1f43 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Tue, 30 Nov 2021 15:53:39 -0600 +Subject: [PATCH 2/2] Fix: resources: use correct syntax in Stateful meta-data + +The OCF standard only allows "0" or "1" for booleans. 
+ +This fixes incorrect ocf:pacemaker:Stateful meta-data syntax introduced by +7024398 as a regression in the 2.1.0 release. +--- + extra/resources/Stateful | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/extra/resources/Stateful b/extra/resources/Stateful +index 0d2062d51..2ebe6725f 100755 +--- a/extra/resources/Stateful ++++ b/extra/resources/Stateful +@@ -57,7 +57,7 @@ Location to store the resource state in + + + +- ++ + + If this is set, the environment will be dumped to this file for every call. + +@@ -65,7 +65,7 @@ If this is set, the environment will be dumped to this file for every call. + + + +- ++ + + The notify action will sleep for this many seconds before returning, + to simulate a long-running notify. +-- +2.27.0 + diff --git a/SOURCES/007-memory-leak.patch b/SOURCES/007-memory-leak.patch new file mode 100644 index 0000000..38ad3a2 --- /dev/null +++ b/SOURCES/007-memory-leak.patch @@ -0,0 +1,39 @@ +From f491d9d5a7ed554fed985de356bb085fdec3421c Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Tue, 7 Dec 2021 09:01:00 -0600 +Subject: [PATCH] Fix: fencer: avoid memory leak when broadcasting history + differences + +Regression introduced in 2.1.0 by dbc27b2 +--- + daemons/fenced/fenced_history.c | 7 +++++-- + 1 file changed, 5 insertions(+), 2 deletions(-) + +diff --git a/daemons/fenced/fenced_history.c b/daemons/fenced/fenced_history.c +index bc159383c..a9c57dc86 100644 +--- a/daemons/fenced/fenced_history.c ++++ b/daemons/fenced/fenced_history.c +@@ -484,8 +484,6 @@ stonith_fence_history(xmlNode *msg, xmlNode **output, + !pcmk__str_eq(remote_peer, stonith_our_uname, pcmk__str_casei)) { + xmlNode *history = get_xpath_object("//" F_STONITH_HISTORY_LIST, + msg, LOG_NEVER); +- GHashTable *received_history = +- history?stonith_xml_history_to_list(history):NULL; + + /* either a broadcast created directly upon stonith-API request + * or a diff as response to such a thing +@@ -497,6 +495,11 @@ stonith_fence_history(xmlNode *msg, 
xmlNode **output, + if (!history || + !crm_is_true(crm_element_value(history, + F_STONITH_DIFFERENTIAL))) { ++ GHashTable *received_history = NULL; ++ ++ if (history != NULL) { ++ received_history = stonith_xml_history_to_list(history); ++ } + out_history = + stonith_local_history_diff_and_merge(received_history, TRUE, NULL); + if (out_history) { +-- +2.27.0 + diff --git a/SOURCES/008-fencing-history.patch b/SOURCES/008-fencing-history.patch new file mode 100644 index 0000000..1ea9ac7 --- /dev/null +++ b/SOURCES/008-fencing-history.patch @@ -0,0 +1,43 @@ +From 0339e89f3238b31df78b864dae8684b82c370741 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Mon, 13 Dec 2021 15:22:40 -0600 +Subject: [PATCH] Fix: fencer: get current time correctly + +f52bc8e1ce (2.1.2) introduced a regression by using clock_gettime() with +CLOCK_MONOTONIC to get the current time. Use qb_util_timespec_from_epoch_get() +instead (which as of this writing uses clock_gettime() with CLOCK_REALTIME if +available, and falls back to gettimeofday() if not). +--- + daemons/fenced/fenced_commands.c | 11 +++-------- + 1 file changed, 3 insertions(+), 8 deletions(-) + +diff --git a/daemons/fenced/fenced_commands.c b/daemons/fenced/fenced_commands.c +index f34cb4f13..7685cb8c3 100644 +--- a/daemons/fenced/fenced_commands.c ++++ b/daemons/fenced/fenced_commands.c +@@ -2746,19 +2746,14 @@ bool fencing_peer_active(crm_node_t *peer) + return FALSE; + } + +-void set_fencing_completed(remote_fencing_op_t * op) ++void ++set_fencing_completed(remote_fencing_op_t *op) + { +-#ifdef CLOCK_MONOTONIC + struct timespec tv; + +- clock_gettime(CLOCK_MONOTONIC, &tv); +- ++ qb_util_timespec_from_epoch_get(&tv); + op->completed = tv.tv_sec; + op->completed_nsec = tv.tv_nsec; +-#else +- op->completed = time(NULL); +- op->completed_nsec = 0L; +-#endif + } + + /*! 
+-- +2.27.0 + diff --git a/SOURCES/009-fencing-reasons.patch b/SOURCES/009-fencing-reasons.patch new file mode 100644 index 0000000..3fb5bc7 --- /dev/null +++ b/SOURCES/009-fencing-reasons.patch @@ -0,0 +1,2985 @@ +From fcd42a5926e9a63d425586552ecc7b543838d352 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Thu, 11 Nov 2021 16:57:03 -0600 +Subject: [PATCH 01/23] Feature: fencer: pass full result in async command + replies + +The services library callbacks for async commands, which call +send_async_reply() -> construct_async_reply() to create the reply, now add +fields for exit status, operation status, and exit reason, in addition to the +existing action standard output and legacy return code. + +Nothing uses the new fields yet. +--- + daemons/fenced/fenced_commands.c | 10 ++++------ + 1 file changed, 4 insertions(+), 6 deletions(-) + +diff --git a/daemons/fenced/fenced_commands.c b/daemons/fenced/fenced_commands.c +index f34cb4f136..3497428c18 100644 +--- a/daemons/fenced/fenced_commands.c ++++ b/daemons/fenced/fenced_commands.c +@@ -2415,9 +2415,8 @@ send_async_reply(async_command_t *cmd, const pcmk__action_result_t *result, + if (stand_alone) { + /* Do notification with a clean data object */ + xmlNode *notify_data = create_xml_node(NULL, T_STONITH_NOTIFY_FENCE); +- int rc = pcmk_rc2legacy(stonith__result2rc(result)); + +- crm_xml_add_int(notify_data, F_STONITH_RC, rc); ++ stonith__xe_set_result(notify_data, result); + crm_xml_add(notify_data, F_STONITH_TARGET, cmd->victim); + crm_xml_add(notify_data, F_STONITH_OPERATION, cmd->op); + crm_xml_add(notify_data, F_STONITH_DELEGATE, "localhost"); +@@ -2425,7 +2424,7 @@ send_async_reply(async_command_t *cmd, const pcmk__action_result_t *result, + crm_xml_add(notify_data, F_STONITH_REMOTE_OP_ID, cmd->remote_op_id); + crm_xml_add(notify_data, F_STONITH_ORIGIN, cmd->client); + +- do_stonith_notify(T_STONITH_NOTIFY_FENCE, rc, notify_data); ++ do_stonith_notify(T_STONITH_NOTIFY_FENCE, 
pcmk_rc2legacy(stonith__result2rc(result)), notify_data); + do_stonith_notify(T_STONITH_NOTIFY_HISTORY, pcmk_ok, NULL); + } + } +@@ -2728,9 +2727,8 @@ construct_async_reply(async_command_t *cmd, const pcmk__action_result_t *result) + crm_xml_add(reply, F_STONITH_ORIGIN, cmd->origin); + crm_xml_add_int(reply, F_STONITH_CALLID, cmd->id); + crm_xml_add_int(reply, F_STONITH_CALLOPTS, cmd->options); +- crm_xml_add_int(reply, F_STONITH_RC, +- pcmk_rc2legacy(stonith__result2rc(result))); +- crm_xml_add(reply, F_STONITH_OUTPUT, result->action_stdout); ++ ++ stonith__xe_set_result(reply, result); + return reply; + } + +-- +2.27.0 + + +From 4bac2e9811872f92571e4f5a47d8c5032cfc3016 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Thu, 18 Nov 2021 12:41:29 -0600 +Subject: [PATCH 02/23] Refactor: fencer: track full result for direct agent + actions + +This renames stonith_device_action() to execute_agent_action() for readability, +and has it set a full result rather than return a legacy return code. + +As of this commit, handle_request() just maps the result back to a legacy code, +but it will make better use of it with planned changes. +--- + daemons/fenced/fenced_commands.c | 95 +++++++++++++++++++------------- + 1 file changed, 56 insertions(+), 39 deletions(-) + +diff --git a/daemons/fenced/fenced_commands.c b/daemons/fenced/fenced_commands.c +index 3497428c18..2f59ef84b7 100644 +--- a/daemons/fenced/fenced_commands.c ++++ b/daemons/fenced/fenced_commands.c +@@ -1729,23 +1729,6 @@ stonith_level_remove(xmlNode *msg, char **desc) + return pcmk_ok; + } + +-/*! +- * \internal +- * \brief Schedule an (asynchronous) action directly on a stonith device +- * +- * Handle a STONITH_OP_EXEC API message by scheduling a requested agent action +- * directly on a specified device. Only list, monitor, and status actions are +- * expected to use this call, though it should work with any agent command. 
+- * +- * \param[in] msg API message XML with desired action +- * \param[out] output Unused +- * +- * \return -EINPROGRESS on success, -errno otherwise +- * \note If the action is monitor, the device must be registered via the API +- * (CIB registration is not sufficient), because monitor should not be +- * possible unless the device is "started" (API registered). +- */ +- + static char * + list_to_string(GList *list, const char *delim, gboolean terminate_with_delim) + { +@@ -1778,8 +1761,23 @@ list_to_string(GList *list, const char *delim, gboolean terminate_with_delim) + return rv; + } + +-static int +-stonith_device_action(xmlNode * msg, char **output) ++/*! ++ * \internal ++ * \brief Execute a fence agent action directly (and asynchronously) ++ * ++ * Handle a STONITH_OP_EXEC API message by scheduling a requested agent action ++ * directly on a specified device. Only list, monitor, and status actions are ++ * expected to use this call, though it should work with any agent command. ++ * ++ * \param[in] msg Request XML specifying action ++ * \param[out] result Where to store result of action ++ * ++ * \note If the action is monitor, the device must be registered via the API ++ * (CIB registration is not sufficient), because monitor should not be ++ * possible unless the device is "started" (API registered). ++ */ ++static void ++execute_agent_action(xmlNode *msg, pcmk__action_result_t *result) + { + xmlNode *dev = get_xpath_object("//" F_STONITH_DEVICE, msg, LOG_ERR); + xmlNode *op = get_xpath_object("//@" F_STONITH_ACTION, msg, LOG_ERR); +@@ -1792,39 +1790,56 @@ stonith_device_action(xmlNode * msg, char **output) + crm_info("Malformed API action request: device %s, action %s", + (id? id : "not specified"), + (action? 
action : "not specified")); +- return -EPROTO; ++ fenced_set_protocol_error(result); ++ return; + } + + if (pcmk__str_eq(id, STONITH_WATCHDOG_ID, pcmk__str_none)) { ++ // Watchdog agent actions are implemented internally + if (stonith_watchdog_timeout_ms <= 0) { +- return -ENODEV; +- } else { +- if (pcmk__str_eq(action, "list", pcmk__str_casei)) { +- *output = list_to_string(stonith_watchdog_targets, "\n", TRUE); +- return pcmk_ok; +- } else if (pcmk__str_eq(action, "monitor", pcmk__str_casei)) { +- return pcmk_ok; +- } ++ pcmk__set_result(result, CRM_EX_ERROR, PCMK_EXEC_NO_FENCE_DEVICE, ++ "Watchdog fence device not configured"); ++ return; ++ ++ } else if (pcmk__str_eq(action, "list", pcmk__str_casei)) { ++ pcmk__set_result(result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); ++ pcmk__set_result_output(result, ++ list_to_string(stonith_watchdog_targets, ++ "\n", TRUE), ++ NULL); ++ return; ++ ++ } else if (pcmk__str_eq(action, "monitor", pcmk__str_casei)) { ++ pcmk__set_result(result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); ++ return; + } + } + + device = g_hash_table_lookup(device_list, id); +- if ((device == NULL) +- || (!device->api_registered && !strcmp(action, "monitor"))) { ++ if (device == NULL) { ++ crm_info("Ignoring API '%s' action request because device %s not found", ++ action, id); ++ pcmk__set_result(result, CRM_EX_ERROR, PCMK_EXEC_NO_FENCE_DEVICE, ++ NULL); ++ return; + ++ } else if (!device->api_registered && !strcmp(action, "monitor")) { + // Monitors may run only on "started" (API-registered) devices +- crm_info("Ignoring API '%s' action request because device %s not found", ++ crm_info("Ignoring API '%s' action request because device %s not active", + action, id); +- return -ENODEV; ++ pcmk__set_result(result, CRM_EX_ERROR, PCMK_EXEC_NO_FENCE_DEVICE, ++ "Fence device not active"); ++ return; + } + + cmd = create_async_command(msg); + if (cmd == NULL) { +- return -EPROTO; ++ fenced_set_protocol_error(result); ++ return; + } + + schedule_stonith_command(cmd, 
device); +- return -EINPROGRESS; ++ pcmk__set_result(result, CRM_EX_OK, PCMK_EXEC_PENDING, NULL); + } + + static void +@@ -2911,8 +2926,8 @@ handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, + + xmlNode *data = NULL; + bool need_reply = true; ++ pcmk__action_result_t result = PCMK__UNKNOWN_RESULT; + +- char *output = NULL; + const char *op = crm_element_value(request, F_STONITH_OPERATION); + const char *client_id = crm_element_value(request, F_STONITH_CLIENTID); + +@@ -2935,8 +2950,9 @@ handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, + need_reply = false; + + } else if (pcmk__str_eq(op, STONITH_OP_EXEC, pcmk__str_none)) { +- rc = stonith_device_action(request, &output); +- need_reply = (rc != -EINPROGRESS); ++ execute_agent_action(request, &result); ++ need_reply = (result.execution_status != PCMK_EXEC_PENDING); ++ rc = pcmk_rc2legacy(stonith__result2rc(&result)); + + } else if (pcmk__str_eq(op, STONITH_OP_TIMEOUT_UPDATE, pcmk__str_none)) { + const char *call_id = crm_element_value(request, F_STONITH_CALLID); +@@ -3150,19 +3166,20 @@ handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, + done: + // Reply if result is known + if (need_reply) { +- xmlNode *reply = stonith_construct_reply(request, output, data, rc); ++ xmlNode *reply = stonith_construct_reply(request, result.action_stdout, data, rc); + + stonith_send_reply(reply, call_options, remote_peer, client_id); + free_xml(reply); + } + +- free(output); + free_xml(data); + + crm_debug("Processed %s request from %s %s: %s (rc=%d)", + op, ((client == NULL)? "peer" : "client"), + ((client == NULL)? remote_peer : pcmk__client_name(client)), + ((rc > 0)? 
"" : pcmk_strerror(rc)), rc); ++ ++ pcmk__reset_result(&result); + } + + static void +-- +2.27.0 + + +From 9601b2aff1ea6a4eef0bb2701c22c1e971a657eb Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Thu, 18 Nov 2021 17:31:20 -0600 +Subject: [PATCH 03/23] Refactor: fencer: track full result for local fencing + +This renames stonith_fence() to fence_locally() for readability, and has it set +a full result rather than return a legacy return code. + +As of this commit, handle_request() just maps the result back to a legacy code, +but it will make better use of it with planned changes. +--- + daemons/fenced/fenced_commands.c | 38 +++++++++++++++++++++----------- + 1 file changed, 25 insertions(+), 13 deletions(-) + +diff --git a/daemons/fenced/fenced_commands.c b/daemons/fenced/fenced_commands.c +index 2f59ef84b7..bfb0d71e5f 100644 +--- a/daemons/fenced/fenced_commands.c ++++ b/daemons/fenced/fenced_commands.c +@@ -2626,37 +2626,49 @@ stonith_fence_get_devices_cb(GList * devices, void *user_data) + } + } + +-static int +-stonith_fence(xmlNode * msg) ++/*! 
++ * \internal ++ * \brief Execute a fence action via the local node ++ * ++ * \param[in] msg Fencing request ++ * \param[out] result Where to store result of fence action ++ */ ++static void ++fence_locally(xmlNode *msg, pcmk__action_result_t *result) + { + const char *device_id = NULL; + stonith_device_t *device = NULL; + async_command_t *cmd = create_async_command(msg); + xmlNode *dev = get_xpath_object("//@" F_STONITH_TARGET, msg, LOG_ERR); + ++ CRM_CHECK(result != NULL, return); ++ + if (cmd == NULL) { +- return -EPROTO; ++ fenced_set_protocol_error(result); ++ return; + } + + device_id = crm_element_value(dev, F_STONITH_DEVICE); +- if (device_id) { ++ if (device_id != NULL) { + device = g_hash_table_lookup(device_list, device_id); + if (device == NULL) { + crm_err("Requested device '%s' is not available", device_id); +- return -ENODEV; ++ pcmk__set_result(result, CRM_EX_ERROR, PCMK_EXEC_NO_FENCE_DEVICE, ++ "Requested fence device not found"); ++ return; + } + schedule_stonith_command(cmd, device); + + } else { + const char *host = crm_element_value(dev, F_STONITH_TARGET); + +- if (cmd->options & st_opt_cs_nodeid) { +- int nodeid; +- crm_node_t *node; ++ if (pcmk_is_set(cmd->options, st_opt_cs_nodeid)) { ++ int nodeid = 0; ++ crm_node_t *node = NULL; + + pcmk__scan_min_int(host, &nodeid, 0); + node = pcmk__search_known_node_cache(nodeid, NULL, CRM_GET_PEER_ANY); +- if (node) { ++ if (node != NULL) { + host = node->uname; + } + } +@@ -2666,7 +2678,7 @@ stonith_fence(xmlNode * msg) + TRUE, cmd, stonith_fence_get_devices_cb); + } + +- return -EINPROGRESS; ++ pcmk__set_result(result, CRM_EX_OK, PCMK_EXEC_PENDING, NULL); + } + + xmlNode * +@@ -3016,9 +3028,9 @@ handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, + } + + } else if (pcmk__str_eq(op, STONITH_OP_FENCE, pcmk__str_none)) { +- +- if (remote_peer || stand_alone) { +- rc = stonith_fence(request); ++ if ((remote_peer != NULL) || stand_alone) { ++ fence_locally(request, &result); ++ rc = 
pcmk_rc2legacy(stonith__result2rc(&result)); + + } else if (pcmk_is_set(call_options, st_opt_manual_ack)) { + switch (fenced_handle_manual_confirmation(client, request)) { +-- +2.27.0 + + +From b7c7676cfd36fd72d3b29e86a23db97081e19b03 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Mon, 29 Nov 2021 17:06:52 -0600 +Subject: [PATCH 04/23] Low: fencer: handle topology level registration errors + better + +Rename stonith_level_register() to fenced_register_level() for consistency, and +refactor it to return a full result rather than a legacy return code. + +Return a protocol error for missing information in the request XML, and log +invalid level numbers at warning level. Use a new combination of +PCMK_EXEC_INVALID with CRM_EX_INVALID_PARAM for invalid levels, so it gets +mapped back to the legacy code -EINVAL (which was returned before). +--- + daemons/fenced/fenced_commands.c | 52 +++++++++++++++++++++---------- + daemons/fenced/pacemaker-fenced.c | 9 +++--- + daemons/fenced/pacemaker-fenced.h | 3 +- + lib/fencing/st_actions.c | 1 + + 4 files changed, 44 insertions(+), 21 deletions(-) + +diff --git a/daemons/fenced/fenced_commands.c b/daemons/fenced/fenced_commands.c +index bfb0d71e5f..975f8633a4 100644 +--- a/daemons/fenced/fenced_commands.c ++++ b/daemons/fenced/fenced_commands.c +@@ -1583,20 +1583,19 @@ parse_device_list(const char *devices) + + /*! + * \internal +- * \brief Register a STONITH level for a target ++ * \brief Register a fencing topology level for a target + * + * Given an XML request specifying the target name, level index, and device IDs + * for the level, this will create an entry for the target in the global topology + * table if one does not already exist, then append the specified device IDs to + * the entry's device list for the specified level. 
+ * +- * \param[in] msg XML request for STONITH level registration +- * \param[out] desc If not NULL, will be set to string representation ("TARGET[LEVEL]") +- * +- * \return pcmk_ok on success, -EINVAL if XML does not specify valid level index ++ * \param[in] msg XML request for STONITH level registration ++ * \param[out] desc If not NULL, set to string representation "TARGET[LEVEL]" ++ * \param[out] result Where to set result of registration + */ +-int +-stonith_level_register(xmlNode *msg, char **desc) ++void ++fenced_register_level(xmlNode *msg, char **desc, pcmk__action_result_t *result) + { + int id = 0; + xmlNode *level; +@@ -1607,6 +1606,13 @@ stonith_level_register(xmlNode *msg, char **desc) + stonith_key_value_t *dIter = NULL; + stonith_key_value_t *devices = NULL; + ++ CRM_CHECK(result != NULL, return); ++ ++ if (msg == NULL) { ++ fenced_set_protocol_error(result); ++ return; ++ } ++ + /* Allow the XML here to point to the level tag directly, or wrapped in + * another tag. If directly, don't search by xpath, because it might give + * multiple hits (e.g. if the XML is the CIB). 
+@@ -1614,11 +1620,15 @@ stonith_level_register(xmlNode *msg, char **desc) + if (pcmk__str_eq(TYPE(msg), XML_TAG_FENCING_LEVEL, pcmk__str_casei)) { + level = msg; + } else { +- level = get_xpath_object("//" XML_TAG_FENCING_LEVEL, msg, LOG_ERR); ++ level = get_xpath_object("//" XML_TAG_FENCING_LEVEL, msg, LOG_WARNING); ++ } ++ if (level == NULL) { ++ fenced_set_protocol_error(result); ++ return; + } +- CRM_CHECK(level != NULL, return -EINVAL); + + mode = stonith_level_kind(level); ++ + target = stonith_level_key(level, mode); + crm_element_value_int(level, XML_ATTR_STONITH_INDEX, &id); + +@@ -1626,18 +1636,26 @@ stonith_level_register(xmlNode *msg, char **desc) + *desc = crm_strdup_printf("%s[%d]", target, id); + } + +- /* Sanity-check arguments */ +- if (mode >= 3 || (id <= 0) || (id >= ST_LEVEL_MAX)) { +- crm_trace("Could not add %s[%d] (%d) to the topology (%d active entries)", target, id, mode, g_hash_table_size(topology)); ++ // Ensure level ID is in allowed range ++ if ((id <= 0) || (id >= ST_LEVEL_MAX)) { ++ crm_warn("Ignoring topology registration for %s with invalid level %d", ++ target, id); + free(target); +- crm_log_xml_err(level, "Bad topology"); +- return -EINVAL; ++ crm_log_xml_warn(level, "Bad level"); ++ pcmk__set_result(result, CRM_EX_INVALID_PARAM, PCMK_EXEC_INVALID, ++ "Invalid topology level"); ++ return; + } + + /* Find or create topology table entry */ + tp = g_hash_table_lookup(topology, target); + if (tp == NULL) { + tp = calloc(1, sizeof(stonith_topology_t)); ++ if (tp == NULL) { ++ pcmk__set_result(result, CRM_EX_ERROR, PCMK_EXEC_ERROR, ++ strerror(ENOMEM)); ++ return; ++ } + tp->kind = mode; + tp->target = target; + tp->target_value = crm_element_value_copy(level, XML_ATTR_STONITH_TARGET_VALUE); +@@ -1671,7 +1689,8 @@ stonith_level_register(xmlNode *msg, char **desc) + crm_info("Target %s has %d active fencing level%s", + tp->target, nlevels, pcmk__plural_s(nlevels)); + } +- return pcmk_ok; ++ ++ pcmk__set_result(result, CRM_EX_OK, 
PCMK_EXEC_DONE, NULL); + } + + int +@@ -3142,7 +3161,8 @@ handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, + char *device_id = NULL; + + if (is_privileged(client, op)) { +- rc = stonith_level_register(request, &device_id); ++ fenced_register_level(request, &device_id, &result); ++ rc = pcmk_rc2legacy(stonith__result2rc(&result)); + } else { + rc = -EACCES; + } +diff --git a/daemons/fenced/pacemaker-fenced.c b/daemons/fenced/pacemaker-fenced.c +index 0a8b3bf6f2..469304f67c 100644 +--- a/daemons/fenced/pacemaker-fenced.c ++++ b/daemons/fenced/pacemaker-fenced.c +@@ -452,8 +452,8 @@ remove_cib_device(xmlXPathObjectPtr xpathObj) + static void + handle_topology_change(xmlNode *match, bool remove) + { +- int rc; + char *desc = NULL; ++ pcmk__action_result_t result = PCMK__UNKNOWN_RESULT; + + CRM_CHECK(match != NULL, return); + crm_trace("Updating %s", ID(match)); +@@ -467,9 +467,10 @@ handle_topology_change(xmlNode *match, bool remove) + free(key); + } + +- rc = stonith_level_register(match, &desc); +- do_stonith_notify_level(STONITH_OP_LEVEL_ADD, rc, desc); +- ++ fenced_register_level(match, &desc, &result); ++ do_stonith_notify_level(STONITH_OP_LEVEL_ADD, ++ pcmk_rc2legacy(stonith__result2rc(&result)), desc); ++ pcmk__reset_result(&result); + free(desc); + } + +diff --git a/daemons/fenced/pacemaker-fenced.h b/daemons/fenced/pacemaker-fenced.h +index 5162ada75d..cf114fb979 100644 +--- a/daemons/fenced/pacemaker-fenced.h ++++ b/daemons/fenced/pacemaker-fenced.h +@@ -218,7 +218,8 @@ void stonith_device_remove(const char *id, bool from_cib); + + char *stonith_level_key(xmlNode * msg, int mode); + int stonith_level_kind(xmlNode * msg); +-int stonith_level_register(xmlNode * msg, char **desc); ++void fenced_register_level(xmlNode *msg, char **desc, ++ pcmk__action_result_t *result); + + int stonith_level_remove(xmlNode * msg, char **desc); + +diff --git a/lib/fencing/st_actions.c b/lib/fencing/st_actions.c +index 7eaa8b0f2b..37fa849847 100644 +--- 
a/lib/fencing/st_actions.c ++++ b/lib/fencing/st_actions.c +@@ -325,6 +325,7 @@ stonith__result2rc(const pcmk__action_result_t *result) + */ + case PCMK_EXEC_INVALID: + switch (result->exit_status) { ++ case CRM_EX_INVALID_PARAM: return EINVAL; + case CRM_EX_INSUFFICIENT_PRIV: return EACCES; + case CRM_EX_PROTOCOL: return EPROTO; + +-- +2.27.0 + + +From 27cedca4070328ecac1761f81c2890059af19dcf Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Mon, 29 Nov 2021 17:29:38 -0600 +Subject: [PATCH 05/23] Low: fencer: handle topology level unregistration + errors better + +Rename stonith_level_remove() to fenced_unregister_level() for consistency, and +refactor it to return a full result rather than a legacy return code. + +Return a protocol error for missing information in the request XML, and log +invalid level numbers at warning level. Use PCMK_EXEC_INVALID with +CRM_EX_INVALID_PARAM for invalid levels, so it gets mapped back to the legacy +code -EINVAL (which reverses the recent change in ec60f014b, both for backward +compatibility and because it makes sense -- a missing parameter is a protocol +error, while an invalid parameter is an invalid parameter error). +--- + daemons/fenced/fenced_commands.c | 52 ++++++++++++++++++++++++------- + daemons/fenced/pacemaker-fenced.c | 9 +++--- + daemons/fenced/pacemaker-fenced.h | 4 +-- + 3 files changed, 48 insertions(+), 17 deletions(-) + +diff --git a/daemons/fenced/fenced_commands.c b/daemons/fenced/fenced_commands.c +index 975f8633a4..ef41dc0e52 100644 +--- a/daemons/fenced/fenced_commands.c ++++ b/daemons/fenced/fenced_commands.c +@@ -1693,25 +1693,54 @@ fenced_register_level(xmlNode *msg, char **desc, pcmk__action_result_t *result) + pcmk__set_result(result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); + } + +-int +-stonith_level_remove(xmlNode *msg, char **desc) ++/*! 
++ * \internal ++ * \brief Unregister a fencing topology level for a target ++ * ++ * Given an XML request specifying the target name and level index (or 0 for all ++ * levels), this will remove any corresponding entry for the target from the ++ * global topology table. ++ * ++ * \param[in] msg XML request for STONITH level registration ++ * \param[out] desc If not NULL, set to string representation "TARGET[LEVEL]" ++ * \param[out] result Where to set result of unregistration ++ */ ++void ++fenced_unregister_level(xmlNode *msg, char **desc, ++ pcmk__action_result_t *result) + { + int id = -1; + stonith_topology_t *tp; + char *target; ++ xmlNode *level = NULL; ++ ++ CRM_CHECK(result != NULL, return); + +- /* Unlike additions, removal requests should always have one level tag */ +- xmlNode *level = get_xpath_object("//" XML_TAG_FENCING_LEVEL, msg, LOG_ERR); ++ if (msg == NULL) { ++ fenced_set_protocol_error(result); ++ return; ++ } + +- CRM_CHECK(level != NULL, return -EPROTO); ++ // Unlike additions, removal requests should always have one level tag ++ level = get_xpath_object("//" XML_TAG_FENCING_LEVEL, msg, LOG_WARNING); ++ if (level == NULL) { ++ fenced_set_protocol_error(result); ++ return; ++ } + + target = stonith_level_key(level, -1); + crm_element_value_int(level, XML_ATTR_STONITH_INDEX, &id); + +- CRM_CHECK((id >= 0) && (id < ST_LEVEL_MAX), +- crm_log_xml_warn(msg, "invalid level"); +- free(target); +- return -EPROTO); ++ // Ensure level ID is in allowed range ++ if ((id < 0) || (id >= ST_LEVEL_MAX)) { ++ crm_warn("Ignoring topology unregistration for %s with invalid level %d", ++ target, id); ++ free(target); ++ crm_log_xml_warn(level, "Bad level"); ++ pcmk__set_result(result, CRM_EX_INVALID_PARAM, PCMK_EXEC_INVALID, ++ "Invalid topology level"); ++ return; ++ } + + if (desc) { + *desc = crm_strdup_printf("%s[%d]", target, id); +@@ -1745,7 +1774,7 @@ stonith_level_remove(xmlNode *msg, char **desc) + } + + free(target); +- return pcmk_ok; ++ 
pcmk__set_result(result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); + } + + static char * +@@ -3173,7 +3202,8 @@ handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, + char *device_id = NULL; + + if (is_privileged(client, op)) { +- rc = stonith_level_remove(request, &device_id); ++ fenced_unregister_level(request, &device_id, &result); ++ rc = pcmk_rc2legacy(stonith__result2rc(&result)); + } else { + rc = -EACCES; + } +diff --git a/daemons/fenced/pacemaker-fenced.c b/daemons/fenced/pacemaker-fenced.c +index 469304f67c..56acc93f31 100644 +--- a/daemons/fenced/pacemaker-fenced.c ++++ b/daemons/fenced/pacemaker-fenced.c +@@ -409,17 +409,18 @@ do_stonith_notify_level(const char *op, int rc, const char *desc) + static void + topology_remove_helper(const char *node, int level) + { +- int rc; + char *desc = NULL; ++ pcmk__action_result_t result = PCMK__UNKNOWN_RESULT; + xmlNode *data = create_xml_node(NULL, XML_TAG_FENCING_LEVEL); + + crm_xml_add(data, F_STONITH_ORIGIN, __func__); + crm_xml_add_int(data, XML_ATTR_STONITH_INDEX, level); + crm_xml_add(data, XML_ATTR_STONITH_TARGET, node); + +- rc = stonith_level_remove(data, &desc); +- do_stonith_notify_level(STONITH_OP_LEVEL_DEL, rc, desc); +- ++ fenced_unregister_level(data, &desc, &result); ++ do_stonith_notify_level(STONITH_OP_LEVEL_DEL, ++ pcmk_rc2legacy(stonith__result2rc(&result)), desc); ++ pcmk__reset_result(&result); + free_xml(data); + free(desc); + } +diff --git a/daemons/fenced/pacemaker-fenced.h b/daemons/fenced/pacemaker-fenced.h +index cf114fb979..0006e02e7d 100644 +--- a/daemons/fenced/pacemaker-fenced.h ++++ b/daemons/fenced/pacemaker-fenced.h +@@ -220,8 +220,8 @@ char *stonith_level_key(xmlNode * msg, int mode); + int stonith_level_kind(xmlNode * msg); + void fenced_register_level(xmlNode *msg, char **desc, + pcmk__action_result_t *result); +- +-int stonith_level_remove(xmlNode * msg, char **desc); ++void fenced_unregister_level(xmlNode *msg, char **desc, ++ pcmk__action_result_t *result); + + 
stonith_topology_t *find_topology_for_host(const char *host); + +-- +2.27.0 + + +From 3f603defca78eb2bdd46c51a80ed04a4c773442b Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Fri, 19 Nov 2021 12:22:33 -0600 +Subject: [PATCH 06/23] Log: fencer: track and log full result when handling + requests + +handle_request() now tracks and logs a full result rather than just a +legacy return code. +--- + daemons/fenced/fenced_commands.c | 95 ++++++++++++++++++-------------- + 1 file changed, 53 insertions(+), 42 deletions(-) + +diff --git a/daemons/fenced/fenced_commands.c b/daemons/fenced/fenced_commands.c +index ef41dc0e52..996c18faaa 100644 +--- a/daemons/fenced/fenced_commands.c ++++ b/daemons/fenced/fenced_commands.c +@@ -2981,9 +2981,7 @@ static void + handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, + xmlNode *request, const char *remote_peer) + { +- int call_options = 0; +- int rc = -EOPNOTSUPP; +- ++ int call_options = st_opt_none; + xmlNode *data = NULL; + bool need_reply = true; + pcmk__action_result_t result = PCMK__UNKNOWN_RESULT; +@@ -3006,13 +3004,12 @@ handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, + pcmk__ipc_send_xml(client, id, reply, flags); + client->request_id = 0; + free_xml(reply); +- rc = pcmk_ok; ++ pcmk__set_result(&result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); + need_reply = false; + + } else if (pcmk__str_eq(op, STONITH_OP_EXEC, pcmk__str_none)) { + execute_agent_action(request, &result); + need_reply = (result.execution_status != PCMK_EXEC_PENDING); +- rc = pcmk_rc2legacy(stonith__result2rc(&result)); + + } else if (pcmk__str_eq(op, STONITH_OP_TIMEOUT_UPDATE, pcmk__str_none)) { + const char *call_id = crm_element_value(request, F_STONITH_CALLID); +@@ -3021,7 +3018,7 @@ handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, + + crm_element_value_int(request, F_STONITH_TIMEOUT, &op_timeout); + do_stonith_async_timeout_update(client_id, call_id, op_timeout); +- rc = pcmk_ok; ++ 
pcmk__set_result(&result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); + need_reply = false; + + } else if (pcmk__str_eq(op, STONITH_OP_QUERY, pcmk__str_none)) { +@@ -3033,7 +3030,7 @@ handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, + remove_relay_op(request); + + stonith_query(request, remote_peer, client_id, call_options); +- rc = pcmk_ok; ++ pcmk__set_result(&result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); + need_reply = false; + + } else if (pcmk__str_eq(op, T_STONITH_NOTIFY, pcmk__str_none)) { +@@ -3055,7 +3052,7 @@ handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, + } + + pcmk__ipc_send_ack(client, id, flags, "ack", CRM_EX_OK); +- rc = pcmk_ok; ++ pcmk__set_result(&result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); + need_reply = false; + + } else if (pcmk__str_eq(op, STONITH_OP_RELAY, pcmk__str_none)) { +@@ -3069,27 +3066,27 @@ handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, + crm_element_value(dev, F_STONITH_TARGET)); + + if (initiate_remote_stonith_op(NULL, request, FALSE) == NULL) { +- rc = -EPROTO; ++ fenced_set_protocol_error(&result); + } else { +- rc = -EINPROGRESS; ++ pcmk__set_result(&result, CRM_EX_OK, PCMK_EXEC_PENDING, NULL); + need_reply = false; + } + + } else if (pcmk__str_eq(op, STONITH_OP_FENCE, pcmk__str_none)) { + if ((remote_peer != NULL) || stand_alone) { + fence_locally(request, &result); +- rc = pcmk_rc2legacy(stonith__result2rc(&result)); + + } else if (pcmk_is_set(call_options, st_opt_manual_ack)) { + switch (fenced_handle_manual_confirmation(client, request)) { + case pcmk_rc_ok: +- rc = pcmk_ok; ++ pcmk__set_result(&result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); + break; + case EINPROGRESS: +- rc = -EINPROGRESS; ++ pcmk__set_result(&result, CRM_EX_OK, PCMK_EXEC_PENDING, ++ NULL); + break; + default: +- rc = -EPROTO; ++ fenced_set_protocol_error(&result); + break; + } + +@@ -3100,17 +3097,15 @@ handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, + const char *action = crm_element_value(dev, 
F_STONITH_ACTION); + const char *device = crm_element_value(dev, F_STONITH_DEVICE); + +- if (client) { ++ if (client != NULL) { + int tolerance = 0; + + crm_notice("Client %s wants to fence (%s) %s using %s", + pcmk__client_name(client), action, + target, (device? device : "any device")); +- + crm_element_value_int(dev, F_STONITH_TOLERANCE, &tolerance); +- + if (stonith_check_fence_tolerance(tolerance, target, action)) { +- rc = pcmk_ok; ++ pcmk__set_result(&result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); + goto done; + } + +@@ -3143,24 +3138,24 @@ handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, + crm_xml_add(request, F_STONITH_REMOTE_OP_ID, op->id); + send_cluster_message(crm_get_peer(0, alternate_host), crm_msg_stonith_ng, request, + FALSE); +- rc = -EINPROGRESS; ++ pcmk__set_result(&result, CRM_EX_OK, PCMK_EXEC_PENDING, NULL); + + } else if (initiate_remote_stonith_op(client, request, FALSE) == NULL) { +- rc = -EPROTO; ++ fenced_set_protocol_error(&result); ++ + } else { +- rc = -EINPROGRESS; ++ pcmk__set_result(&result, CRM_EX_OK, PCMK_EXEC_PENDING, NULL); + } + } +- need_reply = (rc != -EINPROGRESS); ++ need_reply = (result.execution_status != PCMK_EXEC_PENDING); + + } else if (pcmk__str_eq(op, STONITH_OP_FENCE_HISTORY, pcmk__str_none)) { + stonith_fence_history(request, &data, remote_peer, call_options); +- rc = pcmk_ok; ++ pcmk__set_result(&result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); + if (pcmk_is_set(call_options, st_opt_discard_reply)) { + /* we don't expect answers to the broadcast + * we might have sent out + */ +- rc = pcmk_ok; + need_reply = false; + } + +@@ -3168,11 +3163,18 @@ handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, + const char *device_id = NULL; + + if (is_privileged(client, op)) { +- rc = stonith_device_register(request, &device_id, FALSE); ++ int rc = stonith_device_register(request, &device_id, FALSE); ++ ++ pcmk__set_result(&result, ++ ((rc == pcmk_ok)? 
CRM_EX_OK : CRM_EX_ERROR), ++ stonith__legacy2status(rc), ++ ((rc == pcmk_ok)? NULL : pcmk_strerror(rc))); + } else { +- rc = -EACCES; ++ pcmk__set_result(&result, CRM_EX_INSUFFICIENT_PRIV, ++ PCMK_EXEC_INVALID, ++ "Unprivileged users must register device via CIB"); + } +- do_stonith_notify_device(op, rc, device_id); ++ do_stonith_notify_device(op, pcmk_rc2legacy(stonith__result2rc(&result)), device_id); + + } else if (pcmk__str_eq(op, STONITH_OP_DEVICE_DEL, pcmk__str_none)) { + xmlNode *dev = get_xpath_object("//" F_STONITH_DEVICE, request, LOG_ERR); +@@ -3180,22 +3182,25 @@ handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, + + if (is_privileged(client, op)) { + stonith_device_remove(device_id, false); +- rc = pcmk_ok; ++ pcmk__set_result(&result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); + } else { +- rc = -EACCES; ++ pcmk__set_result(&result, CRM_EX_INSUFFICIENT_PRIV, ++ PCMK_EXEC_INVALID, ++ "Unprivileged users must delete device via CIB"); + } +- do_stonith_notify_device(op, rc, device_id); ++ do_stonith_notify_device(op, pcmk_rc2legacy(stonith__result2rc(&result)), device_id); + + } else if (pcmk__str_eq(op, STONITH_OP_LEVEL_ADD, pcmk__str_none)) { + char *device_id = NULL; + + if (is_privileged(client, op)) { + fenced_register_level(request, &device_id, &result); +- rc = pcmk_rc2legacy(stonith__result2rc(&result)); + } else { +- rc = -EACCES; ++ pcmk__set_result(&result, CRM_EX_INSUFFICIENT_PRIV, ++ PCMK_EXEC_INVALID, ++ "Unprivileged users must add level via CIB"); + } +- do_stonith_notify_level(op, rc, device_id); ++ do_stonith_notify_level(op, pcmk_rc2legacy(stonith__result2rc(&result)), device_id); + free(device_id); + + } else if (pcmk__str_eq(op, STONITH_OP_LEVEL_DEL, pcmk__str_none)) { +@@ -3203,11 +3208,12 @@ handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, + + if (is_privileged(client, op)) { + fenced_unregister_level(request, &device_id, &result); +- rc = pcmk_rc2legacy(stonith__result2rc(&result)); + } else { +- rc = 
-EACCES; ++ pcmk__set_result(&result, CRM_EX_INSUFFICIENT_PRIV, ++ PCMK_EXEC_INVALID, ++ "Unprivileged users must delete level via CIB"); + } +- do_stonith_notify_level(op, rc, device_id); ++ do_stonith_notify_level(op, pcmk_rc2legacy(stonith__result2rc(&result)), device_id); + + } else if(pcmk__str_eq(op, CRM_OP_RM_NODE_CACHE, pcmk__str_casei)) { + int node_id = 0; +@@ -3216,31 +3222,36 @@ handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, + crm_element_value_int(request, XML_ATTR_ID, &node_id); + name = crm_element_value(request, XML_ATTR_UNAME); + reap_crm_member(node_id, name); +- rc = pcmk_ok; ++ pcmk__set_result(&result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); + need_reply = false; + + } else { + crm_err("Unknown IPC request %s from %s %s", op, + ((client == NULL)? "peer" : "client"), + ((client == NULL)? remote_peer : pcmk__client_name(client))); ++ pcmk__set_result(&result, CRM_EX_PROTOCOL, PCMK_EXEC_INVALID, ++ "Unknown IPC request type (bug?)"); + } + + done: + // Reply if result is known + if (need_reply) { +- xmlNode *reply = stonith_construct_reply(request, result.action_stdout, data, rc); ++ xmlNode *reply = stonith_construct_reply(request, result.action_stdout, data, ++ pcmk_rc2legacy(stonith__result2rc(&result))); + + stonith_send_reply(reply, call_options, remote_peer, client_id); + free_xml(reply); + } + +- free_xml(data); +- +- crm_debug("Processed %s request from %s %s: %s (rc=%d)", ++ crm_debug("Processed %s request from %s %s: %s%s%s%s", + op, ((client == NULL)? "peer" : "client"), + ((client == NULL)? remote_peer : pcmk__client_name(client)), +- ((rc > 0)? "" : pcmk_strerror(rc)), rc); ++ pcmk_exec_status_str(result.execution_status), ++ (result.exit_reason == NULL)? "" : " (", ++ (result.exit_reason == NULL)? "" : result.exit_reason, ++ (result.exit_reason == NULL)? 
"" : ")"); + ++ free_xml(data); + pcmk__reset_result(&result); + } + +-- +2.27.0 + + +From 5e13199699a4e9279520b3668c072e3db49c9782 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Fri, 19 Nov 2021 15:10:36 -0600 +Subject: [PATCH 07/23] Feature: fencer: pass full result in replies to + requests + +Rename stonith_construct_reply() to fenced_construct_reply() for consistency, +make it take a full result as an argument rather than separate arguments for +legacy return code and output, and add the full result to the reply (along with +the legacy return code, for backward compatibility). + +This is used for peer query replies and some request replies (including replies +to local clients who requested fencing). Other replies, such as those built by +construct_async_reply(), are not affected by this commit. +--- + daemons/fenced/fenced_commands.c | 33 ++++++++++++++++++++++--------- + daemons/fenced/fenced_remote.c | 9 ++++++++- + daemons/fenced/pacemaker-fenced.h | 4 ++-- + 3 files changed, 34 insertions(+), 12 deletions(-) + +diff --git a/daemons/fenced/fenced_commands.c b/daemons/fenced/fenced_commands.c +index 996c18faaa..84f89e8daf 100644 +--- a/daemons/fenced/fenced_commands.c ++++ b/daemons/fenced/fenced_commands.c +@@ -2322,6 +2322,7 @@ stonith_query(xmlNode * msg, const char *remote_peer, const char *client_id, int + const char *target = NULL; + int timeout = 0; + xmlNode *dev = get_xpath_object("//@" F_STONITH_ACTION, msg, LOG_NEVER); ++ pcmk__action_result_t result = PCMK__UNKNOWN_RESULT; + + crm_element_value_int(msg, F_STONITH_TIMEOUT, &timeout); + if (dev) { +@@ -2338,7 +2339,8 @@ stonith_query(xmlNode * msg, const char *remote_peer, const char *client_id, int + crm_log_xml_debug(msg, "Query"); + query = calloc(1, sizeof(struct st_query_data)); + +- query->reply = stonith_construct_reply(msg, NULL, NULL, pcmk_ok); ++ pcmk__set_result(&result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); ++ query->reply = fenced_construct_reply(msg, NULL, &result); + 
query->remote_peer = remote_peer ? strdup(remote_peer) : NULL; + query->client_id = client_id ? strdup(client_id) : NULL; + query->target = target ? strdup(target) : NULL; +@@ -2729,8 +2731,23 @@ fence_locally(xmlNode *msg, pcmk__action_result_t *result) + pcmk__set_result(result, CRM_EX_OK, PCMK_EXEC_PENDING, NULL); + } + ++/*! ++ * \internal ++ * \brief Build an XML reply for a fencing operation ++ * ++ * \param[in] request Request that reply is for ++ * \param[in] data If not NULL, add to reply as call data ++ * \param[in] result Full result of fencing operation ++ * ++ * \return Newly created XML reply ++ * \note The caller is responsible for freeing the result. ++ * \note This has some overlap with construct_async_reply(), but that copies ++ * values from an async_command_t, whereas this one copies them from the ++ * request. ++ */ + xmlNode * +-stonith_construct_reply(xmlNode * request, const char *output, xmlNode * data, int rc) ++fenced_construct_reply(xmlNode *request, xmlNode *data, ++ pcmk__action_result_t *result) + { + xmlNode *reply = NULL; + +@@ -2738,8 +2755,7 @@ stonith_construct_reply(xmlNode * request, const char *output, xmlNode * data, i + + crm_xml_add(reply, "st_origin", __func__); + crm_xml_add(reply, F_TYPE, T_STONITH_NG); +- crm_xml_add(reply, F_STONITH_OUTPUT, output); +- crm_xml_add_int(reply, F_STONITH_RC, rc); ++ stonith__xe_set_result(reply, result); + + if (request == NULL) { + /* Most likely, this is the result of a stonith operation that was +@@ -2749,12 +2765,14 @@ stonith_construct_reply(xmlNode * request, const char *output, xmlNode * data, i + * @TODO Maybe synchronize this information at start-up? 
+ */ + crm_warn("Missing request information for client notifications for " +- "operation with result %d (initiated before we came up?)", rc); ++ "operation with result '%s' (initiated before we came up?)", ++ pcmk_exec_status_str(result->execution_status)); + + } else { + const char *name = NULL; + const char *value = NULL; + ++ // Attributes to copy from request to reply + const char *names[] = { + F_STONITH_OPERATION, + F_STONITH_CALLID, +@@ -2764,8 +2782,6 @@ stonith_construct_reply(xmlNode * request, const char *output, xmlNode * data, i + F_STONITH_CALLOPTS + }; + +- crm_trace("Creating a result reply with%s reply output (rc=%d)", +- (data? "" : "out"), rc); + for (int lpc = 0; lpc < PCMK__NELEM(names); lpc++) { + name = names[lpc]; + value = crm_element_value(request, name); +@@ -3236,8 +3252,7 @@ handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, + done: + // Reply if result is known + if (need_reply) { +- xmlNode *reply = stonith_construct_reply(request, result.action_stdout, data, +- pcmk_rc2legacy(stonith__result2rc(&result))); ++ xmlNode *reply = fenced_construct_reply(request, data, &result); + + stonith_send_reply(reply, call_options, remote_peer, client_id); + free_xml(reply); +diff --git a/daemons/fenced/fenced_remote.c b/daemons/fenced/fenced_remote.c +index 8feb401477..baa07d9e78 100644 +--- a/daemons/fenced/fenced_remote.c ++++ b/daemons/fenced/fenced_remote.c +@@ -415,7 +415,14 @@ handle_local_reply_and_notify(remote_fencing_op_t * op, xmlNode * data, int rc) + crm_xml_add(data, F_STONITH_TARGET, op->target); + crm_xml_add(data, F_STONITH_OPERATION, op->action); + +- reply = stonith_construct_reply(op->request, NULL, data, rc); ++ { ++ pcmk__action_result_t result = PCMK__UNKNOWN_RESULT; ++ ++ pcmk__set_result(&result, ++ ((rc == pcmk_ok)? 
CRM_EX_OK : CRM_EX_ERROR), ++ stonith__legacy2status(rc), NULL); ++ reply = fenced_construct_reply(op->request, data, &result); ++ } + crm_xml_add(reply, F_STONITH_DELEGATE, op->delegate); + + /* Send fencing OP reply to local client that initiated fencing */ +diff --git a/daemons/fenced/pacemaker-fenced.h b/daemons/fenced/pacemaker-fenced.h +index 0006e02e7d..d5f4bc79fd 100644 +--- a/daemons/fenced/pacemaker-fenced.h ++++ b/daemons/fenced/pacemaker-fenced.h +@@ -228,8 +228,8 @@ stonith_topology_t *find_topology_for_host(const char *host); + void do_local_reply(xmlNode * notify_src, const char *client_id, gboolean sync_reply, + gboolean from_peer); + +-xmlNode *stonith_construct_reply(xmlNode * request, const char *output, xmlNode * data, +- int rc); ++xmlNode *fenced_construct_reply(xmlNode *request, xmlNode *data, ++ pcmk__action_result_t *result); + + void + do_stonith_async_timeout_update(const char *client, const char *call_id, int timeout); +-- +2.27.0 + + +From b32aa252b321ff40c834d153cb23f8b3be471611 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Fri, 19 Nov 2021 15:43:20 -0600 +Subject: [PATCH 08/23] Log: fencer: grab and log full result when processing + peer fencing replies + +fenced_process_fencing_reply() now checks for the full result, instead of only +a legacy return code, in peer replies, and uses it in log messages. 
+--- + daemons/fenced/fenced_remote.c | 63 ++++++++++++++++++++-------------- + 1 file changed, 37 insertions(+), 26 deletions(-) + +diff --git a/daemons/fenced/fenced_remote.c b/daemons/fenced/fenced_remote.c +index baa07d9e78..c6369f0051 100644 +--- a/daemons/fenced/fenced_remote.c ++++ b/daemons/fenced/fenced_remote.c +@@ -2095,21 +2095,21 @@ process_remote_stonith_query(xmlNode * msg) + void + fenced_process_fencing_reply(xmlNode *msg) + { +- int rc = 0; + const char *id = NULL; + const char *device = NULL; + remote_fencing_op_t *op = NULL; + xmlNode *dev = get_xpath_object("//@" F_STONITH_REMOTE_OP_ID, msg, LOG_ERR); ++ pcmk__action_result_t result = PCMK__UNKNOWN_RESULT; + + CRM_CHECK(dev != NULL, return); + + id = crm_element_value(dev, F_STONITH_REMOTE_OP_ID); + CRM_CHECK(id != NULL, return); + +- dev = get_xpath_object("//@" F_STONITH_RC, msg, LOG_ERR); ++ dev = stonith__find_xe_with_result(msg); + CRM_CHECK(dev != NULL, return); + +- crm_element_value_int(dev, F_STONITH_RC, &rc); ++ stonith__xe_get_result(dev, &result); + + device = crm_element_value(dev, F_STONITH_DEVICE); + +@@ -2117,7 +2117,7 @@ fenced_process_fencing_reply(xmlNode *msg) + op = g_hash_table_lookup(stonith_remote_op_list, id); + } + +- if (op == NULL && rc == pcmk_ok) { ++ if ((op == NULL) && pcmk__result_ok(&result)) { + /* Record successful fencing operations */ + const char *client_id = crm_element_value(dev, F_STONITH_CLIENTID); + +@@ -2139,16 +2139,19 @@ fenced_process_fencing_reply(xmlNode *msg) + } + + if (pcmk__str_eq(crm_element_value(msg, F_SUBTYPE), "broadcast", pcmk__str_casei)) { +- crm_debug("Finalizing action '%s' targeting %s on behalf of %s@%s: %s " ++ crm_debug("Finalizing action '%s' targeting %s on behalf of %s@%s: %s%s%s%s " + CRM_XS " id=%.8s", + op->action, op->target, op->client_name, op->originator, +- pcmk_strerror(rc), op->id); +- if (rc == pcmk_ok) { ++ pcmk_exec_status_str(result.execution_status), ++ (result.exit_reason == NULL)? 
"" : " (", ++ (result.exit_reason == NULL)? "" : result.exit_reason, ++ (result.exit_reason == NULL)? "" : ")", op->id); ++ if (pcmk__result_ok(&result)) { + op->state = st_done; + } else { + op->state = st_failed; + } +- remote_op_done(op, msg, rc, FALSE); ++ remote_op_done(op, msg, pcmk_rc2legacy(stonith__result2rc(&result)), FALSE); + return; + } else if (!pcmk__str_eq(op->originator, stonith_our_uname, pcmk__str_casei)) { + /* If this isn't a remote level broadcast, and we are not the +@@ -2162,28 +2165,35 @@ fenced_process_fencing_reply(xmlNode *msg) + if (pcmk_is_set(op->call_options, st_opt_topology)) { + const char *device = crm_element_value(msg, F_STONITH_DEVICE); + +- crm_notice("Action '%s' targeting %s using %s on behalf of %s@%s: %s " +- CRM_XS " rc=%d", ++ crm_notice("Action '%s' targeting %s using %s on behalf of %s@%s: %s%s%s%s", + op->action, op->target, device, op->client_name, +- op->originator, pcmk_strerror(rc), rc); ++ op->originator, ++ pcmk_exec_status_str(result.execution_status), ++ (result.exit_reason == NULL)? "" : " (", ++ (result.exit_reason == NULL)? "" : result.exit_reason, ++ (result.exit_reason == NULL)? "" : ")"); + + /* We own the op, and it is complete. broadcast the result to all nodes + * and notify our local clients. */ + if (op->state == st_done) { +- remote_op_done(op, msg, rc, FALSE); ++ remote_op_done(op, msg, pcmk_rc2legacy(stonith__result2rc(&result)), FALSE); + return; + } + +- if ((op->phase == 2) && (rc != pcmk_ok)) { ++ if ((op->phase == 2) && !pcmk__result_ok(&result)) { + /* A remapped "on" failed, but the node was already turned off + * successfully, so ignore the error and continue. + */ +- crm_warn("Ignoring %s 'on' failure (exit code %d) targeting %s " +- "after successful 'off'", device, rc, op->target); +- rc = pcmk_ok; ++ crm_warn("Ignoring %s 'on' failure (%s%s%s) targeting %s " ++ "after successful 'off'", ++ device, pcmk_exec_status_str(result.execution_status), ++ (result.exit_reason == NULL)? 
"" : ": ", ++ (result.exit_reason == NULL)? "" : result.exit_reason, ++ op->target); ++ pcmk__set_result(&result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); + } + +- if (rc == pcmk_ok) { ++ if (pcmk__result_ok(&result)) { + /* An operation completed successfully. Try another device if + * necessary, otherwise mark the operation as done. */ + advance_topology_device_in_level(op, device, msg); +@@ -2193,29 +2203,30 @@ fenced_process_fencing_reply(xmlNode *msg) + * levels are available, mark this operation as failed and report results. */ + if (advance_topology_level(op, false) != pcmk_rc_ok) { + op->state = st_failed; +- remote_op_done(op, msg, rc, FALSE); ++ remote_op_done(op, msg, pcmk_rc2legacy(stonith__result2rc(&result)), FALSE); + return; + } + } +- } else if (rc == pcmk_ok && op->devices == NULL) { ++ } else if (pcmk__result_ok(&result) && (op->devices == NULL)) { + crm_trace("All done for %s", op->target); +- + op->state = st_done; +- remote_op_done(op, msg, rc, FALSE); ++ remote_op_done(op, msg, pcmk_rc2legacy(stonith__result2rc(&result)), FALSE); + return; +- } else if (rc == -ETIME && op->devices == NULL) { ++ } else if ((result.execution_status == PCMK_EXEC_TIMEOUT) ++ && (op->devices == NULL)) { + /* If the operation timed out don't bother retrying other peers. 
*/ + op->state = st_failed; +- remote_op_done(op, msg, rc, FALSE); ++ remote_op_done(op, msg, pcmk_rc2legacy(stonith__result2rc(&result)), FALSE); + return; + } else { + /* fall-through and attempt other fencing action using another peer */ + } + + /* Retry on failure */ +- crm_trace("Next for %s on behalf of %s@%s (rc was %d)", op->target, op->originator, +- op->client_name, rc); +- call_remote_stonith(op, NULL, rc); ++ crm_trace("Next for %s on behalf of %s@%s (result was: %s)", ++ op->target, op->originator, op->client_name, ++ pcmk_exec_status_str(result.execution_status)); ++ call_remote_stonith(op, NULL, pcmk_rc2legacy(stonith__result2rc(&result))); + } + + gboolean +-- +2.27.0 + + +From afb5706ac606a8ea883aa1597ee63d9891cc2e13 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Fri, 19 Nov 2021 15:56:30 -0600 +Subject: [PATCH 09/23] Refactor: fencer: pass full result of previous failed + action when initiating peer fencing + +Rename call_remote_stonith() to request_peer_fencing() for readability, and +make it take the full result of the previous failed action, rather than just +its legacy return code, as an argument. + +This does cause one change in behavior: if topology is in use, a previous +attempt failed, and no more peers have the appropriate device, then the +legacy return code returned will be -ENODEV rather than -EHOSTUNREACH. +These are treated similarly internally, and hopefully that will not cause +problems for external code. 
+--- + daemons/fenced/fenced_remote.c | 89 +++++++++++++++++++++++++--------- + 1 file changed, 67 insertions(+), 22 deletions(-) + +diff --git a/daemons/fenced/fenced_remote.c b/daemons/fenced/fenced_remote.c +index c6369f0051..31d5ee6e93 100644 +--- a/daemons/fenced/fenced_remote.c ++++ b/daemons/fenced/fenced_remote.c +@@ -76,12 +76,13 @@ typedef struct { + + GHashTable *stonith_remote_op_list = NULL; + +-void call_remote_stonith(remote_fencing_op_t *op, peer_device_info_t *peer, +- int rc); + static void remote_op_done(remote_fencing_op_t * op, xmlNode * data, int rc, int dup); + extern xmlNode *stonith_create_op(int call_id, const char *token, const char *op, xmlNode * data, + int call_options); + ++static void request_peer_fencing(remote_fencing_op_t *op, ++ peer_device_info_t *peer, ++ pcmk__action_result_t *result); + static void report_timeout_period(remote_fencing_op_t * op, int op_timeout); + static int get_op_total_timeout(const remote_fencing_op_t *op, + const peer_device_info_t *chosen_peer); +@@ -609,12 +610,16 @@ static gboolean + remote_op_timeout_one(gpointer userdata) + { + remote_fencing_op_t *op = userdata; ++ pcmk__action_result_t result = PCMK__UNKNOWN_RESULT; + + op->op_timer_one = 0; + + crm_notice("Peer's '%s' action targeting %s for client %s timed out " CRM_XS + " id=%.8s", op->action, op->target, op->client_name, op->id); +- call_remote_stonith(op, NULL, -ETIME); ++ pcmk__set_result(&result, CRM_EX_ERROR, PCMK_EXEC_TIMEOUT, NULL); ++ ++ // Try another device, if appropriate ++ request_peer_fencing(op, NULL, &result); + return FALSE; + } + +@@ -685,9 +690,13 @@ remote_op_query_timeout(gpointer data) + crm_debug("Operation %.8s targeting %s already in progress", + op->id, op->target); + } else if (op->query_results) { ++ // Result won't be used in this case, but we need to pass something ++ pcmk__action_result_t result = PCMK__UNKNOWN_RESULT; ++ ++ // Query succeeded, so attempt the actual fencing + crm_debug("Query %.8s targeting %s 
complete (state=%s)", + op->id, op->target, stonith_op_state_str(op->state)); +- call_remote_stonith(op, NULL, pcmk_ok); ++ request_peer_fencing(op, NULL, &result); + } else { + crm_debug("Query %.8s targeting %s timed out (state=%s)", + op->id, op->target, stonith_op_state_str(op->state)); +@@ -1533,6 +1542,10 @@ static void + advance_topology_device_in_level(remote_fencing_op_t *op, const char *device, + xmlNode *msg) + { ++ pcmk__action_result_t result = PCMK__UNKNOWN_RESULT; ++ ++ pcmk__set_result(&result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); ++ + /* Advance to the next device at this topology level, if any */ + if (op->devices) { + op->devices = op->devices->next; +@@ -1569,7 +1582,7 @@ advance_topology_device_in_level(remote_fencing_op_t *op, const char *device, + op->delay = 0; + } + +- call_remote_stonith(op, NULL, pcmk_ok); ++ request_peer_fencing(op, NULL, &result); + } else { + /* We're done with all devices and phases, so finalize operation */ + crm_trace("Marking complex fencing op targeting %s as complete", +@@ -1598,15 +1611,30 @@ check_watchdog_fencing_and_wait(remote_fencing_op_t * op) + return FALSE; + } + +-void +-call_remote_stonith(remote_fencing_op_t *op, peer_device_info_t *peer, int rc) ++/*! 
++ * \internal ++ * \brief Ask a peer to execute a fencing operation ++ * ++ * \param[in] op Fencing operation to be executed ++ * \param[in] peer If NULL or topology is in use, choose best peer to execute ++ * the fencing, otherwise use this peer ++ * \param[in] result Full result of previous failed attempt, if any (used as ++ * final result only if a previous attempt failed, topology ++ * is not in use, and no devices remain to be attempted) ++ */ ++static void ++request_peer_fencing(remote_fencing_op_t *op, peer_device_info_t *peer, ++ pcmk__action_result_t *result) + { + const char *device = NULL; +- int timeout = op->base_timeout; ++ int timeout; ++ ++ CRM_CHECK(op != NULL, return); + + crm_trace("Action %.8s targeting %s for %s is %s", + op->id, op->target, op->client_name, + stonith_op_state_str(op->state)); ++ timeout = op->base_timeout; + if ((peer == NULL) && !pcmk_is_set(op->call_options, st_opt_topology)) { + peer = stonith_choose_peer(op); + } +@@ -1623,9 +1651,14 @@ call_remote_stonith(remote_fencing_op_t *op, peer_device_info_t *peer, int rc) + } + + if (pcmk_is_set(op->call_options, st_opt_topology) && op->devices) { +- /* Ignore any peer preference, they might not have the device we need */ +- /* When using topology, stonith_choose_peer() removes the device from +- * further consideration, so be sure to calculate timeout beforehand */ ++ /* Ignore the caller's peer preference if topology is in use, because ++ * that peer might not have access to the required device. With ++ * topology, stonith_choose_peer() removes the device from further ++ * consideration, so the timeout must be calculated beforehand. ++ * ++ * @TODO Basing the total timeout on the caller's preferred peer (above) ++ * is less than ideal. 
++ */ + peer = stonith_choose_peer(op); + + device = op->devices->data; +@@ -1722,8 +1755,6 @@ call_remote_stonith(remote_fencing_op_t *op, peer_device_info_t *peer, int rc) + finalize_timed_out_op(op); + + } else if(op->replies >= op->replies_expected || op->replies >= fencing_active_peers()) { +-// int rc = -EHOSTUNREACH; +- + /* if the operation never left the query state, + * but we have all the expected replies, then no devices + * are available to execute the fencing operation. */ +@@ -1735,17 +1766,28 @@ call_remote_stonith(remote_fencing_op_t *op, peer_device_info_t *peer, int rc) + } + } + ++ // This is the only case in which result will be used ++ CRM_CHECK(result != NULL, return); ++ + if (op->state == st_query) { + crm_info("No peers (out of %d) have devices capable of fencing " + "(%s) %s for client %s " CRM_XS " state=%s", + op->replies, op->action, op->target, op->client_name, + stonith_op_state_str(op->state)); + +- rc = -ENODEV; ++ pcmk__reset_result(result); ++ pcmk__set_result(result, CRM_EX_ERROR, PCMK_EXEC_NO_FENCE_DEVICE, ++ NULL); + } else { + if (pcmk_is_set(op->call_options, st_opt_topology)) { +- rc = -EHOSTUNREACH; +- } ++ pcmk__reset_result(result); ++ pcmk__set_result(result, CRM_EX_ERROR, ++ PCMK_EXEC_NO_FENCE_DEVICE, NULL); ++ } ++ /* ... else use result provided by caller -- overwriting it with ++ PCMK_EXEC_NO_FENCE_DEVICE would prevent remote_op_done() from ++ setting the correct delegate if needed. 
++ */ + + crm_info("No peers (out of %d) are capable of fencing (%s) %s " + "for client %s " CRM_XS " state=%s", +@@ -1754,7 +1796,7 @@ call_remote_stonith(remote_fencing_op_t *op, peer_device_info_t *peer, int rc) + } + + op->state = st_failed; +- remote_op_done(op, NULL, rc, FALSE); ++ remote_op_done(op, NULL, pcmk_rc2legacy(stonith__result2rc(result)), FALSE); + + } else { + crm_info("Waiting for additional peers capable of fencing (%s) %s%s%s " +@@ -2004,6 +2046,7 @@ process_remote_stonith_query(xmlNode * msg) + peer_device_info_t *peer = NULL; + uint32_t replies_expected; + xmlNode *dev = get_xpath_object("//@" F_STONITH_REMOTE_OP_ID, msg, LOG_ERR); ++ pcmk__action_result_t result = PCMK__UNKNOWN_RESULT; + + CRM_CHECK(dev != NULL, return -EPROTO); + +@@ -2038,6 +2081,8 @@ process_remote_stonith_query(xmlNode * msg) + peer = add_result(op, host, ndevices, dev); + } + ++ pcmk__set_result(&result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); ++ + if (pcmk_is_set(op->call_options, st_opt_topology)) { + /* If we start the fencing before all the topology results are in, + * it is possible fencing levels will be skipped because of the missing +@@ -2045,12 +2090,12 @@ process_remote_stonith_query(xmlNode * msg) + if (op->state == st_query && all_topology_devices_found(op)) { + /* All the query results are in for the topology, start the fencing ops. 
*/ + crm_trace("All topology devices found"); +- call_remote_stonith(op, peer, pcmk_ok); ++ request_peer_fencing(op, peer, &result); + + } else if (have_all_replies) { + crm_info("All topology query replies have arrived, continuing (%d expected/%d received) ", + replies_expected, op->replies); +- call_remote_stonith(op, NULL, pcmk_ok); ++ request_peer_fencing(op, NULL, &result); + } + + } else if (op->state == st_query) { +@@ -2062,12 +2107,12 @@ process_remote_stonith_query(xmlNode * msg) + /* we have a verified device living on a peer that is not the target */ + crm_trace("Found %d verified device%s", + nverified, pcmk__plural_s(nverified)); +- call_remote_stonith(op, peer, pcmk_ok); ++ request_peer_fencing(op, peer, &result); + + } else if (have_all_replies) { + crm_info("All query replies have arrived, continuing (%d expected/%d received) ", + replies_expected, op->replies); +- call_remote_stonith(op, NULL, pcmk_ok); ++ request_peer_fencing(op, NULL, &result); + + } else { + crm_trace("Waiting for more peer results before launching fencing operation"); +@@ -2226,7 +2271,7 @@ fenced_process_fencing_reply(xmlNode *msg) + crm_trace("Next for %s on behalf of %s@%s (result was: %s)", + op->target, op->originator, op->client_name, + pcmk_exec_status_str(result.execution_status)); +- call_remote_stonith(op, NULL, pcmk_rc2legacy(stonith__result2rc(&result))); ++ request_peer_fencing(op, NULL, &result); + } + + gboolean +-- +2.27.0 + + +From 43e08ba7ee1635e47bfaf2a57636101c675b89ae Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Fri, 19 Nov 2021 16:02:04 -0600 +Subject: [PATCH 10/23] Feature: fencer: set exit reason for timeouts waiting + for peer replies + +--- + daemons/fenced/fenced_remote.c | 4 +++- + 1 file changed, 3 insertions(+), 1 deletion(-) + +diff --git a/daemons/fenced/fenced_remote.c b/daemons/fenced/fenced_remote.c +index 31d5ee6e93..415a7c1b98 100644 +--- a/daemons/fenced/fenced_remote.c ++++ b/daemons/fenced/fenced_remote.c +@@ -616,7 +616,9 @@ 
remote_op_timeout_one(gpointer userdata) + + crm_notice("Peer's '%s' action targeting %s for client %s timed out " CRM_XS + " id=%.8s", op->action, op->target, op->client_name, op->id); +- pcmk__set_result(&result, CRM_EX_ERROR, PCMK_EXEC_TIMEOUT, NULL); ++ pcmk__set_result(&result, CRM_EX_ERROR, PCMK_EXEC_TIMEOUT, ++ "Peer did not send fence result within timeout"); ++ + + // Try another device, if appropriate + request_peer_fencing(op, NULL, &result); +-- +2.27.0 + + +From 34e5baebac78b7235825b31bebc44e3d65ae45cc Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Fri, 19 Nov 2021 16:10:28 -0600 +Subject: [PATCH 11/23] Refactor: fencer: pass full result when handling + duplicate actions + +Rename handle_duplicates() to finalize_op_duplicates() for readability, and +make it take a full result rather than a legacy return code as an argument. +--- + daemons/fenced/fenced_remote.c | 29 +++++++++++++++++++++-------- + 1 file changed, 21 insertions(+), 8 deletions(-) + +diff --git a/daemons/fenced/fenced_remote.c b/daemons/fenced/fenced_remote.c +index 415a7c1b98..850bfb6eb3 100644 +--- a/daemons/fenced/fenced_remote.c ++++ b/daemons/fenced/fenced_remote.c +@@ -439,12 +439,19 @@ handle_local_reply_and_notify(remote_fencing_op_t * op, xmlNode * data, int rc) + free_xml(notify_data); + } + ++/*! 
++ * \internal ++ * \brief Finalize all duplicates of a given fencer operation ++ * ++ * \param[in] op Fencer operation that completed ++ * \param[in] data Top-level XML to add notification to ++ * \param[in] result Full operation result ++ */ + static void +-handle_duplicates(remote_fencing_op_t * op, xmlNode * data, int rc) ++finalize_op_duplicates(remote_fencing_op_t *op, xmlNode *data, ++ pcmk__action_result_t *result) + { +- GList *iter = NULL; +- +- for (iter = op->duplicates; iter != NULL; iter = iter->next) { ++ for (GList *iter = op->duplicates; iter != NULL; iter = iter->next) { + remote_fencing_op_t *other = iter->data; + + if (other->state == st_duplicate) { +@@ -452,8 +459,9 @@ handle_duplicates(remote_fencing_op_t * op, xmlNode * data, int rc) + crm_debug("Performing duplicate notification for %s@%s: %s " + CRM_XS " id=%.8s", + other->client_name, other->originator, +- pcmk_strerror(rc), other->id); +- remote_op_done(other, data, rc, TRUE); ++ pcmk_exec_status_str(result->execution_status), ++ other->id); ++ remote_op_done(other, data, pcmk_rc2legacy(stonith__result2rc(result)), TRUE); + + } else { + // Possible if (for example) it timed out already +@@ -570,8 +578,13 @@ remote_op_done(remote_fencing_op_t * op, xmlNode * data, int rc, int dup) + + handle_local_reply_and_notify(op, data, rc); + +- if (dup == FALSE) { +- handle_duplicates(op, data, rc); ++ if (!dup) { ++ pcmk__action_result_t result = PCMK__UNKNOWN_RESULT; ++ ++ pcmk__set_result(&result, ++ ((rc == pcmk_ok)? CRM_EX_OK : CRM_EX_ERROR), ++ stonith__legacy2status(rc), NULL); ++ finalize_op_duplicates(op, data, &result); + } + + /* Free non-essential parts of the record +-- +2.27.0 + + +From 939bd6f5f0f79b19d0cc4d869f3c8980fda2e461 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Fri, 19 Nov 2021 16:23:20 -0600 +Subject: [PATCH 12/23] Feature: fencer: set exit reasons for fencing timeouts + +finalize_timed_out_op() now takes an exit reason as an argument. 
+It is called for fencing timeouts, peer query reply timeouts, +and all capable nodes failing to fence. + +At this point, the exit reason is not used, but that is planned. +--- + daemons/fenced/fenced_remote.c | 25 +++++++++++++++---------- + 1 file changed, 15 insertions(+), 10 deletions(-) + +diff --git a/daemons/fenced/fenced_remote.c b/daemons/fenced/fenced_remote.c +index 850bfb6eb3..c10a32442e 100644 +--- a/daemons/fenced/fenced_remote.c ++++ b/daemons/fenced/fenced_remote.c +@@ -643,10 +643,12 @@ remote_op_timeout_one(gpointer userdata) + * \brief Finalize a remote fencer operation that timed out + * + * \param[in] op Fencer operation that timed out ++ * \param[in] reason Readable description of what step timed out + */ + static void +-finalize_timed_out_op(remote_fencing_op_t *op) ++finalize_timed_out_op(remote_fencing_op_t *op, const char *reason) + { ++ pcmk__action_result_t result = PCMK__UNKNOWN_RESULT; + + op->op_timer_total = 0; + +@@ -660,13 +662,13 @@ finalize_timed_out_op(remote_fencing_op_t *op) + * devices, and return success. + */ + op->state = st_done; +- remote_op_done(op, NULL, pcmk_ok, FALSE); +- return; ++ pcmk__set_result(&result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); ++ } else { ++ op->state = st_failed; ++ pcmk__set_result(&result, CRM_EX_ERROR, PCMK_EXEC_TIMEOUT, reason); + } +- +- op->state = st_failed; +- +- remote_op_done(op, NULL, -ETIME, FALSE); ++ remote_op_done(op, NULL, pcmk_rc2legacy(stonith__result2rc(&result)), FALSE); ++ pcmk__reset_result(&result); + } + + /*! 
+@@ -687,7 +689,8 @@ remote_op_timeout(gpointer userdata) + CRM_XS " id=%.8s", + op->action, op->target, op->client_name, op->id); + } else { +- finalize_timed_out_op(userdata); ++ finalize_timed_out_op(userdata, "Fencing could not be completed " ++ "within overall timeout"); + } + return G_SOURCE_REMOVE; + } +@@ -719,7 +722,8 @@ remote_op_query_timeout(gpointer data) + g_source_remove(op->op_timer_total); + op->op_timer_total = 0; + } +- finalize_timed_out_op(op); ++ finalize_timed_out_op(op, "No capable peers replied to device query " ++ "within timeout"); + } + + return FALSE; +@@ -1767,7 +1771,8 @@ request_peer_fencing(remote_fencing_op_t *op, peer_device_info_t *peer, + CRM_XS " state=%s", op->action, op->target, op->client_name, + stonith_op_state_str(op->state)); + CRM_CHECK(op->state < st_done, return); +- finalize_timed_out_op(op); ++ finalize_timed_out_op(op, "All nodes failed, or are unable, to " ++ "fence target"); + + } else if(op->replies >= op->replies_expected || op->replies >= fencing_active_peers()) { + /* if the operation never left the query state, +-- +2.27.0 + + +From b80b02799260feb98723a460f2f8e8ad5cdc467f Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Fri, 19 Nov 2021 16:32:04 -0600 +Subject: [PATCH 13/23] Refactor: fencer: pass full result when finalizing peer + fencing actions + +Rename remote_op_done() to finalize_op() for readability, and make it take a +full result as an argument, rather than a legacy return code. + +This does cause one change in behavior: when all topology levels fail, +the legacy return code returned will be -pcmk_err_generic instead of EINVAL. 
+--- + daemons/fenced/fenced_history.c | 2 +- + daemons/fenced/fenced_remote.c | 177 ++++++++++++++++++-------------- + 2 files changed, 103 insertions(+), 76 deletions(-) + +diff --git a/daemons/fenced/fenced_history.c b/daemons/fenced/fenced_history.c +index bc159383c2..9e38ff0a20 100644 +--- a/daemons/fenced/fenced_history.c ++++ b/daemons/fenced/fenced_history.c +@@ -374,7 +374,7 @@ stonith_local_history_diff_and_merge(GHashTable *remote_history, + set_fencing_completed(op); + /* use -EHOSTUNREACH to not introduce a new return-code that might + trigger unexpected results at other places and to prevent +- remote_op_done from setting the delegate if not present ++ finalize_op from setting the delegate if not present + */ + stonith_bcast_result_to_peers(op, -EHOSTUNREACH, FALSE); + } +diff --git a/daemons/fenced/fenced_remote.c b/daemons/fenced/fenced_remote.c +index c10a32442e..aefc5f311c 100644 +--- a/daemons/fenced/fenced_remote.c ++++ b/daemons/fenced/fenced_remote.c +@@ -76,13 +76,14 @@ typedef struct { + + GHashTable *stonith_remote_op_list = NULL; + +-static void remote_op_done(remote_fencing_op_t * op, xmlNode * data, int rc, int dup); + extern xmlNode *stonith_create_op(int call_id, const char *token, const char *op, xmlNode * data, + int call_options); + + static void request_peer_fencing(remote_fencing_op_t *op, + peer_device_info_t *peer, + pcmk__action_result_t *result); ++static void finalize_op(remote_fencing_op_t *op, xmlNode *data, ++ pcmk__action_result_t *result, bool dup); + static void report_timeout_period(remote_fencing_op_t * op, int op_timeout); + static int get_op_total_timeout(const remote_fencing_op_t *op, + const peer_device_info_t *chosen_peer); +@@ -461,7 +462,7 @@ finalize_op_duplicates(remote_fencing_op_t *op, xmlNode *data, + other->client_name, other->originator, + pcmk_exec_status_str(result->execution_status), + other->id); +- remote_op_done(other, data, pcmk_rc2legacy(stonith__result2rc(result)), TRUE); ++ finalize_op(other, 
data, result, true); + + } else { + // Possible if (for example) it timed out already +@@ -487,104 +488,100 @@ delegate_from_xml(xmlNode *xml) + + /*! + * \internal +- * \brief Finalize a remote operation. ++ * \brief Finalize a peer fencing operation + * +- * \description This function has two code paths. ++ * Clean up after a fencing operation completes. This function has two code ++ * paths: the executioner uses it to broadcast the result to CPG peers, and then ++ * each peer (including the executioner) uses it to process that broadcast and ++ * notify its IPC clients of the result. + * +- * Path 1. This node is the owner of the operation and needs +- * to notify the cpg group via a broadcast as to the operation's +- * results. +- * +- * Path 2. The cpg broadcast is received. All nodes notify their local +- * stonith clients the operation results. +- * +- * So, The owner of the operation first notifies the cluster of the result, +- * and once that cpg notify is received back it notifies all the local clients. +- * +- * Nodes that are passive watchers of the operation will receive the +- * broadcast and only need to notify their local clients the operation finished. +- * +- * \param op, The fencing operation to finalize +- * \param data, The xml msg reply (if present) of the last delegated fencing +- * operation. +- * \param dup, Is this operation a duplicate, if so treat it a little differently +- * making sure the broadcast is not sent out. 
++ * \param[in] op Fencer operation that completed ++ * \param[in] data If not NULL, XML reply of last delegated fencing operation ++ * \param[in] result Full operation result ++ * \param[in] dup Whether this operation is a duplicate of another ++ * (in which case, do not broadcast the result) + */ + static void +-remote_op_done(remote_fencing_op_t * op, xmlNode * data, int rc, int dup) ++finalize_op(remote_fencing_op_t *op, xmlNode *data, ++ pcmk__action_result_t *result, bool dup) + { + int level = LOG_ERR; + const char *subt = NULL; + xmlNode *local_data = NULL; + gboolean op_merged = FALSE; + ++ CRM_CHECK((op != NULL) && (result != NULL), return); ++ ++ if (op->notify_sent) { ++ // Most likely, this is a timed-out action that eventually completed ++ crm_notice("Operation '%s'%s%s by %s for %s@%s%s: " ++ "Result arrived too late " CRM_XS " id=%.8s", ++ op->action, (op->target? " targeting " : ""), ++ (op->target? op->target : ""), ++ (op->delegate? op->delegate : "unknown node"), ++ op->client_name, op->originator, ++ (op_merged? " (merged)" : ""), ++ op->id); ++ return; ++ } ++ + set_fencing_completed(op); + clear_remote_op_timers(op); + undo_op_remap(op); + +- if (op->notify_sent == TRUE) { +- crm_err("Already sent notifications for '%s' targeting %s by %s for " +- "client %s@%s: %s " CRM_XS " rc=%d state=%s id=%.8s", +- op->action, op->target, +- (op->delegate? 
op->delegate : "unknown node"), +- op->client_name, op->originator, pcmk_strerror(rc), +- rc, stonith_op_state_str(op->state), op->id); +- goto remote_op_done_cleanup; +- } +- + if (data == NULL) { + data = create_xml_node(NULL, "remote-op"); + local_data = data; + + } else if (op->delegate == NULL) { +- switch (rc) { +- case -ENODEV: +- case -EHOSTUNREACH: ++ switch (result->execution_status) { ++ case PCMK_EXEC_NO_FENCE_DEVICE: + break; ++ case PCMK_EXEC_INVALID: ++ if (result->exit_status == CRM_EX_EXPIRED) { ++ break; ++ } ++ // else fall through + default: + op->delegate = delegate_from_xml(data); + break; + } + } + +- if(dup) { +- op_merged = TRUE; +- } else if (crm_element_value(data, F_STONITH_MERGED)) { +- op_merged = TRUE; +- } ++ if (dup || (crm_element_value(data, F_STONITH_MERGED) != NULL)) { ++ op_merged = true; ++ } + + /* Tell everyone the operation is done, we will continue + * with doing the local notifications once we receive + * the broadcast back. */ + subt = crm_element_value(data, F_SUBTYPE); +- if (dup == FALSE && !pcmk__str_eq(subt, "broadcast", pcmk__str_casei)) { ++ if (!dup && !pcmk__str_eq(subt, "broadcast", pcmk__str_casei)) { + /* Defer notification until the bcast message arrives */ +- stonith_bcast_result_to_peers(op, rc, (op_merged? TRUE: FALSE)); +- goto remote_op_done_cleanup; ++ stonith_bcast_result_to_peers(op, pcmk_rc2legacy(stonith__result2rc(result)), op_merged); ++ free_xml(local_data); ++ return; + } + +- if (rc == pcmk_ok || dup) { +- level = LOG_NOTICE; +- } else if (!pcmk__str_eq(op->originator, stonith_our_uname, pcmk__str_casei)) { ++ if (pcmk__result_ok(result) || dup ++ || !pcmk__str_eq(op->originator, stonith_our_uname, pcmk__str_casei)) { + level = LOG_NOTICE; + } +- +- do_crm_log(level, "Operation '%s'%s%s by %s for %s@%s%s: %s " ++ do_crm_log(level, "Operation '%s'%s%s by %s for %s@%s%s: %s (%s%s%s) " + CRM_XS " id=%.8s", op->action, (op->target? " targeting " : ""), + (op->target? 
op->target : ""), + (op->delegate? op->delegate : "unknown node"), + op->client_name, op->originator, +- (op_merged? " (merged)" : ""), pcmk_strerror(rc), op->id); ++ (op_merged? " (merged)" : ""), crm_exit_str(result->exit_status), ++ pcmk_exec_status_str(result->execution_status), ++ ((result->exit_reason == NULL)? "" : ": "), ++ ((result->exit_reason == NULL)? "" : result->exit_reason), ++ op->id); + +- handle_local_reply_and_notify(op, data, rc); ++ handle_local_reply_and_notify(op, data, pcmk_rc2legacy(stonith__result2rc(result))); + + if (!dup) { +- pcmk__action_result_t result = PCMK__UNKNOWN_RESULT; +- +- pcmk__set_result(&result, +- ((rc == pcmk_ok)? CRM_EX_OK : CRM_EX_ERROR), +- stonith__legacy2status(rc), NULL); +- finalize_op_duplicates(op, data, &result); ++ finalize_op_duplicates(op, data, result); + } + + /* Free non-essential parts of the record +@@ -594,20 +591,27 @@ remote_op_done(remote_fencing_op_t * op, xmlNode * data, int rc, int dup) + g_list_free_full(op->query_results, free_remote_query); + op->query_results = NULL; + } +- + if (op->request) { + free_xml(op->request); + op->request = NULL; + } + +- remote_op_done_cleanup: + free_xml(local_data); + } + ++/*! 
++ * \internal ++ * \brief Finalize a watchdog fencer op after the waiting time expires ++ * ++ * \param[in] userdata Fencer operation that completed ++ * ++ * \return G_SOURCE_REMOVE (which tells glib not to restart timer) ++ */ + static gboolean + remote_op_watchdog_done(gpointer userdata) + { + remote_fencing_op_t *op = userdata; ++ pcmk__action_result_t result = PCMK__UNKNOWN_RESULT; + + op->op_timer_one = 0; + +@@ -615,8 +619,9 @@ remote_op_watchdog_done(gpointer userdata) + CRM_XS " id=%.8s", + op->action, op->target, op->client_name, op->id); + op->state = st_done; +- remote_op_done(op, NULL, pcmk_ok, FALSE); +- return FALSE; ++ pcmk__set_result(&result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); ++ finalize_op(op, NULL, &result, false); ++ return G_SOURCE_REMOVE; + } + + static gboolean +@@ -667,7 +672,7 @@ finalize_timed_out_op(remote_fencing_op_t *op, const char *reason) + op->state = st_failed; + pcmk__set_result(&result, CRM_EX_ERROR, PCMK_EXEC_TIMEOUT, reason); + } +- remote_op_done(op, NULL, pcmk_rc2legacy(stonith__result2rc(&result)), FALSE); ++ finalize_op(op, NULL, &result, false); + pcmk__reset_result(&result); + } + +@@ -1064,9 +1069,13 @@ fenced_handle_manual_confirmation(pcmk__client_t *client, xmlNode *msg) + set_fencing_completed(op); + op->delegate = strdup("a human"); + +- // For the fencer's purposes, the fencing operation is done ++ { ++ // For the fencer's purposes, the fencing operation is done ++ pcmk__action_result_t result = PCMK__UNKNOWN_RESULT; + +- remote_op_done(op, msg, pcmk_ok, FALSE); ++ pcmk__set_result(&result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); ++ finalize_op(op, msg, &result, false); ++ } + + /* For the requester's purposes, the operation is still pending. The + * actual result will be sent asynchronously via the operation's done_cb(). +@@ -1200,6 +1209,16 @@ create_remote_stonith_op(const char *client, xmlNode * request, gboolean peer) + return op; + } + ++/*! 
++ * \internal ++ * \brief Create a peer fencing operation from a request, and initiate it ++ * ++ * \param[in] client IPC client that made request (NULL to get from request) ++ * \param[in] request Request XML ++ * \param[in] manual_ack Whether this is a manual action confirmation ++ * ++ * \return Newly created operation on success, otherwise NULL ++ */ + remote_fencing_op_t * + initiate_remote_stonith_op(pcmk__client_t *client, xmlNode *request, + gboolean manual_ack) +@@ -1234,9 +1253,17 @@ initiate_remote_stonith_op(pcmk__client_t *client, xmlNode *request, + + switch (op->state) { + case st_failed: +- crm_warn("Could not request peer fencing (%s) targeting %s " +- CRM_XS " id=%.8s", op->action, op->target, op->id); +- remote_op_done(op, NULL, -EINVAL, FALSE); ++ // advance_topology_level() exhausted levels ++ { ++ pcmk__action_result_t result = PCMK__UNKNOWN_RESULT; ++ ++ pcmk__set_result(&result, CRM_EX_ERROR, PCMK_EXEC_ERROR, ++ "All topology levels failed"); ++ crm_warn("Could not request peer fencing (%s) targeting %s " ++ CRM_XS " id=%.8s", op->action, op->target, op->id); ++ finalize_op(op, NULL, &result, false); ++ pcmk__reset_result(&result); ++ } + return op; + + case st_duplicate: +@@ -1607,7 +1634,7 @@ advance_topology_device_in_level(remote_fencing_op_t *op, const char *device, + crm_trace("Marking complex fencing op targeting %s as complete", + op->target); + op->state = st_done; +- remote_op_done(op, msg, pcmk_ok, FALSE); ++ finalize_op(op, msg, &result, false); + } + } + +@@ -1805,7 +1832,7 @@ request_peer_fencing(remote_fencing_op_t *op, peer_device_info_t *peer, + PCMK_EXEC_NO_FENCE_DEVICE, NULL); + } + /* ... else use result provided by caller -- overwriting it with +- PCMK_EXEC_NO_FENCE_DEVICE would prevent remote_op_done() from ++ PCMK_EXEC_NO_FENCE_DEVICE would prevent finalize_op() from + setting the correct delegate if needed. 
+ */ + +@@ -1816,7 +1843,7 @@ request_peer_fencing(remote_fencing_op_t *op, peer_device_info_t *peer, + } + + op->state = st_failed; +- remote_op_done(op, NULL, pcmk_rc2legacy(stonith__result2rc(result)), FALSE); ++ finalize_op(op, NULL, result, false); + + } else { + crm_info("Waiting for additional peers capable of fencing (%s) %s%s%s " +@@ -2216,7 +2243,7 @@ fenced_process_fencing_reply(xmlNode *msg) + } else { + op->state = st_failed; + } +- remote_op_done(op, msg, pcmk_rc2legacy(stonith__result2rc(&result)), FALSE); ++ finalize_op(op, msg, &result, false); + return; + } else if (!pcmk__str_eq(op->originator, stonith_our_uname, pcmk__str_casei)) { + /* If this isn't a remote level broadcast, and we are not the +@@ -2241,7 +2268,7 @@ fenced_process_fencing_reply(xmlNode *msg) + /* We own the op, and it is complete. broadcast the result to all nodes + * and notify our local clients. */ + if (op->state == st_done) { +- remote_op_done(op, msg, pcmk_rc2legacy(stonith__result2rc(&result)), FALSE); ++ finalize_op(op, msg, &result, false); + return; + } + +@@ -2268,20 +2295,20 @@ fenced_process_fencing_reply(xmlNode *msg) + * levels are available, mark this operation as failed and report results. */ + if (advance_topology_level(op, false) != pcmk_rc_ok) { + op->state = st_failed; +- remote_op_done(op, msg, pcmk_rc2legacy(stonith__result2rc(&result)), FALSE); ++ finalize_op(op, msg, &result, false); + return; + } + } + } else if (pcmk__result_ok(&result) && (op->devices == NULL)) { + crm_trace("All done for %s", op->target); + op->state = st_done; +- remote_op_done(op, msg, pcmk_rc2legacy(stonith__result2rc(&result)), FALSE); ++ finalize_op(op, msg, &result, false); + return; + } else if ((result.execution_status == PCMK_EXEC_TIMEOUT) + && (op->devices == NULL)) { + /* If the operation timed out don't bother retrying other peers. 
*/ + op->state = st_failed; +- remote_op_done(op, msg, pcmk_rc2legacy(stonith__result2rc(&result)), FALSE); ++ finalize_op(op, msg, &result, false); + return; + } else { + /* fall-through and attempt other fencing action using another peer */ +-- +2.27.0 + + +From 8f19c09f1b961ba9aa510b7dcd1875bbabcddcdc Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Fri, 19 Nov 2021 16:39:23 -0600 +Subject: [PATCH 14/23] Refactor: fencer: pass full result when broadcasting + replies + +Rename stonith_bcast_result_to_peers() to fenced_broadcast_op_result() for +consistency, and make it take the full result as an argument instead of a +legacy return code. The full result is not yet used, but that is planned. +--- + daemons/fenced/fenced_history.c | 18 ++++++++++++------ + daemons/fenced/fenced_remote.c | 15 ++++++++++++--- + daemons/fenced/pacemaker-fenced.h | 9 ++------- + 3 files changed, 26 insertions(+), 16 deletions(-) + +diff --git a/daemons/fenced/fenced_history.c b/daemons/fenced/fenced_history.c +index 9e38ff0a20..1e07a9815a 100644 +--- a/daemons/fenced/fenced_history.c ++++ b/daemons/fenced/fenced_history.c +@@ -359,24 +359,29 @@ stonith_local_history_diff_and_merge(GHashTable *remote_history, + } + + if (remote_history) { ++ pcmk__action_result_t result = PCMK__UNKNOWN_RESULT; ++ + init_stonith_remote_op_hash_table(&stonith_remote_op_list); + + updated |= g_hash_table_size(remote_history); + + g_hash_table_iter_init(&iter, remote_history); + while (g_hash_table_iter_next(&iter, NULL, (void **)&op)) { +- + if (stonith__op_state_pending(op->state) && + pcmk__str_eq(op->originator, stonith_our_uname, pcmk__str_casei)) { ++ + crm_warn("Failing pending operation %.8s originated by us but " + "known only from peer history", op->id); + op->state = st_failed; + set_fencing_completed(op); +- /* use -EHOSTUNREACH to not introduce a new return-code that might +- trigger unexpected results at other places and to prevent +- finalize_op from setting the delegate if not present +- 
*/ +- stonith_bcast_result_to_peers(op, -EHOSTUNREACH, FALSE); ++ ++ /* CRM_EX_EXPIRED + PCMK_EXEC_INVALID prevents finalize_op() ++ * from setting a delegate ++ */ ++ pcmk__set_result(&result, CRM_EX_EXPIRED, PCMK_EXEC_INVALID, ++ "Initiated by earlier fencer " ++ "process and presumed failed"); ++ fenced_broadcast_op_result(op, &result, false); + } + + g_hash_table_iter_steal(&iter); +@@ -391,6 +396,7 @@ stonith_local_history_diff_and_merge(GHashTable *remote_history, + */ + } + ++ pcmk__reset_result(&result); + g_hash_table_destroy(remote_history); /* remove what is left */ + } + +diff --git a/daemons/fenced/fenced_remote.c b/daemons/fenced/fenced_remote.c +index aefc5f311c..a0f026c790 100644 +--- a/daemons/fenced/fenced_remote.c ++++ b/daemons/fenced/fenced_remote.c +@@ -374,12 +374,21 @@ create_op_done_notify(remote_fencing_op_t * op, int rc) + return notify_data; + } + ++/*! ++ * \internal ++ * \brief Broadcast a fence result notification to all CPG peers ++ * ++ * \param[in] op Fencer operation that completed ++ * \param[in] result Full operation result ++ * \param[in] op_merged Whether this operation is a duplicate of another ++ */ + void +-stonith_bcast_result_to_peers(remote_fencing_op_t * op, int rc, gboolean op_merged) ++fenced_broadcast_op_result(remote_fencing_op_t *op, ++ pcmk__action_result_t *result, bool op_merged) + { + static int count = 0; + xmlNode *bcast = create_xml_node(NULL, T_STONITH_REPLY); +- xmlNode *notify_data = create_op_done_notify(op, rc); ++ xmlNode *notify_data = create_op_done_notify(op, pcmk_rc2legacy(stonith__result2rc(result))); + + count++; + crm_trace("Broadcasting result to peers"); +@@ -558,7 +567,7 @@ finalize_op(remote_fencing_op_t *op, xmlNode *data, + subt = crm_element_value(data, F_SUBTYPE); + if (!dup && !pcmk__str_eq(subt, "broadcast", pcmk__str_casei)) { + /* Defer notification until the bcast message arrives */ +- stonith_bcast_result_to_peers(op, pcmk_rc2legacy(stonith__result2rc(result)), op_merged); ++ 
fenced_broadcast_op_result(op, result, op_merged); + free_xml(local_data); + return; + } +diff --git a/daemons/fenced/pacemaker-fenced.h b/daemons/fenced/pacemaker-fenced.h +index d5f4bc79fd..ed47ab046c 100644 +--- a/daemons/fenced/pacemaker-fenced.h ++++ b/daemons/fenced/pacemaker-fenced.h +@@ -153,13 +153,8 @@ typedef struct remote_fencing_op_s { + + } remote_fencing_op_t; + +-/*! +- * \internal +- * \brief Broadcast the result of an operation to the peers. +- * \param op, Operation whose result should be broadcast +- * \param rc, Result of the operation +- */ +-void stonith_bcast_result_to_peers(remote_fencing_op_t * op, int rc, gboolean op_merged); ++void fenced_broadcast_op_result(remote_fencing_op_t *op, ++ pcmk__action_result_t *result, bool op_merged); + + // Fencer-specific client flags + enum st_client_flags { +-- +2.27.0 + + +From 3396e66b4c9cca895c7412b66159fd2342de1911 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Fri, 19 Nov 2021 16:42:46 -0600 +Subject: [PATCH 15/23] Feature: fencer: add full result to local replies + +handle_local_reply_and_notify() now takes the full result as an argument +instead of a legacy return code, and adds it to the reply to the local +requester. It does not add it to notifications yet, but that is planned. +--- + daemons/fenced/fenced_remote.c | 26 ++++++++++++++------------ + 1 file changed, 14 insertions(+), 12 deletions(-) + +diff --git a/daemons/fenced/fenced_remote.c b/daemons/fenced/fenced_remote.c +index a0f026c790..329e06c444 100644 +--- a/daemons/fenced/fenced_remote.c ++++ b/daemons/fenced/fenced_remote.c +@@ -409,8 +409,17 @@ fenced_broadcast_op_result(remote_fencing_op_t *op, + return; + } + ++/*! 
++ * \internal ++ * \brief Reply to a local request originator and notify all subscribed clients ++ * ++ * \param[in] op Fencer operation that completed ++ * \param[in] data Top-level XML to add notification to ++ * \param[in] result Full operation result ++ */ + static void +-handle_local_reply_and_notify(remote_fencing_op_t * op, xmlNode * data, int rc) ++handle_local_reply_and_notify(remote_fencing_op_t *op, xmlNode *data, ++ pcmk__action_result_t *result) + { + xmlNode *notify_data = NULL; + xmlNode *reply = NULL; +@@ -421,26 +430,19 @@ handle_local_reply_and_notify(remote_fencing_op_t * op, xmlNode * data, int rc) + } + + /* Do notification with a clean data object */ +- notify_data = create_op_done_notify(op, rc); ++ notify_data = create_op_done_notify(op, pcmk_rc2legacy(stonith__result2rc(result))); + crm_xml_add_int(data, "state", op->state); + crm_xml_add(data, F_STONITH_TARGET, op->target); + crm_xml_add(data, F_STONITH_OPERATION, op->action); + +- { +- pcmk__action_result_t result = PCMK__UNKNOWN_RESULT; +- +- pcmk__set_result(&result, +- ((rc == pcmk_ok)? CRM_EX_OK : CRM_EX_ERROR), +- stonith__legacy2status(rc), NULL); +- reply = fenced_construct_reply(op->request, data, &result); +- } ++ reply = fenced_construct_reply(op->request, data, result); + crm_xml_add(reply, F_STONITH_DELEGATE, op->delegate); + + /* Send fencing OP reply to local client that initiated fencing */ + do_local_reply(reply, op->client_id, op->call_options & st_opt_sync_call, FALSE); + + /* bcast to all local clients that the fencing operation happend */ +- do_stonith_notify(T_STONITH_NOTIFY_FENCE, rc, notify_data); ++ do_stonith_notify(T_STONITH_NOTIFY_FENCE, pcmk_rc2legacy(stonith__result2rc(result)), notify_data); + do_stonith_notify(T_STONITH_NOTIFY_HISTORY, pcmk_ok, NULL); + + /* mark this op as having notify's already sent */ +@@ -587,7 +589,7 @@ finalize_op(remote_fencing_op_t *op, xmlNode *data, + ((result->exit_reason == NULL)? 
"" : result->exit_reason), + op->id); + +- handle_local_reply_and_notify(op, data, pcmk_rc2legacy(stonith__result2rc(result))); ++ handle_local_reply_and_notify(op, data, result); + + if (!dup) { + finalize_op_duplicates(op, data, result); +-- +2.27.0 + + +From 004583f3ef908cbd9dc6305597cb55d5ad22882c Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Fri, 19 Nov 2021 16:47:13 -0600 +Subject: [PATCH 16/23] Refactor: fencer: pass full result when sending device + notifications + +Rename do_stonith_notify_device() to fenced_send_device_notification() for +consistency, and make it take the full result as an argument rather than a +legacy return code. The full result is not used yet, but that is planned. +--- + daemons/fenced/fenced_commands.c | 4 ++-- + daemons/fenced/pacemaker-fenced.c | 15 +++++++++++++-- + daemons/fenced/pacemaker-fenced.h | 4 +++- + 3 files changed, 18 insertions(+), 5 deletions(-) + +diff --git a/daemons/fenced/fenced_commands.c b/daemons/fenced/fenced_commands.c +index 84f89e8daf..86a761dfab 100644 +--- a/daemons/fenced/fenced_commands.c ++++ b/daemons/fenced/fenced_commands.c +@@ -3190,7 +3190,7 @@ handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, + PCMK_EXEC_INVALID, + "Unprivileged users must register device via CIB"); + } +- do_stonith_notify_device(op, pcmk_rc2legacy(stonith__result2rc(&result)), device_id); ++ fenced_send_device_notification(op, &result, device_id); + + } else if (pcmk__str_eq(op, STONITH_OP_DEVICE_DEL, pcmk__str_none)) { + xmlNode *dev = get_xpath_object("//" F_STONITH_DEVICE, request, LOG_ERR); +@@ -3204,7 +3204,7 @@ handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, + PCMK_EXEC_INVALID, + "Unprivileged users must delete device via CIB"); + } +- do_stonith_notify_device(op, pcmk_rc2legacy(stonith__result2rc(&result)), device_id); ++ fenced_send_device_notification(op, &result, device_id); + + } else if (pcmk__str_eq(op, STONITH_OP_LEVEL_ADD, pcmk__str_none)) { + char *device_id = NULL; 
+diff --git a/daemons/fenced/pacemaker-fenced.c b/daemons/fenced/pacemaker-fenced.c +index 56acc93f31..42e167ce78 100644 +--- a/daemons/fenced/pacemaker-fenced.c ++++ b/daemons/fenced/pacemaker-fenced.c +@@ -394,10 +394,21 @@ do_stonith_notify_config(const char *op, int rc, + free_xml(notify_data); + } + ++/*! ++ * \internal ++ * \brief Send notifications for a device change to subscribed clients ++ * ++ * \param[in] op Notification type (STONITH_OP_DEVICE_ADD or ++ * STONITH_OP_DEVICE_DEL) ++ * \param[in] result Operation result ++ * \param[in] desc ID of device that changed ++ */ + void +-do_stonith_notify_device(const char *op, int rc, const char *desc) ++fenced_send_device_notification(const char *op, ++ const pcmk__action_result_t *result, ++ const char *desc) + { +- do_stonith_notify_config(op, rc, desc, g_hash_table_size(device_list)); ++ do_stonith_notify_config(op, pcmk_rc2legacy(stonith__result2rc(result)), desc, g_hash_table_size(device_list)); + } + + void +diff --git a/daemons/fenced/pacemaker-fenced.h b/daemons/fenced/pacemaker-fenced.h +index ed47ab046c..0b63680171 100644 +--- a/daemons/fenced/pacemaker-fenced.h ++++ b/daemons/fenced/pacemaker-fenced.h +@@ -230,7 +230,9 @@ void + do_stonith_async_timeout_update(const char *client, const char *call_id, int timeout); + + void do_stonith_notify(const char *type, int result, xmlNode *data); +-void do_stonith_notify_device(const char *op, int rc, const char *desc); ++void fenced_send_device_notification(const char *op, ++ const pcmk__action_result_t *result, ++ const char *desc); + void do_stonith_notify_level(const char *op, int rc, const char *desc); + + remote_fencing_op_t *initiate_remote_stonith_op(pcmk__client_t *client, +-- +2.27.0 + + +From ee0777d5ca99d8d2d7805d4a73241ab696c68751 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Fri, 19 Nov 2021 16:51:55 -0600 +Subject: [PATCH 17/23] Refactor: fencer: pass full result when sending + topology notifications + +Rename do_stonith_notify_level() to 
fenced_send_level_notification() for +consistency, and make it take the full result as an argument rather than a +legacy return code. The full result is not used yet, but that is planned. +--- + daemons/fenced/fenced_commands.c | 4 ++-- + daemons/fenced/pacemaker-fenced.c | 21 +++++++++++++++------ + daemons/fenced/pacemaker-fenced.h | 4 +++- + 3 files changed, 20 insertions(+), 9 deletions(-) + +diff --git a/daemons/fenced/fenced_commands.c b/daemons/fenced/fenced_commands.c +index 86a761dfab..2f3dbb035a 100644 +--- a/daemons/fenced/fenced_commands.c ++++ b/daemons/fenced/fenced_commands.c +@@ -3216,7 +3216,7 @@ handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, + PCMK_EXEC_INVALID, + "Unprivileged users must add level via CIB"); + } +- do_stonith_notify_level(op, pcmk_rc2legacy(stonith__result2rc(&result)), device_id); ++ fenced_send_level_notification(op, &result, device_id); + free(device_id); + + } else if (pcmk__str_eq(op, STONITH_OP_LEVEL_DEL, pcmk__str_none)) { +@@ -3229,7 +3229,7 @@ handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, + PCMK_EXEC_INVALID, + "Unprivileged users must delete level via CIB"); + } +- do_stonith_notify_level(op, pcmk_rc2legacy(stonith__result2rc(&result)), device_id); ++ fenced_send_level_notification(op, &result, device_id); + + } else if(pcmk__str_eq(op, CRM_OP_RM_NODE_CACHE, pcmk__str_casei)) { + int node_id = 0; +diff --git a/daemons/fenced/pacemaker-fenced.c b/daemons/fenced/pacemaker-fenced.c +index 42e167ce78..773cf57f6b 100644 +--- a/daemons/fenced/pacemaker-fenced.c ++++ b/daemons/fenced/pacemaker-fenced.c +@@ -411,10 +411,21 @@ fenced_send_device_notification(const char *op, + do_stonith_notify_config(op, pcmk_rc2legacy(stonith__result2rc(result)), desc, g_hash_table_size(device_list)); + } + ++/*! 
++ * \internal ++ * \brief Send notifications for a topology level change to subscribed clients ++ * ++ * \param[in] op Notification type (STONITH_OP_LEVEL_ADD or ++ * STONITH_OP_LEVEL_DEL) ++ * \param[in] result Operation result ++ * \param[in] desc String representation of level (<target>[<level_index>]) ++ */ + void +-do_stonith_notify_level(const char *op, int rc, const char *desc) ++fenced_send_level_notification(const char *op, ++ const pcmk__action_result_t *result, ++ const char *desc) + { +- do_stonith_notify_config(op, rc, desc, g_hash_table_size(topology)); ++ do_stonith_notify_config(op, pcmk_rc2legacy(stonith__result2rc(result)), desc, g_hash_table_size(topology)); + } + + static void +@@ -429,8 +440,7 @@ topology_remove_helper(const char *node, int level) + crm_xml_add(data, XML_ATTR_STONITH_TARGET, node); + + fenced_unregister_level(data, &desc, &result); +- do_stonith_notify_level(STONITH_OP_LEVEL_DEL, +- pcmk_rc2legacy(stonith__result2rc(&result)), desc); ++ fenced_send_level_notification(STONITH_OP_LEVEL_DEL, &result, desc); + pcmk__reset_result(&result); + free_xml(data); + free(desc); +@@ -480,8 +490,7 @@ handle_topology_change(xmlNode *match, bool remove) + } + + fenced_register_level(match, &desc, &result); +- do_stonith_notify_level(STONITH_OP_LEVEL_ADD, +- pcmk_rc2legacy(stonith__result2rc(&result)), desc); ++ fenced_send_level_notification(STONITH_OP_LEVEL_ADD, &result, desc); + pcmk__reset_result(&result); + free(desc); + } +diff --git a/daemons/fenced/pacemaker-fenced.h b/daemons/fenced/pacemaker-fenced.h +index 0b63680171..8503e813bf 100644 +--- a/daemons/fenced/pacemaker-fenced.h ++++ b/daemons/fenced/pacemaker-fenced.h +@@ -233,7 +233,9 @@ void do_stonith_notify(const char *type, int result, xmlNode *data); + void fenced_send_device_notification(const char *op, + const pcmk__action_result_t *result, + const char *desc); +-void do_stonith_notify_level(const char *op, int rc, const char *desc); ++void fenced_send_level_notification(const char *op, ++ const 
pcmk__action_result_t *result, ++ const char *desc); + + remote_fencing_op_t *initiate_remote_stonith_op(pcmk__client_t *client, + xmlNode *request, +-- +2.27.0 + + +From deec1ea9bcd7e0062755aa8b74358bfd12e4b9f0 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Fri, 19 Nov 2021 16:53:26 -0600 +Subject: [PATCH 18/23] Refactor: fencer: pass full result when sending + configuration notifications + +Rename do_stonith_notify_config() to send_config_notification() for +consistency, and make it take the full result as an argument rather than a +legacy return code. The full result is not used yet, but that is planned. +--- + daemons/fenced/pacemaker-fenced.c | 19 +++++++++++++++---- + 1 file changed, 15 insertions(+), 4 deletions(-) + +diff --git a/daemons/fenced/pacemaker-fenced.c b/daemons/fenced/pacemaker-fenced.c +index 773cf57f6b..d64358e07f 100644 +--- a/daemons/fenced/pacemaker-fenced.c ++++ b/daemons/fenced/pacemaker-fenced.c +@@ -379,8 +379,19 @@ do_stonith_notify(const char *type, int result, xmlNode *data) + crm_trace("Notify complete"); + } + ++/*! 
++ * \internal ++ * \brief Send notifications for a configuration change to subscribed clients ++ * ++ * \param[in] op Notification type (STONITH_OP_DEVICE_ADD, ++ * STONITH_OP_DEVICE_DEL, STONITH_OP_LEVEL_ADD, or ++ * STONITH_OP_LEVEL_DEL) ++ * \param[in] result Operation result ++ * \param[in] desc Description of what changed ++ * \param[in] active Current number of devices or topologies in use ++ */ + static void +-do_stonith_notify_config(const char *op, int rc, ++send_config_notification(const char *op, const pcmk__action_result_t *result, + const char *desc, int active) + { + xmlNode *notify_data = create_xml_node(NULL, op); +@@ -390,7 +401,7 @@ do_stonith_notify_config(const char *op, int rc, + crm_xml_add(notify_data, F_STONITH_DEVICE, desc); + crm_xml_add_int(notify_data, F_STONITH_ACTIVE, active); + +- do_stonith_notify(op, rc, notify_data); ++ do_stonith_notify(op, pcmk_rc2legacy(stonith__result2rc(result)), notify_data); + free_xml(notify_data); + } + +@@ -408,7 +419,7 @@ fenced_send_device_notification(const char *op, + const pcmk__action_result_t *result, + const char *desc) + { +- do_stonith_notify_config(op, pcmk_rc2legacy(stonith__result2rc(result)), desc, g_hash_table_size(device_list)); ++ send_config_notification(op, result, desc, g_hash_table_size(device_list)); + } + + /*! 
+@@ -425,7 +436,7 @@ fenced_send_level_notification(const char *op, + const pcmk__action_result_t *result, + const char *desc) + { +- do_stonith_notify_config(op, pcmk_rc2legacy(stonith__result2rc(result)), desc, g_hash_table_size(topology)); ++ send_config_notification(op, result, desc, g_hash_table_size(topology)); + } + + static void +-- +2.27.0 + + +From 432e4445b630fb158482a5f6de1e0e41697a381f Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Fri, 19 Nov 2021 16:56:12 -0600 +Subject: [PATCH 19/23] Feature: fencer: pass full result when sending + notifications + +Rename do_stonith_notify() to fenced_send_notification() for consistency, and +make it take the full result as an argument rather than a legacy return code, +and add the full result to the notifications. +--- + daemons/fenced/fenced_commands.c | 4 ++-- + daemons/fenced/fenced_history.c | 6 +++--- + daemons/fenced/fenced_remote.c | 6 +++--- + daemons/fenced/pacemaker-fenced.c | 15 ++++++++++++--- + daemons/fenced/pacemaker-fenced.h | 4 +++- + 5 files changed, 23 insertions(+), 12 deletions(-) + +diff --git a/daemons/fenced/fenced_commands.c b/daemons/fenced/fenced_commands.c +index 2f3dbb035a..54ebc12947 100644 +--- a/daemons/fenced/fenced_commands.c ++++ b/daemons/fenced/fenced_commands.c +@@ -2489,8 +2489,8 @@ send_async_reply(async_command_t *cmd, const pcmk__action_result_t *result, + crm_xml_add(notify_data, F_STONITH_REMOTE_OP_ID, cmd->remote_op_id); + crm_xml_add(notify_data, F_STONITH_ORIGIN, cmd->client); + +- do_stonith_notify(T_STONITH_NOTIFY_FENCE, pcmk_rc2legacy(stonith__result2rc(result)), notify_data); +- do_stonith_notify(T_STONITH_NOTIFY_HISTORY, pcmk_ok, NULL); ++ fenced_send_notification(T_STONITH_NOTIFY_FENCE, result, notify_data); ++ fenced_send_notification(T_STONITH_NOTIFY_HISTORY, NULL, NULL); + } + } + +diff --git a/daemons/fenced/fenced_history.c b/daemons/fenced/fenced_history.c +index 1e07a9815a..44310ed77b 100644 +--- a/daemons/fenced/fenced_history.c ++++ 
b/daemons/fenced/fenced_history.c +@@ -100,7 +100,7 @@ stonith_fence_history_cleanup(const char *target, + g_hash_table_foreach_remove(stonith_remote_op_list, + stonith_remove_history_entry, + (gpointer) target); +- do_stonith_notify(T_STONITH_NOTIFY_HISTORY, pcmk_ok, NULL); ++ fenced_send_notification(T_STONITH_NOTIFY_HISTORY, NULL, NULL); + } + } + +@@ -402,7 +402,7 @@ stonith_local_history_diff_and_merge(GHashTable *remote_history, + + if (updated) { + stonith_fence_history_trim(); +- do_stonith_notify(T_STONITH_NOTIFY_HISTORY, pcmk_ok, NULL); ++ fenced_send_notification(T_STONITH_NOTIFY_HISTORY, NULL, NULL); + } + + if (cnt == 0) { +@@ -473,7 +473,7 @@ stonith_fence_history(xmlNode *msg, xmlNode **output, + is done so send a notification for anything + that smells like history-sync + */ +- do_stonith_notify(T_STONITH_NOTIFY_HISTORY_SYNCED, pcmk_ok, NULL); ++ fenced_send_notification(T_STONITH_NOTIFY_HISTORY_SYNCED, NULL, NULL); + if (crm_element_value(msg, F_STONITH_CALLID)) { + /* this is coming from the stonith-API + * +diff --git a/daemons/fenced/fenced_remote.c b/daemons/fenced/fenced_remote.c +index 329e06c444..16c181b4b0 100644 +--- a/daemons/fenced/fenced_remote.c ++++ b/daemons/fenced/fenced_remote.c +@@ -442,8 +442,8 @@ handle_local_reply_and_notify(remote_fencing_op_t *op, xmlNode *data, + do_local_reply(reply, op->client_id, op->call_options & st_opt_sync_call, FALSE); + + /* bcast to all local clients that the fencing operation happend */ +- do_stonith_notify(T_STONITH_NOTIFY_FENCE, pcmk_rc2legacy(stonith__result2rc(result)), notify_data); +- do_stonith_notify(T_STONITH_NOTIFY_HISTORY, pcmk_ok, NULL); ++ fenced_send_notification(T_STONITH_NOTIFY_FENCE, result, notify_data); ++ fenced_send_notification(T_STONITH_NOTIFY_HISTORY, NULL, NULL); + + /* mark this op as having notify's already sent */ + op->notify_sent = TRUE; +@@ -1211,7 +1211,7 @@ create_remote_stonith_op(const char *client, xmlNode * request, gboolean peer) + + if (op->state != 
st_duplicate) { + /* kick history readers */ +- do_stonith_notify(T_STONITH_NOTIFY_HISTORY, pcmk_ok, NULL); ++ fenced_send_notification(T_STONITH_NOTIFY_HISTORY, NULL, NULL); + } + + /* safe to trim as long as that doesn't touch pending ops */ +diff --git a/daemons/fenced/pacemaker-fenced.c b/daemons/fenced/pacemaker-fenced.c +index d64358e07f..6b31b814a3 100644 +--- a/daemons/fenced/pacemaker-fenced.c ++++ b/daemons/fenced/pacemaker-fenced.c +@@ -356,8 +356,17 @@ do_stonith_async_timeout_update(const char *client_id, const char *call_id, int + free_xml(notify_data); + } + ++/*! ++ * \internal ++ * \brief Notify relevant IPC clients of a fencing operation result ++ * ++ * \param[in] type Notification type ++ * \param[in] result Result of fencing operation (assume success if NULL) ++ * \param[in] data If not NULL, add to notification as call data ++ */ + void +-do_stonith_notify(const char *type, int result, xmlNode *data) ++fenced_send_notification(const char *type, const pcmk__action_result_t *result, ++ xmlNode *data) + { + /* TODO: Standardize the contents of data */ + xmlNode *update_msg = create_xml_node(NULL, "notify"); +@@ -367,7 +376,7 @@ do_stonith_notify(const char *type, int result, xmlNode *data) + crm_xml_add(update_msg, F_TYPE, T_STONITH_NOTIFY); + crm_xml_add(update_msg, F_SUBTYPE, type); + crm_xml_add(update_msg, F_STONITH_OPERATION, type); +- crm_xml_add_int(update_msg, F_STONITH_RC, result); ++ stonith__xe_set_result(update_msg, result); + + if (data != NULL) { + add_message_xml(update_msg, F_STONITH_CALLDATA, data); +@@ -401,7 +410,7 @@ send_config_notification(const char *op, const pcmk__action_result_t *result, + crm_xml_add(notify_data, F_STONITH_DEVICE, desc); + crm_xml_add_int(notify_data, F_STONITH_ACTIVE, active); + +- do_stonith_notify(op, pcmk_rc2legacy(stonith__result2rc(result)), notify_data); ++ fenced_send_notification(op, result, notify_data); + free_xml(notify_data); + } + +diff --git a/daemons/fenced/pacemaker-fenced.h 
b/daemons/fenced/pacemaker-fenced.h +index 8503e813bf..502fcc9a29 100644 +--- a/daemons/fenced/pacemaker-fenced.h ++++ b/daemons/fenced/pacemaker-fenced.h +@@ -229,7 +229,9 @@ xmlNode *fenced_construct_reply(xmlNode *request, xmlNode *data, + void + do_stonith_async_timeout_update(const char *client, const char *call_id, int timeout); + +-void do_stonith_notify(const char *type, int result, xmlNode *data); ++void fenced_send_notification(const char *type, ++ const pcmk__action_result_t *result, ++ xmlNode *data); + void fenced_send_device_notification(const char *op, + const pcmk__action_result_t *result, + const char *desc); +-- +2.27.0 + + +From 86deababe506c2bb8259538e5380b6a78dc4b770 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Fri, 19 Nov 2021 16:58:03 -0600 +Subject: [PATCH 20/23] Feature: fencer: pass full result when sending + notifications + +Rename create_op_done_notify() to fencing_result2xml() for readability, +make it take the full result as an argument rather than a legacy return code, +and add the full result to broadcasts and notifications. +--- + daemons/fenced/fenced_remote.c | 20 +++++++++++++++----- + 1 file changed, 15 insertions(+), 5 deletions(-) + +diff --git a/daemons/fenced/fenced_remote.c b/daemons/fenced/fenced_remote.c +index 16c181b4b0..4cf723e6df 100644 +--- a/daemons/fenced/fenced_remote.c ++++ b/daemons/fenced/fenced_remote.c +@@ -356,13 +356,22 @@ undo_op_remap(remote_fencing_op_t *op) + } + } + ++/*! ++ * \internal ++ * \brief Create notification data XML for a fencing operation result ++ * ++ * \param[in] op Fencer operation that completed ++ * \param[in] result Full operation result ++ * ++ * \return Newly created XML to add as notification data ++ * \note The caller is responsible for freeing the result. 
++ */ + static xmlNode * +-create_op_done_notify(remote_fencing_op_t * op, int rc) ++fencing_result2xml(remote_fencing_op_t *op, pcmk__action_result_t *result) + { + xmlNode *notify_data = create_xml_node(NULL, T_STONITH_NOTIFY_FENCE); + + crm_xml_add_int(notify_data, "state", op->state); +- crm_xml_add_int(notify_data, F_STONITH_RC, rc); + crm_xml_add(notify_data, F_STONITH_TARGET, op->target); + crm_xml_add(notify_data, F_STONITH_ACTION, op->action); + crm_xml_add(notify_data, F_STONITH_DELEGATE, op->delegate); +@@ -371,6 +380,7 @@ create_op_done_notify(remote_fencing_op_t * op, int rc) + crm_xml_add(notify_data, F_STONITH_CLIENTID, op->client_id); + crm_xml_add(notify_data, F_STONITH_CLIENTNAME, op->client_name); + ++ stonith__xe_set_result(notify_data, result); + return notify_data; + } + +@@ -388,7 +398,7 @@ fenced_broadcast_op_result(remote_fencing_op_t *op, + { + static int count = 0; + xmlNode *bcast = create_xml_node(NULL, T_STONITH_REPLY); +- xmlNode *notify_data = create_op_done_notify(op, pcmk_rc2legacy(stonith__result2rc(result))); ++ xmlNode *notify_data = fencing_result2xml(op, result); + + count++; + crm_trace("Broadcasting result to peers"); +@@ -430,7 +440,6 @@ handle_local_reply_and_notify(remote_fencing_op_t *op, xmlNode *data, + } + + /* Do notification with a clean data object */ +- notify_data = create_op_done_notify(op, pcmk_rc2legacy(stonith__result2rc(result))); + crm_xml_add_int(data, "state", op->state); + crm_xml_add(data, F_STONITH_TARGET, op->target); + crm_xml_add(data, F_STONITH_OPERATION, op->action); +@@ -442,13 +451,14 @@ handle_local_reply_and_notify(remote_fencing_op_t *op, xmlNode *data, + do_local_reply(reply, op->client_id, op->call_options & st_opt_sync_call, FALSE); + + /* bcast to all local clients that the fencing operation happend */ ++ notify_data = fencing_result2xml(op, result); + fenced_send_notification(T_STONITH_NOTIFY_FENCE, result, notify_data); ++ free_xml(notify_data); + 
fenced_send_notification(T_STONITH_NOTIFY_HISTORY, NULL, NULL); + + /* mark this op as having notify's already sent */ + op->notify_sent = TRUE; + free_xml(reply); +- free_xml(notify_data); + } + + /*! +-- +2.27.0 + + +From 2814cde97520b63ca5f9baf3df37d73507e89d34 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Wed, 15 Dec 2021 17:40:52 -0600 +Subject: [PATCH 21/23] Low: fencer: restore check for invalid topology level + target + +... per review. b7c7676c mistakenly dropped it +--- + daemons/fenced/fenced_commands.c | 12 +++++++++++- + 1 file changed, 11 insertions(+), 1 deletion(-) + +diff --git a/daemons/fenced/fenced_commands.c b/daemons/fenced/fenced_commands.c +index 54ebc12947..1a4a791385 100644 +--- a/daemons/fenced/fenced_commands.c ++++ b/daemons/fenced/fenced_commands.c +@@ -1636,6 +1636,16 @@ fenced_register_level(xmlNode *msg, char **desc, pcmk__action_result_t *result) + *desc = crm_strdup_printf("%s[%d]", target, id); + } + ++ // Ensure a valid target was specified ++ if ((mode < 0) || (mode > 2)) { ++ crm_warn("Ignoring topology level registration without valid target"); ++ free(target); ++ crm_log_xml_warn(level, "Bad level"); ++ pcmk__set_result(result, CRM_EX_INVALID_PARAM, PCMK_EXEC_INVALID, ++ "Invalid topology level target"); ++ return; ++ } ++ + // Ensure level ID is in allowed range + if ((id <= 0) || (id >= ST_LEVEL_MAX)) { + crm_warn("Ignoring topology registration for %s with invalid level %d", +@@ -1643,7 +1653,7 @@ fenced_register_level(xmlNode *msg, char **desc, pcmk__action_result_t *result) + free(target); + crm_log_xml_warn(level, "Bad level"); + pcmk__set_result(result, CRM_EX_INVALID_PARAM, PCMK_EXEC_INVALID, +- "Invalid topology level"); ++ "Invalid topology level number"); + return; + } + +-- +2.27.0 + + +From c82806f9e16abcea00025fd3a290477aef2d8d83 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Tue, 21 Dec 2021 16:23:29 -0600 +Subject: [PATCH 22/23] Low: fencer: free result memory when processing fencing + replies + 
+found in review +--- + daemons/fenced/fenced_remote.c | 24 +++++++++++++++--------- + 1 file changed, 15 insertions(+), 9 deletions(-) + +diff --git a/daemons/fenced/fenced_remote.c b/daemons/fenced/fenced_remote.c +index 4cf723e6df..9fda9ef060 100644 +--- a/daemons/fenced/fenced_remote.c ++++ b/daemons/fenced/fenced_remote.c +@@ -2241,14 +2241,14 @@ fenced_process_fencing_reply(xmlNode *msg) + /* Could be for an event that began before we started */ + /* TODO: Record the op for later querying */ + crm_info("Received peer result of unknown or expired operation %s", id); +- return; ++ goto done; + } + + if (op->devices && device && !pcmk__str_eq(op->devices->data, device, pcmk__str_casei)) { + crm_err("Received outdated reply for device %s (instead of %s) to " + "fence (%s) %s. Operation already timed out at peer level.", + device, (const char *) op->devices->data, op->action, op->target); +- return; ++ goto done; + } + + if (pcmk__str_eq(crm_element_value(msg, F_SUBTYPE), "broadcast", pcmk__str_casei)) { +@@ -2265,14 +2265,15 @@ fenced_process_fencing_reply(xmlNode *msg) + op->state = st_failed; + } + finalize_op(op, msg, &result, false); +- return; ++ goto done; ++ + } else if (!pcmk__str_eq(op->originator, stonith_our_uname, pcmk__str_casei)) { + /* If this isn't a remote level broadcast, and we are not the + * originator of the operation, we should not be receiving this msg. */ + crm_err("Received non-broadcast fencing result for operation %.8s " + "we do not own (device %s targeting %s)", + op->id, device, op->target); +- return; ++ goto done; + } + + if (pcmk_is_set(op->call_options, st_opt_topology)) { +@@ -2290,7 +2291,7 @@ fenced_process_fencing_reply(xmlNode *msg) + * and notify our local clients. 
*/ + if (op->state == st_done) { + finalize_op(op, msg, &result, false); +- return; ++ goto done; + } + + if ((op->phase == 2) && !pcmk__result_ok(&result)) { +@@ -2310,27 +2311,30 @@ fenced_process_fencing_reply(xmlNode *msg) + /* An operation completed successfully. Try another device if + * necessary, otherwise mark the operation as done. */ + advance_topology_device_in_level(op, device, msg); +- return; ++ goto done; + } else { + /* This device failed, time to try another topology level. If no other + * levels are available, mark this operation as failed and report results. */ + if (advance_topology_level(op, false) != pcmk_rc_ok) { + op->state = st_failed; + finalize_op(op, msg, &result, false); +- return; ++ goto done; + } + } ++ + } else if (pcmk__result_ok(&result) && (op->devices == NULL)) { + crm_trace("All done for %s", op->target); + op->state = st_done; + finalize_op(op, msg, &result, false); +- return; ++ goto done; ++ + } else if ((result.execution_status == PCMK_EXEC_TIMEOUT) + && (op->devices == NULL)) { + /* If the operation timed out don't bother retrying other peers. */ + op->state = st_failed; + finalize_op(op, msg, &result, false); +- return; ++ goto done; ++ + } else { + /* fall-through and attempt other fencing action using another peer */ + } +@@ -2340,6 +2344,8 @@ fenced_process_fencing_reply(xmlNode *msg) + op->target, op->originator, op->client_name, + pcmk_exec_status_str(result.execution_status)); + request_peer_fencing(op, NULL, &result); ++done: ++ pcmk__reset_result(&result); + } + + gboolean +-- +2.27.0 + + +From 137bf97fdb39043eebb02a0d3ebbe47ee8c7044c Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Tue, 21 Dec 2021 16:26:22 -0600 +Subject: [PATCH 23/23] Log: fencer: clarify timeout message + +... 
as suggested by review +--- + daemons/fenced/fenced_remote.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/daemons/fenced/fenced_remote.c b/daemons/fenced/fenced_remote.c +index 9fda9ef060..1e237150c5 100644 +--- a/daemons/fenced/fenced_remote.c ++++ b/daemons/fenced/fenced_remote.c +@@ -656,7 +656,7 @@ remote_op_timeout_one(gpointer userdata) + crm_notice("Peer's '%s' action targeting %s for client %s timed out " CRM_XS + " id=%.8s", op->action, op->target, op->client_name, op->id); + pcmk__set_result(&result, CRM_EX_ERROR, PCMK_EXEC_TIMEOUT, +- "Peer did not send fence result within timeout"); ++ "Peer did not return fence result within timeout"); + + + // Try another device, if appropriate +-- +2.27.0 + diff --git a/SOURCES/010-probe-failures.patch b/SOURCES/010-probe-failures.patch new file mode 100644 index 0000000..d90fc3c --- /dev/null +++ b/SOURCES/010-probe-failures.patch @@ -0,0 +1,4157 @@ +From f2e51898735b5e9990464141fc4aea3dd83f5067 Mon Sep 17 00:00:00 2001 +From: Chris Lumens +Date: Thu, 4 Nov 2021 14:36:41 -0400 +Subject: [PATCH 01/21] Refactor: scheduler: Use bool in unpack_rsc_op. + +Previously, we were using bool but TRUE/FALSE. Instead, use the actual +values. 
+--- + lib/pengine/unpack.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/lib/pengine/unpack.c b/lib/pengine/unpack.c +index b1e84110a2..ecc7275e15 100644 +--- a/lib/pengine/unpack.c ++++ b/lib/pengine/unpack.c +@@ -3671,7 +3671,7 @@ unpack_rsc_op(pe_resource_t *rsc, pe_node_t *node, xmlNode *xml_op, + const char *task = NULL; + const char *task_key = NULL; + const char *exit_reason = NULL; +- bool expired = FALSE; ++ bool expired = false; + pe_resource_t *parent = rsc; + enum action_fail_response failure_strategy = action_fail_recover; + +@@ -3727,7 +3727,7 @@ unpack_rsc_op(pe_resource_t *rsc, pe_node_t *node, xmlNode *xml_op, + + if ((status != PCMK_EXEC_NOT_INSTALLED) + && check_operation_expiry(rsc, node, rc, xml_op, data_set)) { +- expired = TRUE; ++ expired = true; + } + + if (!strcmp(task, CRMD_ACTION_STATUS)) { +-- +2.27.0 + + +From 4c961b8e670d336a368c7fd1535c247e40c6b48e Mon Sep 17 00:00:00 2001 +From: Chris Lumens +Date: Thu, 4 Nov 2021 15:07:01 -0400 +Subject: [PATCH 02/21] Refactor: scheduler: Add functions for determining if + an op is a probe. 
+ +--- + include/crm/common/util.h | 3 + + lib/common/operations.c | 21 +++++++ + lib/common/tests/operations/Makefile.am | 6 +- + .../tests/operations/pcmk_is_probe_test.c | 37 +++++++++++++ + .../tests/operations/pcmk_xe_is_probe_test.c | 55 +++++++++++++++++++ + lib/pengine/unpack.c | 12 ++-- + lib/pengine/utils.c | 5 +- + 7 files changed, 127 insertions(+), 12 deletions(-) + create mode 100644 lib/common/tests/operations/pcmk_is_probe_test.c + create mode 100644 lib/common/tests/operations/pcmk_xe_is_probe_test.c + +diff --git a/include/crm/common/util.h b/include/crm/common/util.h +index 2728b64492..fbea6e560c 100644 +--- a/include/crm/common/util.h ++++ b/include/crm/common/util.h +@@ -72,6 +72,9 @@ xmlNode *crm_create_op_xml(xmlNode *parent, const char *prefix, + const char *timeout); + #define CRM_DEFAULT_OP_TIMEOUT_S "20s" + ++bool pcmk_is_probe(const char *task, guint interval); ++bool pcmk_xe_is_probe(xmlNode *xml_op); ++ + int compare_version(const char *version1, const char *version2); + + /* coverity[+kill] */ +diff --git a/lib/common/operations.c b/lib/common/operations.c +index 366c189702..978df79082 100644 +--- a/lib/common/operations.c ++++ b/lib/common/operations.c +@@ -537,3 +537,24 @@ pcmk__is_fencing_action(const char *action) + { + return pcmk__str_any_of(action, "off", "reboot", "poweroff", NULL); + } ++ ++bool ++pcmk_is_probe(const char *task, guint interval) ++{ ++ if (task == NULL) { ++ return false; ++ } ++ ++ return (interval == 0) && pcmk__str_eq(task, CRMD_ACTION_STATUS, pcmk__str_none); ++} ++ ++bool ++pcmk_xe_is_probe(xmlNode *xml_op) ++{ ++ const char *task = crm_element_value(xml_op, XML_LRM_ATTR_TASK); ++ const char *interval_ms_s = crm_element_value(xml_op, XML_LRM_ATTR_INTERVAL_MS); ++ int interval_ms; ++ ++ pcmk__scan_min_int(interval_ms_s, &interval_ms, 0); ++ return pcmk_is_probe(task, interval_ms); ++} +diff --git a/lib/common/tests/operations/Makefile.am b/lib/common/tests/operations/Makefile.am +index 
c8814ff0a8..2e3d0b0679 100644 +--- a/lib/common/tests/operations/Makefile.am ++++ b/lib/common/tests/operations/Makefile.am +@@ -1,5 +1,5 @@ + # +-# Copyright 2020 the Pacemaker project contributors ++# Copyright 2020-2021 the Pacemaker project contributors + # + # The version control history for this file may have further details. + # +@@ -12,6 +12,8 @@ LDADD = $(top_builddir)/lib/common/libcrmcommon.la -lcmocka + include $(top_srcdir)/mk/tap.mk + + # Add "_test" to the end of all test program names to simplify .gitignore. +-check_PROGRAMS = parse_op_key_test ++check_PROGRAMS = parse_op_key_test \ ++ pcmk_is_probe_test \ ++ pcmk_xe_is_probe_test + + TESTS = $(check_PROGRAMS) +diff --git a/lib/common/tests/operations/pcmk_is_probe_test.c b/lib/common/tests/operations/pcmk_is_probe_test.c +new file mode 100644 +index 0000000000..9b449f1a70 +--- /dev/null ++++ b/lib/common/tests/operations/pcmk_is_probe_test.c +@@ -0,0 +1,37 @@ ++/* ++ * Copyright 2021 the Pacemaker project contributors ++ * ++ * The version control history for this file may have further details. ++ * ++ * This source code is licensed under the GNU Lesser General Public License ++ * version 2.1 or later (LGPLv2.1+) WITHOUT ANY WARRANTY. 
++ */ ++ ++#include ++ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++static void ++is_probe_test(void **state) ++{ ++ assert_false(pcmk_is_probe(NULL, 0)); ++ assert_false(pcmk_is_probe("", 0)); ++ assert_false(pcmk_is_probe("blahblah", 0)); ++ assert_false(pcmk_is_probe("monitor", 1)); ++ assert_true(pcmk_is_probe("monitor", 0)); ++} ++ ++int main(int argc, char **argv) ++{ ++ const struct CMUnitTest tests[] = { ++ cmocka_unit_test(is_probe_test), ++ }; ++ ++ cmocka_set_message_output(CM_OUTPUT_TAP); ++ return cmocka_run_group_tests(tests, NULL, NULL); ++} +diff --git a/lib/common/tests/operations/pcmk_xe_is_probe_test.c b/lib/common/tests/operations/pcmk_xe_is_probe_test.c +new file mode 100644 +index 0000000000..0283d1c145 +--- /dev/null ++++ b/lib/common/tests/operations/pcmk_xe_is_probe_test.c +@@ -0,0 +1,55 @@ ++/* ++ * Copyright 2021 the Pacemaker project contributors ++ * ++ * The version control history for this file may have further details. ++ * ++ * This source code is licensed under the GNU Lesser General Public License ++ * version 2.1 or later (LGPLv2.1+) WITHOUT ANY WARRANTY. 
++ */ ++ ++#include ++ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++static void ++op_is_probe_test(void **state) ++{ ++ xmlNode *node = NULL; ++ ++ assert_false(pcmk_xe_is_probe(NULL)); ++ ++ node = string2xml(""); ++ assert_false(pcmk_xe_is_probe(node)); ++ free_xml(node); ++ ++ node = string2xml(""); ++ assert_false(pcmk_xe_is_probe(node)); ++ free_xml(node); ++ ++ node = string2xml(""); ++ assert_false(pcmk_xe_is_probe(node)); ++ free_xml(node); ++ ++ node = string2xml(""); ++ assert_false(pcmk_xe_is_probe(node)); ++ free_xml(node); ++ ++ node = string2xml(""); ++ assert_true(pcmk_xe_is_probe(node)); ++ free_xml(node); ++} ++ ++int main(int argc, char **argv) ++{ ++ const struct CMUnitTest tests[] = { ++ cmocka_unit_test(op_is_probe_test), ++ }; ++ ++ cmocka_set_message_output(CM_OUTPUT_TAP); ++ return cmocka_run_group_tests(tests, NULL, NULL); ++} +diff --git a/lib/pengine/unpack.c b/lib/pengine/unpack.c +index ecc7275e15..7c0c66e696 100644 +--- a/lib/pengine/unpack.c ++++ b/lib/pengine/unpack.c +@@ -83,7 +83,6 @@ is_dangling_guest_node(pe_node_t *node) + return FALSE; + } + +- + /*! 
+ * \brief Schedule a fence action for a node + * +@@ -2984,7 +2983,6 @@ static void + unpack_rsc_op_failure(pe_resource_t * rsc, pe_node_t * node, int rc, xmlNode * xml_op, xmlNode ** last_failure, + enum action_fail_response * on_fail, pe_working_set_t * data_set) + { +- guint interval_ms = 0; + bool is_probe = false; + pe_action_t *action = NULL; + +@@ -2998,10 +2996,7 @@ unpack_rsc_op_failure(pe_resource_t * rsc, pe_node_t * node, int rc, xmlNode * x + + *last_failure = xml_op; + +- crm_element_value_ms(xml_op, XML_LRM_ATTR_INTERVAL_MS, &interval_ms); +- if ((interval_ms == 0) && !strcmp(task, CRMD_ACTION_STATUS)) { +- is_probe = true; +- } ++ is_probe = pcmk_xe_is_probe(xml_op); + + if (exit_reason == NULL) { + exit_reason = ""; +@@ -3163,8 +3158,9 @@ determine_op_status( + } + + crm_element_value_ms(xml_op, XML_LRM_ATTR_INTERVAL_MS, &interval_ms); +- if ((interval_ms == 0) && !strcmp(task, CRMD_ACTION_STATUS)) { +- is_probe = true; ++ is_probe = pcmk_xe_is_probe(xml_op); ++ ++ if (is_probe) { + task = "probe"; + } + +diff --git a/lib/pengine/utils.c b/lib/pengine/utils.c +index c5eda3898e..07753e173a 100644 +--- a/lib/pengine/utils.c ++++ b/lib/pengine/utils.c +@@ -1066,8 +1066,7 @@ unpack_operation(pe_action_t * action, xmlNode * xml_obj, pe_resource_t * contai + { + int timeout_ms = 0; + const char *value = NULL; +- bool is_probe = pcmk__str_eq(action->task, RSC_STATUS, pcmk__str_casei) +- && (interval_ms == 0); ++ bool is_probe = false; + #if ENABLE_VERSIONED_ATTRS + pe_rsc_action_details_t *rsc_details = NULL; + #endif +@@ -1094,6 +1093,8 @@ unpack_operation(pe_action_t * action, xmlNode * xml_obj, pe_resource_t * contai + + CRM_CHECK(action && action->rsc, return); + ++ is_probe = pcmk_is_probe(action->task, interval_ms); ++ + // Cluster-wide + pe__unpack_dataset_nvpairs(data_set->op_defaults, XML_TAG_META_SETS, &rule_data, + action->meta, NULL, FALSE, data_set); +-- +2.27.0 + + +From 09f32df97ab5064a15ba5a1fb3970d5c64ee7b30 Mon Sep 17 00:00:00 2001 
+From: Chris Lumens +Date: Fri, 19 Nov 2021 14:47:22 -0500 +Subject: [PATCH 03/21] Refactor: scheduler: Move setting interval_ms in + determine_op_status. + +This can now happen in the only place it's being used. +--- + lib/pengine/unpack.c | 9 ++++++--- + 1 file changed, 6 insertions(+), 3 deletions(-) + +diff --git a/lib/pengine/unpack.c b/lib/pengine/unpack.c +index 7c0c66e696..b9986d2462 100644 +--- a/lib/pengine/unpack.c ++++ b/lib/pengine/unpack.c +@@ -3142,7 +3142,6 @@ static int + determine_op_status( + pe_resource_t *rsc, int rc, int target_rc, pe_node_t * node, xmlNode * xml_op, enum action_fail_response * on_fail, pe_working_set_t * data_set) + { +- guint interval_ms = 0; + bool is_probe = false; + int result = PCMK_EXEC_DONE; + const char *key = get_op_key(xml_op); +@@ -3157,7 +3156,6 @@ determine_op_status( + exit_reason = ""; + } + +- crm_element_value_ms(xml_op, XML_LRM_ATTR_INTERVAL_MS, &interval_ms); + is_probe = pcmk_xe_is_probe(xml_op); + + if (is_probe) { +@@ -3230,12 +3228,17 @@ determine_op_status( + result = PCMK_EXEC_ERROR_FATAL; + break; + +- case PCMK_OCF_UNIMPLEMENT_FEATURE: ++ case PCMK_OCF_UNIMPLEMENT_FEATURE: { ++ guint interval_ms = 0; ++ crm_element_value_ms(xml_op, XML_LRM_ATTR_INTERVAL_MS, &interval_ms); ++ + if (interval_ms > 0) { + result = PCMK_EXEC_NOT_SUPPORTED; + break; + } + // fall through ++ } ++ + case PCMK_OCF_NOT_INSTALLED: + case PCMK_OCF_INVALID_PARAM: + case PCMK_OCF_INSUFFICIENT_PRIV: +-- +2.27.0 + + +From 6c8f47453afd6c100fddc45187faff17e15f7bfe Mon Sep 17 00:00:00 2001 +From: Chris Lumens +Date: Fri, 19 Nov 2021 14:57:57 -0500 +Subject: [PATCH 04/21] Refactor: scheduler: Add pcmk_xe_mask_failed_probe. + +Given an xmlNodePtr for a resource operation, this function will +determine whether it is a failed probe operation that should not be +displayed in crm_mon (or other places, I suppose) or not. 
+--- + include/crm/common/util.h | 1 + + lib/common/operations.c | 17 ++ + lib/common/tests/operations/Makefile.am | 3 +- + .../pcmk_xe_mask_probe_failure_test.c | 162 ++++++++++++++++++ + 4 files changed, 182 insertions(+), 1 deletion(-) + create mode 100644 lib/common/tests/operations/pcmk_xe_mask_probe_failure_test.c + +diff --git a/include/crm/common/util.h b/include/crm/common/util.h +index fbea6e560c..784069ba1b 100644 +--- a/include/crm/common/util.h ++++ b/include/crm/common/util.h +@@ -74,6 +74,7 @@ xmlNode *crm_create_op_xml(xmlNode *parent, const char *prefix, + + bool pcmk_is_probe(const char *task, guint interval); + bool pcmk_xe_is_probe(xmlNode *xml_op); ++bool pcmk_xe_mask_probe_failure(xmlNode *xml_op); + + int compare_version(const char *version1, const char *version2); + +diff --git a/lib/common/operations.c b/lib/common/operations.c +index 978df79082..54482b8863 100644 +--- a/lib/common/operations.c ++++ b/lib/common/operations.c +@@ -558,3 +558,20 @@ pcmk_xe_is_probe(xmlNode *xml_op) + pcmk__scan_min_int(interval_ms_s, &interval_ms, 0); + return pcmk_is_probe(task, interval_ms); + } ++ ++bool ++pcmk_xe_mask_probe_failure(xmlNode *xml_op) ++{ ++ int status = PCMK_EXEC_UNKNOWN; ++ int rc = PCMK_OCF_OK; ++ ++ if (!pcmk_xe_is_probe(xml_op)) { ++ return false; ++ } ++ ++ crm_element_value_int(xml_op, XML_LRM_ATTR_OPSTATUS, &status); ++ crm_element_value_int(xml_op, XML_LRM_ATTR_RC, &rc); ++ ++ return rc == PCMK_OCF_NOT_INSTALLED || rc == PCMK_OCF_INVALID_PARAM || ++ status == PCMK_EXEC_NOT_INSTALLED; ++} +diff --git a/lib/common/tests/operations/Makefile.am b/lib/common/tests/operations/Makefile.am +index 2e3d0b0679..457c5f7c7a 100644 +--- a/lib/common/tests/operations/Makefile.am ++++ b/lib/common/tests/operations/Makefile.am +@@ -14,6 +14,7 @@ include $(top_srcdir)/mk/tap.mk + # Add "_test" to the end of all test program names to simplify .gitignore. 
+ check_PROGRAMS = parse_op_key_test \ + pcmk_is_probe_test \ +- pcmk_xe_is_probe_test ++ pcmk_xe_is_probe_test \ ++ pcmk_xe_mask_probe_failure_test + + TESTS = $(check_PROGRAMS) +diff --git a/lib/common/tests/operations/pcmk_xe_mask_probe_failure_test.c b/lib/common/tests/operations/pcmk_xe_mask_probe_failure_test.c +new file mode 100644 +index 0000000000..a13f6d98f4 +--- /dev/null ++++ b/lib/common/tests/operations/pcmk_xe_mask_probe_failure_test.c +@@ -0,0 +1,162 @@ ++/* ++ * Copyright 2021 the Pacemaker project contributors ++ * ++ * The version control history for this file may have further details. ++ * ++ * This source code is licensed under the GNU Lesser General Public License ++ * version 2.1 or later (LGPLv2.1+) WITHOUT ANY WARRANTY. ++ */ ++ ++#include ++ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++static void ++op_is_not_probe_test(void **state) { ++ xmlNode *node = NULL; ++ ++ /* Not worth testing this thoroughly since it's just a duplicate of whether ++ * pcmk_xe_is_probe works or not.
++ */ ++ ++ node = string2xml(""); ++ assert_false(pcmk_xe_mask_probe_failure(node)); ++ free_xml(node); ++} ++ ++static void ++op_does_not_have_right_values_test(void **state) { ++ xmlNode *node = NULL; ++ ++ node = string2xml(""); ++ assert_false(pcmk_xe_mask_probe_failure(node)); ++ free_xml(node); ++ ++ node = string2xml(""); ++ assert_false(pcmk_xe_mask_probe_failure(node)); ++ free_xml(node); ++} ++ ++static void ++check_values_test(void **state) { ++ xmlNode *node = NULL; ++ ++ /* PCMK_EXEC_NOT_SUPPORTED */ ++ node = string2xml(""); ++ assert_false(pcmk_xe_mask_probe_failure(node)); ++ free_xml(node); ++ ++ node = string2xml(""); ++ assert_true(pcmk_xe_mask_probe_failure(node)); ++ free_xml(node); ++ ++ /* PCMK_EXEC_DONE */ ++ node = string2xml(""); ++ assert_false(pcmk_xe_mask_probe_failure(node)); ++ free_xml(node); ++ ++ node = string2xml(""); ++ assert_true(pcmk_xe_mask_probe_failure(node)); ++ free_xml(node); ++ ++ node = string2xml(""); ++ assert_true(pcmk_xe_mask_probe_failure(node)); ++ free_xml(node); ++ ++ node = string2xml(""); ++ assert_false(pcmk_xe_mask_probe_failure(node)); ++ free_xml(node); ++ ++ node = string2xml(""); ++ assert_false(pcmk_xe_mask_probe_failure(node)); ++ free_xml(node); ++ ++ /* PCMK_EXEC_NOT_INSTALLED */ ++ node = string2xml(""); ++ assert_true(pcmk_xe_mask_probe_failure(node)); ++ free_xml(node); ++ ++ node = string2xml(""); ++ assert_true(pcmk_xe_mask_probe_failure(node)); ++ free_xml(node); ++ ++ /* PCMK_EXEC_ERROR */ ++ node = string2xml(""); ++ assert_false(pcmk_xe_mask_probe_failure(node)); ++ free_xml(node); ++ ++ node = string2xml(""); ++ assert_true(pcmk_xe_mask_probe_failure(node)); ++ free_xml(node); ++ ++ node = string2xml(""); ++ assert_true(pcmk_xe_mask_probe_failure(node)); ++ free_xml(node); ++ ++ node = string2xml(""); ++ assert_false(pcmk_xe_mask_probe_failure(node)); ++ free_xml(node); ++ ++ node = string2xml(""); ++ assert_false(pcmk_xe_mask_probe_failure(node)); ++ free_xml(node); ++ ++ /* 
PCMK_EXEC_ERROR_HARD */ ++ node = string2xml(""); ++ assert_false(pcmk_xe_mask_probe_failure(node)); ++ free_xml(node); ++ ++ node = string2xml(""); ++ assert_true(pcmk_xe_mask_probe_failure(node)); ++ free_xml(node); ++ ++ node = string2xml(""); ++ assert_true(pcmk_xe_mask_probe_failure(node)); ++ free_xml(node); ++ ++ node = string2xml(""); ++ assert_false(pcmk_xe_mask_probe_failure(node)); ++ free_xml(node); ++ ++ node = string2xml(""); ++ assert_false(pcmk_xe_mask_probe_failure(node)); ++ free_xml(node); ++ ++ /* PCMK_EXEC_ERROR_FATAL */ ++ node = string2xml(""); ++ assert_false(pcmk_xe_mask_probe_failure(node)); ++ free_xml(node); ++ ++ node = string2xml(""); ++ assert_true(pcmk_xe_mask_probe_failure(node)); ++ free_xml(node); ++ ++ node = string2xml(""); ++ assert_true(pcmk_xe_mask_probe_failure(node)); ++ free_xml(node); ++ ++ node = string2xml(""); ++ assert_false(pcmk_xe_mask_probe_failure(node)); ++ free_xml(node); ++ ++ node = string2xml(""); ++ assert_false(pcmk_xe_mask_probe_failure(node)); ++ free_xml(node); ++} ++ ++int main(int argc, char **argv) ++{ ++ const struct CMUnitTest tests[] = { ++ cmocka_unit_test(op_is_not_probe_test), ++ cmocka_unit_test(op_does_not_have_right_values_test), ++ cmocka_unit_test(check_values_test), ++ }; ++ ++ cmocka_set_message_output(CM_OUTPUT_TAP); ++ return cmocka_run_group_tests(tests, NULL, NULL); ++} +-- +2.27.0 + + +From c9ce1aaf93cd20bb01e80102dda0ffffb07e6472 Mon Sep 17 00:00:00 2001 +From: Chris Lumens +Date: Wed, 1 Dec 2021 14:26:31 -0500 +Subject: [PATCH 05/21] Refactor: scheduler: Combine op status and rc remapping + into one function. + +Well, not quite. Doing the remapping is complicated enough to where it +makes sense to have them in separate functions. However, they can both +be called from a single new function that takes the place of the +previous two calls in unpack_rsc_op. 
+--- + lib/pengine/unpack.c | 157 ++++++++++++++++++++----------------------- + 1 file changed, 72 insertions(+), 85 deletions(-) + +diff --git a/lib/pengine/unpack.c b/lib/pengine/unpack.c +index b9986d2462..b659f319fb 100644 +--- a/lib/pengine/unpack.c ++++ b/lib/pengine/unpack.c +@@ -3121,36 +3121,68 @@ unpack_rsc_op_failure(pe_resource_t * rsc, pe_node_t * node, int rc, xmlNode * x + + /*! + * \internal +- * \brief Remap operation status based on action result ++ * \brief Remap informational monitor results and operation status + * +- * Given an action result, determine an appropriate operation status for the +- * purposes of responding to the action (the status provided by the executor is +- * not directly usable since the executor does not know what was expected). ++ * For the monitor results, certain OCF codes are for providing extended information ++ * to the user about services that aren't yet failed but not entirely healthy either. ++ * These must be treated as the "normal" result by Pacemaker. ++ * ++ * For operation status, the action result can be used to determine an appropriate ++ * status for the purposes of responding to the action. The status provided by the ++ * executor is not directly usable since the executor does not know what was expected. 
+ * ++ * \param[in] xml_op Operation history entry XML from CIB status + * \param[in,out] rsc Resource that operation history entry is for +- * \param[in] rc Actual return code of operation +- * \param[in] target_rc Expected return code of operation + * \param[in] node Node where operation was executed +- * \param[in] xml_op Operation history entry XML from CIB status +- * \param[in,out] on_fail What should be done about the result + * \param[in] data_set Current cluster working set ++ * \param[in,out] on_fail What should be done about the result ++ * \param[in] target_rc Expected return code of operation ++ * \param[in,out] rc Actual return code of operation ++ * \param[in,out] status Operation execution status ++ * ++ * \note If the result is remapped and the node is not shutting down or failed, ++ * the operation will be recorded in the data set's list of failed operations ++ * to highlight it for the user. + * +- * \return Operation status based on return code and action info + * \note This may update the resource's current and next role. 
+ */ +-static int +-determine_op_status( +- pe_resource_t *rsc, int rc, int target_rc, pe_node_t * node, xmlNode * xml_op, enum action_fail_response * on_fail, pe_working_set_t * data_set) +-{ ++static void ++remap_operation(xmlNode *xml_op, pe_resource_t *rsc, pe_node_t *node, ++ pe_working_set_t *data_set, enum action_fail_response *on_fail, ++ int target_rc, int *rc, int *status) { + bool is_probe = false; +- int result = PCMK_EXEC_DONE; +- const char *key = get_op_key(xml_op); + const char *task = crm_element_value(xml_op, XML_LRM_ATTR_TASK); ++ const char *key = get_op_key(xml_op); + const char *exit_reason = crm_element_value(xml_op, + XML_LRM_ATTR_EXIT_REASON); + ++ if (pcmk__str_eq(task, CRMD_ACTION_STATUS, pcmk__str_none)) { ++ int remapped_rc = pcmk__effective_rc(*rc); ++ ++ if (*rc != remapped_rc) { ++ crm_trace("Remapping monitor result %d to %d", *rc, remapped_rc); ++ if (!node->details->shutdown || node->details->online) { ++ record_failed_op(xml_op, node, rsc, data_set); ++ } ++ ++ *rc = remapped_rc; ++ } ++ } ++ ++ /* If the executor reported an operation status of anything but done or ++ * error, consider that final. But for done or error, we know better whether ++ * it should be treated as a failure or not, because we know the expected ++ * result. ++ */ ++ if (*status != PCMK_EXEC_DONE && *status != PCMK_EXEC_ERROR) { ++ return; ++ } ++ + CRM_ASSERT(rsc); +- CRM_CHECK(task != NULL, return PCMK_EXEC_ERROR); ++ CRM_CHECK(task != NULL, ++ *status = PCMK_EXEC_ERROR; return); ++ ++ *status = PCMK_EXEC_DONE; + + if (exit_reason == NULL) { + exit_reason = ""; +@@ -3171,23 +3203,23 @@ determine_op_status( + * those versions or processing of saved CIB files from those versions, + * so we do not need to care much about this case. 
+ */ +- result = PCMK_EXEC_ERROR; ++ *status = PCMK_EXEC_ERROR; + crm_warn("Expected result not found for %s on %s (corrupt or obsolete CIB?)", + key, node->details->uname); + +- } else if (target_rc != rc) { +- result = PCMK_EXEC_ERROR; ++ } else if (target_rc != *rc) { ++ *status = PCMK_EXEC_ERROR; + pe_rsc_debug(rsc, "%s on %s: expected %d (%s), got %d (%s%s%s)", + key, node->details->uname, + target_rc, services_ocf_exitcode_str(target_rc), +- rc, services_ocf_exitcode_str(rc), ++ *rc, services_ocf_exitcode_str(*rc), + (*exit_reason? ": " : ""), exit_reason); + } + +- switch (rc) { ++ switch (*rc) { + case PCMK_OCF_OK: + if (is_probe && (target_rc == PCMK_OCF_NOT_RUNNING)) { +- result = PCMK_EXEC_DONE; ++ *status = PCMK_EXEC_DONE; + pe_rsc_info(rsc, "Probe found %s active on %s at %s", + rsc->id, node->details->uname, + last_change_str(xml_op)); +@@ -3195,10 +3227,10 @@ determine_op_status( + break; + + case PCMK_OCF_NOT_RUNNING: +- if (is_probe || (target_rc == rc) ++ if (is_probe || (target_rc == *rc) + || !pcmk_is_set(rsc->flags, pe_rsc_managed)) { + +- result = PCMK_EXEC_DONE; ++ *status = PCMK_EXEC_DONE; + rsc->role = RSC_ROLE_STOPPED; + + /* clear any previous failure actions */ +@@ -3208,8 +3240,8 @@ determine_op_status( + break; + + case PCMK_OCF_RUNNING_PROMOTED: +- if (is_probe && (rc != target_rc)) { +- result = PCMK_EXEC_DONE; ++ if (is_probe && (*rc != target_rc)) { ++ *status = PCMK_EXEC_DONE; + pe_rsc_info(rsc, + "Probe found %s active and promoted on %s at %s", + rsc->id, node->details->uname, +@@ -3221,11 +3253,11 @@ determine_op_status( + case PCMK_OCF_DEGRADED_PROMOTED: + case PCMK_OCF_FAILED_PROMOTED: + rsc->role = RSC_ROLE_PROMOTED; +- result = PCMK_EXEC_ERROR; ++ *status = PCMK_EXEC_ERROR; + break; + + case PCMK_OCF_NOT_CONFIGURED: +- result = PCMK_EXEC_ERROR_FATAL; ++ *status = PCMK_EXEC_ERROR_FATAL; + break; + + case PCMK_OCF_UNIMPLEMENT_FEATURE: { +@@ -3233,7 +3265,7 @@ determine_op_status( + crm_element_value_ms(xml_op, 
XML_LRM_ATTR_INTERVAL_MS, &interval_ms); + + if (interval_ms > 0) { +- result = PCMK_EXEC_NOT_SUPPORTED; ++ *status = PCMK_EXEC_NOT_SUPPORTED; + break; + } + // fall through +@@ -3248,26 +3280,27 @@ determine_op_status( + pe_proc_err("No further recovery can be attempted for %s " + "because %s on %s failed (%s%s%s) at %s " + CRM_XS " rc=%d id=%s", rsc->id, task, +- node->details->uname, services_ocf_exitcode_str(rc), ++ node->details->uname, services_ocf_exitcode_str(*rc), + (*exit_reason? ": " : ""), exit_reason, +- last_change_str(xml_op), rc, ID(xml_op)); ++ last_change_str(xml_op), *rc, ID(xml_op)); + pe__clear_resource_flags(rsc, pe_rsc_managed); + pe__set_resource_flags(rsc, pe_rsc_block); + } +- result = PCMK_EXEC_ERROR_HARD; ++ *status = PCMK_EXEC_ERROR_HARD; + break; + + default: +- if (result == PCMK_EXEC_DONE) { ++ if (*status == PCMK_EXEC_DONE) { + crm_info("Treating unknown exit status %d from %s of %s " + "on %s at %s as failure", +- rc, task, rsc->id, node->details->uname, ++ *rc, task, rsc->id, node->details->uname, + last_change_str(xml_op)); +- result = PCMK_EXEC_ERROR; ++ *status = PCMK_EXEC_ERROR; + } + break; + } +- return result; ++ ++ pe_rsc_trace(rsc, "Remapped %s status to %d", key, *status); + } + + // return TRUE if start or monitor last failure but parameters changed +@@ -3622,41 +3655,6 @@ update_resource_state(pe_resource_t * rsc, pe_node_t * node, xmlNode * xml_op, c + } + } + +-/*! +- * \internal +- * \brief Remap informational monitor results to usual values +- * +- * Certain OCF result codes are for providing extended information to the +- * user about services that aren't yet failed but not entirely healthy either. +- * These must be treated as the "normal" result by Pacemaker. 
+- * +- * \param[in] rc Actual result of a monitor action +- * \param[in] xml_op Operation history XML +- * \param[in] node Node that operation happened on +- * \param[in] rsc Resource that operation happened to +- * \param[in] data_set Cluster working set +- * +- * \return Result code that pacemaker should use +- * +- * \note If the result is remapped, and the node is not shutting down or failed, +- * the operation will be recorded in the data set's list of failed +- * operations, to highlight it for the user. +- */ +-static int +-remap_monitor_rc(int rc, xmlNode *xml_op, const pe_node_t *node, +- const pe_resource_t *rsc, pe_working_set_t *data_set) +-{ +- int remapped_rc = pcmk__effective_rc(rc); +- +- if (rc != remapped_rc) { +- crm_trace("Remapping monitor result %d to %d", rc, remapped_rc); +- if (!node->details->shutdown || node->details->online) { +- record_failed_op(xml_op, node, rsc, data_set); +- } +- } +- return remapped_rc; +-} +- + static void + unpack_rsc_op(pe_resource_t *rsc, pe_node_t *node, xmlNode *xml_op, + xmlNode **last_failure, enum action_fail_response *on_fail, +@@ -3712,7 +3710,7 @@ unpack_rsc_op(pe_resource_t *rsc, pe_node_t *node, xmlNode *xml_op, + node->details->uname, rsc->id); + } + +- /* It should be possible to call remap_monitor_rc() first then call ++ /* It should be possible to call remap_operation() first then call + * check_operation_expiry() only if rc != target_rc, because there should + * never be a fail count without at least one unexpected result in the + * resource history. 
That would be more efficient by avoiding having to call +@@ -3729,9 +3727,8 @@ unpack_rsc_op(pe_resource_t *rsc, pe_node_t *node, xmlNode *xml_op, + expired = true; + } + +- if (!strcmp(task, CRMD_ACTION_STATUS)) { +- rc = remap_monitor_rc(rc, xml_op, node, rsc, data_set); +- } ++ remap_operation(xml_op, rsc, node, data_set, on_fail, target_rc, ++ &rc, &status); + + if (expired && (rc != target_rc)) { + const char *magic = crm_element_value(xml_op, XML_ATTR_TRANSITION_MAGIC); +@@ -3761,16 +3758,6 @@ unpack_rsc_op(pe_resource_t *rsc, pe_node_t *node, xmlNode *xml_op, + } + } + +- /* If the executor reported an operation status of anything but done or +- * error, consider that final. But for done or error, we know better whether +- * it should be treated as a failure or not, because we know the expected +- * result. +- */ +- if(status == PCMK_EXEC_DONE || status == PCMK_EXEC_ERROR) { +- status = determine_op_status(rsc, rc, target_rc, node, xml_op, on_fail, data_set); +- pe_rsc_trace(rsc, "Remapped %s status to %d", task_key, status); +- } +- + switch (status) { + case PCMK_EXEC_CANCELLED: + // Should never happen +-- +2.27.0 + + +From 9fdca1999872b3930cf18b7d807ddb259f23e8a5 Mon Sep 17 00:00:00 2001 +From: Chris Lumens +Date: Fri, 19 Nov 2021 15:08:16 -0500 +Subject: [PATCH 06/21] Test: cts-cli: Add test output for a native resource + with a failed probe op. + +There are no code changes yet to properly handle displaying these +operations, so the results here just reflect the current handling. 
+--- + cts/cli/crm_mon-partial.xml | 16 +++++++++++ + cts/cli/regression.crm_mon.exp | 50 ++++++++++++++++++++++++++-------- + 2 files changed, 55 insertions(+), 11 deletions(-) + +diff --git a/cts/cli/crm_mon-partial.xml b/cts/cli/crm_mon-partial.xml +index e6c6894b6f..b7817e4775 100644 +--- a/cts/cli/crm_mon-partial.xml ++++ b/cts/cli/crm_mon-partial.xml +@@ -60,6 +60,16 @@ + + + ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ + + + +@@ -94,6 +104,9 @@ + + + ++ ++ ++ + + + +@@ -135,6 +148,9 @@ + + + ++ ++ ++ + + + +diff --git a/cts/cli/regression.crm_mon.exp b/cts/cli/regression.crm_mon.exp +index 8714f917a9..d12dce3ae8 100644 +--- a/cts/cli/regression.crm_mon.exp ++++ b/cts/cli/regression.crm_mon.exp +@@ -3470,7 +3470,7 @@ Cluster Summary: + * Last updated: + * Last change: + * 4 nodes configured +- * 13 resource instances configured (1 DISABLED) ++ * 14 resource instances configured (1 DISABLED) + + Node List: + * Online: [ cluster01 cluster02 ] +@@ -3485,6 +3485,9 @@ Active Resources: + * httpd-bundle-1 (192.168.122.132) (ocf:heartbeat:apache): Stopped cluster01 + * Resource Group: partially-active-group (1 member inactive): + * dummy-1 (ocf:pacemaker:Dummy): Started cluster02 ++ ++Failed Resource Actions: ++ * smart-mon probe on cluster02 returned 'not installed' at Tue Nov 9 15:38:55 2021 after 33ms + =#=#=#= End test: Text output of partially active resources - OK (0) =#=#=#= + * Passed: crm_mon - Text output of partially active resources + =#=#=#= Begin test: XML output of partially active resources =#=#=#= +@@ -3495,7 +3498,7 @@ Active Resources: + + + +- ++ + + + +@@ -3548,6 +3551,7 @@ Active Resources: + + + ++ + + + +@@ -3574,6 +3578,9 @@ Active Resources: + + + ++ ++ ++ + + + +@@ -3603,6 +3610,9 @@ Active Resources: + + + ++ ++ ++ + + + =#=#=#= End test: XML output of partially active resources - OK (0) =#=#=#= +@@ -3614,7 +3624,7 @@ Cluster Summary: + * Last updated: + * Last change: + * 4 nodes configured +- * 13 resource instances configured (1 DISABLED) ++ * 14 
resource instances configured (1 DISABLED) + + Node List: + * Online: [ cluster01 cluster02 ] +@@ -3631,6 +3641,10 @@ Full List of Resources: + * Resource Group: partially-active-group: + * dummy-1 (ocf:pacemaker:Dummy): Started cluster02 + * dummy-2 (ocf:pacemaker:Dummy): Stopped (disabled) ++ * smart-mon (ocf:pacemaker:HealthSMART): Stopped ++ ++Failed Resource Actions: ++ * smart-mon probe on cluster02 returned 'not installed' at Tue Nov 9 15:38:55 2021 after 33ms + =#=#=#= End test: Text output of partially active resources, with inactive resources - OK (0) =#=#=#= + * Passed: crm_mon - Text output of partially active resources, with inactive resources + =#=#=#= Begin test: Complete brief text output, with inactive resources =#=#=#= +@@ -3640,13 +3654,14 @@ Cluster Summary: + * Last updated: + * Last change: + * 4 nodes configured +- * 13 resource instances configured (1 DISABLED) ++ * 14 resource instances configured (1 DISABLED) + + Node List: + * Online: [ cluster01 cluster02 ] + * GuestOnline: [ httpd-bundle-0@cluster02 httpd-bundle-1@cluster01 ] + + Full List of Resources: ++ * 0/1 (ocf:pacemaker:HealthSMART): Active + * 1/1 (stonith:fence_xvm): Active cluster01 + * Clone Set: ping-clone [ping]: + * Started: [ cluster01 ] +@@ -3676,6 +3691,8 @@ Operations: + * (3) monitor: interval="30000ms" + * dummy-1: migration-threshold=1000000: + * (2) start ++ * smart-mon: migration-threshold=1000000: ++ * (9) probe + * Node: cluster01: + * Fencing: migration-threshold=1000000: + * (15) start +@@ -3695,6 +3712,9 @@ Operations: + * Node: httpd-bundle-0@cluster02: + * httpd: migration-threshold=1000000: + * (1) start ++ ++Failed Resource Actions: ++ * smart-mon probe on cluster02 returned 'not installed' at Tue Nov 9 15:38:55 2021 after 33ms + =#=#=#= End test: Complete brief text output, with inactive resources - OK (0) =#=#=#= + * Passed: crm_mon - Complete brief text output, with inactive resources + =#=#=#= Begin test: Text output of partially active group =#=#=#= 
+@@ -3704,7 +3724,7 @@ Cluster Summary: + * Last updated: + * Last change: + * 4 nodes configured +- * 13 resource instances configured (1 DISABLED) ++ * 14 resource instances configured (1 DISABLED) + + Node List: + * Online: [ cluster01 cluster02 ] +@@ -3722,7 +3742,7 @@ Cluster Summary: + * Last updated: + * Last change: + * 4 nodes configured +- * 13 resource instances configured (1 DISABLED) ++ * 14 resource instances configured (1 DISABLED) + + Node List: + * Online: [ cluster01 cluster02 ] +@@ -3741,7 +3761,7 @@ Cluster Summary: + * Last updated: + * Last change: + * 4 nodes configured +- * 13 resource instances configured (1 DISABLED) ++ * 14 resource instances configured (1 DISABLED) + + Node List: + * Online: [ cluster01 cluster02 ] +@@ -3759,7 +3779,7 @@ Cluster Summary: + * Last updated: + * Last change: + * 4 nodes configured +- * 13 resource instances configured (1 DISABLED) ++ * 14 resource instances configured (1 DISABLED) + + Node List: + * Online: [ cluster01 cluster02 ] +@@ -3777,7 +3797,7 @@ Cluster Summary: + * Last updated: + * Last change: + * 4 nodes configured +- * 13 resource instances configured (1 DISABLED) ++ * 14 resource instances configured (1 DISABLED) + + Node List: + * Node cluster01: online: +@@ -3806,6 +3826,7 @@ Inactive Resources: + * httpd-bundle-1 (192.168.122.132) (ocf:heartbeat:apache): Stopped cluster01 + * Resource Group: partially-active-group: + * 1/2 (ocf:pacemaker:Dummy): Active cluster02 ++ * smart-mon (ocf:pacemaker:HealthSMART): Stopped + + Node Attributes: + * Node: cluster01: +@@ -3826,6 +3847,8 @@ Operations: + * (3) monitor: interval="30000ms" + * dummy-1: migration-threshold=1000000: + * (2) start ++ * smart-mon: migration-threshold=1000000: ++ * (9) probe + * Node: cluster01: + * Fencing: migration-threshold=1000000: + * (15) start +@@ -3845,6 +3868,9 @@ Operations: + * Node: httpd-bundle-0@cluster02: + * httpd: migration-threshold=1000000: + * (1) start ++ ++Failed Resource Actions: ++ * smart-mon probe on 
cluster02 returned 'not installed' at Tue Nov 9 15:38:55 2021 after 33ms + =#=#=#= End test: Complete brief text output grouped by node, with inactive resources - OK (0) =#=#=#= + * Passed: crm_mon - Complete brief text output grouped by node, with inactive resources + =#=#=#= Begin test: Text output of partially active resources, with inactive resources, filtered by node =#=#=#= +@@ -3854,7 +3880,7 @@ Cluster Summary: + * Last updated: + * Last change: + * 4 nodes configured +- * 13 resource instances configured (1 DISABLED) ++ * 14 resource instances configured (1 DISABLED) + + Node List: + * Online: [ cluster01 ] +@@ -3865,6 +3891,7 @@ Full List of Resources: + * Fencing (stonith:fence_xvm): Started cluster01 + * Container bundle set: httpd-bundle [pcmk:http]: + * httpd-bundle-1 (192.168.122.132) (ocf:heartbeat:apache): Stopped cluster01 ++ * smart-mon (ocf:pacemaker:HealthSMART): Stopped + =#=#=#= End test: Text output of partially active resources, with inactive resources, filtered by node - OK (0) =#=#=#= + * Passed: crm_mon - Text output of partially active resources, with inactive resources, filtered by node + =#=#=#= Begin test: Text output of partially active resources, filtered by node =#=#=#= +@@ -3875,7 +3902,7 @@ Full List of Resources: + + + +- ++ + + + +@@ -3905,6 +3932,7 @@ Full List of Resources: + + + ++ + + + +-- +2.27.0 + + +From 1c54d0bbb74d066d55a56eae28d1a579b8854604 Mon Sep 17 00:00:00 2001 +From: Chris Lumens +Date: Fri, 19 Nov 2021 15:17:52 -0500 +Subject: [PATCH 07/21] Test: cts-cli: Add test output for a cloned resource + with a failed probe op. + +There are no code changes yet to properly handle displaying these +operations, so the results here just reflect the current handling. 
+--- + cts/cli/crm_mon-partial.xml | 3 +++ + cts/cli/regression.crm_mon.exp | 12 ++++++++++++ + 2 files changed, 15 insertions(+) + +diff --git a/cts/cli/crm_mon-partial.xml b/cts/cli/crm_mon-partial.xml +index b7817e4775..1f9dc156aa 100644 +--- a/cts/cli/crm_mon-partial.xml ++++ b/cts/cli/crm_mon-partial.xml +@@ -107,6 +107,9 @@ + + + ++ ++ ++ + + + +diff --git a/cts/cli/regression.crm_mon.exp b/cts/cli/regression.crm_mon.exp +index d12dce3ae8..d093bd8106 100644 +--- a/cts/cli/regression.crm_mon.exp ++++ b/cts/cli/regression.crm_mon.exp +@@ -3488,6 +3488,7 @@ Active Resources: + + Failed Resource Actions: + * smart-mon probe on cluster02 returned 'not installed' at Tue Nov 9 15:38:55 2021 after 33ms ++ * ping probe on cluster02 returned 'not installed' at Thu Nov 18 13:11:42 2021 + =#=#=#= End test: Text output of partially active resources - OK (0) =#=#=#= + * Passed: crm_mon - Text output of partially active resources + =#=#=#= Begin test: XML output of partially active resources =#=#=#= +@@ -3581,6 +3582,9 @@ Failed Resource Actions: + + + ++ ++ ++ + + + +@@ -3612,6 +3616,7 @@ Failed Resource Actions: + + + ++ + + + +@@ -3645,6 +3650,7 @@ Full List of Resources: + + Failed Resource Actions: + * smart-mon probe on cluster02 returned 'not installed' at Tue Nov 9 15:38:55 2021 after 33ms ++ * ping probe on cluster02 returned 'not installed' at Thu Nov 18 13:11:42 2021 + =#=#=#= End test: Text output of partially active resources, with inactive resources - OK (0) =#=#=#= + * Passed: crm_mon - Text output of partially active resources, with inactive resources + =#=#=#= Begin test: Complete brief text output, with inactive resources =#=#=#= +@@ -3693,6 +3699,8 @@ Operations: + * (2) start + * smart-mon: migration-threshold=1000000: + * (9) probe ++ * ping: migration-threshold=1000000: ++ * (6) probe + * Node: cluster01: + * Fencing: migration-threshold=1000000: + * (15) start +@@ -3715,6 +3723,7 @@ Operations: + + Failed Resource Actions: + * smart-mon probe on 
cluster02 returned 'not installed' at Tue Nov 9 15:38:55 2021 after 33ms ++ * ping probe on cluster02 returned 'not installed' at Thu Nov 18 13:11:42 2021 + =#=#=#= End test: Complete brief text output, with inactive resources - OK (0) =#=#=#= + * Passed: crm_mon - Complete brief text output, with inactive resources + =#=#=#= Begin test: Text output of partially active group =#=#=#= +@@ -3849,6 +3858,8 @@ Operations: + * (2) start + * smart-mon: migration-threshold=1000000: + * (9) probe ++ * ping: migration-threshold=1000000: ++ * (6) probe + * Node: cluster01: + * Fencing: migration-threshold=1000000: + * (15) start +@@ -3871,6 +3882,7 @@ Operations: + + Failed Resource Actions: + * smart-mon probe on cluster02 returned 'not installed' at Tue Nov 9 15:38:55 2021 after 33ms ++ * ping probe on cluster02 returned 'not installed' at Thu Nov 18 13:11:42 2021 + =#=#=#= End test: Complete brief text output grouped by node, with inactive resources - OK (0) =#=#=#= + * Passed: crm_mon - Complete brief text output grouped by node, with inactive resources + =#=#=#= Begin test: Text output of partially active resources, with inactive resources, filtered by node =#=#=#= +-- +2.27.0 + + +From 9408f08c07eb531ff84b07bf959f3d681ebf2b78 Mon Sep 17 00:00:00 2001 +From: Chris Lumens +Date: Fri, 19 Nov 2021 15:48:16 -0500 +Subject: [PATCH 08/21] Test: cts-cli: Change the resources in + partially-active-group. + +dummy-2 is now not running because it failed to start due to an +unimplemented feature. I don't know what could possibly be +unimplemented about a dummy resource, but it's not important. + +There is also a new dummy-3 resource that acts exactly the same as +dummy-2. This preserves checking that the inactive member output can +still be displayed. + +There are no code changes yet to properly handle displaying these +operations, so the results here just reflect the current handling. 
+--- + cts/cli/crm_mon-partial.xml | 6 +++- + cts/cli/regression.crm_mon.exp | 62 +++++++++++++++++++++++----------- + 2 files changed, 47 insertions(+), 21 deletions(-) + +diff --git a/cts/cli/crm_mon-partial.xml b/cts/cli/crm_mon-partial.xml +index 1f9dc156aa..1ce80ea58a 100644 +--- a/cts/cli/crm_mon-partial.xml ++++ b/cts/cli/crm_mon-partial.xml +@@ -54,7 +54,8 @@ + + + +- ++ ++ + + + +@@ -104,6 +105,9 @@ + + + ++ ++ ++ + + + +diff --git a/cts/cli/regression.crm_mon.exp b/cts/cli/regression.crm_mon.exp +index d093bd8106..8cf3a1215e 100644 +--- a/cts/cli/regression.crm_mon.exp ++++ b/cts/cli/regression.crm_mon.exp +@@ -3470,7 +3470,7 @@ Cluster Summary: + * Last updated: + * Last change: + * 4 nodes configured +- * 14 resource instances configured (1 DISABLED) ++ * 15 resource instances configured (1 DISABLED) + + Node List: + * Online: [ cluster01 cluster02 ] +@@ -3485,8 +3485,10 @@ Active Resources: + * httpd-bundle-1 (192.168.122.132) (ocf:heartbeat:apache): Stopped cluster01 + * Resource Group: partially-active-group (1 member inactive): + * dummy-1 (ocf:pacemaker:Dummy): Started cluster02 ++ * dummy-2 (ocf:pacemaker:Dummy): FAILED cluster02 + + Failed Resource Actions: ++ * dummy-2 probe on cluster02 returned 'unimplemented feature' at Wed Sep 2 12:17:38 2020 after 33ms + * smart-mon probe on cluster02 returned 'not installed' at Tue Nov 9 15:38:55 2021 after 33ms + * ping probe on cluster02 returned 'not installed' at Thu Nov 18 13:11:42 2021 + =#=#=#= End test: Text output of partially active resources - OK (0) =#=#=#= +@@ -3499,12 +3501,12 @@ Failed Resource Actions: + + + +- ++ + + + + +- ++ + + + +@@ -3546,11 +3548,14 @@ Failed Resource Actions: + + + +- ++ + + + +- ++ ++ ++ ++ + + + +@@ -3579,6 +3584,9 @@ Failed Resource Actions: + + + ++ ++ ++ + + + +@@ -3615,6 +3623,7 @@ Failed Resource Actions: + + + ++ + + + +@@ -3629,7 +3638,7 @@ Cluster Summary: + * Last updated: + * Last change: + * 4 nodes configured +- * 14 resource instances configured (1 
DISABLED) ++ * 15 resource instances configured (1 DISABLED) + + Node List: + * Online: [ cluster01 cluster02 ] +@@ -3645,10 +3654,12 @@ Full List of Resources: + * httpd-bundle-1 (192.168.122.132) (ocf:heartbeat:apache): Stopped cluster01 + * Resource Group: partially-active-group: + * dummy-1 (ocf:pacemaker:Dummy): Started cluster02 +- * dummy-2 (ocf:pacemaker:Dummy): Stopped (disabled) ++ * dummy-2 (ocf:pacemaker:Dummy): FAILED cluster02 ++ * dummy-3 (ocf:pacemaker:Dummy): Stopped (disabled) + * smart-mon (ocf:pacemaker:HealthSMART): Stopped + + Failed Resource Actions: ++ * dummy-2 probe on cluster02 returned 'unimplemented feature' at Wed Sep 2 12:17:38 2020 after 33ms + * smart-mon probe on cluster02 returned 'not installed' at Tue Nov 9 15:38:55 2021 after 33ms + * ping probe on cluster02 returned 'not installed' at Thu Nov 18 13:11:42 2021 + =#=#=#= End test: Text output of partially active resources, with inactive resources - OK (0) =#=#=#= +@@ -3660,7 +3671,7 @@ Cluster Summary: + * Last updated: + * Last change: + * 4 nodes configured +- * 14 resource instances configured (1 DISABLED) ++ * 15 resource instances configured (1 DISABLED) + + Node List: + * Online: [ cluster01 cluster02 ] +@@ -3676,7 +3687,7 @@ Full List of Resources: + * httpd-bundle-0 (192.168.122.131) (ocf:heartbeat:apache): Started cluster02 + * httpd-bundle-1 (192.168.122.132) (ocf:heartbeat:apache): Stopped cluster01 + * Resource Group: partially-active-group: +- * 1/2 (ocf:pacemaker:Dummy): Active cluster02 ++ * 2/3 (ocf:pacemaker:Dummy): Active cluster02 + + Node Attributes: + * Node: cluster01: +@@ -3697,6 +3708,8 @@ Operations: + * (3) monitor: interval="30000ms" + * dummy-1: migration-threshold=1000000: + * (2) start ++ * dummy-2: migration-threshold=1000000: ++ * (2) probe + * smart-mon: migration-threshold=1000000: + * (9) probe + * ping: migration-threshold=1000000: +@@ -3722,6 +3735,7 @@ Operations: + * (1) start + + Failed Resource Actions: ++ * dummy-2 probe on cluster02 
returned 'unimplemented feature' at Wed Sep 2 12:17:38 2020 after 33ms + * smart-mon probe on cluster02 returned 'not installed' at Tue Nov 9 15:38:55 2021 after 33ms + * ping probe on cluster02 returned 'not installed' at Thu Nov 18 13:11:42 2021 + =#=#=#= End test: Complete brief text output, with inactive resources - OK (0) =#=#=#= +@@ -3733,7 +3747,7 @@ Cluster Summary: + * Last updated: + * Last change: + * 4 nodes configured +- * 14 resource instances configured (1 DISABLED) ++ * 15 resource instances configured (1 DISABLED) + + Node List: + * Online: [ cluster01 cluster02 ] +@@ -3742,6 +3756,7 @@ Node List: + Active Resources: + * Resource Group: partially-active-group (1 member inactive): + * dummy-1 (ocf:pacemaker:Dummy): Started cluster02 ++ * dummy-2 (ocf:pacemaker:Dummy): FAILED cluster02 + =#=#=#= End test: Text output of partially active group - OK (0) =#=#=#= + * Passed: crm_mon - Text output of partially active group + =#=#=#= Begin test: Text output of partially active group, with inactive resources =#=#=#= +@@ -3751,7 +3766,7 @@ Cluster Summary: + * Last updated: + * Last change: + * 4 nodes configured +- * 14 resource instances configured (1 DISABLED) ++ * 15 resource instances configured (1 DISABLED) + + Node List: + * Online: [ cluster01 cluster02 ] +@@ -3760,7 +3775,8 @@ Node List: + Full List of Resources: + * Resource Group: partially-active-group: + * dummy-1 (ocf:pacemaker:Dummy): Started cluster02 +- * dummy-2 (ocf:pacemaker:Dummy): Stopped (disabled) ++ * dummy-2 (ocf:pacemaker:Dummy): FAILED cluster02 ++ * dummy-3 (ocf:pacemaker:Dummy): Stopped (disabled) + =#=#=#= End test: Text output of partially active group, with inactive resources - OK (0) =#=#=#= + * Passed: crm_mon - Text output of partially active group, with inactive resources + =#=#=#= Begin test: Text output of active member of partially active group =#=#=#= +@@ -3770,7 +3786,7 @@ Cluster Summary: + * Last updated: + * Last change: + * 4 nodes configured +- * 14 resource 
instances configured (1 DISABLED) ++ * 15 resource instances configured (1 DISABLED) + + Node List: + * Online: [ cluster01 cluster02 ] +@@ -3788,7 +3804,7 @@ Cluster Summary: + * Last updated: + * Last change: + * 4 nodes configured +- * 14 resource instances configured (1 DISABLED) ++ * 15 resource instances configured (1 DISABLED) + + Node List: + * Online: [ cluster01 cluster02 ] +@@ -3796,7 +3812,10 @@ Node List: + + Active Resources: + * Resource Group: partially-active-group (1 member inactive): +- * dummy-2 (ocf:pacemaker:Dummy): Stopped (disabled) ++ * dummy-2 (ocf:pacemaker:Dummy): FAILED cluster02 ++ ++Failed Resource Actions: ++ * dummy-2 probe on cluster02 returned 'unimplemented feature' at Wed Sep 2 12:17:38 2020 after 33ms + =#=#=#= End test: Text output of inactive member of partially active group - OK (0) =#=#=#= + * Passed: crm_mon - Text output of inactive member of partially active group + =#=#=#= Begin test: Complete brief text output grouped by node, with inactive resources =#=#=#= +@@ -3806,7 +3825,7 @@ Cluster Summary: + * Last updated: + * Last change: + * 4 nodes configured +- * 14 resource instances configured (1 DISABLED) ++ * 15 resource instances configured (1 DISABLED) + + Node List: + * Node cluster01: online: +@@ -3820,7 +3839,7 @@ Node List: + * Resources: + * 1 (ocf:heartbeat:IPaddr2): Active + * 1 (ocf:heartbeat:docker): Active +- * 1 (ocf:pacemaker:Dummy): Active ++ * 2 (ocf:pacemaker:Dummy): Active + * 1 (ocf:pacemaker:remote): Active + * GuestNode httpd-bundle-0@cluster02: online: + * Resources: +@@ -3834,7 +3853,7 @@ Inactive Resources: + * httpd-bundle-0 (192.168.122.131) (ocf:heartbeat:apache): Started cluster02 + * httpd-bundle-1 (192.168.122.132) (ocf:heartbeat:apache): Stopped cluster01 + * Resource Group: partially-active-group: +- * 1/2 (ocf:pacemaker:Dummy): Active cluster02 ++ * 2/3 (ocf:pacemaker:Dummy): Active cluster02 + * smart-mon (ocf:pacemaker:HealthSMART): Stopped + + Node Attributes: +@@ -3856,6 +3875,8 @@ 
Operations: + * (3) monitor: interval="30000ms" + * dummy-1: migration-threshold=1000000: + * (2) start ++ * dummy-2: migration-threshold=1000000: ++ * (2) probe + * smart-mon: migration-threshold=1000000: + * (9) probe + * ping: migration-threshold=1000000: +@@ -3881,6 +3902,7 @@ Operations: + * (1) start + + Failed Resource Actions: ++ * dummy-2 probe on cluster02 returned 'unimplemented feature' at Wed Sep 2 12:17:38 2020 after 33ms + * smart-mon probe on cluster02 returned 'not installed' at Tue Nov 9 15:38:55 2021 after 33ms + * ping probe on cluster02 returned 'not installed' at Thu Nov 18 13:11:42 2021 + =#=#=#= End test: Complete brief text output grouped by node, with inactive resources - OK (0) =#=#=#= +@@ -3892,7 +3914,7 @@ Cluster Summary: + * Last updated: + * Last change: + * 4 nodes configured +- * 14 resource instances configured (1 DISABLED) ++ * 15 resource instances configured (1 DISABLED) + + Node List: + * Online: [ cluster01 ] +@@ -3914,7 +3936,7 @@ Full List of Resources: + + + +- ++ + + + +-- +2.27.0 + + +From 85e76b8bdb4de261a9cb4858eeedd49fba0346a1 Mon Sep 17 00:00:00 2001 +From: Chris Lumens +Date: Fri, 19 Nov 2021 15:55:51 -0500 +Subject: [PATCH 09/21] Test: cts-cli: Add a failed probe on a new dummy-4 + resource. + +This is to verify that these resources which are part of a group are +displayed properly. No code changes will be necessary, since groups are +just several other resources all in the same pile. + +There are no code changes yet to properly handle displaying these +operations, so the results here just reflect the current handling. 
+--- + cts/cli/crm_mon-partial.xml | 4 +++ + cts/cli/regression.crm_mon.exp | 51 ++++++++++++++++++++++------------ + 2 files changed, 37 insertions(+), 18 deletions(-) + +diff --git a/cts/cli/crm_mon-partial.xml b/cts/cli/crm_mon-partial.xml +index 1ce80ea58a..d4d4a70848 100644 +--- a/cts/cli/crm_mon-partial.xml ++++ b/cts/cli/crm_mon-partial.xml +@@ -60,6 +60,7 @@ + + + ++ + + + +@@ -108,6 +109,9 @@ + + + ++ ++ ++ + + + +diff --git a/cts/cli/regression.crm_mon.exp b/cts/cli/regression.crm_mon.exp +index 8cf3a1215e..c524b199e3 100644 +--- a/cts/cli/regression.crm_mon.exp ++++ b/cts/cli/regression.crm_mon.exp +@@ -3470,7 +3470,7 @@ Cluster Summary: + * Last updated: + * Last change: + * 4 nodes configured +- * 15 resource instances configured (1 DISABLED) ++ * 16 resource instances configured (1 DISABLED) + + Node List: + * Online: [ cluster01 cluster02 ] +@@ -3483,12 +3483,13 @@ Active Resources: + * Container bundle set: httpd-bundle [pcmk:http]: + * httpd-bundle-0 (192.168.122.131) (ocf:heartbeat:apache): Started cluster02 + * httpd-bundle-1 (192.168.122.132) (ocf:heartbeat:apache): Stopped cluster01 +- * Resource Group: partially-active-group (1 member inactive): ++ * Resource Group: partially-active-group (2 members inactive): + * dummy-1 (ocf:pacemaker:Dummy): Started cluster02 + * dummy-2 (ocf:pacemaker:Dummy): FAILED cluster02 + + Failed Resource Actions: + * dummy-2 probe on cluster02 returned 'unimplemented feature' at Wed Sep 2 12:17:38 2020 after 33ms ++ * dummy-4 probe on cluster02 returned 'not installed' at Wed Sep 2 12:17:38 2020 + * smart-mon probe on cluster02 returned 'not installed' at Tue Nov 9 15:38:55 2021 after 33ms + * ping probe on cluster02 returned 'not installed' at Thu Nov 18 13:11:42 2021 + =#=#=#= End test: Text output of partially active resources - OK (0) =#=#=#= +@@ -3501,7 +3502,7 @@ Failed Resource Actions: + + + +- ++ + + + +@@ -3548,7 +3549,7 @@ Failed Resource Actions: + + + +- ++ + + + +@@ -3556,6 +3557,7 @@ Failed Resource 
Actions: + + + ++ + + + +@@ -3587,6 +3589,9 @@ Failed Resource Actions: + + + ++ ++ ++ + + + +@@ -3624,6 +3629,7 @@ Failed Resource Actions: + + + ++ + + + +@@ -3638,7 +3644,7 @@ Cluster Summary: + * Last updated: + * Last change: + * 4 nodes configured +- * 15 resource instances configured (1 DISABLED) ++ * 16 resource instances configured (1 DISABLED) + + Node List: + * Online: [ cluster01 cluster02 ] +@@ -3656,10 +3662,12 @@ Full List of Resources: + * dummy-1 (ocf:pacemaker:Dummy): Started cluster02 + * dummy-2 (ocf:pacemaker:Dummy): FAILED cluster02 + * dummy-3 (ocf:pacemaker:Dummy): Stopped (disabled) ++ * dummy-4 (ocf:pacemaker:Dummy): Stopped + * smart-mon (ocf:pacemaker:HealthSMART): Stopped + + Failed Resource Actions: + * dummy-2 probe on cluster02 returned 'unimplemented feature' at Wed Sep 2 12:17:38 2020 after 33ms ++ * dummy-4 probe on cluster02 returned 'not installed' at Wed Sep 2 12:17:38 2020 + * smart-mon probe on cluster02 returned 'not installed' at Tue Nov 9 15:38:55 2021 after 33ms + * ping probe on cluster02 returned 'not installed' at Thu Nov 18 13:11:42 2021 + =#=#=#= End test: Text output of partially active resources, with inactive resources - OK (0) =#=#=#= +@@ -3671,7 +3679,7 @@ Cluster Summary: + * Last updated: + * Last change: + * 4 nodes configured +- * 15 resource instances configured (1 DISABLED) ++ * 16 resource instances configured (1 DISABLED) + + Node List: + * Online: [ cluster01 cluster02 ] +@@ -3687,7 +3695,7 @@ Full List of Resources: + * httpd-bundle-0 (192.168.122.131) (ocf:heartbeat:apache): Started cluster02 + * httpd-bundle-1 (192.168.122.132) (ocf:heartbeat:apache): Stopped cluster01 + * Resource Group: partially-active-group: +- * 2/3 (ocf:pacemaker:Dummy): Active cluster02 ++ * 2/4 (ocf:pacemaker:Dummy): Active cluster02 + + Node Attributes: + * Node: cluster01: +@@ -3710,6 +3718,8 @@ Operations: + * (2) start + * dummy-2: migration-threshold=1000000: + * (2) probe ++ * dummy-4: migration-threshold=1000000: ++ * 
(2) probe + * smart-mon: migration-threshold=1000000: + * (9) probe + * ping: migration-threshold=1000000: +@@ -3736,6 +3746,7 @@ Operations: + + Failed Resource Actions: + * dummy-2 probe on cluster02 returned 'unimplemented feature' at Wed Sep 2 12:17:38 2020 after 33ms ++ * dummy-4 probe on cluster02 returned 'not installed' at Wed Sep 2 12:17:38 2020 + * smart-mon probe on cluster02 returned 'not installed' at Tue Nov 9 15:38:55 2021 after 33ms + * ping probe on cluster02 returned 'not installed' at Thu Nov 18 13:11:42 2021 + =#=#=#= End test: Complete brief text output, with inactive resources - OK (0) =#=#=#= +@@ -3747,14 +3758,14 @@ Cluster Summary: + * Last updated: + * Last change: + * 4 nodes configured +- * 15 resource instances configured (1 DISABLED) ++ * 16 resource instances configured (1 DISABLED) + + Node List: + * Online: [ cluster01 cluster02 ] + * GuestOnline: [ httpd-bundle-0@cluster02 httpd-bundle-1@cluster01 ] + + Active Resources: +- * Resource Group: partially-active-group (1 member inactive): ++ * Resource Group: partially-active-group (2 members inactive): + * dummy-1 (ocf:pacemaker:Dummy): Started cluster02 + * dummy-2 (ocf:pacemaker:Dummy): FAILED cluster02 + =#=#=#= End test: Text output of partially active group - OK (0) =#=#=#= +@@ -3766,7 +3777,7 @@ Cluster Summary: + * Last updated: + * Last change: + * 4 nodes configured +- * 15 resource instances configured (1 DISABLED) ++ * 16 resource instances configured (1 DISABLED) + + Node List: + * Online: [ cluster01 cluster02 ] +@@ -3777,6 +3788,7 @@ Full List of Resources: + * dummy-1 (ocf:pacemaker:Dummy): Started cluster02 + * dummy-2 (ocf:pacemaker:Dummy): FAILED cluster02 + * dummy-3 (ocf:pacemaker:Dummy): Stopped (disabled) ++ * dummy-4 (ocf:pacemaker:Dummy): Stopped + =#=#=#= End test: Text output of partially active group, with inactive resources - OK (0) =#=#=#= + * Passed: crm_mon - Text output of partially active group, with inactive resources + =#=#=#= Begin test: Text output 
of active member of partially active group =#=#=#= +@@ -3786,14 +3798,14 @@ Cluster Summary: + * Last updated: + * Last change: + * 4 nodes configured +- * 15 resource instances configured (1 DISABLED) ++ * 16 resource instances configured (1 DISABLED) + + Node List: + * Online: [ cluster01 cluster02 ] + * GuestOnline: [ httpd-bundle-0@cluster02 httpd-bundle-1@cluster01 ] + + Active Resources: +- * Resource Group: partially-active-group (1 member inactive): ++ * Resource Group: partially-active-group (2 members inactive): + * dummy-1 (ocf:pacemaker:Dummy): Started cluster02 + =#=#=#= End test: Text output of active member of partially active group - OK (0) =#=#=#= + * Passed: crm_mon - Text output of active member of partially active group +@@ -3804,14 +3816,14 @@ Cluster Summary: + * Last updated: + * Last change: + * 4 nodes configured +- * 15 resource instances configured (1 DISABLED) ++ * 16 resource instances configured (1 DISABLED) + + Node List: + * Online: [ cluster01 cluster02 ] + * GuestOnline: [ httpd-bundle-0@cluster02 httpd-bundle-1@cluster01 ] + + Active Resources: +- * Resource Group: partially-active-group (1 member inactive): ++ * Resource Group: partially-active-group (2 members inactive): + * dummy-2 (ocf:pacemaker:Dummy): FAILED cluster02 + + Failed Resource Actions: +@@ -3825,7 +3837,7 @@ Cluster Summary: + * Last updated: + * Last change: + * 4 nodes configured +- * 15 resource instances configured (1 DISABLED) ++ * 16 resource instances configured (1 DISABLED) + + Node List: + * Node cluster01: online: +@@ -3853,7 +3865,7 @@ Inactive Resources: + * httpd-bundle-0 (192.168.122.131) (ocf:heartbeat:apache): Started cluster02 + * httpd-bundle-1 (192.168.122.132) (ocf:heartbeat:apache): Stopped cluster01 + * Resource Group: partially-active-group: +- * 2/3 (ocf:pacemaker:Dummy): Active cluster02 ++ * 2/4 (ocf:pacemaker:Dummy): Active cluster02 + * smart-mon (ocf:pacemaker:HealthSMART): Stopped + + Node Attributes: +@@ -3877,6 +3889,8 @@ 
Operations: + * (2) start + * dummy-2: migration-threshold=1000000: + * (2) probe ++ * dummy-4: migration-threshold=1000000: ++ * (2) probe + * smart-mon: migration-threshold=1000000: + * (9) probe + * ping: migration-threshold=1000000: +@@ -3903,6 +3917,7 @@ Operations: + + Failed Resource Actions: + * dummy-2 probe on cluster02 returned 'unimplemented feature' at Wed Sep 2 12:17:38 2020 after 33ms ++ * dummy-4 probe on cluster02 returned 'not installed' at Wed Sep 2 12:17:38 2020 + * smart-mon probe on cluster02 returned 'not installed' at Tue Nov 9 15:38:55 2021 after 33ms + * ping probe on cluster02 returned 'not installed' at Thu Nov 18 13:11:42 2021 + =#=#=#= End test: Complete brief text output grouped by node, with inactive resources - OK (0) =#=#=#= +@@ -3914,7 +3929,7 @@ Cluster Summary: + * Last updated: + * Last change: + * 4 nodes configured +- * 15 resource instances configured (1 DISABLED) ++ * 16 resource instances configured (1 DISABLED) + + Node List: + * Online: [ cluster01 ] +@@ -3936,7 +3951,7 @@ Full List of Resources: + + + +- ++ + + + +-- +2.27.0 + + +From 206d733b6ce8e0ffcad243d282e8baa8c3ff72b4 Mon Sep 17 00:00:00 2001 +From: Chris Lumens +Date: Tue, 23 Nov 2021 14:33:47 -0500 +Subject: [PATCH 10/21] Test: cts-cli: Add test output for a bundle resource + with a failed probe op. + +This just changes the existing failed bundle resource from not starting +to failing with a reason. + +There are no code changes yet to properly handle displaying these +operations, so the results here just reflect the current handling. 
+--- + cts/cli/crm_mon-partial.xml | 9 ++++++++ + cts/cli/regression.crm_mon.exp | 40 +++++++++++++++++++++++++--------- + 2 files changed, 39 insertions(+), 10 deletions(-) + +diff --git a/cts/cli/crm_mon-partial.xml b/cts/cli/crm_mon-partial.xml +index d4d4a70848..5981fc653c 100644 +--- a/cts/cli/crm_mon-partial.xml ++++ b/cts/cli/crm_mon-partial.xml +@@ -178,5 +178,14 @@ + + + ++ ++ ++ ++ ++ ++ ++ ++ ++ + + +diff --git a/cts/cli/regression.crm_mon.exp b/cts/cli/regression.crm_mon.exp +index c524b199e3..b690a26fb6 100644 +--- a/cts/cli/regression.crm_mon.exp ++++ b/cts/cli/regression.crm_mon.exp +@@ -3482,7 +3482,7 @@ Active Resources: + * Fencing (stonith:fence_xvm): Started cluster01 + * Container bundle set: httpd-bundle [pcmk:http]: + * httpd-bundle-0 (192.168.122.131) (ocf:heartbeat:apache): Started cluster02 +- * httpd-bundle-1 (192.168.122.132) (ocf:heartbeat:apache): Stopped cluster01 ++ * httpd-bundle-1 (192.168.122.132) (ocf:heartbeat:apache): FAILED cluster01 + * Resource Group: partially-active-group (2 members inactive): + * dummy-1 (ocf:pacemaker:Dummy): Started cluster02 + * dummy-2 (ocf:pacemaker:Dummy): FAILED cluster02 +@@ -3492,6 +3492,7 @@ Failed Resource Actions: + * dummy-4 probe on cluster02 returned 'not installed' at Wed Sep 2 12:17:38 2020 + * smart-mon probe on cluster02 returned 'not installed' at Tue Nov 9 15:38:55 2021 after 33ms + * ping probe on cluster02 returned 'not installed' at Thu Nov 18 13:11:42 2021 ++ * httpd probe on httpd-bundle-1 returned 'invalid parameter' at Wed May 27 15:43:09 2020 + =#=#=#= End test: Text output of partially active resources - OK (0) =#=#=#= + * Passed: crm_mon - Text output of partially active resources + =#=#=#= Begin test: XML output of partially active resources =#=#=#= +@@ -3509,7 +3510,7 @@ Failed Resource Actions: + + + +- ++ + + + +@@ -3540,7 +3541,9 @@ Failed Resource Actions: + + + +- ++ ++ ++ + + + +@@ -3626,12 +3629,18 @@ Failed Resource Actions: + + + ++ ++ ++ ++ ++ + + + + + + ++ + + 
+ +@@ -3657,7 +3666,7 @@ Full List of Resources: + * Fencing (stonith:fence_xvm): Started cluster01 + * Container bundle set: httpd-bundle [pcmk:http]: + * httpd-bundle-0 (192.168.122.131) (ocf:heartbeat:apache): Started cluster02 +- * httpd-bundle-1 (192.168.122.132) (ocf:heartbeat:apache): Stopped cluster01 ++ * httpd-bundle-1 (192.168.122.132) (ocf:heartbeat:apache): FAILED cluster01 + * Resource Group: partially-active-group: + * dummy-1 (ocf:pacemaker:Dummy): Started cluster02 + * dummy-2 (ocf:pacemaker:Dummy): FAILED cluster02 +@@ -3670,6 +3679,7 @@ Failed Resource Actions: + * dummy-4 probe on cluster02 returned 'not installed' at Wed Sep 2 12:17:38 2020 + * smart-mon probe on cluster02 returned 'not installed' at Tue Nov 9 15:38:55 2021 after 33ms + * ping probe on cluster02 returned 'not installed' at Thu Nov 18 13:11:42 2021 ++ * httpd probe on httpd-bundle-1 returned 'invalid parameter' at Wed May 27 15:43:09 2020 + =#=#=#= End test: Text output of partially active resources, with inactive resources - OK (0) =#=#=#= + * Passed: crm_mon - Text output of partially active resources, with inactive resources + =#=#=#= Begin test: Complete brief text output, with inactive resources =#=#=#= +@@ -3693,7 +3703,7 @@ Full List of Resources: + * Stopped: [ cluster02 ] + * Container bundle set: httpd-bundle [pcmk:http]: + * httpd-bundle-0 (192.168.122.131) (ocf:heartbeat:apache): Started cluster02 +- * httpd-bundle-1 (192.168.122.132) (ocf:heartbeat:apache): Stopped cluster01 ++ * httpd-bundle-1 (192.168.122.132) (ocf:heartbeat:apache): FAILED cluster01 + * Resource Group: partially-active-group: + * 2/4 (ocf:pacemaker:Dummy): Active cluster02 + +@@ -3743,12 +3753,16 @@ Operations: + * Node: httpd-bundle-0@cluster02: + * httpd: migration-threshold=1000000: + * (1) start ++ * Node: httpd-bundle-1@cluster01: ++ * httpd: migration-threshold=1000000: ++ * (1) probe + + Failed Resource Actions: + * dummy-2 probe on cluster02 returned 'unimplemented feature' at Wed Sep 2 
12:17:38 2020 after 33ms + * dummy-4 probe on cluster02 returned 'not installed' at Wed Sep 2 12:17:38 2020 + * smart-mon probe on cluster02 returned 'not installed' at Tue Nov 9 15:38:55 2021 after 33ms + * ping probe on cluster02 returned 'not installed' at Thu Nov 18 13:11:42 2021 ++ * httpd probe on httpd-bundle-1 returned 'invalid parameter' at Wed May 27 15:43:09 2020 + =#=#=#= End test: Complete brief text output, with inactive resources - OK (0) =#=#=#= + * Passed: crm_mon - Complete brief text output, with inactive resources + =#=#=#= Begin test: Text output of partially active group =#=#=#= +@@ -3856,14 +3870,14 @@ Node List: + * GuestNode httpd-bundle-0@cluster02: online: + * Resources: + * 1 (ocf:heartbeat:apache): Active ++ * GuestNode httpd-bundle-1@cluster01: online: ++ * Resources: ++ * 1 (ocf:heartbeat:apache): Active + + Inactive Resources: + * Clone Set: ping-clone [ping]: + * Started: [ cluster01 ] + * Stopped: [ cluster02 ] +- * Container bundle set: httpd-bundle [pcmk:http]: +- * httpd-bundle-0 (192.168.122.131) (ocf:heartbeat:apache): Started cluster02 +- * httpd-bundle-1 (192.168.122.132) (ocf:heartbeat:apache): Stopped cluster01 + * Resource Group: partially-active-group: + * 2/4 (ocf:pacemaker:Dummy): Active cluster02 + * smart-mon (ocf:pacemaker:HealthSMART): Stopped +@@ -3914,12 +3928,16 @@ Operations: + * Node: httpd-bundle-0@cluster02: + * httpd: migration-threshold=1000000: + * (1) start ++ * Node: httpd-bundle-1@cluster01: ++ * httpd: migration-threshold=1000000: ++ * (1) probe + + Failed Resource Actions: + * dummy-2 probe on cluster02 returned 'unimplemented feature' at Wed Sep 2 12:17:38 2020 after 33ms + * dummy-4 probe on cluster02 returned 'not installed' at Wed Sep 2 12:17:38 2020 + * smart-mon probe on cluster02 returned 'not installed' at Tue Nov 9 15:38:55 2021 after 33ms + * ping probe on cluster02 returned 'not installed' at Thu Nov 18 13:11:42 2021 ++ * httpd probe on httpd-bundle-1 returned 'invalid parameter' at Wed 
May 27 15:43:09 2020 + =#=#=#= End test: Complete brief text output grouped by node, with inactive resources - OK (0) =#=#=#= + * Passed: crm_mon - Complete brief text output grouped by node, with inactive resources + =#=#=#= Begin test: Text output of partially active resources, with inactive resources, filtered by node =#=#=#= +@@ -3939,7 +3957,7 @@ Full List of Resources: + * Started: [ cluster01 ] + * Fencing (stonith:fence_xvm): Started cluster01 + * Container bundle set: httpd-bundle [pcmk:http]: +- * httpd-bundle-1 (192.168.122.132) (ocf:heartbeat:apache): Stopped cluster01 ++ * httpd-bundle-1 (192.168.122.132) (ocf:heartbeat:apache): FAILED cluster01 + * smart-mon (ocf:pacemaker:HealthSMART): Stopped + =#=#=#= End test: Text output of partially active resources, with inactive resources, filtered by node - OK (0) =#=#=#= + * Passed: crm_mon - Text output of partially active resources, with inactive resources, filtered by node +@@ -3972,7 +3990,9 @@ Full List of Resources: + + + +- ++ ++ ++ + + + +-- +2.27.0 + + +From 6240a28d36c0349e3b1d7f52c36106580c53bb01 Mon Sep 17 00:00:00 2001 +From: Chris Lumens +Date: Mon, 22 Nov 2021 10:59:10 -0500 +Subject: [PATCH 11/21] Test: cts: Add --show-detail to a couple of the crm_mon + tests. + +This straightens out a couple differences in output between running +tests locally (where --enable-compat-2.0 is not given, which would +automatically add --show-detail) and running tests under mock (where +that option is given). + +Note that this only really matters for failed resource actions, which +were not previously output as part of any crm_mon regression test. It +is only the patches in this series that have introduced those, and thus +this difference. 
+--- + cts/cli/regression.crm_mon.exp | 131 ++++++++++++++++++++------------- + cts/cts-cli.in | 10 +-- + 2 files changed, 83 insertions(+), 58 deletions(-) + +diff --git a/cts/cli/regression.crm_mon.exp b/cts/cli/regression.crm_mon.exp +index b690a26fb6..d7b9d98e2c 100644 +--- a/cts/cli/regression.crm_mon.exp ++++ b/cts/cli/regression.crm_mon.exp +@@ -3466,33 +3466,42 @@ Operations: + =#=#=#= Begin test: Text output of partially active resources =#=#=#= + Cluster Summary: + * Stack: corosync +- * Current DC: cluster02 (version) - partition with quorum ++ * Current DC: cluster02 (2) (version) - partition with quorum + * Last updated: + * Last change: + * 4 nodes configured + * 16 resource instances configured (1 DISABLED) + + Node List: +- * Online: [ cluster01 cluster02 ] ++ * Online: [ cluster01 (1) cluster02 (2) ] + * GuestOnline: [ httpd-bundle-0@cluster02 httpd-bundle-1@cluster01 ] + + Active Resources: + * Clone Set: ping-clone [ping]: +- * Started: [ cluster01 ] ++ * ping (ocf:pacemaker:ping): Started cluster01 ++ * ping (ocf:pacemaker:ping): Stopped + * Fencing (stonith:fence_xvm): Started cluster01 + * Container bundle set: httpd-bundle [pcmk:http]: +- * httpd-bundle-0 (192.168.122.131) (ocf:heartbeat:apache): Started cluster02 +- * httpd-bundle-1 (192.168.122.132) (ocf:heartbeat:apache): FAILED cluster01 ++ * Replica[0] ++ * httpd-bundle-ip-192.168.122.131 (ocf:heartbeat:IPaddr2): Started cluster02 ++ * httpd (ocf:heartbeat:apache): Started httpd-bundle-0 ++ * httpd-bundle-docker-0 (ocf:heartbeat:docker): Started cluster02 ++ * httpd-bundle-0 (ocf:pacemaker:remote): Started cluster02 ++ * Replica[1] ++ * httpd-bundle-ip-192.168.122.132 (ocf:heartbeat:IPaddr2): Started cluster01 ++ * httpd (ocf:heartbeat:apache): FAILED httpd-bundle-1 ++ * httpd-bundle-docker-1 (ocf:heartbeat:docker): Started cluster01 ++ * httpd-bundle-1 (ocf:pacemaker:remote): Started cluster01 + * Resource Group: partially-active-group (2 members inactive): + * dummy-1 
(ocf:pacemaker:Dummy): Started cluster02 + * dummy-2 (ocf:pacemaker:Dummy): FAILED cluster02 + + Failed Resource Actions: +- * dummy-2 probe on cluster02 returned 'unimplemented feature' at Wed Sep 2 12:17:38 2020 after 33ms +- * dummy-4 probe on cluster02 returned 'not installed' at Wed Sep 2 12:17:38 2020 +- * smart-mon probe on cluster02 returned 'not installed' at Tue Nov 9 15:38:55 2021 after 33ms +- * ping probe on cluster02 returned 'not installed' at Thu Nov 18 13:11:42 2021 +- * httpd probe on httpd-bundle-1 returned 'invalid parameter' at Wed May 27 15:43:09 2020 ++ * dummy-2_monitor_0 on cluster02 'unimplemented feature' (3): call=2, status='complete', last-rc-change='Wed Sep 2 12:17:38 2020', queued=0ms, exec=33ms ++ * dummy-4_monitor_0 on cluster02 'not installed' (5): call=2, status='complete', last-rc-change='Wed Sep 2 12:17:38 2020', queued=0ms, exec=0ms ++ * smart-mon_monitor_0 on cluster02 'not installed' (5): call=9, status='complete', last-rc-change='Tue Nov 9 15:38:55 2021', queued=0ms, exec=33ms ++ * ping_monitor_0 on cluster02 'not installed' (5): call=6, status='complete', last-rc-change='Thu Nov 18 13:11:42 2021', queued=0ms, exec=0ms ++ * httpd_monitor_0 on httpd-bundle-1 'invalid parameter' (2): call=1, status='complete', last-rc-change='Wed May 27 15:43:09 2020', queued=0ms, exec=0ms + =#=#=#= End test: Text output of partially active resources - OK (0) =#=#=#= + * Passed: crm_mon - Text output of partially active resources + =#=#=#= Begin test: XML output of partially active resources =#=#=#= +@@ -3649,24 +3658,32 @@ Failed Resource Actions: + =#=#=#= Begin test: Text output of partially active resources, with inactive resources =#=#=#= + Cluster Summary: + * Stack: corosync +- * Current DC: cluster02 (version) - partition with quorum ++ * Current DC: cluster02 (2) (version) - partition with quorum + * Last updated: + * Last change: + * 4 nodes configured + * 16 resource instances configured (1 DISABLED) + + Node List: +- * Online: [ 
cluster01 cluster02 ] ++ * Online: [ cluster01 (1) cluster02 (2) ] + * GuestOnline: [ httpd-bundle-0@cluster02 httpd-bundle-1@cluster01 ] + + Full List of Resources: + * Clone Set: ping-clone [ping]: +- * Started: [ cluster01 ] +- * Stopped: [ cluster02 ] ++ * ping (ocf:pacemaker:ping): Started cluster01 ++ * ping (ocf:pacemaker:ping): Stopped + * Fencing (stonith:fence_xvm): Started cluster01 + * Container bundle set: httpd-bundle [pcmk:http]: +- * httpd-bundle-0 (192.168.122.131) (ocf:heartbeat:apache): Started cluster02 +- * httpd-bundle-1 (192.168.122.132) (ocf:heartbeat:apache): FAILED cluster01 ++ * Replica[0] ++ * httpd-bundle-ip-192.168.122.131 (ocf:heartbeat:IPaddr2): Started cluster02 ++ * httpd (ocf:heartbeat:apache): Started httpd-bundle-0 ++ * httpd-bundle-docker-0 (ocf:heartbeat:docker): Started cluster02 ++ * httpd-bundle-0 (ocf:pacemaker:remote): Started cluster02 ++ * Replica[1] ++ * httpd-bundle-ip-192.168.122.132 (ocf:heartbeat:IPaddr2): Started cluster01 ++ * httpd (ocf:heartbeat:apache): FAILED httpd-bundle-1 ++ * httpd-bundle-docker-1 (ocf:heartbeat:docker): Started cluster01 ++ * httpd-bundle-1 (ocf:pacemaker:remote): Started cluster01 + * Resource Group: partially-active-group: + * dummy-1 (ocf:pacemaker:Dummy): Started cluster02 + * dummy-2 (ocf:pacemaker:Dummy): FAILED cluster02 +@@ -3675,46 +3692,54 @@ Full List of Resources: + * smart-mon (ocf:pacemaker:HealthSMART): Stopped + + Failed Resource Actions: +- * dummy-2 probe on cluster02 returned 'unimplemented feature' at Wed Sep 2 12:17:38 2020 after 33ms +- * dummy-4 probe on cluster02 returned 'not installed' at Wed Sep 2 12:17:38 2020 +- * smart-mon probe on cluster02 returned 'not installed' at Tue Nov 9 15:38:55 2021 after 33ms +- * ping probe on cluster02 returned 'not installed' at Thu Nov 18 13:11:42 2021 +- * httpd probe on httpd-bundle-1 returned 'invalid parameter' at Wed May 27 15:43:09 2020 ++ * dummy-2_monitor_0 on cluster02 'unimplemented feature' (3): call=2, 
status='complete', last-rc-change='Wed Sep 2 12:17:38 2020', queued=0ms, exec=33ms ++ * dummy-4_monitor_0 on cluster02 'not installed' (5): call=2, status='complete', last-rc-change='Wed Sep 2 12:17:38 2020', queued=0ms, exec=0ms ++ * smart-mon_monitor_0 on cluster02 'not installed' (5): call=9, status='complete', last-rc-change='Tue Nov 9 15:38:55 2021', queued=0ms, exec=33ms ++ * ping_monitor_0 on cluster02 'not installed' (5): call=6, status='complete', last-rc-change='Thu Nov 18 13:11:42 2021', queued=0ms, exec=0ms ++ * httpd_monitor_0 on httpd-bundle-1 'invalid parameter' (2): call=1, status='complete', last-rc-change='Wed May 27 15:43:09 2020', queued=0ms, exec=0ms + =#=#=#= End test: Text output of partially active resources, with inactive resources - OK (0) =#=#=#= + * Passed: crm_mon - Text output of partially active resources, with inactive resources + =#=#=#= Begin test: Complete brief text output, with inactive resources =#=#=#= + Cluster Summary: + * Stack: corosync +- * Current DC: cluster02 (version) - partition with quorum ++ * Current DC: cluster02 (2) (version) - partition with quorum + * Last updated: + * Last change: + * 4 nodes configured + * 16 resource instances configured (1 DISABLED) + + Node List: +- * Online: [ cluster01 cluster02 ] ++ * Online: [ cluster01 (1) cluster02 (2) ] + * GuestOnline: [ httpd-bundle-0@cluster02 httpd-bundle-1@cluster01 ] + + Full List of Resources: + * 0/1 (ocf:pacemaker:HealthSMART): Active + * 1/1 (stonith:fence_xvm): Active cluster01 + * Clone Set: ping-clone [ping]: +- * Started: [ cluster01 ] +- * Stopped: [ cluster02 ] ++ * ping (ocf:pacemaker:ping): Started cluster01 ++ * ping (ocf:pacemaker:ping): Stopped + * Container bundle set: httpd-bundle [pcmk:http]: +- * httpd-bundle-0 (192.168.122.131) (ocf:heartbeat:apache): Started cluster02 +- * httpd-bundle-1 (192.168.122.132) (ocf:heartbeat:apache): FAILED cluster01 ++ * Replica[0] ++ * httpd-bundle-ip-192.168.122.131 (ocf:heartbeat:IPaddr2): Started 
cluster02 ++ * httpd (ocf:heartbeat:apache): Started httpd-bundle-0 ++ * httpd-bundle-docker-0 (ocf:heartbeat:docker): Started cluster02 ++ * httpd-bundle-0 (ocf:pacemaker:remote): Started cluster02 ++ * Replica[1] ++ * httpd-bundle-ip-192.168.122.132 (ocf:heartbeat:IPaddr2): Started cluster01 ++ * httpd (ocf:heartbeat:apache): FAILED httpd-bundle-1 ++ * httpd-bundle-docker-1 (ocf:heartbeat:docker): Started cluster01 ++ * httpd-bundle-1 (ocf:pacemaker:remote): Started cluster01 + * Resource Group: partially-active-group: + * 2/4 (ocf:pacemaker:Dummy): Active cluster02 + + Node Attributes: +- * Node: cluster01: ++ * Node: cluster01 (1): + * pingd : 1000 +- * Node: cluster02: ++ * Node: cluster02 (2): + * pingd : 1000 + + Operations: +- * Node: cluster02: ++ * Node: cluster02 (2): + * httpd-bundle-ip-192.168.122.131: migration-threshold=1000000: + * (2) start + * (3) monitor: interval="60000ms" +@@ -3734,7 +3759,7 @@ Operations: + * (9) probe + * ping: migration-threshold=1000000: + * (6) probe +- * Node: cluster01: ++ * Node: cluster01 (1): + * Fencing: migration-threshold=1000000: + * (15) start + * (20) monitor: interval="60000ms" +@@ -3758,11 +3783,11 @@ Operations: + * (1) probe + + Failed Resource Actions: +- * dummy-2 probe on cluster02 returned 'unimplemented feature' at Wed Sep 2 12:17:38 2020 after 33ms +- * dummy-4 probe on cluster02 returned 'not installed' at Wed Sep 2 12:17:38 2020 +- * smart-mon probe on cluster02 returned 'not installed' at Tue Nov 9 15:38:55 2021 after 33ms +- * ping probe on cluster02 returned 'not installed' at Thu Nov 18 13:11:42 2021 +- * httpd probe on httpd-bundle-1 returned 'invalid parameter' at Wed May 27 15:43:09 2020 ++ * dummy-2_monitor_0 on cluster02 'unimplemented feature' (3): call=2, status='complete', last-rc-change='Wed Sep 2 12:17:38 2020', queued=0ms, exec=33ms ++ * dummy-4_monitor_0 on cluster02 'not installed' (5): call=2, status='complete', last-rc-change='Wed Sep 2 12:17:38 2020', queued=0ms, exec=0ms ++ * 
smart-mon_monitor_0 on cluster02 'not installed' (5): call=9, status='complete', last-rc-change='Tue Nov 9 15:38:55 2021', queued=0ms, exec=33ms ++ * ping_monitor_0 on cluster02 'not installed' (5): call=6, status='complete', last-rc-change='Thu Nov 18 13:11:42 2021', queued=0ms, exec=0ms ++ * httpd_monitor_0 on httpd-bundle-1 'invalid parameter' (2): call=1, status='complete', last-rc-change='Wed May 27 15:43:09 2020', queued=0ms, exec=0ms + =#=#=#= End test: Complete brief text output, with inactive resources - OK (0) =#=#=#= + * Passed: crm_mon - Complete brief text output, with inactive resources + =#=#=#= Begin test: Text output of partially active group =#=#=#= +@@ -3826,14 +3851,14 @@ Active Resources: + =#=#=#= Begin test: Text output of inactive member of partially active group =#=#=#= + Cluster Summary: + * Stack: corosync +- * Current DC: cluster02 (version) - partition with quorum ++ * Current DC: cluster02 (2) (version) - partition with quorum + * Last updated: + * Last change: + * 4 nodes configured + * 16 resource instances configured (1 DISABLED) + + Node List: +- * Online: [ cluster01 cluster02 ] ++ * Online: [ cluster01 (1) cluster02 (2) ] + * GuestOnline: [ httpd-bundle-0@cluster02 httpd-bundle-1@cluster01 ] + + Active Resources: +@@ -3841,27 +3866,27 @@ Active Resources: + * dummy-2 (ocf:pacemaker:Dummy): FAILED cluster02 + + Failed Resource Actions: +- * dummy-2 probe on cluster02 returned 'unimplemented feature' at Wed Sep 2 12:17:38 2020 after 33ms ++ * dummy-2_monitor_0 on cluster02 'unimplemented feature' (3): call=2, status='complete', last-rc-change='Wed Sep 2 12:17:38 2020', queued=0ms, exec=33ms + =#=#=#= End test: Text output of inactive member of partially active group - OK (0) =#=#=#= + * Passed: crm_mon - Text output of inactive member of partially active group + =#=#=#= Begin test: Complete brief text output grouped by node, with inactive resources =#=#=#= + Cluster Summary: + * Stack: corosync +- * Current DC: cluster02 (version) 
- partition with quorum ++ * Current DC: cluster02 (2) (version) - partition with quorum + * Last updated: + * Last change: + * 4 nodes configured + * 16 resource instances configured (1 DISABLED) + + Node List: +- * Node cluster01: online: ++ * Node cluster01 (1): online: + * Resources: + * 1 (ocf:heartbeat:IPaddr2): Active + * 1 (ocf:heartbeat:docker): Active + * 1 (ocf:pacemaker:ping): Active + * 1 (ocf:pacemaker:remote): Active + * 1 (stonith:fence_xvm): Active +- * Node cluster02: online: ++ * Node cluster02 (2): online: + * Resources: + * 1 (ocf:heartbeat:IPaddr2): Active + * 1 (ocf:heartbeat:docker): Active +@@ -3876,20 +3901,20 @@ Node List: + + Inactive Resources: + * Clone Set: ping-clone [ping]: +- * Started: [ cluster01 ] +- * Stopped: [ cluster02 ] ++ * ping (ocf:pacemaker:ping): Started cluster01 ++ * ping (ocf:pacemaker:ping): Stopped + * Resource Group: partially-active-group: + * 2/4 (ocf:pacemaker:Dummy): Active cluster02 + * smart-mon (ocf:pacemaker:HealthSMART): Stopped + + Node Attributes: +- * Node: cluster01: ++ * Node: cluster01 (1): + * pingd : 1000 +- * Node: cluster02: ++ * Node: cluster02 (2): + * pingd : 1000 + + Operations: +- * Node: cluster02: ++ * Node: cluster02 (2): + * httpd-bundle-ip-192.168.122.131: migration-threshold=1000000: + * (2) start + * (3) monitor: interval="60000ms" +@@ -3909,7 +3934,7 @@ Operations: + * (9) probe + * ping: migration-threshold=1000000: + * (6) probe +- * Node: cluster01: ++ * Node: cluster01 (1): + * Fencing: migration-threshold=1000000: + * (15) start + * (20) monitor: interval="60000ms" +@@ -3933,11 +3958,11 @@ Operations: + * (1) probe + + Failed Resource Actions: +- * dummy-2 probe on cluster02 returned 'unimplemented feature' at Wed Sep 2 12:17:38 2020 after 33ms +- * dummy-4 probe on cluster02 returned 'not installed' at Wed Sep 2 12:17:38 2020 +- * smart-mon probe on cluster02 returned 'not installed' at Tue Nov 9 15:38:55 2021 after 33ms +- * ping probe on cluster02 returned 'not installed' 
at Thu Nov 18 13:11:42 2021 +- * httpd probe on httpd-bundle-1 returned 'invalid parameter' at Wed May 27 15:43:09 2020 ++ * dummy-2_monitor_0 on cluster02 'unimplemented feature' (3): call=2, status='complete', last-rc-change='Wed Sep 2 12:17:38 2020', queued=0ms, exec=33ms ++ * dummy-4_monitor_0 on cluster02 'not installed' (5): call=2, status='complete', last-rc-change='Wed Sep 2 12:17:38 2020', queued=0ms, exec=0ms ++ * smart-mon_monitor_0 on cluster02 'not installed' (5): call=9, status='complete', last-rc-change='Tue Nov 9 15:38:55 2021', queued=0ms, exec=33ms ++ * ping_monitor_0 on cluster02 'not installed' (5): call=6, status='complete', last-rc-change='Thu Nov 18 13:11:42 2021', queued=0ms, exec=0ms ++ * httpd_monitor_0 on httpd-bundle-1 'invalid parameter' (2): call=1, status='complete', last-rc-change='Wed May 27 15:43:09 2020', queued=0ms, exec=0ms + =#=#=#= End test: Complete brief text output grouped by node, with inactive resources - OK (0) =#=#=#= + * Passed: crm_mon - Complete brief text output grouped by node, with inactive resources + =#=#=#= Begin test: Text output of partially active resources, with inactive resources, filtered by node =#=#=#= +diff --git a/cts/cts-cli.in b/cts/cts-cli.in +index d32bfb7ed1..457816afab 100755 +--- a/cts/cts-cli.in ++++ b/cts/cts-cli.in +@@ -420,7 +420,7 @@ function test_crm_mon() { + export CIB_file="$test_home/cli/crm_mon-partial.xml" + + desc="Text output of partially active resources" +- cmd="crm_mon -1" ++ cmd="crm_mon -1 --show-detail" + test_assert $CRM_EX_OK 0 + + desc="XML output of partially active resources" +@@ -428,13 +428,13 @@ function test_crm_mon() { + test_assert_validate $CRM_EX_OK 0 + + desc="Text output of partially active resources, with inactive resources" +- cmd="crm_mon -1 -r" ++ cmd="crm_mon -1 -r --show-detail" + test_assert $CRM_EX_OK 0 + + # XML already includes inactive resources + + desc="Complete brief text output, with inactive resources" +- cmd="crm_mon -1 -r --include=all 
--brief" ++ cmd="crm_mon -1 -r --include=all --brief --show-detail" + test_assert $CRM_EX_OK 0 + + # XML does not have a brief output option +@@ -452,11 +452,11 @@ function test_crm_mon() { + test_assert $CRM_EX_OK 0 + + desc="Text output of inactive member of partially active group" +- cmd="crm_mon -1 --resource=dummy-2" ++ cmd="crm_mon -1 --resource=dummy-2 --show-detail" + test_assert $CRM_EX_OK 0 + + desc="Complete brief text output grouped by node, with inactive resources" +- cmd="crm_mon -1 -r --include=all --group-by-node --brief" ++ cmd="crm_mon -1 -r --include=all --group-by-node --brief --show-detail" + test_assert $CRM_EX_OK 0 + + desc="Text output of partially active resources, with inactive resources, filtered by node" +-- +2.27.0 + + +From da14053e5957d84ed0647688d37733adc2f988a3 Mon Sep 17 00:00:00 2001 +From: Chris Lumens +Date: Mon, 29 Nov 2021 15:05:42 -0500 +Subject: [PATCH 12/21] Test: scheduler: Add tests for failed probe operations. + +This adds identical sets of tests for primitive resources and cloned +resources. For the moment, the output reflects the current state of the +code. No changes have been made to properly handle these operations +yet. + +Each set has three resources, and each is set up with a slightly +different configuration of probe failures: + +(1) - Maskable probe failure on each node. +(2) - Maskable probe failure on one node, successful "not running" probe + on the other node. The resource should be started on the node + where "not running" was returned. +(3) - Maskable probe failure on one node, non-maskable probe failure on + the other node. The resource should not be running anywhere, and + should be stopped on the node with the non-maskable failure. 
+--- + cts/cts-scheduler.in | 2 + + cts/scheduler/dot/failed-probe-clone.dot | 30 ++++ + cts/scheduler/dot/failed-probe-primitive.dot | 4 + + cts/scheduler/exp/failed-probe-clone.exp | 141 ++++++++++++++++++ + cts/scheduler/exp/failed-probe-primitive.exp | 20 +++ + .../scores/failed-probe-clone.scores | 33 ++++ + .../scores/failed-probe-primitive.scores | 9 ++ + .../summary/failed-probe-clone.summary | 46 ++++++ + .../summary/failed-probe-primitive.summary | 27 ++++ + cts/scheduler/xml/failed-probe-clone.xml | 110 ++++++++++++++ + cts/scheduler/xml/failed-probe-primitive.xml | 71 +++++++++ + 11 files changed, 493 insertions(+) + create mode 100644 cts/scheduler/dot/failed-probe-clone.dot + create mode 100644 cts/scheduler/dot/failed-probe-primitive.dot + create mode 100644 cts/scheduler/exp/failed-probe-clone.exp + create mode 100644 cts/scheduler/exp/failed-probe-primitive.exp + create mode 100644 cts/scheduler/scores/failed-probe-clone.scores + create mode 100644 cts/scheduler/scores/failed-probe-primitive.scores + create mode 100644 cts/scheduler/summary/failed-probe-clone.summary + create mode 100644 cts/scheduler/summary/failed-probe-primitive.summary + create mode 100644 cts/scheduler/xml/failed-probe-clone.xml + create mode 100644 cts/scheduler/xml/failed-probe-primitive.xml + +diff --git a/cts/cts-scheduler.in b/cts/cts-scheduler.in +index 17fd6cefdf..3abcbc6c9d 100644 +--- a/cts/cts-scheduler.in ++++ b/cts/cts-scheduler.in +@@ -113,6 +113,8 @@ TESTS = [ + [ "probe-3", "Probe (pending node)" ], + [ "probe-4", "Probe (pending node + stopped resource)" ], + [ "probe-pending-node", "Probe (pending node + unmanaged resource)" ], ++ [ "failed-probe-primitive", "Maskable vs. unmaskable probe failures on primitive resources" ], ++ [ "failed-probe-clone", "Maskable vs. 
unmaskable probe failures on cloned resources" ], + [ "standby", "Standby" ], + [ "comments", "Comments" ], + ], +diff --git a/cts/scheduler/dot/failed-probe-clone.dot b/cts/scheduler/dot/failed-probe-clone.dot +new file mode 100644 +index 0000000000..90536b46ed +--- /dev/null ++++ b/cts/scheduler/dot/failed-probe-clone.dot +@@ -0,0 +1,30 @@ ++ digraph "g" { ++"ping-1_clear_failcount_0 cluster01" [ style=bold color="green" fontcolor="black"] ++"ping-1_clear_failcount_0 cluster02" [ style=bold color="green" fontcolor="black"] ++"ping-2-clone_running_0" [ style=bold color="green" fontcolor="orange"] ++"ping-2-clone_start_0" -> "ping-2-clone_running_0" [ style = bold] ++"ping-2-clone_start_0" -> "ping-2_start_0 cluster02" [ style = bold] ++"ping-2-clone_start_0" [ style=bold color="green" fontcolor="orange"] ++"ping-2_clear_failcount_0 cluster01" [ style=bold color="green" fontcolor="black"] ++"ping-2_clear_failcount_0 cluster02" [ style=bold color="green" fontcolor="black"] ++"ping-2_monitor_10000 cluster02" [ style=bold color="green" fontcolor="black"] ++"ping-2_start_0 cluster02" -> "ping-2-clone_running_0" [ style = bold] ++"ping-2_start_0 cluster02" -> "ping-2_monitor_10000 cluster02" [ style = bold] ++"ping-2_start_0 cluster02" [ style=bold color="green" fontcolor="black"] ++"ping-3-clone_running_0" [ style=dashed color="red" fontcolor="orange"] ++"ping-3-clone_start_0" -> "ping-3-clone_running_0" [ style = dashed] ++"ping-3-clone_start_0" -> "ping-3_start_0 " [ style = dashed] ++"ping-3-clone_start_0" [ style=dashed color="red" fontcolor="orange"] ++"ping-3-clone_stop_0" -> "ping-3-clone_stopped_0" [ style = bold] ++"ping-3-clone_stop_0" -> "ping-3_stop_0 cluster01" [ style = bold] ++"ping-3-clone_stop_0" [ style=bold color="green" fontcolor="orange"] ++"ping-3-clone_stopped_0" -> "ping-3-clone_start_0" [ style = dashed] ++"ping-3-clone_stopped_0" [ style=bold color="green" fontcolor="orange"] ++"ping-3_clear_failcount_0 cluster01" [ style=bold color="green" 
fontcolor="black"] ++"ping-3_clear_failcount_0 cluster02" [ style=bold color="green" fontcolor="black"] ++"ping-3_start_0 " -> "ping-3-clone_running_0" [ style = dashed] ++"ping-3_start_0 " [ style=dashed color="red" fontcolor="black"] ++"ping-3_stop_0 cluster01" -> "ping-3-clone_stopped_0" [ style = bold] ++"ping-3_stop_0 cluster01" -> "ping-3_start_0 " [ style = dashed] ++"ping-3_stop_0 cluster01" [ style=bold color="green" fontcolor="black"] ++} +diff --git a/cts/scheduler/dot/failed-probe-primitive.dot b/cts/scheduler/dot/failed-probe-primitive.dot +new file mode 100644 +index 0000000000..6e0c83216a +--- /dev/null ++++ b/cts/scheduler/dot/failed-probe-primitive.dot +@@ -0,0 +1,4 @@ ++ digraph "g" { ++"dummy-2_start_0 cluster02" [ style=bold color="green" fontcolor="black"] ++"dummy-3_stop_0 cluster01" [ style=bold color="green" fontcolor="black"] ++} +diff --git a/cts/scheduler/exp/failed-probe-clone.exp b/cts/scheduler/exp/failed-probe-clone.exp +new file mode 100644 +index 0000000000..6be18935bf +--- /dev/null ++++ b/cts/scheduler/exp/failed-probe-clone.exp +@@ -0,0 +1,141 @@ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ +diff --git a/cts/scheduler/exp/failed-probe-primitive.exp b/cts/scheduler/exp/failed-probe-primitive.exp +new file mode 100644 +index 0000000000..d0d8aa44dc +--- /dev/null ++++ b/cts/scheduler/exp/failed-probe-primitive.exp +@@ -0,0 +1,20 @@ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ +diff --git a/cts/scheduler/scores/failed-probe-clone.scores b/cts/scheduler/scores/failed-probe-clone.scores +new file mode 100644 +index 0000000000..7418b7f153 +--- 
/dev/null ++++ b/cts/scheduler/scores/failed-probe-clone.scores +@@ -0,0 +1,33 @@ ++ ++pcmk__clone_allocate: ping-1-clone allocation score on cluster01: -INFINITY ++pcmk__clone_allocate: ping-1-clone allocation score on cluster02: -INFINITY ++pcmk__clone_allocate: ping-1:0 allocation score on cluster01: -INFINITY ++pcmk__clone_allocate: ping-1:0 allocation score on cluster02: -INFINITY ++pcmk__clone_allocate: ping-1:1 allocation score on cluster01: -INFINITY ++pcmk__clone_allocate: ping-1:1 allocation score on cluster02: -INFINITY ++pcmk__clone_allocate: ping-2-clone allocation score on cluster01: -INFINITY ++pcmk__clone_allocate: ping-2-clone allocation score on cluster02: 0 ++pcmk__clone_allocate: ping-2:0 allocation score on cluster01: -INFINITY ++pcmk__clone_allocate: ping-2:0 allocation score on cluster02: 0 ++pcmk__clone_allocate: ping-2:1 allocation score on cluster01: -INFINITY ++pcmk__clone_allocate: ping-2:1 allocation score on cluster02: 0 ++pcmk__clone_allocate: ping-3-clone allocation score on cluster01: -INFINITY ++pcmk__clone_allocate: ping-3-clone allocation score on cluster02: -INFINITY ++pcmk__clone_allocate: ping-3:0 allocation score on cluster01: -INFINITY ++pcmk__clone_allocate: ping-3:0 allocation score on cluster02: -INFINITY ++pcmk__clone_allocate: ping-3:1 allocation score on cluster01: -INFINITY ++pcmk__clone_allocate: ping-3:1 allocation score on cluster02: -INFINITY ++pcmk__native_allocate: Fencing allocation score on cluster01: 0 ++pcmk__native_allocate: Fencing allocation score on cluster02: 0 ++pcmk__native_allocate: ping-1:0 allocation score on cluster01: -INFINITY ++pcmk__native_allocate: ping-1:0 allocation score on cluster02: -INFINITY ++pcmk__native_allocate: ping-1:1 allocation score on cluster01: -INFINITY ++pcmk__native_allocate: ping-1:1 allocation score on cluster02: -INFINITY ++pcmk__native_allocate: ping-2:0 allocation score on cluster01: -INFINITY ++pcmk__native_allocate: ping-2:0 allocation score on cluster02: 0 
++pcmk__native_allocate: ping-2:1 allocation score on cluster01: -INFINITY ++pcmk__native_allocate: ping-2:1 allocation score on cluster02: -INFINITY ++pcmk__native_allocate: ping-3:0 allocation score on cluster01: -INFINITY ++pcmk__native_allocate: ping-3:0 allocation score on cluster02: -INFINITY ++pcmk__native_allocate: ping-3:1 allocation score on cluster01: -INFINITY ++pcmk__native_allocate: ping-3:1 allocation score on cluster02: -INFINITY +diff --git a/cts/scheduler/scores/failed-probe-primitive.scores b/cts/scheduler/scores/failed-probe-primitive.scores +new file mode 100644 +index 0000000000..f313029451 +--- /dev/null ++++ b/cts/scheduler/scores/failed-probe-primitive.scores +@@ -0,0 +1,9 @@ ++ ++pcmk__native_allocate: Fencing allocation score on cluster01: 0 ++pcmk__native_allocate: Fencing allocation score on cluster02: 0 ++pcmk__native_allocate: dummy-1 allocation score on cluster01: -INFINITY ++pcmk__native_allocate: dummy-1 allocation score on cluster02: -INFINITY ++pcmk__native_allocate: dummy-2 allocation score on cluster01: -INFINITY ++pcmk__native_allocate: dummy-2 allocation score on cluster02: 0 ++pcmk__native_allocate: dummy-3 allocation score on cluster01: -INFINITY ++pcmk__native_allocate: dummy-3 allocation score on cluster02: -INFINITY +diff --git a/cts/scheduler/summary/failed-probe-clone.summary b/cts/scheduler/summary/failed-probe-clone.summary +new file mode 100644 +index 0000000000..ca15c302aa +--- /dev/null ++++ b/cts/scheduler/summary/failed-probe-clone.summary +@@ -0,0 +1,46 @@ ++Current cluster status: ++ * Node List: ++ * Online: [ cluster01 cluster02 ] ++ ++ * Full List of Resources: ++ * Fencing (stonith:fence_xvm): Started cluster01 ++ * Clone Set: ping-1-clone [ping-1]: ++ * Stopped: [ cluster01 cluster02 ] ++ * Clone Set: ping-2-clone [ping-2]: ++ * Stopped: [ cluster01 cluster02 ] ++ * Clone Set: ping-3-clone [ping-3]: ++ * ping-3 (ocf:pacemaker:ping): FAILED cluster01 ++ * Stopped: [ cluster02 ] ++ ++Transition Summary: ++ 
* Start ping-2:0 ( cluster02 ) ++ * Stop ping-3:0 ( cluster01 ) due to node availability ++ ++Executing Cluster Transition: ++ * Cluster action: clear_failcount for ping-1 on cluster02 ++ * Cluster action: clear_failcount for ping-1 on cluster01 ++ * Cluster action: clear_failcount for ping-2 on cluster02 ++ * Cluster action: clear_failcount for ping-2 on cluster01 ++ * Pseudo action: ping-2-clone_start_0 ++ * Cluster action: clear_failcount for ping-3 on cluster01 ++ * Cluster action: clear_failcount for ping-3 on cluster02 ++ * Pseudo action: ping-3-clone_stop_0 ++ * Resource action: ping-2 start on cluster02 ++ * Pseudo action: ping-2-clone_running_0 ++ * Resource action: ping-3 stop on cluster01 ++ * Pseudo action: ping-3-clone_stopped_0 ++ * Resource action: ping-2 monitor=10000 on cluster02 ++ ++Revised Cluster Status: ++ * Node List: ++ * Online: [ cluster01 cluster02 ] ++ ++ * Full List of Resources: ++ * Fencing (stonith:fence_xvm): Started cluster01 ++ * Clone Set: ping-1-clone [ping-1]: ++ * Stopped: [ cluster01 cluster02 ] ++ * Clone Set: ping-2-clone [ping-2]: ++ * Started: [ cluster02 ] ++ * Stopped: [ cluster01 ] ++ * Clone Set: ping-3-clone [ping-3]: ++ * Stopped: [ cluster01 cluster02 ] +diff --git a/cts/scheduler/summary/failed-probe-primitive.summary b/cts/scheduler/summary/failed-probe-primitive.summary +new file mode 100644 +index 0000000000..a634e7f00b +--- /dev/null ++++ b/cts/scheduler/summary/failed-probe-primitive.summary +@@ -0,0 +1,27 @@ ++Current cluster status: ++ * Node List: ++ * Online: [ cluster01 cluster02 ] ++ ++ * Full List of Resources: ++ * Fencing (stonith:fence_xvm): Started cluster01 ++ * dummy-1 (ocf:pacemaker:Dummy): Stopped ++ * dummy-2 (ocf:pacemaker:Dummy): Stopped ++ * dummy-3 (ocf:pacemaker:Dummy): FAILED cluster01 ++ ++Transition Summary: ++ * Start dummy-2 ( cluster02 ) ++ * Stop dummy-3 ( cluster01 ) due to node availability ++ ++Executing Cluster Transition: ++ * Resource action: dummy-2 start on cluster02 ++ * 
Resource action: dummy-3 stop on cluster01 ++ ++Revised Cluster Status: ++ * Node List: ++ * Online: [ cluster01 cluster02 ] ++ ++ * Full List of Resources: ++ * Fencing (stonith:fence_xvm): Started cluster01 ++ * dummy-1 (ocf:pacemaker:Dummy): Stopped ++ * dummy-2 (ocf:pacemaker:Dummy): Started cluster02 ++ * dummy-3 (ocf:pacemaker:Dummy): Stopped +diff --git a/cts/scheduler/xml/failed-probe-clone.xml b/cts/scheduler/xml/failed-probe-clone.xml +new file mode 100644 +index 0000000000..f677585bab +--- /dev/null ++++ b/cts/scheduler/xml/failed-probe-clone.xml +@@ -0,0 +1,110 @@ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ +diff --git a/cts/scheduler/xml/failed-probe-primitive.xml b/cts/scheduler/xml/failed-probe-primitive.xml +new file mode 100644 +index 0000000000..0c2f6416f5 +--- /dev/null ++++ b/cts/scheduler/xml/failed-probe-primitive.xml +@@ -0,0 +1,71 @@ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ +-- +2.27.0 + + +From 271d50e7d6b0ee5ef670b571c6d7aae9272b75ad Mon Sep 17 00:00:00 2001 +From: Chris Lumens +Date: Thu, 11 Nov 2021 13:57:05 -0500 +Subject: [PATCH 13/21] Feature: scheduler: Don't output failed resource + probes... + +in the crm_mon "Failed Resource Actions" section. It is expected that +these one-off probes will fail, in which case displaying them in that +section can just come across as confusing to the user. + +And update the crm_mon test output to account for these changes. 
+ +See: rhbz#1506372 +--- + cts/cli/regression.crm_mon.exp | 20 -------------------- + lib/pengine/pe_output.c | 4 ++++ + 2 files changed, 4 insertions(+), 20 deletions(-) + +diff --git a/cts/cli/regression.crm_mon.exp b/cts/cli/regression.crm_mon.exp +index d7b9d98e2c..b1643f8b29 100644 +--- a/cts/cli/regression.crm_mon.exp ++++ b/cts/cli/regression.crm_mon.exp +@@ -3498,10 +3498,6 @@ Active Resources: + + Failed Resource Actions: + * dummy-2_monitor_0 on cluster02 'unimplemented feature' (3): call=2, status='complete', last-rc-change='Wed Sep 2 12:17:38 2020', queued=0ms, exec=33ms +- * dummy-4_monitor_0 on cluster02 'not installed' (5): call=2, status='complete', last-rc-change='Wed Sep 2 12:17:38 2020', queued=0ms, exec=0ms +- * smart-mon_monitor_0 on cluster02 'not installed' (5): call=9, status='complete', last-rc-change='Tue Nov 9 15:38:55 2021', queued=0ms, exec=33ms +- * ping_monitor_0 on cluster02 'not installed' (5): call=6, status='complete', last-rc-change='Thu Nov 18 13:11:42 2021', queued=0ms, exec=0ms +- * httpd_monitor_0 on httpd-bundle-1 'invalid parameter' (2): call=1, status='complete', last-rc-change='Wed May 27 15:43:09 2020', queued=0ms, exec=0ms + =#=#=#= End test: Text output of partially active resources - OK (0) =#=#=#= + * Passed: crm_mon - Text output of partially active resources + =#=#=#= Begin test: XML output of partially active resources =#=#=#= +@@ -3646,10 +3642,6 @@ Failed Resource Actions: + + + +- +- +- +- + + + +@@ -3693,10 +3685,6 @@ Full List of Resources: + + Failed Resource Actions: + * dummy-2_monitor_0 on cluster02 'unimplemented feature' (3): call=2, status='complete', last-rc-change='Wed Sep 2 12:17:38 2020', queued=0ms, exec=33ms +- * dummy-4_monitor_0 on cluster02 'not installed' (5): call=2, status='complete', last-rc-change='Wed Sep 2 12:17:38 2020', queued=0ms, exec=0ms +- * smart-mon_monitor_0 on cluster02 'not installed' (5): call=9, status='complete', last-rc-change='Tue Nov 9 15:38:55 2021', queued=0ms, 
exec=33ms +- * ping_monitor_0 on cluster02 'not installed' (5): call=6, status='complete', last-rc-change='Thu Nov 18 13:11:42 2021', queued=0ms, exec=0ms +- * httpd_monitor_0 on httpd-bundle-1 'invalid parameter' (2): call=1, status='complete', last-rc-change='Wed May 27 15:43:09 2020', queued=0ms, exec=0ms + =#=#=#= End test: Text output of partially active resources, with inactive resources - OK (0) =#=#=#= + * Passed: crm_mon - Text output of partially active resources, with inactive resources + =#=#=#= Begin test: Complete brief text output, with inactive resources =#=#=#= +@@ -3784,10 +3772,6 @@ Operations: + + Failed Resource Actions: + * dummy-2_monitor_0 on cluster02 'unimplemented feature' (3): call=2, status='complete', last-rc-change='Wed Sep 2 12:17:38 2020', queued=0ms, exec=33ms +- * dummy-4_monitor_0 on cluster02 'not installed' (5): call=2, status='complete', last-rc-change='Wed Sep 2 12:17:38 2020', queued=0ms, exec=0ms +- * smart-mon_monitor_0 on cluster02 'not installed' (5): call=9, status='complete', last-rc-change='Tue Nov 9 15:38:55 2021', queued=0ms, exec=33ms +- * ping_monitor_0 on cluster02 'not installed' (5): call=6, status='complete', last-rc-change='Thu Nov 18 13:11:42 2021', queued=0ms, exec=0ms +- * httpd_monitor_0 on httpd-bundle-1 'invalid parameter' (2): call=1, status='complete', last-rc-change='Wed May 27 15:43:09 2020', queued=0ms, exec=0ms + =#=#=#= End test: Complete brief text output, with inactive resources - OK (0) =#=#=#= + * Passed: crm_mon - Complete brief text output, with inactive resources + =#=#=#= Begin test: Text output of partially active group =#=#=#= +@@ -3959,10 +3943,6 @@ Operations: + + Failed Resource Actions: + * dummy-2_monitor_0 on cluster02 'unimplemented feature' (3): call=2, status='complete', last-rc-change='Wed Sep 2 12:17:38 2020', queued=0ms, exec=33ms +- * dummy-4_monitor_0 on cluster02 'not installed' (5): call=2, status='complete', last-rc-change='Wed Sep 2 12:17:38 2020', queued=0ms, exec=0ms 
+- * smart-mon_monitor_0 on cluster02 'not installed' (5): call=9, status='complete', last-rc-change='Tue Nov 9 15:38:55 2021', queued=0ms, exec=33ms +- * ping_monitor_0 on cluster02 'not installed' (5): call=6, status='complete', last-rc-change='Thu Nov 18 13:11:42 2021', queued=0ms, exec=0ms +- * httpd_monitor_0 on httpd-bundle-1 'invalid parameter' (2): call=1, status='complete', last-rc-change='Wed May 27 15:43:09 2020', queued=0ms, exec=0ms + =#=#=#= End test: Complete brief text output grouped by node, with inactive resources - OK (0) =#=#=#= + * Passed: crm_mon - Complete brief text output grouped by node, with inactive resources + =#=#=#= Begin test: Text output of partially active resources, with inactive resources, filtered by node =#=#=#= +diff --git a/lib/pengine/pe_output.c b/lib/pengine/pe_output.c +index 715e001d51..84684598dd 100644 +--- a/lib/pengine/pe_output.c ++++ b/lib/pengine/pe_output.c +@@ -1370,6 +1370,10 @@ failed_action_list(pcmk__output_t *out, va_list args) { + continue; + } + ++ if (pcmk_xe_mask_probe_failure(xml_op)) { ++ continue; ++ } ++ + id = crm_element_value(xml_op, XML_LRM_ATTR_TASK_KEY); + if (parse_op_key(id ? id : ID(xml_op), &rsc, NULL, NULL) == FALSE) { + continue; +-- +2.27.0 + + +From 90f641b9223c64701d494297ce3dd3382365acb8 Mon Sep 17 00:00:00 2001 +From: Chris Lumens +Date: Tue, 9 Nov 2021 10:11:19 -0500 +Subject: [PATCH 14/21] Feature: scheduler: Add a function for finding a failed + probe action... + +for a given resource ID. Optionally, a node ID can also be given to +restrict the failed probe action to one run on the given node. +Otherwise, just the first failed probe action for the resource ID will +be returned. 
+ +See: rhbz#1506372 +--- + include/crm/pengine/internal.h | 2 ++ + lib/pengine/utils.c | 42 ++++++++++++++++++++++++++++++++++ + 2 files changed, 44 insertions(+) + +diff --git a/include/crm/pengine/internal.h b/include/crm/pengine/internal.h +index 8c8fbaca90..58dd2e8727 100644 +--- a/include/crm/pengine/internal.h ++++ b/include/crm/pengine/internal.h +@@ -574,4 +574,6 @@ gboolean pe__clone_is_filtered(pe_resource_t *rsc, GList *only_rsc, gboolean che + gboolean pe__group_is_filtered(pe_resource_t *rsc, GList *only_rsc, gboolean check_parent); + gboolean pe__native_is_filtered(pe_resource_t *rsc, GList *only_rsc, gboolean check_parent); + ++xmlNode *pe__failed_probe_for_rsc(pe_resource_t *rsc, const char *name); ++ + #endif +diff --git a/lib/pengine/utils.c b/lib/pengine/utils.c +index 07753e173a..3151f0120b 100644 +--- a/lib/pengine/utils.c ++++ b/lib/pengine/utils.c +@@ -2569,3 +2569,45 @@ pe__build_rsc_list(pe_working_set_t *data_set, const char *s) { + + return resources; + } ++ ++xmlNode * ++pe__failed_probe_for_rsc(pe_resource_t *rsc, const char *name) ++{ ++ const char *rsc_id = rsc->id; ++ ++ for (xmlNode *xml_op = pcmk__xml_first_child(rsc->cluster->failed); xml_op != NULL; ++ xml_op = pcmk__xml_next(xml_op)) { ++ const char *value = NULL; ++ char *op_id = NULL; ++ ++ /* This resource operation is not a failed probe. */ ++ if (!pcmk_xe_mask_probe_failure(xml_op)) { ++ continue; ++ } ++ ++ /* This resource operation was not run on the given node. Note that if name is ++ * NULL, this will always succeed. ++ */ ++ value = crm_element_value(xml_op, XML_LRM_ATTR_TARGET); ++ if (value == NULL || !pcmk__str_eq(value, name, pcmk__str_casei|pcmk__str_null_matches)) { ++ continue; ++ } ++ ++ /* This resource operation has no operation_key. */ ++ value = crm_element_value(xml_op, XML_LRM_ATTR_TASK_KEY); ++ if (!parse_op_key(value ? 
value : ID(xml_op), &op_id, NULL, NULL)) { ++ continue; ++ } ++ ++ /* This resource operation's ID does not match the rsc_id we are looking for. */ ++ if (!pcmk__str_eq(op_id, rsc_id, pcmk__str_none)) { ++ free(op_id); ++ continue; ++ } ++ ++ free(op_id); ++ return xml_op; ++ } ++ ++ return NULL; ++} +-- +2.27.0 + + +From 2ad9774fe994554243078b131799fed0d1a6dffd Mon Sep 17 00:00:00 2001 +From: Chris Lumens +Date: Tue, 9 Nov 2021 15:43:24 -0500 +Subject: [PATCH 15/21] Feature: scheduler: Display the reason why a native rsc + probe failed. + +If inactive resources are being shown, add an extra blurb of text to any +stopped resources that have a failed probe action indicating why the +probe failed. + +And then add a new primitive resource to crm_mon-partial.xml with a +failed probe operation and update the expected test output. + +See: rhbz#1506372 +--- + cts/cli/regression.crm_mon.exp | 10 +++++----- + cts/scheduler/summary/failed-probe-primitive.summary | 8 ++++---- + cts/scheduler/summary/multiply-active-stonith.summary | 2 +- + lib/pengine/native.c | 11 +++++++++++ + 4 files changed, 21 insertions(+), 10 deletions(-) + +diff --git a/cts/cli/regression.crm_mon.exp b/cts/cli/regression.crm_mon.exp +index b1643f8b29..4333caa11c 100644 +--- a/cts/cli/regression.crm_mon.exp ++++ b/cts/cli/regression.crm_mon.exp +@@ -3680,8 +3680,8 @@ Full List of Resources: + * dummy-1 (ocf:pacemaker:Dummy): Started cluster02 + * dummy-2 (ocf:pacemaker:Dummy): FAILED cluster02 + * dummy-3 (ocf:pacemaker:Dummy): Stopped (disabled) +- * dummy-4 (ocf:pacemaker:Dummy): Stopped +- * smart-mon (ocf:pacemaker:HealthSMART): Stopped ++ * dummy-4 (ocf:pacemaker:Dummy): Stopped (not installed) ++ * smart-mon (ocf:pacemaker:HealthSMART): Stopped (not installed) + + Failed Resource Actions: + * dummy-2_monitor_0 on cluster02 'unimplemented feature' (3): call=2, status='complete', last-rc-change='Wed Sep 2 12:17:38 2020', queued=0ms, exec=33ms +@@ -3811,7 +3811,7 @@ Full List of Resources: + * 
dummy-1 (ocf:pacemaker:Dummy): Started cluster02 + * dummy-2 (ocf:pacemaker:Dummy): FAILED cluster02 + * dummy-3 (ocf:pacemaker:Dummy): Stopped (disabled) +- * dummy-4 (ocf:pacemaker:Dummy): Stopped ++ * dummy-4 (ocf:pacemaker:Dummy): Stopped (not installed) + =#=#=#= End test: Text output of partially active group, with inactive resources - OK (0) =#=#=#= + * Passed: crm_mon - Text output of partially active group, with inactive resources + =#=#=#= Begin test: Text output of active member of partially active group =#=#=#= +@@ -3889,7 +3889,7 @@ Inactive Resources: + * ping (ocf:pacemaker:ping): Stopped + * Resource Group: partially-active-group: + * 2/4 (ocf:pacemaker:Dummy): Active cluster02 +- * smart-mon (ocf:pacemaker:HealthSMART): Stopped ++ * smart-mon (ocf:pacemaker:HealthSMART): Stopped (not installed) + + Node Attributes: + * Node: cluster01 (1): +@@ -3963,7 +3963,7 @@ Full List of Resources: + * Fencing (stonith:fence_xvm): Started cluster01 + * Container bundle set: httpd-bundle [pcmk:http]: + * httpd-bundle-1 (192.168.122.132) (ocf:heartbeat:apache): FAILED cluster01 +- * smart-mon (ocf:pacemaker:HealthSMART): Stopped ++ * smart-mon (ocf:pacemaker:HealthSMART): Stopped (not installed) + =#=#=#= End test: Text output of partially active resources, with inactive resources, filtered by node - OK (0) =#=#=#= + * Passed: crm_mon - Text output of partially active resources, with inactive resources, filtered by node + =#=#=#= Begin test: Text output of partially active resources, filtered by node =#=#=#= +diff --git a/cts/scheduler/summary/failed-probe-primitive.summary b/cts/scheduler/summary/failed-probe-primitive.summary +index a634e7f00b..ea8edae494 100644 +--- a/cts/scheduler/summary/failed-probe-primitive.summary ++++ b/cts/scheduler/summary/failed-probe-primitive.summary +@@ -4,8 +4,8 @@ Current cluster status: + + * Full List of Resources: + * Fencing (stonith:fence_xvm): Started cluster01 +- * dummy-1 (ocf:pacemaker:Dummy): Stopped +- * dummy-2 
(ocf:pacemaker:Dummy): Stopped ++ * dummy-1 (ocf:pacemaker:Dummy): Stopped (not installed) ++ * dummy-2 (ocf:pacemaker:Dummy): Stopped (not installed) + * dummy-3 (ocf:pacemaker:Dummy): FAILED cluster01 + + Transition Summary: +@@ -22,6 +22,6 @@ Revised Cluster Status: + + * Full List of Resources: + * Fencing (stonith:fence_xvm): Started cluster01 +- * dummy-1 (ocf:pacemaker:Dummy): Stopped ++ * dummy-1 (ocf:pacemaker:Dummy): Stopped (not installed) + * dummy-2 (ocf:pacemaker:Dummy): Started cluster02 +- * dummy-3 (ocf:pacemaker:Dummy): Stopped ++ * dummy-3 (ocf:pacemaker:Dummy): Stopped (not installed) +diff --git a/cts/scheduler/summary/multiply-active-stonith.summary b/cts/scheduler/summary/multiply-active-stonith.summary +index 8ce21d68ee..ec37de03b0 100644 +--- a/cts/scheduler/summary/multiply-active-stonith.summary ++++ b/cts/scheduler/summary/multiply-active-stonith.summary +@@ -25,4 +25,4 @@ Revised Cluster Status: + + * Full List of Resources: + * fencer (stonith:fence_ipmilan): Started node3 +- * rsc1 (lsb:rsc1): Stopped ++ * rsc1 (lsb:rsc1): Stopped (not installed) +diff --git a/lib/pengine/native.c b/lib/pengine/native.c +index 36121c527f..a95c90c09a 100644 +--- a/lib/pengine/native.c ++++ b/lib/pengine/native.c +@@ -599,6 +599,17 @@ pcmk__native_output_string(pe_resource_t *rsc, const char *name, pe_node_t *node + g_string_append_printf(outstr, " %s", node->details->uname); + } + ++ // Failed probe operation ++ if (native_displayable_role(rsc) == RSC_ROLE_STOPPED) { ++ xmlNode *probe_op = pe__failed_probe_for_rsc(rsc, node ? 
node->details->uname : NULL); ++ if (probe_op != NULL) { ++ int rc; ++ ++ pcmk__scan_min_int(crm_element_value(probe_op, XML_LRM_ATTR_RC), &rc, 0); ++ g_string_append_printf(outstr, " (%s) ", services_ocf_exitcode_str(rc)); ++ } ++ } ++ + // Flags, as: ( [...]) + if (node && !(node->details->online) && node->details->unclean) { + have_flags = add_output_flag(outstr, "UNCLEAN", have_flags); +-- +2.27.0 + + +From b9ca2e834ee01b35c03f153438ef8828b609fb38 Mon Sep 17 00:00:00 2001 +From: Chris Lumens +Date: Thu, 18 Nov 2021 10:41:42 -0500 +Subject: [PATCH 16/21] Refactor: scheduler: Rearrange pe__clone_default. + +Instead of the single stopped list, maintain a hash table where the keys +are nodes and the values are the status of the node. For now, this is +just "Stopped" or "Stopped (disabled)" but in the future will be +expanded to cover failed probe operations. +--- + lib/pengine/clone.c | 103 +++++++++++++++++++++++++++++++++++--------- + 1 file changed, 82 insertions(+), 21 deletions(-) + +diff --git a/lib/pengine/clone.c b/lib/pengine/clone.c +index 5569c6b6e9..58fb24d24e 100644 +--- a/lib/pengine/clone.c ++++ b/lib/pengine/clone.c +@@ -28,6 +28,55 @@ + #define UNPROMOTED_INSTANCES RSC_ROLE_UNPROMOTED_S + #endif + ++static GList * ++sorted_hash_table_values(GHashTable *table) ++{ ++ GList *retval = NULL; ++ GHashTableIter iter; ++ gpointer key, value; ++ ++ g_hash_table_iter_init(&iter, table); ++ while (g_hash_table_iter_next(&iter, &key, &value)) { ++ if (!g_list_find_custom(retval, value, (GCompareFunc) strcmp)) { ++ retval = g_list_prepend(retval, (char *) value); ++ } ++ } ++ ++ retval = g_list_sort(retval, (GCompareFunc) strcmp); ++ return retval; ++} ++ ++static GList * ++nodes_with_status(GHashTable *table, const char *status) ++{ ++ GList *retval = NULL; ++ GHashTableIter iter; ++ gpointer key, value; ++ ++ g_hash_table_iter_init(&iter, table); ++ while (g_hash_table_iter_next(&iter, &key, &value)) { ++ if (!strcmp((char *) value, status)) { ++ retval = 
g_list_prepend(retval, key); ++ } ++ } ++ ++ retval = g_list_sort(retval, (GCompareFunc) pcmk__numeric_strcasecmp); ++ return retval; ++} ++ ++static char * ++node_list_to_str(GList *list) ++{ ++ char *retval = NULL; ++ size_t len = 0; ++ ++ for (GList *iter = list; iter != NULL; iter = iter->next) { ++ pcmk__add_word(&retval, &len, (char *) iter->data); ++ } ++ ++ return retval; ++} ++ + static void + clone_header(pcmk__output_t *out, int *rc, pe_resource_t *rsc, clone_variant_data_t *clone_data) + { +@@ -710,10 +759,10 @@ pe__clone_default(pcmk__output_t *out, va_list args) + GList *only_node = va_arg(args, GList *); + GList *only_rsc = va_arg(args, GList *); + ++ GHashTable *stopped = pcmk__strkey_table(free, free); ++ + char *list_text = NULL; +- char *stopped_list = NULL; + size_t list_text_len = 0; +- size_t stopped_list_len = 0; + + GList *promoted_list = NULL; + GList *started_list = NULL; +@@ -768,7 +817,7 @@ pe__clone_default(pcmk__output_t *out, va_list args) + // List stopped instances when requested (except orphans) + if (!pcmk_is_set(child_rsc->flags, pe_rsc_orphan) + && pcmk_is_set(show_opts, pcmk_show_inactive_rscs)) { +- pcmk__add_word(&stopped_list, &stopped_list_len, child_rsc->id); ++ g_hash_table_insert(stopped, strdup(child_rsc->id), strdup("Stopped")); + } + + } else if (is_set_recursive(child_rsc, pe_rsc_orphan, TRUE) +@@ -822,7 +871,7 @@ pe__clone_default(pcmk__output_t *out, va_list args) + } + + if (pcmk_is_set(show_opts, pcmk_show_clone_detail)) { +- free(stopped_list); ++ g_hash_table_destroy(stopped); + PCMK__OUTPUT_LIST_FOOTER(out, rc); + return pcmk_rc_ok; + } +@@ -890,23 +939,15 @@ pe__clone_default(pcmk__output_t *out, va_list args) + } + + if (pcmk_is_set(show_opts, pcmk_show_inactive_rscs)) { +- const char *state = "Stopped"; +- enum rsc_role_e role = configured_role(rsc); +- +- if (role == RSC_ROLE_STOPPED) { +- state = "Stopped (disabled)"; +- } +- + if (!pcmk_is_set(rsc->flags, pe_rsc_unique) + && (clone_data->clone_max > 
active_instances)) { + + GList *nIter; + GList *list = g_hash_table_get_values(rsc->allowed_nodes); + +- /* Custom stopped list for non-unique clones */ +- free(stopped_list); +- stopped_list = NULL; +- stopped_list_len = 0; ++ /* Custom stopped table for non-unique clones */ ++ g_hash_table_destroy(stopped); ++ stopped = pcmk__strkey_table(free, free); + + if (list == NULL) { + /* Clusters with symmetrical=false haven't calculated allowed_nodes yet +@@ -922,19 +963,39 @@ pe__clone_default(pcmk__output_t *out, va_list args) + if (pe_find_node(rsc->running_on, node->details->uname) == NULL && + pcmk__str_in_list(node->details->uname, only_node, + pcmk__str_star_matches|pcmk__str_casei)) { +- pcmk__add_word(&stopped_list, &stopped_list_len, +- node->details->uname); ++ const char *state = "Stopped"; ++ ++ if (configured_role(rsc) == RSC_ROLE_STOPPED) { ++ state = "Stopped (disabled)"; ++ } ++ ++ g_hash_table_insert(stopped, strdup(node->details->uname), ++ strdup(state)); + } + } + g_list_free(list); + } + +- if (stopped_list != NULL) { ++ if (g_hash_table_size(stopped) > 0) { ++ GList *list = sorted_hash_table_values(stopped); ++ + clone_header(out, &rc, rsc, clone_data); + +- out->list_item(out, NULL, "%s: [ %s ]", state, stopped_list); +- free(stopped_list); +- stopped_list_len = 0; ++ for (GList *status_iter = list; status_iter != NULL; status_iter = status_iter->next) { ++ const char *status = status_iter->data; ++ GList *nodes = nodes_with_status(stopped, status); ++ char *str = node_list_to_str(nodes); ++ ++ if (str != NULL) { ++ out->list_item(out, NULL, "%s: [ %s ]", status, str); ++ free(str); ++ } ++ ++ g_list_free(nodes); ++ } ++ ++ g_list_free(list); ++ g_hash_table_destroy(stopped); + + /* If there are no instances of this clone (perhaps because there are no + * nodes configured), simply output the clone header by itself. 
This can +-- +2.27.0 + + +From 0228a64cea412936fb8ee91b0f83f9800048d3ba Mon Sep 17 00:00:00 2001 +From: Chris Lumens +Date: Fri, 19 Nov 2021 10:06:18 -0500 +Subject: [PATCH 17/21] Feature: scheduler: Display the reason why a clone rsc + probe failed. + +This is similar to the previous commit that adds reasons for primitive +resources. + +See: rhbz#1506372 +--- + cts/cli/regression.crm_mon.exp | 8 +++---- + .../summary/failed-probe-clone.summary | 14 +++++++------ + include/crm/pengine/internal.h | 2 ++ + lib/pengine/clone.c | 21 +++++++++++++++++-- + lib/pengine/utils.c | 7 +++++++ + 5 files changed, 40 insertions(+), 12 deletions(-) + +diff --git a/cts/cli/regression.crm_mon.exp b/cts/cli/regression.crm_mon.exp +index 4333caa11c..5688500ce5 100644 +--- a/cts/cli/regression.crm_mon.exp ++++ b/cts/cli/regression.crm_mon.exp +@@ -3479,7 +3479,7 @@ Node List: + Active Resources: + * Clone Set: ping-clone [ping]: + * ping (ocf:pacemaker:ping): Started cluster01 +- * ping (ocf:pacemaker:ping): Stopped ++ * ping (ocf:pacemaker:ping): Stopped (not installed) + * Fencing (stonith:fence_xvm): Started cluster01 + * Container bundle set: httpd-bundle [pcmk:http]: + * Replica[0] +@@ -3663,7 +3663,7 @@ Node List: + Full List of Resources: + * Clone Set: ping-clone [ping]: + * ping (ocf:pacemaker:ping): Started cluster01 +- * ping (ocf:pacemaker:ping): Stopped ++ * ping (ocf:pacemaker:ping): Stopped (not installed) + * Fencing (stonith:fence_xvm): Started cluster01 + * Container bundle set: httpd-bundle [pcmk:http]: + * Replica[0] +@@ -3705,7 +3705,7 @@ Full List of Resources: + * 1/1 (stonith:fence_xvm): Active cluster01 + * Clone Set: ping-clone [ping]: + * ping (ocf:pacemaker:ping): Started cluster01 +- * ping (ocf:pacemaker:ping): Stopped ++ * ping (ocf:pacemaker:ping): Stopped (not installed) + * Container bundle set: httpd-bundle [pcmk:http]: + * Replica[0] + * httpd-bundle-ip-192.168.122.131 (ocf:heartbeat:IPaddr2): Started cluster02 +@@ -3886,7 +3886,7 @@ Node List: + 
Inactive Resources: + * Clone Set: ping-clone [ping]: + * ping (ocf:pacemaker:ping): Started cluster01 +- * ping (ocf:pacemaker:ping): Stopped ++ * ping (ocf:pacemaker:ping): Stopped (not installed) + * Resource Group: partially-active-group: + * 2/4 (ocf:pacemaker:Dummy): Active cluster02 + * smart-mon (ocf:pacemaker:HealthSMART): Stopped (not installed) +diff --git a/cts/scheduler/summary/failed-probe-clone.summary b/cts/scheduler/summary/failed-probe-clone.summary +index ca15c302aa..febee14400 100644 +--- a/cts/scheduler/summary/failed-probe-clone.summary ++++ b/cts/scheduler/summary/failed-probe-clone.summary +@@ -5,12 +5,13 @@ Current cluster status: + * Full List of Resources: + * Fencing (stonith:fence_xvm): Started cluster01 + * Clone Set: ping-1-clone [ping-1]: +- * Stopped: [ cluster01 cluster02 ] ++ * Stopped (not installed): [ cluster01 cluster02 ] + * Clone Set: ping-2-clone [ping-2]: +- * Stopped: [ cluster01 cluster02 ] ++ * Stopped: [ cluster02 ] ++ * Stopped (not installed): [ cluster01 ] + * Clone Set: ping-3-clone [ping-3]: + * ping-3 (ocf:pacemaker:ping): FAILED cluster01 +- * Stopped: [ cluster02 ] ++ * Stopped (not installed): [ cluster02 ] + + Transition Summary: + * Start ping-2:0 ( cluster02 ) +@@ -38,9 +39,10 @@ Revised Cluster Status: + * Full List of Resources: + * Fencing (stonith:fence_xvm): Started cluster01 + * Clone Set: ping-1-clone [ping-1]: +- * Stopped: [ cluster01 cluster02 ] ++ * Stopped (not installed): [ cluster01 cluster02 ] + * Clone Set: ping-2-clone [ping-2]: + * Started: [ cluster02 ] +- * Stopped: [ cluster01 ] ++ * Stopped (not installed): [ cluster01 ] + * Clone Set: ping-3-clone [ping-3]: +- * Stopped: [ cluster01 cluster02 ] ++ * Stopped: [ cluster01 ] ++ * Stopped (not installed): [ cluster02 ] +diff --git a/include/crm/pengine/internal.h b/include/crm/pengine/internal.h +index 58dd2e8727..2b20da6e5f 100644 +--- a/include/crm/pengine/internal.h ++++ b/include/crm/pengine/internal.h +@@ -576,4 +576,6 @@ gboolean 
pe__native_is_filtered(pe_resource_t *rsc, GList *only_rsc, gboolean ch + + xmlNode *pe__failed_probe_for_rsc(pe_resource_t *rsc, const char *name); + ++const char *pe__clone_child_id(pe_resource_t *rsc); ++ + #endif +diff --git a/lib/pengine/clone.c b/lib/pengine/clone.c +index 58fb24d24e..ef4bdc0edf 100644 +--- a/lib/pengine/clone.c ++++ b/lib/pengine/clone.c +@@ -963,14 +963,23 @@ pe__clone_default(pcmk__output_t *out, va_list args) + if (pe_find_node(rsc->running_on, node->details->uname) == NULL && + pcmk__str_in_list(node->details->uname, only_node, + pcmk__str_star_matches|pcmk__str_casei)) { ++ xmlNode *probe_op = pe__failed_probe_for_rsc(rsc, node->details->uname); + const char *state = "Stopped"; + + if (configured_role(rsc) == RSC_ROLE_STOPPED) { + state = "Stopped (disabled)"; + } + +- g_hash_table_insert(stopped, strdup(node->details->uname), +- strdup(state)); ++ if (probe_op != NULL) { ++ int rc; ++ ++ pcmk__scan_min_int(crm_element_value(probe_op, XML_LRM_ATTR_RC), &rc, 0); ++ g_hash_table_insert(stopped, strdup(node->details->uname), ++ crm_strdup_printf("Stopped (%s)", services_ocf_exitcode_str(rc))); ++ } else { ++ g_hash_table_insert(stopped, strdup(node->details->uname), ++ strdup(state)); ++ } + } + } + g_list_free(list); +@@ -1113,3 +1122,11 @@ pe__clone_is_filtered(pe_resource_t *rsc, GList *only_rsc, gboolean check_parent + + return !passes; + } ++ ++const char * ++pe__clone_child_id(pe_resource_t *rsc) ++{ ++ clone_variant_data_t *clone_data = NULL; ++ get_clone_variant_data(clone_data, rsc); ++ return ID(clone_data->xml_obj_child); ++} +diff --git a/lib/pengine/utils.c b/lib/pengine/utils.c +index 3151f0120b..6c4f3b6971 100644 +--- a/lib/pengine/utils.c ++++ b/lib/pengine/utils.c +@@ -2573,8 +2573,15 @@ pe__build_rsc_list(pe_working_set_t *data_set, const char *s) { + xmlNode * + pe__failed_probe_for_rsc(pe_resource_t *rsc, const char *name) + { ++ pe_resource_t *parent = uber_parent(rsc); + const char *rsc_id = rsc->id; + ++ if 
(rsc->variant == pe_clone) { ++ rsc_id = pe__clone_child_id(rsc); ++ } else if (parent->variant == pe_clone) { ++ rsc_id = pe__clone_child_id(parent); ++ } ++ + for (xmlNode *xml_op = pcmk__xml_first_child(rsc->cluster->failed); xml_op != NULL; + xml_op = pcmk__xml_next(xml_op)) { + const char *value = NULL; +-- +2.27.0 + + +From cf8b01da93fce87526617fefdcee6eb9f6ecdbd1 Mon Sep 17 00:00:00 2001 +From: Chris Lumens +Date: Wed, 24 Nov 2021 10:57:05 -0500 +Subject: [PATCH 18/21] Test: cts-cli: Update the last-rc-change sed + expression. + +This can now occur in both the XML output (where it's wrapped in double +quotes) and the text output (where it's wrapped in single quotes and +followed by a comma). In addition, a plus or minus can occur in the +time string. + +The "{0,1}" syntax takes the place of a "?" for marking the optional +comma. In FreeBSD sed, "?" doesn't mean anything special. +--- + cts/cli/regression.crm_mon.exp | 12 ++++++------ + cts/cts-cli.in | 2 +- + 2 files changed, 7 insertions(+), 7 deletions(-) + +diff --git a/cts/cli/regression.crm_mon.exp b/cts/cli/regression.crm_mon.exp +index 5688500ce5..957758832d 100644 +--- a/cts/cli/regression.crm_mon.exp ++++ b/cts/cli/regression.crm_mon.exp +@@ -3497,7 +3497,7 @@ Active Resources: + * dummy-2 (ocf:pacemaker:Dummy): FAILED cluster02 + + Failed Resource Actions: +- * dummy-2_monitor_0 on cluster02 'unimplemented feature' (3): call=2, status='complete', last-rc-change='Wed Sep 2 12:17:38 2020', queued=0ms, exec=33ms ++ * dummy-2_monitor_0 on cluster02 'unimplemented feature' (3): call=2, status='complete', queued=0ms, exec=33ms + =#=#=#= End test: Text output of partially active resources - OK (0) =#=#=#= + * Passed: crm_mon - Text output of partially active resources + =#=#=#= Begin test: XML output of partially active resources =#=#=#= +@@ -3641,7 +3641,7 @@ Failed Resource Actions: + + + +- ++ + + + +@@ -3684,7 +3684,7 @@ Full List of Resources: + * smart-mon (ocf:pacemaker:HealthSMART): Stopped (not 
installed) + + Failed Resource Actions: +- * dummy-2_monitor_0 on cluster02 'unimplemented feature' (3): call=2, status='complete', last-rc-change='Wed Sep 2 12:17:38 2020', queued=0ms, exec=33ms ++ * dummy-2_monitor_0 on cluster02 'unimplemented feature' (3): call=2, status='complete', queued=0ms, exec=33ms + =#=#=#= End test: Text output of partially active resources, with inactive resources - OK (0) =#=#=#= + * Passed: crm_mon - Text output of partially active resources, with inactive resources + =#=#=#= Begin test: Complete brief text output, with inactive resources =#=#=#= +@@ -3771,7 +3771,7 @@ Operations: + * (1) probe + + Failed Resource Actions: +- * dummy-2_monitor_0 on cluster02 'unimplemented feature' (3): call=2, status='complete', last-rc-change='Wed Sep 2 12:17:38 2020', queued=0ms, exec=33ms ++ * dummy-2_monitor_0 on cluster02 'unimplemented feature' (3): call=2, status='complete', queued=0ms, exec=33ms + =#=#=#= End test: Complete brief text output, with inactive resources - OK (0) =#=#=#= + * Passed: crm_mon - Complete brief text output, with inactive resources + =#=#=#= Begin test: Text output of partially active group =#=#=#= +@@ -3850,7 +3850,7 @@ Active Resources: + * dummy-2 (ocf:pacemaker:Dummy): FAILED cluster02 + + Failed Resource Actions: +- * dummy-2_monitor_0 on cluster02 'unimplemented feature' (3): call=2, status='complete', last-rc-change='Wed Sep 2 12:17:38 2020', queued=0ms, exec=33ms ++ * dummy-2_monitor_0 on cluster02 'unimplemented feature' (3): call=2, status='complete', queued=0ms, exec=33ms + =#=#=#= End test: Text output of inactive member of partially active group - OK (0) =#=#=#= + * Passed: crm_mon - Text output of inactive member of partially active group + =#=#=#= Begin test: Complete brief text output grouped by node, with inactive resources =#=#=#= +@@ -3942,7 +3942,7 @@ Operations: + * (1) probe + + Failed Resource Actions: +- * dummy-2_monitor_0 on cluster02 'unimplemented feature' (3): call=2, status='complete', 
last-rc-change='Wed Sep 2 12:17:38 2020', queued=0ms, exec=33ms ++ * dummy-2_monitor_0 on cluster02 'unimplemented feature' (3): call=2, status='complete', queued=0ms, exec=33ms + =#=#=#= End test: Complete brief text output grouped by node, with inactive resources - OK (0) =#=#=#= + * Passed: crm_mon - Complete brief text output grouped by node, with inactive resources + =#=#=#= Begin test: Text output of partially active resources, with inactive resources, filtered by node =#=#=#= +diff --git a/cts/cts-cli.in b/cts/cts-cli.in +index 457816afab..72e9a1e912 100755 +--- a/cts/cts-cli.in ++++ b/cts/cts-cli.in +@@ -1870,7 +1870,7 @@ for t in $tests; do + -e 's/.*\(unpack_.*\)@.*\.c:[0-9][0-9]*)/\1/g' \ + -e 's/.*\(update_validation\)@.*\.c:[0-9][0-9]*)/\1/g' \ + -e 's/.*\(apply_upgrade\)@.*\.c:[0-9][0-9]*)/\1/g' \ +- -e 's/ last-rc-change=\"[A-Za-z0-9: ]*\"//'\ ++ -e "s/ last-rc-change=['\"][-+A-Za-z0-9: ]*['\"],\{0,1\}//" \ + -e 's|^/tmp/cts-cli\.validity\.bad.xml\.[^:]*:|validity.bad.xml:|'\ + -e 's/^Entity: line [0-9][0-9]*: //'\ + -e 's/\(validation ([0-9][0-9]* of \)[0-9][0-9]*\().*\)/\1X\2/' \ +-- +2.27.0 + + +From dea61f1b6507fbc978e040c1555384d8d7ffa9f3 Mon Sep 17 00:00:00 2001 +From: Chris Lumens +Date: Wed, 1 Dec 2021 16:23:14 -0500 +Subject: [PATCH 19/21] Fix: include: Bump feature set to 3.12.0. + +This is for the scheduler handling changing regarding maskable probe +failures. + +See: rhbz#1506372. 
+--- + include/crm/crm.h | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/include/crm/crm.h b/include/crm/crm.h +index 04d2324d75..16b35e9c55 100644 +--- a/include/crm/crm.h ++++ b/include/crm/crm.h +@@ -66,7 +66,7 @@ extern "C" { + * >=3.0.13: Fail counts include operation name and interval + * >=3.2.0: DC supports PCMK_EXEC_INVALID and PCMK_EXEC_NOT_CONNECTED + */ +-# define CRM_FEATURE_SET "3.11.0" ++# define CRM_FEATURE_SET "3.12.0" + + /* Pacemaker's CPG protocols use fixed-width binary fields for the sender and + * recipient of a CPG message. This imposes an arbitrary limit on cluster node +-- +2.27.0 + + +From fef2c61ef462c221809dc91467ea1e96d5478c74 Mon Sep 17 00:00:00 2001 +From: Chris Lumens +Date: Mon, 6 Dec 2021 16:42:15 -0500 +Subject: [PATCH 20/21] Feature: scheduler: Handle masked probes in the + scheduler. + +These probe operations get their rc/status codes mapped to not +running/done, but still ensures they end up in the list of failed +operations so tool output continues to display them properly. + +Note that failures on bundled resources do not get masked. + +There are no test case changes for this patch. + +See: rhbz#1506372. +--- + lib/pengine/unpack.c | 42 +++++++++++++++++++++++++++++++++++++----- + 1 file changed, 37 insertions(+), 5 deletions(-) + +diff --git a/lib/pengine/unpack.c b/lib/pengine/unpack.c +index b659f319fb..f3583e97d8 100644 +--- a/lib/pengine/unpack.c ++++ b/lib/pengine/unpack.c +@@ -3169,6 +3169,11 @@ remap_operation(xmlNode *xml_op, pe_resource_t *rsc, pe_node_t *node, + } + } + ++ if (!pe_rsc_is_bundled(rsc) && pcmk_xe_mask_probe_failure(xml_op)) { ++ *status = PCMK_EXEC_DONE; ++ *rc = PCMK_OCF_NOT_RUNNING; ++ } ++ + /* If the executor reported an operation status of anything but done or + * error, consider that final. 
But for done or error, we know better whether + * it should be treated as a failure or not, because we know the expected +@@ -3567,12 +3572,12 @@ update_resource_state(pe_resource_t * rsc, pe_node_t * node, xmlNode * xml_op, c + CRM_ASSERT(rsc); + CRM_ASSERT(xml_op); + +- if (rc == PCMK_OCF_NOT_RUNNING) { +- clear_past_failure = TRUE; +- +- } else if (rc == PCMK_OCF_NOT_INSTALLED) { ++ if (rc == PCMK_OCF_NOT_INSTALLED || (!pe_rsc_is_bundled(rsc) && pcmk_xe_mask_probe_failure(xml_op))) { + rsc->role = RSC_ROLE_STOPPED; + ++ } else if (rc == PCMK_OCF_NOT_RUNNING) { ++ clear_past_failure = TRUE; ++ + } else if (pcmk__str_eq(task, CRMD_ACTION_STATUS, pcmk__str_casei)) { + if (last_failure) { + const char *op_key = get_op_key(xml_op); +@@ -3661,8 +3666,10 @@ unpack_rsc_op(pe_resource_t *rsc, pe_node_t *node, xmlNode *xml_op, + pe_working_set_t *data_set) + { + int rc = 0; ++ int old_rc = 0; + int task_id = 0; + int target_rc = 0; ++ int old_target_rc = 0; + int status = PCMK_EXEC_UNKNOWN; + guint interval_ms = 0; + const char *task = NULL; +@@ -3671,6 +3678,7 @@ unpack_rsc_op(pe_resource_t *rsc, pe_node_t *node, xmlNode *xml_op, + bool expired = false; + pe_resource_t *parent = rsc; + enum action_fail_response failure_strategy = action_fail_recover; ++ bool maskable_probe_failure = false; + + CRM_CHECK(rsc && node && xml_op, return); + +@@ -3727,10 +3735,22 @@ unpack_rsc_op(pe_resource_t *rsc, pe_node_t *node, xmlNode *xml_op, + expired = true; + } + ++ old_rc = rc; ++ old_target_rc = target_rc; ++ + remap_operation(xml_op, rsc, node, data_set, on_fail, target_rc, + &rc, &status); + +- if (expired && (rc != target_rc)) { ++ maskable_probe_failure = !pe_rsc_is_bundled(rsc) && pcmk_xe_mask_probe_failure(xml_op); ++ ++ if (expired && maskable_probe_failure && old_rc != old_target_rc) { ++ if (rsc->role <= RSC_ROLE_STOPPED) { ++ rsc->role = RSC_ROLE_UNKNOWN; ++ } ++ ++ goto done; ++ ++ } else if (expired && (rc != target_rc)) { + const char *magic = 
crm_element_value(xml_op, XML_ATTR_TRANSITION_MAGIC); + + if (interval_ms == 0) { +@@ -3758,6 +3778,18 @@ unpack_rsc_op(pe_resource_t *rsc, pe_node_t *node, xmlNode *xml_op, + } + } + ++ if (maskable_probe_failure) { ++ crm_notice("Treating probe result '%s' for %s on %s as 'not running'", ++ services_ocf_exitcode_str(rc), rsc->id, node->details->uname); ++ update_resource_state(rsc, node, xml_op, task, target_rc, *last_failure, ++ on_fail, data_set); ++ crm_xml_add(xml_op, XML_ATTR_UNAME, node->details->uname); ++ ++ record_failed_op(xml_op, node, rsc, data_set); ++ resource_location(parent, node, -INFINITY, "masked-probe-failure", data_set); ++ goto done; ++ } ++ + switch (status) { + case PCMK_EXEC_CANCELLED: + // Should never happen +-- +2.27.0 + + +From ccff6eb60598f389008b0621447056457da79671 Mon Sep 17 00:00:00 2001 +From: Chris Lumens +Date: Tue, 4 Jan 2022 10:14:48 -0500 +Subject: [PATCH 21/21] Test: scheduler: Add tests for expired, masked probe + failures. + +dummy-1 is a stopped resource with an expired masked probe failure. +This probe should be rescheduled. dummy-2 is a started resource with an +expired masked probe failure. This probe should not be rescheduled. 
+--- + cts/cts-scheduler.in | 1 + + .../dot/expired-failed-probe-primitive.dot | 8 ++ + .../exp/expired-failed-probe-primitive.exp | 45 ++++++++++++ + .../expired-failed-probe-primitive.scores | 7 ++ + .../expired-failed-probe-primitive.summary | 26 +++++++ + .../xml/expired-failed-probe-primitive.xml | 73 +++++++++++++++++++ + 6 files changed, 160 insertions(+) + create mode 100644 cts/scheduler/dot/expired-failed-probe-primitive.dot + create mode 100644 cts/scheduler/exp/expired-failed-probe-primitive.exp + create mode 100644 cts/scheduler/scores/expired-failed-probe-primitive.scores + create mode 100644 cts/scheduler/summary/expired-failed-probe-primitive.summary + create mode 100644 cts/scheduler/xml/expired-failed-probe-primitive.xml + +diff --git a/cts/cts-scheduler.in b/cts/cts-scheduler.in +index 3abcbc6c9d..7bc41a0936 100644 +--- a/cts/cts-scheduler.in ++++ b/cts/cts-scheduler.in +@@ -115,6 +115,7 @@ TESTS = [ + [ "probe-pending-node", "Probe (pending node + unmanaged resource)" ], + [ "failed-probe-primitive", "Maskable vs. unmaskable probe failures on primitive resources" ], + [ "failed-probe-clone", "Maskable vs. 
unmaskable probe failures on cloned resources" ], ++ [ "expired-failed-probe-primitive", "Maskable, expired probe failure on primitive resources" ], + [ "standby", "Standby" ], + [ "comments", "Comments" ], + ], +diff --git a/cts/scheduler/dot/expired-failed-probe-primitive.dot b/cts/scheduler/dot/expired-failed-probe-primitive.dot +new file mode 100644 +index 0000000000..610c2b8047 +--- /dev/null ++++ b/cts/scheduler/dot/expired-failed-probe-primitive.dot +@@ -0,0 +1,8 @@ ++ digraph "g" { ++"dummy-1_monitor_0 cluster01" -> "dummy-1_start_0 cluster02" [ style = bold] ++"dummy-1_monitor_0 cluster01" [ style=bold color="green" fontcolor="black"] ++"dummy-1_monitor_0 cluster02" -> "dummy-1_start_0 cluster02" [ style = bold] ++"dummy-1_monitor_0 cluster02" [ style=bold color="green" fontcolor="black"] ++"dummy-1_start_0 cluster02" [ style=bold color="green" fontcolor="black"] ++"dummy-2_monitor_0 cluster01" [ style=bold color="green" fontcolor="black"] ++} +diff --git a/cts/scheduler/exp/expired-failed-probe-primitive.exp b/cts/scheduler/exp/expired-failed-probe-primitive.exp +new file mode 100644 +index 0000000000..3c2cbfe411 +--- /dev/null ++++ b/cts/scheduler/exp/expired-failed-probe-primitive.exp +@@ -0,0 +1,45 @@ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ +diff --git a/cts/scheduler/scores/expired-failed-probe-primitive.scores b/cts/scheduler/scores/expired-failed-probe-primitive.scores +new file mode 100644 +index 0000000000..51ae5510e6 +--- /dev/null ++++ b/cts/scheduler/scores/expired-failed-probe-primitive.scores +@@ -0,0 +1,7 @@ ++ ++pcmk__native_allocate: Fencing allocation score on cluster01: 0 ++pcmk__native_allocate: Fencing allocation score on cluster02: 0 ++pcmk__native_allocate: dummy-1 allocation score on cluster01: 0 ++pcmk__native_allocate: dummy-1 allocation score on cluster02: 0 ++pcmk__native_allocate: dummy-2 allocation score on cluster01: 0 
++pcmk__native_allocate: dummy-2 allocation score on cluster02: 0 +diff --git a/cts/scheduler/summary/expired-failed-probe-primitive.summary b/cts/scheduler/summary/expired-failed-probe-primitive.summary +new file mode 100644 +index 0000000000..ac0604e84f +--- /dev/null ++++ b/cts/scheduler/summary/expired-failed-probe-primitive.summary +@@ -0,0 +1,26 @@ ++Current cluster status: ++ * Node List: ++ * Online: [ cluster01 cluster02 ] ++ ++ * Full List of Resources: ++ * Fencing (stonith:fence_xvm): Started cluster01 ++ * dummy-1 (ocf:pacemaker:Dummy): Stopped ++ * dummy-2 (ocf:pacemaker:Dummy): Started cluster02 ++ ++Transition Summary: ++ * Start dummy-1 ( cluster02 ) ++ ++Executing Cluster Transition: ++ * Resource action: dummy-1 monitor on cluster02 ++ * Resource action: dummy-1 monitor on cluster01 ++ * Resource action: dummy-2 monitor on cluster01 ++ * Resource action: dummy-1 start on cluster02 ++ ++Revised Cluster Status: ++ * Node List: ++ * Online: [ cluster01 cluster02 ] ++ ++ * Full List of Resources: ++ * Fencing (stonith:fence_xvm): Started cluster01 ++ * dummy-1 (ocf:pacemaker:Dummy): Started cluster02 ++ * dummy-2 (ocf:pacemaker:Dummy): Started cluster02 +diff --git a/cts/scheduler/xml/expired-failed-probe-primitive.xml b/cts/scheduler/xml/expired-failed-probe-primitive.xml +new file mode 100644 +index 0000000000..684aa73f92 +--- /dev/null ++++ b/cts/scheduler/xml/expired-failed-probe-primitive.xml +@@ -0,0 +1,73 @@ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ +-- +2.27.0 + diff --git a/SOURCES/011-fencing-reasons.patch b/SOURCES/011-fencing-reasons.patch new file mode 100644 index 0000000..4422ca0 --- /dev/null +++ b/SOURCES/011-fencing-reasons.patch @@ -0,0 +1,1450 @@ +From 6db8e3adef0441953ec18dd0339c0a67c5c26bdf Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Tue, 14 Dec 
2021 16:25:21 -0600 +Subject: [PATCH 01/17] Doc: Pacemaker Development: update for recent function + renames + +--- + doc/sphinx/Pacemaker_Development/components.rst | 16 ++++++++-------- + 1 file changed, 8 insertions(+), 8 deletions(-) + +diff --git a/doc/sphinx/Pacemaker_Development/components.rst b/doc/sphinx/Pacemaker_Development/components.rst +index a51220cac9..68158484ce 100644 +--- a/doc/sphinx/Pacemaker_Development/components.rst ++++ b/doc/sphinx/Pacemaker_Development/components.rst +@@ -106,7 +106,7 @@ or messaging layer callback, which calls: + the number of active peers), and if this is the last expected reply, + calls + +- * ``call_remote_stonith()``, which calculates the timeout and sends ++ * ``request_peer_fencing()``, which calculates the timeout and sends + ``STONITH_OP_FENCE`` request(s) to carry out the fencing. If the target + node has a fencing "topology" (which allows specifications such as + "this node can be fenced either with device A, or devices B and C in +@@ -156,7 +156,7 @@ returns, and calls + * done callback (``st_child_done()``), which calls ``schedule_stonith_command()`` + for a new device if there are further required actions to execute or if the + original action failed, then builds and sends an XML reply to the original +- fencer (via ``stonith_send_async_reply()``), then checks whether any ++ fencer (via ``send_async_reply()``), then checks whether any + pending actions are the same as the one just executed and merges them if so. 
+ + Fencing replies +@@ -169,18 +169,18 @@ messaging layer callback, which calls: + + * ``handle_reply()``, which calls + +- * ``process_remote_stonith_exec()``, which calls either +- ``call_remote_stonith()`` (to retry a failed operation, or try the next +- device in a topology is appropriate, which issues a new ++ * ``fenced_process_fencing_reply()``, which calls either ++ ``request_peer_fencing()`` (to retry a failed operation, or try the next ++ device in a topology is appropriate, which issues a new + ``STONITH_OP_FENCE`` request, proceeding as before) or +- ``remote_op_done()`` (if the operation is definitively failed or ++ ``finalize_op()`` (if the operation is definitively failed or + successful). + +- * remote_op_done() broadcasts the result to all peers. ++ * ``finalize_op()`` broadcasts the result to all peers. + + Finally, all peers receive the broadcast result and call + +-* ``remote_op_done()``, which sends the result to all local clients. ++* ``finalize_op()``, which sends the result to all local clients. + + + .. index:: +-- +2.27.0 + + +From 47db9e5fb410b1e911710727d646eb7180a70c90 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Fri, 12 Nov 2021 09:58:16 -0600 +Subject: [PATCH 02/17] Refactor: fencing: add full result to fence action + callback data + +stonith_callback_data_t previously only contained the legacy return code for +the action. Use its new opaque member to store the full result, along with +accessors (available only internally for now). 
+--- + include/crm/fencing/internal.h | 3 ++ + lib/fencing/st_client.c | 99 ++++++++++++++++++++++++++-------- + 2 files changed, 81 insertions(+), 21 deletions(-) + +diff --git a/include/crm/fencing/internal.h b/include/crm/fencing/internal.h +index f0d294a0b3..eff689e59b 100644 +--- a/include/crm/fencing/internal.h ++++ b/include/crm/fencing/internal.h +@@ -187,6 +187,9 @@ bool stonith__event_state_eq(stonith_history_t *history, void *user_data); + bool stonith__event_state_neq(stonith_history_t *history, void *user_data); + + int stonith__legacy2status(int rc); ++int stonith__exit_status(stonith_callback_data_t *data); ++int stonith__execution_status(stonith_callback_data_t *data); ++const char *stonith__exit_reason(stonith_callback_data_t *data); + + /*! + * \internal +diff --git a/lib/fencing/st_client.c b/lib/fencing/st_client.c +index 2ca094566b..9d93ffd481 100644 +--- a/lib/fencing/st_client.c ++++ b/lib/fencing/st_client.c +@@ -854,20 +854,23 @@ stonith_api_del_callback(stonith_t * stonith, int call_id, bool all_callbacks) + * \param[in] st Fencer API connection + * \param[in] call_id If positive, call ID of completed fence action, otherwise + * legacy return code for early action failure +- * \param[in] rc Legacy return code for action result ++ * \param[in] result Full result for action + * \param[in] userdata User data to pass to callback + * \param[in] callback Fence action callback to invoke + */ + static void +-invoke_fence_action_callback(stonith_t *st, int call_id, int rc, void *userdata, ++invoke_fence_action_callback(stonith_t *st, int call_id, ++ pcmk__action_result_t *result, ++ void *userdata, + void (*callback) (stonith_t *st, + stonith_callback_data_t *data)) + { + stonith_callback_data_t data = { 0, }; + + data.call_id = call_id; +- data.rc = rc; ++ data.rc = pcmk_rc2legacy(stonith__result2rc(result)); + data.userdata = userdata; ++ data.opaque = (void *) result; + + callback(st, &data); + } +@@ -888,7 +891,7 @@ 
invoke_registered_callbacks(stonith_t *stonith, xmlNode *msg, int call_id) + { + stonith_private_t *private = NULL; + stonith_callback_client_t *cb_info = NULL; +- int rc = pcmk_ok; ++ pcmk__action_result_t result = PCMK__UNKNOWN_RESULT; + + CRM_CHECK(stonith != NULL, return); + CRM_CHECK(stonith->st_private != NULL, return); +@@ -897,20 +900,17 @@ invoke_registered_callbacks(stonith_t *stonith, xmlNode *msg, int call_id) + + if (msg == NULL) { + // Fencer didn't reply in time +- rc = -ETIME; ++ pcmk__set_result(&result, CRM_EX_ERROR, PCMK_EXEC_TIMEOUT, ++ "Timeout waiting for reply from fencer"); + CRM_LOG_ASSERT(call_id > 0); + + } else { + // We have the fencer reply +- +- if (crm_element_value_int(msg, F_STONITH_RC, &rc) != 0) { +- rc = -pcmk_err_generic; +- } +- + if ((crm_element_value_int(msg, F_STONITH_CALLID, &call_id) != 0) + || (call_id <= 0)) { + crm_log_xml_warn(msg, "Bad fencer reply"); + } ++ stonith__xe_get_result(msg, &result); + } + + if (call_id > 0) { +@@ -919,27 +919,29 @@ invoke_registered_callbacks(stonith_t *stonith, xmlNode *msg, int call_id) + } + + if ((cb_info != NULL) && (cb_info->callback != NULL) +- && (rc == pcmk_ok || !(cb_info->only_success))) { ++ && (pcmk__result_ok(&result) || !(cb_info->only_success))) { + crm_trace("Invoking callback %s for call %d", + crm_str(cb_info->id), call_id); +- invoke_fence_action_callback(stonith, call_id, rc, cb_info->user_data, +- cb_info->callback); ++ invoke_fence_action_callback(stonith, call_id, &result, ++ cb_info->user_data, cb_info->callback); + +- } else if ((private->op_callback == NULL) && (rc != pcmk_ok)) { +- crm_warn("Fencing action without registered callback failed: %s", +- pcmk_strerror(rc)); ++ } else if ((private->op_callback == NULL) && !pcmk__result_ok(&result)) { ++ crm_warn("Fencing action without registered callback failed: %d (%s)", ++ result.exit_status, ++ pcmk_exec_status_str(result.execution_status)); + crm_log_xml_debug(msg, "Failed fence update"); + } + + if 
(private->op_callback != NULL) { + crm_trace("Invoking global callback for call %d", call_id); +- invoke_fence_action_callback(stonith, call_id, rc, NULL, ++ invoke_fence_action_callback(stonith, call_id, &result, NULL, + private->op_callback); + } + + if (cb_info != NULL) { + stonith_api_del_callback(stonith, call_id, FALSE); + } ++ pcmk__reset_result(&result); + } + + static gboolean +@@ -1252,14 +1254,18 @@ stonith_api_add_callback(stonith_t * stonith, int call_id, int timeout, int opti + CRM_CHECK(stonith->st_private != NULL, return -EINVAL); + private = stonith->st_private; + +- if (call_id == 0) { ++ if (call_id == 0) { // Add global callback + private->op_callback = callback; + +- } else if (call_id < 0) { ++ } else if (call_id < 0) { // Call failed immediately, so call callback now + if (!(options & st_opt_report_only_success)) { ++ pcmk__action_result_t result = PCMK__UNKNOWN_RESULT; ++ + crm_trace("Call failed, calling %s: %s", callback_name, pcmk_strerror(call_id)); +- invoke_fence_action_callback(stonith, call_id, call_id, user_data, +- callback); ++ pcmk__set_result(&result, CRM_EX_ERROR, ++ stonith__legacy2status(call_id), NULL); ++ invoke_fence_action_callback(stonith, call_id, &result, ++ user_data, callback); + } else { + crm_warn("Fencer call failed: %s", pcmk_strerror(call_id)); + } +@@ -2293,6 +2299,57 @@ stonith__device_parameter_flags(uint32_t *device_flags, const char *device_name, + freeXpathObject(xpath); + } + ++/*! ++ * \internal ++ * \brief Return the exit status from an async action callback ++ * ++ * \param[in] data Callback data ++ * ++ * \return Exit status from callback data ++ */ ++int ++stonith__exit_status(stonith_callback_data_t *data) ++{ ++ if ((data == NULL) || (data->opaque == NULL)) { ++ return CRM_EX_ERROR; ++ } ++ return ((pcmk__action_result_t *) data->opaque)->exit_status; ++} ++ ++/*! 
++ * \internal ++ * \brief Return the execution status from an async action callback ++ * ++ * \param[in] data Callback data ++ * ++ * \return Execution status from callback data ++ */ ++int ++stonith__execution_status(stonith_callback_data_t *data) ++{ ++ if ((data == NULL) || (data->opaque == NULL)) { ++ return PCMK_EXEC_UNKNOWN; ++ } ++ return ((pcmk__action_result_t *) data->opaque)->execution_status; ++} ++ ++/*! ++ * \internal ++ * \brief Return the exit reason from an async action callback ++ * ++ * \param[in] data Callback data ++ * ++ * \return Exit reason from callback data ++ */ ++const char * ++stonith__exit_reason(stonith_callback_data_t *data) ++{ ++ if ((data == NULL) || (data->opaque == NULL)) { ++ return NULL; ++ } ++ return ((pcmk__action_result_t *) data->opaque)->exit_reason; ++} ++ + // Deprecated functions kept only for backward API compatibility + // LCOV_EXCL_START + +-- +2.27.0 + + +From 1e076370ef4ac7993b5ff21ed1cdfb3c4a494cf0 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Tue, 9 Nov 2021 16:16:03 -0600 +Subject: [PATCH 03/17] Log: controller: improve fencing result messages + +Now that fence callbacks get the full result, we can log a better message. +Also check for error conditions better, improve message wording, and ensure +only a single message is logged per result. 
+--- + daemons/controld/controld_fencing.c | 83 +++++++++++++++++++---------- + 1 file changed, 56 insertions(+), 27 deletions(-) + +diff --git a/daemons/controld/controld_fencing.c b/daemons/controld/controld_fencing.c +index f5a252c813..f8d2fc13f4 100644 +--- a/daemons/controld/controld_fencing.c ++++ b/daemons/controld/controld_fencing.c +@@ -714,45 +714,64 @@ tengine_stonith_callback(stonith_t *stonith, stonith_callback_data_t *data) + int stonith_id = -1; + int transition_id = -1; + crm_action_t *action = NULL; +- int call_id = data->call_id; +- int rc = data->rc; +- char *userdata = data->userdata; +- +- CRM_CHECK(userdata != NULL, return); +- crm_notice("Stonith operation %d/%s: %s (%d)", call_id, (char *)userdata, +- pcmk_strerror(rc), rc); ++ const char *target = NULL; + +- if (AM_I_DC == FALSE) { ++ if ((data == NULL) || (data->userdata == NULL)) { ++ crm_err("Ignoring fence operation %d result: " ++ "No transition key given (bug?)", ++ ((data == NULL)? -1 : data->call_id)); + return; + } + +- /* crm_info("call=%d, optype=%d, node_name=%s, result=%d, node_list=%s, action=%s", */ +- /* op->call_id, op->optype, op->node_name, op->op_result, */ +- /* (char *)op->node_list, op->private_data); */ ++ if (!AM_I_DC) { ++ const char *reason = stonith__exit_reason(data); ++ ++ if (reason == NULL) { ++ reason = pcmk_exec_status_str(stonith__execution_status(data)); ++ } ++ crm_notice("Result of fence operation %d: %d (%s) " CRM_XS " key=%s", ++ data->call_id, stonith__exit_status(data), reason, ++ (const char *) data->userdata); ++ return; ++ } + +- /* filter out old STONITH actions */ +- CRM_CHECK(decode_transition_key(userdata, &uuid, &transition_id, &stonith_id, NULL), ++ CRM_CHECK(decode_transition_key(data->userdata, &uuid, &transition_id, ++ &stonith_id, NULL), + goto bail); + +- if (transition_graph->complete || stonith_id < 0 || !pcmk__str_eq(uuid, te_uuid, pcmk__str_casei) +- || transition_graph->id != transition_id) { +- crm_info("Ignoring STONITH action 
initiated outside of the current transition"); ++ if (transition_graph->complete || (stonith_id < 0) ++ || !pcmk__str_eq(uuid, te_uuid, pcmk__str_none) ++ || (transition_graph->id != transition_id)) { ++ crm_info("Ignoring fence operation %d result: " ++ "Not from current transition " CRM_XS ++ " complete=%s action=%d uuid=%s (vs %s) transition=%d (vs %d)", ++ data->call_id, pcmk__btoa(transition_graph->complete), ++ stonith_id, uuid, te_uuid, transition_id, transition_graph->id); + goto bail; + } + + action = controld_get_action(stonith_id); + if (action == NULL) { +- crm_err("Stonith action not matched"); ++ crm_err("Ignoring fence operation %d result: " ++ "Action %d not found in transition graph (bug?) " ++ CRM_XS " uuid=%s transition=%d", ++ data->call_id, stonith_id, uuid, transition_id); ++ goto bail; ++ } ++ ++ target = crm_element_value(action->xml, XML_LRM_ATTR_TARGET); ++ if (target == NULL) { ++ crm_err("Ignoring fence operation %d result: No target given (bug?)", ++ data->call_id); + goto bail; + } + + stop_te_timer(action->timer); +- if (rc == pcmk_ok) { +- const char *target = crm_element_value(action->xml, XML_LRM_ATTR_TARGET); ++ if (stonith__exit_status(data) == CRM_EX_OK) { + const char *uuid = crm_element_value(action->xml, XML_LRM_ATTR_TARGET_UUID); + const char *op = crm_meta_value(action->params, "stonith_action"); + +- crm_info("Stonith operation %d for %s passed", call_id, target); ++ crm_notice("Fence operation %d for %s passed", data->call_id, target); + if (!(pcmk_is_set(action->flags, pcmk__graph_action_confirmed))) { + te_action_confirmed(action, NULL); + if (pcmk__str_eq("on", op, pcmk__str_casei)) { +@@ -791,20 +810,30 @@ tengine_stonith_callback(stonith_t *stonith, stonith_callback_data_t *data) + st_fail_count_reset(target); + + } else { +- const char *target = crm_element_value(action->xml, XML_LRM_ATTR_TARGET); + enum transition_action abort_action = tg_restart; ++ int status = stonith__execution_status(data); ++ const char 
*reason = stonith__exit_reason(data); + ++ if (reason == NULL) { ++ if (status == PCMK_EXEC_DONE) { ++ reason = "Agent returned error"; ++ } else { ++ reason = pcmk_exec_status_str(status); ++ } ++ } + crm__set_graph_action_flags(action, pcmk__graph_action_failed); +- crm_notice("Stonith operation %d for %s failed (%s): aborting transition.", +- call_id, target, pcmk_strerror(rc)); + + /* If no fence devices were available, there's no use in immediately + * checking again, so don't start a new transition in that case. + */ +- if (rc == -ENODEV) { +- crm_warn("No devices found in cluster to fence %s, giving up", +- target); ++ if (status == PCMK_EXEC_NO_FENCE_DEVICE) { ++ crm_warn("Fence operation %d for %s failed: %s " ++ "(aborting transition and giving up for now)", ++ data->call_id, target, reason); + abort_action = tg_stop; ++ } else { ++ crm_notice("Fence operation %d for %s failed: %s " ++ "(aborting transition)", data->call_id, target, reason); + } + + /* Increment the fail count now, so abort_for_stonith_failure() can +@@ -818,7 +847,7 @@ tengine_stonith_callback(stonith_t *stonith, stonith_callback_data_t *data) + trigger_graph(); + + bail: +- free(userdata); ++ free(data->userdata); + free(uuid); + return; + } +-- +2.27.0 + + +From 25547e3b7e6eb23efad1c359388d6e8d0df62363 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Mon, 22 Nov 2021 12:37:16 -0600 +Subject: [PATCH 04/17] Refactor: executor: drop action_get_uniform_rc() + function + +action_get_uniform_rc() called stonith2uniform_rc() or services_result2ocf() as +appropriate to the action standard. However, it was called only from a place +that did not process stonith actions, so that place can just call +services_result2ocf() directly. + +This will simplify planned changes. 
+--- + daemons/execd/execd_commands.c | 24 ++++++------------------ + 1 file changed, 6 insertions(+), 18 deletions(-) + +diff --git a/daemons/execd/execd_commands.c b/daemons/execd/execd_commands.c +index 5bb2aab692..5e123e322e 100644 +--- a/daemons/execd/execd_commands.c ++++ b/daemons/execd/execd_commands.c +@@ -780,23 +780,6 @@ stonith2uniform_rc(const char *action, int rc) + return rc; + } + +-static int +-action_get_uniform_rc(svc_action_t *action) +-{ +- lrmd_cmd_t *cmd = action->cb_data; +- +- if (pcmk__str_eq(action->standard, PCMK_RESOURCE_CLASS_STONITH, +- pcmk__str_casei)) { +- return stonith2uniform_rc(cmd->action, action->rc); +- } else { +- enum ocf_exitcode code = services_result2ocf(action->standard, +- cmd->action, action->rc); +- +- // Cast variable instead of function return to keep compilers happy +- return (int) code; +- } +-} +- + struct notify_new_client_data { + xmlNode *notify; + pcmk__client_t *new_client; +@@ -848,6 +831,7 @@ action_complete(svc_action_t * action) + { + lrmd_rsc_t *rsc; + lrmd_cmd_t *cmd = action->cb_data; ++ enum ocf_exitcode code; + + #ifdef PCMK__TIME_USE_CGT + const char *rclass = NULL; +@@ -867,8 +851,12 @@ action_complete(svc_action_t * action) + #endif + + cmd->last_pid = action->pid; +- pcmk__set_result(&(cmd->result), action_get_uniform_rc(action), ++ ++ // Cast variable instead of function return to keep compilers happy ++ code = services_result2ocf(action->standard, cmd->action, action->rc); ++ pcmk__set_result(&(cmd->result), (int) code, + action->status, services__exit_reason(action)); ++ + rsc = cmd->rsc_id ? 
g_hash_table_lookup(rsc_list, cmd->rsc_id) : NULL; + + #ifdef PCMK__TIME_USE_CGT +-- +2.27.0 + + +From b5e31ba2539da4e94c124c3f0c8c72f7039f9a7a Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Mon, 22 Nov 2021 12:39:30 -0600 +Subject: [PATCH 05/17] Feature: executor: use full result from fencer for + fence actions + +Now that fence callbacks get the full result, we can improve the executor +command result for fence actions. stonith_action_complete() now takes a +full result, allowing the executor to use that directly rather than map a +legacy return code. +--- + daemons/execd/execd_commands.c | 140 +++++++++++++++++++-------------- + 1 file changed, 80 insertions(+), 60 deletions(-) + +diff --git a/daemons/execd/execd_commands.c b/daemons/execd/execd_commands.c +index 5e123e322e..e722994012 100644 +--- a/daemons/execd/execd_commands.c ++++ b/daemons/execd/execd_commands.c +@@ -8,6 +8,7 @@ + */ + + #include ++#include + + #include + +@@ -748,38 +749,6 @@ cmd_finalize(lrmd_cmd_t * cmd, lrmd_rsc_t * rsc) + } + } + +-static int +-stonith2uniform_rc(const char *action, int rc) +-{ +- switch (rc) { +- case pcmk_ok: +- rc = PCMK_OCF_OK; +- break; +- +- case -ENODEV: +- /* This should be possible only for probes in practice, but +- * interpret for all actions to be safe. +- */ +- if (pcmk__str_eq(action, "monitor", pcmk__str_casei)) { +- rc = PCMK_OCF_NOT_RUNNING; +- } else if (pcmk__str_eq(action, "stop", pcmk__str_casei)) { +- rc = PCMK_OCF_OK; +- } else { +- rc = PCMK_OCF_NOT_INSTALLED; +- } +- break; +- +- case -EOPNOTSUPP: +- rc = PCMK_OCF_UNIMPLEMENT_FEATURE; +- break; +- +- default: +- rc = PCMK_OCF_UNKNOWN_ERROR; +- break; +- } +- return rc; +-} +- + struct notify_new_client_data { + xmlNode *notify; + pcmk__client_t *new_client; +@@ -988,46 +957,84 @@ action_complete(svc_action_t * action) + cmd_finalize(cmd, rsc); + } + ++/*! 
++ * \internal ++ * \brief Process the result of a fence device action (start, stop, or monitor) ++ * ++ * \param[in] cmd Fence device action that completed ++ * \param[in] exit_status Fencer API exit status for action ++ * \param[in] execution_status Fencer API execution status for action ++ * \param[in] exit_reason Human-friendly detail, if action failed ++ */ + static void +-stonith_action_complete(lrmd_cmd_t * cmd, int rc) ++stonith_action_complete(lrmd_cmd_t *cmd, int exit_status, ++ enum pcmk_exec_status execution_status, ++ const char *exit_reason) + { + // This can be NULL if resource was removed before command completed + lrmd_rsc_t *rsc = g_hash_table_lookup(rsc_list, cmd->rsc_id); + +- cmd->result.exit_status = stonith2uniform_rc(cmd->action, rc); ++ // Simplify fencer exit status to uniform exit status ++ if (exit_status != CRM_EX_OK) { ++ exit_status = PCMK_OCF_UNKNOWN_ERROR; ++ } + +- /* This function may be called with status already set to cancelled, if a +- * pending action was aborted. Otherwise, we need to determine status from +- * the fencer return code. +- */ +- if (cmd->result.execution_status != PCMK_EXEC_CANCELLED) { +- cmd->result.execution_status = stonith__legacy2status(rc); ++ if (cmd->result.execution_status == PCMK_EXEC_CANCELLED) { ++ /* An in-flight fence action was cancelled. The execution status is ++ * already correct, so don't overwrite it. ++ */ ++ execution_status = PCMK_EXEC_CANCELLED; + +- // Simplify status codes from fencer +- switch (cmd->result.execution_status) { ++ } else { ++ /* Some execution status codes have specific meanings for the fencer ++ * that executor clients may not expect, so map them to a simple error ++ * status. 
++ */ ++ switch (execution_status) { + case PCMK_EXEC_NOT_CONNECTED: + case PCMK_EXEC_INVALID: +- case PCMK_EXEC_NO_FENCE_DEVICE: + case PCMK_EXEC_NO_SECRETS: +- cmd->result.execution_status = PCMK_EXEC_ERROR; ++ execution_status = PCMK_EXEC_ERROR; + break; +- default: ++ ++ case PCMK_EXEC_NO_FENCE_DEVICE: ++ /* This should be possible only for probes in practice, but ++ * interpret for all actions to be safe. ++ */ ++ if (pcmk__str_eq(cmd->action, CRMD_ACTION_STATUS, ++ pcmk__str_none)) { ++ exit_status = PCMK_OCF_NOT_RUNNING; ++ ++ } else if (pcmk__str_eq(cmd->action, CRMD_ACTION_STOP, ++ pcmk__str_none)) { ++ exit_status = PCMK_OCF_OK; ++ ++ } else { ++ exit_status = PCMK_OCF_NOT_INSTALLED; ++ } ++ execution_status = PCMK_EXEC_ERROR; + break; +- } + +- // Certain successful actions change the known state of the resource +- if ((rsc != NULL) && pcmk__result_ok(&(cmd->result))) { +- if (pcmk__str_eq(cmd->action, "start", pcmk__str_casei)) { +- rsc->st_probe_rc = pcmk_ok; // maps to PCMK_OCF_OK +- } else if (pcmk__str_eq(cmd->action, "stop", pcmk__str_casei)) { +- rsc->st_probe_rc = -ENODEV; // maps to PCMK_OCF_NOT_RUNNING +- } ++ case PCMK_EXEC_NOT_SUPPORTED: ++ exit_status = PCMK_OCF_UNIMPLEMENT_FEATURE; ++ break; ++ ++ default: ++ break; + } + } + +- // Give the user more detail than an OCF code +- if (rc != -pcmk_err_generic) { +- cmd->result.exit_reason = strdup(pcmk_strerror(rc)); ++ pcmk__set_result(&cmd->result, exit_status, execution_status, exit_reason); ++ ++ // Certain successful actions change the known state of the resource ++ if ((rsc != NULL) && pcmk__result_ok(&(cmd->result))) { ++ ++ if (pcmk__str_eq(cmd->action, "start", pcmk__str_casei)) { ++ rsc->st_probe_rc = pcmk_ok; // maps to PCMK_OCF_OK ++ ++ } else if (pcmk__str_eq(cmd->action, "stop", pcmk__str_casei)) { ++ rsc->st_probe_rc = -ENODEV; // maps to PCMK_OCF_NOT_RUNNING ++ } + } + + /* The recurring timer should not be running at this point in any case, but +@@ -1050,7 +1057,15 @@ 
stonith_action_complete(lrmd_cmd_t * cmd, int rc) + static void + lrmd_stonith_callback(stonith_t * stonith, stonith_callback_data_t * data) + { +- stonith_action_complete(data->userdata, data->rc); ++ if ((data == NULL) || (data->userdata == NULL)) { ++ crm_err("Ignoring fence action result: " ++ "Invalid callback arguments (bug?)"); ++ } else { ++ stonith_action_complete((lrmd_cmd_t *) data->userdata, ++ stonith__exit_status(data), ++ stonith__execution_status(data), ++ stonith__exit_reason(data)); ++ } + } + + void +@@ -1097,7 +1112,9 @@ stonith_connection_failed(void) + crm_err("Connection to fencer failed, finalizing %d pending operations", + g_list_length(cmd_list)); + for (cmd_iter = cmd_list; cmd_iter; cmd_iter = cmd_iter->next) { +- stonith_action_complete(cmd_iter->data, -ENOTCONN); ++ stonith_action_complete((lrmd_cmd_t *) cmd_iter->data, ++ CRM_EX_ERROR, PCMK_EXEC_NOT_CONNECTED, ++ "Lost connection to fencer"); + } + g_list_free(cmd_list); + } +@@ -1210,7 +1227,7 @@ lrmd_rsc_execute_stonith(lrmd_rsc_t * rsc, lrmd_cmd_t * cmd) + + } else if (pcmk__str_eq(cmd->action, "start", pcmk__str_casei)) { + rc = execd_stonith_start(stonith_api, rsc, cmd); +- if (rc == 0) { ++ if (rc == pcmk_ok) { + do_monitor = TRUE; + } + +@@ -1233,7 +1250,10 @@ lrmd_rsc_execute_stonith(lrmd_rsc_t * rsc, lrmd_cmd_t * cmd) + } + } + +- stonith_action_complete(cmd, rc); ++ stonith_action_complete(cmd, ++ ((rc == pcmk_ok)? CRM_EX_OK : CRM_EX_ERROR), ++ stonith__legacy2status(rc), ++ rc == -pcmk_err_generic? NULL : pcmk_strerror(rc)); + } + + static int +-- +2.27.0 + + +From 0cdc8506c2383cf05c2f62ab1ac9438958daf210 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Mon, 22 Nov 2021 16:15:05 -0600 +Subject: [PATCH 06/17] Fix: executor,scheduler: treat "no secrets" fence + results as a hard error + +Previously, the executor mapped the fencer's PCMK_EXEC_NO_SECRETS status to +PCMK_EXEC_ERROR to keep handling of that situation the same as before the new +code was added. 
+ +However, the earlier handling was less than ideal -- a resource action that +failed due to missing secrets would be retried on the same node, and almost +certainly fail again for the same reason. Now, the executor passes along +PCMK_EXEC_NO_SECRETS to clients; the controller will record the result in the +CIB status, and the scheduler will treat it as a hard error (i.e. not retrying +on the same node). + +Backward compatibility isn't a problem because the scheduler treats unknown +status codes the same as PCMK_EXEC_ERROR, so an older DC will continue to +handle it as before. The CRM feature set has been bumped so the handling can't +flip back and forth in a mixed-version cluster. +--- + daemons/execd/execd_commands.c | 1 - + include/crm/crm.h | 4 ++-- + lib/pengine/unpack.c | 3 --- + 3 files changed, 2 insertions(+), 6 deletions(-) + +diff --git a/daemons/execd/execd_commands.c b/daemons/execd/execd_commands.c +index e722994012..4ced6d1d5c 100644 +--- a/daemons/execd/execd_commands.c ++++ b/daemons/execd/execd_commands.c +@@ -993,7 +993,6 @@ stonith_action_complete(lrmd_cmd_t *cmd, int exit_status, + switch (execution_status) { + case PCMK_EXEC_NOT_CONNECTED: + case PCMK_EXEC_INVALID: +- case PCMK_EXEC_NO_SECRETS: + execution_status = PCMK_EXEC_ERROR; + break; + +diff --git a/include/crm/crm.h b/include/crm/crm.h +index 16b35e9c55..56b07cb12a 100644 +--- a/include/crm/crm.h ++++ b/include/crm/crm.h +@@ -1,5 +1,5 @@ + /* +- * Copyright 2004-2021 the Pacemaker project contributors ++ * Copyright 2004-2022 the Pacemaker project contributors + * + * The version control history for this file may have further details. 
+ * +@@ -66,7 +66,7 @@ extern "C" { + * >=3.0.13: Fail counts include operation name and interval + * >=3.2.0: DC supports PCMK_EXEC_INVALID and PCMK_EXEC_NOT_CONNECTED + */ +-# define CRM_FEATURE_SET "3.12.0" ++# define CRM_FEATURE_SET "3.13.0" + + /* Pacemaker's CPG protocols use fixed-width binary fields for the sender and + * recipient of a CPG message. This imposes an arbitrary limit on cluster node +diff --git a/lib/pengine/unpack.c b/lib/pengine/unpack.c +index 3e0384cd2a..8a2d2a6d6d 100644 +--- a/lib/pengine/unpack.c ++++ b/lib/pengine/unpack.c +@@ -3879,9 +3879,6 @@ unpack_rsc_op(pe_resource_t *rsc, pe_node_t *node, xmlNode *xml_op, + case PCMK_EXEC_INVALID: + break; // Not done, do error handling + +- /* These should only be possible in fence action results, not operation +- * history, but have some handling in place as a fail-safe. +- */ + case PCMK_EXEC_NO_FENCE_DEVICE: + case PCMK_EXEC_NO_SECRETS: + status = PCMK_EXEC_ERROR_HARD; +-- +2.27.0 + + +From 75c1bdcf3ffc406e6fa286fd5fcff83e1e65591a Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Wed, 10 Nov 2021 12:05:20 -0600 +Subject: [PATCH 07/17] Low: executor: improve result for fence device probes + +Now that lrmd_rsc_execute_stonith() sets a full result instead of just a legacy +return code, refactor lrmd_rsc_t's st_probe_rc as an execution status (and +rename to fence_probe_result). Set an appropriate exit reason when available. 
+--- + daemons/execd/execd_commands.c | 57 ++++++++++++++++++++++++++------- + daemons/execd/pacemaker-execd.h | 9 +++++- + 2 files changed, 54 insertions(+), 12 deletions(-) + +diff --git a/daemons/execd/execd_commands.c b/daemons/execd/execd_commands.c +index 4ced6d1d5c..6e5505e973 100644 +--- a/daemons/execd/execd_commands.c ++++ b/daemons/execd/execd_commands.c +@@ -285,7 +285,9 @@ build_rsc_from_xml(xmlNode * msg) + rsc->provider = crm_element_value_copy(rsc_xml, F_LRMD_PROVIDER); + rsc->type = crm_element_value_copy(rsc_xml, F_LRMD_TYPE); + rsc->work = mainloop_add_trigger(G_PRIORITY_HIGH, lrmd_rsc_dispatch, rsc); +- rsc->st_probe_rc = -ENODEV; // if stonith, initialize to "not running" ++ ++ // Initialize fence device probes (to return "not running") ++ rsc->fence_probe_result = PCMK_EXEC_NO_FENCE_DEVICE; + return rsc; + } + +@@ -1029,10 +1031,10 @@ stonith_action_complete(lrmd_cmd_t *cmd, int exit_status, + if ((rsc != NULL) && pcmk__result_ok(&(cmd->result))) { + + if (pcmk__str_eq(cmd->action, "start", pcmk__str_casei)) { +- rsc->st_probe_rc = pcmk_ok; // maps to PCMK_OCF_OK ++ rsc->fence_probe_result = PCMK_EXEC_DONE; // "running" + + } else if (pcmk__str_eq(cmd->action, "stop", pcmk__str_casei)) { +- rsc->st_probe_rc = -ENODEV; // maps to PCMK_OCF_NOT_RUNNING ++ rsc->fence_probe_result = PCMK_EXEC_NO_FENCE_DEVICE; // "not running" + } + } + +@@ -1081,14 +1083,13 @@ stonith_connection_failed(void) + if (pcmk__str_eq(rsc->class, PCMK_RESOURCE_CLASS_STONITH, pcmk__str_casei)) { + /* If we registered this fence device, we don't know whether the + * fencer still has the registration or not. Cause future probes to +- * return PCMK_OCF_UNKNOWN_ERROR until the resource is stopped or +- * started successfully. This is especially important if the +- * controller also went away (possibly due to a cluster layer +- * restart) and won't receive our client notification of any +- * monitors finalized below. 
++ * return an error until the resource is stopped or started ++ * successfully. This is especially important if the controller also ++ * went away (possibly due to a cluster layer restart) and won't ++ * receive our client notification of any monitors finalized below. + */ +- if (rsc->st_probe_rc == pcmk_ok) { +- rsc->st_probe_rc = pcmk_err_generic; ++ if (rsc->fence_probe_result == PCMK_EXEC_DONE) { ++ rsc->fence_probe_result = PCMK_EXEC_NOT_CONNECTED; + } + + if (rsc->active) { +@@ -1213,6 +1214,39 @@ execd_stonith_monitor(stonith_t *stonith_api, lrmd_rsc_t *rsc, lrmd_cmd_t *cmd) + return rc; + } + ++/*! ++ * \internal ++ * \brief Finalize the result of a fence device probe ++ * ++ * \param[in] cmd Probe action ++ * \param[in] probe_result Probe result ++ */ ++static void ++finalize_fence_device_probe(lrmd_cmd_t *cmd, enum pcmk_exec_status probe_result) ++{ ++ int exit_status = CRM_EX_ERROR; ++ const char *reason = NULL; ++ ++ switch (probe_result) { ++ case PCMK_EXEC_DONE: // Device is "running" ++ exit_status = CRM_EX_OK; ++ break; ++ ++ case PCMK_EXEC_NO_FENCE_DEVICE: // Device is "not running" ++ break; ++ ++ case PCMK_EXEC_NOT_CONNECTED: // stonith_connection_failed() ++ reason = "Lost connection to fencer"; ++ break; ++ ++ default: // Shouldn't be possible ++ probe_result = PCMK_EXEC_ERROR; ++ reason = "Invalid fence device probe result (bug?)"; ++ break; ++ } ++ stonith_action_complete(cmd, exit_status, probe_result, reason); ++} ++ + static void + lrmd_rsc_execute_stonith(lrmd_rsc_t * rsc, lrmd_cmd_t * cmd) + { +@@ -1237,7 +1271,8 @@ lrmd_rsc_execute_stonith(lrmd_rsc_t * rsc, lrmd_cmd_t * cmd) + if (cmd->interval_ms > 0) { + do_monitor = TRUE; + } else { +- rc = rsc->st_probe_rc; ++ finalize_fence_device_probe(cmd, rsc->fence_probe_result); ++ return; + } + } + +diff --git a/daemons/execd/pacemaker-execd.h b/daemons/execd/pacemaker-execd.h +index 51ef8d22e6..057d889584 100644 +--- a/daemons/execd/pacemaker-execd.h ++++ b/daemons/execd/pacemaker-execd.h 
+@@ -41,7 +41,14 @@ typedef struct lrmd_rsc_s { + * that have been handed off from the pending ops list. */ + GList *recurring_ops; + +- int st_probe_rc; // What value should be returned for a probe if stonith ++ /* If this resource is a fence device, probes are handled internally by the ++ * executor, and this value indicates the result that should currently be ++ * returned for probes. It should be one of: ++ * PCMK_EXEC_DONE (to indicate "running"), ++ * PCMK_EXEC_NO_FENCE_DEVICE ("not running"), or ++ * PCMK_EXEC_NOT_CONNECTED ("unknown because fencer connection was lost"). ++ */ ++ enum pcmk_exec_status fence_probe_result; + + crm_trigger_t *work; + } lrmd_rsc_t; +-- +2.27.0 + + +From 1ab799d945171ab8d91bd0aada64e70a71193e5c Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Wed, 10 Nov 2021 12:14:48 -0600 +Subject: [PATCH 08/17] Low: executor: don't require a fencer connection for + probes + +For fence devices, probe results are based on earlier state determinations, +so handle them before requiring an active fencer connection. The effect may be +negligible, but it would allow probes to proceed while waiting for a +reconnection. 
+--- + daemons/execd/execd_commands.c | 15 ++++++++------- + 1 file changed, 8 insertions(+), 7 deletions(-) + +diff --git a/daemons/execd/execd_commands.c b/daemons/execd/execd_commands.c +index 6e5505e973..5999ba19c9 100644 +--- a/daemons/execd/execd_commands.c ++++ b/daemons/execd/execd_commands.c +@@ -1255,7 +1255,13 @@ lrmd_rsc_execute_stonith(lrmd_rsc_t * rsc, lrmd_cmd_t * cmd) + + stonith_t *stonith_api = get_stonith_connection(); + +- if (!stonith_api) { ++ if (pcmk__str_eq(cmd->action, "monitor", pcmk__str_casei) ++ && (cmd->interval_ms == 0)) { ++ // Probes don't require a fencer connection ++ finalize_fence_device_probe(cmd, rsc->fence_probe_result); ++ return; ++ ++ } else if (stonith_api == NULL) { + rc = -ENOTCONN; + + } else if (pcmk__str_eq(cmd->action, "start", pcmk__str_casei)) { +@@ -1268,12 +1274,7 @@ lrmd_rsc_execute_stonith(lrmd_rsc_t * rsc, lrmd_cmd_t * cmd) + rc = execd_stonith_stop(stonith_api, rsc); + + } else if (pcmk__str_eq(cmd->action, "monitor", pcmk__str_casei)) { +- if (cmd->interval_ms > 0) { +- do_monitor = TRUE; +- } else { +- finalize_fence_device_probe(cmd, rsc->fence_probe_result); +- return; +- } ++ do_monitor = TRUE; + } + + if (do_monitor) { +-- +2.27.0 + + +From adf41fb1637bcc9a6e057be52d61a0b26e4535cc Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Wed, 10 Nov 2021 12:20:34 -0600 +Subject: [PATCH 09/17] Low: executor: return an error for unsupported fence + device actions + +... and set an exit reason. Previously, it would return success for unsupported +actions. It shouldn't be possible, but it would be nice to have an indication +of what is wrong if a bug is introduced. 
+--- + daemons/execd/execd_commands.c | 6 ++++++ + 1 file changed, 6 insertions(+) + +diff --git a/daemons/execd/execd_commands.c b/daemons/execd/execd_commands.c +index 5999ba19c9..772d6446dc 100644 +--- a/daemons/execd/execd_commands.c ++++ b/daemons/execd/execd_commands.c +@@ -1275,6 +1275,12 @@ lrmd_rsc_execute_stonith(lrmd_rsc_t * rsc, lrmd_cmd_t * cmd) + + } else if (pcmk__str_eq(cmd->action, "monitor", pcmk__str_casei)) { + do_monitor = TRUE; ++ ++ } else { ++ stonith_action_complete(cmd, PCMK_OCF_UNIMPLEMENT_FEATURE, ++ PCMK_EXEC_ERROR, ++ "Invalid fence device action (bug?)"); ++ return; + } + + if (do_monitor) { +-- +2.27.0 + + +From af59dfe85bc83f5609d0a3b3b7939271549cb76f Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Wed, 10 Nov 2021 12:24:07 -0600 +Subject: [PATCH 10/17] Low: executor: set exit reason if no fencer connection + +--- + daemons/execd/execd_commands.c | 5 ++++- + 1 file changed, 4 insertions(+), 1 deletion(-) + +diff --git a/daemons/execd/execd_commands.c b/daemons/execd/execd_commands.c +index 772d6446dc..7ae309d94c 100644 +--- a/daemons/execd/execd_commands.c ++++ b/daemons/execd/execd_commands.c +@@ -1262,7 +1262,10 @@ lrmd_rsc_execute_stonith(lrmd_rsc_t * rsc, lrmd_cmd_t * cmd) + return; + + } else if (stonith_api == NULL) { +- rc = -ENOTCONN; ++ stonith_action_complete(cmd, PCMK_OCF_UNKNOWN_ERROR, ++ PCMK_EXEC_NOT_CONNECTED, ++ "No connection to fencer"); ++ return; + + } else if (pcmk__str_eq(cmd->action, "start", pcmk__str_casei)) { + rc = execd_stonith_start(stonith_api, rsc, cmd); +-- +2.27.0 + + +From ad0930b75d5617490c3a0dc3c6b83411b3c4536d Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Wed, 10 Nov 2021 14:42:26 -0600 +Subject: [PATCH 11/17] Test: cts-fence-helper: log full result in fence + callback + +--- + daemons/fenced/cts-fence-helper.c | 7 +++++-- + 1 file changed, 5 insertions(+), 2 deletions(-) + +diff --git a/daemons/fenced/cts-fence-helper.c b/daemons/fenced/cts-fence-helper.c +index 2adb032f24..c2b55d73b9 
100644 +--- a/daemons/fenced/cts-fence-helper.c ++++ b/daemons/fenced/cts-fence-helper.c +@@ -1,5 +1,5 @@ + /* +- * Copyright 2009-2020 the Pacemaker project contributors ++ * Copyright 2009-2021 the Pacemaker project contributors + * + * This source code is licensed under the GNU General Public License version 2 + * or later (GPLv2+) WITHOUT ANY WARRANTY. +@@ -132,7 +132,10 @@ st_callback(stonith_t * st, stonith_event_t * e) + static void + st_global_callback(stonith_t * stonith, stonith_callback_data_t * data) + { +- crm_notice("Call id %d completed with rc %d", data->call_id, data->rc); ++ crm_notice("Call %d exited %d: %s (%s)", ++ data->call_id, stonith__exit_status(data), ++ stonith__execution_status(data), ++ crm_str(stonith__exit_reason(data))); + } + + static void +-- +2.27.0 + + +From 1b50ff4d83b7a96cd70389891b7b6568812f66f6 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Wed, 10 Nov 2021 15:10:14 -0600 +Subject: [PATCH 12/17] Test: cts-fence-helper: track full result instead of + legacy return code + +--- + daemons/fenced/cts-fence-helper.c | 77 +++++++++++++++---------------- + 1 file changed, 37 insertions(+), 40 deletions(-) + +diff --git a/daemons/fenced/cts-fence-helper.c b/daemons/fenced/cts-fence-helper.c +index c2b55d73b9..2739f57804 100644 +--- a/daemons/fenced/cts-fence-helper.c ++++ b/daemons/fenced/cts-fence-helper.c +@@ -34,23 +34,12 @@ + static GMainLoop *mainloop = NULL; + static crm_trigger_t *trig = NULL; + static int mainloop_iter = 0; +-static int callback_rc = 0; ++static pcmk__action_result_t result = PCMK__UNKNOWN_RESULT; ++ + typedef void (*mainloop_test_iteration_cb) (int check_event); + + #define MAINLOOP_DEFAULT_TIMEOUT 2 + +-#define mainloop_test_done(pass) \ +- if (pass) { \ +- crm_info("SUCCESS - %s", __func__); \ +- mainloop_iter++; \ +- mainloop_set_trigger(trig); \ +- } else { \ +- crm_err("FAILURE = %s async_callback %d", __func__, callback_rc); \ +- crm_exit(CRM_EX_ERROR); \ +- } \ +- callback_rc = 0; \ +- +- + enum 
test_modes { + test_standard = 0, // test using a specific developer environment + test_passive, // watch notifications only +@@ -93,6 +82,23 @@ static const int st_opts = st_opt_sync_call; + static int expected_notifications = 0; + static int verbose = 0; + ++static void ++mainloop_test_done(const char *origin, bool pass) ++{ ++ if (pass) { ++ crm_info("SUCCESS - %s", origin); ++ mainloop_iter++; ++ mainloop_set_trigger(trig); ++ result.execution_status = PCMK_EXEC_UNKNOWN; ++ result.exit_status = CRM_EX_OK; ++ } else { ++ crm_err("FAILURE - %s (%d: %s)", origin, result.exit_status, ++ pcmk_exec_status_str(result.execution_status)); ++ crm_exit(CRM_EX_ERROR); ++ } ++} ++ ++ + static void + dispatch_helper(int timeout) + { +@@ -385,7 +391,9 @@ static void + static void + mainloop_callback(stonith_t * stonith, stonith_callback_data_t * data) + { +- callback_rc = data->rc; ++ pcmk__set_result(&result, stonith__exit_status(data), ++ stonith__execution_status(data), ++ stonith__exit_reason(data)); + iterate_mainloop_tests(TRUE); + } + +@@ -404,18 +412,14 @@ test_async_fence_pass(int check_event) + int rc = 0; + + if (check_event) { +- if (callback_rc != 0) { +- mainloop_test_done(FALSE); +- } else { +- mainloop_test_done(TRUE); +- } ++ mainloop_test_done(__func__, (result.exit_status == CRM_EX_OK)); + return; + } + + rc = st->cmds->fence(st, 0, "true_1_node1", "off", MAINLOOP_DEFAULT_TIMEOUT, 0); + if (rc < 0) { + crm_err("fence failed with rc %d", rc); +- mainloop_test_done(FALSE); ++ mainloop_test_done(__func__, false); + } + register_callback_helper(rc); + /* wait for event */ +@@ -431,15 +435,15 @@ test_async_fence_custom_timeout(int check_event) + if (check_event) { + uint32_t diff = (time(NULL) - begin); + +- if (callback_rc != -ETIME) { +- mainloop_test_done(FALSE); ++ if (result.execution_status != PCMK_EXEC_TIMEOUT) { ++ mainloop_test_done(__func__, false); + } else if (diff < CUSTOM_TIMEOUT_ADDITION + MAINLOOP_DEFAULT_TIMEOUT) { + crm_err + ("Custom timeout 
test failed, callback expiration should be updated to %d, actual timeout was %d", + CUSTOM_TIMEOUT_ADDITION + MAINLOOP_DEFAULT_TIMEOUT, diff); +- mainloop_test_done(FALSE); ++ mainloop_test_done(__func__, false); + } else { +- mainloop_test_done(TRUE); ++ mainloop_test_done(__func__, true); + } + return; + } +@@ -448,7 +452,7 @@ test_async_fence_custom_timeout(int check_event) + rc = st->cmds->fence(st, 0, "custom_timeout_node1", "off", MAINLOOP_DEFAULT_TIMEOUT, 0); + if (rc < 0) { + crm_err("fence failed with rc %d", rc); +- mainloop_test_done(FALSE); ++ mainloop_test_done(__func__, false); + } + register_callback_helper(rc); + /* wait for event */ +@@ -460,18 +464,15 @@ test_async_fence_timeout(int check_event) + int rc = 0; + + if (check_event) { +- if (callback_rc != -ENODEV) { +- mainloop_test_done(FALSE); +- } else { +- mainloop_test_done(TRUE); +- } ++ mainloop_test_done(__func__, ++ (result.execution_status == PCMK_EXEC_NO_FENCE_DEVICE)); + return; + } + + rc = st->cmds->fence(st, 0, "false_1_node2", "off", MAINLOOP_DEFAULT_TIMEOUT, 0); + if (rc < 0) { + crm_err("fence failed with rc %d", rc); +- mainloop_test_done(FALSE); ++ mainloop_test_done(__func__, false); + } + register_callback_helper(rc); + /* wait for event */ +@@ -483,18 +484,14 @@ test_async_monitor(int check_event) + int rc = 0; + + if (check_event) { +- if (callback_rc) { +- mainloop_test_done(FALSE); +- } else { +- mainloop_test_done(TRUE); +- } ++ mainloop_test_done(__func__, (result.exit_status == CRM_EX_OK)); + return; + } + + rc = st->cmds->monitor(st, 0, "false_1", MAINLOOP_DEFAULT_TIMEOUT); + if (rc < 0) { + crm_err("monitor failed with rc %d", rc); +- mainloop_test_done(FALSE); ++ mainloop_test_done(__func__, false); + } + + register_callback_helper(rc); +@@ -531,7 +528,7 @@ test_register_async_devices(int check_event) + params); + stonith_key_value_freeall(params, 1, 1); + +- mainloop_test_done(TRUE); ++ mainloop_test_done(__func__, true); + } + + static void +@@ -540,11 +537,11 @@ 
try_mainloop_connect(int check_event) + int rc = stonith_api_connect_retry(st, crm_system_name, 10); + + if (rc == pcmk_ok) { +- mainloop_test_done(TRUE); ++ mainloop_test_done(__func__, true); + return; + } + crm_err("API CONNECTION FAILURE"); +- mainloop_test_done(FALSE); ++ mainloop_test_done(__func__, false); + } + + static void +-- +2.27.0 + + +From 8ff4b384a34828a4a9eebe896324ba8c89e5d66c Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Mon, 10 Jan 2022 10:27:45 -0600 +Subject: [PATCH 13/17] Doc: Pacemaker Development: correct typo + +caught in review +--- + doc/sphinx/Pacemaker_Development/components.rst | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/doc/sphinx/Pacemaker_Development/components.rst b/doc/sphinx/Pacemaker_Development/components.rst +index 68158484ce..c4d10fc9f5 100644 +--- a/doc/sphinx/Pacemaker_Development/components.rst ++++ b/doc/sphinx/Pacemaker_Development/components.rst +@@ -171,7 +171,7 @@ messaging layer callback, which calls: + + * ``fenced_process_fencing_reply()``, which calls either + ``request_peer_fencing()`` (to retry a failed operation, or try the next +- device in a topology is appropriate, which issues a new ++ device in a topology if appropriate, which issues a new + ``STONITH_OP_FENCE`` request, proceeding as before) or + ``finalize_op()`` (if the operation is definitively failed or + successful). 
+-- +2.27.0 + + +From 822ee6fbd8583a2939c636b3bccceffcc338c567 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Mon, 10 Jan 2022 11:05:40 -0600 +Subject: [PATCH 14/17] Doc: Pacemaker Development: add a placeholder for how + fencing history works + +--- + doc/sphinx/Pacemaker_Development/components.rst | 15 +++++++++++++++ + 1 file changed, 15 insertions(+) + +diff --git a/doc/sphinx/Pacemaker_Development/components.rst b/doc/sphinx/Pacemaker_Development/components.rst +index c4d10fc9f5..760da77c9b 100644 +--- a/doc/sphinx/Pacemaker_Development/components.rst ++++ b/doc/sphinx/Pacemaker_Development/components.rst +@@ -183,6 +183,21 @@ Finally, all peers receive the broadcast result and call + * ``finalize_op()``, which sends the result to all local clients. + + ++.. index:: ++ single: fence history ++ ++Fencing History ++_______________ ++ ++The fencer keeps a running history of all fencing operations. The bulk of the ++relevant code is in `fenced_history.c` and ensures the history is synchronized ++across all nodes even if a node leaves and rejoins the cluster. ++ ++In libstonithd, this information is represented by `stonith_history_t` and is ++queryable by the `stonith_api_operations_t:history()` method. `crm_mon` and ++`stonith_admin` use this API to display the history. ++ ++ + .. 
index:: + single: scheduler + single: pacemaker-schedulerd +-- +2.27.0 + + +From d9b4060f2dadb40d5ee7535e0b2890a83d216c1e Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Mon, 10 Jan 2022 11:25:31 -0600 +Subject: [PATCH 15/17] Log: fencing: add exit reason for results without a + callback + +--- + lib/fencing/st_client.c | 6 ++++-- + 1 file changed, 4 insertions(+), 2 deletions(-) + +diff --git a/lib/fencing/st_client.c b/lib/fencing/st_client.c +index 9d93ffd481..4823751267 100644 +--- a/lib/fencing/st_client.c ++++ b/lib/fencing/st_client.c +@@ -926,9 +926,11 @@ invoke_registered_callbacks(stonith_t *stonith, xmlNode *msg, int call_id) + cb_info->user_data, cb_info->callback); + + } else if ((private->op_callback == NULL) && !pcmk__result_ok(&result)) { +- crm_warn("Fencing action without registered callback failed: %d (%s)", ++ crm_warn("Fencing action without registered callback failed: %d (%s%s%s)", + result.exit_status, +- pcmk_exec_status_str(result.execution_status)); ++ pcmk_exec_status_str(result.execution_status), ++ ((result.exit_reason == NULL)? "" : ": "), ++ ((result.exit_reason == NULL)? "" : result.exit_reason)); + crm_log_xml_debug(msg, "Failed fence update"); + } + +-- +2.27.0 + + +From 9956b3ad2f1c6fba305252616ad0b35a38ab96da Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Tue, 11 Jan 2022 09:28:27 -0600 +Subject: [PATCH 16/17] Refactor: executor: keep formatting consistent + +... even if the line runs a little long +--- + daemons/execd/execd_commands.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/daemons/execd/execd_commands.c b/daemons/execd/execd_commands.c +index 7ae309d94c..bc3b392b2c 100644 +--- a/daemons/execd/execd_commands.c ++++ b/daemons/execd/execd_commands.c +@@ -1,5 +1,5 @@ + /* +- * Copyright 2012-2021 the Pacemaker project contributors ++ * Copyright 2012-2022 the Pacemaker project contributors + * + * The version control history for this file may have further details. 
+ * +@@ -1297,7 +1297,7 @@ lrmd_rsc_execute_stonith(lrmd_rsc_t * rsc, lrmd_cmd_t * cmd) + stonith_action_complete(cmd, + ((rc == pcmk_ok)? CRM_EX_OK : CRM_EX_ERROR), + stonith__legacy2status(rc), +- rc == -pcmk_err_generic? NULL : pcmk_strerror(rc)); ++ ((rc == -pcmk_err_generic)? NULL : pcmk_strerror(rc))); + } + + static int +-- +2.27.0 + + +From 69d8ecb17568d6c3ecad0e5735756f58a4bce5a1 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Tue, 11 Jan 2022 09:29:03 -0600 +Subject: [PATCH 17/17] Test: cts-fence-helper: use more intuitive execution + status for completed tests + +It doesn't matter since the value is only checked against a couple of specific +failure values, but this is less confusing. +--- + daemons/fenced/cts-fence-helper.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/daemons/fenced/cts-fence-helper.c b/daemons/fenced/cts-fence-helper.c +index 2739f57804..e222a59f9f 100644 +--- a/daemons/fenced/cts-fence-helper.c ++++ b/daemons/fenced/cts-fence-helper.c +@@ -1,5 +1,5 @@ + /* +- * Copyright 2009-2021 the Pacemaker project contributors ++ * Copyright 2009-2022 the Pacemaker project contributors + * + * This source code is licensed under the GNU General Public License version 2 + * or later (GPLv2+) WITHOUT ANY WARRANTY. 
+@@ -89,7 +89,7 @@ mainloop_test_done(const char *origin, bool pass) + crm_info("SUCCESS - %s", origin); + mainloop_iter++; + mainloop_set_trigger(trig); +- result.execution_status = PCMK_EXEC_UNKNOWN; ++ result.execution_status = PCMK_EXEC_DONE; + result.exit_status = CRM_EX_OK; + } else { + crm_err("FAILURE - %s (%d: %s)", origin, result.exit_status, +-- +2.27.0 + diff --git a/SOURCES/012-notify-crash.patch b/SOURCES/012-notify-crash.patch new file mode 100644 index 0000000..c18e4f5 --- /dev/null +++ b/SOURCES/012-notify-crash.patch @@ -0,0 +1,65 @@ +From ed8b2c86ab77aaa3d7fd688c049ad5e1b922a9c6 Mon Sep 17 00:00:00 2001 +From: Reid Wahl +Date: Thu, 13 Jan 2022 02:56:55 -0800 +Subject: [PATCH] Fix: liblrmd: Avoid double-free during notify operation + +This commit fixes a regression introduced by 31c7fa8a, causing a +double-free in notify operations. lrmd_dispatch_internal() assigns the +exit_reason string directly from an XML node to a new lrmd_event_data_t +object (without duplicating), and this string gets freed twice. + +Free #1: pcmk__create_history_xml() (reached via callback) calls +lrmd__set_result(), which frees event.exit_reason and sets it to NULL. +Free #2: lrmd_ipc_dispatch() frees the XML node, which contains a +pointer to the exit_reason string just freed, after +lrmd_dispatch_internal() returns. + +Prior to 31c7fa8a, pcmk__create_history_xml reset event.rc and +event.op_status but **not** event.exit_reason. + +In this commit we simply make a copy of event.exit_reason in +lrmd_dispatch_internal() before the callback. This way we don't have to +worry about whatever happens in the callback, and we can continue to +unset the exit_reason alongside the rc and op_status. The added overhead +should be minimal. + +This commit also makes a copy of output. That's not strictly necessary +but adds some futureproofing and allows us to call lrmd__reset_result() +at the end of lrmd_dispatch_internal(). 
+ +Resolves: RHBZ#2039675 + +Signed-off-by: Reid Wahl +--- + lib/lrmd/lrmd_client.c | 8 +++++--- + 1 file changed, 5 insertions(+), 3 deletions(-) + +diff --git a/lib/lrmd/lrmd_client.c b/lib/lrmd/lrmd_client.c +index ee31bb5ae9..5131a648b7 100644 +--- a/lib/lrmd/lrmd_client.c ++++ b/lib/lrmd/lrmd_client.c +@@ -305,9 +305,10 @@ lrmd_dispatch_internal(lrmd_t * lrmd, xmlNode * msg) + event.user_data = crm_element_value(msg, F_LRMD_RSC_USERDATA_STR); + event.type = lrmd_event_exec_complete; + +- // No need to duplicate the memory, so don't use setter functions +- event.output = crm_element_value(msg, F_LRMD_RSC_OUTPUT); +- event.exit_reason = crm_element_value(msg, F_LRMD_RSC_EXIT_REASON); ++ /* output and exit_reason may be freed by a callback */ ++ event.output = crm_element_value_copy(msg, F_LRMD_RSC_OUTPUT); ++ lrmd__set_result(&event, event.rc, event.op_status, ++ crm_element_value(msg, F_LRMD_RSC_EXIT_REASON)); + + event.params = xml2list(msg); + } else if (pcmk__str_eq(type, LRMD_OP_NEW_CLIENT, pcmk__str_none)) { +@@ -324,6 +325,7 @@ lrmd_dispatch_internal(lrmd_t * lrmd, xmlNode * msg) + if (event.params) { + g_hash_table_destroy(event.params); + } ++ lrmd__reset_result(&event); + } + + // \return Always 0, to indicate that IPC mainloop source should be kept +-- +2.27.0 + diff --git a/SOURCES/013-probe-failures.patch b/SOURCES/013-probe-failures.patch new file mode 100644 index 0000000..c13867e --- /dev/null +++ b/SOURCES/013-probe-failures.patch @@ -0,0 +1,26 @@ +From 186d5a02fba919c455fd6eeb050b4be107f82159 Mon Sep 17 00:00:00 2001 +From: Chris Lumens +Date: Thu, 13 Jan 2022 17:02:47 -0500 +Subject: [PATCH] Low: scheduler: Use the old RC code to log maskable probe + failures. 
+ +--- + lib/pengine/unpack.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/lib/pengine/unpack.c b/lib/pengine/unpack.c +index 8a2d2a6d6d..b01f86257a 100644 +--- a/lib/pengine/unpack.c ++++ b/lib/pengine/unpack.c +@@ -3780,7 +3780,7 @@ unpack_rsc_op(pe_resource_t *rsc, pe_node_t *node, xmlNode *xml_op, + + if (maskable_probe_failure) { + crm_notice("Treating probe result '%s' for %s on %s as 'not running'", +- services_ocf_exitcode_str(rc), rsc->id, node->details->uname); ++ services_ocf_exitcode_str(old_rc), rsc->id, node->details->uname); + update_resource_state(rsc, node, xml_op, task, target_rc, *last_failure, + on_fail, data_set); + crm_xml_add(xml_op, XML_ATTR_UNAME, node->details->uname); +-- +2.27.0 + diff --git a/SOURCES/014-pcmk_delay_base.patch b/SOURCES/014-pcmk_delay_base.patch new file mode 100644 index 0000000..8aba265 --- /dev/null +++ b/SOURCES/014-pcmk_delay_base.patch @@ -0,0 +1,43 @@ +From 9d812b0401d4cedef53a3cc3653ec782a5c49e37 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Thu, 13 Jan 2022 10:42:02 -0600 +Subject: [PATCH] Doc: fencer: improve pcmk_delay_base meta-data + +Update its type, since its value can now be a node map as well as a string, +and add more detail to its description. 
+--- + daemons/fenced/pacemaker-fenced.c | 18 +++++++++++------- + 1 file changed, 11 insertions(+), 7 deletions(-) + +diff --git a/daemons/fenced/pacemaker-fenced.c b/daemons/fenced/pacemaker-fenced.c +index 1b954be5a4..12f331496c 100644 +--- a/daemons/fenced/pacemaker-fenced.c ++++ b/daemons/fenced/pacemaker-fenced.c +@@ -1548,13 +1548,17 @@ main(int argc, char **argv) + PCMK_STONITH_DELAY_BASE); + printf(" Enable a base delay for " + "fencing actions and specify base delay value.\n"); +- printf(" This prevents double fencing when " +- "different delays are configured on the nodes.\nUse this to " +- "enable a static delay for fencing actions.\nThe overall delay " +- "is derived from a random delay value adding this static delay " +- "so that the sum is kept below the maximum delay.\nSet to eg. " +- "node1:1s;node2:5 to set different value per node.\n"); +- printf(" \n"); ++ printf(" This enables a static delay for " ++ "fencing actions, which can help avoid \"death matches\" where " ++ "two nodes try to fence each other at the same time. If " ++ PCMK_STONITH_DELAY_MAX " is also used, a random delay will be " ++ "added such that the total delay is kept below that value.\n" ++ "This can be set to a single time value to apply to any node " ++ "targeted by this device (useful if a separate device is " ++ "configured for each target), or to a node map (for example, " ++ "\"node1:1s;node2:5\") to set a different value per target.\n" ++ " \n"); ++ printf(" \n"); + printf(" \n"); + + printf(" \n", +-- +2.27.0 + diff --git a/SOURCES/015-fencing-reasons.patch b/SOURCES/015-fencing-reasons.patch new file mode 100644 index 0000000..c53b6c9 --- /dev/null +++ b/SOURCES/015-fencing-reasons.patch @@ -0,0 +1,1093 @@ +From 87365f49b1bee0baa536783865fbd835a9cacc97 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Thu, 2 Dec 2021 16:12:24 -0600 +Subject: [PATCH 01/11] Refactor: libstonithd: functionize getting notification + data XML + +Also, only get the data when needed. 
+--- + lib/fencing/st_client.c | 32 +++++++++++++++++++++++--------- + 1 file changed, 23 insertions(+), 9 deletions(-) + +diff --git a/lib/fencing/st_client.c b/lib/fencing/st_client.c +index 4823751267..72a0a49408 100644 +--- a/lib/fencing/st_client.c ++++ b/lib/fencing/st_client.c +@@ -1312,6 +1312,23 @@ stonith_dump_pending_callbacks(stonith_t * stonith) + return g_hash_table_foreach(private->stonith_op_callback_table, stonith_dump_pending_op, NULL); + } + ++/*! ++ * \internal ++ * \brief Get the data section of a fencer notification ++ * ++ * \param[in] msg Notification XML ++ * \param[in] ntype Notification type ++ */ ++static xmlNode * ++get_event_data_xml(xmlNode *msg, const char *ntype) ++{ ++ char *data_addr = crm_strdup_printf("//%s", ntype); ++ xmlNode *data = get_xpath_object(data_addr, msg, LOG_DEBUG); ++ ++ free(data_addr); ++ return data; ++} ++ + /* + + +@@ -1336,17 +1353,18 @@ xml_to_event(xmlNode * msg) + { + stonith_event_t *event = calloc(1, sizeof(stonith_event_t)); + const char *ntype = crm_element_value(msg, F_SUBTYPE); +- char *data_addr = crm_strdup_printf("//%s", ntype); +- xmlNode *data = get_xpath_object(data_addr, msg, LOG_DEBUG); + + crm_log_xml_trace(msg, "stonith_notify"); + + crm_element_value_int(msg, F_STONITH_RC, &(event->result)); + + if (pcmk__str_eq(ntype, T_STONITH_NOTIFY_FENCE, pcmk__str_casei)) { +- event->operation = crm_element_value_copy(msg, F_STONITH_OPERATION); ++ xmlNode *data = get_event_data_xml(msg, ntype); + +- if (data) { ++ if (data == NULL) { ++ crm_err("No data for %s event", ntype); ++ crm_log_xml_notice(msg, "BadEvent"); ++ } else { + event->origin = crm_element_value_copy(data, F_STONITH_ORIGIN); + event->action = crm_element_value_copy(data, F_STONITH_ACTION); + event->target = crm_element_value_copy(data, F_STONITH_TARGET); +@@ -1354,14 +1372,10 @@ xml_to_event(xmlNode * msg) + event->id = crm_element_value_copy(data, F_STONITH_REMOTE_OP_ID); + event->client_origin = crm_element_value_copy(data, 
F_STONITH_CLIENTNAME); + event->device = crm_element_value_copy(data, F_STONITH_DEVICE); +- +- } else { +- crm_err("No data for %s event", ntype); +- crm_log_xml_notice(msg, "BadEvent"); + } ++ event->operation = crm_element_value_copy(msg, F_STONITH_OPERATION); + } + +- free(data_addr); + return event; + } + +-- +2.27.0 + + +From 448f86a029d5d7e3c255d813929003a8cc2cffba Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Fri, 19 Nov 2021 17:01:23 -0600 +Subject: [PATCH 02/11] Refactor: fencing: parse full result from fencer + notifications + +stonith_event_t previously contained only the legacy return code for the +notification event. Use its new opaque member to store the full result, along +with accessors (available only internally for now). Nothing uses them yet. +--- + include/crm/fencing/internal.h | 5 +++ + lib/fencing/st_client.c | 68 ++++++++++++++++++++++++++++++++-- + 2 files changed, 70 insertions(+), 3 deletions(-) + +diff --git a/include/crm/fencing/internal.h b/include/crm/fencing/internal.h +index eff689e59b..acc16d05e9 100644 +--- a/include/crm/fencing/internal.h ++++ b/include/crm/fencing/internal.h +@@ -187,10 +187,15 @@ bool stonith__event_state_eq(stonith_history_t *history, void *user_data); + bool stonith__event_state_neq(stonith_history_t *history, void *user_data); + + int stonith__legacy2status(int rc); ++ + int stonith__exit_status(stonith_callback_data_t *data); + int stonith__execution_status(stonith_callback_data_t *data); + const char *stonith__exit_reason(stonith_callback_data_t *data); + ++int stonith__event_exit_status(stonith_event_t *event); ++int stonith__event_execution_status(stonith_event_t *event); ++const char *stonith__event_exit_reason(stonith_event_t *event); ++ + /*! + * \internal + * \brief Is a fencing operation in pending state? 
+diff --git a/lib/fencing/st_client.c b/lib/fencing/st_client.c +index 72a0a49408..f58b3a6745 100644 +--- a/lib/fencing/st_client.c ++++ b/lib/fencing/st_client.c +@@ -1349,15 +1349,23 @@ get_event_data_xml(xmlNode *msg, const char *ntype) + + */ + static stonith_event_t * +-xml_to_event(xmlNode * msg) ++xml_to_event(xmlNode *msg, pcmk__action_result_t *result) + { + stonith_event_t *event = calloc(1, sizeof(stonith_event_t)); + const char *ntype = crm_element_value(msg, F_SUBTYPE); + ++ CRM_ASSERT((event != NULL) && (result != NULL)); ++ + crm_log_xml_trace(msg, "stonith_notify"); + +- crm_element_value_int(msg, F_STONITH_RC, &(event->result)); ++ // All notification types have the operation result ++ event->opaque = result; ++ stonith__xe_get_result(msg, result); ++ ++ // @COMPAT The API originally provided the result as a legacy return code ++ event->result = pcmk_rc2legacy(stonith__result2rc(result)); + ++ // Fence notifications have additional information + if (pcmk__str_eq(ntype, T_STONITH_NOTIFY_FENCE, pcmk__str_casei)) { + xmlNode *data = get_event_data_xml(msg, ntype); + +@@ -1392,6 +1400,7 @@ event_free(stonith_event_t * event) + free(event->executioner); + free(event->device); + free(event->client_origin); ++ pcmk__reset_result((pcmk__action_result_t *) (event->opaque)); + free(event); + } + +@@ -1402,6 +1411,7 @@ stonith_send_notification(gpointer data, gpointer user_data) + stonith_notify_client_t *entry = data; + stonith_event_t *st_event = NULL; + const char *event = NULL; ++ pcmk__action_result_t result = PCMK__UNKNOWN_RESULT; + + if (blob->xml == NULL) { + crm_warn("Skipping callback - NULL message"); +@@ -1427,7 +1437,7 @@ stonith_send_notification(gpointer data, gpointer user_data) + return; + } + +- st_event = xml_to_event(blob->xml); ++ st_event = xml_to_event(blob->xml, &result); + + crm_trace("Invoking callback for %p/%s event...", entry, event); + entry->notify(blob->stonith, st_event); +@@ -2366,6 +2376,58 @@ 
stonith__exit_reason(stonith_callback_data_t *data) + return ((pcmk__action_result_t *) data->opaque)->exit_reason; + } + ++/*! ++ * \internal ++ * \brief Return the exit status from an event notification ++ * ++ * \param[in] event Event ++ * ++ * \return Exit status from event ++ */ ++int ++stonith__event_exit_status(stonith_event_t *event) ++{ ++ if ((event == NULL) || (event->opaque == NULL)) { ++ return CRM_EX_ERROR; ++ } ++ return ((pcmk__action_result_t *) event->opaque)->exit_status; ++} ++ ++/*! ++ * \internal ++ * \brief Return the execution status from an event notification ++ * ++ * \param[in] event Event ++ * ++ * \return Execution status from event ++ */ ++int ++stonith__event_execution_status(stonith_event_t *event) ++{ ++ if ((event == NULL) || (event->opaque == NULL)) { ++ return PCMK_EXEC_UNKNOWN; ++ } ++ return ((pcmk__action_result_t *) event->opaque)->execution_status; ++} ++ ++/*! ++ * \internal ++ * \brief Return the exit reason from an event notification ++ * ++ * \param[in] event Event ++ * ++ * \return Exit reason from event ++ */ ++const char * ++stonith__event_exit_reason(stonith_event_t *event) ++{ ++ if ((event == NULL) || (event->opaque == NULL)) { ++ return NULL; ++ } ++ return ((pcmk__action_result_t *) event->opaque)->exit_reason; ++} ++ ++ + // Deprecated functions kept only for backward API compatibility + // LCOV_EXCL_START + +-- +2.27.0 + + +From 8dab65e65fe760052d1151749a7bfb2203445813 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Fri, 19 Nov 2021 17:02:28 -0600 +Subject: [PATCH 03/11] Refactor: fencing: parse full result from synchronous + fencer replies + +stonith_send_command() now parses the full result from synchronous fencer +replies, and maps that to a legacy return code, rather than parse the legacy +return code directly. + +The full result is not used yet, and won't be until we can break backward API +compatibility, since the API functions that call stonith_send_command() +currently return a legacy code. 
+--- + lib/fencing/st_client.c | 8 +++++--- + 1 file changed, 5 insertions(+), 3 deletions(-) + +diff --git a/lib/fencing/st_client.c b/lib/fencing/st_client.c +index f58b3a6745..5fec7529e3 100644 +--- a/lib/fencing/st_client.c ++++ b/lib/fencing/st_client.c +@@ -1537,11 +1537,13 @@ stonith_send_command(stonith_t * stonith, const char *op, xmlNode * data, xmlNod + crm_element_value_int(op_reply, F_STONITH_CALLID, &reply_id); + + if (reply_id == stonith->call_id) { ++ pcmk__action_result_t result = PCMK__UNKNOWN_RESULT; ++ + crm_trace("Synchronous reply %d received", reply_id); + +- if (crm_element_value_int(op_reply, F_STONITH_RC, &rc) != 0) { +- rc = -ENOMSG; +- } ++ stonith__xe_get_result(op_reply, &result); ++ rc = pcmk_rc2legacy(stonith__result2rc(&result)); ++ pcmk__reset_result(&result); + + if ((call_options & st_opt_discard_reply) || output_data == NULL) { + crm_trace("Discarding reply"); +-- +2.27.0 + + +From 1beb319d8c62ab93b4c08b26a4e03151906c6189 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Mon, 6 Dec 2021 17:13:44 -0600 +Subject: [PATCH 04/11] Log: fencing: improve cts-fence-helper result logs + +Use the full result from the fencing event +--- + daemons/fenced/cts-fence-helper.c | 12 ++++++++---- + 1 file changed, 8 insertions(+), 4 deletions(-) + +diff --git a/daemons/fenced/cts-fence-helper.c b/daemons/fenced/cts-fence-helper.c +index e222a59f9f..858cddc9de 100644 +--- a/daemons/fenced/cts-fence-helper.c ++++ b/daemons/fenced/cts-fence-helper.c +@@ -125,10 +125,14 @@ st_callback(stonith_t * st, stonith_event_t * e) + crm_exit(CRM_EX_DISCONNECT); + } + +- crm_notice("Operation %s requested by %s %s for peer %s. %s reported: %s (ref=%s)", +- e->operation, e->origin, e->result == pcmk_ok ? "completed" : "failed", +- e->target, e->executioner ? e->executioner : "", +- pcmk_strerror(e->result), e->id); ++ crm_notice("Operation '%s' targeting %s by %s for %s: %s (exit=%d, ref=%s)", ++ ((e->operation == NULL)? 
"unknown" : e->operation), ++ ((e->target == NULL)? "no node" : e->target), ++ ((e->executioner == NULL)? "any node" : e->executioner), ++ ((e->origin == NULL)? "unknown client" : e->origin), ++ pcmk_exec_status_str(stonith__event_execution_status(e)), ++ stonith__event_exit_status(e), ++ ((e->id == NULL)? "none" : e->id)); + + if (expected_notifications) { + expected_notifications--; +-- +2.27.0 + + +From b26f701833ade5d7441fba317832d6e827bd16d0 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Tue, 14 Dec 2021 16:52:09 -0600 +Subject: [PATCH 05/11] Test: cts-fence-helper: update expected return code + +Before recent changes, libstonithd obtained the fence API's legacy result code +directly from the fencer's XML reply, meaning that the legacy code was the +result of the fencer's mapping of the full result (including the action stderr). + +After those changes, libstonithd now ignores the legacy code in the fencer's +reply, and instead maps the legacy code itself from the full result in the +fencer's reply. + +However, the fencer's reply does not have the action stderr, so failures that +mapped to -pcmk_err_generic on the server side now map to -ENODATA on the +client side. Update cts-fence-helper's expected return code to match (neither +code is particularly useful, so there wouldn't be much benefit from having the +fencer pass the action stderr with replies, which would be considerable +additional work). 
+--- + daemons/fenced/cts-fence-helper.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/daemons/fenced/cts-fence-helper.c b/daemons/fenced/cts-fence-helper.c +index 858cddc9de..e3113452ef 100644 +--- a/daemons/fenced/cts-fence-helper.c ++++ b/daemons/fenced/cts-fence-helper.c +@@ -207,10 +207,10 @@ run_fence_failure_test(void) + "Register device1 for failure test", 1, 0); + + single_test(st->cmds->fence(st, st_opts, "false_1_node2", "off", 3, 0), +- "Fence failure results off", 1, -pcmk_err_generic); ++ "Fence failure results off", 1, -ENODATA); + + single_test(st->cmds->fence(st, st_opts, "false_1_node2", "reboot", 3, 0), +- "Fence failure results reboot", 1, -pcmk_err_generic); ++ "Fence failure results reboot", 1, -ENODATA); + + single_test(st->cmds->remove_device(st, st_opts, "test-id1"), + "Remove device1 for failure test", 1, 0); +-- +2.27.0 + + +From 123429de229c2148e320c76530b95e6ba458b9f6 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Tue, 7 Dec 2021 10:28:48 -0600 +Subject: [PATCH 06/11] Low: controller: compare fencing targets + case-insensitively + +... since they are node names +--- + daemons/controld/controld_fencing.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/daemons/controld/controld_fencing.c b/daemons/controld/controld_fencing.c +index f8d2fc13f4..70e141dc28 100644 +--- a/daemons/controld/controld_fencing.c ++++ b/daemons/controld/controld_fencing.c +@@ -466,7 +466,7 @@ tengine_stonith_notify(stonith_t *st, stonith_event_t *st_event) + return; + + } else if ((st_event->result == pcmk_ok) +- && pcmk__str_eq(st_event->target, fsa_our_uname, pcmk__str_none)) { ++ && pcmk__str_eq(st_event->target, fsa_our_uname, pcmk__str_casei)) { + + /* We were notified of our own fencing. 
Most likely, either fencing was + * misconfigured, or fabric fencing that doesn't cut cluster +-- +2.27.0 + + +From 3a067b8e58b3aefb49b2af1c35d0ad28b2de8784 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Tue, 7 Dec 2021 10:37:56 -0600 +Subject: [PATCH 07/11] Refactor: controller: best practices for handling + fencing notifications + +Rename tengine_stonith_notify() to handle_fence_notification(), rename its +st_event argument to event, add a doxygen block, and use some new variables and +reformatting to make it easier to follow (and change later). +--- + daemons/controld/controld_fencing.c | 131 ++++++++++++++++------------ + 1 file changed, 75 insertions(+), 56 deletions(-) + +diff --git a/daemons/controld/controld_fencing.c b/daemons/controld/controld_fencing.c +index 70e141dc28..00626444da 100644 +--- a/daemons/controld/controld_fencing.c ++++ b/daemons/controld/controld_fencing.c +@@ -435,39 +435,59 @@ tengine_stonith_connection_destroy(stonith_t *st, stonith_event_t *e) + } + } + ++/*! 
++ * \internal ++ * \brief Handle an event notification from the fencing API ++ * ++ * \param[in] st Fencing API connection ++ * \param[in] event Fencing API event notification ++ */ + static void +-tengine_stonith_notify(stonith_t *st, stonith_event_t *st_event) ++handle_fence_notification(stonith_t *st, stonith_event_t *event) + { ++ bool succeeded = true; ++ const char *executioner = "the cluster"; ++ const char *client = "a client"; ++ + if (te_client_id == NULL) { + te_client_id = crm_strdup_printf("%s.%lu", crm_system_name, + (unsigned long) getpid()); + } + +- if (st_event == NULL) { ++ if (event == NULL) { + crm_err("Notify data not found"); + return; + } + +- crmd_alert_fencing_op(st_event); ++ if (event->executioner != NULL) { ++ executioner = event->executioner; ++ } ++ if (event->client_origin != NULL) { ++ client = event->client_origin; ++ } + +- if ((st_event->result == pcmk_ok) && pcmk__str_eq("on", st_event->action, pcmk__str_casei)) { +- crm_notice("%s was successfully unfenced by %s (at the request of %s)", +- st_event->target, +- st_event->executioner? st_event->executioner : "", +- st_event->origin); +- /* TODO: Hook up st_event->device */ +- return; ++ if (event->result != pcmk_ok) { ++ succeeded = false; ++ } + +- } else if (pcmk__str_eq("on", st_event->action, pcmk__str_casei)) { +- crm_err("Unfencing of %s by %s failed: %s (%d)", +- st_event->target, +- st_event->executioner? 
st_event->executioner : "", +- pcmk_strerror(st_event->result), st_event->result); +- return; ++ crmd_alert_fencing_op(event); + +- } else if ((st_event->result == pcmk_ok) +- && pcmk__str_eq(st_event->target, fsa_our_uname, pcmk__str_casei)) { ++ if (pcmk__str_eq("on", event->action, pcmk__str_none)) { ++ // Unfencing doesn't need special handling, just a log message ++ if (succeeded) { ++ crm_notice("%s was successfully unfenced by %s (at the request of %s)", ++ event->target, executioner, event->origin); ++ /* TODO: Hook up event->device */ ++ } else { ++ crm_err("Unfencing of %s by %s failed: %s (%d)", ++ event->target, executioner, ++ pcmk_strerror(st_event->result), st_event->result); ++ } ++ return; ++ } + ++ if (succeeded ++ && pcmk__str_eq(event->target, fsa_our_uname, pcmk__str_casei)) { + /* We were notified of our own fencing. Most likely, either fencing was + * misconfigured, or fabric fencing that doesn't cut cluster + * communication is in use. +@@ -478,44 +498,41 @@ tengine_stonith_notify(stonith_t *st, stonith_event_t *st_event) + * our subsequent election votes as "not part of our cluster". + */ + crm_crit("We were allegedly just fenced by %s for %s!", +- st_event->executioner? st_event->executioner : "the cluster", +- st_event->origin); /* Dumps blackbox if enabled */ ++ executioner, event->origin); // Dumps blackbox if enabled + if (fence_reaction_panic) { + pcmk__panic(__func__); + } else { + crm_exit(CRM_EX_FATAL); + } +- return; ++ return; // Should never get here + } + +- /* Update the count of stonith failures for this target, in case we become ++ /* Update the count of fencing failures for this target, in case we become + * DC later. The current DC has already updated its fail count in + * tengine_stonith_callback(). 
+ */ +- if (!AM_I_DC && pcmk__str_eq(st_event->operation, T_STONITH_NOTIFY_FENCE, pcmk__str_casei)) { +- if (st_event->result == pcmk_ok) { +- st_fail_count_reset(st_event->target); ++ if (!AM_I_DC ++ && pcmk__str_eq(event->operation, T_STONITH_NOTIFY_FENCE, ++ pcmk__str_casei)) { ++ ++ if (succeeded) { ++ st_fail_count_reset(event->target); + } else { +- st_fail_count_increment(st_event->target); ++ st_fail_count_increment(event->target); + } + } + + crm_notice("Peer %s was%s terminated (%s) by %s on behalf of %s: %s " + CRM_XS " initiator=%s ref=%s", +- st_event->target, st_event->result == pcmk_ok ? "" : " not", +- st_event->action, +- st_event->executioner ? st_event->executioner : "", +- (st_event->client_origin? st_event->client_origin : ""), +- pcmk_strerror(st_event->result), +- st_event->origin, st_event->id); +- +- if (st_event->result == pcmk_ok) { +- crm_node_t *peer = pcmk__search_known_node_cache(0, st_event->target, ++ event->target, (succeeded? "" : " not"), ++ event->action, executioner, client, ++ pcmk_strerror(event->result), ++ event->origin, event->id); ++ ++ if (succeeded) { ++ crm_node_t *peer = pcmk__search_known_node_cache(0, event->target, + CRM_GET_PEER_ANY); + const char *uuid = NULL; +- gboolean we_are_executioner = pcmk__str_eq(st_event->executioner, +- fsa_our_uname, +- pcmk__str_casei); + + if (peer == NULL) { + return; +@@ -523,10 +540,9 @@ tengine_stonith_notify(stonith_t *st, stonith_event_t *st_event) + + uuid = crm_peer_uuid(peer); + +- crm_trace("target=%s dc=%s", st_event->target, fsa_our_dc); +- if(AM_I_DC) { ++ if (AM_I_DC) { + /* The DC always sends updates */ +- send_stonith_update(NULL, st_event->target, uuid); ++ send_stonith_update(NULL, event->target, uuid); + + /* @TODO Ideally, at this point, we'd check whether the fenced node + * hosted any guest nodes, and call remote_node_down() for them. 
+@@ -536,31 +552,33 @@ tengine_stonith_notify(stonith_t *st, stonith_event_t *st_event) + * on the scheduler creating fence pseudo-events for the guests. + */ + +- if (st_event->client_origin +- && !pcmk__str_eq(st_event->client_origin, te_client_id, pcmk__str_casei)) { +- +- /* Abort the current transition graph if it wasn't us +- * that invoked stonith to fence someone ++ if (!pcmk__str_eq(client, te_client_id, pcmk__str_casei)) { ++ /* Abort the current transition if it wasn't the cluster that ++ * initiated fencing. + */ +- crm_info("External fencing operation from %s fenced %s", st_event->client_origin, st_event->target); +- abort_transition(INFINITY, tg_restart, "External Fencing Operation", NULL); ++ crm_info("External fencing operation from %s fenced %s", ++ client, event->target); ++ abort_transition(INFINITY, tg_restart, ++ "External Fencing Operation", NULL); + } + + /* Assume it was our leader if we don't currently have one */ +- } else if (pcmk__str_eq(fsa_our_dc, st_event->target, pcmk__str_null_matches | pcmk__str_casei) ++ } else if (pcmk__str_eq(fsa_our_dc, event->target, ++ pcmk__str_null_matches|pcmk__str_casei) + && !pcmk_is_set(peer->flags, crm_remote_node)) { + + crm_notice("Fencing target %s %s our leader", +- st_event->target, (fsa_our_dc? "was" : "may have been")); ++ event->target, (fsa_our_dc? 
"was" : "may have been")); + + /* Given the CIB resyncing that occurs around elections, + * have one node update the CIB now and, if the new DC is different, + * have them do so too after the election + */ +- if (we_are_executioner) { +- send_stonith_update(NULL, st_event->target, uuid); ++ if (pcmk__str_eq(event->executioner, fsa_our_uname, ++ pcmk__str_casei)) { ++ send_stonith_update(NULL, event->target, uuid); + } +- add_stonith_cleanup(st_event->target); ++ add_stonith_cleanup(event->target); + } + + /* If the target is a remote node, and we host its connection, +@@ -569,7 +587,7 @@ tengine_stonith_notify(stonith_t *st, stonith_event_t *st_event) + * so the failure might not otherwise be detected until the next poke. + */ + if (pcmk_is_set(peer->flags, crm_remote_node)) { +- remote_ra_fail(st_event->target); ++ remote_ra_fail(event->target); + } + + crmd_peer_down(peer, TRUE); +@@ -632,7 +650,7 @@ te_connect_stonith(gpointer user_data) + tengine_stonith_connection_destroy); + stonith_api->cmds->register_notification(stonith_api, + T_STONITH_NOTIFY_FENCE, +- tengine_stonith_notify); ++ handle_fence_notification); + stonith_api->cmds->register_notification(stonith_api, + T_STONITH_NOTIFY_HISTORY_SYNCED, + tengine_stonith_history_synced); +@@ -837,7 +855,8 @@ tengine_stonith_callback(stonith_t *stonith, stonith_callback_data_t *data) + } + + /* Increment the fail count now, so abort_for_stonith_failure() can +- * check it. Non-DC nodes will increment it in tengine_stonith_notify(). ++ * check it. Non-DC nodes will increment it in ++ * handle_fence_notification(). 
+ */ + st_fail_count_increment(target); + abort_for_stonith_failure(abort_action, target, NULL); +-- +2.27.0 + + +From 5ec9dcbbe1ee7f6252968f87d7df5a5ea17244fb Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Tue, 7 Dec 2021 10:40:21 -0600 +Subject: [PATCH 08/11] Log: controller: improve messages when handling fencing + notifications + +Now that the fencing API provides a full result including exit reasons with +fencing event notifications, make the controller logs more useful and +consistent. +--- + daemons/controld/controld_fencing.c | 34 ++++++++++++++++++++--------- + 1 file changed, 24 insertions(+), 10 deletions(-) + +diff --git a/daemons/controld/controld_fencing.c b/daemons/controld/controld_fencing.c +index 00626444da..0aa9ef083c 100644 +--- a/daemons/controld/controld_fencing.c ++++ b/daemons/controld/controld_fencing.c +@@ -448,6 +448,8 @@ handle_fence_notification(stonith_t *st, stonith_event_t *event) + bool succeeded = true; + const char *executioner = "the cluster"; + const char *client = "a client"; ++ const char *reason = NULL; ++ int exec_status; + + if (te_client_id == NULL) { + te_client_id = crm_strdup_printf("%s.%lu", crm_system_name, +@@ -466,22 +468,31 @@ handle_fence_notification(stonith_t *st, stonith_event_t *event) + client = event->client_origin; + } + +- if (event->result != pcmk_ok) { ++ exec_status = stonith__event_execution_status(event); ++ if ((stonith__event_exit_status(event) != CRM_EX_OK) ++ || (exec_status != PCMK_EXEC_DONE)) { + succeeded = false; ++ if (exec_status == PCMK_EXEC_DONE) { ++ exec_status = PCMK_EXEC_ERROR; ++ } + } ++ reason = stonith__event_exit_reason(event); + + crmd_alert_fencing_op(event); + + if (pcmk__str_eq("on", event->action, pcmk__str_none)) { + // Unfencing doesn't need special handling, just a log message + if (succeeded) { +- crm_notice("%s was successfully unfenced by %s (at the request of %s)", +- event->target, executioner, event->origin); ++ crm_notice("%s was unfenced by %s at the request of 
%s@%s", ++ event->target, executioner, client, event->origin); + /* TODO: Hook up event->device */ + } else { +- crm_err("Unfencing of %s by %s failed: %s (%d)", ++ crm_err("Unfencing of %s by %s failed (%s%s%s) with exit status %d", + event->target, executioner, +- pcmk_strerror(st_event->result), st_event->result); ++ pcmk_exec_status_str(exec_status), ++ ((reason == NULL)? "" : ": "), ++ ((reason == NULL)? "" : reason), ++ stonith__event_exit_status(event)); + } + return; + } +@@ -522,12 +533,15 @@ handle_fence_notification(stonith_t *st, stonith_event_t *event) + } + } + +- crm_notice("Peer %s was%s terminated (%s) by %s on behalf of %s: %s " +- CRM_XS " initiator=%s ref=%s", ++ crm_notice("Peer %s was%s terminated (%s) by %s on behalf of %s@%s: " ++ "%s%s%s%s " CRM_XS " event=%s", + event->target, (succeeded? "" : " not"), +- event->action, executioner, client, +- pcmk_strerror(event->result), +- event->origin, event->id); ++ event->action, executioner, client, event->origin, ++ (succeeded? "OK" : pcmk_exec_status_str(exec_status)), ++ ((reason == NULL)? "" : " ("), ++ ((reason == NULL)? "" : reason), ++ ((reason == NULL)? 
"" : ")"), ++ event->id); + + if (succeeded) { + crm_node_t *peer = pcmk__search_known_node_cache(0, event->target, +-- +2.27.0 + + +From fb484933ce7c8f3325300a9e01a114db1bbb5b70 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Tue, 7 Dec 2021 11:33:15 -0600 +Subject: [PATCH 09/11] Refactor: controller: move alert functions into own + source file + +--- + daemons/controld/Makefile.am | 1 + + daemons/controld/controld_alerts.c | 92 +++++++++++++++++++++++++ + daemons/controld/controld_execd_state.c | 75 -------------------- + 3 files changed, 93 insertions(+), 75 deletions(-) + create mode 100644 daemons/controld/controld_alerts.c + +diff --git a/daemons/controld/Makefile.am b/daemons/controld/Makefile.am +index db45bcba4a..0a29925c0b 100644 +--- a/daemons/controld/Makefile.am ++++ b/daemons/controld/Makefile.am +@@ -43,6 +43,7 @@ pacemaker_controld_LDADD = $(top_builddir)/lib/fencing/libstonithd.la \ + $(CLUSTERLIBS) + + pacemaker_controld_SOURCES = pacemaker-controld.c \ ++ controld_alerts.c \ + controld_attrd.c \ + controld_callbacks.c \ + controld_based.c \ +diff --git a/daemons/controld/controld_alerts.c b/daemons/controld/controld_alerts.c +new file mode 100644 +index 0000000000..bd92795cf0 +--- /dev/null ++++ b/daemons/controld/controld_alerts.c +@@ -0,0 +1,92 @@ ++/* ++ * Copyright 2012-2021 the Pacemaker project contributors ++ * ++ * The version control history for this file may have further details. ++ * ++ * This source code is licensed under the GNU General Public License version 2 ++ * or later (GPLv2+) WITHOUT ANY WARRANTY. 
++ */ ++ ++#include ++ ++#include ++#include ++ ++#include ++#include ++#include ++#include ++#include ++ ++#include ++ ++static GList *crmd_alert_list = NULL; ++ ++void ++crmd_unpack_alerts(xmlNode *alerts) ++{ ++ pe_free_alert_list(crmd_alert_list); ++ crmd_alert_list = pe_unpack_alerts(alerts); ++} ++ ++void ++crmd_alert_node_event(crm_node_t *node) ++{ ++ lrm_state_t *lrm_state; ++ ++ if (crmd_alert_list == NULL) { ++ return; ++ } ++ ++ lrm_state = lrm_state_find(fsa_our_uname); ++ if (lrm_state == NULL) { ++ return; ++ } ++ ++ lrmd_send_node_alert((lrmd_t *) lrm_state->conn, crmd_alert_list, ++ node->uname, node->id, node->state); ++} ++ ++void ++crmd_alert_fencing_op(stonith_event_t * e) ++{ ++ char *desc; ++ lrm_state_t *lrm_state; ++ ++ if (crmd_alert_list == NULL) { ++ return; ++ } ++ ++ lrm_state = lrm_state_find(fsa_our_uname); ++ if (lrm_state == NULL) { ++ return; ++ } ++ ++ desc = crm_strdup_printf("Operation %s of %s by %s for %s@%s: %s (ref=%s)", ++ e->action, e->target, ++ (e->executioner? 
e->executioner : ""), ++ e->client_origin, e->origin, ++ pcmk_strerror(e->result), e->id); ++ ++ lrmd_send_fencing_alert((lrmd_t *) lrm_state->conn, crmd_alert_list, ++ e->target, e->operation, desc, e->result); ++ free(desc); ++} ++ ++void ++crmd_alert_resource_op(const char *node, lrmd_event_data_t * op) ++{ ++ lrm_state_t *lrm_state; ++ ++ if (crmd_alert_list == NULL) { ++ return; ++ } ++ ++ lrm_state = lrm_state_find(fsa_our_uname); ++ if (lrm_state == NULL) { ++ return; ++ } ++ ++ lrmd_send_resource_alert((lrmd_t *) lrm_state->conn, crmd_alert_list, node, ++ op); ++} +diff --git a/daemons/controld/controld_execd_state.c b/daemons/controld/controld_execd_state.c +index 67c376a426..5dce6c6d59 100644 +--- a/daemons/controld/controld_execd_state.c ++++ b/daemons/controld/controld_execd_state.c +@@ -777,78 +777,3 @@ lrm_state_unregister_rsc(lrm_state_t * lrm_state, + */ + return ((lrmd_t *) lrm_state->conn)->cmds->unregister_rsc(lrm_state->conn, rsc_id, options); + } +- +-/* +- * Functions for sending alerts via local executor connection +- */ +- +-static GList *crmd_alert_list = NULL; +- +-void +-crmd_unpack_alerts(xmlNode *alerts) +-{ +- pe_free_alert_list(crmd_alert_list); +- crmd_alert_list = pe_unpack_alerts(alerts); +-} +- +-void +-crmd_alert_node_event(crm_node_t *node) +-{ +- lrm_state_t *lrm_state; +- +- if (crmd_alert_list == NULL) { +- return; +- } +- +- lrm_state = lrm_state_find(fsa_our_uname); +- if (lrm_state == NULL) { +- return; +- } +- +- lrmd_send_node_alert((lrmd_t *) lrm_state->conn, crmd_alert_list, +- node->uname, node->id, node->state); +-} +- +-void +-crmd_alert_fencing_op(stonith_event_t * e) +-{ +- char *desc; +- lrm_state_t *lrm_state; +- +- if (crmd_alert_list == NULL) { +- return; +- } +- +- lrm_state = lrm_state_find(fsa_our_uname); +- if (lrm_state == NULL) { +- return; +- } +- +- desc = crm_strdup_printf("Operation %s of %s by %s for %s@%s: %s (ref=%s)", +- e->action, e->target, +- (e->executioner? 
e->executioner : ""), +- e->client_origin, e->origin, +- pcmk_strerror(e->result), e->id); +- +- lrmd_send_fencing_alert((lrmd_t *) lrm_state->conn, crmd_alert_list, +- e->target, e->operation, desc, e->result); +- free(desc); +-} +- +-void +-crmd_alert_resource_op(const char *node, lrmd_event_data_t * op) +-{ +- lrm_state_t *lrm_state; +- +- if (crmd_alert_list == NULL) { +- return; +- } +- +- lrm_state = lrm_state_find(fsa_our_uname); +- if (lrm_state == NULL) { +- return; +- } +- +- lrmd_send_resource_alert((lrmd_t *) lrm_state->conn, crmd_alert_list, node, +- op); +-} +-- +2.27.0 + + +From 3d0b57406bcde6682623e9d62c8ee95878345eb1 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Tue, 7 Dec 2021 11:25:41 -0600 +Subject: [PATCH 10/11] Feature: controller,tools: improve description for + fencing alerts/traps + +This functionizes creating a description for fencing events, so it can be used +by both the controller for alerts and crm_mon for traps, for consistency. + +Now that we have the full result including exit reason, we can improve the +description, but the format is kept similar to before to minimize the change. + +The alert/trap also includes the legacy return code for the event, but we can't +change that now because lrmd_send_fencing_alert() and the alert/trap +environment variables are public API. 
+--- + daemons/controld/controld_alerts.c | 8 ++----- + include/crm/fencing/internal.h | 1 + + lib/fencing/st_client.c | 38 ++++++++++++++++++++++++++++++ + tools/crm_mon.c | 5 ++-- + 4 files changed, 43 insertions(+), 9 deletions(-) + +diff --git a/daemons/controld/controld_alerts.c b/daemons/controld/controld_alerts.c +index bd92795cf0..2e0a67dba2 100644 +--- a/daemons/controld/controld_alerts.c ++++ b/daemons/controld/controld_alerts.c +@@ -12,6 +12,7 @@ + #include + #include + ++#include + #include + #include + #include +@@ -62,12 +63,7 @@ crmd_alert_fencing_op(stonith_event_t * e) + return; + } + +- desc = crm_strdup_printf("Operation %s of %s by %s for %s@%s: %s (ref=%s)", +- e->action, e->target, +- (e->executioner? e->executioner : ""), +- e->client_origin, e->origin, +- pcmk_strerror(e->result), e->id); +- ++ desc = stonith__event_description(e); + lrmd_send_fencing_alert((lrmd_t *) lrm_state->conn, crmd_alert_list, + e->target, e->operation, desc, e->result); + free(desc); +diff --git a/include/crm/fencing/internal.h b/include/crm/fencing/internal.h +index acc16d05e9..d2b49f831a 100644 +--- a/include/crm/fencing/internal.h ++++ b/include/crm/fencing/internal.h +@@ -195,6 +195,7 @@ const char *stonith__exit_reason(stonith_callback_data_t *data); + int stonith__event_exit_status(stonith_event_t *event); + int stonith__event_execution_status(stonith_event_t *event); + const char *stonith__event_exit_reason(stonith_event_t *event); ++char *stonith__event_description(stonith_event_t *event); + + /*! + * \internal +diff --git a/lib/fencing/st_client.c b/lib/fencing/st_client.c +index 5fec7529e3..b1de912b2a 100644 +--- a/lib/fencing/st_client.c ++++ b/lib/fencing/st_client.c +@@ -2429,6 +2429,44 @@ stonith__event_exit_reason(stonith_event_t *event) + return ((pcmk__action_result_t *) event->opaque)->exit_reason; + } + ++/*! 
++ * \internal ++ * \brief Return a human-friendly description of a fencing event ++ * ++ * \param[in] event Event to describe ++ * ++ * \return Newly allocated string with description of \p event ++ * \note The caller is responsible for freeing the return value. ++ * This function asserts on memory errors and never returns NULL. ++ * \note This currently is useful only for events of type ++ * T_STONITH_NOTIFY_FENCE. ++ */ ++char * ++stonith__event_description(stonith_event_t *event) ++{ ++ const char *reason; ++ const char *status; ++ ++ if (stonith__event_execution_status(event) != PCMK_EXEC_DONE) { ++ status = pcmk_exec_status_str(stonith__event_execution_status(event)); ++ } else if (stonith__event_exit_status(event) != CRM_EX_OK) { ++ status = pcmk_exec_status_str(PCMK_EXEC_ERROR); ++ } else { ++ status = crm_exit_str(CRM_EX_OK); ++ } ++ reason = stonith__event_exit_reason(event); ++ ++ return crm_strdup_printf("Operation %s of %s by %s for %s@%s: %s%s%s%s (ref=%s)", ++ event->action, event->target, ++ (event->executioner? event->executioner : "the cluster"), ++ (event->client_origin? event->client_origin : "a client"), ++ event->origin, status, ++ ((reason == NULL)? "" : " ("), ++ ((reason == NULL)? "" : reason), ++ ((reason == NULL)? 
"" : ")"), ++ event->id); ++} ++ + + // Deprecated functions kept only for backward API compatibility + // LCOV_EXCL_START +diff --git a/tools/crm_mon.c b/tools/crm_mon.c +index a6c459aaf7..e7b4fe2847 100644 +--- a/tools/crm_mon.c ++++ b/tools/crm_mon.c +@@ -2237,9 +2237,8 @@ mon_st_callback_event(stonith_t * st, stonith_event_t * e) + /* disconnect cib as well and have everything reconnect */ + mon_cib_connection_destroy(NULL); + } else if (options.external_agent) { +- char *desc = crm_strdup_printf("Operation %s requested by %s for peer %s: %s (ref=%s)", +- e->operation, e->origin, e->target, pcmk_strerror(e->result), +- e->id); ++ char *desc = stonith__event_description(e); ++ + send_custom_trap(e->target, NULL, e->operation, pcmk_ok, e->result, 0, desc); + free(desc); + } +-- +2.27.0 + + +From 2fe03c2165680c717a1f6106c5150be7d117f1a5 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Fri, 14 Jan 2022 10:45:03 -0600 +Subject: [PATCH 11/11] Low: controller: compare case-sensitively where + appropriate + +--- + daemons/controld/controld_fencing.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/daemons/controld/controld_fencing.c b/daemons/controld/controld_fencing.c +index 0aa9ef083c..15954b2358 100644 +--- a/daemons/controld/controld_fencing.c ++++ b/daemons/controld/controld_fencing.c +@@ -1,5 +1,5 @@ + /* +- * Copyright 2004-2021 the Pacemaker project contributors ++ * Copyright 2004-2022 the Pacemaker project contributors + * + * The version control history for this file may have further details. 
+ * +@@ -524,7 +524,7 @@ handle_fence_notification(stonith_t *st, stonith_event_t *event) + */ + if (!AM_I_DC + && pcmk__str_eq(event->operation, T_STONITH_NOTIFY_FENCE, +- pcmk__str_casei)) { ++ pcmk__str_none)) { + + if (succeeded) { + st_fail_count_reset(event->target); +-- +2.27.0 + diff --git a/SOURCES/016-fencing-crash.patch b/SOURCES/016-fencing-crash.patch new file mode 100644 index 0000000..c514c64 --- /dev/null +++ b/SOURCES/016-fencing-crash.patch @@ -0,0 +1,56 @@ +From e330568504ec379ea42460d21a2e20b1652d9445 Mon Sep 17 00:00:00 2001 +From: Reid Wahl +Date: Fri, 14 Jan 2022 01:35:35 -0800 +Subject: [PATCH] Fix: fencing: Don't set stonith action to pending if fork + fails + +Currently, we set a stonith action to pending if +services_action_async_fork_notify() returns true. However, "true" means +that the svc_action should not be freed. This might be because the +svc_action forked successfully and is pending, or it might be because +the svc_action has already been freed. + +In the case of stonith actions, if we fail to fork, the stonith_action_t +object stored in svc_action->cb_data gets freed by the done callback, +and services_action_async_fork_notify() returns true. If we try to set +the action to pending, it causes a segfault. + +This commit moves the "set to pending" step to the +stonith_action_async_forked() callback. We avoid the segfault and only +set it to pending if it's actually pending. + +A slight difference in ordering was required to achieve this. Now, the +action gets set to pending immediately before being added to the +mainloop, instead of immediately after. 
+ +Signed-off-by: Reid Wahl +--- + lib/fencing/st_actions.c | 5 +++-- + 1 file changed, 3 insertions(+), 2 deletions(-) + +diff --git a/lib/fencing/st_actions.c b/lib/fencing/st_actions.c +index e4e43225cd..306001af69 100644 +--- a/lib/fencing/st_actions.c ++++ b/lib/fencing/st_actions.c +@@ -550,6 +550,9 @@ stonith_action_async_forked(svc_action_t *svc_action) + (action->fork_cb) (svc_action->pid, action->userdata); + } + ++ pcmk__set_result(&(action->result), PCMK_OCF_UNKNOWN, PCMK_EXEC_PENDING, ++ NULL); ++ + crm_trace("Child process %d performing action '%s' successfully forked", + action->pid, action->action); + } +@@ -619,8 +622,6 @@ internal_stonith_action_execute(stonith_action_t * action) + if (services_action_async_fork_notify(svc_action, + &stonith_action_async_done, + &stonith_action_async_forked)) { +- pcmk__set_result(&(action->result), PCMK_OCF_UNKNOWN, +- PCMK_EXEC_PENDING, NULL); + return pcmk_ok; + } + +-- +2.27.0 + diff --git a/SOURCES/017-fencing-reasons.patch b/SOURCES/017-fencing-reasons.patch new file mode 100644 index 0000000..1e100ec --- /dev/null +++ b/SOURCES/017-fencing-reasons.patch @@ -0,0 +1,875 @@ +From 523f62eb235836a01ea039c23ada261a494f7b32 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Wed, 10 Nov 2021 15:22:47 -0600 +Subject: [PATCH 01/11] Feature: libpacemaker: improve result for high-level + fencing API + +Previously, pcmk__fencing_action()'s helpers for asynchronous fencing actions +initialized the result to a generic error, and then overrode that only on +success. + +Now, set a detailed result for early failures, and use the full result when +available from the fencing API. + +A standard return code is still returned to callers at this point. 
+--- + lib/pacemaker/pcmk_fence.c | 31 ++++++++++++++++++------------- + 1 file changed, 18 insertions(+), 13 deletions(-) + +diff --git a/lib/pacemaker/pcmk_fence.c b/lib/pacemaker/pcmk_fence.c +index 7d6acd0de6..125e1b268b 100644 +--- a/lib/pacemaker/pcmk_fence.c ++++ b/lib/pacemaker/pcmk_fence.c +@@ -32,8 +32,8 @@ static struct { + unsigned int timeout; + unsigned int tolerance; + int delay; +- int rc; +-} async_fence_data; ++ pcmk__action_result_t result; ++} async_fence_data = { NULL, }; + + static int + handle_level(stonith_t *st, char *target, int fence_level, +@@ -76,14 +76,13 @@ handle_level(stonith_t *st, char *target, int fence_level, + static void + notify_callback(stonith_t * st, stonith_event_t * e) + { +- if (e->result != pcmk_ok) { +- return; +- } ++ if (pcmk__str_eq(async_fence_data.target, e->target, pcmk__str_casei) ++ && pcmk__str_eq(async_fence_data.action, e->action, pcmk__str_casei)) { + +- if (pcmk__str_eq(async_fence_data.target, e->target, pcmk__str_casei) && +- pcmk__str_eq(async_fence_data.action, e->action, pcmk__str_casei)) { +- +- async_fence_data.rc = e->result; ++ pcmk__set_result(&async_fence_data.result, ++ stonith__event_exit_status(e), ++ stonith__event_execution_status(e), ++ stonith__event_exit_reason(e)); + g_main_loop_quit(mainloop); + } + } +@@ -91,8 +90,9 @@ notify_callback(stonith_t * st, stonith_event_t * e) + static void + fence_callback(stonith_t * stonith, stonith_callback_data_t * data) + { +- async_fence_data.rc = data->rc; +- ++ pcmk__set_result(&async_fence_data.result, stonith__exit_status(data), ++ stonith__execution_status(data), ++ stonith__exit_reason(data)); + g_main_loop_quit(mainloop); + } + +@@ -106,6 +106,8 @@ async_fence_helper(gpointer user_data) + if (rc != pcmk_ok) { + fprintf(stderr, "Could not connect to fencer: %s\n", pcmk_strerror(rc)); + g_main_loop_quit(mainloop); ++ pcmk__set_result(&async_fence_data.result, CRM_EX_ERROR, ++ PCMK_EXEC_NOT_CONNECTED, NULL); + return TRUE; + } + +@@ -121,6 
+123,8 @@ async_fence_helper(gpointer user_data) + + if (call_id < 0) { + g_main_loop_quit(mainloop); ++ pcmk__set_result(&async_fence_data.result, CRM_EX_ERROR, ++ PCMK_EXEC_ERROR, pcmk_strerror(call_id)); + return TRUE; + } + +@@ -146,7 +150,8 @@ pcmk__fence_action(stonith_t *st, const char *target, const char *action, + async_fence_data.timeout = timeout; + async_fence_data.tolerance = tolerance; + async_fence_data.delay = delay; +- async_fence_data.rc = pcmk_err_generic; ++ pcmk__set_result(&async_fence_data.result, CRM_EX_ERROR, PCMK_EXEC_UNKNOWN, ++ NULL); + + trig = mainloop_add_trigger(G_PRIORITY_HIGH, async_fence_helper, NULL); + mainloop_set_trigger(trig); +@@ -156,7 +161,7 @@ pcmk__fence_action(stonith_t *st, const char *target, const char *action, + + free(async_fence_data.name); + +- return pcmk_legacy2rc(async_fence_data.rc); ++ return stonith__result2rc(&async_fence_data.result); + } + + #ifdef BUILD_PUBLIC_LIBPACEMAKER +-- +2.27.0 + + +From 008868fae5d1b0d6d8dc61f7acfb3856801ddd52 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Fri, 10 Dec 2021 15:36:10 -0600 +Subject: [PATCH 02/11] Refactor: libpacemaker: add exit reason to high-level + fencing API + +Nothing uses it as of this commit +--- + include/pacemaker.h | 5 ++++- + include/pcmki/pcmki_fence.h | 5 ++++- + lib/pacemaker/pcmk_fence.c | 10 +++++++--- + tools/stonith_admin.c | 6 +++--- + 4 files changed, 18 insertions(+), 8 deletions(-) + +diff --git a/include/pacemaker.h b/include/pacemaker.h +index a8523c969e..0daa4c5945 100644 +--- a/include/pacemaker.h ++++ b/include/pacemaker.h +@@ -189,12 +189,15 @@ int pcmk_list_nodes(xmlNodePtr *xml, char *node_types); + * again. + * \param[in] delay Apply a fencing delay. Value -1 means disable also any + * static/random fencing delays from pcmk_delay_base/max. 
++ * \param[out] reason If not NULL, where to put descriptive failure reason + * + * \return Standard Pacemaker return code ++ * \note If \p reason is not NULL, the caller is responsible for freeing its ++ * returned value. + */ + int pcmk_fence_action(stonith_t *st, const char *target, const char *action, + const char *name, unsigned int timeout, unsigned int tolerance, +- int delay); ++ int delay, char **reason); + + /*! + * \brief List the fencing operations that have occurred for a specific node. +diff --git a/include/pcmki/pcmki_fence.h b/include/pcmki/pcmki_fence.h +index d4cef68f5c..c3da0361d7 100644 +--- a/include/pcmki/pcmki_fence.h ++++ b/include/pcmki/pcmki_fence.h +@@ -28,12 +28,15 @@ + * again. + * \param[in] delay Apply a fencing delay. Value -1 means disable also any + * static/random fencing delays from pcmk_delay_base/max ++ * \param[out] reason If not NULL, where to put descriptive failure reason + * + * \return Standard Pacemaker return code ++ * \note If \p reason is not NULL, the caller is responsible for freeing its ++ * returned value. + */ + int pcmk__fence_action(stonith_t *st, const char *target, const char *action, + const char *name, unsigned int timeout, unsigned int tolerance, +- int delay); ++ int delay, char **reason); + + /*! + * \brief List the fencing operations that have occurred for a specific node. 
+diff --git a/lib/pacemaker/pcmk_fence.c b/lib/pacemaker/pcmk_fence.c +index 125e1b268b..dbf084fb6b 100644 +--- a/lib/pacemaker/pcmk_fence.c ++++ b/lib/pacemaker/pcmk_fence.c +@@ -139,7 +139,7 @@ async_fence_helper(gpointer user_data) + int + pcmk__fence_action(stonith_t *st, const char *target, const char *action, + const char *name, unsigned int timeout, unsigned int tolerance, +- int delay) ++ int delay, char **reason) + { + crm_trigger_t *trig; + +@@ -161,6 +161,9 @@ pcmk__fence_action(stonith_t *st, const char *target, const char *action, + + free(async_fence_data.name); + ++ if ((reason != NULL) && (async_fence_data.result.exit_reason != NULL)) { ++ *reason = strdup(async_fence_data.result.exit_reason); ++ } + return stonith__result2rc(&async_fence_data.result); + } + +@@ -168,9 +171,10 @@ pcmk__fence_action(stonith_t *st, const char *target, const char *action, + int + pcmk_fence_action(stonith_t *st, const char *target, const char *action, + const char *name, unsigned int timeout, unsigned int tolerance, +- int delay) ++ int delay, char **reason) + { +- return pcmk__fence_action(st, target, action, name, timeout, tolerance, delay); ++ return pcmk__fence_action(st, target, action, name, timeout, tolerance, ++ delay, reason); + } + #endif + +diff --git a/tools/stonith_admin.c b/tools/stonith_admin.c +index 2d48326e1b..fdc7c46d49 100644 +--- a/tools/stonith_admin.c ++++ b/tools/stonith_admin.c +@@ -571,17 +571,17 @@ main(int argc, char **argv) + + case 'B': + rc = pcmk__fence_action(st, target, "reboot", name, options.timeout*1000, +- options.tolerance*1000, options.delay); ++ options.tolerance*1000, options.delay, NULL); + break; + + case 'F': + rc = pcmk__fence_action(st, target, "off", name, options.timeout*1000, +- options.tolerance*1000, options.delay); ++ options.tolerance*1000, options.delay, NULL); + break; + + case 'U': + rc = pcmk__fence_action(st, target, "on", name, options.timeout*1000, +- options.tolerance*1000, options.delay); ++ 
options.tolerance*1000, options.delay, NULL); + break; + + case 'h': +-- +2.27.0 + + +From 7570510f9985ba75ef73fb824f28109e135ace0a Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Fri, 10 Dec 2021 15:40:48 -0600 +Subject: [PATCH 03/11] Refactor: libpacemaker: rename high-level fencing API + +Rename pcmk_fence_action() to pcmk_request_fencing(), and its internal +equivalent pcmk__fence_action() to pcmk__request_fencing(). The change is +backward-compatible because pcmk_fence_action() has not been exposed publicly +yet. + +"Fence action" can be easily confused with libcrmservice actions, liblrmd +actions, libstonithd actions, scheduler actions, and so forth. + +Also, the new name makes it clearer that the caller is requesting that the +cluster perform fencing, and not directly performing fencing. +--- + include/pacemaker.h | 20 ++++++++++---------- + include/pcmki/pcmki_fence.h | 16 ++++++++-------- + lib/pacemaker/pcmk_fence.c | 16 ++++++++-------- + tools/stonith_admin.c | 18 ++++++++++++------ + 4 files changed, 38 insertions(+), 32 deletions(-) + +diff --git a/include/pacemaker.h b/include/pacemaker.h +index 0daa4c5945..e581f975a9 100644 +--- a/include/pacemaker.h ++++ b/include/pacemaker.h +@@ -177,27 +177,27 @@ int pcmk_list_nodes(xmlNodePtr *xml, char *node_types); + #ifdef BUILD_PUBLIC_LIBPACEMAKER + + /*! +- * \brief Perform a STONITH action. ++ * \brief Ask the cluster to perform fencing + * +- * \param[in] st A connection to the STONITH API. +- * \param[in] target The node receiving the action. +- * \param[in] action The action to perform. ++ * \param[in] st A connection to the fencer API ++ * \param[in] target The node that should be fenced ++ * \param[in] action The fencing action (on, off, reboot) to perform + * \param[in] name Who requested the fence action? +- * \param[in] timeout How long to wait for the operation to complete (in ms). 
++ * \param[in] timeout How long to wait for the operation to complete (in ms) + * \param[in] tolerance If a successful action for \p target happened within + * this many ms, return 0 without performing the action +- * again. ++ * again + * \param[in] delay Apply a fencing delay. Value -1 means disable also any +- * static/random fencing delays from pcmk_delay_base/max. ++ * static/random fencing delays from pcmk_delay_base/max + * \param[out] reason If not NULL, where to put descriptive failure reason + * + * \return Standard Pacemaker return code + * \note If \p reason is not NULL, the caller is responsible for freeing its + * returned value. + */ +-int pcmk_fence_action(stonith_t *st, const char *target, const char *action, +- const char *name, unsigned int timeout, unsigned int tolerance, +- int delay, char **reason); ++int pcmk_request_fencing(stonith_t *st, const char *target, const char *action, ++ const char *name, unsigned int timeout, ++ unsigned int tolerance, int delay, char **reason); + + /*! + * \brief List the fencing operations that have occurred for a specific node. +diff --git a/include/pcmki/pcmki_fence.h b/include/pcmki/pcmki_fence.h +index c3da0361d7..e3a7e27264 100644 +--- a/include/pcmki/pcmki_fence.h ++++ b/include/pcmki/pcmki_fence.h +@@ -13,14 +13,14 @@ + # include + + /*! +- * \brief Perform a STONITH action. ++ * \brief Ask the cluster to perform fencing + * +- * \note This is the internal version of pcmk_fence_action(). External users ++ * \note This is the internal version of pcmk_request_fencing(). External users + * of the pacemaker API should use that function instead. + * +- * \param[in] st A connection to the STONITH API. +- * \param[in] target The node receiving the action. +- * \param[in] action The action to perform. 
++ * \param[in] st A connection to the fencer API ++ * \param[in] target The node that should be fenced ++ * \param[in] action The fencing action (on, off, reboot) to perform + * \param[in] name Who requested the fence action? + * \param[in] timeout How long to wait for the operation to complete (in ms). + * \param[in] tolerance If a successful action for \p target happened within +@@ -34,9 +34,9 @@ + * \note If \p reason is not NULL, the caller is responsible for freeing its + * returned value. + */ +-int pcmk__fence_action(stonith_t *st, const char *target, const char *action, +- const char *name, unsigned int timeout, unsigned int tolerance, +- int delay, char **reason); ++int pcmk__request_fencing(stonith_t *st, const char *target, const char *action, ++ const char *name, unsigned int timeout, ++ unsigned int tolerance, int delay, char **reason); + + /*! + * \brief List the fencing operations that have occurred for a specific node. +diff --git a/lib/pacemaker/pcmk_fence.c b/lib/pacemaker/pcmk_fence.c +index dbf084fb6b..1b7feb54b2 100644 +--- a/lib/pacemaker/pcmk_fence.c ++++ b/lib/pacemaker/pcmk_fence.c +@@ -137,9 +137,9 @@ async_fence_helper(gpointer user_data) + } + + int +-pcmk__fence_action(stonith_t *st, const char *target, const char *action, +- const char *name, unsigned int timeout, unsigned int tolerance, +- int delay, char **reason) ++pcmk__request_fencing(stonith_t *st, const char *target, const char *action, ++ const char *name, unsigned int timeout, ++ unsigned int tolerance, int delay, char **reason) + { + crm_trigger_t *trig; + +@@ -169,12 +169,12 @@ pcmk__fence_action(stonith_t *st, const char *target, const char *action, + + #ifdef BUILD_PUBLIC_LIBPACEMAKER + int +-pcmk_fence_action(stonith_t *st, const char *target, const char *action, +- const char *name, unsigned int timeout, unsigned int tolerance, +- int delay, char **reason) ++pcmk_request_fencing(stonith_t *st, const char *target, const char *action, ++ const char *name, unsigned int 
timeout, ++ unsigned int tolerance, int delay, char **reason) + { +- return pcmk__fence_action(st, target, action, name, timeout, tolerance, +- delay, reason); ++ return pcmk__request_fencing(st, target, action, name, timeout, tolerance, ++ delay, reason); + } + #endif + +diff --git a/tools/stonith_admin.c b/tools/stonith_admin.c +index fdc7c46d49..56948b3875 100644 +--- a/tools/stonith_admin.c ++++ b/tools/stonith_admin.c +@@ -570,18 +570,24 @@ main(int argc, char **argv) + break; + + case 'B': +- rc = pcmk__fence_action(st, target, "reboot", name, options.timeout*1000, +- options.tolerance*1000, options.delay, NULL); ++ rc = pcmk__request_fencing(st, target, "reboot", name, ++ options.timeout * 1000, ++ options.tolerance * 1000, ++ options.delay, NULL); + break; + + case 'F': +- rc = pcmk__fence_action(st, target, "off", name, options.timeout*1000, +- options.tolerance*1000, options.delay, NULL); ++ rc = pcmk__request_fencing(st, target, "off", name, ++ options.timeout * 1000, ++ options.tolerance * 1000, ++ options.delay, NULL); + break; + + case 'U': +- rc = pcmk__fence_action(st, target, "on", name, options.timeout*1000, +- options.tolerance*1000, options.delay, NULL); ++ rc = pcmk__request_fencing(st, target, "on", name, ++ options.timeout * 1000, ++ options.tolerance * 1000, ++ options.delay, NULL); + break; + + case 'h': +-- +2.27.0 + + +From 247eb303df934944c0b72b162bb661cee6e0ed8b Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Fri, 10 Dec 2021 15:52:37 -0600 +Subject: [PATCH 04/11] Refactor: tools: drop unnecessary string duplication in + stonith_admin + +--- + tools/stonith_admin.c | 11 ++++------- + 1 file changed, 4 insertions(+), 7 deletions(-) + +diff --git a/tools/stonith_admin.c b/tools/stonith_admin.c +index 56948b3875..c11e302e76 100644 +--- a/tools/stonith_admin.c ++++ b/tools/stonith_admin.c +@@ -360,8 +360,6 @@ main(int argc, char **argv) + + pcmk__cli_init_logging("stonith_admin", args->verbosity); + +- name = strdup(crm_system_name); +- 
+ rc = pcmk__output_new(&out, args->output_ty, args->output_dest, argv); + if (rc != pcmk_rc_ok) { + exit_code = CRM_EX_ERROR; +@@ -496,7 +494,7 @@ main(int argc, char **argv) + if (st == NULL) { + rc = -ENOMEM; + } else if (!no_connect) { +- rc = st->cmds->connect(st, name, NULL); ++ rc = st->cmds->connect(st, crm_system_name, NULL); + } + if (rc < 0) { + out->err(out, "Could not connect to fencer: %s", pcmk_strerror(rc)); +@@ -570,21 +568,21 @@ main(int argc, char **argv) + break; + + case 'B': +- rc = pcmk__request_fencing(st, target, "reboot", name, ++ rc = pcmk__request_fencing(st, target, "reboot", crm_system_name, + options.timeout * 1000, + options.tolerance * 1000, + options.delay, NULL); + break; + + case 'F': +- rc = pcmk__request_fencing(st, target, "off", name, ++ rc = pcmk__request_fencing(st, target, "off", crm_system_name, + options.timeout * 1000, + options.tolerance * 1000, + options.delay, NULL); + break; + + case 'U': +- rc = pcmk__request_fencing(st, target, "on", name, ++ rc = pcmk__request_fencing(st, target, "on", crm_system_name, + options.timeout * 1000, + options.tolerance * 1000, + options.delay, NULL); +@@ -619,7 +617,6 @@ main(int argc, char **argv) + out->finish(out, exit_code, true, NULL); + pcmk__output_free(out); + } +- free(name); + stonith_key_value_freeall(options.params, 1, 1); + + if (st != NULL) { +-- +2.27.0 + + +From a7888bf6868d8d9d9c77f65ae9983cf748bb0548 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Fri, 10 Dec 2021 15:56:34 -0600 +Subject: [PATCH 05/11] Refactor: tools: functionize requesting fencing in + stonith_admin + +... 
to reduce code duplication and improve readability +--- + tools/stonith_admin.c | 27 +++++++++++++++------------ + 1 file changed, 15 insertions(+), 12 deletions(-) + +diff --git a/tools/stonith_admin.c b/tools/stonith_admin.c +index c11e302e76..f738a9c888 100644 +--- a/tools/stonith_admin.c ++++ b/tools/stonith_admin.c +@@ -331,6 +331,18 @@ build_arg_context(pcmk__common_args_t *args, GOptionGroup **group) { + return context; + } + ++// \return Standard Pacemaker return code ++static int ++request_fencing(stonith_t *st, const char *target, const char *command) ++{ ++ int rc = pcmk__request_fencing(st, target, command, crm_system_name, ++ options.timeout * 1000, ++ options.tolerance * 1000, ++ options.delay, NULL); ++ ++ return rc; ++} ++ + int + main(int argc, char **argv) + { +@@ -568,24 +580,15 @@ main(int argc, char **argv) + break; + + case 'B': +- rc = pcmk__request_fencing(st, target, "reboot", crm_system_name, +- options.timeout * 1000, +- options.tolerance * 1000, +- options.delay, NULL); ++ rc = request_fencing(st, target, "reboot"); + break; + + case 'F': +- rc = pcmk__request_fencing(st, target, "off", crm_system_name, +- options.timeout * 1000, +- options.tolerance * 1000, +- options.delay, NULL); ++ rc = request_fencing(st, target, "off"); + break; + + case 'U': +- rc = pcmk__request_fencing(st, target, "on", crm_system_name, +- options.timeout * 1000, +- options.tolerance * 1000, +- options.delay, NULL); ++ rc = request_fencing(st, target, "on"); + break; + + case 'h': +-- +2.27.0 + + +From 2da32df780983ec1197e857eed5eeb5bf1101889 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Fri, 10 Dec 2021 16:05:19 -0600 +Subject: [PATCH 06/11] Feature: tools: display failure reasons for + stonith_admin fencing commands + +Previously, stonith_admin's --fence/--unfence/--reboot options did not output +any error message on failure. Now, they do, including the exit reason, if +available. 
+--- + tools/stonith_admin.c | 30 +++++++++++++++++++++++++----- + 1 file changed, 25 insertions(+), 5 deletions(-) + +diff --git a/tools/stonith_admin.c b/tools/stonith_admin.c +index f738a9c888..5590faf11e 100644 +--- a/tools/stonith_admin.c ++++ b/tools/stonith_admin.c +@@ -333,13 +333,33 @@ build_arg_context(pcmk__common_args_t *args, GOptionGroup **group) { + + // \return Standard Pacemaker return code + static int +-request_fencing(stonith_t *st, const char *target, const char *command) ++request_fencing(stonith_t *st, const char *target, const char *command, ++ GError **error) + { ++ char *reason = NULL; + int rc = pcmk__request_fencing(st, target, command, crm_system_name, + options.timeout * 1000, + options.tolerance * 1000, +- options.delay, NULL); ++ options.delay, &reason); + ++ if (rc != pcmk_rc_ok) { ++ const char *rc_str = pcmk_rc_str(rc); ++ ++ // If reason is identical to return code string, don't display it twice ++ if (pcmk__str_eq(rc_str, reason, pcmk__str_none)) { ++ free(reason); ++ reason = NULL; ++ } ++ ++ g_set_error(error, PCMK__RC_ERROR, rc, ++ "Couldn't %sfence %s: %s%s%s%s", ++ ((strcmp(command, "on") == 0)? "un" : ""), ++ target, pcmk_rc_str(rc), ++ ((reason == NULL)? "" : " ("), ++ ((reason == NULL)? "" : reason), ++ ((reason == NULL)? 
"" : ")")); ++ } ++ free(reason); + return rc; + } + +@@ -580,15 +600,15 @@ main(int argc, char **argv) + break; + + case 'B': +- rc = request_fencing(st, target, "reboot"); ++ rc = request_fencing(st, target, "reboot", &error); + break; + + case 'F': +- rc = request_fencing(st, target, "off"); ++ rc = request_fencing(st, target, "off", &error); + break; + + case 'U': +- rc = request_fencing(st, target, "on"); ++ rc = request_fencing(st, target, "on", &error); + break; + + case 'h': +-- +2.27.0 + + +From 2d99eba4c326d3b13dbbe446971ea5febd5d05be Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Fri, 10 Dec 2021 16:08:49 -0600 +Subject: [PATCH 07/11] Feature: libpacemaker: return exit reason for fencer + connection failures + +... instead of outputting to stderr directly, so that the caller (i.e. +stonith_admin) can output the error in the correct output format. +--- + lib/pacemaker/pcmk_fence.c | 3 +-- + 1 file changed, 1 insertion(+), 2 deletions(-) + +diff --git a/lib/pacemaker/pcmk_fence.c b/lib/pacemaker/pcmk_fence.c +index 1b7feb54b2..d17b07cda2 100644 +--- a/lib/pacemaker/pcmk_fence.c ++++ b/lib/pacemaker/pcmk_fence.c +@@ -104,10 +104,9 @@ async_fence_helper(gpointer user_data) + int rc = stonith_api_connect_retry(st, async_fence_data.name, 10); + + if (rc != pcmk_ok) { +- fprintf(stderr, "Could not connect to fencer: %s\n", pcmk_strerror(rc)); + g_main_loop_quit(mainloop); + pcmk__set_result(&async_fence_data.result, CRM_EX_ERROR, +- PCMK_EXEC_NOT_CONNECTED, NULL); ++ PCMK_EXEC_NOT_CONNECTED, pcmk_strerror(rc)); + return TRUE; + } + +-- +2.27.0 + + +From 4480ef0602f47450bdddfbde360a6a8327710927 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Mon, 17 Jan 2022 09:39:39 -0600 +Subject: [PATCH 08/11] Low: libpacemaker: compare fence action names + case-sensitively + +--- + lib/pacemaker/pcmk_fence.c | 6 +++--- + 1 file changed, 3 insertions(+), 3 deletions(-) + +diff --git a/lib/pacemaker/pcmk_fence.c b/lib/pacemaker/pcmk_fence.c +index d17b07cda2..2a8f50a555 
100644 +--- a/lib/pacemaker/pcmk_fence.c ++++ b/lib/pacemaker/pcmk_fence.c +@@ -1,5 +1,5 @@ + /* +- * Copyright 2009-2021 the Pacemaker project contributors ++ * Copyright 2009-2022 the Pacemaker project contributors + * + * The version control history for this file may have further details. + * +@@ -77,7 +77,7 @@ static void + notify_callback(stonith_t * st, stonith_event_t * e) + { + if (pcmk__str_eq(async_fence_data.target, e->target, pcmk__str_casei) +- && pcmk__str_eq(async_fence_data.action, e->action, pcmk__str_casei)) { ++ && pcmk__str_eq(async_fence_data.action, e->action, pcmk__str_none)) { + + pcmk__set_result(&async_fence_data.result, + stonith__event_exit_status(e), +@@ -549,7 +549,7 @@ pcmk__reduce_fence_history(stonith_history_t *history) + if ((hp->state == st_done) || (hp->state == st_failed)) { + /* action not in progress */ + if (pcmk__str_eq(hp->target, np->target, pcmk__str_casei) && +- pcmk__str_eq(hp->action, np->action, pcmk__str_casei) && ++ pcmk__str_eq(hp->action, np->action, pcmk__str_none) && + (hp->state == np->state) && + ((hp->state == st_done) || + pcmk__str_eq(hp->delegate, np->delegate, pcmk__str_casei))) { +-- +2.27.0 + + +From fe4c65a3b9e715c2b535709f989f2369d3637b78 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Mon, 17 Jan 2022 09:45:24 -0600 +Subject: [PATCH 09/11] Refactor: libpacemaker: avoid unnecessary string + duplication + +... 
and don't leave any dynamic memory hanging around +--- + lib/pacemaker/pcmk_fence.c | 11 ++++++++--- + 1 file changed, 8 insertions(+), 3 deletions(-) + +diff --git a/lib/pacemaker/pcmk_fence.c b/lib/pacemaker/pcmk_fence.c +index 2a8f50a555..260fa5ab8e 100644 +--- a/lib/pacemaker/pcmk_fence.c ++++ b/lib/pacemaker/pcmk_fence.c +@@ -141,6 +141,7 @@ pcmk__request_fencing(stonith_t *st, const char *target, const char *action, + unsigned int tolerance, int delay, char **reason) + { + crm_trigger_t *trig; ++ int rc = pcmk_rc_ok; + + async_fence_data.st = st; + async_fence_data.name = strdup(name); +@@ -160,10 +161,14 @@ pcmk__request_fencing(stonith_t *st, const char *target, const char *action, + + free(async_fence_data.name); + +- if ((reason != NULL) && (async_fence_data.result.exit_reason != NULL)) { +- *reason = strdup(async_fence_data.result.exit_reason); ++ if (reason != NULL) { ++ // Give the caller ownership of the exit reason ++ *reason = async_fence_data.result.exit_reason; ++ async_fence_data.result.exit_reason = NULL; + } +- return stonith__result2rc(&async_fence_data.result); ++ rc = stonith__result2rc(&async_fence_data.result); ++ pcmk__reset_result(&async_fence_data.result); ++ return rc; + } + + #ifdef BUILD_PUBLIC_LIBPACEMAKER +-- +2.27.0 + + +From 7b7af07796f05a1adabdac655582be2e17106f81 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Mon, 17 Jan 2022 10:07:10 -0600 +Subject: [PATCH 10/11] Doc: libpacemaker: improve pcmk__request_fencing() + doxygen block + +--- + include/pacemaker.h | 6 ++++-- + include/pcmki/pcmki_fence.h | 15 +++++++++------ + 2 files changed, 13 insertions(+), 8 deletions(-) + +diff --git a/include/pacemaker.h b/include/pacemaker.h +index e581f975a9..266a844892 100644 +--- a/include/pacemaker.h ++++ b/include/pacemaker.h +@@ -187,8 +187,10 @@ int pcmk_list_nodes(xmlNodePtr *xml, char *node_types); + * \param[in] tolerance If a successful action for \p target happened within + * this many ms, return 0 without performing the 
action + * again +- * \param[in] delay Apply a fencing delay. Value -1 means disable also any +- * static/random fencing delays from pcmk_delay_base/max ++ * \param[in] delay Apply this delay (in milliseconds) before initiating the ++ * fencing action (a value of -1 applies no delay and also ++ * disables any fencing delay from pcmk_delay_base and ++ * pcmk_delay_max) + * \param[out] reason If not NULL, where to put descriptive failure reason + * + * \return Standard Pacemaker return code +diff --git a/include/pcmki/pcmki_fence.h b/include/pcmki/pcmki_fence.h +index e3a7e27264..4a2fe3c481 100644 +--- a/include/pcmki/pcmki_fence.h ++++ b/include/pcmki/pcmki_fence.h +@@ -1,5 +1,5 @@ + /* +- * Copyright 2019-2021 the Pacemaker project contributors ++ * Copyright 2019-2022 the Pacemaker project contributors + * + * The version control history for this file may have further details. + * +@@ -22,17 +22,20 @@ + * \param[in] target The node that should be fenced + * \param[in] action The fencing action (on, off, reboot) to perform + * \param[in] name Who requested the fence action? +- * \param[in] timeout How long to wait for the operation to complete (in ms). ++ * \param[in] timeout How long to wait for the operation to complete (in ms) + * \param[in] tolerance If a successful action for \p target happened within +- * this many ms, return 0 without performing the action +- * again. +- * \param[in] delay Apply a fencing delay. 
Value -1 means disable also any +- * static/random fencing delays from pcmk_delay_base/max ++ * this many milliseconds, return success without ++ * performing the action again ++ * \param[in] delay Apply this delay (in milliseconds) before initiating the ++ * fencing action (a value of -1 applies no delay and also ++ * disables any fencing delay from pcmk_delay_base and ++ * pcmk_delay_max) + * \param[out] reason If not NULL, where to put descriptive failure reason + * + * \return Standard Pacemaker return code + * \note If \p reason is not NULL, the caller is responsible for freeing its + * returned value. ++ * \todo delay is eventually used with g_timeout_add() and should be guint + */ + int pcmk__request_fencing(stonith_t *st, const char *target, const char *action, + const char *name, unsigned int timeout, +-- +2.27.0 + + +From 61fb7271712e1246eb6d9472dc1afc7cd10e0a79 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Mon, 17 Jan 2022 10:18:02 -0600 +Subject: [PATCH 11/11] Fix: tools: get stonith_admin -T option working again + +Regression introduced in 2.0.3 by 3910b6fec + +This reverts commit 247eb303df934944c0b72b162bb661cee6e0ed8b +("Refactor: tools: drop unnecessary string duplication in stonith_admin") +and fixes a regression introduced when stonith_admin was converted to use +GOption. + +The -T option is intended to override the client name passed to the fencer API, +but the client name was set to the default (crm_system_name) after option +processing had already been done, so any value for -T was overwritten by the +default, and its memory was leaked. + +This commit sets the default only if -T was not used. 
+--- + tools/stonith_admin.c | 15 ++++++++++----- + 1 file changed, 10 insertions(+), 5 deletions(-) + +diff --git a/tools/stonith_admin.c b/tools/stonith_admin.c +index 5590faf11e..54774b6fee 100644 +--- a/tools/stonith_admin.c ++++ b/tools/stonith_admin.c +@@ -337,10 +337,10 @@ request_fencing(stonith_t *st, const char *target, const char *command, + GError **error) + { + char *reason = NULL; +- int rc = pcmk__request_fencing(st, target, command, crm_system_name, +- options.timeout * 1000, +- options.tolerance * 1000, +- options.delay, &reason); ++ int rc = pcmk__request_fencing(st, target, command, name, ++ options.timeout * 1000, ++ options.tolerance * 1000, ++ options.delay, &reason); + + if (rc != pcmk_rc_ok) { + const char *rc_str = pcmk_rc_str(rc); +@@ -392,6 +392,10 @@ main(int argc, char **argv) + + pcmk__cli_init_logging("stonith_admin", args->verbosity); + ++ if (name == NULL) { ++ name = strdup(crm_system_name); ++ } ++ + rc = pcmk__output_new(&out, args->output_ty, args->output_dest, argv); + if (rc != pcmk_rc_ok) { + exit_code = CRM_EX_ERROR; +@@ -526,7 +530,7 @@ main(int argc, char **argv) + if (st == NULL) { + rc = -ENOMEM; + } else if (!no_connect) { +- rc = st->cmds->connect(st, crm_system_name, NULL); ++ rc = st->cmds->connect(st, name, NULL); + } + if (rc < 0) { + out->err(out, "Could not connect to fencer: %s", pcmk_strerror(rc)); +@@ -640,6 +644,7 @@ main(int argc, char **argv) + out->finish(out, exit_code, true, NULL); + pcmk__output_free(out); + } ++ free(name); + stonith_key_value_freeall(options.params, 1, 1); + + if (st != NULL) { +-- +2.27.0 + diff --git a/SOURCES/018-failure-messages.patch b/SOURCES/018-failure-messages.patch new file mode 100644 index 0000000..3a2f249 --- /dev/null +++ b/SOURCES/018-failure-messages.patch @@ -0,0 +1,796 @@ +From 08c3420f2c857e7b27cd960f355d787af534da7d Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Tue, 18 Jan 2022 16:04:49 -0600 +Subject: [PATCH 01/12] Log: libcrmcommon: improve description for 
"not + connected" status + +PCMK_EXEC_NOT_CONNECTED was originally added to represent "No executor +connection", but it can also now mean no fencer connection, so change it to +"Internal communication failure" which is probably less mysterious to end users +anyway (especially since it should be accompanied by a more descriptive exit +reason). +--- + include/crm/common/results.h | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/include/crm/common/results.h b/include/crm/common/results.h +index 873faf5c43..3d322a7ce6 100644 +--- a/include/crm/common/results.h ++++ b/include/crm/common/results.h +@@ -349,7 +349,7 @@ pcmk_exec_status_str(enum pcmk_exec_status status) + case PCMK_EXEC_ERROR_HARD: return "Hard error"; + case PCMK_EXEC_ERROR_FATAL: return "Fatal error"; + case PCMK_EXEC_NOT_INSTALLED: return "Not installed"; +- case PCMK_EXEC_NOT_CONNECTED: return "No executor connection"; ++ case PCMK_EXEC_NOT_CONNECTED: return "Internal communication failure"; + case PCMK_EXEC_INVALID: return "Cannot execute now"; + case PCMK_EXEC_NO_FENCE_DEVICE: return "No fence device"; + case PCMK_EXEC_NO_SECRETS: return "CIB secrets unavailable"; +-- +2.27.0 + + +From 7c345cf8cf0cb054f5634206880df035bfef7311 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Mon, 20 Dec 2021 15:12:36 -0600 +Subject: [PATCH 02/12] Refactor: libcrmcommon: drop unnecessary system error + redefinitions + +portability.h defines some system error codes that might not be present on +non-Linux systems. + +This was a bad idea, since there's no way to ensure the defined values don't +conflict with existing system codes. However, we use a number of them, so it's +probably best to keep them, at least until we can make a backward compatibility +break. + +However, we don't use EUNATCH, ENOSR, or ENOSTR, so we can delete those. 
+--- + include/portability.h | 12 ------------ + lib/common/results.c | 9 ++++++--- + 2 files changed, 6 insertions(+), 15 deletions(-) + +diff --git a/include/portability.h b/include/portability.h +index 9a60c583a7..ee065a376d 100644 +--- a/include/portability.h ++++ b/include/portability.h +@@ -131,10 +131,6 @@ typedef union + # define EREMOTEIO 193 + # endif + +-# ifndef EUNATCH +-# define EUNATCH 194 +-# endif +- + # ifndef ENOKEY + # define ENOKEY 195 + # endif +@@ -147,14 +143,6 @@ typedef union + # define ETIME 197 + # endif + +-# ifndef ENOSR +-# define ENOSR 198 +-# endif +- +-# ifndef ENOSTR +-# define ENOSTR 199 +-# endif +- + # ifndef EKEYREJECTED + # define EKEYREJECTED 200 + # endif +diff --git a/lib/common/results.c b/lib/common/results.c +index 6d120694cd..96cd4e5659 100644 +--- a/lib/common/results.c ++++ b/lib/common/results.c +@@ -118,9 +118,6 @@ pcmk_strerror(int rc) + case EREMOTEIO: + return "Remote I/O error"; + /* coverity[dead_error_condition] False positive on non-Linux */ +- case EUNATCH: +- return "Protocol driver not attached"; +- /* coverity[dead_error_condition] False positive on non-Linux */ + case ENOKEY: + return "Required key not available"; + } +@@ -342,8 +339,12 @@ pcmk_rc_name(int rc) + case ENOMSG: return "ENOMSG"; + case ENOPROTOOPT: return "ENOPROTOOPT"; + case ENOSPC: return "ENOSPC"; ++#ifdef ENOSR + case ENOSR: return "ENOSR"; ++#endif ++#ifdef ENOSTR + case ENOSTR: return "ENOSTR"; ++#endif + case ENOSYS: return "ENOSYS"; + case ENOTBLK: return "ENOTBLK"; + case ENOTCONN: return "ENOTCONN"; +@@ -376,7 +377,9 @@ pcmk_rc_name(int rc) + case ETIME: return "ETIME"; + case ETIMEDOUT: return "ETIMEDOUT"; + case ETXTBSY: return "ETXTBSY"; ++#ifdef EUNATCH + case EUNATCH: return "EUNATCH"; ++#endif + case EUSERS: return "EUSERS"; + /* case EWOULDBLOCK: return "EWOULDBLOCK"; */ + case EXDEV: return "EXDEV"; +-- +2.27.0 + + +From eac8d1ca51eac3f437e18584f7e013d976ecee2c Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Mon, 20 
Dec 2021 15:33:12 -0600 +Subject: [PATCH 03/12] Log: libcrmcommon: improve handling of portability.h + error codes + +portability.h defines some system error codes that might not be present on +non-Linux systems. + +Define a constant for each one (for example, PCMK__ECOMM for ECOMM) when +the system doesn't have the value, so we can detect that when relevant. + +Also, make sure pcmk_rc_name() and pcmk_rc_str() handle all of these values. +--- + include/portability.h | 8 ++++++++ + lib/common/results.c | 32 ++++++++++++++++++++++++++++++-- + 2 files changed, 38 insertions(+), 2 deletions(-) + +diff --git a/include/portability.h b/include/portability.h +index ee065a376d..5d5fbf21cb 100644 +--- a/include/portability.h ++++ b/include/portability.h +@@ -116,34 +116,42 @@ typedef union + # include + + # ifndef ENOTUNIQ ++# define PCMK__ENOTUNIQ + # define ENOTUNIQ 190 + # endif + + # ifndef ECOMM ++# define PCMK__ECOMM + # define ECOMM 191 + # endif + + # ifndef ELIBACC ++# define PCMK__ELIBACC + # define ELIBACC 192 + # endif + + # ifndef EREMOTEIO ++# define PCMK__EREMOTEIO + # define EREMOTEIO 193 + # endif + + # ifndef ENOKEY ++# define PCMK__ENOKEY + # define ENOKEY 195 + # endif + + # ifndef ENODATA ++# define PCMK__ENODATA + # define ENODATA 196 + # endif + + # ifndef ETIME ++# define PCMK__ETIME + # define ETIME 197 + # endif + + # ifndef EKEYREJECTED ++# define PCMK__EKEYREJECTED + # define EKEYREJECTED 200 + # endif + +diff --git a/lib/common/results.c b/lib/common/results.c +index 96cd4e5659..bcf289d0d6 100644 +--- a/lib/common/results.c ++++ b/lib/common/results.c +@@ -395,9 +395,9 @@ pcmk_rc_name(int rc) + #ifdef EISNAM // Not available on OS X, Illumos, Solaris + case EISNAM: return "EISNAM"; + case EKEYEXPIRED: return "EKEYEXPIRED"; +- case EKEYREJECTED: return "EKEYREJECTED"; + case EKEYREVOKED: return "EKEYREVOKED"; + #endif ++ case EKEYREJECTED: return "EKEYREJECTED"; + case EL2HLT: return "EL2HLT"; + case EL2NSYNC: return "EL2NSYNC"; + case EL3HLT:
return "EL3HLT"; +@@ -443,7 +443,35 @@ pcmk_rc_str(int rc) + if (rc < 0) { + return "Unknown error"; + } +- return strerror(rc); ++ ++ // Handle values that could be defined by system or by portability.h ++ switch (rc) { ++#ifdef PCMK__ENOTUNIQ ++ case ENOTUNIQ: return "Name not unique on network"; ++#endif ++#ifdef PCMK__ECOMM ++ case ECOMM: return "Communication error on send"; ++#endif ++#ifdef PCMK__ELIBACC ++ case ELIBACC: return "Can not access a needed shared library"; ++#endif ++#ifdef PCMK__EREMOTEIO ++ case EREMOTEIO: return "Remote I/O error"; ++#endif ++#ifdef PCMK__ENOKEY ++ case ENOKEY: return "Required key not available"; ++#endif ++#ifdef PCMK__ENODATA ++ case ENODATA: return "No data available"; ++#endif ++#ifdef PCMK__ETIME ++ case ETIME: return "Timer expired"; ++#endif ++#ifdef PCMK__EKEYREJECTED ++ case EKEYREJECTED: return "Key was rejected by service"; ++#endif ++ default: return strerror(rc); ++ } + } + + // This returns negative values for errors +-- +2.27.0 + + +From 32a38ac6374f85c43e7f4051f5e519822cc481e6 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Mon, 20 Dec 2021 15:39:19 -0600 +Subject: [PATCH 04/12] Log: libcrmcommon: redefine pcmk_strerror() in terms of + pcmk_rc_str() + +... to reduce code duplication. This causes minor differences in the string for +a few values. +--- + lib/common/results.c | 67 +------------------------------------------- + 1 file changed, 1 insertion(+), 66 deletions(-) + +diff --git a/lib/common/results.c b/lib/common/results.c +index bcf289d0d6..b2c6e8d553 100644 +--- a/lib/common/results.c ++++ b/lib/common/results.c +@@ -57,72 +57,7 @@ pcmk_errorname(int rc) + const char * + pcmk_strerror(int rc) + { +- if (rc == 0) { +- return "OK"; +- } +- +- rc = abs(rc); +- +- // Of course rc > 0 ... 
unless someone passed INT_MIN as rc +- if ((rc > 0) && (rc < PCMK_ERROR_OFFSET)) { +- return strerror(rc); +- } +- +- switch (rc) { +- case pcmk_err_generic: +- return "Generic Pacemaker error"; +- case pcmk_err_no_quorum: +- return "Operation requires quorum"; +- case pcmk_err_schema_validation: +- return "Update does not conform to the configured schema"; +- case pcmk_err_transform_failed: +- return "Schema transform failed"; +- case pcmk_err_old_data: +- return "Update was older than existing configuration"; +- case pcmk_err_diff_failed: +- return "Application of an update diff failed"; +- case pcmk_err_diff_resync: +- return "Application of an update diff failed, requesting a full refresh"; +- case pcmk_err_cib_modified: +- return "The on-disk configuration was manually modified"; +- case pcmk_err_cib_backup: +- return "Could not archive the previous configuration"; +- case pcmk_err_cib_save: +- return "Could not save the new configuration to disk"; +- case pcmk_err_cib_corrupt: +- return "Could not parse on-disk configuration"; +- case pcmk_err_multiple: +- return "Resource active on multiple nodes"; +- case pcmk_err_node_unknown: +- return "Node not found"; +- case pcmk_err_already: +- return "Situation already as requested"; +- case pcmk_err_bad_nvpair: +- return "Bad name/value pair given"; +- case pcmk_err_schema_unchanged: +- return "Schema is already the latest available"; +- case pcmk_err_unknown_format: +- return "Unknown output format"; +- +- /* The following cases will only be hit on systems for which they are non-standard */ +- /* coverity[dead_error_condition] False positive on non-Linux */ +- case ENOTUNIQ: +- return "Name not unique on network"; +- /* coverity[dead_error_condition] False positive on non-Linux */ +- case ECOMM: +- return "Communication error on send"; +- /* coverity[dead_error_condition] False positive on non-Linux */ +- case ELIBACC: +- return "Can not access a needed shared library"; +- /* coverity[dead_error_condition] False 
positive on non-Linux */ +- case EREMOTEIO: +- return "Remote I/O error"; +- /* coverity[dead_error_condition] False positive on non-Linux */ +- case ENOKEY: +- return "Required key not available"; +- } +- crm_err("Unknown error code: %d", rc); +- return "Unknown error"; ++ return pcmk_rc_str(pcmk_legacy2rc(rc)); + } + + // Standard Pacemaker API return codes +-- +2.27.0 + + +From 7c331d7e2275ffebbfd5e2f6432a6137a66ee5db Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Mon, 20 Dec 2021 15:41:24 -0600 +Subject: [PATCH 05/12] Log: libcrmcommon: don't say "Unknown error" + +... which is unhelpful and annoying to users +--- + lib/common/results.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/lib/common/results.c b/lib/common/results.c +index b2c6e8d553..5ffac76549 100644 +--- a/lib/common/results.c ++++ b/lib/common/results.c +@@ -376,7 +376,7 @@ pcmk_rc_str(int rc) + return pcmk__rcs[pcmk_rc_error - rc].desc; + } + if (rc < 0) { +- return "Unknown error"; ++ return "Error"; + } + + // Handle values that could be defined by system or by portability.h +@@ -768,7 +768,7 @@ bz2_strerror(int rc) + case BZ_OUTBUFF_FULL: + return "output data will not fit into the buffer provided"; + } +- return "Unknown error"; ++ return "Data compression error"; + } + + crm_exit_t +-- +2.27.0 + + +From 26883b4edda7d81bfcb79bd7b33bb3210beff110 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Mon, 20 Dec 2021 16:01:39 -0600 +Subject: [PATCH 06/12] Log: fencing: don't warn if cluster has no watchdog + device + +--- + lib/fencing/st_client.c | 7 ++++++- + 1 file changed, 6 insertions(+), 1 deletion(-) + +diff --git a/lib/fencing/st_client.c b/lib/fencing/st_client.c +index b1de912b2a..a0f3119f3b 100644 +--- a/lib/fencing/st_client.c ++++ b/lib/fencing/st_client.c +@@ -187,7 +187,12 @@ stonith__watchdog_fencing_enabled_for_node_api(stonith_t *st, const char *node) + * we drop in here - so as not to make remote nodes + * panic on that answer + */ +- 
crm_warn("watchdog-fencing-query failed"); ++ if (rc == -ENODEV) { ++ crm_notice("Cluster does not have watchdog fencing device"); ++ } else { ++ crm_warn("Could not check for watchdog fencing device: %s", ++ pcmk_strerror(rc)); ++ } + } else if (list[0] == '\0') { + rv = TRUE; + } else { +-- +2.27.0 + + +From 72b3c42232deaca64ffba9582598c59331203761 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Mon, 20 Dec 2021 16:22:49 -0600 +Subject: [PATCH 07/12] Test: libcrmcommon: update pcmk_rc_str() unit test for + recent change + +--- + lib/common/tests/results/pcmk__results_test.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/lib/common/tests/results/pcmk__results_test.c b/lib/common/tests/results/pcmk__results_test.c +index 57a520c501..e08d4b6261 100644 +--- a/lib/common/tests/results/pcmk__results_test.c ++++ b/lib/common/tests/results/pcmk__results_test.c +@@ -30,7 +30,7 @@ static void + test_for_pcmk_rc_str(void **state) { + assert_string_equal(pcmk_rc_str(pcmk_rc_error-1), "Unknown output format"); + assert_string_equal(pcmk_rc_str(pcmk_rc_ok), "OK"); +- assert_string_equal(pcmk_rc_str(-1), "Unknown error"); ++ assert_string_equal(pcmk_rc_str(-1), "Error"); + } + + static void +-- +2.27.0 + + +From c1ad3d6640f695321a83183c95fae2f105adc429 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Tue, 21 Dec 2021 10:20:38 -0600 +Subject: [PATCH 08/12] Test: cts-lab: update expected patterns for recent + changes + +--- + cts/lab/CTStests.py | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/cts/lab/CTStests.py b/cts/lab/CTStests.py +index 62c832eb45..f4be998cfb 100644 +--- a/cts/lab/CTStests.py ++++ b/cts/lab/CTStests.py +@@ -3055,7 +3055,7 @@ class RemoteStonithd(RemoteDriver): + r"pacemaker-controld.*:\s+error.*: Operation remote-.*_monitor", + r"pacemaker-controld.*:\s+error.*: Result of monitor operation for remote-.*", + r"schedulerd.*:\s+Recover remote-.*\s*\(.*\)", +- r"error: Result of monitor operation for .* on 
remote-.*: No executor connection", ++ r"error: Result of monitor operation for .* on remote-.*: Internal communication failure", + ] + + ignore_pats.extend(RemoteDriver.errorstoignore(self)) +-- +2.27.0 + + +From f272e2f526633c707e894b39c7c7bce3c14de898 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Tue, 21 Dec 2021 15:40:49 -0600 +Subject: [PATCH 09/12] Log: controller,libpacemaker: make history XML creation + less chatty + +Other messages with the same info will already be logged at higher severity +--- + daemons/controld/controld_execd.c | 3 +-- + daemons/controld/controld_te_actions.c | 7 ++----- + include/pcmki/pcmki_sched_utils.h | 3 +-- + lib/pacemaker/pcmk_injections.c | 3 +-- + lib/pacemaker/pcmk_sched_actions.c | 12 +++++------- + 5 files changed, 10 insertions(+), 18 deletions(-) + +diff --git a/daemons/controld/controld_execd.c b/daemons/controld/controld_execd.c +index 15784e7687..52157fa5d4 100644 +--- a/daemons/controld/controld_execd.c ++++ b/daemons/controld/controld_execd.c +@@ -693,9 +693,8 @@ build_operation_update(xmlNode * parent, lrmd_rsc_info_t * rsc, lrmd_event_data_ + caller_version = CRM_FEATURE_SET; + } + +- crm_trace("Building %s operation update with originator version: %s", op->rsc_id, caller_version); + xml_op = pcmk__create_history_xml(parent, op, caller_version, target_rc, +- fsa_our_uname, src, LOG_DEBUG); ++ fsa_our_uname, src); + if (xml_op == NULL) { + return TRUE; + } +diff --git a/daemons/controld/controld_te_actions.c b/daemons/controld/controld_te_actions.c +index 63b7c72359..b0bcb8b2e4 100644 +--- a/daemons/controld/controld_te_actions.c ++++ b/daemons/controld/controld_te_actions.c +@@ -181,7 +181,6 @@ controld_record_action_timeout(crm_action_t *action) + lrmd_event_data_t *op = NULL; + xmlNode *state = NULL; + xmlNode *rsc = NULL; +- xmlNode *xml_op = NULL; + xmlNode *action_rsc = NULL; + + int rc = pcmk_ok; +@@ -245,12 +244,10 @@ controld_record_action_timeout(crm_action_t *action) + op->user_data = 
pcmk__transition_key(transition_graph->id, action->id, + target_rc, te_uuid); + +- xml_op = pcmk__create_history_xml(rsc, op, CRM_FEATURE_SET, target_rc, +- target, __func__, LOG_INFO); ++ pcmk__create_history_xml(rsc, op, CRM_FEATURE_SET, target_rc, target, ++ __func__); + lrmd_free_event(op); + +- crm_log_xml_trace(xml_op, "Action timeout"); +- + rc = fsa_cib_conn->cmds->update(fsa_cib_conn, XML_CIB_TAG_STATUS, state, call_options); + fsa_register_cib_callback(rc, FALSE, NULL, cib_action_updated); + free_xml(state); +diff --git a/include/pcmki/pcmki_sched_utils.h b/include/pcmki/pcmki_sched_utils.h +index 68d60fc7db..144424a609 100644 +--- a/include/pcmki/pcmki_sched_utils.h ++++ b/include/pcmki/pcmki_sched_utils.h +@@ -52,8 +52,7 @@ extern void process_utilization(pe_resource_t * rsc, pe_node_t ** prefer, pe_wor + + xmlNode *pcmk__create_history_xml(xmlNode *parent, lrmd_event_data_t *event, + const char *caller_version, int target_rc, +- const char *node, const char *origin, +- int level); ++ const char *node, const char *origin); + + # define LOAD_STOPPED "load_stopped" + +diff --git a/lib/pacemaker/pcmk_sched_transition.c b/lib/pacemaker/pcmk_sched_transition.c +index 678c3f5dd2..1aa90a5a0b 100644 +--- a/lib/pacemaker/pcmk_sched_transition.c ++++ b/lib/pacemaker/pcmk_sched_transition.c +@@ -201,8 +201,7 @@ inject_op(xmlNode * cib_resource, lrmd_event_data_t * op, int target_rc) + inject_op(xmlNode * cib_resource, lrmd_event_data_t * op, int target_rc) + { + return pcmk__create_history_xml(cib_resource, op, CRM_FEATURE_SET, +- target_rc, NULL, crm_system_name, +- LOG_TRACE); ++ target_rc, NULL, crm_system_name); + } + + static xmlNode * +diff --git a/lib/pacemaker/pcmk_sched_actions.c b/lib/pacemaker/pcmk_sched_actions.c +index f8200b0efc..4f63d3374d 100644 +--- a/lib/pacemaker/pcmk_sched_utils.c ++++ b/lib/pacemaker/pcmk_sched_utils.c +@@ -892,14 +892,13 @@ add_op_digest_to_xml(lrmd_event_data_t *op, xmlNode *update) + * \param[in] target_rc Expected result 
of operation + * \param[in] node Name of node on which operation was performed + * \param[in] origin Arbitrary description of update source +- * \param[in] level A log message will be logged at this level + * + * \return Newly created XML node for history update + */ + xmlNode * + pcmk__create_history_xml(xmlNode *parent, lrmd_event_data_t *op, + const char *caller_version, int target_rc, +- const char *node, const char *origin, int level) ++ const char *node, const char *origin) + { + char *key = NULL; + char *magic = NULL; +@@ -912,11 +911,10 @@ pcmk__create_history_xml(xmlNode *parent, lrmd_event_data_t *op, + const char *task = NULL; + + CRM_CHECK(op != NULL, return NULL); +- do_crm_log(level, "%s: Updating resource %s after %s op %s (interval=%u)", +- origin, op->rsc_id, op->op_type, +- pcmk_exec_status_str(op->op_status), op->interval_ms); +- +- crm_trace("DC version: %s", caller_version); ++ crm_trace("Creating history XML for %s-interval %s action for %s on %s " ++ "(DC version: %s, origin: %s)", ++ pcmk__readable_interval(op->interval_ms), op->op_type, op->rsc_id, ++ ((node == NULL)? "no node" : node), caller_version, origin); + + task = op->op_type; + +-- +2.27.0 + + +From 06b1da9e5345e0d1571042c11646fd7157961279 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Tue, 21 Dec 2021 17:09:44 -0600 +Subject: [PATCH 10/12] Feature: controller: improve exit reason for internal + timeouts + +Functionize the part of controld_record_action_timeout() that creates a fake +executor event, into a new function synthesize_timeout_event(), and have it set +a more detailed exit reason describing what timed out. 
+--- + daemons/controld/controld_te_actions.c | 61 ++++++++++++++++++++------ + 1 file changed, 48 insertions(+), 13 deletions(-) + +diff --git a/daemons/controld/controld_te_actions.c b/daemons/controld/controld_te_actions.c +index b0bcb8b2e4..de2fbb82bf 100644 +--- a/daemons/controld/controld_te_actions.c ++++ b/daemons/controld/controld_te_actions.c +@@ -175,6 +175,53 @@ te_crm_command(crm_graph_t * graph, crm_action_t * action) + return TRUE; + } + ++/*! ++ * \internal ++ * \brief Synthesize an executor event for a resource action timeout ++ * ++ * \param[in] action Resource action that timed out ++ * \param[in] target_rc Expected result of action that timed out ++ * ++ * Synthesize an executor event for a resource action timeout. (If the executor ++ * gets a timeout while waiting for a resource action to complete, that will be ++ * reported via the usual callback. This timeout means we didn't hear from the ++ * executor itself or the controller that relayed the action to the executor.) ++ * ++ * \return Newly created executor event for result of \p action ++ * \note The caller is responsible for freeing the return value using ++ * lrmd_free_event(). 
++ */ ++static lrmd_event_data_t * ++synthesize_timeout_event(crm_action_t *action, int target_rc) ++{ ++ lrmd_event_data_t *op = NULL; ++ const char *target = crm_element_value(action->xml, XML_LRM_ATTR_TARGET); ++ const char *reason = NULL; ++ char *dynamic_reason = NULL; ++ ++ if (pcmk__str_eq(target, get_local_node_name(), pcmk__str_casei)) { ++ reason = "Local executor did not return result in time"; ++ } else { ++ const char *router_node = NULL; ++ ++ router_node = crm_element_value(action->xml, XML_LRM_ATTR_ROUTER_NODE); ++ if (router_node == NULL) { ++ router_node = target; ++ } ++ dynamic_reason = crm_strdup_printf("Controller on %s did not return " ++ "result in time", router_node); ++ reason = dynamic_reason; ++ } ++ ++ op = pcmk__event_from_graph_action(NULL, action, PCMK_EXEC_TIMEOUT, ++ PCMK_OCF_UNKNOWN_ERROR, reason); ++ op->call_id = -1; ++ op->user_data = pcmk__transition_key(transition_graph->id, action->id, ++ target_rc, te_uuid); ++ free(dynamic_reason); ++ return op; ++} ++ + void + controld_record_action_timeout(crm_action_t *action) + { +@@ -231,19 +278,7 @@ controld_record_action_timeout(crm_action_t *action) + crm_copy_xml_element(action_rsc, rsc, XML_AGENT_ATTR_CLASS); + crm_copy_xml_element(action_rsc, rsc, XML_AGENT_ATTR_PROVIDER); + +- /* If the executor gets a timeout while waiting for the action to complete, +- * that will be reported via the usual callback. This timeout means that we +- * didn't hear from the executor or the controller that relayed the action +- * to the executor. 
+- */ +- op = pcmk__event_from_graph_action(NULL, action, PCMK_EXEC_TIMEOUT, +- PCMK_OCF_UNKNOWN_ERROR, +- "Cluster communication timeout " +- "(no response from executor)"); +- op->call_id = -1; +- op->user_data = pcmk__transition_key(transition_graph->id, action->id, +- target_rc, te_uuid); +- ++ op = synthesize_timeout_event(action, target_rc); + pcmk__create_history_xml(rsc, op, CRM_FEATURE_SET, target_rc, target, + __func__); + lrmd_free_event(op); +-- +2.27.0 + + +From be620d206faefab967d4c8567d6554d10c9e72ba Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Wed, 22 Dec 2021 16:35:06 -0600 +Subject: [PATCH 11/12] Feature: fencing: improve exit reason for fencing + timeouts + +Troubleshooting timeouts is one of the more difficult aspects of cluster +maintenance. We want to give as much of a hint as possible, but for fencing in +particular it is difficult because an operation might involve multiple retries +of multiple devices. + +Barring another major project to track exactly which devices, retries, etc., +were used in a given operation, these changes in wording are probably the best +we can do. +--- + daemons/fenced/fenced_remote.c | 8 +++++--- + lib/fencing/st_client.c | 2 +- + 2 files changed, 6 insertions(+), 4 deletions(-) + +diff --git a/daemons/fenced/fenced_remote.c b/daemons/fenced/fenced_remote.c +index 1e237150c5..6eebb7381e 100644 +--- a/daemons/fenced/fenced_remote.c ++++ b/daemons/fenced/fenced_remote.c +@@ -1,5 +1,5 @@ + /* +- * Copyright 2009-2021 the Pacemaker project contributors ++ * Copyright 2009-2022 the Pacemaker project contributors + * + * The version control history for this file may have further details. 
+ * +@@ -715,8 +715,10 @@ remote_op_timeout(gpointer userdata) + CRM_XS " id=%.8s", + op->action, op->target, op->client_name, op->id); + } else { +- finalize_timed_out_op(userdata, "Fencing could not be completed " +- "within overall timeout"); ++ finalize_timed_out_op(userdata, "Fencing did not complete within a " ++ "total timeout based on the " ++ "configured timeout and retries for " ++ "any devices attempted"); + } + return G_SOURCE_REMOVE; + } +diff --git a/lib/fencing/st_client.c b/lib/fencing/st_client.c +index a0f3119f3b..718739b321 100644 +--- a/lib/fencing/st_client.c ++++ b/lib/fencing/st_client.c +@@ -906,7 +906,7 @@ invoke_registered_callbacks(stonith_t *stonith, xmlNode *msg, int call_id) + if (msg == NULL) { + // Fencer didn't reply in time + pcmk__set_result(&result, CRM_EX_ERROR, PCMK_EXEC_TIMEOUT, +- "Timeout waiting for reply from fencer"); ++ "Fencer accepted request but did not reply in time"); + CRM_LOG_ASSERT(call_id > 0); + + } else { +-- +2.27.0 + + +From 0fe8ede2f8e838e335fe42846bdf147111ce9955 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Wed, 22 Dec 2021 17:09:09 -0600 +Subject: [PATCH 12/12] Feature: libcrmservice: improve exit reason for + timeouts + +The services library doesn't have enough information about an action to say +(for example) what configuration parameters might be relevant, but we can at +least distinguish what kind of agent timed out. 
+--- + lib/services/services_linux.c | 12 +++++++++++- + lib/services/systemd.c | 2 +- + 2 files changed, 12 insertions(+), 2 deletions(-) + +diff --git a/lib/services/services_linux.c b/lib/services/services_linux.c +index f15eee860e..d6aafcfe46 100644 +--- a/lib/services/services_linux.c ++++ b/lib/services/services_linux.c +@@ -677,9 +677,19 @@ async_action_complete(mainloop_child_t *p, pid_t pid, int core, int signo, + parse_exit_reason_from_stderr(op); + + } else if (mainloop_child_timeout(p)) { ++ const char *reason = NULL; ++ ++ if (op->rsc != NULL) { ++ reason = "Resource agent did not complete in time"; ++ } else if (pcmk__str_eq(op->standard, PCMK_RESOURCE_CLASS_STONITH, ++ pcmk__str_none)) { ++ reason = "Fence agent did not complete in time"; ++ } else { ++ reason = "Process did not complete in time"; ++ } + crm_info("%s[%d] timed out after %dms", op->id, op->pid, op->timeout); + services__set_result(op, services__generic_error(op), PCMK_EXEC_TIMEOUT, +- "Process did not exit within specified timeout"); ++ reason); + + } else if (op->cancel) { + /* If an in-flight recurring operation was killed because it was +diff --git a/lib/services/systemd.c b/lib/services/systemd.c +index 27a3b376db..d87b287424 100644 +--- a/lib/services/systemd.c ++++ b/lib/services/systemd.c +@@ -995,7 +995,7 @@ systemd_timeout_callback(gpointer p) + crm_info("%s action for systemd unit %s named '%s' timed out", + op->action, op->agent, op->rsc); + services__set_result(op, PCMK_OCF_UNKNOWN_ERROR, PCMK_EXEC_TIMEOUT, +- "Systemd action did not complete within specified timeout"); ++ "Systemd unit action did not complete in time"); + services__finalize_async_op(op); + return FALSE; + } +-- +2.27.0 + diff --git a/SOURCES/019-corosync-tracking.patch b/SOURCES/019-corosync-tracking.patch new file mode 100644 index 0000000..ac3ca96 --- /dev/null +++ b/SOURCES/019-corosync-tracking.patch @@ -0,0 +1,29 @@ +From e8bf0161b872267f1bb7143a9866fdc15ec218f2 Mon Sep 17 00:00:00 2001 +From: Jan 
Friesse +Date: Tue, 18 Jan 2022 16:35:24 +0100 +Subject: [PATCH] Fix: corosync: Repeat corosync_cfg_trackstart + +corosync_cfg_trackstart can fail with CS_ERR_TRY_AGAIN failure so +(similarly as for corosync_cfg_local_get, ...) handle failure with +using cs_repeat macro. +--- + daemons/pacemakerd/pcmkd_corosync.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/daemons/pacemakerd/pcmkd_corosync.c b/daemons/pacemakerd/pcmkd_corosync.c +index 7990bc43c5..cd7a40321d 100644 +--- a/daemons/pacemakerd/pcmkd_corosync.c ++++ b/daemons/pacemakerd/pcmkd_corosync.c +@@ -186,7 +186,8 @@ cluster_connect_cfg(void) + crm_debug("Corosync reports local node ID is %lu", (unsigned long) nodeid); + + #ifdef HAVE_COROSYNC_CFG_TRACKSTART +- rc = corosync_cfg_trackstart(cfg_handle, 0); ++ retries = 0; ++ cs_repeat(retries, 30, rc = corosync_cfg_trackstart(cfg_handle, 0)); + if (rc != CS_OK) { + crm_crit("Could not enable Corosync CFG shutdown tracker: %s " CRM_XS " rc=%d", + cs_strerror(rc), rc); +-- +2.27.0 + diff --git a/SOURCES/020-systemd-unit.patch b/SOURCES/020-systemd-unit.patch new file mode 100644 index 0000000..a425ae3 --- /dev/null +++ b/SOURCES/020-systemd-unit.patch @@ -0,0 +1,41 @@ +From e316840a7e1d2a72e3089ee194334244c959905a Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Wed, 19 Jan 2022 09:53:53 -0600 +Subject: [PATCH] Fix: pacemakerd: tweak systemd unit respawn settings + +If pacemaker exits immediately after starting, wait 1 second before trying to +respawn, since the default of 100ms is a bit aggressive for a Pacemaker +cluster. + +Also, allow 5 attempts in 25 seconds before giving up. 
+--- + daemons/pacemakerd/pacemaker.service.in | 6 ++++++ + 1 file changed, 6 insertions(+) + +diff --git a/daemons/pacemakerd/pacemaker.service.in b/daemons/pacemakerd/pacemaker.service.in +index 0363a2259c..3fd53d9ffb 100644 +--- a/daemons/pacemakerd/pacemaker.service.in ++++ b/daemons/pacemakerd/pacemaker.service.in +@@ -31,6 +31,9 @@ After=rsyslog.service + After=corosync.service + Requires=corosync.service + ++# If Pacemaker respawns repeatedly, give up after this many tries in this time ++StartLimitBurst=5 ++StartLimitIntervalSec=25s + + [Install] + WantedBy=multi-user.target +@@ -57,6 +60,9 @@ TasksMax=infinity + # resource. Sending -KILL will just get the node fenced + SendSIGKILL=no + ++# Systemd's default of respawning a failed service after 100ms is too aggressive ++RestartSec=1s ++ + # If we ever hit the StartLimitInterval/StartLimitBurst limit, and the + # admin wants to stop the cluster while pacemakerd is not running, it + # might be a good idea to enable the ExecStopPost directive below. 
+-- +2.27.0 + diff --git a/SOURCES/021-daemon-tracking.patch b/SOURCES/021-daemon-tracking.patch new file mode 100644 index 0000000..8259921 --- /dev/null +++ b/SOURCES/021-daemon-tracking.patch @@ -0,0 +1,354 @@ +From 9ee9fd6b98d8a5ff5eac57a14cbc0ce1009b10e4 Mon Sep 17 00:00:00 2001 +From: Klaus Wenninger +Date: Thu, 18 Nov 2021 13:23:34 +0100 +Subject: [PATCH 1/2] Feature: pacemakerd: keep tracking pacemakerd for + liveness + +--- + daemons/pacemakerd/pacemakerd.c | 2 + + daemons/pacemakerd/pacemakerd.h | 3 +- + daemons/pacemakerd/pcmkd_messages.c | 6 +- + daemons/pacemakerd/pcmkd_subdaemons.c | 139 +++++++++++++++++--------- + 4 files changed, 98 insertions(+), 52 deletions(-) + +diff --git a/daemons/pacemakerd/pacemakerd.c b/daemons/pacemakerd/pacemakerd.c +index 34d64c4053..062c2d5326 100644 +--- a/daemons/pacemakerd/pacemakerd.c ++++ b/daemons/pacemakerd/pacemakerd.c +@@ -259,6 +259,8 @@ main(int argc, char **argv) + pcmk_ipc_api_t *old_instance = NULL; + qb_ipcs_service_t *ipcs = NULL; + ++ subdaemon_check_progress = time(NULL); ++ + crm_log_preinit(NULL, argc, argv); + mainloop_add_signal(SIGHUP, pcmk_ignore); + mainloop_add_signal(SIGQUIT, pcmk_sigquit); +diff --git a/daemons/pacemakerd/pacemakerd.h b/daemons/pacemakerd/pacemakerd.h +index 7c541bbf9e..424dbbcc5d 100644 +--- a/daemons/pacemakerd/pacemakerd.h ++++ b/daemons/pacemakerd/pacemakerd.h +@@ -1,5 +1,5 @@ + /* +- * Copyright 2010-2021 the Pacemaker project contributors ++ * Copyright 2010-2022 the Pacemaker project contributors + * + * The version control history for this file may have further details. 
+ * +@@ -21,6 +21,7 @@ extern unsigned int shutdown_complete_state_reported_to; + extern gboolean shutdown_complete_state_reported_client_closed; + extern crm_trigger_t *shutdown_trigger; + extern crm_trigger_t *startup_trigger; ++extern time_t subdaemon_check_progress; + + gboolean mcp_read_config(void); + +diff --git a/daemons/pacemakerd/pcmkd_messages.c b/daemons/pacemakerd/pcmkd_messages.c +index 0439986ecf..f2cddc353e 100644 +--- a/daemons/pacemakerd/pcmkd_messages.c ++++ b/daemons/pacemakerd/pcmkd_messages.c +@@ -1,5 +1,5 @@ + /* +- * Copyright 2010-2021 the Pacemaker project contributors ++ * Copyright 2010-2022 the Pacemaker project contributors + * + * The version control history for this file may have further details. + * +@@ -25,7 +25,6 @@ pcmk_handle_ping_request(pcmk__client_t *c, xmlNode *msg, uint32_t id) + const char *value = NULL; + xmlNode *ping = NULL; + xmlNode *reply = NULL; +- time_t pinged = time(NULL); + const char *from = crm_element_value(msg, F_CRM_SYS_FROM); + + /* Pinged for status */ +@@ -36,7 +35,8 @@ pcmk_handle_ping_request(pcmk__client_t *c, xmlNode *msg, uint32_t id) + value = crm_element_value(msg, F_CRM_SYS_TO); + crm_xml_add(ping, XML_PING_ATTR_SYSFROM, value); + crm_xml_add(ping, XML_PING_ATTR_PACEMAKERDSTATE, pacemakerd_state); +- crm_xml_add_ll(ping, XML_ATTR_TSTAMP, (long long) pinged); ++ crm_xml_add_ll(ping, XML_ATTR_TSTAMP, ++ (long long) subdaemon_check_progress); + crm_xml_add(ping, XML_PING_ATTR_STATUS, "ok"); + reply = create_reply(msg, ping); + free_xml(ping); +diff --git a/daemons/pacemakerd/pcmkd_subdaemons.c b/daemons/pacemakerd/pcmkd_subdaemons.c +index a54fcce1ba..c03903c99e 100644 +--- a/daemons/pacemakerd/pcmkd_subdaemons.c ++++ b/daemons/pacemakerd/pcmkd_subdaemons.c +@@ -32,14 +32,16 @@ typedef struct pcmk_child_s { + const char *command; + const char *endpoint; /* IPC server name */ + bool needs_cluster; ++ int check_count; + + /* Anything below here will be dynamically initialized */ + bool needs_retry; + 
bool active_before_startup; + } pcmk_child_t; + +-#define PCMK_PROCESS_CHECK_INTERVAL 5 +-#define SHUTDOWN_ESCALATION_PERIOD 180000 /* 3m */ ++#define PCMK_PROCESS_CHECK_INTERVAL 1 ++#define PCMK_PROCESS_CHECK_RETRIES 5 ++#define SHUTDOWN_ESCALATION_PERIOD 180000 /* 3m */ + + /* Index into the array below */ + #define PCMK_CHILD_CONTROLD 5 +@@ -82,6 +84,7 @@ static char *opts_vgrind[] = { NULL, NULL, NULL, NULL, NULL }; + + crm_trigger_t *shutdown_trigger = NULL; + crm_trigger_t *startup_trigger = NULL; ++time_t subdaemon_check_progress = 0; + + /* When contacted via pacemakerd-api by a client having sbd in + * the name we assume it is sbd-daemon which wants to know +@@ -103,7 +106,6 @@ gboolean running_with_sbd = FALSE; /* local copy */ + GMainLoop *mainloop = NULL; + + static gboolean fatal_error = FALSE; +-static bool global_keep_tracking = false; + + static gboolean check_active_before_startup_processes(gpointer user_data); + static int child_liveness(pcmk_child_t *child); +@@ -127,44 +129,94 @@ pcmkd_cluster_connected(void) + static gboolean + check_active_before_startup_processes(gpointer user_data) + { +- gboolean keep_tracking = FALSE; +- +- for (int i = 0; i < PCMK__NELEM(pcmk_children); i++) { +- if (!pcmk_children[i].active_before_startup) { +- /* we are already tracking it as a child process. */ +- continue; +- } else { +- int rc = child_liveness(&pcmk_children[i]); +- +- switch (rc) { +- case pcmk_rc_ok: +- break; +- case pcmk_rc_ipc_unresponsive: +- case pcmk_rc_ipc_pid_only: // This case: it was previously OK +- if (pcmk_children[i].respawn) { +- crm_err("%s[%lld] terminated%s", pcmk_children[i].name, +- (long long) PCMK__SPECIAL_PID_AS_0(pcmk_children[i].pid), +- (rc == pcmk_rc_ipc_pid_only)? " as IPC server" : ""); +- } else { +- /* orderly shutdown */ +- crm_notice("%s[%lld] terminated%s", pcmk_children[i].name, +- (long long) PCMK__SPECIAL_PID_AS_0(pcmk_children[i].pid), +- (rc == pcmk_rc_ipc_pid_only)? 
" as IPC server" : ""); +- } +- pcmk_process_exit(&(pcmk_children[i])); +- continue; +- default: +- crm_exit(CRM_EX_FATAL); +- break; /* static analysis/noreturn */ ++ static int next_child = 0; ++ int rc = child_liveness(&pcmk_children[next_child]); ++ ++ crm_trace("%s[%lld] checked as %d", ++ pcmk_children[next_child].name, ++ (long long) PCMK__SPECIAL_PID_AS_0( ++ pcmk_children[next_child].pid), ++ rc); ++ ++ switch (rc) { ++ case pcmk_rc_ok: ++ pcmk_children[next_child].check_count = 0; ++ next_child++; ++ subdaemon_check_progress = time(NULL); ++ break; ++ case pcmk_rc_ipc_pid_only: // This case: it was previously OK ++ pcmk_children[next_child].check_count++; ++ if (pcmk_children[next_child].check_count >= PCMK_PROCESS_CHECK_RETRIES) { ++ crm_err("%s[%lld] is unresponsive to ipc after %d tries but " ++ "we found the pid so have it killed that we can restart", ++ pcmk_children[next_child].name, ++ (long long) PCMK__SPECIAL_PID_AS_0( ++ pcmk_children[next_child].pid), ++ pcmk_children[next_child].check_count); ++ stop_child(&pcmk_children[next_child], SIGKILL); ++ if (pcmk_children[next_child].respawn) { ++ /* as long as the respawn-limit isn't reached ++ give it another round of check retries ++ */ ++ pcmk_children[next_child].check_count = 0; ++ } ++ } else { ++ crm_notice("%s[%lld] is unresponsive to ipc after %d tries", ++ pcmk_children[next_child].name, ++ (long long) PCMK__SPECIAL_PID_AS_0( ++ pcmk_children[next_child].pid), ++ pcmk_children[next_child].check_count); ++ if (pcmk_children[next_child].respawn) { ++ /* as long as the respawn-limit isn't reached ++ and we haven't run out of connect retries ++ we account this as progress we are willing ++ to tell to sbd ++ */ ++ subdaemon_check_progress = time(NULL); ++ } + } +- } +- /* at least one of the processes found at startup +- * is still going, so keep this recurring timer around */ +- keep_tracking = TRUE; ++ /* go to the next child and see if ++ we can make progress there ++ */ ++ next_child++; ++ 
break; ++ case pcmk_rc_ipc_unresponsive: ++ if (pcmk_children[next_child].respawn) { ++ crm_err("%s[%lld] terminated", ++ pcmk_children[next_child].name, ++ (long long) PCMK__SPECIAL_PID_AS_0( ++ pcmk_children[next_child].pid)); ++ } else { ++ /* orderly shutdown */ ++ crm_notice("%s[%lld] terminated", ++ pcmk_children[next_child].name, ++ (long long) PCMK__SPECIAL_PID_AS_0( ++ pcmk_children[next_child].pid)); ++ } ++ pcmk_process_exit(&(pcmk_children[next_child])); ++ if (!pcmk_children[next_child].respawn) { ++ /* if a subdaemon is down and we don't want it ++ to be restarted this is a success during ++ shutdown. if it isn't restarted anymore ++ due to MAX_RESPAWN it is ++ rather no success. ++ */ ++ if (pcmk_children[next_child].respawn_count <= MAX_RESPAWN) { ++ subdaemon_check_progress = time(NULL); ++ } ++ next_child++; ++ } ++ break; ++ default: ++ crm_exit(CRM_EX_FATAL); ++ break; /* static analysis/noreturn */ + } + +- global_keep_tracking = keep_tracking; +- return keep_tracking; ++ if (next_child >= PCMK__NELEM(pcmk_children)) { ++ next_child = 0; ++ } ++ ++ return G_SOURCE_CONTINUE; + } + + static gboolean +@@ -257,11 +309,6 @@ pcmk_process_exit(pcmk_child_t * child) + child->name, child->endpoint); + /* need to monitor how it evolves, and start new process if badly */ + child->active_before_startup = true; +- if (!global_keep_tracking) { +- global_keep_tracking = true; +- g_timeout_add_seconds(PCMK_PROCESS_CHECK_INTERVAL, +- check_active_before_startup_processes, NULL); +- } + + } else { + if (child->needs_cluster && !pcmkd_cluster_connected()) { +@@ -648,7 +695,6 @@ child_liveness(pcmk_child_t *child) + int + find_and_track_existing_processes(void) + { +- bool tracking = false; + bool wait_in_progress; + int rc; + size_t i, rounds; +@@ -716,7 +762,6 @@ find_and_track_existing_processes(void) + pcmk_children[i].pid)); + pcmk_children[i].respawn_count = -1; /* 0~keep watching */ + pcmk_children[i].active_before_startup = true; +- tracking = true; + 
break; + case pcmk_rc_ipc_pid_only: + if (pcmk_children[i].respawn_count == WAIT_TRIES) { +@@ -751,10 +796,8 @@ find_and_track_existing_processes(void) + pcmk_children[i].respawn_count = 0; /* restore pristine state */ + } + +- if (tracking) { +- g_timeout_add_seconds(PCMK_PROCESS_CHECK_INTERVAL, ++ g_timeout_add_seconds(PCMK_PROCESS_CHECK_INTERVAL, + check_active_before_startup_processes, NULL); +- } + return pcmk_rc_ok; + } + +-- +2.27.0 + + +From 4b60aa100669ff494dd3f1303ca9586dc52e95e4 Mon Sep 17 00:00:00 2001 +From: Klaus Wenninger +Date: Thu, 9 Dec 2021 11:25:22 +0100 +Subject: [PATCH 2/2] Fix: ipc_client: use libqb async API for connect + +--- + configure.ac | 3 +++ + lib/common/ipc_client.c | 22 ++++++++++++++++++++++ + 2 files changed, 25 insertions(+) + +diff --git a/configure.ac b/configure.ac +index f43fb724c7..c747fe1193 100644 +--- a/configure.ac ++++ b/configure.ac +@@ -1309,6 +1309,9 @@ PKG_CHECK_MODULES(libqb, libqb >= 0.17) + CPPFLAGS="$libqb_CFLAGS $CPPFLAGS" + LIBS="$libqb_LIBS $LIBS" + ++dnl libqb libqb-2.0.3 + ipc-connect-async-API (2022-01) ++AC_CHECK_FUNCS([qb_ipcc_connect_async]) ++ + dnl libqb 2.0.2+ (2020-10) + AC_CHECK_FUNCS(qb_ipcc_auth_get, + AC_DEFINE(HAVE_IPCC_AUTH_GET, 1, +diff --git a/lib/common/ipc_client.c b/lib/common/ipc_client.c +index c5afdf3a3d..417b9ef175 100644 +--- a/lib/common/ipc_client.c ++++ b/lib/common/ipc_client.c +@@ -1407,13 +1407,35 @@ pcmk__ipc_is_authentic_process_active(const char *name, uid_t refuid, + int32_t qb_rc; + pid_t found_pid = 0; uid_t found_uid = 0; gid_t found_gid = 0; + qb_ipcc_connection_t *c; ++#ifdef HAVE_QB_IPCC_CONNECT_ASYNC ++ struct pollfd pollfd = { 0, }; ++ int poll_rc; + ++ c = qb_ipcc_connect_async(name, 0, ++ &(pollfd.fd)); ++#else + c = qb_ipcc_connect(name, 0); ++#endif + if (c == NULL) { + crm_info("Could not connect to %s IPC: %s", name, strerror(errno)); + rc = pcmk_rc_ipc_unresponsive; + goto bail; + } ++#ifdef HAVE_QB_IPCC_CONNECT_ASYNC ++ pollfd.events = POLLIN; ++ do { ++ 
poll_rc = poll(&pollfd, 1, 2000); ++ } while ((poll_rc == -1) && (errno == EINTR)); ++ if ((poll_rc <= 0) || (qb_ipcc_connect_continue(c) != 0)) { ++ crm_info("Could not connect to %s IPC: %s", name, ++ (poll_rc == 0)?"timeout":strerror(errno)); ++ rc = pcmk_rc_ipc_unresponsive; ++ if (poll_rc > 0) { ++ c = NULL; // qb_ipcc_connect_continue cleaned up for us ++ } ++ goto bail; ++ } ++#endif + + qb_rc = qb_ipcc_fd_get(c, &fd); + if (qb_rc != 0) { +-- +2.27.0 + diff --git a/SOURCES/022-failure-messages.patch b/SOURCES/022-failure-messages.patch new file mode 100644 index 0000000..fab1013 --- /dev/null +++ b/SOURCES/022-failure-messages.patch @@ -0,0 +1,1338 @@ +From 9ee3d6c9b0aba6aae022cc152a3b3472fe388fa3 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Thu, 6 Jan 2022 16:44:32 -0600 +Subject: [PATCH 01/15] Refactor: fencer: add exit reason to fencing operation + object + +In order to pass a fencing action's exit reason with the action history, +we need the exit reason in remote_fencing_op_t. Nothing sets or uses it as of +this commit. 
+--- + daemons/fenced/fenced_remote.c | 2 ++ + daemons/fenced/pacemaker-fenced.h | 4 +++- + 2 files changed, 5 insertions(+), 1 deletion(-) + +diff --git a/daemons/fenced/fenced_remote.c b/daemons/fenced/fenced_remote.c +index 6eebb7381e..0fa9706140 100644 +--- a/daemons/fenced/fenced_remote.c ++++ b/daemons/fenced/fenced_remote.c +@@ -260,6 +260,8 @@ free_remote_op(gpointer data) + } + g_list_free_full(op->automatic_list, free); + g_list_free(op->duplicates); ++ ++ pcmk__reset_result(&op->result); + free(op); + } + +diff --git a/daemons/fenced/pacemaker-fenced.h b/daemons/fenced/pacemaker-fenced.h +index 502fcc9a29..1a5c933ea7 100644 +--- a/daemons/fenced/pacemaker-fenced.h ++++ b/daemons/fenced/pacemaker-fenced.h +@@ -1,5 +1,5 @@ + /* +- * Copyright 2009-2021 the Pacemaker project contributors ++ * Copyright 2009-2022 the Pacemaker project contributors + * + * This source code is licensed under the GNU General Public License version 2 + * or later (GPLv2+) WITHOUT ANY WARRANTY. +@@ -151,6 +151,8 @@ typedef struct remote_fencing_op_s { + /*! The point at which the remote operation completed(nsec) */ + long long completed_nsec; + ++ /*! The (potentially intermediate) result of the operation */ ++ pcmk__action_result_t result; + } remote_fencing_op_t; + + void fenced_broadcast_op_result(remote_fencing_op_t *op, +-- +2.27.0 + + +From 97a2c318866adc5ef5e426c5c3b753df1fa3ab66 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Thu, 6 Jan 2022 17:08:42 -0600 +Subject: [PATCH 02/15] Refactor: fencer: track full result in + remote_fencing_op_t + +Now that remote_fencing_op_t has a place for the full result, +set it before calling finalize_op(), instead of passing a separate result +object to finalize_op(). + +As a bonus, this simplifies the memory management, reducing the chance of +mistakes. 
+--- + daemons/fenced/fenced_remote.c | 161 ++++++++++++++++----------------- + 1 file changed, 77 insertions(+), 84 deletions(-) + +diff --git a/daemons/fenced/fenced_remote.c b/daemons/fenced/fenced_remote.c +index 0fa9706140..30edbff890 100644 +--- a/daemons/fenced/fenced_remote.c ++++ b/daemons/fenced/fenced_remote.c +@@ -82,8 +82,7 @@ extern xmlNode *stonith_create_op(int call_id, const char *token, const char *op + static void request_peer_fencing(remote_fencing_op_t *op, + peer_device_info_t *peer, + pcmk__action_result_t *result); +-static void finalize_op(remote_fencing_op_t *op, xmlNode *data, +- pcmk__action_result_t *result, bool dup); ++static void finalize_op(remote_fencing_op_t *op, xmlNode *data, bool dup); + static void report_timeout_period(remote_fencing_op_t * op, int op_timeout); + static int get_op_total_timeout(const remote_fencing_op_t *op, + const peer_device_info_t *chosen_peer); +@@ -485,7 +484,9 @@ finalize_op_duplicates(remote_fencing_op_t *op, xmlNode *data, + other->client_name, other->originator, + pcmk_exec_status_str(result->execution_status), + other->id); +- finalize_op(other, data, result, true); ++ pcmk__set_result(&other->result, result->exit_status, ++ result->execution_status, result->exit_reason); ++ finalize_op(other, data, true); + + } else { + // Possible if (for example) it timed out already +@@ -520,20 +521,20 @@ delegate_from_xml(xmlNode *xml) + * + * \param[in] op Fencer operation that completed + * \param[in] data If not NULL, XML reply of last delegated fencing operation +- * \param[in] result Full operation result + * \param[in] dup Whether this operation is a duplicate of another + * (in which case, do not broadcast the result) ++ * ++ * \note The operation result should be set before calling this function. 
+ */ + static void +-finalize_op(remote_fencing_op_t *op, xmlNode *data, +- pcmk__action_result_t *result, bool dup) ++finalize_op(remote_fencing_op_t *op, xmlNode *data, bool dup) + { + int level = LOG_ERR; + const char *subt = NULL; + xmlNode *local_data = NULL; + gboolean op_merged = FALSE; + +- CRM_CHECK((op != NULL) && (result != NULL), return); ++ CRM_CHECK((op != NULL), return); + + if (op->notify_sent) { + // Most likely, this is a timed-out action that eventually completed +@@ -557,11 +558,11 @@ finalize_op(remote_fencing_op_t *op, xmlNode *data, + local_data = data; + + } else if (op->delegate == NULL) { +- switch (result->execution_status) { ++ switch (op->result.execution_status) { + case PCMK_EXEC_NO_FENCE_DEVICE: + break; + case PCMK_EXEC_INVALID: +- if (result->exit_status == CRM_EX_EXPIRED) { ++ if (op->result.exit_status == CRM_EX_EXPIRED) { + break; + } + // else fall through +@@ -581,12 +582,12 @@ finalize_op(remote_fencing_op_t *op, xmlNode *data, + subt = crm_element_value(data, F_SUBTYPE); + if (!dup && !pcmk__str_eq(subt, "broadcast", pcmk__str_casei)) { + /* Defer notification until the bcast message arrives */ +- fenced_broadcast_op_result(op, result, op_merged); ++ fenced_broadcast_op_result(op, &op->result, op_merged); + free_xml(local_data); + return; + } + +- if (pcmk__result_ok(result) || dup ++ if (pcmk__result_ok(&op->result) || dup + || !pcmk__str_eq(op->originator, stonith_our_uname, pcmk__str_casei)) { + level = LOG_NOTICE; + } +@@ -595,16 +596,17 @@ finalize_op(remote_fencing_op_t *op, xmlNode *data, + (op->target? op->target : ""), + (op->delegate? op->delegate : "unknown node"), + op->client_name, op->originator, +- (op_merged? " (merged)" : ""), crm_exit_str(result->exit_status), +- pcmk_exec_status_str(result->execution_status), +- ((result->exit_reason == NULL)? "" : ": "), +- ((result->exit_reason == NULL)? "" : result->exit_reason), ++ (op_merged? 
" (merged)" : ""), ++ crm_exit_str(op->result.exit_status), ++ pcmk_exec_status_str(op->result.execution_status), ++ ((op->result.exit_reason == NULL)? "" : ": "), ++ ((op->result.exit_reason == NULL)? "" : op->result.exit_reason), + op->id); + +- handle_local_reply_and_notify(op, data, result); ++ handle_local_reply_and_notify(op, data, &op->result); + + if (!dup) { +- finalize_op_duplicates(op, data, result); ++ finalize_op_duplicates(op, data, &op->result); + } + + /* Free non-essential parts of the record +@@ -634,7 +636,6 @@ static gboolean + remote_op_watchdog_done(gpointer userdata) + { + remote_fencing_op_t *op = userdata; +- pcmk__action_result_t result = PCMK__UNKNOWN_RESULT; + + op->op_timer_one = 0; + +@@ -642,8 +643,8 @@ remote_op_watchdog_done(gpointer userdata) + CRM_XS " id=%.8s", + op->action, op->target, op->client_name, op->id); + op->state = st_done; +- pcmk__set_result(&result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); +- finalize_op(op, NULL, &result, false); ++ pcmk__set_result(&op->result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); ++ finalize_op(op, NULL, false); + return G_SOURCE_REMOVE; + } + +@@ -676,8 +677,6 @@ remote_op_timeout_one(gpointer userdata) + static void + finalize_timed_out_op(remote_fencing_op_t *op, const char *reason) + { +- pcmk__action_result_t result = PCMK__UNKNOWN_RESULT; +- + op->op_timer_total = 0; + + crm_debug("Action '%s' targeting %s for client %s timed out " +@@ -690,13 +689,12 @@ finalize_timed_out_op(remote_fencing_op_t *op, const char *reason) + * devices, and return success. 
+ */ + op->state = st_done; +- pcmk__set_result(&result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); ++ pcmk__set_result(&op->result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); + } else { + op->state = st_failed; +- pcmk__set_result(&result, CRM_EX_ERROR, PCMK_EXEC_TIMEOUT, reason); ++ pcmk__set_result(&op->result, CRM_EX_ERROR, PCMK_EXEC_TIMEOUT, reason); + } +- finalize_op(op, NULL, &result, false); +- pcmk__reset_result(&result); ++ finalize_op(op, NULL, false); + } + + /*! +@@ -1094,13 +1092,9 @@ fenced_handle_manual_confirmation(pcmk__client_t *client, xmlNode *msg) + set_fencing_completed(op); + op->delegate = strdup("a human"); + +- { +- // For the fencer's purposes, the fencing operation is done +- pcmk__action_result_t result = PCMK__UNKNOWN_RESULT; +- +- pcmk__set_result(&result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); +- finalize_op(op, msg, &result, false); +- } ++ // For the fencer's purposes, the fencing operation is done ++ pcmk__set_result(&op->result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); ++ finalize_op(op, msg, false); + + /* For the requester's purposes, the operation is still pending. The + * actual result will be sent asynchronously via the operation's done_cb(). 
+@@ -1279,16 +1273,11 @@ initiate_remote_stonith_op(pcmk__client_t *client, xmlNode *request, + switch (op->state) { + case st_failed: + // advance_topology_level() exhausted levels +- { +- pcmk__action_result_t result = PCMK__UNKNOWN_RESULT; +- +- pcmk__set_result(&result, CRM_EX_ERROR, PCMK_EXEC_ERROR, +- "All topology levels failed"); +- crm_warn("Could not request peer fencing (%s) targeting %s " +- CRM_XS " id=%.8s", op->action, op->target, op->id); +- finalize_op(op, NULL, &result, false); +- pcmk__reset_result(&result); +- } ++ pcmk__set_result(&op->result, CRM_EX_ERROR, PCMK_EXEC_ERROR, ++ "All topology levels failed"); ++ crm_warn("Could not request peer fencing (%s) targeting %s " ++ CRM_XS " id=%.8s", op->action, op->target, op->id); ++ finalize_op(op, NULL, false); + return op; + + case st_duplicate: +@@ -1613,10 +1602,6 @@ static void + advance_topology_device_in_level(remote_fencing_op_t *op, const char *device, + xmlNode *msg) + { +- pcmk__action_result_t result = PCMK__UNKNOWN_RESULT; +- +- pcmk__set_result(&result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); +- + /* Advance to the next device at this topology level, if any */ + if (op->devices) { + op->devices = op->devices->next; +@@ -1644,6 +1629,10 @@ advance_topology_device_in_level(remote_fencing_op_t *op, const char *device, + } + + if (op->devices) { ++ pcmk__action_result_t result = PCMK__UNKNOWN_RESULT; ++ ++ pcmk__set_result(&result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); ++ + /* Necessary devices remain, so execute the next one */ + crm_trace("Next targeting %s on behalf of %s@%s", + op->target, op->client_name, op->originator); +@@ -1659,7 +1648,8 @@ advance_topology_device_in_level(remote_fencing_op_t *op, const char *device, + crm_trace("Marking complex fencing op targeting %s as complete", + op->target); + op->state = st_done; +- finalize_op(op, msg, &result, false); ++ pcmk__set_result(&op->result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); ++ finalize_op(op, msg, false); + } + } + +@@ -1868,7 +1858,9 @@ 
request_peer_fencing(remote_fencing_op_t *op, peer_device_info_t *peer, + } + + op->state = st_failed; +- finalize_op(op, NULL, result, false); ++ pcmk__set_result(&op->result, result->exit_status, ++ result->execution_status, result->exit_reason); ++ finalize_op(op, NULL, false); + + } else { + crm_info("Waiting for additional peers capable of fencing (%s) %s%s%s " +@@ -2245,31 +2237,34 @@ fenced_process_fencing_reply(xmlNode *msg) + /* Could be for an event that began before we started */ + /* TODO: Record the op for later querying */ + crm_info("Received peer result of unknown or expired operation %s", id); +- goto done; ++ pcmk__reset_result(&result); ++ return; + } + ++ op->result = result; // The operation takes ownership of the result ++ + if (op->devices && device && !pcmk__str_eq(op->devices->data, device, pcmk__str_casei)) { + crm_err("Received outdated reply for device %s (instead of %s) to " + "fence (%s) %s. Operation already timed out at peer level.", + device, (const char *) op->devices->data, op->action, op->target); +- goto done; ++ return; + } + + if (pcmk__str_eq(crm_element_value(msg, F_SUBTYPE), "broadcast", pcmk__str_casei)) { + crm_debug("Finalizing action '%s' targeting %s on behalf of %s@%s: %s%s%s%s " + CRM_XS " id=%.8s", + op->action, op->target, op->client_name, op->originator, +- pcmk_exec_status_str(result.execution_status), +- (result.exit_reason == NULL)? "" : " (", +- (result.exit_reason == NULL)? "" : result.exit_reason, +- (result.exit_reason == NULL)? "" : ")", op->id); +- if (pcmk__result_ok(&result)) { ++ pcmk_exec_status_str(op->result.execution_status), ++ (op->result.exit_reason == NULL)? "" : " (", ++ (op->result.exit_reason == NULL)? "" : op->result.exit_reason, ++ (op->result.exit_reason == NULL)? 
"" : ")", op->id); ++ if (pcmk__result_ok(&op->result)) { + op->state = st_done; + } else { + op->state = st_failed; + } +- finalize_op(op, msg, &result, false); +- goto done; ++ finalize_op(op, msg, false); ++ return; + + } else if (!pcmk__str_eq(op->originator, stonith_our_uname, pcmk__str_casei)) { + /* If this isn't a remote level broadcast, and we are not the +@@ -2277,7 +2272,7 @@ fenced_process_fencing_reply(xmlNode *msg) + crm_err("Received non-broadcast fencing result for operation %.8s " + "we do not own (device %s targeting %s)", + op->id, device, op->target); +- goto done; ++ return; + } + + if (pcmk_is_set(op->call_options, st_opt_topology)) { +@@ -2286,58 +2281,58 @@ fenced_process_fencing_reply(xmlNode *msg) + crm_notice("Action '%s' targeting %s using %s on behalf of %s@%s: %s%s%s%s", + op->action, op->target, device, op->client_name, + op->originator, +- pcmk_exec_status_str(result.execution_status), +- (result.exit_reason == NULL)? "" : " (", +- (result.exit_reason == NULL)? "" : result.exit_reason, +- (result.exit_reason == NULL)? "" : ")"); ++ pcmk_exec_status_str(op->result.execution_status), ++ (op->result.exit_reason == NULL)? "" : " (", ++ (op->result.exit_reason == NULL)? "" : op->result.exit_reason, ++ (op->result.exit_reason == NULL)? "" : ")"); + + /* We own the op, and it is complete. broadcast the result to all nodes + * and notify our local clients. */ + if (op->state == st_done) { +- finalize_op(op, msg, &result, false); +- goto done; ++ finalize_op(op, msg, false); ++ return; + } + +- if ((op->phase == 2) && !pcmk__result_ok(&result)) { ++ if ((op->phase == 2) && !pcmk__result_ok(&op->result)) { + /* A remapped "on" failed, but the node was already turned off + * successfully, so ignore the error and continue. + */ + crm_warn("Ignoring %s 'on' failure (%s%s%s) targeting %s " + "after successful 'off'", +- device, pcmk_exec_status_str(result.execution_status), +- (result.exit_reason == NULL)? 
"" : ": ", +- (result.exit_reason == NULL)? "" : result.exit_reason, ++ device, pcmk_exec_status_str(op->result.execution_status), ++ (op->result.exit_reason == NULL)? "" : ": ", ++ (op->result.exit_reason == NULL)? "" : op->result.exit_reason, + op->target); +- pcmk__set_result(&result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); ++ pcmk__set_result(&op->result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); + } + +- if (pcmk__result_ok(&result)) { ++ if (pcmk__result_ok(&op->result)) { + /* An operation completed successfully. Try another device if + * necessary, otherwise mark the operation as done. */ + advance_topology_device_in_level(op, device, msg); +- goto done; ++ return; + } else { + /* This device failed, time to try another topology level. If no other + * levels are available, mark this operation as failed and report results. */ + if (advance_topology_level(op, false) != pcmk_rc_ok) { + op->state = st_failed; +- finalize_op(op, msg, &result, false); +- goto done; ++ finalize_op(op, msg, false); ++ return; + } + } + +- } else if (pcmk__result_ok(&result) && (op->devices == NULL)) { ++ } else if (pcmk__result_ok(&op->result) && (op->devices == NULL)) { + crm_trace("All done for %s", op->target); + op->state = st_done; +- finalize_op(op, msg, &result, false); +- goto done; ++ finalize_op(op, msg, false); ++ return; + +- } else if ((result.execution_status == PCMK_EXEC_TIMEOUT) ++ } else if ((op->result.execution_status == PCMK_EXEC_TIMEOUT) + && (op->devices == NULL)) { + /* If the operation timed out don't bother retrying other peers. 
*/ + op->state = st_failed; +- finalize_op(op, msg, &result, false); +- goto done; ++ finalize_op(op, msg, false); ++ return; + + } else { + /* fall-through and attempt other fencing action using another peer */ +@@ -2346,10 +2341,8 @@ fenced_process_fencing_reply(xmlNode *msg) + /* Retry on failure */ + crm_trace("Next for %s on behalf of %s@%s (result was: %s)", + op->target, op->originator, op->client_name, +- pcmk_exec_status_str(result.execution_status)); +- request_peer_fencing(op, NULL, &result); +-done: +- pcmk__reset_result(&result); ++ pcmk_exec_status_str(op->result.execution_status)); ++ request_peer_fencing(op, NULL, &op->result); + } + + gboolean +-- +2.27.0 + + +From c59d062154f7c9e15e90929a20ea244d7efd7247 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Thu, 6 Jan 2022 17:11:12 -0600 +Subject: [PATCH 03/15] Refactor: fencer: drop redundant argument from + finalize_op_duplicates() + +... now that the result is in the op +--- + daemons/fenced/fenced_remote.c | 13 ++++++------- + 1 file changed, 6 insertions(+), 7 deletions(-) + +diff --git a/daemons/fenced/fenced_remote.c b/daemons/fenced/fenced_remote.c +index 30edbff890..8b496e1042 100644 +--- a/daemons/fenced/fenced_remote.c ++++ b/daemons/fenced/fenced_remote.c +@@ -468,11 +468,9 @@ handle_local_reply_and_notify(remote_fencing_op_t *op, xmlNode *data, + * + * \param[in] op Fencer operation that completed + * \param[in] data Top-level XML to add notification to +- * \param[in] result Full operation result + */ + static void +-finalize_op_duplicates(remote_fencing_op_t *op, xmlNode *data, +- pcmk__action_result_t *result) ++finalize_op_duplicates(remote_fencing_op_t *op, xmlNode *data) + { + for (GList *iter = op->duplicates; iter != NULL; iter = iter->next) { + remote_fencing_op_t *other = iter->data; +@@ -482,10 +480,11 @@ finalize_op_duplicates(remote_fencing_op_t *op, xmlNode *data, + crm_debug("Performing duplicate notification for %s@%s: %s " + CRM_XS " id=%.8s", + other->client_name, 
other->originator, +- pcmk_exec_status_str(result->execution_status), ++ pcmk_exec_status_str(op->result.execution_status), + other->id); +- pcmk__set_result(&other->result, result->exit_status, +- result->execution_status, result->exit_reason); ++ pcmk__set_result(&other->result, op->result.exit_status, ++ op->result.execution_status, ++ op->result.exit_reason); + finalize_op(other, data, true); + + } else { +@@ -606,7 +605,7 @@ finalize_op(remote_fencing_op_t *op, xmlNode *data, bool dup) + handle_local_reply_and_notify(op, data, &op->result); + + if (!dup) { +- finalize_op_duplicates(op, data, &op->result); ++ finalize_op_duplicates(op, data); + } + + /* Free non-essential parts of the record +-- +2.27.0 + + +From 6c49675855323a52a534afa112a0861ba2e3b1ad Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Thu, 6 Jan 2022 17:15:17 -0600 +Subject: [PATCH 04/15] Refactor: fencer: drop redundant argument from + fenced_broadcast_op_result() + +... now that the op includes the result +--- + daemons/fenced/fenced_history.c | 9 +++------ + daemons/fenced/fenced_remote.c | 8 +++----- + daemons/fenced/pacemaker-fenced.h | 3 +-- + 3 files changed, 7 insertions(+), 13 deletions(-) + +diff --git a/daemons/fenced/fenced_history.c b/daemons/fenced/fenced_history.c +index 0157deadb3..5cacf36ca8 100644 +--- a/daemons/fenced/fenced_history.c ++++ b/daemons/fenced/fenced_history.c +@@ -1,5 +1,5 @@ + /* +- * Copyright 2009-2021 the Pacemaker project contributors ++ * Copyright 2009-2022 the Pacemaker project contributors + * + * The version control history for this file may have further details. 
+ * +@@ -359,8 +359,6 @@ stonith_local_history_diff_and_merge(GHashTable *remote_history, + } + + if (remote_history) { +- pcmk__action_result_t result = PCMK__UNKNOWN_RESULT; +- + init_stonith_remote_op_hash_table(&stonith_remote_op_list); + + updated |= g_hash_table_size(remote_history); +@@ -378,10 +376,10 @@ stonith_local_history_diff_and_merge(GHashTable *remote_history, + /* CRM_EX_EXPIRED + PCMK_EXEC_INVALID prevents finalize_op() + * from setting a delegate + */ +- pcmk__set_result(&result, CRM_EX_EXPIRED, PCMK_EXEC_INVALID, ++ pcmk__set_result(&op->result, CRM_EX_EXPIRED, PCMK_EXEC_INVALID, + "Initiated by earlier fencer " + "process and presumed failed"); +- fenced_broadcast_op_result(op, &result, false); ++ fenced_broadcast_op_result(op, false); + } + + g_hash_table_iter_steal(&iter); +@@ -396,7 +394,6 @@ stonith_local_history_diff_and_merge(GHashTable *remote_history, + */ + } + +- pcmk__reset_result(&result); + g_hash_table_destroy(remote_history); /* remove what is left */ + } + +diff --git a/daemons/fenced/fenced_remote.c b/daemons/fenced/fenced_remote.c +index 8b496e1042..fb5a5e980e 100644 +--- a/daemons/fenced/fenced_remote.c ++++ b/daemons/fenced/fenced_remote.c +@@ -390,16 +390,14 @@ fencing_result2xml(remote_fencing_op_t *op, pcmk__action_result_t *result) + * \brief Broadcast a fence result notification to all CPG peers + * + * \param[in] op Fencer operation that completed +- * \param[in] result Full operation result + * \param[in] op_merged Whether this operation is a duplicate of another + */ + void +-fenced_broadcast_op_result(remote_fencing_op_t *op, +- pcmk__action_result_t *result, bool op_merged) ++fenced_broadcast_op_result(remote_fencing_op_t *op, bool op_merged) + { + static int count = 0; + xmlNode *bcast = create_xml_node(NULL, T_STONITH_REPLY); +- xmlNode *notify_data = fencing_result2xml(op, result); ++ xmlNode *notify_data = fencing_result2xml(op, &op->result); + + count++; + crm_trace("Broadcasting result to peers"); +@@ -581,7 
+579,7 @@ finalize_op(remote_fencing_op_t *op, xmlNode *data, bool dup) + subt = crm_element_value(data, F_SUBTYPE); + if (!dup && !pcmk__str_eq(subt, "broadcast", pcmk__str_casei)) { + /* Defer notification until the bcast message arrives */ +- fenced_broadcast_op_result(op, &op->result, op_merged); ++ fenced_broadcast_op_result(op, op_merged); + free_xml(local_data); + return; + } +diff --git a/daemons/fenced/pacemaker-fenced.h b/daemons/fenced/pacemaker-fenced.h +index 1a5c933ea7..6213407da3 100644 +--- a/daemons/fenced/pacemaker-fenced.h ++++ b/daemons/fenced/pacemaker-fenced.h +@@ -155,8 +155,7 @@ typedef struct remote_fencing_op_s { + pcmk__action_result_t result; + } remote_fencing_op_t; + +-void fenced_broadcast_op_result(remote_fencing_op_t *op, +- pcmk__action_result_t *result, bool op_merged); ++void fenced_broadcast_op_result(remote_fencing_op_t *op, bool op_merged); + + // Fencer-specific client flags + enum st_client_flags { +-- +2.27.0 + + +From 73994fc740b8833457b130368db479502d49f285 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Thu, 6 Jan 2022 17:17:33 -0600 +Subject: [PATCH 05/15] Refactor: fencer: drop redundant argument from + handle_local_reply_and_notify() + +... 
now that the op includes the result +--- + daemons/fenced/fenced_remote.c | 12 +++++------- + 1 file changed, 5 insertions(+), 7 deletions(-) + +diff --git a/daemons/fenced/fenced_remote.c b/daemons/fenced/fenced_remote.c +index fb5a5e980e..2621cb2f19 100644 +--- a/daemons/fenced/fenced_remote.c ++++ b/daemons/fenced/fenced_remote.c +@@ -424,11 +424,9 @@ fenced_broadcast_op_result(remote_fencing_op_t *op, bool op_merged) + * + * \param[in] op Fencer operation that completed + * \param[in] data Top-level XML to add notification to +- * \param[in] result Full operation result + */ + static void +-handle_local_reply_and_notify(remote_fencing_op_t *op, xmlNode *data, +- pcmk__action_result_t *result) ++handle_local_reply_and_notify(remote_fencing_op_t *op, xmlNode *data) + { + xmlNode *notify_data = NULL; + xmlNode *reply = NULL; +@@ -443,15 +441,15 @@ handle_local_reply_and_notify(remote_fencing_op_t *op, xmlNode *data, + crm_xml_add(data, F_STONITH_TARGET, op->target); + crm_xml_add(data, F_STONITH_OPERATION, op->action); + +- reply = fenced_construct_reply(op->request, data, result); ++ reply = fenced_construct_reply(op->request, data, &op->result); + crm_xml_add(reply, F_STONITH_DELEGATE, op->delegate); + + /* Send fencing OP reply to local client that initiated fencing */ + do_local_reply(reply, op->client_id, op->call_options & st_opt_sync_call, FALSE); + + /* bcast to all local clients that the fencing operation happend */ +- notify_data = fencing_result2xml(op, result); +- fenced_send_notification(T_STONITH_NOTIFY_FENCE, result, notify_data); ++ notify_data = fencing_result2xml(op, &op->result); ++ fenced_send_notification(T_STONITH_NOTIFY_FENCE, &op->result, notify_data); + free_xml(notify_data); + fenced_send_notification(T_STONITH_NOTIFY_HISTORY, NULL, NULL); + +@@ -600,7 +598,7 @@ finalize_op(remote_fencing_op_t *op, xmlNode *data, bool dup) + ((op->result.exit_reason == NULL)? 
"" : op->result.exit_reason), + op->id); + +- handle_local_reply_and_notify(op, data, &op->result); ++ handle_local_reply_and_notify(op, data); + + if (!dup) { + finalize_op_duplicates(op, data); +-- +2.27.0 + + +From 194056d18d3b550d3a53b94d558ceed03b5e5442 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Thu, 6 Jan 2022 17:18:27 -0600 +Subject: [PATCH 06/15] Refactor: fencer: drop redundant argument from + fencing_result2xml() + +... now that the op includes the result +--- + daemons/fenced/fenced_remote.c | 9 ++++----- + 1 file changed, 4 insertions(+), 5 deletions(-) + +diff --git a/daemons/fenced/fenced_remote.c b/daemons/fenced/fenced_remote.c +index 2621cb2f19..8d4f53eef6 100644 +--- a/daemons/fenced/fenced_remote.c ++++ b/daemons/fenced/fenced_remote.c +@@ -362,13 +362,12 @@ undo_op_remap(remote_fencing_op_t *op) + * \brief Create notification data XML for a fencing operation result + * + * \param[in] op Fencer operation that completed +- * \param[in] result Full operation result + * + * \return Newly created XML to add as notification data + * \note The caller is responsible for freeing the result. 
+ */ + static xmlNode * +-fencing_result2xml(remote_fencing_op_t *op, pcmk__action_result_t *result) ++fencing_result2xml(remote_fencing_op_t *op) + { + xmlNode *notify_data = create_xml_node(NULL, T_STONITH_NOTIFY_FENCE); + +@@ -381,7 +380,7 @@ fencing_result2xml(remote_fencing_op_t *op, pcmk__action_result_t *result) + crm_xml_add(notify_data, F_STONITH_CLIENTID, op->client_id); + crm_xml_add(notify_data, F_STONITH_CLIENTNAME, op->client_name); + +- stonith__xe_set_result(notify_data, result); ++ stonith__xe_set_result(notify_data, &op->result); + return notify_data; + } + +@@ -397,7 +396,7 @@ fenced_broadcast_op_result(remote_fencing_op_t *op, bool op_merged) + { + static int count = 0; + xmlNode *bcast = create_xml_node(NULL, T_STONITH_REPLY); +- xmlNode *notify_data = fencing_result2xml(op, &op->result); ++ xmlNode *notify_data = fencing_result2xml(op); + + count++; + crm_trace("Broadcasting result to peers"); +@@ -448,7 +447,7 @@ handle_local_reply_and_notify(remote_fencing_op_t *op, xmlNode *data) + do_local_reply(reply, op->client_id, op->call_options & st_opt_sync_call, FALSE); + + /* bcast to all local clients that the fencing operation happend */ +- notify_data = fencing_result2xml(op, &op->result); ++ notify_data = fencing_result2xml(op); + fenced_send_notification(T_STONITH_NOTIFY_FENCE, &op->result, notify_data); + free_xml(notify_data); + fenced_send_notification(T_STONITH_NOTIFY_HISTORY, NULL, NULL); +-- +2.27.0 + + +From c5d38cb201a1219ca95127cba9c3a778e31966a2 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Thu, 6 Jan 2022 17:35:43 -0600 +Subject: [PATCH 07/15] Refactor: fencer: drop redundant argument from + request_peer_fencing() + +... 
now that the op includes the result +--- + daemons/fenced/fenced_remote.c | 66 +++++++++++++--------------------- + 1 file changed, 25 insertions(+), 41 deletions(-) + +diff --git a/daemons/fenced/fenced_remote.c b/daemons/fenced/fenced_remote.c +index 8d4f53eef6..7fb7695fba 100644 +--- a/daemons/fenced/fenced_remote.c ++++ b/daemons/fenced/fenced_remote.c +@@ -80,8 +80,7 @@ extern xmlNode *stonith_create_op(int call_id, const char *token, const char *op + int call_options); + + static void request_peer_fencing(remote_fencing_op_t *op, +- peer_device_info_t *peer, +- pcmk__action_result_t *result); ++ peer_device_info_t *peer); + static void finalize_op(remote_fencing_op_t *op, xmlNode *data, bool dup); + static void report_timeout_period(remote_fencing_op_t * op, int op_timeout); + static int get_op_total_timeout(const remote_fencing_op_t *op, +@@ -646,18 +645,16 @@ static gboolean + remote_op_timeout_one(gpointer userdata) + { + remote_fencing_op_t *op = userdata; +- pcmk__action_result_t result = PCMK__UNKNOWN_RESULT; + + op->op_timer_one = 0; + + crm_notice("Peer's '%s' action targeting %s for client %s timed out " CRM_XS + " id=%.8s", op->action, op->target, op->client_name, op->id); +- pcmk__set_result(&result, CRM_EX_ERROR, PCMK_EXEC_TIMEOUT, ++ pcmk__set_result(&op->result, CRM_EX_ERROR, PCMK_EXEC_TIMEOUT, + "Peer did not return fence result within timeout"); + +- + // Try another device, if appropriate +- request_peer_fencing(op, NULL, &result); ++ request_peer_fencing(op, NULL); + return FALSE; + } + +@@ -730,13 +727,10 @@ remote_op_query_timeout(gpointer data) + crm_debug("Operation %.8s targeting %s already in progress", + op->id, op->target); + } else if (op->query_results) { +- // Result won't be used in this case, but we need to pass something +- pcmk__action_result_t result = PCMK__UNKNOWN_RESULT; +- + // Query succeeded, so attempt the actual fencing + crm_debug("Query %.8s targeting %s complete (state=%s)", + op->id, op->target, 
stonith_op_state_str(op->state)); +- request_peer_fencing(op, NULL, &result); ++ request_peer_fencing(op, NULL); + } else { + crm_debug("Query %.8s targeting %s timed out (state=%s)", + op->id, op->target, stonith_op_state_str(op->state)); +@@ -1622,11 +1616,10 @@ advance_topology_device_in_level(remote_fencing_op_t *op, const char *device, + op_phase_on(op); + } + +- if (op->devices) { +- pcmk__action_result_t result = PCMK__UNKNOWN_RESULT; +- +- pcmk__set_result(&result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); ++ // This function is only called if the previous device succeeded ++ pcmk__set_result(&op->result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); + ++ if (op->devices) { + /* Necessary devices remain, so execute the next one */ + crm_trace("Next targeting %s on behalf of %s@%s", + op->target, op->client_name, op->originator); +@@ -1636,13 +1629,12 @@ advance_topology_device_in_level(remote_fencing_op_t *op, const char *device, + op->delay = 0; + } + +- request_peer_fencing(op, NULL, &result); ++ request_peer_fencing(op, NULL); + } else { + /* We're done with all devices and phases, so finalize operation */ + crm_trace("Marking complex fencing op targeting %s as complete", + op->target); + op->state = st_done; +- pcmk__set_result(&op->result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); + finalize_op(op, msg, false); + } + } +@@ -1673,13 +1665,9 @@ check_watchdog_fencing_and_wait(remote_fencing_op_t * op) + * \param[in] op Fencing operation to be executed + * \param[in] peer If NULL or topology is in use, choose best peer to execute + * the fencing, otherwise use this peer +- * \param[in] result Full result of previous failed attempt, if any (used as +- * final result only if a previous attempt failed, topology +- * is not in use, and no devices remain to be attempted) + */ + static void +-request_peer_fencing(remote_fencing_op_t *op, peer_device_info_t *peer, +- pcmk__action_result_t *result) ++request_peer_fencing(remote_fencing_op_t *op, peer_device_info_t *peer) + { + const char 
*device = NULL; + int timeout; +@@ -1822,27 +1810,26 @@ request_peer_fencing(remote_fencing_op_t *op, peer_device_info_t *peer, + } + } + +- // This is the only case in which result will be used +- CRM_CHECK(result != NULL, return); +- + if (op->state == st_query) { + crm_info("No peers (out of %d) have devices capable of fencing " + "(%s) %s for client %s " CRM_XS " state=%s", + op->replies, op->action, op->target, op->client_name, + stonith_op_state_str(op->state)); + +- pcmk__reset_result(result); +- pcmk__set_result(result, CRM_EX_ERROR, PCMK_EXEC_NO_FENCE_DEVICE, +- NULL); ++ pcmk__reset_result(&op->result); ++ pcmk__set_result(&op->result, CRM_EX_ERROR, ++ PCMK_EXEC_NO_FENCE_DEVICE, NULL); + } else { + if (pcmk_is_set(op->call_options, st_opt_topology)) { +- pcmk__reset_result(result); +- pcmk__set_result(result, CRM_EX_ERROR, ++ pcmk__reset_result(&op->result); ++ pcmk__set_result(&op->result, CRM_EX_ERROR, + PCMK_EXEC_NO_FENCE_DEVICE, NULL); + } +- /* ... else use result provided by caller -- overwriting it with +- PCMK_EXEC_NO_FENCE_DEVICE would prevent finalize_op() from +- setting the correct delegate if needed. ++ /* ... else use existing result from previous failed attempt ++ * (topology is not in use, and no devices remain to be attempted). ++ * Overwriting the result with PCMK_EXEC_NO_FENCE_DEVICE would ++ * prevent finalize_op() from setting the correct delegate if ++ * needed. 
+ */ + + crm_info("No peers (out of %d) are capable of fencing (%s) %s " +@@ -1852,8 +1839,6 @@ request_peer_fencing(remote_fencing_op_t *op, peer_device_info_t *peer, + } + + op->state = st_failed; +- pcmk__set_result(&op->result, result->exit_status, +- result->execution_status, result->exit_reason); + finalize_op(op, NULL, false); + + } else { +@@ -2104,7 +2089,6 @@ process_remote_stonith_query(xmlNode * msg) + peer_device_info_t *peer = NULL; + uint32_t replies_expected; + xmlNode *dev = get_xpath_object("//@" F_STONITH_REMOTE_OP_ID, msg, LOG_ERR); +- pcmk__action_result_t result = PCMK__UNKNOWN_RESULT; + + CRM_CHECK(dev != NULL, return -EPROTO); + +@@ -2139,7 +2123,7 @@ process_remote_stonith_query(xmlNode * msg) + peer = add_result(op, host, ndevices, dev); + } + +- pcmk__set_result(&result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); ++ pcmk__set_result(&op->result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); + + if (pcmk_is_set(op->call_options, st_opt_topology)) { + /* If we start the fencing before all the topology results are in, +@@ -2148,12 +2132,12 @@ process_remote_stonith_query(xmlNode * msg) + if (op->state == st_query && all_topology_devices_found(op)) { + /* All the query results are in for the topology, start the fencing ops. 
*/ + crm_trace("All topology devices found"); +- request_peer_fencing(op, peer, &result); ++ request_peer_fencing(op, peer); + + } else if (have_all_replies) { + crm_info("All topology query replies have arrived, continuing (%d expected/%d received) ", + replies_expected, op->replies); +- request_peer_fencing(op, NULL, &result); ++ request_peer_fencing(op, NULL); + } + + } else if (op->state == st_query) { +@@ -2165,12 +2149,12 @@ process_remote_stonith_query(xmlNode * msg) + /* we have a verified device living on a peer that is not the target */ + crm_trace("Found %d verified device%s", + nverified, pcmk__plural_s(nverified)); +- request_peer_fencing(op, peer, &result); ++ request_peer_fencing(op, peer); + + } else if (have_all_replies) { + crm_info("All query replies have arrived, continuing (%d expected/%d received) ", + replies_expected, op->replies); +- request_peer_fencing(op, NULL, &result); ++ request_peer_fencing(op, NULL); + + } else { + crm_trace("Waiting for more peer results before launching fencing operation"); +@@ -2336,7 +2320,7 @@ fenced_process_fencing_reply(xmlNode *msg) + crm_trace("Next for %s on behalf of %s@%s (result was: %s)", + op->target, op->originator, op->client_name, + pcmk_exec_status_str(op->result.execution_status)); +- request_peer_fencing(op, NULL, &op->result); ++ request_peer_fencing(op, NULL); + } + + gboolean +-- +2.27.0 + + +From be0a0b652c13161a82b05d3104449b7bfc06e8ac Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Thu, 6 Jan 2022 17:56:24 -0600 +Subject: [PATCH 08/15] Feature: fencer: track full result in fencing history + +Add fencing operation results when creating XML in +stonith_local_history_diff_and_merge(), and parse the results from the received +XML in stonith_xml_history_to_list(). + +With this, the fencer now always has full results in its op list, and returns +them in the reply for STONITH_OP_FENCE_HISTORY requests (though nothing uses +that as of this commit). 
+--- + daemons/fenced/fenced_history.c | 2 ++ + 1 file changed, 2 insertions(+) + +diff --git a/daemons/fenced/fenced_history.c b/daemons/fenced/fenced_history.c +index 5cacf36ca8..3ebf016e67 100644 +--- a/daemons/fenced/fenced_history.c ++++ b/daemons/fenced/fenced_history.c +@@ -257,6 +257,7 @@ stonith_xml_history_to_list(xmlNode *history) + op->completed_nsec = completed_nsec; + crm_element_value_int(xml_op, F_STONITH_STATE, &state); + op->state = (enum op_state) state; ++ stonith__xe_get_result(xml_op, &op->result); + + g_hash_table_replace(rv, id, op); + CRM_LOG_ASSERT(g_hash_table_lookup(rv, id) != NULL); +@@ -355,6 +356,7 @@ stonith_local_history_diff_and_merge(GHashTable *remote_history, + crm_xml_add_ll(entry, F_STONITH_DATE, op->completed); + crm_xml_add_ll(entry, F_STONITH_DATE_NSEC, op->completed_nsec); + crm_xml_add_int(entry, F_STONITH_STATE, op->state); ++ stonith__xe_set_result(entry, &op->result); + } + } + +-- +2.27.0 + + +From afc5292036e212bcfc7475893e0b326b2a69ac58 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Wed, 22 Dec 2021 17:17:21 -0600 +Subject: [PATCH 09/15] API: libstonithd: add exit_reason member to + stonith_history_t + +not yet used, but will be +--- + include/crm/stonith-ng.h | 3 ++- + lib/fencing/st_client.c | 3 ++- + 2 files changed, 4 insertions(+), 2 deletions(-) + +diff --git a/include/crm/stonith-ng.h b/include/crm/stonith-ng.h +index 3fe9cf54f8..2c79bfa579 100644 +--- a/include/crm/stonith-ng.h ++++ b/include/crm/stonith-ng.h +@@ -1,5 +1,5 @@ + /* +- * Copyright 2004-2021 the Pacemaker project contributors ++ * Copyright 2004-2022 the Pacemaker project contributors + * + * The version control history for this file may have further details. 
+ * +@@ -111,6 +111,7 @@ typedef struct stonith_history_s { + time_t completed; + struct stonith_history_s *next; + long completed_nsec; ++ char *exit_reason; + } stonith_history_t; + + typedef struct stonith_s stonith_t; +diff --git a/lib/fencing/st_client.c b/lib/fencing/st_client.c +index 718739b321..57a2e03361 100644 +--- a/lib/fencing/st_client.c ++++ b/lib/fencing/st_client.c +@@ -1,5 +1,5 @@ + /* +- * Copyright 2004-2021 the Pacemaker project contributors ++ * Copyright 2004-2022 the Pacemaker project contributors + * + * The version control history for this file may have further details. + * +@@ -735,6 +735,7 @@ void stonith_history_free(stonith_history_t *history) + free(hp->origin); + free(hp->delegate); + free(hp->client); ++ free(hp->exit_reason); + } + } + +-- +2.27.0 + + +From 1b9e2896322849002a5c0a3a34c9375ea32571d6 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Thu, 6 Jan 2022 18:04:15 -0600 +Subject: [PATCH 10/15] Feature: fencing: return exit reason with fencing + history + +libstonithd's stonith_t:cmds->history() method now parses exit reasons from the +fencer reply, and returns them in the stonith_history_t results. 
+--- + lib/fencing/st_client.c | 6 ++++++ + 1 file changed, 6 insertions(+) + +diff --git a/lib/fencing/st_client.c b/lib/fencing/st_client.c +index 57a2e03361..d229b34805 100644 +--- a/lib/fencing/st_client.c ++++ b/lib/fencing/st_client.c +@@ -698,6 +698,7 @@ stonith_api_history(stonith_t * stonith, int call_options, const char *node, + stonith_history_t *kvp; + long long completed; + long long completed_nsec = 0L; ++ pcmk__action_result_t result = PCMK__UNKNOWN_RESULT; + + kvp = calloc(1, sizeof(stonith_history_t)); + kvp->target = crm_element_value_copy(op, F_STONITH_TARGET); +@@ -711,6 +712,11 @@ stonith_api_history(stonith_t * stonith, int call_options, const char *node, + kvp->completed_nsec = completed_nsec; + crm_element_value_int(op, F_STONITH_STATE, &kvp->state); + ++ stonith__xe_get_result(op, &result); ++ kvp->exit_reason = result.exit_reason; ++ result.exit_reason = NULL; ++ pcmk__reset_result(&result); ++ + if (last) { + last->next = kvp; + } else { +-- +2.27.0 + + +From ba4e77242e9be4ebeb2843b444ee4afad43c29f3 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Fri, 7 Jan 2022 09:44:39 -0600 +Subject: [PATCH 11/15] Feature: fencing: display exit reasons with failed + fencing events + +... when available +--- + lib/fencing/st_output.c | 20 ++++++++++++++++---- + tools/crm_mon_curses.c | 9 +++++++-- + 2 files changed, 23 insertions(+), 6 deletions(-) + +diff --git a/lib/fencing/st_output.c b/lib/fencing/st_output.c +index e484278867..18924d795d 100644 +--- a/lib/fencing/st_output.c ++++ b/lib/fencing/st_output.c +@@ -1,5 +1,5 @@ + /* +- * Copyright 2019-2021 the Pacemaker project contributors ++ * Copyright 2019-2022 the Pacemaker project contributors + * + * The version control history for this file may have further details. 
+ * +@@ -11,6 +11,7 @@ + #include + + #include ++#include + #include + #include + #include +@@ -263,8 +264,12 @@ stonith_event_html(pcmk__output_t *out, va_list args) { + char *failed_s = time_t_string(event->completed); + + out->list_item(out, "failed-stonith-event", +- "%s of %s failed : delegate=%s, client=%s, origin=%s, %s='%s' %s", ++ "%s of %s failed%s%s%s: " ++ "delegate=%s, client=%s, origin=%s, %s='%s' %s", + stonith_action_str(event->action), event->target, ++ (event->exit_reason == NULL)? "" : " (", ++ (event->exit_reason == NULL)? "" : event->exit_reason, ++ (event->exit_reason == NULL)? "" : ")", + event->delegate ? event->delegate : "", + event->client, event->origin, + full_history ? "completed" : "last-failed", +@@ -296,8 +301,13 @@ stonith_event_text(pcmk__output_t *out, va_list args) { + + switch (event->state) { + case st_failed: +- pcmk__indented_printf(out, "%s of %s failed: delegate=%s, client=%s, origin=%s, %s='%s' %s\n", ++ pcmk__indented_printf(out, ++ "%s of %s failed%s%s%s: " ++ "delegate=%s, client=%s, origin=%s, %s='%s' %s\n", + stonith_action_str(event->action), event->target, ++ (event->exit_reason == NULL)? "" : " (", ++ (event->exit_reason == NULL)? "" : event->exit_reason, ++ (event->exit_reason == NULL)? "" : ")", + event->delegate ? event->delegate : "", + event->client, event->origin, + full_history ? 
"completed" : "last-failed", buf, +@@ -341,7 +351,9 @@ stonith_event_xml(pcmk__output_t *out, va_list args) { + + switch (event->state) { + case st_failed: +- crm_xml_add(node, "status", "failed"); ++ pcmk__xe_set_props(node, "status", "failed", ++ XML_LRM_ATTR_EXIT_REASON, event->exit_reason, ++ NULL); + break; + + case st_done: +diff --git a/tools/crm_mon_curses.c b/tools/crm_mon_curses.c +index bae3710c44..73c8516a8c 100644 +--- a/tools/crm_mon_curses.c ++++ b/tools/crm_mon_curses.c +@@ -1,5 +1,5 @@ + /* +- * Copyright 2019-2021 the Pacemaker project contributors ++ * Copyright 2019-2022 the Pacemaker project contributors + * + * The version control history for this file may have further details. + * +@@ -463,8 +463,13 @@ stonith_event_console(pcmk__output_t *out, va_list args) { + + switch (event->state) { + case st_failed: +- curses_indented_printf(out, "%s of %s failed: delegate=%s, client=%s, origin=%s, %s='%s'%s\n", ++ curses_indented_printf(out, ++ "%s of %s failed%s%s%s: " ++ "delegate=%s, client=%s, origin=%s, %s='%s' %s\n", + stonith_action_str(event->action), event->target, ++ (event->exit_reason == NULL)? "" : " (", ++ (event->exit_reason == NULL)? "" : event->exit_reason, ++ (event->exit_reason == NULL)? "" : ")", + event->delegate ? event->delegate : "", + event->client, event->origin, + full_history ? 
"completed" : "last-failed", buf, +-- +2.27.0 + + +From 8105fb4a3a786780fdf85b3d0308eaf6df1ea434 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Fri, 7 Jan 2022 09:45:22 -0600 +Subject: [PATCH 12/15] Low: schemas: copy fence-event API schema in + preparation for changes + +--- + include/crm/common/output_internal.h | 2 +- + xml/api/fence-event-2.15.rng | 33 ++++++++++++++++++++++++++++ + 2 files changed, 34 insertions(+), 1 deletion(-) + create mode 100644 xml/api/fence-event-2.15.rng + +diff --git a/include/crm/common/output_internal.h b/include/crm/common/output_internal.h +index 479f0e4b43..8c5dcee17c 100644 +--- a/include/crm/common/output_internal.h ++++ b/include/crm/common/output_internal.h +@@ -27,7 +27,7 @@ extern "C" { + # include + # include + +-# define PCMK__API_VERSION "2.14" ++# define PCMK__API_VERSION "2.15" + + #if defined(PCMK__WITH_ATTRIBUTE_OUTPUT_ARGS) + # define PCMK__OUTPUT_ARGS(ARGS...) __attribute__((output_args(ARGS))) +diff --git a/xml/api/fence-event-2.15.rng b/xml/api/fence-event-2.15.rng +new file mode 100644 +index 0000000000..e54687cd25 +--- /dev/null ++++ b/xml/api/fence-event-2.15.rng +@@ -0,0 +1,33 @@ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ failed ++ success ++ pending ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ +-- +2.27.0 + + +From 46dd9b74d2ee8f7ab70a0c7fe3a998954d4029e8 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Fri, 7 Jan 2022 09:47:16 -0600 +Subject: [PATCH 13/15] Low: schemas: update fence-event API schema for recent + change + +--- + xml/api/fence-event-2.15.rng | 3 +++ + 1 file changed, 3 insertions(+) + +diff --git a/xml/api/fence-event-2.15.rng b/xml/api/fence-event-2.15.rng +index e54687cd25..8e000cafa5 100644 +--- a/xml/api/fence-event-2.15.rng ++++ b/xml/api/fence-event-2.15.rng +@@ -18,6 +18,9 @@ + + + ++ ++ ++ + + + +-- +2.27.0 + + +From 350e71772f67f28af6b67f864cbabc481730035c Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Fri, 7 Jan 2022 11:32:09 -0600 +Subject: [PATCH 14/15] Build: 
libstonithd: bump shared library version + +... for stonith_history_t change since 2.1.2. + +The struct should only ever be returned by the library as a pointer, so the +changes can be considered backward-compatible. Normally we wouldn't bump shared +library versions mid-cycle, but this will simplify expected backports of this +change. +--- + lib/fencing/Makefile.am | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/lib/fencing/Makefile.am b/lib/fencing/Makefile.am +index 1ffa3e051b..a10ddb88ec 100644 +--- a/lib/fencing/Makefile.am ++++ b/lib/fencing/Makefile.am +@@ -2,7 +2,7 @@ + # Original Author: Sun Jiang Dong + # Copyright 2004 International Business Machines + # +-# with later changes copyright 2004-2021 the Pacemaker project contributors. ++# with later changes copyright 2004-2022 the Pacemaker project contributors. + # The version control history for this file may have further details. + # + # This source code is licensed under the GNU General Public License version 2 +@@ -14,7 +14,7 @@ noinst_HEADERS = fencing_private.h + + lib_LTLIBRARIES = libstonithd.la + +-libstonithd_la_LDFLAGS = -version-info 33:0:7 ++libstonithd_la_LDFLAGS = -version-info 34:0:8 + + libstonithd_la_CFLAGS = $(CFLAGS_HARDENED_LIB) + libstonithd_la_LDFLAGS += $(LDFLAGS_HARDENED_LIB) +-- +2.27.0 + + +From 63ea88620a62ff0759560a02bb5e284ebdd03eb6 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Wed, 19 Jan 2022 16:53:45 -0600 +Subject: [PATCH 15/15] Low: fencer: reset op result before grabbing new one + +just in case +--- + daemons/fenced/fenced_remote.c | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/daemons/fenced/fenced_remote.c b/daemons/fenced/fenced_remote.c +index 7fb7695fba..dc4649e0fc 100644 +--- a/daemons/fenced/fenced_remote.c ++++ b/daemons/fenced/fenced_remote.c +@@ -2219,6 +2219,7 @@ fenced_process_fencing_reply(xmlNode *msg) + return; + } + ++ pcmk__reset_result(&op->result); + op->result = result; // The operation takes ownership of the 
result + + if (op->devices && device && !pcmk__str_eq(op->devices->data, device, pcmk__str_casei)) { +-- +2.27.0 + diff --git a/SOURCES/023-memory-leak.patch b/SOURCES/023-memory-leak.patch new file mode 100644 index 0000000..3970dd3 --- /dev/null +++ b/SOURCES/023-memory-leak.patch @@ -0,0 +1,82 @@ +From 8034a203bbff0aa3b53f2946dc58e409bd7246c9 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Thu, 20 Jan 2022 15:03:31 -0600 +Subject: [PATCH] Fix: scheduler: avoid memory leak when displaying clones + +Previously, pe__clone_default() unconditionally created a hash table for +stopped instances, but didn't free it in every code path. + +Now, only create the table when we have something to put in it and might +actually use it, and ensure it always gets freed. +--- + lib/pengine/clone.c | 18 +++++++++++++----- + 1 file changed, 13 insertions(+), 5 deletions(-) + +diff --git a/lib/pengine/clone.c b/lib/pengine/clone.c +index 742e2920b0..920a04c32c 100644 +--- a/lib/pengine/clone.c ++++ b/lib/pengine/clone.c +@@ -761,7 +761,7 @@ pe__clone_default(pcmk__output_t *out, va_list args) + GList *only_node = va_arg(args, GList *); + GList *only_rsc = va_arg(args, GList *); + +- GHashTable *stopped = pcmk__strkey_table(free, free); ++ GHashTable *stopped = NULL; + + char *list_text = NULL; + size_t list_text_len = 0; +@@ -818,7 +818,11 @@ pe__clone_default(pcmk__output_t *out, va_list args) + } else if (partially_active == FALSE) { + // List stopped instances when requested (except orphans) + if (!pcmk_is_set(child_rsc->flags, pe_rsc_orphan) ++ && !pcmk_is_set(show_opts, pcmk_show_clone_detail) + && pcmk_is_set(show_opts, pcmk_show_inactive_rscs)) { ++ if (stopped == NULL) { ++ stopped = pcmk__strkey_table(free, free); ++ } + g_hash_table_insert(stopped, strdup(child_rsc->id), strdup("Stopped")); + } + +@@ -873,7 +877,6 @@ pe__clone_default(pcmk__output_t *out, va_list args) + } + + if (pcmk_is_set(show_opts, pcmk_show_clone_detail)) { +- g_hash_table_destroy(stopped); + 
PCMK__OUTPUT_LIST_FOOTER(out, rc); + return pcmk_rc_ok; + } +@@ -948,8 +951,10 @@ pe__clone_default(pcmk__output_t *out, va_list args) + GList *list = g_hash_table_get_values(rsc->allowed_nodes); + + /* Custom stopped table for non-unique clones */ +- g_hash_table_destroy(stopped); +- stopped = pcmk__strkey_table(free, free); ++ if (stopped != NULL) { ++ g_hash_table_destroy(stopped); ++ stopped = NULL; ++ } + + if (list == NULL) { + /* Clusters with symmetrical=false haven't calculated allowed_nodes yet +@@ -972,6 +977,9 @@ pe__clone_default(pcmk__output_t *out, va_list args) + state = "Stopped (disabled)"; + } + ++ if (stopped == NULL) { ++ stopped = pcmk__strkey_table(free, free); ++ } + if (probe_op != NULL) { + int rc; + +@@ -987,7 +995,7 @@ pe__clone_default(pcmk__output_t *out, va_list args) + g_list_free(list); + } + +- if (g_hash_table_size(stopped) > 0) { ++ if (stopped != NULL) { + GList *list = sorted_hash_table_values(stopped); + + clone_header(out, &rc, rsc, clone_data); +-- +2.27.0 + diff --git a/SOURCES/024-daemon-tracking.patch b/SOURCES/024-daemon-tracking.patch new file mode 100644 index 0000000..d9e15e2 --- /dev/null +++ b/SOURCES/024-daemon-tracking.patch @@ -0,0 +1,108 @@ +From ac92690d8426ec4d1c8be1e0eb4b9289411afe75 Mon Sep 17 00:00:00 2001 +From: Klaus Wenninger +Date: Mon, 24 Jan 2022 12:18:42 +0100 +Subject: [PATCH] Fix: pacemakerd: have signal-handler take care of lost + processes + +regression from introduction of periodic subdaemon checking +in cases they are pacemakerd children - previously it was either +periodic checking or signal-handler per process. 
+--- + daemons/pacemakerd/pcmkd_subdaemons.c | 38 ++++++++++++++++----------- + 1 file changed, 22 insertions(+), 16 deletions(-) + +diff --git a/daemons/pacemakerd/pcmkd_subdaemons.c b/daemons/pacemakerd/pcmkd_subdaemons.c +index c03903c99e..84ecdc1ee8 100644 +--- a/daemons/pacemakerd/pcmkd_subdaemons.c ++++ b/daemons/pacemakerd/pcmkd_subdaemons.c +@@ -141,7 +141,6 @@ check_active_before_startup_processes(gpointer user_data) + switch (rc) { + case pcmk_rc_ok: + pcmk_children[next_child].check_count = 0; +- next_child++; + subdaemon_check_progress = time(NULL); + break; + case pcmk_rc_ipc_pid_only: // This case: it was previously OK +@@ -178,9 +177,27 @@ check_active_before_startup_processes(gpointer user_data) + /* go to the next child and see if + we can make progress there + */ +- next_child++; + break; + case pcmk_rc_ipc_unresponsive: ++ if (!pcmk_children[next_child].respawn) { ++ /* if a subdaemon is down and we don't want it ++ to be restarted this is a success during ++ shutdown. if it isn't restarted anymore ++ due to MAX_RESPAWN it is ++ rather no success. ++ */ ++ if (pcmk_children[next_child].respawn_count <= MAX_RESPAWN) { ++ subdaemon_check_progress = time(NULL); ++ } ++ } ++ if (!pcmk_children[next_child].active_before_startup) { ++ crm_trace("found %s[%lld] missing - signal-handler " ++ "will take care of it", ++ pcmk_children[next_child].name, ++ (long long) PCMK__SPECIAL_PID_AS_0( ++ pcmk_children[next_child].pid)); ++ break; ++ } + if (pcmk_children[next_child].respawn) { + crm_err("%s[%lld] terminated", + pcmk_children[next_child].name, +@@ -194,24 +211,13 @@ check_active_before_startup_processes(gpointer user_data) + pcmk_children[next_child].pid)); + } + pcmk_process_exit(&(pcmk_children[next_child])); +- if (!pcmk_children[next_child].respawn) { +- /* if a subdaemon is down and we don't want it +- to be restarted this is a success during +- shutdown. if it isn't restarted anymore +- due to MAX_RESPAWN it is +- rather no success. 
+- */ +- if (pcmk_children[next_child].respawn_count <= MAX_RESPAWN) { +- subdaemon_check_progress = time(NULL); +- } +- next_child++; +- } + break; + default: + crm_exit(CRM_EX_FATAL); + break; /* static analysis/noreturn */ + } + ++ next_child++; + if (next_child >= PCMK__NELEM(pcmk_children)) { + next_child = 0; + } +@@ -285,6 +291,7 @@ pcmk_process_exit(pcmk_child_t * child) + { + child->pid = 0; + child->active_before_startup = false; ++ child->check_count = 0; + + child->respawn_count += 1; + if (child->respawn_count > MAX_RESPAWN) { +@@ -307,8 +314,6 @@ pcmk_process_exit(pcmk_child_t * child) + crm_warn("One-off suppressing strict respawning of a child process %s," + " appears alright per %s IPC end-point", + child->name, child->endpoint); +- /* need to monitor how it evolves, and start new process if badly */ +- child->active_before_startup = true; + + } else { + if (child->needs_cluster && !pcmkd_cluster_connected()) { +@@ -422,6 +427,7 @@ start_child(pcmk_child_t * child) + const char *env_callgrind = getenv("PCMK_callgrind_enabled"); + + child->active_before_startup = false; ++ child->check_count = 0; + + if (child->command == NULL) { + crm_info("Nothing to do for child \"%s\"", child->name); +-- +2.27.0 + diff --git a/SOURCES/025-regression.patch b/SOURCES/025-regression.patch new file mode 100644 index 0000000..62d2a46 --- /dev/null +++ b/SOURCES/025-regression.patch @@ -0,0 +1,30 @@ +From 16928cfc69136bc56b1574bee9966e0d5de73abd Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Wed, 26 Jan 2022 09:15:43 -0600 +Subject: [PATCH] Fix: controller: correctly match "node down" events + +regression introduced in 2.1.2 by 03ce7376e + +The symptom that led to this was that removing a remote node connection +resource would lead to the remote node getting fenced when the connection stop +was not recognized as an expected down event. 
+--- + daemons/controld/controld_te_events.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/daemons/controld/controld_te_events.c b/daemons/controld/controld_te_events.c +index 36fd832ba0..1fd7129922 100644 +--- a/daemons/controld/controld_te_events.c ++++ b/daemons/controld/controld_te_events.c +@@ -304,7 +304,7 @@ match_down_event(const char *target) + gIter2 = gIter2->next) { + + match = (crm_action_t*)gIter2->data; +- if (pcmk_is_set(match->flags, pcmk__graph_action_confirmed)) { ++ if (pcmk_is_set(match->flags, pcmk__graph_action_executed)) { + xpath_ret = xpath_search(match->xml, xpath); + if (numXpathResults(xpath_ret) < 1) { + match = NULL; +-- +2.27.0 + diff --git a/SPECS/pacemaker.spec b/SPECS/pacemaker.spec new file mode 100644 index 0000000..26b7bf4 --- /dev/null +++ b/SPECS/pacemaker.spec @@ -0,0 +1,1492 @@ +# User-configurable globals and defines to control package behavior +# (these should not test {with X} values, which are declared later) + +## User and group to use for nonprivileged services +%global uname hacluster +%global gname haclient + +## Where to install Pacemaker documentation +%if 0%{?suse_version} > 0 +%global pcmk_docdir %{_docdir}/%{name}-%{version} +%else +%if 0%{?rhel} > 7 +%global pcmk_docdir %{_docdir}/%{name}-doc +%else +%global pcmk_docdir %{_docdir}/%{name} +%endif +%endif + +## GitHub entity that distributes source (for ease of using a fork) +%global github_owner ClusterLabs + +## Where bug reports should be submitted +## Leave bug_url undefined to use ClusterLabs default, others define it here +%if 0%{?rhel} +%global bug_url https://bugzilla.redhat.com/ +%else +%if 0%{?fedora} +%global bug_url https://bugz.fedoraproject.org/%{name} +%endif +%endif + +## What to use as the OCF resource agent root directory +%global ocf_root %{_prefix}/lib/ocf + +## Upstream pacemaker version, and its package version (specversion +## can be incremented to build packages reliably considered "newer" +## than previously 
built packages with the same pcmkversion) +%global pcmkversion 2.1.2 +%global specversion 4 + +## Upstream commit (full commit ID, abbreviated commit ID, or tag) to build +%global commit ada5c3b36e2adf1703d54d39f40a4b8628eca175 + +## Since git v2.11, the extent of abbreviation is autoscaled by default +## (used to be constant of 7), so we need to convey it for non-tags, too. +%global commit_abbrev 9 + + +# Define conditionals so that "rpmbuild --with <feature>" and +# "rpmbuild --without <feature>" can enable and disable specific features + +## Add option to enable support for stonith/external fencing agents +%bcond_with stonithd + +## Add option for whether to support storing sensitive information outside CIB +%if (0%{?fedora} && 0%{?fedora} <= 33) || (0%{?rhel} && 0%{?rhel} <= 8) +%bcond_with cibsecrets +%else +%bcond_without cibsecrets +%endif + +## Add option to create binaries suitable for use with profiling tools +%bcond_with profiling + +## Add option to create binaries with coverage analysis +%bcond_with coverage + +## Add option to skip (or enable, on RHEL) generating documentation +## (the build tools aren't available everywhere) +%if 0%{?rhel} +%bcond_with doc +%else +%bcond_without doc +%endif + +## Add option to default to start-up synchronization with SBD. +## +## If enabled, SBD *MUST* be built to default similarly, otherwise data +## corruption could occur. Building both Pacemaker and SBD to default +## to synchronization improves safety, without requiring higher-level tools +## to be aware of the setting or requiring users to modify configurations +## after upgrading to versions that support synchronization. +%if 0%{?rhel} && 0%{?rhel} > 8 +%bcond_without sbd_sync +%else +%bcond_with sbd_sync +%endif + +## Add option to prefix package version with "0."
+## (so later "official" packages will be considered updates) +%bcond_with pre_release + +## NOTE: skip --with upstart_job + +## Add option to turn off hardening of libraries and daemon executables +%bcond_without hardening + +## Add option to enable (or disable, on RHEL 8) links for legacy daemon names +%if 0%{?rhel} && 0%{?rhel} <= 8 +%bcond_without legacy_links +%else +%bcond_with legacy_links +%endif + +## Nagios source control identifiers +%global nagios_name nagios-agents-metadata +%global nagios_hash 105ab8a7b2c16b9a29cf1c1596b80136eeef332b +%global nagios_archive_github_url %{nagios_hash}#/%{nagios_name}-%{nagios_hash}.tar.gz + +# Define globals for convenient use later + +## Workaround to use parentheses in other globals +%global lparen ( +%global rparen ) + +## Whether this is a tagged release (final or release candidate) +%define tag_release %(c=%{commit}; case ${c} in Pacemaker-*%{rparen} echo 1 ;; + *%{rparen} echo 0 ;; esac) + +## Portion of export/dist tarball name after "pacemaker-", and release version +%if 0%{tag_release} +%define archive_version %(c=%{commit}; echo ${c:10}) +%define archive_github_url %{commit}#/%{name}-%{archive_version}.tar.gz +%else +%define archive_version %(c=%{commit}; echo ${c:0:%{commit_abbrev}}) +%define archive_github_url %{archive_version}#/%{name}-%{archive_version}.tar.gz +%endif +### Always use a simple release number +%define pcmk_release %{specversion} + +%if 0%{?fedora} > 20 || 0%{?rhel} > 7 +## Base GnuTLS cipher priorities (presumably only the initial, required keyword) +## overridable with "rpmbuild --define 'pcmk_gnutls_priorities PRIORITY-SPEC'" +%define gnutls_priorities %{?pcmk_gnutls_priorities}%{!?pcmk_gnutls_priorities:@SYSTEM} +%endif + +%if 0%{?fedora} > 22 || 0%{?rhel} > 7 +%global supports_recommends 1 +%endif + +## Different distros name certain packages differently +## (note: corosync libraries also differ, but all provide corosync-devel) +%if 0%{?suse_version} > 0 +%global pkgname_bzip2_devel 
libbz2-devel +%global pkgname_docbook_xsl docbook-xsl-stylesheets +%global pkgname_gnutls_devel libgnutls-devel +%global pkgname_shadow_utils shadow +%global pkgname_procps procps +%global pkgname_glue_libs libglue +%global pkgname_pcmk_libs lib%{name}3 +%global hacluster_id 90 +%else +%global pkgname_libtool_devel libtool-ltdl-devel +%global pkgname_libtool_devel_arch libtool-ltdl-devel%{?_isa} +%global pkgname_bzip2_devel bzip2-devel +%global pkgname_docbook_xsl docbook-style-xsl +%global pkgname_gnutls_devel gnutls-devel +%global pkgname_shadow_utils shadow-utils +%global pkgname_procps procps-ng +%global pkgname_glue_libs cluster-glue-libs +%global pkgname_pcmk_libs %{name}-libs +%global hacluster_id 189 +%endif + +## Distro-specific configuration choices + +### Use 2.0-style output when other distro packages don't support current output +%if 0%{?fedora} || ( 0%{?rhel} && 0%{?rhel} <= 8 ) +%global compat20 --enable-compat-2.0 +%endif + +### Default concurrent-fencing to true when distro prefers that +%if 0%{?rhel} >= 7 +%global concurrent_fencing --with-concurrent-fencing-default=true +%endif + +### Default resource-stickiness to 1 when distro prefers that +%if 0%{?fedora} >= 35 || 0%{?rhel} >= 9 +%global resource_stickiness --with-resource-stickiness-default=1 +%endif + + +# Python-related definitions + +## Turn off auto-compilation of Python files outside Python specific paths, +## so there's no risk that unexpected "__python" macro gets picked to do the +## RPM-native byte-compiling there (only "{_datadir}/pacemaker/tests" affected) +## -- distro-dependent tricks or automake's fallback to be applied there +%if %{defined _python_bytecompile_extra} +%global _python_bytecompile_extra 0 +%else +### the statement effectively means no RPM-native byte-compiling will occur at +### all, so distro-dependent tricks for Python-specific packages to be applied +%global __os_install_post %(echo '%{__os_install_post}' | { + sed -e 
's!/usr/lib[^[:space:]]*/brp-python-bytecompile[[:space:]].*$!!g'; }) +%endif + +## Prefer Python 3 definitions explicitly, in case 2 is also available +%if %{defined __python3} +%global python_name python3 +%global python_path %{__python3} +%define python_site %{?python3_sitelib}%{!?python3_sitelib:%( + %{python_path} -c 'from distutils.sysconfig import get_python_lib as gpl; print(gpl(1))' 2>/dev/null)} +%else +%if %{defined python_version} +%global python_name python%(echo %{python_version} | cut -d'.' -f1) +%define python_path %{?__python}%{!?__python:/usr/bin/%{python_name}} +%else +%global python_name python +%global python_path %{?__python}%{!?__python:/usr/bin/python%{?python_pkgversion}} +%endif +%define python_site %{?python_sitelib}%{!?python_sitelib:%( + %{python_name} -c 'from distutils.sysconfig import get_python_lib as gpl; print(gpl(1))' 2>/dev/null)} +%endif + + +# Keep sane profiling data if requested +%if %{with profiling} + +## Disable -debuginfo package and stripping binaries/libraries +%define debug_package %{nil} + +%endif + + +Name: pacemaker +Summary: Scalable High-Availability cluster resource manager +Version: %{pcmkversion} +Release: %{pcmk_release}%{?dist} +License: GPLv2+ and LGPLv2+ +Url: https://www.clusterlabs.org/ + +# Example: https://codeload.github.com/ClusterLabs/pacemaker/tar.gz/e91769e +# will download pacemaker-e91769e.tar.gz +# +# The ending part starting with '#' is ignored by github but necessary for +# rpmbuild to know what the tar archive name is. (The downloaded file will be +# named correctly only for commit IDs, not tagged releases.) +# +# You can use "spectool -s 0 pacemaker.spec" (rpmdevtools) to show final URL. 
+Source0: https://codeload.github.com/%{github_owner}/%{name}/tar.gz/%{archive_github_url} +Source1: https://codeload.github.com/%{github_owner}/%{nagios_name}/tar.gz/%{nagios_archive_github_url} + +# upstream commits +Patch1: 001-acl-group-schema.patch +Patch2: 002-fencing-reasons.patch +Patch3: 003-fencing-reasons.patch +Patch4: 004-systemd-metadata.patch +Patch5: 005-fencing-reasons.patch +Patch6: 006-stateful-metadata.patch +Patch7: 007-memory-leak.patch +Patch8: 008-fencing-history.patch +Patch9: 009-fencing-reasons.patch +Patch10: 010-probe-failures.patch +Patch11: 011-fencing-reasons.patch +Patch12: 012-notify-crash.patch +Patch13: 013-probe-failures.patch +Patch14: 014-pcmk_delay_base.patch +Patch15: 015-fencing-reasons.patch +Patch16: 016-fencing-crash.patch +Patch17: 017-fencing-reasons.patch +Patch18: 018-failure-messages.patch +Patch19: 019-corosync-tracking.patch +Patch20: 020-systemd-unit.patch +Patch21: 021-daemon-tracking.patch +Patch22: 022-failure-messages.patch +Patch23: 023-memory-leak.patch +Patch24: 024-daemon-tracking.patch +Patch25: 025-regression.patch + +Requires: resource-agents +Requires: %{pkgname_pcmk_libs}%{?_isa} = %{version}-%{release} +Requires: %{name}-cluster-libs%{?_isa} = %{version}-%{release} +Requires: %{name}-cli = %{version}-%{release} +%{?systemd_requires} + +%if %{defined centos} +ExclusiveArch: aarch64 i686 ppc64le s390x x86_64 %{arm} +%else +%if 0%{?rhel} +ExclusiveArch: aarch64 i686 ppc64le s390x x86_64 +%endif +%endif + +Requires: %{python_path} +BuildRequires: %{python_name}-devel + +# Pacemaker requires a minimum libqb functionality +# RHEL requires a higher version than upstream, for qb_ipcc_connect_async() +Requires: libqb >= 2.0.3-7 +BuildRequires: libqb-devel >= 2.0.3-7 + +# Required basic build tools +BuildRequires: autoconf +BuildRequires: automake +BuildRequires: coreutils +BuildRequires: findutils +BuildRequires: gcc +BuildRequires: grep +BuildRequires: libtool +%if %{defined pkgname_libtool_devel} 
+BuildRequires: %{?pkgname_libtool_devel} +%endif +BuildRequires: make +BuildRequires: pkgconfig +BuildRequires: sed + +# Required for core functionality +BuildRequires: pkgconfig(glib-2.0) >= 2.42 +BuildRequires: libxml2-devel +BuildRequires: libxslt-devel +BuildRequires: libuuid-devel +BuildRequires: %{pkgname_bzip2_devel} + +# Enables optional functionality +BuildRequires: pkgconfig(dbus-1) +BuildRequires: %{pkgname_docbook_xsl} +BuildRequires: %{pkgname_gnutls_devel} +BuildRequires: help2man +BuildRequires: ncurses-devel +BuildRequires: pam-devel + +# Required for "make check" +BuildRequires: libcmocka-devel + +BuildRequires: pkgconfig(systemd) + +# RH patches are created by git, so we need git to apply them +BuildRequires: git + +# The RHEL 9 build root has corosync_cfg_trackstart() available, so +# Pacemaker's configure script will build support for it. Add a hard dependency +# to ensure users have compatible Corosync libraries if they upgrade Pacemaker. +Requires: corosync >= 3.1.1 +BuildRequires: corosync-devel >= 3.1.1 + +%if %{with stonithd} +BuildRequires: %{pkgname_glue_libs}-devel +%endif + +%if %{with doc} +BuildRequires: asciidoc +BuildRequires: inkscape +BuildRequires: %{python_name}-sphinx +%endif + +Provides: pcmk-cluster-manager = %{version}-%{release} +Provides: pcmk-cluster-manager%{?_isa} = %{version}-%{release} + +# Bundled bits +## Pacemaker uses the crypto/md5-buffer module from gnulib +%if 0%{?fedora} || 0%{?rhel} +Provides: bundled(gnulib) = 20200404 +%endif + +%description +Pacemaker is an advanced, scalable High-Availability cluster resource +manager. + +It supports more than 16 node clusters with significant capabilities +for managing resources and dependencies. + +It will run scripts at initialization, when machines go up or down, +when related resources fail and can be configured to periodically check +resource health. 
+ +Available rpmbuild rebuild options: + --with(out) : cibsecrets coverage doc hardening pre_release profiling stonithd + +%package cli +License: GPLv2+ and LGPLv2+ +Summary: Command line tools for controlling Pacemaker clusters +Requires: %{pkgname_pcmk_libs}%{?_isa} = %{version}-%{release} +%if 0%{?supports_recommends} +Recommends: pcmk-cluster-manager = %{version}-%{release} +# For crm_report +Requires: tar +Requires: bzip2 +%endif +Requires: perl-TimeDate +Requires: %{pkgname_procps} +Requires: psmisc +Requires(post):coreutils + +%description cli +Pacemaker is an advanced, scalable High-Availability cluster resource +manager. + +The %{name}-cli package contains command line tools that can be used +to query and control the cluster from machines that may, or may not, +be part of the cluster. + +%package -n %{pkgname_pcmk_libs} +License: GPLv2+ and LGPLv2+ +Summary: Core Pacemaker libraries +Requires(pre): %{pkgname_shadow_utils} +Requires: %{name}-schemas = %{version}-%{release} +# sbd 1.4.0+ supports the libpe_status API for pe_working_set_t +# sbd 1.4.2+ supports startup/shutdown handshake via pacemakerd-api +# sbd 1.5.0+ supports handshake defaults to enabled in this spec +Conflicts: sbd < 1.5.0 + +%description -n %{pkgname_pcmk_libs} +Pacemaker is an advanced, scalable High-Availability cluster resource +manager. + +The %{pkgname_pcmk_libs} package contains shared libraries needed for cluster +nodes and those just running the CLI tools. + +%package cluster-libs +License: GPLv2+ and LGPLv2+ +Summary: Cluster Libraries used by Pacemaker +Requires: %{pkgname_pcmk_libs}%{?_isa} = %{version}-%{release} + +%description cluster-libs +Pacemaker is an advanced, scalable High-Availability cluster resource +manager. + +The %{name}-cluster-libs package contains cluster-aware shared +libraries needed for nodes that will form part of the cluster.
+ +%package remote +License: GPLv2+ and LGPLv2+ +Summary: Pacemaker remote executor daemon for non-cluster nodes +Requires: %{pkgname_pcmk_libs}%{?_isa} = %{version}-%{release} +Requires: %{name}-cli = %{version}-%{release} +Requires: resource-agents +# -remote can be fully independent of systemd +%{?systemd_ordering}%{!?systemd_ordering:%{?systemd_requires}} +Provides: pcmk-cluster-manager = %{version}-%{release} +Provides: pcmk-cluster-manager%{?_isa} = %{version}-%{release} + +%description remote +Pacemaker is an advanced, scalable High-Availability cluster resource +manager. + +The %{name}-remote package contains the Pacemaker Remote daemon +which is capable of extending pacemaker functionality to remote +nodes not running the full corosync/cluster stack. + +%package -n %{pkgname_pcmk_libs}-devel +License: GPLv2+ and LGPLv2+ +Summary: Pacemaker development package +Requires: %{pkgname_pcmk_libs}%{?_isa} = %{version}-%{release} +Requires: %{name}-cluster-libs%{?_isa} = %{version}-%{release} +Requires: %{pkgname_bzip2_devel}%{?_isa} +Requires: corosync-devel%{?_isa} >= 2.0.0 +Requires: glib2-devel%{?_isa} +Requires: libqb-devel%{?_isa} +%if %{defined pkgname_libtool_devel_arch} +Requires: %{?pkgname_libtool_devel_arch} +%endif +Requires: libuuid-devel%{?_isa} +Requires: libxml2-devel%{?_isa} +Requires: libxslt-devel%{?_isa} + +%description -n %{pkgname_pcmk_libs}-devel +Pacemaker is an advanced, scalable High-Availability cluster resource +manager. + +The %{pkgname_pcmk_libs}-devel package contains headers and shared libraries +for developing tools for Pacemaker. 
+ +%package cts +License: GPLv2+ and LGPLv2+ +Summary: Test framework for cluster-related technologies like Pacemaker +Requires: %{python_path} +Requires: %{pkgname_pcmk_libs} = %{version}-%{release} +Requires: %{name}-cli = %{version}-%{release} +Requires: %{pkgname_procps} +Requires: psmisc +BuildArch: noarch + +# systemd Python bindings are a separate package in some distros +%if %{defined systemd_requires} +%if 0%{?fedora} > 22 || 0%{?rhel} > 7 +Requires: %{python_name}-systemd +%endif +%endif + +%description cts +Test framework for cluster-related technologies like Pacemaker + +%package doc +License: CC-BY-SA-4.0 +Summary: Documentation for Pacemaker +BuildArch: noarch + +%description doc +Documentation for Pacemaker. + +Pacemaker is an advanced, scalable High-Availability cluster resource +manager. + +%package schemas +License: GPLv2+ +Summary: Schemas and upgrade stylesheets for Pacemaker +BuildArch: noarch + +%description schemas +Schemas and upgrade stylesheets for Pacemaker + +Pacemaker is an advanced, scalable High-Availability cluster resource +manager. + +%package nagios-plugins-metadata +License: GPLv3 +Summary: Pacemaker Nagios Metadata +BuildArch: noarch +# NOTE below are the plugins this metadata uses. +# Requires: nagios-plugins-http +# Requires: nagios-plugins-ldap +# Requires: nagios-plugins-mysql +# Requires: nagios-plugins-pgsql +# Requires: nagios-plugins-tcp +Requires: pcmk-cluster-manager + +%description nagios-plugins-metadata +The metadata files required for Pacemaker to execute the nagios plugin +monitor resources. 
+ +%prep +%autosetup -a 1 -n %{name}-%{archive_version} -S git_am -p 1 + +%build + +export systemdsystemunitdir=%{?_unitdir}%{!?_unitdir:no} + +%if %{with hardening} +# prefer distro-provided hardening flags in case they are defined +# through _hardening_{c,ld}flags macros, configure script will +# use its own defaults otherwise; if such hardenings are completely +# undesired, rpmbuild using "--without hardening" +# (or "--define '_without_hardening 1'") +export CFLAGS_HARDENED_EXE="%{?_hardening_cflags}" +export CFLAGS_HARDENED_LIB="%{?_hardening_cflags}" +export LDFLAGS_HARDENED_EXE="%{?_hardening_ldflags}" +export LDFLAGS_HARDENED_LIB="%{?_hardening_ldflags}" +%endif + +./autogen.sh + +%{configure} \ + PYTHON=%{python_path} \ + %{!?with_hardening: --disable-hardening} \ + %{?with_legacy_links: --enable-legacy-links} \ + %{?with_profiling: --with-profiling} \ + %{?with_coverage: --with-coverage} \ + %{?with_cibsecrets: --with-cibsecrets} \ + %{?with_sbd_sync: --with-sbd-sync-default="true"} \ + %{?gnutls_priorities: --with-gnutls-priorities="%{gnutls_priorities}"} \ + %{?bug_url: --with-bug-url=%{bug_url}} \ + %{?ocf_root: --with-ocfdir=%{ocf_root}} \ + %{?concurrent_fencing} \ + %{?resource_stickiness} \ + %{?compat20} \ + --disable-static \ + --with-initdir=%{_initrddir} \ + --with-runstatedir=%{_rundir} \ + --localstatedir=%{_var} \ + --with-nagios \ + --with-nagios-metadata-dir=%{_datadir}/pacemaker/nagios/plugins-metadata/ \ + --with-nagios-plugin-dir=%{_libdir}/nagios/plugins/ \ + --with-version=%{version}-%{release} + +make %{_smp_mflags} V=1 + +%check +make %{_smp_mflags} check +{ cts/cts-scheduler --run load-stopped-loop \ + && cts/cts-cli \ + && touch .CHECKED +} 2>&1 | sed 's/[fF]ail/faiil/g' # prevent false positives in rpmlint +[ -f .CHECKED ] && rm -f -- .CHECKED + +%install +# skip automake-native Python byte-compilation, since RPM-native one (possibly +# distro-confined to Python-specific directories, which is currently the only +# relevant place, 
anyway) assures proper intrinsic alignment with wider system +# (such as with py_byte_compile macro, which is concurrent Fedora/EL specific) +make install \ + DESTDIR=%{buildroot} V=1 docdir=%{pcmk_docdir} \ + %{?_python_bytecompile_extra:%{?py_byte_compile:am__py_compile=true}} + +mkdir -p %{buildroot}%{_datadir}/pacemaker/nagios/plugins-metadata +for file in $(find %{nagios_name}-%{nagios_hash}/metadata -type f); do + install -m 644 $file %{buildroot}%{_datadir}/pacemaker/nagios/plugins-metadata +done + + +mkdir -p ${RPM_BUILD_ROOT}%{_localstatedir}/lib/rpm-state/%{name} + +# Don't package libtool archives +find %{buildroot} -name '*.la' -type f -print0 | xargs -0 rm -f + +# Do not package these either +rm -f %{buildroot}/%{_sbindir}/fence_legacy +rm -f %{buildroot}/%{_mandir}/man8/fence_legacy.* + +# For now, don't package the servicelog-related binaries built only for +# ppc64le when certain dependencies are installed. If they get more exercise by +# advanced users, we can reconsider. +rm -f %{buildroot}/%{_sbindir}/notifyServicelogEvent +rm -f %{buildroot}/%{_sbindir}/ipmiservicelogd + +# Byte-compile Python sources where suitable and the distro procedures known +%if %{defined py_byte_compile} +%{py_byte_compile %{python_path} %{buildroot}%{_datadir}/pacemaker/tests} +%if !%{defined _python_bytecompile_extra} +%{py_byte_compile %{python_path} %{buildroot}%{python_site}/cts} +%endif +%endif + +%if %{with coverage} +GCOV_BASE=%{buildroot}/%{_var}/lib/pacemaker/gcov +mkdir -p $GCOV_BASE +find . 
-name '*.gcno' -type f | while read F ; do + D=`dirname $F` + mkdir -p ${GCOV_BASE}/$D + cp $F ${GCOV_BASE}/$D +done +%endif + +%post +%systemd_post pacemaker.service + +%preun +%systemd_preun pacemaker.service + +%postun +%systemd_postun_with_restart pacemaker.service + +%pre remote +# Stop the service before anything is touched, and remember to restart +# it as one of the last actions (compared to using systemd_postun_with_restart, +# this avoids suicide when sbd is in use) +systemctl --quiet is-active pacemaker_remote +if [ $? -eq 0 ] ; then + mkdir -p %{_localstatedir}/lib/rpm-state/%{name} + touch %{_localstatedir}/lib/rpm-state/%{name}/restart_pacemaker_remote + systemctl stop pacemaker_remote >/dev/null 2>&1 +else + rm -f %{_localstatedir}/lib/rpm-state/%{name}/restart_pacemaker_remote +fi + +%post remote +%systemd_post pacemaker_remote.service + +%preun remote +%systemd_preun pacemaker_remote.service + +%postun remote +# This next line is a no-op, because we stopped the service earlier, but +# we leave it here because it allows us to revert to the standard behavior +# in the future if desired +%systemd_postun_with_restart pacemaker_remote.service +# Explicitly take care of removing the flag-file(s) upon final removal +if [ "$1" -eq 0 ] ; then + rm -f %{_localstatedir}/lib/rpm-state/%{name}/restart_pacemaker_remote +fi + +%posttrans remote +if [ -e %{_localstatedir}/lib/rpm-state/%{name}/restart_pacemaker_remote ] ; then + systemctl start pacemaker_remote >/dev/null 2>&1 + rm -f %{_localstatedir}/lib/rpm-state/%{name}/restart_pacemaker_remote +fi + +%post cli +%systemd_post crm_mon.service +if [ "$1" -eq 2 ]; then + # Package upgrade, not initial install: + # Move any pre-2.0 logs to new location to ensure they get rotated + { mv -fbS.rpmsave %{_var}/log/pacemaker.log* %{_var}/log/pacemaker \ + || mv -f %{_var}/log/pacemaker.log* %{_var}/log/pacemaker + } >/dev/null 2>/dev/null || : +fi + +%preun cli +%systemd_preun crm_mon.service + +%postun cli 
+%systemd_postun_with_restart crm_mon.service + +%pre -n %{pkgname_pcmk_libs} +# @TODO Use sysusers.d: +# https://fedoraproject.org/wiki/Changes/Adopting_sysusers.d_format +getent group %{gname} >/dev/null || groupadd -r %{gname} -g %{hacluster_id} +getent passwd %{uname} >/dev/null || useradd -r -g %{gname} -u %{hacluster_id} -s /sbin/nologin -c "cluster user" %{uname} +exit 0 + +%ldconfig_scriptlets -n %{pkgname_pcmk_libs} +%ldconfig_scriptlets cluster-libs + +%files +########################################################### +%config(noreplace) %{_sysconfdir}/sysconfig/pacemaker +%{_sbindir}/pacemakerd + +%{_unitdir}/pacemaker.service + +%exclude %{_datadir}/pacemaker/nagios/plugins-metadata/* + +%exclude %{_libexecdir}/pacemaker/cts-log-watcher +%exclude %{_libexecdir}/pacemaker/cts-support +%exclude %{_sbindir}/pacemaker-remoted +%exclude %{_sbindir}/pacemaker_remoted +%{_libexecdir}/pacemaker/* + +%{_sbindir}/crm_attribute +%{_sbindir}/crm_master +%{_sbindir}/fence_watchdog + +%doc %{_mandir}/man7/pacemaker-controld.* +%doc %{_mandir}/man7/pacemaker-schedulerd.* +%doc %{_mandir}/man7/pacemaker-fenced.* +%doc %{_mandir}/man7/ocf_pacemaker_controld.* +%doc %{_mandir}/man7/ocf_pacemaker_remote.* +%doc %{_mandir}/man8/crm_attribute.* +%doc %{_mandir}/man8/crm_master.* +%doc %{_mandir}/man8/fence_watchdog.* +%doc %{_mandir}/man8/pacemakerd.* + +%doc %{_datadir}/pacemaker/alerts + +%license licenses/GPLv2 +%doc COPYING +%doc ChangeLog + +%dir %attr (750, %{uname}, %{gname}) %{_var}/lib/pacemaker/cib +%dir %attr (750, %{uname}, %{gname}) %{_var}/lib/pacemaker/pengine +%{ocf_root}/resource.d/pacemaker/controld +%{ocf_root}/resource.d/pacemaker/remote + +%files cli +%dir %attr (750, root, %{gname}) %{_sysconfdir}/pacemaker +%config(noreplace) %{_sysconfdir}/logrotate.d/pacemaker +%config(noreplace) %{_sysconfdir}/sysconfig/crm_mon + +%{_unitdir}/crm_mon.service + +%{_sbindir}/attrd_updater +%{_sbindir}/cibadmin +%if %{with cibsecrets} +%{_sbindir}/cibsecret +%endif 
+%{_sbindir}/crm_diff +%{_sbindir}/crm_error +%{_sbindir}/crm_failcount +%{_sbindir}/crm_mon +%{_sbindir}/crm_node +%{_sbindir}/crm_resource +%{_sbindir}/crm_rule +%{_sbindir}/crm_standby +%{_sbindir}/crm_verify +%{_sbindir}/crmadmin +%{_sbindir}/iso8601 +%{_sbindir}/crm_shadow +%{_sbindir}/crm_simulate +%{_sbindir}/crm_report +%{_sbindir}/crm_ticket +%{_sbindir}/stonith_admin +# "dirname" is owned by -schemas, which is a prerequisite +%{_datadir}/pacemaker/report.collector +%{_datadir}/pacemaker/report.common +# XXX "dirname" is not owned by any prerequisite +%{_datadir}/snmp/mibs/PCMK-MIB.txt + +%exclude %{ocf_root}/resource.d/pacemaker/controld +%exclude %{ocf_root}/resource.d/pacemaker/o2cb +%exclude %{ocf_root}/resource.d/pacemaker/remote + +%dir %{ocf_root} +%dir %{ocf_root}/resource.d +%{ocf_root}/resource.d/pacemaker + +%doc %{_mandir}/man7/* +%exclude %{_mandir}/man7/pacemaker-controld.* +%exclude %{_mandir}/man7/pacemaker-schedulerd.* +%exclude %{_mandir}/man7/pacemaker-fenced.* +%exclude %{_mandir}/man7/ocf_pacemaker_controld.* +%exclude %{_mandir}/man7/ocf_pacemaker_o2cb.* +%exclude %{_mandir}/man7/ocf_pacemaker_remote.* +%doc %{_mandir}/man8/* +%exclude %{_mandir}/man8/crm_attribute.* +%exclude %{_mandir}/man8/crm_master.* +%exclude %{_mandir}/man8/fence_legacy.* +%exclude %{_mandir}/man8/fence_watchdog.* +%exclude %{_mandir}/man8/pacemakerd.* +%exclude %{_mandir}/man8/pacemaker-remoted.* + +%license licenses/GPLv2 +%doc COPYING +%doc ChangeLog + +%dir %attr (750, %{uname}, %{gname}) %{_var}/lib/pacemaker +%dir %attr (750, %{uname}, %{gname}) %{_var}/lib/pacemaker/blackbox +%dir %attr (750, %{uname}, %{gname}) %{_var}/lib/pacemaker/cores +%dir %attr (770, %{uname}, %{gname}) %{_var}/log/pacemaker +%dir %attr (770, %{uname}, %{gname}) %{_var}/log/pacemaker/bundles + +%files -n %{pkgname_pcmk_libs} +%{_libdir}/libcib.so.* +%{_libdir}/liblrmd.so.* +%{_libdir}/libcrmservice.so.* +%{_libdir}/libcrmcommon.so.* +%{_libdir}/libpe_status.so.* 
+%{_libdir}/libpe_rules.so.* +%{_libdir}/libpacemaker.so.* +%{_libdir}/libstonithd.so.* +%license licenses/LGPLv2.1 +%doc COPYING +%doc ChangeLog + +%files cluster-libs +%{_libdir}/libcrmcluster.so.* +%license licenses/LGPLv2.1 +%doc COPYING +%doc ChangeLog + +%files remote +%config(noreplace) %{_sysconfdir}/sysconfig/pacemaker +# state directory is shared between the subpackages +# let rpm take care of removing it once it isn't +# referenced anymore and empty +%ghost %dir %{_localstatedir}/lib/rpm-state/%{name} +%{_unitdir}/pacemaker_remote.service + +%{_sbindir}/pacemaker-remoted +%{_sbindir}/pacemaker_remoted +%{_mandir}/man8/pacemaker-remoted.* +%license licenses/GPLv2 +%doc COPYING +%doc ChangeLog + +%files doc +%doc %{pcmk_docdir} +%license licenses/CC-BY-SA-4.0 + +%files cts +%{python_site}/cts +%{_datadir}/pacemaker/tests + +%{_libexecdir}/pacemaker/cts-log-watcher +%{_libexecdir}/pacemaker/cts-support + +%license licenses/GPLv2 +%doc COPYING +%doc ChangeLog + +%files -n %{pkgname_pcmk_libs}-devel +%{_includedir}/pacemaker +%{_libdir}/*.so +%if %{with coverage} +%{_var}/lib/pacemaker/gcov +%endif +%{_libdir}/pkgconfig/*.pc +%license licenses/LGPLv2.1 +%doc COPYING +%doc ChangeLog + +%files schemas +%license licenses/GPLv2 +%dir %{_datadir}/pacemaker +%{_datadir}/pacemaker/*.rng +%{_datadir}/pacemaker/*.xsl +%{_datadir}/pacemaker/api +%{_datadir}/pkgconfig/pacemaker-schemas.pc + +%files nagios-plugins-metadata +%dir %{_datadir}/pacemaker/nagios +%dir %{_datadir}/pacemaker/nagios/plugins-metadata +%attr(0644,root,root) %{_datadir}/pacemaker/nagios/plugins-metadata/* +%license %{nagios_name}-%{nagios_hash}/COPYING + +%changelog +* Wed Jan 26 2022 Ken Gaillot - 2.1.2-4 +- Fix regression in down event detection that affects remote nodes +- Resolves: rhbz2039399 + +* Mon Jan 24 2022 Ken Gaillot - 2.1.2-3 +- Detect an unresponsive subdaemon +- Handle certain probe failures as stopped instead of failed +- Update pcmk_delay_base option meta-data +- Avoid crash when
using clone notifications +- Retry Corosync shutdown tracking if first attempt fails +- Improve display of failed actions +- Resolves: rhbz1707851 +- Resolves: rhbz2039982 +- Resolves: rhbz2032032 +- Resolves: rhbz2040443 +- Resolves: rhbz2042367 +- Resolves: rhbz2042546 + +* Thu Dec 16 2021 Ken Gaillot - 2.1.2-2 +- Correctly get metadata for systemd agent names that end in '@' +- Use correct OCF 1.1 syntax in ocf:pacemaker:Stateful meta-data +- Fix regression in displayed times in crm_mon's fence history +- Resolves: rhbz2032031 +- Resolves: rhbz2032032 +- Resolves: rhbz2031765 + +* Tue Nov 30 2021 Ken Gaillot - 2.1.2-1 +- Rebase on upstream 2.1.2 +- Resolves: rhbz2011974 + +* Fri Aug 20 2021 Ken Gaillot - 2.1.0-11 +- Fix XML issue with fence_watchdog meta-data +- Resolves: rhbz1988568 + +* Thu Aug 12 2021 Ken Gaillot - 2.1.0-10 +- Fix minor issue with crm_resource error message change +- Resolves: rhbz1983196 + +* Wed Aug 11 2021 Ken Gaillot - 2.1.0-9 +- Fix watchdog agent version information +- Ensure transient attributes are cleared when multiple nodes are lost +- Resolves: rhbz1988568 +- Resolves: rhbz1989292 + +* Mon Aug 09 2021 Mohan Boddu - 2.1.0-7.1 +- Rebuilt for IMA sigs, glibc 2.34, aarch64 flags + Related: rhbz#1991688 + +* Fri Aug 06 2021 Ken Gaillot - 2.1.0-7 +- Allow configuring specific nodes to use watchdog-only sbd for fencing +- Resolves: rhbz1988568 + +* Fri Jul 30 2021 Ken Gaillot - 2.1.0-6 +- Avoid selecting wrong device when dynamic-list fencing is used with host map +- Show better error messages in crm_resource with invalid resource types +- Do not schedule probes of unmanaged resources on pending nodes +- Fix regressions in crm_attribute and crm_master argument handling +- Resolves: rhbz1978013 +- Resolves: rhbz1983196 +- Resolves: rhbz1983197 +- Resolves: rhbz1984130 + +* Wed Jun 30 2021 Ken Gaillot - 2.1.0-5 +- crm_resource now supports XML output from resource agent actions +- Correct output for crm_simulate --show-failcounts +- Avoid 
remote node unfencing loop +- Resolves: rhbz1975380 +- Resolves: rhbz1975386 +- Resolves: rhbz1975388 + +* Thu Jun 10 2021 Ken Gaillot - 2.1.0-4 +- Rebase on upstream 2.1.0 final release +- Resolves: rhbz1936023 + +* Tue Jun 1 2021 Ken Gaillot - 2.1.0-3 +- Rebase on upstream 2.1.0-rc3 release +- Resolves: rhbz1936023 + +* Wed May 26 2021 Ken Gaillot - 2.1.0-2 +- Include recent post-rc2 fixes with rebase +- Resolves: rhbz1936023 + +* Wed May 12 2021 Ken Gaillot - 2.1.0-1 +- Default resource-stickiness to 1 in newly created clusters +- Rebase on upstream 2.1.0-rc2 release +- Resolves: rhbz1850145 +- Resolves: rhbz1936023 + +* Fri Apr 16 2021 Mohan Boddu - 2.0.5-10.2 +- Rebuilt for RHEL 9 BETA on Apr 15th 2021. Related: rhbz#1947937 + +* Tue Jan 26 2021 Fedora Release Engineering - 2.0.5-10.1 +- Rebuilt for https://fedoraproject.org/wiki/Fedora_34_Mass_Rebuild + +* Mon Dec 7 2020 Klaus Wenninger - 2.0.5-10 +- Conflicts of doc package introduced to fix upgrade/downgrade + issues needs to be independent from arch + +* Fri Dec 4 2020 Klaus Wenninger - 2.0.5-9 +- Make doc-package conflict with wrong version of libs + to fix upgrade/downgrade issues + +* Fri Dec 4 2020 Klaus Wenninger - 2.0.5-8 +- Update for new upstream release tarball: Pacemaker-2.0.5 + for full details, see included ChangeLog file or + https://github.com/ClusterLabs/pacemaker/releases/tag/Pacemaker-2.0.5 + +* Wed Nov 18 2020 Klaus Wenninger - 2.0.5-0.7.rc3 +- a little more syncing with upstream spec-file + +* Tue Nov 17 2020 Klaus Wenninger - 2.0.5-0.6.rc3 +- Update for new upstream tarball for release candidate: Pacemaker-2.0.5-rc3 + for full details, see included ChangeLog file or + https://github.com/ClusterLabs/pacemaker/releases/tag/Pacemaker-2.0.5-rc3 +- Corosync in Fedora now provides corosync-devel as well in isa-flavor + +* Sun Nov 1 2020 Klaus Wenninger - 2.0.5-0.5.rc2 +- remove no more working dist.rpmdeplint from gating + +* Fri Oct 30 2020 Klaus Wenninger - 2.0.5-0.4.rc2 +- never use 
spec-variables in changelog +- replace dist.depcheck by dist.rpmdeplint +- do gate stable as well to be effective on rawhide + +* Fri Oct 30 2020 Klaus Wenninger - 2.0.5-0.3.rc2 +- revert dependency corosync-devel back to corosynclib-devel as long + as corosynclib-devel-package doesn't provide corosync-devel(isa) + we would need for pacemaker-libs-devel to require +- enable some basic gating-tests +- re-add building documentation using publican to everything but ELN +- rename doc-dir for ELN + +* Wed Oct 28 2020 Klaus Wenninger - 2.0.5-0.2.rc2 +- Update for new upstream tarball for release candidate: Pacemaker-2.0.5-rc2, + includes fix for CVE-2020-25654 + for full details, see included ChangeLog file or + https://github.com/ClusterLabs/pacemaker/releases/tag/Pacemaker-2.0.5-rc2 + +* Thu Oct 22 2020 Klaus Wenninger - 2.0.5-0.1.rc1 +- Update for new upstream tarball for release candidate: Pacemaker-2.0.5-rc1, + for full details, see included ChangeLog file or + https://github.com/ClusterLabs/pacemaker/releases/tag/Pacemaker-2.0.5-rc1 +- Disable building of documentation - as not to pull in publican +- Remove dependencies to nagios-plugins from metadata-package +- some sync with structure of upstream spec-file +- removed some legacy conditionals +- added with-cibsecrets + +* Tue Jul 28 2020 Fedora Release Engineering - 2.0.4-1.1 +- Rebuilt for https://fedoraproject.org/wiki/Fedora_33_Mass_Rebuild + +* Tue Jun 16 2020 Chris Lumens - 2.0.4-1 +- Update for new upstream tarball: Pacemaker-2.0.4 + for full details, see included ChangeLog file or + https://github.com/ClusterLabs/pacemaker/releases/tag/Pacemaker-2.0.4 + +* Thu Jun 04 2020 Chris Lumens - 2.0.4-0.1.rc3 +- Update for new upstream tarball for release candidate: Pacemaker-2.0.4-rc3, + for full details, see included ChangeLog file or + https://github.com/ClusterLabs/pacemaker/releases/tag/Pacemaker-2.0.4-rc3 + +* Tue May 26 2020 Miro Hrončok - 2.0.4-0.2.rc1.1 +- Rebuilt for Python 3.9 + +* Wed May 13 2020 Chris 
Lumens - 2.0.4-0.2.rc1 +- Rebuilt for libqb 2.0. + +* Mon May 04 2020 Chris Lumens - 2.0.4-0.1.rc1 +- Update for new upstream tarball for release candidate: Pacemaker-2.0.4-rc1, + for full details, see included ChangeLog file or + https://github.com/ClusterLabs/pacemaker/releases/tag/Pacemaker-2.0.4-rc1 + +* Fri Mar 06 2020 Jan Pokorný - 2.0.3-4 +- return back to building also for s390x architecture, previous obstacle + was identified and interim fix (way to build along with one actual bugfix + as raised along) applied (RHBZ#1799842) + +* Wed Mar 04 2020 Jan Pokorný - 2.0.3-3 +- include upstream fix for buildability with GCC 10 (PR #1968) +- omit s390x architecture for now, compilation would fail at this time + +* Wed Jan 29 2020 Fedora Release Engineering - 2.0.3-2.1 +- Rebuilt for https://fedoraproject.org/wiki/Fedora_32_Mass_Rebuild + +* Tue Nov 26 2019 Jan Pokorný - 2.0.3-1 +- Update for new upstream tarball: Pacemaker-2.0.3, + for full details, see included ChangeLog file or + https://github.com/ClusterLabs/pacemaker/releases/tag/Pacemaker-2.0.3 + (functionally identical to 2.0.3-rc3, new build mostly to fix a memory + leak & allow for easy glibc ~2.31+ friendly switch away from ftime(3)) +- Fix inability to build with Inkscape 1.0 beta (and possibly beyond) + +* Thu Nov 14 2019 Jan Pokorný - 2.0.3-0.1.rc3 +- Update for new upstream tarball for release candidate: Pacemaker-2.0.3-rc3, + for full details, see included ChangeLog file or + https://github.com/ClusterLabs/pacemaker/releases/tag/Pacemaker-2.0.3-rc3 +- Fix failure to build due to using obsolete ftime(3) + +* Wed Nov 06 2019 Jan Pokorný - 2.0.3-0.1.rc2 +- Update for new upstream tarball for release candidate: Pacemaker-2.0.3-rc2, + for full details, see included ChangeLog file or + https://github.com/ClusterLabs/pacemaker/releases/tag/Pacemaker-2.0.3-rc2 + +* Thu Oct 03 2019 Miro Hrončok - 2.0.2-1.3 +- Rebuilt for Python 3.8.0rc1 (#1748018) + +* Mon Aug 19 2019 Miro Hrončok - 2.0.2-1.2 +- Rebuilt for
Python 3.8 + +* Thu Jul 25 2019 Fedora Release Engineering - 2.0.2-1.1 +- Rebuilt for https://fedoraproject.org/wiki/Fedora_31_Mass_Rebuild + +* Fri Jun 07 2019 Jan Pokorný - 2.0.2-1 +- Update for new upstream tarball: Pacemaker-2.0.2, + for full details, see included ChangeLog file or + https://github.com/ClusterLabs/pacemaker/releases/tag/Pacemaker-2.0.2 + (functionally identical to 2.0.2-rc3, new build mostly to match expectations) + +* Fri May 31 2019 Jan Pokorný - 2.0.2-0.1.rc3 +- Update for new upstream tarball for release candidate: Pacemaker-2.0.2-rc3, + for full details, see included ChangeLog file or + https://github.com/ClusterLabs/pacemaker/releases/tag/Pacemaker-2.0.2-rc3 +- Adapt spec file more akin to upstream version including: + . /usr/share/pacemaker now owned by -schemas, its "api" subdirectory + is not carried redundantly in -cli anymore (f05eb7eec) + +* Tue May 28 2019 Jan Pokorný - 2.0.2-0.1.rc2 +- Update for new upstream tarball for release candidate: Pacemaker-2.0.2-rc2, + for full details, see included ChangeLog file or + https://github.com/ClusterLabs/pacemaker/releases/tag/Pacemaker-2.0.2-rc2 + +* Thu Apr 25 2019 Jan Pokorný - 2.0.2-0.1.rc1 +- Update for new upstream tarball for release candidate: Pacemaker-2.0.2-rc1, + for full details, see included ChangeLog file or + https://github.com/ClusterLabs/pacemaker/releases/tag/Pacemaker-2.0.2-rc1 +- Customize (as allowed now) exhibited downstream-specific bug reporting URL +- Adapt spec file more akin to upstream version including: + . sbd ABI compatible version enforcement (37ad2bea1) + +* Wed Apr 17 2019 Jan Pokorný - 2.0.1-2 +- Apply fixes for security issues: + . CVE-2019-3885 (use-after-free with potential information disclosure) + . CVE-2018-16877 (insufficient local IPC client-server authentication) + . 
CVE-2018-16878 (insufficient verification inflicted preference of + uncontrolled processes) + +* Tue Mar 05 2019 Jan Pokorný - 2.0.1-1 +- Update for new upstream tarball: Pacemaker-2.0.1, + for full details, see included ChangeLog file or + https://github.com/ClusterLabs/pacemaker/releases/tag/Pacemaker-2.0.1 + +* Thu Feb 28 2019 Jan Pokorný - 2.0.1-0.4.rc5 +- Update for new upstream tarball for release candidate: Pacemaker-2.0.1-rc5, + for full details, see included ChangeLog file or + https://github.com/ClusterLabs/pacemaker/releases/tag/Pacemaker-2.0.1-rc5 +- Reflect that cts-scheduler tests are fully compatible with whatever recent + glib version that gets to be used in run-time (incl. buildroot tests) again + +* Mon Feb 04 2019 Jan Pokorný - 2.0.1-0.3.rc4 +- Update for new upstream tarball for release candidate: Pacemaker-2.0.1-rc4, + for full details, see included ChangeLog file or + https://github.com/ClusterLabs/pacemaker/releases/tag/Pacemaker-2.0.1-rc4 +- Conditionally disable "hash affected tests" in cts-scheduler (-cts package), + since it is unlikely glib v2.59.0+ present in the buildroot will be + artificially downgraded post-deployment + +* Fri Feb 01 2019 Fedora Release Engineering - 2.0.1-0.2.rc3.1 +- Rebuilt for https://fedoraproject.org/wiki/Fedora_30_Mass_Rebuild + +* Tue Jan 22 2019 Jan Pokorný - 2.0.1-0.2.rc3 +- Fix buildability with GCC 9 (PR #1681) +- Apply minor crm_mon XML output fix (PR #1678) + +* Sun Jan 20 2019 Jan Pokorný - 2.0.1-0.1.rc3 +- Update for new upstream tarball for release candidate: Pacemaker-2.0.1-rc3, + for full details, see included ChangeLog file or + https://github.com/ClusterLabs/pacemaker/releases/tag/Pacemaker-2.0.1-rc3 +- Adapt spec file more akin to upstream version including: + . split a dedicated, noarch -schemas package (c6a87bd86) + . make static dependencies on inner libraries arch-specific (14bfff68e) + . weak co-dependence of -cli with -remote & pacemaker proper (73e2c94a3) + . 
declare bundled gnulib (d57aa84c1) +- Move stonith_admin to -cli where it belongs, since it doesn't require + -cluster-libs (considered by upstream) +- Apply patches to restore basic buildability (still without much run-time + reproducibility guarantees compared to what's been customary prior to glib + v2.59.0+ that may now get run-time linked upon its fresh installation/update, + but this applies also to whatever older version of pacemaker, and wasn't + discovered until now; cf. https://github.com/ClusterLabs/pacemaker/pull/1677) + +* Thu Aug 23 2018 Jan Pokorný - 2.0.0-4 +- Sanitize/generalize approach to Python byte-compilation, so that also + out-of-Python-path *.py files (%%{_datadir}/pacemaker/tests/cts/CTSlab.py + in particular) get the expected treatment now + +* Wed Aug 15 2018 Jan Pokorný - 2.0.0-3 +- Fix Python 3.7 incompatibility (otherwise missed in bytecompilation phase, + see rhbz#1616219) + +* Thu Aug 09 2018 Jan Pokorný - 2.0.0-2 +- Include fix for "cibadmin --upgrade" related issues (rhbz#1611631) +- Adapt spec file more akin to upstream version including: + . assuredly skip servicelog-related binaries even when build-time + prerequisites are present on suitable systems (9f24448d8) + +* Mon Jul 09 2018 Jan Pokorný - 2.0.0-1 +- Update for new upstream tarball: Pacemaker-2.0.0, + for full details, see included ChangeLog file or + https://github.com/ClusterLabs/pacemaker/releases/tag/Pacemaker-2.0.0 + +* Mon Jul 02 2018 Miro Hrončok - 2.0.0-0.1.rc6.1 +- Rebuilt for Python 3.7 + +* Thu Jun 28 2018 Jan Pokorný - 2.0.0-0.1.rc6 +- Update for new upstream tarball for release candidate: Pacemaker-2.0.0-rc6, + for full details, see included ChangeLog file or + https://github.com/ClusterLabs/pacemaker/releases/tag/Pacemaker-2.0.0-rc6 +- Adapt spec file more akin to upstream version including: + . new procps-ng and psmisc dependencies with -cli and -cts, for e.g. + "ps/sysctl/uptime" and "killall" invocations, respectively (a4ad8183a) + . 
move crm_node to -cli (a94a1ed58) + +* Tue Jun 19 2018 Miro Hrončok - 2.0.0-0.1.rc5.1 +- Rebuilt for Python 3.7 + +* Fri Jun 01 2018 Jan Pokorný - 2.0.0-0.1.rc5 +- Update for new upstream tarball for release candidate: Pacemaker-2.0.0-rc5, + for full details, see included ChangeLog file or + https://github.com/ClusterLabs/pacemaker/releases/tag/Pacemaker-2.0.0-rc5 +- Adapt spec file more akin to upstream version including: + . new coreutils dependency for "post" scriptlet of -cli, + for "mv" invocation (c2b16165d) + +* Wed May 16 2018 Jan Pokorný - 2.0.0-0.1.rc4 +- Update for new upstream tarball for release candidate: Pacemaker-2.0.0-rc4, + for full details, see included ChangeLog file or + https://github.com/ClusterLabs/pacemaker/releases/tag/Pacemaker-2.0.0-rc4 + . as a special note, previous release candidate, rc3, had rolling upgrades + broken, and if that is required, that particular release shall be + skipped in the upgrade path altogether +- Adapt spec file more akin to upstream version including: + . as part of the update process, possibly move old log files as implicitly + used prior to 2.0 so there's a (limited) continuity with the new implicit + location, preventing clutter and confusion (ce2e74c99, c2b16165d) + . move cts-exec-helper from -cli under main package (a2dc2a67e) + . -cts backed with new helpers and, tangentially, dummy systemd service + file transiently generated on-demand again (fa2d43445, d52b001b1) + +* Wed May 02 2018 Jan Pokorný - 2.0.0-0.1.rc3 +- Update for new upstream tarball for release candidate: Pacemaker-2.0.0-rc3, + for full details, see included ChangeLog file or + https://github.com/ClusterLabs/pacemaker/releases/tag/Pacemaker-2.0.0-rc3 + . IMPORTANT: this release candidate, rc3, has rolling upgrades broken, + and if that is required, this particular release shall be + skipped in the upgrade path altogether +- Adapt spec file more akin to upstream version including: + . new --without legacy_links conditional (c8a7e5225) + . 
reflect name change of the auxiliary daemons + (e4f4a0d64, db5536e40, e2fdc2bac + 9ecbfea1c, 038c465e2 + ed8ce4055a) + . new dummy systemd service for -cts (bf0a22812) + . honor system-wide crypto policies once for all, via package-build-time + configurable "pcmk_gnutls_priorities" defaulting to @SYSTEM as prescribed + in https://fedoraproject.org/wiki/Packaging:CryptoPolicies + (based on b3dfce1d3) +- Adapt spec file akin to current packaging guidelines including: + . make -nagios-plugins-metadata package noarch + +* Mon Apr 09 2018 Jan Pokorný - 2.0.0-0.1.rc2 +- Update for new upstream tarball for release candidate: Pacemaker-2.0.0-rc2, + for full details, see included ChangeLog file or + https://github.com/ClusterLabs/pacemaker/releases/tag/Pacemaker-2.0.0-rc2 +- Adapt spec file more akin to upstream version including: + . out-of-tree change from 1.1.18-2 build got subsumed (508ad52e7) + . %%{_sysconfdir}/pacemaker path got properly owned + (-cli package; f6e3ab98d) + . -libs package started to properly declare Requires(pre): shadow-utils + (293fcc1e8 + b3d49d210) + . some build conditionals and dependencies dropped for no longer + (snmp, esmtp; f24bdc6f2 and 1f7374884, respectively) or never + being relevant (~bison, byacc, flex; 61aef8af4) + . some dependencies were constrained with new or higher lower bounds: + corosync needs to be of version 2+ unconditionally (ccd58fe29), + ditto some others components (~GLib, 1ac2e7cbb), plus both 2 and 3 + versions of Python are now (comprehensively for the auxiliary + functionality where used) supported upstream with the latter being + a better fit (453355f8f) + . package descriptions got to reflect the drop of legacy low-level + cluster infrastructures (55ab749bf) +- Adapt spec file akin to current packaging guidelines including: + . drop some redundant/futile expressions (defattr, "-n %%{name}-libs" + instead of plain "libs", "timezone hack"), add some notes for future + . 
make -cts and -doc packages noarch (former enabled with 088a5e7d4) + . simplify "systemd_requires" macro invocation, and relax it to + "systemd_ordering" for -remote package where possible so as not + to drag systemd into a lightweight system setup (e.g. container) + needlessly + . adjust, in a compatible way, common ldconfig invocation with + post{,un} scriptlets + (https://fedoraproject.org/wiki/Changes/Removing_ldconfig_scriptlets) + . drop some more unuseful conditionals (upstart_job) +- Apply some regression fixes on top as patches (PR #1457, #1459) + +* Wed Feb 21 2018 Iryna Shcherbina - 1.1.18-2.2 +- Update Python 2 dependency declarations to new packaging standards + (See https://fedoraproject.org/wiki/FinalizingFedoraSwitchtoPython3) + +* Thu Feb 08 2018 Fedora Release Engineering - 1.1.18-2.1 +- Rebuilt for https://fedoraproject.org/wiki/Fedora_28_Mass_Rebuild + +* Thu Nov 16 2017 Jan Pokorný - 1.1.18-2 +- Make sure neither of pacemaker{,_remoted} is process-limited + +* Wed Nov 15 2017 Jan Pokorný - 1.1.18-1 +- Update for new upstream tarball: Pacemaker-1.1.18, + for full details, see included ChangeLog file or + https://github.com/ClusterLabs/pacemaker/releases/tag/Pacemaker-1.1.18 +- Make -libs-devel package dependencies arch-qualified + (-cts hasn't been switched at this time, pending further cleanup) + +* Fri Nov 03 2017 Jan Pokorný - 1.1.18-0.1.rc4 +- Update for new upstream tarball for release candidate: Pacemaker-1.1.18-rc4, + for full details, see included ChangeLog file or + https://github.com/ClusterLabs/pacemaker/releases/tag/Pacemaker-1.1.18-rc4 + +* Thu Oct 26 2017 Jan Pokorný - 1.1.18-0.1.rc3 +- Update for new upstream tarball for release candidate: Pacemaker-1.1.18-rc3, + for full details, see included ChangeLog file or + https://github.com/ClusterLabs/pacemaker/releases/tag/Pacemaker-1.1.18-rc3 + +* Mon Oct 16 2017 Jan Pokorný - 1.1.18-0.1.rc2 +- Update for new upstream tarball for release candidate: Pacemaker-1.1.18-rc2, + for full 
details, see included ChangeLog file or + https://github.com/ClusterLabs/pacemaker/releases/tag/Pacemaker-1.1.18-rc2 +- Fix check scriptlet so as to work properly also with rpm<4.14 (not strictly + required since: https://github.com/rpm-software-management/rpm/pull/249, + but pragmatically follow the upstream) + +* Thu Aug 03 2017 Fedora Release Engineering - 1.1.17-1.2 +- Rebuilt for https://fedoraproject.org/wiki/Fedora_27_Binutils_Mass_Rebuild + +* Thu Jul 27 2017 Fedora Release Engineering - 1.1.17-1.1 +- Rebuilt for https://fedoraproject.org/wiki/Fedora_27_Mass_Rebuild + +* Fri Jul 07 2017 Jan Pokorný - 1.1.17-1 +- Update for new upstream tarball: Pacemaker-1.1.17, + for full details, see included ChangeLog file or + https://github.com/ClusterLabs/pacemaker/releases/tag/Pacemaker-1.1.17 + +* Thu Jun 22 2017 Jan Pokorný - 1.1.17-0.1.rc4 +- Update for new upstream tarball for release candidate: Pacemaker-1.1.17-rc4, + for full details, see included ChangeLog file or + https://github.com/ClusterLabs/pacemaker/releases/tag/Pacemaker-1.1.17-rc4 +- Add an imposed lower bound for glib2 BuildRequires + +* Thu Jun 01 2017 Jan Pokorný - 1.1.17-0.1.rc3 +- Update for new upstream tarball for release candidate: Pacemaker-1.1.17-rc3, + for full details, see included ChangeLog file or + https://github.com/ClusterLabs/pacemaker/releases/tag/Pacemaker-1.1.17-rc3 + +* Wed May 24 2017 Jan Pokorný - 1.1.17-0.1.rc2 +- Update for new upstream tarball for release candidate: Pacemaker-1.1.17-rc2, + for full details, see included ChangeLog file or + https://github.com/ClusterLabs/pacemaker/releases/tag/Pacemaker-1.1.17-rc2 + +* Tue May 09 2017 Jan Pokorný - 1.1.17-0.1.rc1 +- Update for new upstream tarball for release candidate: Pacemaker-1.1.17-rc1, + for full details, see included ChangeLog file or + https://github.com/ClusterLabs/pacemaker/releases/tag/Pacemaker-1.1.17-rc1 + +* Mon Feb 06 2017 Jan Pokorný - 1.1.16-2.a39ea6491.git +- Update for (slightly stabilized) snapshot beyond 
Pacemaker-1.1.16 + (commit a39ea6491), including: + . prevent FTBFS with new GCC 7 (a7476dd96) +- Adapt spec file more akin to upstream version including: + . better pre-release vs. tags logic (4581d4366) + +* Fri Dec 02 2016 Jan Pokorný - 1.1.16-1 +- Update for new upstream tarball: Pacemaker-1.1.16, + for full details, see included ChangeLog file or + https://github.com/ClusterLabs/pacemaker/releases/tag/Pacemaker-1.1.16 +- Adapt spec file more akin to upstream version including: + . clarify licensing, especially for -doc (f01f734) + . fix pacemaker-remote upgrade (779e0e3) + . require python >= 2.6 (31ef7f0) + . older libqb is sufficient (based on 30fe1ce) + . remove openssl-devel and libselinux-devel as BRs (2e05c17) + . make systemd BR pkgconfig-driven (6285924) + . defines instead of some globals + error suppression (625d427) +- Rectify -nagios-plugins-metadata declared license and install + also respective license text + +* Thu Nov 03 2016 Jan Pokorný - 1.1.15-3 +- Apply fix for CVE-2016-7035 (improper IPC guarding) + +* Tue Jul 19 2016 Fedora Release Engineering - 1.1.15-2.1 +- https://fedoraproject.org/wiki/Changes/Automatic_Provides_for_Python_RPM_Packages + +* Thu Jul 07 2016 Jan Pokorný - 1.1.15-2 +- Stop building with -fstack-protector-all using the upstream patches + overhauling toolchain hardening (Fedora natively uses + -fstack-protector-strong so this effectively relaxed stack protection + is the only effect as hardened flags are already used by default: + https://fedoraproject.org/wiki/Changes/Harden_All_Packages) + +* Wed Jun 22 2016 Jan Pokorný - 1.1.15-1 +- Update for new upstream tarball: Pacemaker-1.1.15, + for full details, see included ChangeLog file or + https://github.com/ClusterLabs/pacemaker/releases/tag/Pacemaker-1.1.15 +- Adapt spec file more akin to upstream version: + . 
move xml schema files + PCMK-MIB.txt (81ef956), logrotate configuration + file (ce576cf; drop it from -remote package as well), attrd_updater + (aff80ae), the normal resource agents (1fc7287), and common directories + under /var/lib/pacemaker (3492794) from main package under -cli + . simplify docdir build parameter passing and drop as of now + redundant chmod invocations (e91769e) + +* Fri May 27 2016 Jan Pokorný - 1.1.15-0.1.rc3 +- Update for new upstream tarball for release candidate: Pacemaker-1.1.15-rc3, + for full details, see included ChangeLog file or + https://github.com/ClusterLabs/pacemaker/releases/tag/Pacemaker-1.1.15-rc3 +- Drop fence_pcmk (incl. man page) from the package (no use where no CMAN) +- Drop license macro emulation for cases when not supported natively + (several recent Fedora releases do not need that) + +* Mon May 16 2016 Jan Pokorný - 1.1.15-0.1.rc2 +- Update for new upstream tarball for release candidate: Pacemaker-1.1.15-rc2, + for full details, see included ChangeLog file or + https://github.com/ClusterLabs/pacemaker/releases/tag/Pacemaker-1.1.15-rc2 + +* Tue Apr 26 2016 Jan Pokorný - 1.1.15-0.1.rc1 +- Update for new upstream tarball for release candidate: Pacemaker-1.1.15-rc1, + for full details, see included ChangeLog file or + https://github.com/ClusterLabs/pacemaker/releases/tag/Pacemaker-1.1.15-rc1 +- Adapt spec file more akin to upstream version (also to reflect recent + changes like ability to build explicitly without Publican-based docs) + +* Thu Mar 31 2016 Jan Pokorný - 1.1.14-2.5a6cdd1.git +- Update for currently stabilized snapshot beyond Pacemaker-1.1.14 + (commit 5a6cdd1), but restore old-style notifications to the state at + Pacemaker-1.1.14 point release (disabled) +- Definitely get rid of Corosync v1 (Flatiron) hypothetical support +- Remove some of the spec file cruft, not required for years + (BuildRoot, AutoReqProv, "clean" scriptlet, etc.)
and adapt the file + per https://github.com/ClusterLabs/pacemaker/pull/965 + +* Thu Feb 04 2016 Fedora Release Engineering - 1.1.14-1.1 +- Rebuilt for https://fedoraproject.org/wiki/Fedora_24_Mass_Rebuild + +* Mon Jan 18 2016 Jan Pokorný - 1.1.14-1 +- Update for new upstream tarball: Pacemaker-1.1.14, + for full details, see included ChangeLog file or + https://github.com/ClusterLabs/pacemaker/releases/tag/Pacemaker-1.1.14 +- Disable Fedora crypto policies conformance patch for now (rhbz#1179335) +- Better align specfile with the upstream version (also fix issue with + crm_mon sysconfig file not being installed) +- Further specfile modifications: + - drop unused gcc-c++ and repeatedly mentioned pkgconfig packages + from BuildRequires + - refer to python_sitearch macro first, if defined + - tolerate license macro not being defined (e.g., for EPEL rebuilds) +- Prevent console mode not available in crm_mon due to curses library test + fragility of configure script in hardened build environment (rhbz#1297985) + +* Tue Oct 20 2015 Jan Pokorný - 1.1.13-4 +- Adapt to follow Fedora crypto policies (rhbz#1179335) + +* Wed Oct 14 2015 Jan Pokorný - 1.1.13-3 +- Update to Pacemaker-1.1.13 post-release + patches (sync) +- Add nagios-plugins-metadata subpackage enabling support of selected + Nagios plugins as resources recognized by Pacemaker +- Several specfile improvements: drop irrelevant stuff, rehash the + included/excluded files + dependencies, add check scriptlet, + reflect current packaging practice, do minor cleanups + (mostly adopted from another spec) + +* Thu Aug 20 2015 Andrew Beekhof - 1.1.13-2 +- Update for new upstream tarball: Pacemaker-1.1.13 +- See included ChangeLog file or https://raw.github.com/ClusterLabs/pacemaker/master/ChangeLog for full details + +* Thu Jun 18 2015 Fedora Release Engineering - 1.1.12-2.1 +- Rebuilt for https://fedoraproject.org/wiki/Fedora_23_Mass_Rebuild + +* Wed Nov 05 2014 Andrew Beekhof - 1.1.12-2 +- Address incorrect use of the 
dbus API for interacting with systemd + +* Tue Oct 28 2014 Andrew Beekhof - 1.1.12-1 +- Update for new upstream tarball: Pacemaker-1.1.12+ (a9c8177) +- See included ChangeLog file or https://raw.github.com/ClusterLabs/pacemaker/master/ChangeLog for full details + +* Sun Aug 17 2014 Fedora Release Engineering - 1.1.11-1.2 +- Rebuilt for https://fedoraproject.org/wiki/Fedora_21_22_Mass_Rebuild + +* Fri Jun 06 2014 Fedora Release Engineering - 1.1.11-1.1 +- Rebuilt for https://fedoraproject.org/wiki/Fedora_21_Mass_Rebuild + +* Tue Feb 18 2014 Andrew Beekhof - 1.1.11-1 +- Update for new upstream tarball: Pacemaker-1.1.11 (9d39a6b) +- See included ChangeLog file or https://raw.github.com/ClusterLabs/pacemaker/master/ChangeLog for full details + +* Thu Jun 20 2013 Andrew Beekhof - 1.1.9-3 +- Update to upstream 7d8acec +- See included ChangeLog file or https://raw.github.com/ClusterLabs/pacemaker/master/ChangeLog for full details + + + Feature: Turn off auto-respawning of systemd services when the cluster starts them + + Fix: crmd: Ensure operations for cleaned up resources don't block recovery + + Fix: logging: If SIGTRAP is sent before tracing is turned on, turn it on instead of crashing + +* Mon Jun 17 2013 Andrew Beekhof - 1.1.9-2 +- Update for new upstream tarball: 781a388 +- See included ChangeLog file or https://raw.github.com/ClusterLabs/pacemaker/master/ChangeLog for full details + +* Wed May 12 2010 Andrew Beekhof - 1.1.2-1 +- Update the tarball from the upstream 1.1.2 release +- See included ChangeLog file or https://raw.github.com/ClusterLabs/pacemaker/master/ChangeLog for full details + +* Tue Jul 14 2009 Andrew Beekhof - 1.0.4-1 +- Initial checkin