diff --git a/.gitignore b/.gitignore index 5b0aa26..fad0d0f 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,2 @@ SOURCES/nagios-agents-metadata-105ab8a.tar.gz -SOURCES/pacemaker-7c3f660.tar.gz +SOURCES/pacemaker-ada5c3b.tar.gz diff --git a/.pacemaker.metadata b/.pacemaker.metadata index de63f82..5af6b2a 100644 --- a/.pacemaker.metadata +++ b/.pacemaker.metadata @@ -1,2 +1,2 @@ ea6c0a27fd0ae8ce02f84a11f08a0d79377041c3 SOURCES/nagios-agents-metadata-105ab8a.tar.gz -17aa11e179c3f9eacbacac5735d7f5b14a1ac010 SOURCES/pacemaker-7c3f660.tar.gz +f9fd69263d5b21446b530f9750c262f7b492cad4 SOURCES/pacemaker-ada5c3b.tar.gz diff --git a/SOURCES/001-acl-group-schema.patch b/SOURCES/001-acl-group-schema.patch new file mode 100644 index 0000000..4835e3e --- /dev/null +++ b/SOURCES/001-acl-group-schema.patch @@ -0,0 +1,230 @@ +From f5ffbaf1f537d3d5b00e594211cd322f97df51ac Mon Sep 17 00:00:00 2001 +From: Grace Chin +Date: Fri, 5 Nov 2021 11:39:39 -0400 +Subject: [PATCH 1/3] Low: xml: clone acls schema in preparation for changes + +--- + xml/acls-3.8.rng | 80 ++++++++++++++++++++++++++++++++++++++++++++++++ + 1 file changed, 80 insertions(+) + create mode 100644 xml/acls-3.8.rng + +diff --git a/xml/acls-3.8.rng b/xml/acls-3.8.rng +new file mode 100644 +index 000000000..0fe6eed96 +--- /dev/null ++++ b/xml/acls-3.8.rng +@@ -0,0 +1,80 @@ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ read ++ write ++ deny ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ +-- +2.27.0 + + +From 7838213fc639236bdedf5f15320152d973f1bdad Mon Sep 17 00:00:00 2001 +From: Grace Chin +Date: Fri, 5 Nov 2021 11:40:48 -0400 +Subject: [PATCH 2/3] Add a 'name' attribute to acl_target and acl_group + elements + +--- + xml/acls-3.8.rng | 6 ++++++ + 1 file changed, 6 insertions(+) + +diff --git a/xml/acls-3.8.rng b/xml/acls-3.8.rng +index 0fe6eed96..48bcdffe3 100644 +--- a/xml/acls-3.8.rng ++++ 
b/xml/acls-3.8.rng +@@ -13,6 +13,9 @@ + + + ++ ++ ++ + + + +@@ -22,6 +25,9 @@ + + + ++ ++ ++ + + + +-- +2.27.0 + + +From c3c498f4636f57e29670f8e385b625024ed222d7 Mon Sep 17 00:00:00 2001 +From: Grace Chin +Date: Fri, 5 Nov 2021 11:42:48 -0400 +Subject: [PATCH 3/3] Changes made by run of 'cts/cts-cli -s' + +--- + cts/cli/regression.upgrade.exp | 7 +++++-- + cts/cli/regression.validity.exp | 22 ++++++++++++++++++---- + 2 files changed, 23 insertions(+), 6 deletions(-) + +diff --git a/cts/cli/regression.upgrade.exp b/cts/cli/regression.upgrade.exp +index e38adebdd..7ce7ec13b 100644 +--- a/cts/cli/regression.upgrade.exp ++++ b/cts/cli/regression.upgrade.exp +@@ -91,8 +91,11 @@ update_validation debug: Configuration valid for schema: pacemaker-3.6 + update_validation debug: pacemaker-3.6-style configuration is also valid for pacemaker-3.7 + update_validation debug: Testing 'pacemaker-3.7' validation (21 of X) + update_validation debug: Configuration valid for schema: pacemaker-3.7 +-update_validation trace: Stopping at pacemaker-3.7 +-update_validation info: Transformed the configuration from pacemaker-2.10 to pacemaker-3.7 ++update_validation debug: pacemaker-3.7-style configuration is also valid for pacemaker-3.8 ++update_validation debug: Testing 'pacemaker-3.8' validation (22 of X) ++update_validation debug: Configuration valid for schema: pacemaker-3.8 ++update_validation trace: Stopping at pacemaker-3.8 ++update_validation info: Transformed the configuration from pacemaker-2.10 to pacemaker-3.8 + =#=#=#= Current cib after: Upgrade to latest CIB schema (trigger 2.10.xsl + the wrapping) =#=#=#= + + +diff --git a/cts/cli/regression.validity.exp b/cts/cli/regression.validity.exp +index 5ace430e7..125035a47 100644 +--- a/cts/cli/regression.validity.exp ++++ b/cts/cli/regression.validity.exp +@@ -121,7 +121,11 @@ update_validation debug: Testing 'pacemaker-3.7' validation (21 of X) + element rsc_order: Relax-NG validity error : Invalid attribute first-action for element 
rsc_order + element rsc_order: Relax-NG validity error : Element constraints has extra content: rsc_order + update_validation trace: pacemaker-3.7 validation failed +-Cannot upgrade configuration (claiming schema pacemaker-1.2) to at least pacemaker-3.0 because it does not validate with any schema from pacemaker-1.2 to pacemaker-3.7 ++update_validation debug: Testing 'pacemaker-3.8' validation (22 of X) ++element rsc_order: Relax-NG validity error : Invalid attribute first-action for element rsc_order ++element rsc_order: Relax-NG validity error : Element constraints has extra content: rsc_order ++update_validation trace: pacemaker-3.8 validation failed ++Cannot upgrade configuration (claiming schema pacemaker-1.2) to at least pacemaker-3.0 because it does not validate with any schema from pacemaker-1.2 to pacemaker-3.8 + =#=#=#= End test: Run crm_simulate with invalid CIB (enum violation) - Invalid configuration (78) =#=#=#= + * Passed: crm_simulate - Run crm_simulate with invalid CIB (enum violation) + =#=#=#= Begin test: Try to make resulting CIB invalid (unrecognized validate-with) =#=#=#= +@@ -226,7 +230,10 @@ update_validation trace: pacemaker-3.6 validation failed + update_validation debug: Testing 'pacemaker-3.7' validation (21 of X) + element cib: Relax-NG validity error : Invalid attribute validate-with for element cib + update_validation trace: pacemaker-3.7 validation failed +-Cannot upgrade configuration (claiming schema pacemaker-9999.0) to at least pacemaker-3.0 because it does not validate with any schema from unknown to pacemaker-3.7 ++update_validation debug: Testing 'pacemaker-3.8' validation (22 of X) ++element cib: Relax-NG validity error : Invalid attribute validate-with for element cib ++update_validation trace: pacemaker-3.8 validation failed ++Cannot upgrade configuration (claiming schema pacemaker-9999.0) to at least pacemaker-3.0 because it does not validate with any schema from unknown to pacemaker-3.8 + =#=#=#= End test: Run 
crm_simulate with invalid CIB (unrecognized validate-with) - Invalid configuration (78) =#=#=#= + * Passed: crm_simulate - Run crm_simulate with invalid CIB (unrecognized validate-with) + =#=#=#= Begin test: Try to make resulting CIB invalid, but possibly recoverable (valid with X.Y+1) =#=#=#= +@@ -326,8 +333,11 @@ update_validation debug: Configuration valid for schema: pacemaker-3.6 + update_validation debug: pacemaker-3.6-style configuration is also valid for pacemaker-3.7 + update_validation debug: Testing 'pacemaker-3.7' validation (21 of X) + update_validation debug: Configuration valid for schema: pacemaker-3.7 +-update_validation trace: Stopping at pacemaker-3.7 +-update_validation info: Transformed the configuration from pacemaker-1.2 to pacemaker-3.7 ++update_validation debug: pacemaker-3.7-style configuration is also valid for pacemaker-3.8 ++update_validation debug: Testing 'pacemaker-3.8' validation (22 of X) ++update_validation debug: Configuration valid for schema: pacemaker-3.8 ++update_validation trace: Stopping at pacemaker-3.8 ++update_validation info: Transformed the configuration from pacemaker-1.2 to pacemaker-3.8 + unpack_resources error: Resource start-up disabled since no STONITH resources have been defined + unpack_resources error: Either configure some or disable STONITH with the stonith-enabled option + unpack_resources error: NOTE: Clusters with shared data need STONITH to ensure data integrity +@@ -437,6 +447,8 @@ element rsc_order: Relax-NG validity error : Invalid attribute first-action for + element rsc_order: Relax-NG validity error : Element constraints has extra content: rsc_order + element rsc_order: Relax-NG validity error : Invalid attribute first-action for element rsc_order + element rsc_order: Relax-NG validity error : Element constraints has extra content: rsc_order ++element rsc_order: Relax-NG validity error : Invalid attribute first-action for element rsc_order ++element rsc_order: Relax-NG validity error : Element 
constraints has extra content: rsc_order + =#=#=#= Current cib after: Make resulting CIB invalid, and without validate-with attribute =#=#=#= + + +@@ -502,6 +514,8 @@ validity.bad.xml:10: element rsc_order: Relax-NG validity error : Invalid attrib + validity.bad.xml:10: element rsc_order: Relax-NG validity error : Element constraints has extra content: rsc_order + validity.bad.xml:10: element rsc_order: Relax-NG validity error : Invalid attribute first-action for element rsc_order + validity.bad.xml:10: element rsc_order: Relax-NG validity error : Element constraints has extra content: rsc_order ++validity.bad.xml:10: element rsc_order: Relax-NG validity error : Invalid attribute first-action for element rsc_order ++validity.bad.xml:10: element rsc_order: Relax-NG validity error : Element constraints has extra content: rsc_order + unpack_resources error: Resource start-up disabled since no STONITH resources have been defined + unpack_resources error: Either configure some or disable STONITH with the stonith-enabled option + unpack_resources error: NOTE: Clusters with shared data need STONITH to ensure data integrity +-- +2.27.0 + diff --git a/SOURCES/001-ping-agent.patch b/SOURCES/001-ping-agent.patch deleted file mode 100644 index 89fe41a..0000000 --- a/SOURCES/001-ping-agent.patch +++ /dev/null @@ -1,225 +0,0 @@ -From c6ee0973522268ed7b3241cf0ec2e06398444114 Mon Sep 17 00:00:00 2001 -From: Grace Chin -Date: Tue, 4 May 2021 12:02:17 -0400 -Subject: [PATCH 1/4] Remove deprecated attrd_options - ---- - extra/resources/ping | 11 +++-------- - 1 file changed, 3 insertions(+), 8 deletions(-) - -diff --git a/extra/resources/ping b/extra/resources/ping -index 3cf8dfe..2e93f22 100755 ---- a/extra/resources/ping -+++ b/extra/resources/ping -@@ -178,7 +178,7 @@ ping_stop() { - - rm -f "${OCF_RESKEY_pidfile}" - -- attrd_updater -D -n "$OCF_RESKEY_name" -d "$OCF_RESKEY_dampen" $attrd_options -+ attrd_updater -D -n "$OCF_RESKEY_name" -d "$OCF_RESKEY_dampen" - - return 
$OCF_SUCCESS - } -@@ -302,9 +302,9 @@ ping_update() { - - score=$(expr $active \* $OCF_RESKEY_multiplier) - if [ "$__OCF_ACTION" = "start" ] ; then -- attrd_updater -n "$OCF_RESKEY_name" -B "$score" -d "$OCF_RESKEY_dampen" $attrd_options -+ attrd_updater -n "$OCF_RESKEY_name" -B "$score" -d "$OCF_RESKEY_dampen" - else -- attrd_updater -n "$OCF_RESKEY_name" -v "$score" -d "$OCF_RESKEY_dampen" $attrd_options -+ attrd_updater -n "$OCF_RESKEY_name" -v "$score" -d "$OCF_RESKEY_dampen" - fi - rc=$? - case $rc in -@@ -396,11 +396,6 @@ case "${OCF_RESKEY_debug}" in - ;; - esac - --attrd_options='-q' --if [ "${OCF_RESKEY_debug}" = "true" ]; then -- attrd_options='' --fi -- - case "$__OCF_ACTION" in - meta-data) meta_data - exit $OCF_SUCCESS --- -1.8.3.1 - - -From 6d6c4691cf0970059689856c354daf9e098b4451 Mon Sep 17 00:00:00 2001 -From: Grace Chin -Date: Tue, 4 May 2021 14:50:37 -0400 -Subject: [PATCH 2/4] Replace debug values, true and false, with 0 and 1 - ---- - extra/resources/ping | 8 ++++---- - 1 file changed, 4 insertions(+), 4 deletions(-) - -diff --git a/extra/resources/ping b/extra/resources/ping -index 2e93f22..fee019b 100755 ---- a/extra/resources/ping -+++ b/extra/resources/ping -@@ -24,7 +24,7 @@ - : ${OCF_RESKEY_dampen:="5s"} - : ${OCF_RESKEY_attempts:="3"} - : ${OCF_RESKEY_multiplier:="1"} --: ${OCF_RESKEY_debug:="false"} -+: ${OCF_RESKEY_debug:="0"} - : ${OCF_RESKEY_failure_score:="0"} - : ${OCF_RESKEY_use_fping:="1"} - : ${OCF_RESKEY_host_list:=""} -@@ -152,7 +152,7 @@ END - - ping_conditional_log() { - level="$1"; shift -- if [ "${OCF_RESKEY_debug}" = "true" ]; then -+ if [ $OCF_RESKEY_debug -gt 0 ]; then - ocf_log "$level" "$*" - fi - } -@@ -388,8 +388,8 @@ fi - - # Check the debug option - case "${OCF_RESKEY_debug}" in -- true|True|TRUE|1) OCF_RESKEY_debug=true;; -- false|False|FALSE|0) OCF_RESKEY_debug=false;; -+ true|True|TRUE|1) OCF_RESKEY_debug=0;; -+ false|False|FALSE|0) OCF_RESKEY_debug=1;; - *) - ocf_log warn "Value for 'debug' is incorrect. 
Please specify 'true' or 'false' not: ${OCF_RESKEY_debug}" - OCF_RESKEY_debug=false --- -1.8.3.1 - - -From a886a31056b6aca764c6911f5432af2c5ebf51df Mon Sep 17 00:00:00 2001 -From: Grace Chin -Date: Tue, 11 May 2021 11:04:50 -0400 -Subject: [PATCH 3/4] Add verbose debug mode which logs ping and fping output - when set - ---- - extra/resources/ping | 19 ++++++++++++++----- - 1 file changed, 14 insertions(+), 5 deletions(-) - -diff --git a/extra/resources/ping b/extra/resources/ping -index fee019b..cc796af 100755 ---- a/extra/resources/ping -+++ b/extra/resources/ping -@@ -249,10 +249,13 @@ fping_check() { - - case $rc in - 0) -+ if [ $OCF_RESKEY_debug -gt 1 ]; then -+ ping_conditional_log info "$output" -+ fi - ;; - 1) - for h in $(echo "$output" | grep "is unreachable" | awk '{print $1}'); do -- ping_conditional_log warn "$h is inactive" -+ ping_conditional_log warn "$h is inactive: $output" - done - ;; - *) -@@ -282,7 +285,12 @@ ping_check() { - p_out=$($p_exe $p_args $OCF_RESKEY_options $host 2>&1); rc=$? - - case $rc in -- 0) active=$(expr $active + 1);; -+ 0) -+ active=$(expr $active + 1) -+ if [ $OCF_RESKEY_debug -gt 1 ]; then -+ ping_conditional_log info "$p_out" -+ fi -+ ;; - 1) ping_conditional_log warn "$host is inactive: $p_out";; - *) ocf_log err "Unexpected result for '$p_exe $p_args $OCF_RESKEY_options $host' $rc: $p_out";; - esac -@@ -388,10 +396,11 @@ fi - - # Check the debug option - case "${OCF_RESKEY_debug}" in -- true|True|TRUE|1) OCF_RESKEY_debug=0;; -- false|False|FALSE|0) OCF_RESKEY_debug=1;; -+ true|True|TRUE|1) OCF_RESKEY_debug=1;; -+ false|False|FALSE|0) OCF_RESKEY_debug=0;; -+ verbose|Verbose|VERBOSE|2) OCF_RESKEY_debug=2;; - *) -- ocf_log warn "Value for 'debug' is incorrect. Please specify 'true' or 'false' not: ${OCF_RESKEY_debug}" -+ ocf_log warn "Value for 'debug' is incorrect. 
Please specify 'true', 'false', or 'verbose', not: ${OCF_RESKEY_debug}" - OCF_RESKEY_debug=false - ;; - esac --- -1.8.3.1 - - -From 460043f133ced80e923b1290af70502a72deb7f8 Mon Sep 17 00:00:00 2001 -From: Grace Chin -Date: Tue, 11 May 2021 11:07:05 -0400 -Subject: [PATCH 4/4] Improve variable names - ---- - extra/resources/ping | 20 ++++++++++---------- - 1 file changed, 10 insertions(+), 10 deletions(-) - -diff --git a/extra/resources/ping b/extra/resources/ping -index cc796af..9763b60 100755 ---- a/extra/resources/ping -+++ b/extra/resources/ping -@@ -244,22 +244,22 @@ fping_check() { - timeout=$(expr $OCF_RESKEY_timeout \* 1000 / $OCF_RESKEY_attempts) - - cmd="$p_exe -r $OCF_RESKEY_attempts -t $timeout -B 1.0 $OCF_RESKEY_options $OCF_RESKEY_host_list" -- output=$($cmd 2>&1); rc=$? -- active=$(echo "$output" | grep "is alive" | wc -l) -+ fping_output=$($cmd 2>&1); rc=$? -+ active=$(echo "$fping_output" | grep "is alive" | wc -l) - - case $rc in - 0) - if [ $OCF_RESKEY_debug -gt 1 ]; then -- ping_conditional_log info "$output" -+ ping_conditional_log info "$fping_output" - fi - ;; - 1) -- for h in $(echo "$output" | grep "is unreachable" | awk '{print $1}'); do -- ping_conditional_log warn "$h is inactive: $output" -+ for h in $(echo "$fping_output" | grep "is unreachable" | awk '{print $1}'); do -+ ping_conditional_log warn "$h is inactive: $fping_output" - done - ;; - *) -- ocf_log err "Unexpected result for '$cmd' $rc: $(echo "$output" | tr '\n' ';')" -+ ocf_log err "Unexpected result for '$cmd' $rc: $(echo "$fping_output" | tr '\n' ';')" - ;; - esac - -@@ -282,17 +282,17 @@ ping_check() { - *:*) p_exe=ping6 - esac - -- p_out=$($p_exe $p_args $OCF_RESKEY_options $host 2>&1); rc=$? -+ ping_output=$($p_exe $p_args $OCF_RESKEY_options $host 2>&1); rc=$? 
- - case $rc in - 0) - active=$(expr $active + 1) - if [ $OCF_RESKEY_debug -gt 1 ]; then -- ping_conditional_log info "$p_out" -+ ping_conditional_log info "$ping_output" - fi - ;; -- 1) ping_conditional_log warn "$host is inactive: $p_out";; -- *) ocf_log err "Unexpected result for '$p_exe $p_args $OCF_RESKEY_options $host' $rc: $p_out";; -+ 1) ping_conditional_log warn "$host is inactive: $ping_output";; -+ *) ocf_log err "Unexpected result for '$p_exe $p_args $OCF_RESKEY_options $host' $rc: $ping_output";; - esac - done - return $active --- -1.8.3.1 - diff --git a/SOURCES/002-fencing-reasons.patch b/SOURCES/002-fencing-reasons.patch new file mode 100644 index 0000000..f89cbec --- /dev/null +++ b/SOURCES/002-fencing-reasons.patch @@ -0,0 +1,2100 @@ +From 95b4f87aae5fb2cf771cf9a8f8e5420b65fb213f Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Tue, 21 Sep 2021 10:47:51 -0500 +Subject: [PATCH 01/12] Refactor: fencing: use pcmk__action_result_t in + stonith_action_t + +stonith_action_t previously had an rc member for a legacy return code, along +with output and error members for action stdout/stderr. When setting rc based +on the svc_action_t result, it used a mapping function svc_action_to_errno(). + +This replaces those with a pcmk__action_result_t member, which means we now +track the exit status and execution status as originally set by libcrmservice, +rather than the mapped rc. The library now calls the mapping function, now +returning standard codes and called result2rc(), when calling the client +callback. + +The exit_reason member is unused as of this commit. + +The behavior should be identical, with the small exception of +services_action_async() failure leaving the exit status as set by the services +library, which means callers will get the result2rc() mapping of the actual +result instead of the former -ECONNABORTED. 
+--- + lib/fencing/st_client.c | 118 +++++++++++++++++++++++----------------- + 1 file changed, 68 insertions(+), 50 deletions(-) + +diff --git a/lib/fencing/st_client.c b/lib/fencing/st_client.c +index 08adb51c6..6c607b010 100644 +--- a/lib/fencing/st_client.c ++++ b/lib/fencing/st_client.c +@@ -29,6 +29,7 @@ + #include + #include + #include ++#include + + #include + +@@ -57,9 +58,7 @@ struct stonith_action_s { + int max_retries; + + int pid; +- int rc; +- char *output; +- char *error; ++ pcmk__action_result_t result; + }; + + typedef struct stonith_private_s { +@@ -120,6 +119,7 @@ static void stonith_connection_destroy(gpointer user_data); + static void stonith_send_notification(gpointer data, gpointer user_data); + static int internal_stonith_action_execute(stonith_action_t * action); + static void log_action(stonith_action_t *action, pid_t pid); ++static int result2rc(const pcmk__action_result_t *result); + + /*! + * \brief Get agent namespace by name +@@ -196,6 +196,23 @@ stonith_get_namespace(const char *agent, const char *namespace_s) + return st_namespace_invalid; + } + ++/*! 
++ * \internal ++ * \brief Set an action's result based on services library result ++ * ++ * \param[in] action Fence action to set result for ++ * \param[in] svc_action Service action to get result from ++ */ ++static void ++set_result_from_svc_action(stonith_action_t *action, svc_action_t *svc_action) ++{ ++ pcmk__set_result(&(action->result), svc_action->rc, svc_action->status, ++ NULL); ++ pcmk__set_result_output(&(action->result), ++ services__grab_stdout(svc_action), ++ services__grab_stderr(svc_action)); ++} ++ + gboolean + stonith__watchdog_fencing_enabled_for_node_api(stonith_t *st, const char *node) + { +@@ -259,19 +276,19 @@ stonith__watchdog_fencing_enabled_for_node(const char *node) + static void + log_action(stonith_action_t *action, pid_t pid) + { +- if (action->output) { ++ if (action->result.action_stdout != NULL) { + /* Logging the whole string confuses syslog when the string is xml */ + char *prefix = crm_strdup_printf("%s[%d] stdout:", action->agent, pid); + +- crm_log_output(LOG_TRACE, prefix, action->output); ++ crm_log_output(LOG_TRACE, prefix, action->result.action_stdout); + free(prefix); + } + +- if (action->error) { ++ if (action->result.action_stderr != NULL) { + /* Logging the whole string confuses syslog when the string is xml */ + char *prefix = crm_strdup_printf("%s[%d] stderr:", action->agent, pid); + +- crm_log_output(LOG_WARNING, prefix, action->error); ++ crm_log_output(LOG_WARNING, prefix, action->result.action_stderr); + free(prefix); + } + } +@@ -645,8 +662,7 @@ stonith__destroy_action(stonith_action_t *action) + if (action->svc_action) { + services_action_free(action->svc_action); + } +- free(action->output); +- free(action->error); ++ pcmk__reset_result(&(action->result)); + free(action); + } + } +@@ -678,15 +694,15 @@ stonith__action_result(stonith_action_t *action, int *rc, char **output, + } + if (action != NULL) { + if (rc) { +- *rc = action->rc; ++ *rc = pcmk_rc2legacy(result2rc(&(action->result))); + } +- if (output && 
action->output) { +- *output = action->output; +- action->output = NULL; // hand off memory management to caller ++ if ((output != NULL) && (action->result.action_stdout != NULL)) { ++ *output = action->result.action_stdout; ++ action->result.action_stdout = NULL; // hand off ownership to caller + } +- if (error_output && action->error) { +- *error_output = action->error; +- action->error = NULL; // hand off memory management to caller ++ if ((error_output != NULL) && (action->result.action_stderr != NULL)) { ++ *error_output = action->result.action_stderr; ++ action->result.action_stderr = NULL; // hand off ownership to caller + } + } + } +@@ -715,6 +731,9 @@ stonith_action_create(const char *agent, + action->timeout = action->remaining_timeout = timeout; + action->max_retries = FAILURE_MAX_RETRIES; + ++ pcmk__set_result(&(action->result), PCMK_OCF_UNKNOWN, PCMK_EXEC_UNKNOWN, ++ NULL); ++ + if (device_args) { + char buffer[512]; + const char *value = NULL; +@@ -739,7 +758,8 @@ update_remaining_timeout(stonith_action_t * action) + crm_info("Attempted to execute agent %s (%s) the maximum number of times (%d) allowed", + action->agent, action->action, action->max_retries); + action->remaining_timeout = 0; +- } else if ((action->rc != -ETIME) && diff < (action->timeout * 0.7)) { ++ } else if ((action->result.execution_status != PCMK_EXEC_TIMEOUT) ++ && (diff < (action->timeout * 0.7))) { + /* only set remaining timeout period if there is 30% + * or greater of the original timeout period left */ + action->remaining_timeout = action->timeout - diff; +@@ -750,31 +770,31 @@ update_remaining_timeout(stonith_action_t * action) + } + + static int +-svc_action_to_errno(svc_action_t *svc_action) { +- int rv = pcmk_ok; ++result2rc(const pcmk__action_result_t *result) { ++ int rc = pcmk_rc_ok; + +- if (svc_action->status == PCMK_EXEC_TIMEOUT) { +- rv = -ETIME; ++ if (result->execution_status == PCMK_EXEC_TIMEOUT) { ++ rc = ETIME; + +- } else if (svc_action->rc != PCMK_OCF_OK) { 
++ } else if (result->exit_status != CRM_EX_OK) { + /* Try to provide a useful error code based on the fence agent's + * error output. + */ +- if (svc_action->stderr_data == NULL) { +- rv = -ENODATA; ++ if (result->action_stderr == NULL) { ++ rc = ENODATA; + +- } else if (strstr(svc_action->stderr_data, "imed out")) { ++ } else if (strstr(result->action_stderr, "imed out")) { + /* Some agents have their own internal timeouts */ +- rv = -ETIME; ++ rc = ETIME; + +- } else if (strstr(svc_action->stderr_data, "Unrecognised action")) { +- rv = -EOPNOTSUPP; ++ } else if (strstr(result->action_stderr, "Unrecognised action")) { ++ rc = EOPNOTSUPP; + + } else { +- rv = -pcmk_err_generic; ++ rc = pcmk_rc_error; + } + } +- return rv; ++ return rc; + } + + static void +@@ -782,11 +802,7 @@ stonith_action_async_done(svc_action_t *svc_action) + { + stonith_action_t *action = (stonith_action_t *) svc_action->cb_data; + +- action->rc = svc_action_to_errno(svc_action); +- action->output = svc_action->stdout_data; +- svc_action->stdout_data = NULL; +- action->error = svc_action->stderr_data; +- svc_action->stderr_data = NULL; ++ set_result_from_svc_action(action, svc_action); + + svc_action->params = NULL; + +@@ -795,7 +811,9 @@ stonith_action_async_done(svc_action_t *svc_action) + + log_action(action, action->pid); + +- if (action->rc != pcmk_ok && update_remaining_timeout(action)) { ++ if ((action->result.exit_status != CRM_EX_OK) ++ && update_remaining_timeout(action)) { ++ + int rc = internal_stonith_action_execute(action); + if (rc == pcmk_ok) { + return; +@@ -803,7 +821,8 @@ stonith_action_async_done(svc_action_t *svc_action) + } + + if (action->done_cb) { +- action->done_cb(action->pid, action->rc, action->output, action->userdata); ++ action->done_cb(action->pid, pcmk_rc2legacy(result2rc(&(action->result))), ++ action->result.action_stdout, action->userdata); + } + + action->svc_action = NULL; // don't remove our caller +@@ -835,9 +854,13 @@ 
internal_stonith_action_execute(stonith_action_t * action) + static int stonith_sequence = 0; + char *buffer = NULL; + +- if ((action == NULL) || (action->action == NULL) || (action->args == NULL) ++ CRM_CHECK(action != NULL, return -EINVAL); ++ ++ if ((action->action == NULL) || (action->args == NULL) + || (action->agent == NULL)) { +- return -EPROTO; ++ pcmk__set_result(&(action->result), PCMK_OCF_UNKNOWN_ERROR, ++ PCMK_EXEC_ERROR_FATAL, NULL); ++ return -EINVAL; + } + + if (!action->tries) { +@@ -857,6 +880,7 @@ internal_stonith_action_execute(stonith_action_t * action) + free(buffer); + + if (svc_action->rc != PCMK_OCF_UNKNOWN) { ++ set_result_from_svc_action(action, svc_action); + services_action_free(svc_action); + return -E2BIG; + } +@@ -877,10 +901,7 @@ internal_stonith_action_execute(stonith_action_t * action) + + /* keep retries from executing out of control and free previous results */ + if (is_retry) { +- free(action->output); +- action->output = NULL; +- free(action->error); +- action->error = NULL; ++ pcmk__reset_result(&(action->result)); + sleep(1); + } + +@@ -889,22 +910,19 @@ internal_stonith_action_execute(stonith_action_t * action) + if (services_action_async_fork_notify(svc_action, + &stonith_action_async_done, + &stonith_action_async_forked)) { ++ pcmk__set_result(&(action->result), PCMK_OCF_UNKNOWN, ++ PCMK_EXEC_PENDING, NULL); + return pcmk_ok; + } + + } else if (services_action_sync(svc_action)) { // sync success + rc = pcmk_ok; +- action->rc = svc_action_to_errno(svc_action); +- action->output = svc_action->stdout_data; +- svc_action->stdout_data = NULL; +- action->error = svc_action->stderr_data; +- svc_action->stderr_data = NULL; + + } else { // sync failure +- action->rc = -ECONNABORTED; +- rc = action->rc; ++ rc = -ECONNABORTED; + } + ++ set_result_from_svc_action(action, svc_action); + svc_action->params = NULL; + services_action_free(svc_action); + return rc; +-- +2.27.0 + + +From 4c8e0b0ecc53cb3883f0da0eede20b900fff48d1 Mon Sep 17 
00:00:00 2001 +From: Ken Gaillot +Date: Tue, 21 Sep 2021 11:14:31 -0500 +Subject: [PATCH 02/12] Low: fencing: improve return code given back to library + callers + +Expose result2rc() internally for future reuse, and expand it to handle more +cases. In theory, this can give us better log messages and status output for +failures. +--- + include/crm/fencing/internal.h | 1 + + lib/fencing/st_client.c | 63 +++++++++++++++++++++------------- + 2 files changed, 41 insertions(+), 23 deletions(-) + +diff --git a/include/crm/fencing/internal.h b/include/crm/fencing/internal.h +index fa9059e6f..0d23967bb 100644 +--- a/include/crm/fencing/internal.h ++++ b/include/crm/fencing/internal.h +@@ -60,6 +60,7 @@ stonith_action_t *stonith_action_create(const char *agent, + void stonith__destroy_action(stonith_action_t *action); + void stonith__action_result(stonith_action_t *action, int *rc, char **output, + char **error_output); ++int stonith__result2rc(const pcmk__action_result_t *result); + + int + stonith_action_execute_async(stonith_action_t * action, +diff --git a/lib/fencing/st_client.c b/lib/fencing/st_client.c +index 6c607b010..809be1640 100644 +--- a/lib/fencing/st_client.c ++++ b/lib/fencing/st_client.c +@@ -119,7 +119,6 @@ static void stonith_connection_destroy(gpointer user_data); + static void stonith_send_notification(gpointer data, gpointer user_data); + static int internal_stonith_action_execute(stonith_action_t * action); + static void log_action(stonith_action_t *action, pid_t pid); +-static int result2rc(const pcmk__action_result_t *result); + + /*! 
+ * \brief Get agent namespace by name +@@ -694,7 +693,7 @@ stonith__action_result(stonith_action_t *action, int *rc, char **output, + } + if (action != NULL) { + if (rc) { +- *rc = pcmk_rc2legacy(result2rc(&(action->result))); ++ *rc = pcmk_rc2legacy(stonith__result2rc(&(action->result))); + } + if ((output != NULL) && (action->result.action_stdout != NULL)) { + *output = action->result.action_stdout; +@@ -769,32 +768,49 @@ update_remaining_timeout(stonith_action_t * action) + return action->remaining_timeout ? TRUE : FALSE; + } + +-static int +-result2rc(const pcmk__action_result_t *result) { +- int rc = pcmk_rc_ok; ++/*! ++ * \internal ++ * \brief Map a fencing action result to a standard return code ++ * ++ * \param[in] result Fencing action result to map ++ * ++ * \return Standard Pacemaker return code that best corresponds to \p result ++ */ ++int ++stonith__result2rc(const pcmk__action_result_t *result) ++{ ++ switch (result->execution_status) { ++ case PCMK_EXEC_CANCELLED: return ECANCELED; ++ case PCMK_EXEC_TIMEOUT: return ETIME; ++ case PCMK_EXEC_NOT_INSTALLED: return ENOENT; ++ case PCMK_EXEC_NOT_SUPPORTED: return EOPNOTSUPP; ++ case PCMK_EXEC_NOT_CONNECTED: return ENOTCONN; ++ case PCMK_EXEC_NO_FENCE_DEVICE: return ENODEV; ++ case PCMK_EXEC_NO_SECRETS: return EACCES; ++ default: break; ++ } + +- if (result->execution_status == PCMK_EXEC_TIMEOUT) { +- rc = ETIME; ++ if (result->exit_status == CRM_EX_OK) { ++ return pcmk_rc_ok; ++ } + +- } else if (result->exit_status != CRM_EX_OK) { +- /* Try to provide a useful error code based on the fence agent's +- * error output. 
+- */ +- if (result->action_stderr == NULL) { +- rc = ENODATA; ++ // Try to provide useful error code based on result's error output + +- } else if (strstr(result->action_stderr, "imed out")) { +- /* Some agents have their own internal timeouts */ +- rc = ETIME; ++ if (result->action_stderr == NULL) { ++ return ENODATA; + +- } else if (strstr(result->action_stderr, "Unrecognised action")) { +- rc = EOPNOTSUPP; ++ } else if (strcasestr(result->action_stderr, "timed out") ++ || strcasestr(result->action_stderr, "timeout")) { ++ return ETIME; + +- } else { +- rc = pcmk_rc_error; +- } ++ } else if (strcasestr(result->action_stderr, "unrecognised action") ++ || strcasestr(result->action_stderr, "unrecognized action") ++ || strcasestr(result->action_stderr, "unsupported action")) { ++ return EOPNOTSUPP; + } +- return rc; ++ ++ // Oh well, we tried ++ return pcmk_rc_error; + } + + static void +@@ -821,7 +837,8 @@ stonith_action_async_done(svc_action_t *svc_action) + } + + if (action->done_cb) { +- action->done_cb(action->pid, pcmk_rc2legacy(result2rc(&(action->result))), ++ action->done_cb(action->pid, ++ pcmk_rc2legacy(stonith__result2rc(&(action->result))), + action->result.action_stdout, action->userdata); + } + +-- +2.27.0 + + +From 153c9b552a5bad9dd36e8635fa478ed9cad1f240 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Thu, 7 Oct 2021 11:35:44 -0500 +Subject: [PATCH 03/12] Refactor: fencing: return full result from + stonith__action_result() + +Previously, stonith__action_result() grabbed an action's legacy rc, stdout, and +stderr separately. Now, directly return a pointer to the action's result +object, and map that to a legacy rc in the callers when needed. 
+--- + include/crm/fencing/internal.h | 3 +-- + lib/fencing/st_client.c | 36 ++++--------------------- + lib/fencing/st_rhcs.c | 48 ++++++++++++++++++++++++---------- + 3 files changed, 40 insertions(+), 47 deletions(-) + +diff --git a/include/crm/fencing/internal.h b/include/crm/fencing/internal.h +index 0d23967bb..4e9f50fe8 100644 +--- a/include/crm/fencing/internal.h ++++ b/include/crm/fencing/internal.h +@@ -58,8 +58,7 @@ stonith_action_t *stonith_action_create(const char *agent, + GHashTable * port_map, + const char * host_arg); + void stonith__destroy_action(stonith_action_t *action); +-void stonith__action_result(stonith_action_t *action, int *rc, char **output, +- char **error_output); ++pcmk__action_result_t *stonith__action_result(stonith_action_t *action); + int stonith__result2rc(const pcmk__action_result_t *result); + + int +diff --git a/lib/fencing/st_client.c b/lib/fencing/st_client.c +index 809be1640..b9df18465 100644 +--- a/lib/fencing/st_client.c ++++ b/lib/fencing/st_client.c +@@ -670,40 +670,14 @@ stonith__destroy_action(stonith_action_t *action) + * \internal + * \brief Get the result of an executed stonith action + * +- * \param[in,out] action Executed action +- * \param[out] rc Where to store result code (or NULL) +- * \param[out] output Where to store standard output (or NULL) +- * \param[out] error_output Where to store standard error output (or NULL) ++ * \param[in] action Executed action + * +- * \note If output or error_output is not NULL, the caller is responsible for +- * freeing the memory. 
++ * \return Pointer to action's result (or NULL if \p action is NULL) + */ +-void +-stonith__action_result(stonith_action_t *action, int *rc, char **output, +- char **error_output) ++pcmk__action_result_t * ++stonith__action_result(stonith_action_t *action) + { +- if (rc) { +- *rc = pcmk_ok; +- } +- if (output) { +- *output = NULL; +- } +- if (error_output) { +- *error_output = NULL; +- } +- if (action != NULL) { +- if (rc) { +- *rc = pcmk_rc2legacy(stonith__result2rc(&(action->result))); +- } +- if ((output != NULL) && (action->result.action_stdout != NULL)) { +- *output = action->result.action_stdout; +- action->result.action_stdout = NULL; // hand off ownership to caller +- } +- if ((error_output != NULL) && (action->result.action_stderr != NULL)) { +- *error_output = action->result.action_stderr; +- action->result.action_stderr = NULL; // hand off ownership to caller +- } +- } ++ return (action == NULL)? NULL : &(action->result); + } + + #define FAILURE_MAX_RETRIES 2 +diff --git a/lib/fencing/st_rhcs.c b/lib/fencing/st_rhcs.c +index 89a2625bd..23e694975 100644 +--- a/lib/fencing/st_rhcs.c ++++ b/lib/fencing/st_rhcs.c +@@ -1,5 +1,5 @@ + /* +- * Copyright 2004-2020 the Pacemaker project contributors ++ * Copyright 2004-2021 the Pacemaker project contributors + * + * The version control history for this file may have further details. 
+ * +@@ -123,10 +123,10 @@ stonith_rhcs_parameter_not_required(xmlNode *metadata, const char *parameter) + static int + stonith__rhcs_get_metadata(const char *agent, int timeout, xmlNode **metadata) + { +- char *buffer = NULL; + xmlNode *xml = NULL; + xmlNode *actions = NULL; + xmlXPathObject *xpathObj = NULL; ++ pcmk__action_result_t *result = NULL; + stonith_action_t *action = stonith_action_create(agent, "metadata", NULL, 0, + 5, NULL, NULL, NULL); + int rc = stonith__execute(action); +@@ -138,23 +138,31 @@ stonith__rhcs_get_metadata(const char *agent, int timeout, xmlNode **metadata) + return rc; + } + +- stonith__action_result(action, &rc, &buffer, NULL); +- stonith__destroy_action(action); +- if (rc < 0) { +- crm_warn("Metadata action for %s failed: %s " CRM_XS "rc=%d", +- agent, pcmk_strerror(rc), rc); +- free(buffer); +- return rc; ++ result = stonith__action_result(action); ++ rc = pcmk_rc2legacy(stonith__result2rc(result)); // before action is freed ++ if (result->execution_status != PCMK_EXEC_DONE) { ++ crm_warn("Could not execute metadata action for %s: %s", ++ agent, pcmk_exec_status_str(result->execution_status)); ++ stonith__destroy_action(action); ++ return rc; + } + +- if (buffer == NULL) { ++ if (result->exit_status != CRM_EX_OK) { ++ crm_warn("Metadata action for %s returned error code %d", ++ agent, result->exit_status); ++ stonith__destroy_action(action); ++ return rc; ++ } ++ ++ if (result->action_stdout == NULL) { + crm_warn("Metadata action for %s returned no data", agent); ++ stonith__destroy_action(action); + return -ENODATA; + } + +- xml = string2xml(buffer); +- free(buffer); +- buffer = NULL; ++ xml = string2xml(result->action_stdout); ++ stonith__destroy_action(action); ++ + if (xml == NULL) { + crm_warn("Metadata for %s is invalid", agent); + return -pcmk_err_schema_validation; +@@ -289,7 +297,19 @@ stonith__rhcs_validate(stonith_t *st, int call_options, const char *target, + + rc = stonith__execute(action); + if (rc == pcmk_ok) { 
+- stonith__action_result(action, &rc, output, error_output); ++ pcmk__action_result_t *result = stonith__action_result(action); ++ ++ rc = pcmk_rc2legacy(stonith__result2rc(result)); ++ ++ // Take ownership of output so stonith__destroy_action() doesn't free it ++ if (output != NULL) { ++ *output = result->action_stdout; ++ result->action_stdout = NULL; ++ } ++ if (error_output != NULL) { ++ *error_output = result->action_stderr; ++ result->action_stderr = NULL; ++ } + } + stonith__destroy_action(action); + return rc; +-- +2.27.0 + + +From 7f7067014357cccb229a0bef091e234eb3765f7a Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Tue, 21 Sep 2021 13:05:54 -0500 +Subject: [PATCH 04/12] Refactor: fencing: pass full result to async action + callback + +When executing an asynchronous fence agent command, the fencing library gets +the full result (exit status, execution status, and exit reason) from the +services library, then maps that to a legacy return code. + +Now, pass the full result object to the fencing async callback, rather than +separate arguments for legacy code and stdout. The mapping to a legacy code now +happens in the fencer rather than the fencing library. + +The goal of this and following commits is to push the full result object +further down the code path, so that ultimately the full result is always +available internally, and the legacy code mapping is only done for backward +compatibility when sending the result back to a client. + +This commit focuses on the async callback (done_cb() in both the fencer's +async_command_t and the fencing library's stonith_action_t). 
Later commits will +follow the chain: + + st_child_done() and stonith_fence_get_devices_cb() + -> stonith_send_async_reply() + -> stonith_construct_async_reply() and log_async_result() +--- + daemons/fenced/fenced_commands.c | 78 +++++++++++++++++++++----------- + include/crm/fencing/internal.h | 3 +- + lib/fencing/st_client.c | 10 ++-- + 3 files changed, 58 insertions(+), 33 deletions(-) + +diff --git a/daemons/fenced/fenced_commands.c b/daemons/fenced/fenced_commands.c +index b5ae28d90..d5d04ae69 100644 +--- a/daemons/fenced/fenced_commands.c ++++ b/daemons/fenced/fenced_commands.c +@@ -62,7 +62,8 @@ struct device_search_s { + }; + + static gboolean stonith_device_dispatch(gpointer user_data); +-static void st_child_done(int pid, int rc, const char *output, void *user_data); ++static void st_child_done(int pid, const pcmk__action_result_t *result, ++ void *user_data); + static void stonith_send_reply(xmlNode * reply, int call_options, const char *remote_peer, + const char *client_id); + +@@ -99,7 +100,8 @@ typedef struct async_command_s { + GList *device_next; + + void *internal_user_data; +- void (*done_cb) (int pid, int rc, const char *output, void *user_data); ++ void (*done_cb) (int pid, const pcmk__action_result_t *result, ++ void *user_data); + guint timer_sigterm; + guint timer_sigkill; + /*! 
If the operation timed out, this is the last signal +@@ -377,13 +379,25 @@ get_agent_metadata_cb(gpointer data) { + * \internal + * \brief Call a command's action callback for an internal (not library) result + * +- * \param[in] cmd Command to report result for +- * \param[in] rc Legacy return code to pass to callback ++ * \param[in] cmd Command to report result for ++ * \param[in] execution_status Execution status to use for result ++ * \param[in] exit_status Exit status to use for result ++ * \param[in] exit_reason Exit reason to use for result + */ + static void +-report_internal_result(async_command_t *cmd, int rc) ++report_internal_result(async_command_t *cmd, int exit_status, ++ int execution_status, const char *exit_reason) + { +- cmd->done_cb(0, rc, NULL, cmd); ++ pcmk__action_result_t result = { ++ // Ensure we don't pass garbage to free() ++ .exit_reason = NULL, ++ .action_stdout = NULL, ++ .action_stderr = NULL ++ }; ++ ++ pcmk__set_result(&result, exit_status, execution_status, exit_reason); ++ cmd->done_cb(0, &result, cmd); ++ pcmk__reset_result(&result); + } + + static gboolean +@@ -446,7 +460,7 @@ stonith_device_execute(stonith_device_t * device) + } + } else { + crm_info("Faking success for %s watchdog operation", cmd->action); +- report_internal_result(cmd, pcmk_ok); ++ report_internal_result(cmd, CRM_EX_OK, PCMK_EXEC_DONE, NULL); + goto done; + } + } +@@ -462,7 +476,8 @@ stonith_device_execute(stonith_device_t * device) + crm_err("Considering %s unconfigured " + "because unable to load CIB secrets: %s", + device->id, pcmk_rc_str(exec_rc)); +- report_internal_result(cmd, -EACCES); ++ report_internal_result(cmd, CRM_EX_ERROR, PCMK_EXEC_NO_SECRETS, ++ NULL); + goto done; + } + } +@@ -501,7 +516,7 @@ stonith_device_execute(stonith_device_t * device) + cmd->done_cb, fork_cb); + if (exec_rc < 0) { + cmd->activating_on = NULL; +- report_internal_result(cmd, exec_rc); ++ cmd->done_cb(0, stonith__action_result(action), cmd); + 
stonith__destroy_action(action); + } + +@@ -625,7 +640,8 @@ free_device(gpointer data) + async_command_t *cmd = gIter->data; + + crm_warn("Removal of device '%s' purged operation '%s'", device->id, cmd->action); +- report_internal_result(cmd, -ENODEV); ++ report_internal_result(cmd, CRM_EX_ERROR, PCMK_EXEC_NO_FENCE_DEVICE, ++ NULL); + } + g_list_free(device->pending_ops); + +@@ -1079,7 +1095,8 @@ schedule_internal_command(const char *origin, + const char *victim, + int timeout, + void *internal_user_data, +- void (*done_cb) (int pid, int rc, const char *output, ++ void (*done_cb) (int pid, ++ const pcmk__action_result_t *result, + void *user_data)) + { + async_command_t *cmd = NULL; +@@ -1111,7 +1128,7 @@ enum fence_status_code { + }; + + static void +-status_search_cb(int pid, int rc, const char *output, void *user_data) ++status_search_cb(int pid, const pcmk__action_result_t *result, void *user_data) + { + async_command_t *cmd = user_data; + struct device_search_s *search = cmd->internal_user_data; +@@ -1127,7 +1144,7 @@ status_search_cb(int pid, int rc, const char *output, void *user_data) + + mainloop_set_trigger(dev->work); + +- switch (rc) { ++ switch (result->exit_status) { + case fence_status_unknown: + crm_trace("%s reported it cannot fence %s", dev->id, search->host); + break; +@@ -1141,14 +1158,15 @@ status_search_cb(int pid, int rc, const char *output, void *user_data) + default: + crm_warn("Assuming %s cannot fence %s " + "(status returned unknown code %d)", +- dev->id, search->host, rc); ++ dev->id, search->host, result->exit_status); + break; + } + search_devices_record_result(search, dev->id, can); + } + + static void +-dynamic_list_search_cb(int pid, int rc, const char *output, void *user_data) ++dynamic_list_search_cb(int pid, const pcmk__action_result_t *result, ++ void *user_data) + { + async_command_t *cmd = user_data; + struct device_search_s *search = cmd->internal_user_data; +@@ -1169,21 +1187,21 @@ dynamic_list_search_cb(int pid, int rc, 
const char *output, void *user_data) + + mainloop_set_trigger(dev->work); + +- if (rc == CRM_EX_OK) { ++ if (result->exit_status == CRM_EX_OK) { + crm_info("Refreshing target list for %s", dev->id); + g_list_free_full(dev->targets, free); +- dev->targets = stonith__parse_targets(output); ++ dev->targets = stonith__parse_targets(result->action_stdout); + dev->targets_age = time(NULL); + + } else if (dev->targets != NULL) { + crm_info("Reusing most recent target list for %s " + "because list returned error code %d", +- dev->id, rc); ++ dev->id, result->exit_status); + + } else { // We have never successfully executed list + crm_warn("Assuming %s cannot fence %s " + "because list returned error code %d", +- dev->id, search->host, rc); ++ dev->id, search->host, result->exit_status); + + /* Fall back to pcmk_host_check="status" if the user didn't explicitly + * specify "dynamic-list". +@@ -2407,7 +2425,7 @@ cancel_stonith_command(async_command_t * cmd) + } + + static void +-st_child_done(int pid, int rc, const char *output, void *user_data) ++st_child_done(int pid, const pcmk__action_result_t *result, void *user_data) + { + stonith_device_t *device = NULL; + stonith_device_t *next_device = NULL; +@@ -2423,7 +2441,7 @@ st_child_done(int pid, int rc, const char *output, void *user_data) + /* The device is ready to do something else now */ + device = g_hash_table_lookup(device_list, cmd->device); + if (device) { +- if (!device->verified && (rc == pcmk_ok) && ++ if (!device->verified && (result->exit_status == CRM_EX_OK) && + (pcmk__strcase_any_of(cmd->action, "list", "monitor", "status", NULL))) { + + device->verified = TRUE; +@@ -2432,7 +2450,7 @@ st_child_done(int pid, int rc, const char *output, void *user_data) + mainloop_set_trigger(device->work); + } + +- if (rc == 0) { ++ if (result->exit_status == CRM_EX_OK) { + GList *iter; + /* see if there are any required devices left to execute for this op */ + for (iter = cmd->device_next; iter != NULL; iter = iter->next) { 
+@@ -2445,7 +2463,8 @@ st_child_done(int pid, int rc, const char *output, void *user_data) + next_device = NULL; + } + +- } else if (rc != 0 && cmd->device_next && (is_action_required(cmd->action, device) == FALSE)) { ++ } else if ((cmd->device_next != NULL) ++ && !is_action_required(cmd->action, device)) { + /* if this device didn't work out, see if there are any others we can try. + * if the failed device was 'required', we can't pick another device. */ + next_device = g_hash_table_lookup(device_list, cmd->device_next->data); +@@ -2454,16 +2473,19 @@ st_child_done(int pid, int rc, const char *output, void *user_data) + + /* this operation requires more fencing, hooray! */ + if (next_device) { +- log_async_result(cmd, rc, pid, next_device->id, output, FALSE); ++ log_async_result(cmd, pcmk_rc2legacy(stonith__result2rc(result)), pid, ++ next_device->id, result->action_stdout, FALSE); + schedule_stonith_command(cmd, next_device); + /* Prevent cmd from being freed */ + cmd = NULL; + goto done; + } + +- stonith_send_async_reply(cmd, output, rc, pid, false); ++ stonith_send_async_reply(cmd, result->action_stdout, ++ pcmk_rc2legacy(stonith__result2rc(result)), pid, ++ false); + +- if (rc != 0) { ++ if (result->exit_status != CRM_EX_OK) { + goto done; + } + +@@ -2509,7 +2531,9 @@ st_child_done(int pid, int rc, const char *output, void *user_data) + + cmd_list = g_list_remove_link(cmd_list, gIter); + +- stonith_send_async_reply(cmd_other, output, rc, pid, true); ++ stonith_send_async_reply(cmd_other, result->action_stdout, ++ pcmk_rc2legacy(stonith__result2rc(result)), ++ pid, true); + cancel_stonith_command(cmd_other); + + free_async_command(cmd_other); +diff --git a/include/crm/fencing/internal.h b/include/crm/fencing/internal.h +index 4e9f50fe8..6a7e4232c 100644 +--- a/include/crm/fencing/internal.h ++++ b/include/crm/fencing/internal.h +@@ -64,7 +64,8 @@ int stonith__result2rc(const pcmk__action_result_t *result); + int + stonith_action_execute_async(stonith_action_t * 
action, + void *userdata, +- void (*done) (int pid, int rc, const char *output, ++ void (*done) (int pid, ++ const pcmk__action_result_t *result, + void *user_data), + void (*fork_cb) (int pid, void *user_data)); + +diff --git a/lib/fencing/st_client.c b/lib/fencing/st_client.c +index b9df18465..59dcab9a3 100644 +--- a/lib/fencing/st_client.c ++++ b/lib/fencing/st_client.c +@@ -46,7 +46,8 @@ struct stonith_action_s { + int timeout; + int async; + void *userdata; +- void (*done_cb) (int pid, int status, const char *output, void *user_data); ++ void (*done_cb) (int pid, const pcmk__action_result_t *result, ++ void *user_data); + void (*fork_cb) (int pid, void *user_data); + + svc_action_t *svc_action; +@@ -811,9 +812,7 @@ stonith_action_async_done(svc_action_t *svc_action) + } + + if (action->done_cb) { +- action->done_cb(action->pid, +- pcmk_rc2legacy(stonith__result2rc(&(action->result))), +- action->result.action_stdout, action->userdata); ++ action->done_cb(action->pid, &(action->result), action->userdata); + } + + action->svc_action = NULL; // don't remove our caller +@@ -933,7 +932,8 @@ internal_stonith_action_execute(stonith_action_t * action) + int + stonith_action_execute_async(stonith_action_t * action, + void *userdata, +- void (*done) (int pid, int rc, const char *output, ++ void (*done) (int pid, ++ const pcmk__action_result_t *result, + void *user_data), + void (*fork_cb) (int pid, void *user_data)) + { +-- +2.27.0 + + +From bbd022306df7a873c0ecb2be2d33c56fbf327b8c Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Tue, 21 Sep 2021 11:51:28 -0500 +Subject: [PATCH 05/12] Feature: fencing: set exit reason for internal + execution errors + +... most importantly, copying any exit reason set by the services library. +This ensures that the stonith_action_t exit reason is set when appropriate. +However, nothing uses it as of this commit. 
+--- + daemons/fenced/fenced_commands.c | 4 ++-- + lib/fencing/st_client.c | 6 +++--- + 2 files changed, 5 insertions(+), 5 deletions(-) + +diff --git a/daemons/fenced/fenced_commands.c b/daemons/fenced/fenced_commands.c +index d5d04ae69..f55a32649 100644 +--- a/daemons/fenced/fenced_commands.c ++++ b/daemons/fenced/fenced_commands.c +@@ -477,7 +477,7 @@ stonith_device_execute(stonith_device_t * device) + "because unable to load CIB secrets: %s", + device->id, pcmk_rc_str(exec_rc)); + report_internal_result(cmd, CRM_EX_ERROR, PCMK_EXEC_NO_SECRETS, +- NULL); ++ "Failed to get CIB secrets"); + goto done; + } + } +@@ -641,7 +641,7 @@ free_device(gpointer data) + + crm_warn("Removal of device '%s' purged operation '%s'", device->id, cmd->action); + report_internal_result(cmd, CRM_EX_ERROR, PCMK_EXEC_NO_FENCE_DEVICE, +- NULL); ++ "Device was removed before action could be executed"); + } + g_list_free(device->pending_ops); + +diff --git a/lib/fencing/st_client.c b/lib/fencing/st_client.c +index 59dcab9a3..3d4127eff 100644 +--- a/lib/fencing/st_client.c ++++ b/lib/fencing/st_client.c +@@ -207,7 +207,7 @@ static void + set_result_from_svc_action(stonith_action_t *action, svc_action_t *svc_action) + { + pcmk__set_result(&(action->result), svc_action->rc, svc_action->status, +- NULL); ++ services__exit_reason(svc_action)); + pcmk__set_result_output(&(action->result), + services__grab_stdout(svc_action), + services__grab_stderr(svc_action)); +@@ -706,7 +706,7 @@ stonith_action_create(const char *agent, + action->max_retries = FAILURE_MAX_RETRIES; + + pcmk__set_result(&(action->result), PCMK_OCF_UNKNOWN, PCMK_EXEC_UNKNOWN, +- NULL); ++ "Initialization bug in fencing library"); + + if (device_args) { + char buffer[512]; +@@ -849,7 +849,7 @@ internal_stonith_action_execute(stonith_action_t * action) + if ((action->action == NULL) || (action->args == NULL) + || (action->agent == NULL)) { + pcmk__set_result(&(action->result), PCMK_OCF_UNKNOWN_ERROR, +- PCMK_EXEC_ERROR_FATAL, 
NULL); ++ PCMK_EXEC_ERROR_FATAL, "Bug in fencing library"); + return -EINVAL; + } + +-- +2.27.0 + + +From ed08f600688af1d25412d2427502ba5d4a55c0d6 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Thu, 7 Oct 2021 12:06:10 -0500 +Subject: [PATCH 06/12] Fix: fencer: handle dynamic target query failures + better + +Previously, the callbacks for list and status queries checked only the result's +exit status. However, the services library will use PCMK_OCF_UNKNOWN_ERROR (1) +as the exit status for internal failures, and that value signifies a recognized +node (not an error) for fence list actions. + +Now, the callbacks check the execution status as well. +--- + daemons/fenced/fenced_commands.c | 46 +++++++++++++++++++++++++++----- + 1 file changed, 39 insertions(+), 7 deletions(-) + +diff --git a/daemons/fenced/fenced_commands.c b/daemons/fenced/fenced_commands.c +index f55a32649..7b3fb25a1 100644 +--- a/daemons/fenced/fenced_commands.c ++++ b/daemons/fenced/fenced_commands.c +@@ -1144,6 +1144,18 @@ status_search_cb(int pid, const pcmk__action_result_t *result, void *user_data) + + mainloop_set_trigger(dev->work); + ++ if (result->execution_status != PCMK_EXEC_DONE) { ++ crm_warn("Assuming %s cannot fence %s " ++ "because status could not be executed: %s%s%s%s", ++ dev->id, search->host, ++ pcmk_exec_status_str(result->execution_status), ++ ((result->exit_reason == NULL)? "" : " ("), ++ ((result->exit_reason == NULL)? "" : result->exit_reason), ++ ((result->exit_reason == NULL)? 
"" : ")")); ++ search_devices_record_result(search, dev->id, FALSE); ++ return; ++ } ++ + switch (result->exit_status) { + case fence_status_unknown: + crm_trace("%s reported it cannot fence %s", dev->id, search->host); +@@ -1187,21 +1199,41 @@ dynamic_list_search_cb(int pid, const pcmk__action_result_t *result, + + mainloop_set_trigger(dev->work); + +- if (result->exit_status == CRM_EX_OK) { ++ if ((result->execution_status == PCMK_EXEC_DONE) ++ && (result->exit_status == CRM_EX_OK)) { + crm_info("Refreshing target list for %s", dev->id); + g_list_free_full(dev->targets, free); + dev->targets = stonith__parse_targets(result->action_stdout); + dev->targets_age = time(NULL); + + } else if (dev->targets != NULL) { +- crm_info("Reusing most recent target list for %s " +- "because list returned error code %d", +- dev->id, result->exit_status); ++ if (result->execution_status == PCMK_EXEC_DONE) { ++ crm_info("Reusing most recent target list for %s " ++ "because list returned error code %d", ++ dev->id, result->exit_status); ++ } else { ++ crm_info("Reusing most recent target list for %s " ++ "because list could not be executed: %s%s%s%s", ++ dev->id, pcmk_exec_status_str(result->execution_status), ++ ((result->exit_reason == NULL)? "" : " ("), ++ ((result->exit_reason == NULL)? "" : result->exit_reason), ++ ((result->exit_reason == NULL)? "" : ")")); ++ } + + } else { // We have never successfully executed list +- crm_warn("Assuming %s cannot fence %s " +- "because list returned error code %d", +- dev->id, search->host, result->exit_status); ++ if (result->execution_status == PCMK_EXEC_DONE) { ++ crm_warn("Assuming %s cannot fence %s " ++ "because list returned error code %d", ++ dev->id, search->host, result->exit_status); ++ } else { ++ crm_warn("Assuming %s cannot fence %s " ++ "because list could not be executed: %s%s%s%s", ++ dev->id, search->host, ++ pcmk_exec_status_str(result->execution_status), ++ ((result->exit_reason == NULL)? 
"" : " ("), ++ ((result->exit_reason == NULL)? "" : result->exit_reason), ++ ((result->exit_reason == NULL)? "" : ")")); ++ } + + /* Fall back to pcmk_host_check="status" if the user didn't explicitly + * specify "dynamic-list". +-- +2.27.0 + + +From 5a30238a3b8691a5fc20f53906c0efcc50193306 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Tue, 21 Sep 2021 15:57:50 -0500 +Subject: [PATCH 07/12] Refactor: fencer: pass result object when sending an + async reply + +... via stonith_send_async_reply(), instead of sending the mapped legacy code +and action stdout separately. Also, drop the "stonith_" prefix since the +function is static. + +This moves the mapping from the stonith_send_async_reply() callers to the +function itself, so we use the result object and standard codes as long as +possible, and map to a legacy code only where needed. +--- + daemons/fenced/fenced_commands.c | 62 +++++++++++++++++++------------- + daemons/fenced/fenced_remote.c | 2 +- + 2 files changed, 39 insertions(+), 25 deletions(-) + +diff --git a/daemons/fenced/fenced_commands.c b/daemons/fenced/fenced_commands.c +index 7b3fb25a1..e5f8162ce 100644 +--- a/daemons/fenced/fenced_commands.c ++++ b/daemons/fenced/fenced_commands.c +@@ -2376,12 +2376,28 @@ log_async_result(async_command_t *cmd, int rc, int pid, const char *next, + } + } + ++/*! 
++ * \internal ++ * \brief Reply to requester after asynchronous command completion ++ * ++ * \param[in] cmd Command that completed ++ * \param[in] result Result of command ++ * \param[in] pid Process ID of command, if available ++ * \param[in] merged If true, command was merged with another, not executed ++ */ + static void +-stonith_send_async_reply(async_command_t *cmd, const char *output, int rc, +- int pid, bool merged) ++send_async_reply(async_command_t *cmd, const pcmk__action_result_t *result, ++ int pid, bool merged) + { + xmlNode *reply = NULL; + gboolean bcast = FALSE; ++ const char *output = NULL; ++ int rc = pcmk_ok; ++ ++ CRM_CHECK((cmd != NULL) && (result != NULL), return); ++ ++ output = result->action_stdout; ++ rc = pcmk_rc2legacy(stonith__result2rc(result)); + + reply = stonith_construct_async_reply(cmd, output, NULL, rc); + +@@ -2513,9 +2529,7 @@ st_child_done(int pid, const pcmk__action_result_t *result, void *user_data) + goto done; + } + +- stonith_send_async_reply(cmd, result->action_stdout, +- pcmk_rc2legacy(stonith__result2rc(result)), pid, +- false); ++ send_async_reply(cmd, result, pid, false); + + if (result->exit_status != CRM_EX_OK) { + goto done; +@@ -2563,9 +2577,7 @@ st_child_done(int pid, const pcmk__action_result_t *result, void *user_data) + + cmd_list = g_list_remove_link(cmd_list, gIter); + +- stonith_send_async_reply(cmd_other, result->action_stdout, +- pcmk_rc2legacy(stonith__result2rc(result)), +- pid, true); ++ send_async_reply(cmd_other, result, pid, true); + cancel_stonith_command(cmd_other); + + free_async_command(cmd_other); +@@ -2604,26 +2616,28 @@ stonith_fence_get_devices_cb(GList * devices, void *user_data) + /* Order based on priority */ + devices = g_list_sort(devices, sort_device_priority); + device = g_hash_table_lookup(device_list, devices->data); +- +- if (device) { +- cmd->device_list = devices; +- cmd->device_next = devices->next; +- devices = NULL; /* list owned by cmd now */ +- } + } + +- /* we have a 
device, schedule it for fencing. */ +- if (device) { +- schedule_stonith_command(cmd, device); +- /* in progress */ +- return; +- } ++ if (device == NULL) { // No device found ++ pcmk__action_result_t result = { ++ // Ensure we don't pass garbage to free() ++ .exit_reason = NULL, ++ .action_stdout = NULL, ++ .action_stderr = NULL ++ }; + +- /* no device found! */ +- stonith_send_async_reply(cmd, NULL, -ENODEV, 0, false); ++ pcmk__set_result(&result, CRM_EX_ERROR, PCMK_EXEC_NO_FENCE_DEVICE, ++ "No fence device configured for target"); ++ send_async_reply(cmd, &result, 0, false); ++ pcmk__reset_result(&result); ++ free_async_command(cmd); ++ g_list_free_full(devices, free); + +- free_async_command(cmd); +- g_list_free_full(devices, free); ++ } else { // Device found, schedule it for fencing ++ cmd->device_list = devices; ++ cmd->device_next = devices->next; ++ schedule_stonith_command(cmd, device); ++ } + } + + static int +diff --git a/daemons/fenced/fenced_remote.c b/daemons/fenced/fenced_remote.c +index ffaf60018..b09d2865e 100644 +--- a/daemons/fenced/fenced_remote.c ++++ b/daemons/fenced/fenced_remote.c +@@ -996,7 +996,7 @@ stonith_manual_ack(xmlNode * msg, remote_fencing_op_t * op) + + remote_op_done(op, msg, pcmk_ok, FALSE); + +- /* Replies are sent via done_cb->stonith_send_async_reply()->do_local_reply() */ ++ // Replies are sent via done_cb -> send_async_reply() -> do_local_reply() + return -EINPROGRESS; + } + +-- +2.27.0 + + +From c67b6bfbe0baa1253058417ddfb9bc4cf0844e27 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Thu, 7 Oct 2021 17:25:38 -0500 +Subject: [PATCH 08/12] Refactor: fencer: pass result object when building + async reply + +... via stonith_construct_async_reply(), instead of passing a mapped legacy rc +and action output separately, which will be helpful when we add the exit reason +to the reply. Also, drop the "stonith_" prefix since the function is static, and +drop an unused argument. 
+--- + daemons/fenced/fenced_commands.c | 33 +++++++++++++++----------------- + 1 file changed, 15 insertions(+), 18 deletions(-) + +diff --git a/daemons/fenced/fenced_commands.c b/daemons/fenced/fenced_commands.c +index e5f8162ce..6bc12e6c4 100644 +--- a/daemons/fenced/fenced_commands.c ++++ b/daemons/fenced/fenced_commands.c +@@ -112,8 +112,8 @@ typedef struct async_command_s { + stonith_device_t *activating_on; + } async_command_t; + +-static xmlNode *stonith_construct_async_reply(async_command_t * cmd, const char *output, +- xmlNode * data, int rc); ++static xmlNode *construct_async_reply(async_command_t *cmd, ++ const pcmk__action_result_t *result); + + static gboolean + is_action_required(const char *action, stonith_device_t *device) +@@ -2399,7 +2399,7 @@ send_async_reply(async_command_t *cmd, const pcmk__action_result_t *result, + output = result->action_stdout; + rc = pcmk_rc2legacy(stonith__result2rc(result)); + +- reply = stonith_construct_async_reply(cmd, output, NULL, rc); ++ reply = construct_async_reply(cmd, result); + + // Only replies for certain actions are broadcast + if (pcmk__str_any_of(cmd->action, "metadata", "monitor", "list", "status", +@@ -2732,17 +2732,20 @@ stonith_construct_reply(xmlNode * request, const char *output, xmlNode * data, i + return reply; + } + ++/*! 
++ * \internal ++ * \brief Build an XML reply to an asynchronous fencing command ++ * ++ * \param[in] cmd Fencing command that reply is for ++ * \param[in] result Command result ++ */ + static xmlNode * +-stonith_construct_async_reply(async_command_t * cmd, const char *output, xmlNode * data, int rc) ++construct_async_reply(async_command_t *cmd, const pcmk__action_result_t *result) + { +- xmlNode *reply = NULL; +- +- crm_trace("Creating a basic reply"); +- reply = create_xml_node(NULL, T_STONITH_REPLY); ++ xmlNode *reply = create_xml_node(NULL, T_STONITH_REPLY); + + crm_xml_add(reply, "st_origin", __func__); + crm_xml_add(reply, F_TYPE, T_STONITH_NG); +- + crm_xml_add(reply, F_STONITH_OPERATION, cmd->op); + crm_xml_add(reply, F_STONITH_DEVICE, cmd->device); + crm_xml_add(reply, F_STONITH_REMOTE_OP_ID, cmd->remote_op_id); +@@ -2753,15 +2756,9 @@ stonith_construct_async_reply(async_command_t * cmd, const char *output, xmlNode + crm_xml_add(reply, F_STONITH_ORIGIN, cmd->origin); + crm_xml_add_int(reply, F_STONITH_CALLID, cmd->id); + crm_xml_add_int(reply, F_STONITH_CALLOPTS, cmd->options); +- +- crm_xml_add_int(reply, F_STONITH_RC, rc); +- +- crm_xml_add(reply, "st_output", output); +- +- if (data != NULL) { +- crm_info("Attaching reply output"); +- add_message_xml(reply, F_STONITH_CALLDATA, data); +- } ++ crm_xml_add_int(reply, F_STONITH_RC, ++ pcmk_rc2legacy(stonith__result2rc(result))); ++ crm_xml_add(reply, "st_output", result->action_stdout); + return reply; + } + +-- +2.27.0 + + +From 2686caeb3b74f687ddd86a4e483250ca8096ba7c Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Tue, 19 Oct 2021 18:27:31 -0500 +Subject: [PATCH 09/12] Log: fencer: improve messages for asynchronous results + +Now that we have the full result object, pass it to log_async_result(). +Instead of logging a mapped legacy rc, log the execution status or exit status +as appropriate, along with the exit reason. 
+--- + daemons/fenced/fenced_commands.c | 43 +++++++++++++++++--------------- + 1 file changed, 23 insertions(+), 20 deletions(-) + +diff --git a/daemons/fenced/fenced_commands.c b/daemons/fenced/fenced_commands.c +index 6bc12e6c4..9d06c68dc 100644 +--- a/daemons/fenced/fenced_commands.c ++++ b/daemons/fenced/fenced_commands.c +@@ -2305,15 +2305,14 @@ stonith_query(xmlNode * msg, const char *remote_peer, const char *client_id, int + * \brief Log the result of an asynchronous command + * + * \param[in] cmd Command the result is for +- * \param[in] rc Legacy return code corresponding to result ++ * \param[in] result Result of command + * \param[in] pid Process ID of command, if available + * \param[in] next Alternate device that will be tried if command failed +- * \param[in] output Command output, if any + * \param[in] op_merged Whether this command was merged with an earlier one + */ + static void +-log_async_result(async_command_t *cmd, int rc, int pid, const char *next, +- const char *output, gboolean op_merged) ++log_async_result(async_command_t *cmd, const pcmk__action_result_t *result, ++ int pid, const char *next, bool op_merged) + { + int log_level = LOG_ERR; + int output_log_level = LOG_NEVER; +@@ -2321,17 +2320,18 @@ log_async_result(async_command_t *cmd, int rc, int pid, const char *next, + + GString *msg = g_string_sized_new(80); // Reasonable starting size + +- // Choose log levels appropriately +- if (rc == 0) { // Success ++ // Choose log levels appropriately if we have a result ++ if ((result->execution_status == PCMK_EXEC_DONE) ++ && (result->exit_status == CRM_EX_OK)) { // Success + log_level = (cmd->victim == NULL)? LOG_DEBUG : LOG_NOTICE; +- if ((output != NULL) ++ if ((result->action_stdout != NULL) + && !pcmk__str_eq(cmd->action, "metadata", pcmk__str_casei)) { + output_log_level = LOG_DEBUG; + } + next = NULL; + } else { // Failure + log_level = (cmd->victim == NULL)? 
LOG_NOTICE : LOG_ERR; +- if ((output != NULL) ++ if ((result->action_stdout != NULL) + && !pcmk__str_eq(cmd->action, "metadata", pcmk__str_casei)) { + output_log_level = LOG_WARNING; + } +@@ -2347,10 +2347,18 @@ log_async_result(async_command_t *cmd, int rc, int pid, const char *next, + } + g_string_append_printf(msg, "using %s ", cmd->device); + +- // Add result +- g_string_append_printf(msg, "returned %d (%s)", rc, pcmk_strerror(rc)); ++ // Add exit status or execution status as appropriate ++ if (result->execution_status == PCMK_EXEC_DONE) { ++ g_string_append_printf(msg, "returned %d", result->exit_status); ++ } else { ++ g_string_append_printf(msg, "could not be executed: %s", ++ pcmk_exec_status_str(result->execution_status)); ++ } + +- // Add next device if appropriate ++ // Add exit reason and next device if appropriate ++ if (result->exit_reason != NULL) { ++ g_string_append_printf(msg, " (%s)", result->exit_reason); ++ } + if (next != NULL) { + g_string_append_printf(msg, ", retrying with %s", next); + } +@@ -2371,7 +2379,7 @@ log_async_result(async_command_t *cmd, int rc, int pid, const char *next, + if (output_log_level != LOG_NEVER) { + char *prefix = crm_strdup_printf("%s[%d]", cmd->device, pid); + +- crm_log_output(output_log_level, prefix, output); ++ crm_log_output(output_log_level, prefix, result->action_stdout); + free(prefix); + } + } +@@ -2391,14 +2399,9 @@ send_async_reply(async_command_t *cmd, const pcmk__action_result_t *result, + { + xmlNode *reply = NULL; + gboolean bcast = FALSE; +- const char *output = NULL; +- int rc = pcmk_ok; + + CRM_CHECK((cmd != NULL) && (result != NULL), return); + +- output = result->action_stdout; +- rc = pcmk_rc2legacy(stonith__result2rc(result)); +- + reply = construct_async_reply(cmd, result); + + // Only replies for certain actions are broadcast +@@ -2412,7 +2415,7 @@ send_async_reply(async_command_t *cmd, const pcmk__action_result_t *result, + bcast = TRUE; + } + +- log_async_result(cmd, rc, pid, NULL, 
output, merged); ++ log_async_result(cmd, result, pid, NULL, merged); + crm_log_xml_trace(reply, "Reply"); + + if (merged) { +@@ -2436,6 +2439,7 @@ send_async_reply(async_command_t *cmd, const pcmk__action_result_t *result, + if (stand_alone) { + /* Do notification with a clean data object */ + xmlNode *notify_data = create_xml_node(NULL, T_STONITH_NOTIFY_FENCE); ++ int rc = pcmk_rc2legacy(stonith__result2rc(result)); + + crm_xml_add_int(notify_data, F_STONITH_RC, rc); + crm_xml_add(notify_data, F_STONITH_TARGET, cmd->victim); +@@ -2521,8 +2525,7 @@ st_child_done(int pid, const pcmk__action_result_t *result, void *user_data) + + /* this operation requires more fencing, hooray! */ + if (next_device) { +- log_async_result(cmd, pcmk_rc2legacy(stonith__result2rc(result)), pid, +- next_device->id, result->action_stdout, FALSE); ++ log_async_result(cmd, result, pid, next_device->id, false); + schedule_stonith_command(cmd, next_device); + /* Prevent cmd from being freed */ + cmd = NULL; +-- +2.27.0 + + +From 9f9dea518da50f629589d505ea0f330a47111d76 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Thu, 28 Oct 2021 13:29:31 -0500 +Subject: [PATCH 10/12] Test: cts-fencing: update expected log messages + +... which now log the original exit status rather than a mapped legacy rc +--- + cts/cts-fencing.in | 28 ++++++++++++++-------------- + 1 file changed, 14 insertions(+), 14 deletions(-) + +diff --git a/cts/cts-fencing.in b/cts/cts-fencing.in +index babfb6351..5cd9f7b8f 100644 +--- a/cts/cts-fencing.in ++++ b/cts/cts-fencing.in +@@ -886,7 +886,7 @@ class Tests(object): + test.add_cmd("stonith_admin", "--output-as=xml -F node3 -t 20") + + test.add_stonith_log_pattern("Total timeout set to 40") +- test.add_stonith_log_pattern("targeting node3 using false returned -201") ++ test.add_stonith_log_pattern("targeting node3 using false returned 1") + test.add_stonith_log_pattern("targeting node3 using true returned 0") + + # test what happens when the first fencing level fails. 
+@@ -920,8 +920,8 @@ class Tests(object): + test.add_cmd("stonith_admin", "--output-as=xml -F node3 -t 3") + + test.add_stonith_log_pattern("Total timeout set to 18") +- test.add_stonith_log_pattern("targeting node3 using false1 returned -201") +- test.add_stonith_log_pattern("targeting node3 using false2 returned -201") ++ test.add_stonith_log_pattern("targeting node3 using false1 returned 1") ++ test.add_stonith_log_pattern("targeting node3 using false2 returned 1") + test.add_stonith_log_pattern("targeting node3 using true3 returned 0") + test.add_stonith_log_pattern("targeting node3 using true4 returned 0") + +@@ -987,7 +987,7 @@ class Tests(object): + test.add_cmd("stonith_admin", "--output-as=xml -F node3 -t 20") + + test.add_stonith_log_pattern("Total timeout set to 8") +- test.add_stonith_log_pattern("targeting node3 using false1 returned -201") ++ test.add_stonith_log_pattern("targeting node3 using false1 returned 1") + test.add_stonith_neg_log_pattern("targeting node3 using false2 returned ") + test.add_stonith_log_pattern("targeting node3 using true3 returned 0") + test.add_stonith_log_pattern("targeting node3 using true4 returned 0") +@@ -1147,7 +1147,7 @@ class Tests(object): + "--output-as=xml -R true1 -a fence_dummy_no_reboot -o \"mode=pass\" -o \"pcmk_host_list=node1 node2 node3\"") + test.add_cmd("stonith_admin", "--output-as=xml -B node1 -t 5 -V") + test.add_stonith_log_pattern("does not support reboot") +- test.add_stonith_log_pattern("using true1 returned 0 (OK)") ++ test.add_stonith_log_pattern("using true1 returned 0") + + # make sure reboot is used when reboot action is advertised + for test_type in test_types: +@@ -1158,7 +1158,7 @@ class Tests(object): + "--output-as=xml -R true1 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node1 node2 node3\"") + test.add_cmd("stonith_admin", "--output-as=xml -B node1 -t 5 -V") + test.add_stonith_neg_log_pattern("does not advertise support for 'reboot', performing 'off'") +- 
test.add_stonith_log_pattern("using true1 returned 0 (OK)") ++ test.add_stonith_log_pattern("using true1 returned 0") + + # make sure requested fencing delay is applied only for the first device in the first level + # make sure static delay from pcmk_delay_base is added +@@ -1240,8 +1240,8 @@ class Tests(object): + '--output-as=xml -R true2 -a fence_dummy_auto_unfence -o "mode=pass" -o "pcmk_host_list=%s"' % (our_uname)) + test.add_cmd("stonith_admin", "--output-as=xml -U %s -t 3" % (our_uname)) + # both devices should be executed +- test.add_stonith_log_pattern("using true1 returned 0 (OK)") +- test.add_stonith_log_pattern("using true2 returned 0 (OK)") ++ test.add_stonith_log_pattern("using true1 returned 0") ++ test.add_stonith_log_pattern("using true2 returned 0") + + ### verify unfencing using automatic unfencing fails if any of the required agents fail + test = self.new_test("cpg_unfence_required_2", +@@ -1264,8 +1264,8 @@ class Tests(object): + test.add_cmd("stonith_admin", "--output-as=xml -r %s -i 1 -v true1" % (our_uname)) + test.add_cmd("stonith_admin", "--output-as=xml -r %s -i 2 -v true2" % (our_uname)) + test.add_cmd("stonith_admin", "--output-as=xml -U %s -t 3" % (our_uname)) +- test.add_stonith_log_pattern("using true1 returned 0 (OK)") +- test.add_stonith_log_pattern("using true2 returned 0 (OK)") ++ test.add_stonith_log_pattern("using true1 returned 0") ++ test.add_stonith_log_pattern("using true2 returned 0") + + ### verify unfencing using automatic devices with topology + test = self.new_test("cpg_unfence_required_4", +@@ -1296,10 +1296,10 @@ class Tests(object): + test.add_cmd("stonith_admin", "--output-as=xml -r %s -i 3 -v false4" % (our_uname)) + test.add_cmd("stonith_admin", "--output-as=xml -r %s -i 4 -v true4" % (our_uname)) + test.add_cmd("stonith_admin", "--output-as=xml -U %s -t 3" % (our_uname)) +- test.add_stonith_log_pattern("using true1 returned 0 (OK)") +- test.add_stonith_log_pattern("using true2 returned 0 (OK)") +- 
test.add_stonith_log_pattern("using true3 returned 0 (OK)") +- test.add_stonith_log_pattern("using true4 returned 0 (OK)") ++ test.add_stonith_log_pattern("using true1 returned 0") ++ test.add_stonith_log_pattern("using true2 returned 0") ++ test.add_stonith_log_pattern("using true3 returned 0") ++ test.add_stonith_log_pattern("using true4 returned 0") + + def build_unfence_on_target_tests(self): + """ Register tests that verify unfencing that runs on the target """ +-- +2.27.0 + + +From be72166ed9ccb53c218529783660503df95da719 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Thu, 16 Sep 2021 16:50:23 -0500 +Subject: [PATCH 11/12] Log: libcrmservice: downgrade failed action messages + +Previously, we would often get duplicate log messages for failed actions, +from the service library and again from its callers. + +Now that the service library tracks and provides exit reasons, callers can log +sufficient detail with better context, so downgrade the library's messages to +info level or lower. Similarly, avoid duplicate logs of process output. + +Certain messages (such as out-of-memory) remain at higher severity. 
+--- + daemons/controld/controld_execd.c | 15 +++--- + lib/fencing/st_client.c | 11 ++--- + lib/services/services.c | 14 +++--- + lib/services/services_linux.c | 80 ++++++++++++++++--------------- + lib/services/systemd.c | 20 ++++---- + lib/services/upstart.c | 19 ++++---- + 6 files changed, 80 insertions(+), 79 deletions(-) + +diff --git a/daemons/controld/controld_execd.c b/daemons/controld/controld_execd.c +index bded6e6b6..3ddff6e13 100644 +--- a/daemons/controld/controld_execd.c ++++ b/daemons/controld/controld_execd.c +@@ -2684,16 +2684,15 @@ log_executor_event(lrmd_event_data_t *op, const char *op_key, + do_crm_log(log_level, "%s", str->str); + g_string_free(str, TRUE); + +- if (op->output != NULL) { +- char *prefix = crm_strdup_printf("%s-" PCMK__OP_FMT ":%d", node_name, ++ /* The services library has already logged the output at info or debug ++ * level, so just raise to notice if it looks like a failure. ++ */ ++ if ((op->output != NULL) && (op->rc != PCMK_OCF_OK)) { ++ char *prefix = crm_strdup_printf(PCMK__OP_FMT "@%s output", + op->rsc_id, op->op_type, +- op->interval_ms, op->call_id); ++ op->interval_ms, node_name); + +- if (op->rc) { +- crm_log_output(LOG_NOTICE, prefix, op->output); +- } else { +- crm_log_output(LOG_DEBUG, prefix, op->output); +- } ++ crm_log_output(LOG_NOTICE, prefix, op->output); + free(prefix); + } + } +diff --git a/lib/fencing/st_client.c b/lib/fencing/st_client.c +index 3d4127eff..2fbff7f24 100644 +--- a/lib/fencing/st_client.c ++++ b/lib/fencing/st_client.c +@@ -276,14 +276,9 @@ stonith__watchdog_fencing_enabled_for_node(const char *node) + static void + log_action(stonith_action_t *action, pid_t pid) + { +- if (action->result.action_stdout != NULL) { +- /* Logging the whole string confuses syslog when the string is xml */ +- char *prefix = crm_strdup_printf("%s[%d] stdout:", action->agent, pid); +- +- crm_log_output(LOG_TRACE, prefix, action->result.action_stdout); +- free(prefix); +- } +- ++ /* The services library has 
already logged the output at info or debug ++ * level, so just raise to warning for stderr. ++ */ + if (action->result.action_stderr != NULL) { + /* Logging the whole string confuses syslog when the string is xml */ + char *prefix = crm_strdup_printf("%s[%d] stderr:", action->agent, pid); +diff --git a/lib/services/services.c b/lib/services/services.c +index 86a0a213c..cf8bbc70e 100644 +--- a/lib/services/services.c ++++ b/lib/services/services.c +@@ -319,13 +319,13 @@ services__create_resource_action(const char *name, const char *standard, + rc = services__nagios_prepare(op); + #endif + } else { +- crm_err("Unknown resource standard: %s", op->standard); ++ crm_info("Unknown resource standard: %s", op->standard); + rc = ENOENT; + } + + if (rc != pcmk_rc_ok) { +- crm_err("Cannot prepare %s operation for %s: %s", +- action, name, strerror(rc)); ++ crm_info("Cannot prepare %s operation for %s: %s", ++ action, name, strerror(rc)); + services__handle_exec_error(op, rc); + } + return op; +@@ -967,14 +967,14 @@ execute_metadata_action(svc_action_t *op) + const char *class = op->standard; + + if (op->agent == NULL) { +- crm_err("meta-data requested without specifying agent"); ++ crm_info("Meta-data requested without specifying agent"); + services__set_result(op, services__generic_error(op), + PCMK_EXEC_ERROR_FATAL, "Agent not specified"); + return EINVAL; + } + + if (class == NULL) { +- crm_err("meta-data requested for agent %s without specifying class", ++ crm_info("Meta-data requested for agent %s without specifying class", + op->agent); + services__set_result(op, services__generic_error(op), + PCMK_EXEC_ERROR_FATAL, +@@ -986,8 +986,8 @@ execute_metadata_action(svc_action_t *op) + class = resources_find_service_class(op->agent); + } + if (class == NULL) { +- crm_err("meta-data requested for %s, but could not determine class", +- op->agent); ++ crm_info("Meta-data requested for %s, but could not determine class", ++ op->agent); + services__set_result(op, 
services__generic_error(op), + PCMK_EXEC_ERROR_HARD, + "Agent standard could not be determined"); +diff --git a/lib/services/services_linux.c b/lib/services/services_linux.c +index b2ff27a0d..9a4c6cf80 100644 +--- a/lib/services/services_linux.c ++++ b/lib/services/services_linux.c +@@ -64,8 +64,8 @@ sigchld_setup(struct sigchld_data_s *data) + + // Block SIGCHLD (saving previous set of blocked signals to restore later) + if (sigprocmask(SIG_BLOCK, &(data->mask), &(data->old_mask)) < 0) { +- crm_err("Wait for child process completion failed: %s " +- CRM_XS " source=sigprocmask", pcmk_strerror(errno)); ++ crm_info("Wait for child process completion failed: %s " ++ CRM_XS " source=sigprocmask", pcmk_strerror(errno)); + return false; + } + return true; +@@ -81,8 +81,8 @@ sigchld_open(struct sigchld_data_s *data) + + fd = signalfd(-1, &(data->mask), SFD_NONBLOCK); + if (fd < 0) { +- crm_err("Wait for child process completion failed: %s " +- CRM_XS " source=signalfd", pcmk_strerror(errno)); ++ crm_info("Wait for child process completion failed: %s " ++ CRM_XS " source=signalfd", pcmk_strerror(errno)); + } + return fd; + } +@@ -108,8 +108,8 @@ sigchld_received(int fd) + } + s = read(fd, &fdsi, sizeof(struct signalfd_siginfo)); + if (s != sizeof(struct signalfd_siginfo)) { +- crm_err("Wait for child process completion failed: %s " +- CRM_XS " source=read", pcmk_strerror(errno)); ++ crm_info("Wait for child process completion failed: %s " ++ CRM_XS " source=read", pcmk_strerror(errno)); + + } else if (fdsi.ssi_signo == SIGCHLD) { + return true; +@@ -149,8 +149,8 @@ sigchld_handler() + if ((last_sigchld_data != NULL) + && (last_sigchld_data->pipe_fd[1] >= 0) + && (write(last_sigchld_data->pipe_fd[1], "", 1) == -1)) { +- crm_err("Wait for child process completion failed: %s " +- CRM_XS " source=write", pcmk_strerror(errno)); ++ crm_info("Wait for child process completion failed: %s " ++ CRM_XS " source=write", pcmk_strerror(errno)); + } + } + +@@ -162,19 +162,19 @@ 
sigchld_setup(struct sigchld_data_s *data) + data->pipe_fd[0] = data->pipe_fd[1] = -1; + + if (pipe(data->pipe_fd) == -1) { +- crm_err("Wait for child process completion failed: %s " +- CRM_XS " source=pipe", pcmk_strerror(errno)); ++ crm_info("Wait for child process completion failed: %s " ++ CRM_XS " source=pipe", pcmk_strerror(errno)); + return false; + } + + rc = pcmk__set_nonblocking(data->pipe_fd[0]); + if (rc != pcmk_rc_ok) { +- crm_warn("Could not set pipe input non-blocking: %s " CRM_XS " rc=%d", ++ crm_info("Could not set pipe input non-blocking: %s " CRM_XS " rc=%d", + pcmk_rc_str(rc), rc); + } + rc = pcmk__set_nonblocking(data->pipe_fd[1]); + if (rc != pcmk_rc_ok) { +- crm_warn("Could not set pipe output non-blocking: %s " CRM_XS " rc=%d", ++ crm_info("Could not set pipe output non-blocking: %s " CRM_XS " rc=%d", + pcmk_rc_str(rc), rc); + } + +@@ -183,8 +183,8 @@ sigchld_setup(struct sigchld_data_s *data) + data->sa.sa_flags = 0; + sigemptyset(&(data->sa.sa_mask)); + if (sigaction(SIGCHLD, &(data->sa), &(data->old_sa)) < 0) { +- crm_err("Wait for child process completion failed: %s " +- CRM_XS " source=sigaction", pcmk_strerror(errno)); ++ crm_info("Wait for child process completion failed: %s " ++ CRM_XS " source=sigaction", pcmk_strerror(errno)); + } + + // Remember data for use in signal handler +@@ -585,7 +585,11 @@ log_op_output(svc_action_t *op) + { + char *prefix = crm_strdup_printf("%s[%d] error output", op->id, op->pid); + +- crm_log_output(LOG_NOTICE, prefix, op->stderr_data); ++ /* The library caller has better context to know how important the output ++ * is, so log it at info and debug severity here. They can log it again at ++ * higher severity if appropriate. 
++ */ ++ crm_log_output(LOG_INFO, prefix, op->stderr_data); + strcpy(prefix + strlen(prefix) - strlen("error output"), "output"); + crm_log_output(LOG_DEBUG, prefix, op->stdout_data); + free(prefix); +@@ -673,7 +677,7 @@ async_action_complete(mainloop_child_t *p, pid_t pid, int core, int signo, + parse_exit_reason_from_stderr(op); + + } else if (mainloop_child_timeout(p)) { +- crm_warn("%s[%d] timed out after %dms", op->id, op->pid, op->timeout); ++ crm_info("%s[%d] timed out after %dms", op->id, op->pid, op->timeout); + services__set_result(op, services__generic_error(op), PCMK_EXEC_TIMEOUT, + "Process did not exit within specified timeout"); + +@@ -686,7 +690,7 @@ async_action_complete(mainloop_child_t *p, pid_t pid, int core, int signo, + services__set_result(op, PCMK_OCF_OK, PCMK_EXEC_CANCELLED, NULL); + + } else { +- crm_warn("%s[%d] terminated with signal %d (%s)", ++ crm_info("%s[%d] terminated with signal %d (%s)", + op->id, op->pid, signo, strsignal(signo)); + services__set_result(op, PCMK_OCF_UNKNOWN_ERROR, PCMK_EXEC_ERROR, + "Process interrupted by signal"); +@@ -908,12 +912,12 @@ action_launch_child(svc_action_t *op) + sp.sched_priority = 0; + + if (sched_setscheduler(0, SCHED_OTHER, &sp) == -1) { +- crm_warn("Could not reset scheduling policy for %s", op->id); ++ crm_info("Could not reset scheduling policy for %s", op->id); + } + } + #endif + if (setpriority(PRIO_PROCESS, 0, 0) == -1) { +- crm_warn("Could not reset process priority for %s", op->id); ++ crm_info("Could not reset process priority for %s", op->id); + } + + /* Man: The call setpgrp() is equivalent to setpgid(0,0) +@@ -941,7 +945,7 @@ action_launch_child(svc_action_t *op) + } else { + crm_err("Considering %s unconfigured " + "because unable to load CIB secrets: %s", +- op->rsc, pcmk_rc_str(rc)); ++ op->rsc, pcmk_rc_str(rc)); + exit_child(op, services__configuration_error(op, false), + "Unable to load CIB secrets"); + } +@@ -1043,7 +1047,7 @@ wait_for_sync_result(svc_action_t *op, struct 
sigchld_data_s *data) + + } else if (wait_rc < 0) { + wait_reason = pcmk_rc_str(errno); +- crm_warn("Wait for completion of %s[%d] failed: %s " ++ crm_info("Wait for completion of %s[%d] failed: %s " + CRM_XS " source=waitpid", + op->id, op->pid, wait_reason); + wait_rc = 0; // Act as if process is still running +@@ -1057,8 +1061,8 @@ wait_for_sync_result(svc_action_t *op, struct sigchld_data_s *data) + + } else if ((poll_rc < 0) && (errno != EINTR)) { + wait_reason = pcmk_rc_str(errno); +- crm_err("Wait for completion of %s[%d] failed: %s " +- CRM_XS " source=poll", op->id, op->pid, wait_reason); ++ crm_info("Wait for completion of %s[%d] failed: %s " ++ CRM_XS " source=poll", op->id, op->pid, wait_reason); + break; + } + +@@ -1078,7 +1082,7 @@ wait_for_sync_result(svc_action_t *op, struct sigchld_data_s *data) + services__set_result(op, services__generic_error(op), + PCMK_EXEC_TIMEOUT, + "Process did not exit within specified timeout"); +- crm_warn("%s[%d] timed out after %dms", ++ crm_info("%s[%d] timed out after %dms", + op->id, op->pid, op->timeout); + + } else { +@@ -1110,8 +1114,8 @@ wait_for_sync_result(svc_action_t *op, struct sigchld_data_s *data) + + services__set_result(op, services__generic_error(op), PCMK_EXEC_ERROR, + "Process interrupted by signal"); +- crm_err("%s[%d] terminated with signal %d (%s)", +- op->id, op->pid, signo, strsignal(signo)); ++ crm_info("%s[%d] terminated with signal %d (%s)", ++ op->id, op->pid, signo, strsignal(signo)); + + #ifdef WCOREDUMP + if (WCOREDUMP(status)) { +@@ -1155,7 +1159,7 @@ services__execute_file(svc_action_t *op) + // Catch common failure conditions early + if (stat(op->opaque->exec, &st) != 0) { + rc = errno; +- crm_warn("Cannot execute '%s': %s " CRM_XS " stat rc=%d", ++ crm_info("Cannot execute '%s': %s " CRM_XS " stat rc=%d", + op->opaque->exec, pcmk_strerror(rc), rc); + services__handle_exec_error(op, rc); + goto done; +@@ -1163,8 +1167,8 @@ services__execute_file(svc_action_t *op) + + if 
(pipe(stdout_fd) < 0) { + rc = errno; +- crm_err("Cannot execute '%s': %s " CRM_XS " pipe(stdout) rc=%d", +- op->opaque->exec, pcmk_strerror(rc), rc); ++ crm_info("Cannot execute '%s': %s " CRM_XS " pipe(stdout) rc=%d", ++ op->opaque->exec, pcmk_strerror(rc), rc); + services__handle_exec_error(op, rc); + goto done; + } +@@ -1174,8 +1178,8 @@ services__execute_file(svc_action_t *op) + + close_pipe(stdout_fd); + +- crm_err("Cannot execute '%s': %s " CRM_XS " pipe(stderr) rc=%d", +- op->opaque->exec, pcmk_strerror(rc), rc); ++ crm_info("Cannot execute '%s': %s " CRM_XS " pipe(stderr) rc=%d", ++ op->opaque->exec, pcmk_strerror(rc), rc); + services__handle_exec_error(op, rc); + goto done; + } +@@ -1187,8 +1191,8 @@ services__execute_file(svc_action_t *op) + close_pipe(stdout_fd); + close_pipe(stderr_fd); + +- crm_err("Cannot execute '%s': %s " CRM_XS " pipe(stdin) rc=%d", +- op->opaque->exec, pcmk_strerror(rc), rc); ++ crm_info("Cannot execute '%s': %s " CRM_XS " pipe(stdin) rc=%d", ++ op->opaque->exec, pcmk_strerror(rc), rc); + services__handle_exec_error(op, rc); + goto done; + } +@@ -1212,8 +1216,8 @@ services__execute_file(svc_action_t *op) + close_pipe(stdout_fd); + close_pipe(stderr_fd); + +- crm_err("Cannot execute '%s': %s " CRM_XS " fork rc=%d", +- op->opaque->exec, pcmk_strerror(rc), rc); ++ crm_info("Cannot execute '%s': %s " CRM_XS " fork rc=%d", ++ op->opaque->exec, pcmk_strerror(rc), rc); + services__handle_exec_error(op, rc); + if (op->synchronous) { + sigchld_cleanup(&data); +@@ -1271,7 +1275,7 @@ services__execute_file(svc_action_t *op) + op->opaque->stdout_fd = stdout_fd[0]; + rc = pcmk__set_nonblocking(op->opaque->stdout_fd); + if (rc != pcmk_rc_ok) { +- crm_warn("Could not set '%s' output non-blocking: %s " ++ crm_info("Could not set '%s' output non-blocking: %s " + CRM_XS " rc=%d", + op->opaque->exec, pcmk_rc_str(rc), rc); + } +@@ -1279,7 +1283,7 @@ services__execute_file(svc_action_t *op) + op->opaque->stderr_fd = stderr_fd[0]; + rc = 
pcmk__set_nonblocking(op->opaque->stderr_fd); + if (rc != pcmk_rc_ok) { +- crm_warn("Could not set '%s' error output non-blocking: %s " ++ crm_info("Could not set '%s' error output non-blocking: %s " + CRM_XS " rc=%d", + op->opaque->exec, pcmk_rc_str(rc), rc); + } +@@ -1290,7 +1294,7 @@ services__execute_file(svc_action_t *op) + // as long as no other standard uses stdin_fd assume stonith + rc = pcmk__set_nonblocking(op->opaque->stdin_fd); + if (rc != pcmk_rc_ok) { +- crm_warn("Could not set '%s' input non-blocking: %s " ++ crm_info("Could not set '%s' input non-blocking: %s " + CRM_XS " fd=%d,rc=%d", op->opaque->exec, + pcmk_rc_str(rc), op->opaque->stdin_fd, rc); + } +diff --git a/lib/services/systemd.c b/lib/services/systemd.c +index 6f5bef960..8e9fff484 100644 +--- a/lib/services/systemd.c ++++ b/lib/services/systemd.c +@@ -232,7 +232,8 @@ systemd_daemon_reload_complete(DBusPendingCall *pending, void *user_data) + } + + if (pcmk_dbus_find_error(pending, reply, &error)) { +- crm_err("Could not issue systemd reload %d: %s", reload_count, error.message); ++ crm_warn("Could not issue systemd reload %d: %s", ++ reload_count, error.message); + dbus_error_free(&error); + + } else { +@@ -291,8 +292,8 @@ set_result_from_method_error(svc_action_t *op, const DBusError *error) + PCMK_EXEC_NOT_INSTALLED, "systemd unit not found"); + } + +- crm_err("DBus request for %s of systemd unit %s for resource %s failed: %s", +- op->action, op->agent, crm_str(op->rsc), error->message); ++ crm_info("DBus request for %s of systemd unit %s for resource %s failed: %s", ++ op->action, op->agent, crm_str(op->rsc), error->message); + } + + /*! 
+@@ -325,11 +326,11 @@ execute_after_loadunit(DBusMessage *reply, svc_action_t *op) + if (op != NULL) { + services__set_result(op, PCMK_OCF_UNKNOWN_ERROR, PCMK_EXEC_ERROR, + "systemd DBus method had unexpected reply"); +- crm_err("Could not load systemd unit %s for %s: " +- "DBus reply has unexpected type", op->agent, op->id); ++ crm_info("Could not load systemd unit %s for %s: " ++ "DBus reply has unexpected type", op->agent, op->id); + } else { +- crm_err("Could not load systemd unit: " +- "DBus reply has unexpected type"); ++ crm_info("Could not load systemd unit: " ++ "DBus reply has unexpected type"); + } + + } else { +@@ -688,7 +689,7 @@ process_unit_method_reply(DBusMessage *reply, svc_action_t *op) + + } else if (!pcmk_dbus_type_check(reply, NULL, DBUS_TYPE_OBJECT_PATH, + __func__, __LINE__)) { +- crm_warn("DBus request for %s of %s succeeded but " ++ crm_info("DBus request for %s of %s succeeded but " + "return type was unexpected", op->action, crm_str(op->rsc)); + services__set_result(op, PCMK_OCF_OK, PCMK_EXEC_DONE, + "systemd DBus method had unexpected reply"); +@@ -981,7 +982,8 @@ systemd_timeout_callback(gpointer p) + svc_action_t * op = p; + + op->opaque->timerid = 0; +- crm_warn("%s operation on systemd unit %s named '%s' timed out", op->action, op->agent, op->rsc); ++ crm_info("%s action for systemd unit %s named '%s' timed out", ++ op->action, op->agent, op->rsc); + services__set_result(op, PCMK_OCF_UNKNOWN_ERROR, PCMK_EXEC_TIMEOUT, + "Systemd action did not complete within specified timeout"); + services__finalize_async_op(op); +diff --git a/lib/services/upstart.c b/lib/services/upstart.c +index 2fdc229ad..2ece803e1 100644 +--- a/lib/services/upstart.c ++++ b/lib/services/upstart.c +@@ -308,21 +308,21 @@ get_first_instance(const gchar * job, int timeout) + dbus_message_unref(msg); + + if (dbus_error_is_set(&error)) { +- crm_err("Call to %s failed: %s", method, error.message); ++ crm_info("Call to %s failed: %s", method, error.message); + 
dbus_error_free(&error); + goto done; + + } else if(reply == NULL) { +- crm_err("Call to %s failed: no reply", method); ++ crm_info("Call to %s failed: no reply", method); + goto done; + + } else if (!dbus_message_iter_init(reply, &args)) { +- crm_err("Call to %s failed: Message has no arguments", method); ++ crm_info("Call to %s failed: Message has no arguments", method); + goto done; + } + + if(!pcmk_dbus_type_check(reply, &args, DBUS_TYPE_ARRAY, __func__, __LINE__)) { +- crm_err("Call to %s failed: Message has invalid arguments", method); ++ crm_info("Call to %s failed: Message has invalid arguments", method); + goto done; + } + +@@ -432,8 +432,8 @@ set_result_from_method_error(svc_action_t *op, const DBusError *error) + return; + } + +- crm_err("DBus request for %s of Upstart job %s for resource %s failed: %s", +- op->action, op->agent, crm_str(op->rsc), error->message); ++ crm_info("DBus request for %s of Upstart job %s for resource %s failed: %s", ++ op->action, op->agent, crm_str(op->rsc), error->message); + } + + /*! 
+@@ -468,7 +468,7 @@ job_method_complete(DBusPendingCall *pending, void *user_data) + + } else if (!pcmk_dbus_type_check(reply, NULL, DBUS_TYPE_OBJECT_PATH, + __func__, __LINE__)) { +- crm_warn("DBus request for %s of %s succeeded but " ++ crm_info("DBus request for %s of %s succeeded but " + "return type was unexpected", op->action, crm_str(op->rsc)); + services__set_result(op, PCMK_OCF_OK, PCMK_EXEC_DONE, NULL); + +@@ -667,7 +667,8 @@ services__execute_upstart(svc_action_t *op) + + } else if (!pcmk_dbus_type_check(reply, NULL, DBUS_TYPE_OBJECT_PATH, + __func__, __LINE__)) { +- crm_warn("Call to %s passed but return type was unexpected", op->action); ++ crm_info("Call to %s passed but return type was unexpected", ++ op->action); + services__set_result(op, PCMK_OCF_OK, PCMK_EXEC_DONE, NULL); + + } else { +@@ -675,7 +676,7 @@ services__execute_upstart(svc_action_t *op) + + dbus_message_get_args(reply, NULL, DBUS_TYPE_OBJECT_PATH, &path, + DBUS_TYPE_INVALID); +- crm_info("Call to %s passed: %s", op->action, path); ++ crm_debug("Call to %s passed: %s", op->action, path); + services__set_result(op, PCMK_OCF_OK, PCMK_EXEC_DONE, NULL); + } + +-- +2.27.0 + + +From 39f6861c72eb9dd76d2cf3da287fe7485615631b Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Mon, 8 Nov 2021 09:43:38 -0600 +Subject: [PATCH 12/12] Low: fencing: avoid use-after-free with new result + object + +introduced by 153c9b552 (not released) +--- + lib/fencing/st_rhcs.c | 6 ++++-- + 1 file changed, 4 insertions(+), 2 deletions(-) + +diff --git a/lib/fencing/st_rhcs.c b/lib/fencing/st_rhcs.c +index 23e694975..6c8cbedc7 100644 +--- a/lib/fencing/st_rhcs.c ++++ b/lib/fencing/st_rhcs.c +@@ -143,15 +143,17 @@ stonith__rhcs_get_metadata(const char *agent, int timeout, xmlNode **metadata) + if (result->execution_status != PCMK_EXEC_DONE) { + crm_warn("Could not execute metadata action for %s: %s", + agent, pcmk_exec_status_str(result->execution_status)); ++ rc = pcmk_rc2legacy(stonith__result2rc(result)); + 
stonith__destroy_action(action); +- return pcmk_rc2legacy(stonith__result2rc(result)); ++ return rc; + } + + if (result->exit_status != CRM_EX_OK) { + crm_warn("Metadata action for %s returned error code %d", + agent, result->exit_status); ++ rc = pcmk_rc2legacy(stonith__result2rc(result)); + stonith__destroy_action(action); +- return pcmk_rc2legacy(stonith__result2rc(result)); ++ return rc; + } + + if (result->action_stdout == NULL) { +-- +2.27.0 + diff --git a/SOURCES/002-pacemakerd-options.patch b/SOURCES/002-pacemakerd-options.patch deleted file mode 100644 index 56941ec..0000000 --- a/SOURCES/002-pacemakerd-options.patch +++ /dev/null @@ -1,451 +0,0 @@ -From 0d40ebf10b1794ece2c5c9768ea7222d3834d3b3 Mon Sep 17 00:00:00 2001 -From: Chris Lumens -Date: Thu, 13 May 2021 11:42:18 -0400 -Subject: [PATCH 1/4] Build: Use a different variable to find man page - includes. - -With other programs outside of the tools directory being converted to -use glib for command line handling, their includes are not going to be -in tools/. So we need to use a different autoconf variable to find -them. ---- - mk/common.mk | 6 +++--- - 1 file changed, 3 insertions(+), 3 deletions(-) - -diff --git a/mk/common.mk b/mk/common.mk -index b247670..aa59feb 100644 ---- a/mk/common.mk -+++ b/mk/common.mk -@@ -1,5 +1,5 @@ - # --# Copyright 2014-2020 the Pacemaker project contributors -+# Copyright 2014-2021 the Pacemaker project contributors - # - # The version control history for this file may have further details. - # -@@ -68,11 +68,11 @@ HELP2MAN_ARGS = -N --section 8 --name "Part of the Pacemaker cluster resource ma - # and all wrappers to C code. 
- %.8: % $(MAN8DEPS) - $(AM_V_at)chmod a+x $(abs_builddir)/$< -- $(AM_V_MAN)if [ -f $(top_srcdir)/tools/$@.inc ]; then \ -+ $(AM_V_MAN)if [ -f $(abs_srcdir)/$@.inc ]; then \ - PATH=$(abs_builddir):$$PATH $(HELP2MAN) $(HELP2MAN_ARGS) \ - -h --help-all \ - --no-discard-stderr \ -- -i $(top_srcdir)/tools/$@.inc $(abs_builddir)/$< \ -+ -i $(abs_srcdir)/$@.inc $(abs_builddir)/$< \ - | sed -f $(top_srcdir)/tools/fix-manpages > $@ ; \ - else \ - PATH=$(abs_builddir):$$PATH $(HELP2MAN) $(HELP2MAN_ARGS) \ --- -1.8.3.1 - - -From c7ab1d901bcbbf0137277e783e072777ca2f82d9 Mon Sep 17 00:00:00 2001 -From: Chris Lumens -Date: Thu, 13 May 2021 11:44:16 -0400 -Subject: [PATCH 2/4] Refactor: daemons: Remove the pid_file variable from - pacemakerd. - -It's never used anywhere. ---- - daemons/pacemakerd/pacemakerd.c | 3 --- - 1 file changed, 3 deletions(-) - -diff --git a/daemons/pacemakerd/pacemakerd.c b/daemons/pacemakerd/pacemakerd.c -index 8ec9708..03d688e 100644 ---- a/daemons/pacemakerd/pacemakerd.c -+++ b/daemons/pacemakerd/pacemakerd.c -@@ -27,8 +27,7 @@ - - static crm_trigger_t *shutdown_trigger = NULL; - static crm_trigger_t *startup_trigger = NULL; --static const char *pid_file = PCMK_RUN_DIR "/pacemaker.pid"; - - /* state we report when asked via pacemakerd-api status-ping */ - static const char *pacemakerd_state = XML_PING_ATTR_PACEMAKERDSTATE_INIT; - static gboolean running_with_sbd = FALSE; /* local copy */ -@@ -224,7 +222,6 @@ main(int argc, char **argv) - /* Legacy */ - break; - case 'p': -- pid_file = optarg; - break; - case 's': - pcmk__set_env_option("node_start_state", "standby"); --- -1.8.3.1 - - -From 98990eed9f6a5dbde7c8a5aa0783e93d5479295b Mon Sep 17 00:00:00 2001 -From: Chris Lumens -Date: Thu, 13 May 2021 13:14:38 -0400 -Subject: [PATCH 3/4] Refactor: daemons: Use glib for command line handling in - pacemakerd. 
- ---- - daemons/pacemakerd/Makefile.am | 2 + - daemons/pacemakerd/pacemakerd.8.inc | 5 + - daemons/pacemakerd/pacemakerd.c | 195 ++++++++++++++++++------------------ - 3 files changed, 102 insertions(+), 100 deletions(-) - create mode 100644 daemons/pacemakerd/pacemakerd.8.inc - -diff --git a/daemons/pacemakerd/Makefile.am b/daemons/pacemakerd/Makefile.am -index cc657f5..84517a3 100644 ---- a/daemons/pacemakerd/Makefile.am -+++ b/daemons/pacemakerd/Makefile.am -@@ -15,6 +15,8 @@ if BUILD_SYSTEMD - systemdsystemunit_DATA = pacemaker.service - endif - -+EXTRA_DIST = pacemakerd.8.inc -+ - ## SOURCES - - noinst_HEADERS = pacemakerd.h -diff --git a/daemons/pacemakerd/pacemakerd.8.inc b/daemons/pacemakerd/pacemakerd.8.inc -new file mode 100644 -index 0000000..902af4e ---- /dev/null -+++ b/daemons/pacemakerd/pacemakerd.8.inc -@@ -0,0 +1,5 @@ -+[synopsis] -+pacemakerd [options] -+ -+/subsidiary Pacemaker daemons/ -+.SH OPTIONS -diff --git a/daemons/pacemakerd/pacemakerd.c b/daemons/pacemakerd/pacemakerd.c -index 03d688e..ce194bf 100644 ---- a/daemons/pacemakerd/pacemakerd.c -+++ b/daemons/pacemakerd/pacemakerd.c -@@ -23,12 +23,54 @@ - #include - #include - #include -+#include - #include - #include - - #include - #include - -+#define SUMMARY "pacemakerd - primary Pacemaker daemon that launches and monitors all subsidiary Pacemaker daemons" -+ -+struct { -+ gboolean features; -+ gboolean foreground; -+ gboolean shutdown; -+ gboolean standby; -+} options; -+ -+static gboolean -+pid_cb(const gchar *option_name, const gchar *optarg, gpointer data, GError **err) { -+ return TRUE; -+} -+ -+static gboolean -+standby_cb(const gchar *option_name, const gchar *optarg, gpointer data, GError **err) { -+ options.standby = TRUE; -+ pcmk__set_env_option("node_start_state", "standby"); -+ return TRUE; -+} -+ -+static GOptionEntry entries[] = { -+ { "features", 'F', 0, G_OPTION_ARG_NONE, &options.features, -+ "Display full version and list of features Pacemaker was built with", -+ NULL }, 
-+ { "foreground", 'f', 0, G_OPTION_ARG_NONE, &options.foreground, -+ "(Ignored) Pacemaker always runs in the foreground", -+ NULL }, -+ { "pid-file", 'p', 0, G_OPTION_ARG_CALLBACK, pid_cb, -+ "(Ignored) Daemon pid file location", -+ "FILE" }, -+ { "shutdown", 'S', 0, G_OPTION_ARG_NONE, &options.shutdown, -+ "Instruct Pacemaker to shutdown on this machine", -+ NULL }, -+ { "standby", 's', G_OPTION_FLAG_NO_ARG, G_OPTION_ARG_CALLBACK, standby_cb, -+ "Start node in standby state", -+ NULL }, -+ -+ { NULL } -+}; -+ - static gboolean fatal_error = FALSE; - static GMainLoop *mainloop = NULL; - static bool global_keep_tracking = false; -@@ -642,49 +685,6 @@ pcmk_sigquit(int nsig) - .connection_destroyed = pcmk_ipc_destroy - }; - --static pcmk__cli_option_t long_options[] = { -- // long option, argument type, storage, short option, description, flags -- { -- "help", no_argument, NULL, '?', -- "\tThis text", pcmk__option_default -- }, -- { -- "version", no_argument, NULL, '$', -- "\tVersion information", pcmk__option_default -- }, -- { -- "verbose", no_argument, NULL, 'V', -- "\tIncrease debug output", pcmk__option_default -- }, -- { -- "shutdown", no_argument, NULL, 'S', -- "\tInstruct Pacemaker to shutdown on this machine", pcmk__option_default -- }, -- { -- "features", no_argument, NULL, 'F', -- "\tDisplay full version and list of features Pacemaker was built with", -- pcmk__option_default -- }, -- { -- "-spacer-", no_argument, NULL, '-', -- "\nAdditional Options:", pcmk__option_default -- }, -- { -- "foreground", no_argument, NULL, 'f', -- "\t(Ignored) Pacemaker always runs in the foreground", -- pcmk__option_default -- }, -- { -- "pid-file", required_argument, NULL, 'p', -- "\t(Ignored) Daemon pid file location", pcmk__option_default -- }, -- { -- "standby", no_argument, NULL, 's', -- "\tStart node in standby state", pcmk__option_default -- }, -- { 0, 0, 0, 0 } --}; -- - static void - mcp_chown(const char *path, uid_t uid, gid_t gid) - { -@@ -1168,83 +1211,66 @@ 
request_shutdown(crm_ipc_t *ipc) - return status; - } - -+static GOptionContext * -+build_arg_context(pcmk__common_args_t *args) { -+ GOptionContext *context = NULL; -+ -+ context = pcmk__build_arg_context(args, NULL, NULL, NULL); -+ pcmk__add_main_args(context, entries); -+ return context; -+} -+ - int - main(int argc, char **argv) - { -- int flag; -- int argerr = 0; -+ crm_exit_t exit_code = CRM_EX_OK; -+ -+ GError *error = NULL; -+ -+ pcmk__common_args_t *args = pcmk__new_common_args(SUMMARY); -+ gchar **processed_args = pcmk__cmdline_preproc(argv, "p"); -+ GOptionContext *context = build_arg_context(args); - -- int option_index = 0; - bool old_instance_connected = false; -- gboolean shutdown = FALSE; - - crm_ipc_t *old_instance = NULL; - qb_ipcs_service_t *ipcs = NULL; - - crm_log_preinit(NULL, argc, argv); -- pcmk__set_cli_options(NULL, "[options]", long_options, -- "primary Pacemaker daemon that launches and " -- "monitors all subsidiary Pacemaker daemons"); - mainloop_add_signal(SIGHUP, pcmk_ignore); - mainloop_add_signal(SIGQUIT, pcmk_sigquit); - -- while (1) { -- flag = pcmk__next_cli_option(argc, argv, &option_index, NULL); -- if (flag == -1) -- break; -- -- switch (flag) { -- case 'V': -- crm_bump_log_level(argc, argv); -- break; -- case 'f': -- /* Legacy */ -- break; -- case 'p': -- break; -- case 's': -- pcmk__set_env_option("node_start_state", "standby"); -- break; -- case '$': -- case '?': -- pcmk__cli_help(flag, CRM_EX_OK); -- break; -- case 'S': -- shutdown = TRUE; -- break; -- case 'F': -- printf("Pacemaker %s (Build: %s)\n Supporting v%s: %s\n", PACEMAKER_VERSION, BUILD_VERSION, -- CRM_FEATURE_SET, CRM_FEATURES); -- crm_exit(CRM_EX_OK); -- default: -- printf("Argument code 0%o (%c) is not (?yet?) 
supported\n", flag, flag); -- ++argerr; -- break; -- } -+ if (!g_option_context_parse_strv(context, &processed_args, &error)) { -+ exit_code = CRM_EX_USAGE; -+ goto done; - } - -- if (optind < argc) { -- printf("non-option ARGV-elements: "); -- while (optind < argc) -- printf("%s ", argv[optind++]); -- printf("\n"); -- } -- if (argerr) { -- pcmk__cli_help('?', CRM_EX_USAGE); -+ if (options.features) { -+ printf("Pacemaker %s (Build: %s)\n Supporting v%s: %s\n", PACEMAKER_VERSION, BUILD_VERSION, -+ CRM_FEATURE_SET, CRM_FEATURES); -+ exit_code = CRM_EX_OK; -+ goto done; - } - -+ if (args->version) { -+ g_strfreev(processed_args); -+ pcmk__free_arg_context(context); -+ /* FIXME: When pacemakerd is converted to use formatted output, this can go. */ -+ pcmk__cli_help('v', CRM_EX_USAGE); -+ } - - setenv("LC_ALL", "C", 1); - - pcmk__set_env_option("mcp", "true"); - -+ pcmk__cli_init_logging("pacemakerd", args->verbosity); - crm_log_init(NULL, LOG_INFO, TRUE, FALSE, argc, argv, FALSE); - - crm_debug("Checking for existing Pacemaker instance"); - old_instance = crm_ipc_new(CRM_SYSTEM_MCP, 0); - old_instance_connected = crm_ipc_connect(old_instance); - -- if (shutdown) { -+ if (options.shutdown) { - if (old_instance_connected) { - crm_exit(request_shutdown(old_instance)); - } else { -@@ -1253,22 +1279,25 @@ main(int argc, char **argv) - "Pacemaker instance: %s", strerror(errno)); - crm_ipc_close(old_instance); - crm_ipc_destroy(old_instance); -- crm_exit(CRM_EX_DISCONNECT); -+ exit_code = CRM_EX_DISCONNECT; -+ goto done; - } - - } else if (old_instance_connected) { - crm_ipc_close(old_instance); - crm_ipc_destroy(old_instance); - crm_err("Aborting start-up because active Pacemaker instance found"); -- crm_exit(CRM_EX_FATAL); -+ exit_code = CRM_EX_FATAL; -+ goto done; - } - - crm_ipc_close(old_instance); - crm_ipc_destroy(old_instance); - - #ifdef SUPPORT_COROSYNC - if (mcp_read_config() == FALSE) { -- crm_exit(CRM_EX_UNAVAILABLE); -+ exit_code = CRM_EX_UNAVAILABLE; -+ goto 
done; - } - #endif - -@@ -1292,7 +1321,8 @@ main(int argc, char **argv) - #ifdef SUPPORT_COROSYNC - /* Allows us to block shutdown */ - if (!cluster_connect_cfg()) { -- crm_exit(CRM_EX_PROTOCOL); -+ exit_code = CRM_EX_PROTOCOL; -+ goto done; - } - #endif - -@@ -1307,9 +1337,11 @@ main(int argc, char **argv) - case pcmk_rc_ok: - break; - case pcmk_rc_ipc_unauthorized: -- crm_exit(CRM_EX_CANTCREAT); -+ exit_code = CRM_EX_CANTCREAT; -+ goto done; - default: -- crm_exit(CRM_EX_FATAL); -+ exit_code = CRM_EX_FATAL; -+ goto done; - }; - - mainloop_add_signal(SIGTERM, pcmk_shutdown); -@@ -1342,5 +1374,11 @@ main(int argc, char **argv) - #ifdef SUPPORT_COROSYNC - cluster_disconnect_cfg(); - #endif -- crm_exit(CRM_EX_OK); -+ -+done: -+ g_strfreev(processed_args); -+ pcmk__free_arg_context(context); -+ -+ pcmk__output_and_clear_error(error, NULL); -+ crm_exit(exit_code); - } --- -1.8.3.1 - - -From 8f7924fbb2a012bedcad59335b7bebc5020b26e3 Mon Sep 17 00:00:00 2001 -From: Chris Lumens -Date: Thu, 13 May 2021 13:27:13 -0400 -Subject: [PATCH 4/4] Low: pacemaker.service: Don't start pacemakerd with -f. - -This option is completely ignored by pacemakerd. ---- - daemons/pacemakerd/pacemaker.service.in | 2 +- - doc/sphinx/Clusters_from_Scratch/verification.rst | 2 +- - 2 files changed, 2 insertions(+), 2 deletions(-) - -diff --git a/daemons/pacemakerd/pacemaker.service.in b/daemons/pacemakerd/pacemaker.service.in -index b128ddc..0363a22 100644 ---- a/daemons/pacemakerd/pacemaker.service.in -+++ b/daemons/pacemakerd/pacemaker.service.in -@@ -44,7 +44,7 @@ EnvironmentFile=-@CONFIGDIR@/pacemaker - EnvironmentFile=-@CONFIGDIR@/sbd - SuccessExitStatus=100 - --ExecStart=@sbindir@/pacemakerd -f -+ExecStart=@sbindir@/pacemakerd - - # Systemd v227 and above can limit the number of processes spawned by a - # service. 
That is a bad idea for an HA cluster resource manager, so disable it -diff --git a/doc/sphinx/Clusters_from_Scratch/verification.rst b/doc/sphinx/Clusters_from_Scratch/verification.rst -index 9d647f8..b7fa20e 100644 ---- a/doc/sphinx/Clusters_from_Scratch/verification.rst -+++ b/doc/sphinx/Clusters_from_Scratch/verification.rst -@@ -103,7 +103,7 @@ the necessary processes are running: - 2 ? S 0:00 [kthreadd] - ...lots of processes... - 17121 ? SLsl 0:01 /usr/sbin/corosync -f -- 17133 ? Ss 0:00 /usr/sbin/pacemakerd -f -+ 17133 ? Ss 0:00 /usr/sbin/pacemakerd - 17134 ? Ss 0:00 \_ /usr/libexec/pacemaker/pacemaker-based - 17135 ? Ss 0:00 \_ /usr/libexec/pacemaker/pacemaker-fenced - 17136 ? Ss 0:00 \_ /usr/libexec/pacemaker/pacemaker-execd --- -1.8.3.1 - diff --git a/SOURCES/003-fencing-reasons.patch b/SOURCES/003-fencing-reasons.patch new file mode 100644 index 0000000..666a12a --- /dev/null +++ b/SOURCES/003-fencing-reasons.patch @@ -0,0 +1,2476 @@ +From 8e6362cb2129bd56f817d449a195f3da87a545fa Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Fri, 12 Nov 2021 14:28:56 -0600 +Subject: [PATCH 01/13] Refactor: libcrmcommon,fencer: convenience macro for + initializing results + +for future reuse +--- + daemons/fenced/fenced_commands.c | 14 ++------------ + include/crm/common/results_internal.h | 15 +++++++++++++++ + 2 files changed, 17 insertions(+), 12 deletions(-) + +diff --git a/daemons/fenced/fenced_commands.c b/daemons/fenced/fenced_commands.c +index 87600573e..9f2f1cc40 100644 +--- a/daemons/fenced/fenced_commands.c ++++ b/daemons/fenced/fenced_commands.c +@@ -388,12 +388,7 @@ static void + report_internal_result(async_command_t *cmd, int exit_status, + int execution_status, const char *exit_reason) + { +- pcmk__action_result_t result = { +- // Ensure we don't pass garbage to free() +- .exit_reason = NULL, +- .action_stdout = NULL, +- .action_stderr = NULL +- }; ++ pcmk__action_result_t result = PCMK__UNKNOWN_RESULT; + + pcmk__set_result(&result, exit_status, 
execution_status, exit_reason); + cmd->done_cb(0, &result, cmd); +@@ -2616,12 +2611,7 @@ stonith_fence_get_devices_cb(GList * devices, void *user_data) + } + + if (device == NULL) { // No device found +- pcmk__action_result_t result = { +- // Ensure we don't pass garbage to free() +- .exit_reason = NULL, +- .action_stdout = NULL, +- .action_stderr = NULL +- }; ++ pcmk__action_result_t result = PCMK__UNKNOWN_RESULT; + + pcmk__set_result(&result, CRM_EX_ERROR, PCMK_EXEC_NO_FENCE_DEVICE, + "No fence device configured for target"); +diff --git a/include/crm/common/results_internal.h b/include/crm/common/results_internal.h +index 804bf2a7a..6befaa0ed 100644 +--- a/include/crm/common/results_internal.h ++++ b/include/crm/common/results_internal.h +@@ -30,6 +30,21 @@ typedef struct { + char *action_stderr; // Action error output + } pcmk__action_result_t; + ++/*! ++ * \internal ++ * \brief Static initialization for an action result ++ * ++ * \note Importantly, this ensures pcmk__reset_result() won't try to free ++ * garbage. ++ */ ++#define PCMK__UNKNOWN_RESULT { \ ++ .exit_status = CRM_EX_OK, \ ++ .execution_status = PCMK_EXEC_UNKNOWN, \ ++ .exit_reason = NULL, \ ++ .action_stdout = NULL, \ ++ .action_stderr = NULL, \ ++ } ++ + void pcmk__set_result(pcmk__action_result_t *result, int exit_status, + enum pcmk_exec_status exec_status, + const char *exit_reason); +-- +2.27.0 + + +From 0937c92476ac737a5f5146932824bde8bdd7db98 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Fri, 12 Nov 2021 16:02:27 -0600 +Subject: [PATCH 02/13] Refactor: various: add convenience function for + checking result success + +A successful pcmk__action_result_t has both exit status CRM_EX_OK (a.k.a +PCMK_OCF_OK) and execution status PCMK_EXEC_DONE. Since checking that is +clunky, we sometimes just check exit status, which is less than ideal. + +The convenience function makes it easy to check both, and improves readability. 
+--- + daemons/controld/controld_remote_ra.c | 4 ++-- + daemons/execd/execd_commands.c | 12 ++++++------ + daemons/fenced/fenced_commands.c | 14 ++++++-------- + include/crm/common/results_internal.h | 16 ++++++++++++++++ + lib/fencing/st_client.c | 4 ++-- + lib/fencing/st_rhcs.c | 2 +- + 6 files changed, 33 insertions(+), 19 deletions(-) + +diff --git a/daemons/controld/controld_remote_ra.c b/daemons/controld/controld_remote_ra.c +index 74cbfd673..55ac162c7 100644 +--- a/daemons/controld/controld_remote_ra.c ++++ b/daemons/controld/controld_remote_ra.c +@@ -297,7 +297,7 @@ static void + check_remote_node_state(remote_ra_cmd_t *cmd) + { + /* Only successful actions can change node state */ +- if (cmd->result.exit_status != PCMK_OCF_OK) { ++ if (!pcmk__result_ok(&(cmd->result))) { + return; + } + +@@ -365,7 +365,7 @@ report_remote_ra_result(remote_ra_cmd_t * cmd) + lrmd__set_result(&op, cmd->result.exit_status, cmd->result.execution_status, + cmd->result.exit_reason); + +- if (cmd->reported_success && (cmd->result.exit_status != PCMK_OCF_OK)) { ++ if (cmd->reported_success && !pcmk__result_ok(&(cmd->result))) { + op.t_rcchange = (unsigned int) time(NULL); + /* This edge case will likely never ever occur, but if it does the + * result is that a failure will not be processed correctly. This is only +diff --git a/daemons/execd/execd_commands.c b/daemons/execd/execd_commands.c +index 667525039..02070bf11 100644 +--- a/daemons/execd/execd_commands.c ++++ b/daemons/execd/execd_commands.c +@@ -878,7 +878,7 @@ action_complete(svc_action_t * action) + } + + if (pcmk__str_eq(rclass, PCMK_RESOURCE_CLASS_SYSTEMD, pcmk__str_casei)) { +- if ((cmd->result.exit_status == PCMK_OCF_OK) ++ if (pcmk__result_ok(&(cmd->result)) + && pcmk__strcase_any_of(cmd->action, "start", "stop", NULL)) { + /* systemd returns from start and stop actions after the action + * begins, not after it completes. 
We have to jump through a few +@@ -894,7 +894,7 @@ action_complete(svc_action_t * action) + if (cmd->result.execution_status == PCMK_EXEC_PENDING) { + goagain = true; + +- } else if ((cmd->result.exit_status == PCMK_OCF_OK) ++ } else if (pcmk__result_ok(&(cmd->result)) + && pcmk__str_eq(cmd->real_action, "stop", pcmk__str_casei)) { + goagain = true; + +@@ -927,12 +927,12 @@ action_complete(svc_action_t * action) + #if SUPPORT_NAGIOS + if (rsc && pcmk__str_eq(rsc->class, PCMK_RESOURCE_CLASS_NAGIOS, pcmk__str_casei)) { + if (action_matches(cmd, "monitor", 0) +- && (cmd->result.exit_status == PCMK_OCF_OK)) { ++ && pcmk__result_ok(&(cmd->result))) { + /* Successfully executed --version for the nagios plugin */ + cmd->result.exit_status = PCMK_OCF_NOT_RUNNING; + + } else if (pcmk__str_eq(cmd->action, "start", pcmk__str_casei) +- && (cmd->result.exit_status != PCMK_OCF_OK)) { ++ && !pcmk__result_ok(&(cmd->result))) { + #ifdef PCMK__TIME_USE_CGT + goagain = true; + #endif +@@ -955,7 +955,7 @@ action_complete(svc_action_t * action) + cmd->start_delay = delay; + cmd->timeout = timeout_left; + +- if (cmd->result.exit_status == PCMK_OCF_OK) { ++ if (pcmk__result_ok(&(cmd->result))) { + crm_debug("%s %s may still be in progress: re-scheduling (elapsed=%dms, remaining=%dms, start_delay=%dms)", + cmd->rsc_id, cmd->real_action, time_sum, timeout_left, delay); + +@@ -1066,7 +1066,7 @@ stonith_action_complete(lrmd_cmd_t * cmd, int rc) + cmd->interval_ms, rc); + + // Certain successful actions change the known state of the resource +- if ((rsc != NULL) && (cmd->result.exit_status == PCMK_OCF_OK)) { ++ if ((rsc != NULL) && pcmk__result_ok(&(cmd->result))) { + if (pcmk__str_eq(cmd->action, "start", pcmk__str_casei)) { + rsc->st_probe_rc = pcmk_ok; // maps to PCMK_OCF_OK + } else if (pcmk__str_eq(cmd->action, "stop", pcmk__str_casei)) { +diff --git a/daemons/fenced/fenced_commands.c b/daemons/fenced/fenced_commands.c +index 9f2f1cc40..26501a4b3 100644 +--- 
a/daemons/fenced/fenced_commands.c ++++ b/daemons/fenced/fenced_commands.c +@@ -1188,8 +1188,7 @@ dynamic_list_search_cb(int pid, const pcmk__action_result_t *result, + + mainloop_set_trigger(dev->work); + +- if ((result->execution_status == PCMK_EXEC_DONE) +- && (result->exit_status == CRM_EX_OK)) { ++ if (pcmk__result_ok(result)) { + crm_info("Refreshing target list for %s", dev->id); + g_list_free_full(dev->targets, free); + dev->targets = stonith__parse_targets(result->action_stdout); +@@ -2310,15 +2309,14 @@ log_async_result(async_command_t *cmd, const pcmk__action_result_t *result, + GString *msg = g_string_sized_new(80); // Reasonable starting size + + // Choose log levels appropriately if we have a result +- if ((result->execution_status == PCMK_EXEC_DONE) +- && (result->exit_status == CRM_EX_OK)) { // Success ++ if (pcmk__result_ok(result)) { + log_level = (cmd->victim == NULL)? LOG_DEBUG : LOG_NOTICE; + if ((result->action_stdout != NULL) + && !pcmk__str_eq(cmd->action, "metadata", pcmk__str_casei)) { + output_log_level = LOG_DEBUG; + } + next = NULL; +- } else { // Failure ++ } else { + log_level = (cmd->victim == NULL)? 
LOG_NOTICE : LOG_ERR; + if ((result->action_stdout != NULL) + && !pcmk__str_eq(cmd->action, "metadata", pcmk__str_casei)) { +@@ -2482,7 +2480,7 @@ st_child_done(int pid, const pcmk__action_result_t *result, void *user_data) + /* The device is ready to do something else now */ + device = g_hash_table_lookup(device_list, cmd->device); + if (device) { +- if (!device->verified && (result->exit_status == CRM_EX_OK) && ++ if (!device->verified && pcmk__result_ok(result) && + (pcmk__strcase_any_of(cmd->action, "list", "monitor", "status", NULL))) { + + device->verified = TRUE; +@@ -2491,7 +2489,7 @@ st_child_done(int pid, const pcmk__action_result_t *result, void *user_data) + mainloop_set_trigger(device->work); + } + +- if (result->exit_status == CRM_EX_OK) { ++ if (pcmk__result_ok(result)) { + GList *iter; + /* see if there are any required devices left to execute for this op */ + for (iter = cmd->device_next; iter != NULL; iter = iter->next) { +@@ -2523,7 +2521,7 @@ st_child_done(int pid, const pcmk__action_result_t *result, void *user_data) + + send_async_reply(cmd, result, pid, false); + +- if (result->exit_status != CRM_EX_OK) { ++ if (!pcmk__result_ok(result)) { + goto done; + } + +diff --git a/include/crm/common/results_internal.h b/include/crm/common/results_internal.h +index 6befaa0ed..0c5833937 100644 +--- a/include/crm/common/results_internal.h ++++ b/include/crm/common/results_internal.h +@@ -54,4 +54,20 @@ void pcmk__set_result_output(pcmk__action_result_t *result, + + void pcmk__reset_result(pcmk__action_result_t *result); + ++/*! 
++ * \internal ++ * \brief Check whether a result is OK ++ * ++ * \param[in] result ++ * ++ * \return true if the result's exit status is CRM_EX_OK and its ++ * execution status is PCMK_EXEC_DONE, otherwise false ++ */ ++static inline bool ++pcmk__result_ok(const pcmk__action_result_t *result) ++{ ++ return (result != NULL) && (result->exit_status == CRM_EX_OK) ++ && (result->execution_status == PCMK_EXEC_DONE); ++} ++ + #endif // PCMK__COMMON_RESULTS_INTERNAL__H +diff --git a/lib/fencing/st_client.c b/lib/fencing/st_client.c +index 2fbff7f24..af461d0d4 100644 +--- a/lib/fencing/st_client.c ++++ b/lib/fencing/st_client.c +@@ -760,7 +760,7 @@ stonith__result2rc(const pcmk__action_result_t *result) + default: break; + } + +- if (result->exit_status == CRM_EX_OK) { ++ if (pcmk__result_ok(result)) { + return pcmk_rc_ok; + } + +@@ -797,7 +797,7 @@ stonith_action_async_done(svc_action_t *svc_action) + + log_action(action, action->pid); + +- if ((action->result.exit_status != CRM_EX_OK) ++ if (!pcmk__result_ok(&(action->result)) + && update_remaining_timeout(action)) { + + int rc = internal_stonith_action_execute(action); +diff --git a/lib/fencing/st_rhcs.c b/lib/fencing/st_rhcs.c +index 6c8cbedc7..865e04bc2 100644 +--- a/lib/fencing/st_rhcs.c ++++ b/lib/fencing/st_rhcs.c +@@ -148,7 +148,7 @@ stonith__rhcs_get_metadata(const char *agent, int timeout, xmlNode **metadata) + return rc; + } + +- if (result->exit_status != CRM_EX_OK) { ++ if (!pcmk__result_ok(result)) { + crm_warn("Metadata action for %s returned error code %d", + agent, result->exit_status); + rc = pcmk_rc2legacy(stonith__result2rc(result)); +-- +2.27.0 + + +From 4c39ff00a0c028354a9da7f80986f7e34b05ba08 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Fri, 12 Nov 2021 16:07:01 -0600 +Subject: [PATCH 03/13] Low: fencing: improve mapping of execution status to + legacy return code + +PCMK_EXEC_PENDING is likely not possible with the current code, but map it to +EINPROGRESS for completeness. 
+ +PCMK_EXEC_INVALID is not yet used by the fencer but will be. +--- + lib/fencing/st_client.c | 30 ++++++++++++++++++++++++++---- + 1 file changed, 26 insertions(+), 4 deletions(-) + +diff --git a/lib/fencing/st_client.c b/lib/fencing/st_client.c +index af461d0d4..93513e9f3 100644 +--- a/lib/fencing/st_client.c ++++ b/lib/fencing/st_client.c +@@ -749,7 +749,12 @@ update_remaining_timeout(stonith_action_t * action) + int + stonith__result2rc(const pcmk__action_result_t *result) + { ++ if (pcmk__result_ok(result)) { ++ return pcmk_rc_ok; ++ } ++ + switch (result->execution_status) { ++ case PCMK_EXEC_PENDING: return EINPROGRESS; + case PCMK_EXEC_CANCELLED: return ECANCELED; + case PCMK_EXEC_TIMEOUT: return ETIME; + case PCMK_EXEC_NOT_INSTALLED: return ENOENT; +@@ -757,11 +762,28 @@ stonith__result2rc(const pcmk__action_result_t *result) + case PCMK_EXEC_NOT_CONNECTED: return ENOTCONN; + case PCMK_EXEC_NO_FENCE_DEVICE: return ENODEV; + case PCMK_EXEC_NO_SECRETS: return EACCES; +- default: break; +- } + +- if (pcmk__result_ok(result)) { +- return pcmk_rc_ok; ++ /* For the fencing API, PCMK_EXEC_INVALID is used with fencer API ++ * operations that don't involve executing an agent (for example, ++ * registering devices). This allows us to use the CRM_EX_* codes in the ++ * exit status for finer-grained responses. ++ */ ++ case PCMK_EXEC_INVALID: ++ switch (result->exit_status) { ++ case CRM_EX_INSUFFICIENT_PRIV: return EACCES; ++ case CRM_EX_PROTOCOL: return EPROTO; ++ ++ /* CRM_EX_EXPIRED is used for orphaned fencing operations left ++ * over from a previous instance of the fencer. For API backward ++ * compatibility, this is mapped to the previously used code for ++ * this case, EHOSTUNREACH. 
++ */ ++ case CRM_EX_EXPIRED: return EHOSTUNREACH; ++ default: break; ++ } ++ ++ default: ++ break; + } + + // Try to provide useful error code based on result's error output +-- +2.27.0 + + +From 4e638783d1cd7c9398a603fc6df7e9d868262b16 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Thu, 18 Nov 2021 11:41:12 -0600 +Subject: [PATCH 04/13] Refactor: libstonithd: separate action-related code + into own source file + +Everything related to stonith_action_t has been moved from st_client.c to a new +st_actions.c, since st_client.c was ridiculously large, and the action stuff +isn't all client-related. No code was changed. + +Before: + 2804 st_client.c + +After: + 545 lib/fencing/st_actions.c + 2278 lib/fencing/st_client.c +--- + lib/fencing/Makefile.am | 2 +- + lib/fencing/st_actions.c | 545 +++++++++++++++++++++++++++++++++++++++ + lib/fencing/st_client.c | 528 +------------------------------------ + 3 files changed, 547 insertions(+), 528 deletions(-) + create mode 100644 lib/fencing/st_actions.c + +diff --git a/lib/fencing/Makefile.am b/lib/fencing/Makefile.am +index 205c4873d..dac215c16 100644 +--- a/lib/fencing/Makefile.am ++++ b/lib/fencing/Makefile.am +@@ -22,7 +22,7 @@ libstonithd_la_LDFLAGS += $(LDFLAGS_HARDENED_LIB) + libstonithd_la_LIBADD = $(top_builddir)/lib/common/libcrmcommon.la + libstonithd_la_LIBADD += $(top_builddir)/lib/services/libcrmservice.la + +-libstonithd_la_SOURCES = st_client.c st_output.c st_rhcs.c ++libstonithd_la_SOURCES = st_actions.c st_client.c st_output.c st_rhcs.c + if BUILD_LHA_SUPPORT + libstonithd_la_SOURCES += st_lha.c + endif +diff --git a/lib/fencing/st_actions.c b/lib/fencing/st_actions.c +new file mode 100644 +index 000000000..64d3afd5d +--- /dev/null ++++ b/lib/fencing/st_actions.c +@@ -0,0 +1,545 @@ ++/* ++ * Copyright 2004-2021 the Pacemaker project contributors ++ * ++ * The version control history for this file may have further details. 
++ * ++ * This source code is licensed under the GNU Lesser General Public License ++ * version 2.1 or later (LGPLv2.1+) WITHOUT ANY WARRANTY. ++ */ ++ ++#include ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++#include ++#include ++#include ++ ++#include "fencing_private.h" ++ ++struct stonith_action_s { ++ /*! user defined data */ ++ char *agent; ++ char *action; ++ char *victim; ++ GHashTable *args; ++ int timeout; ++ int async; ++ void *userdata; ++ void (*done_cb) (int pid, const pcmk__action_result_t *result, ++ void *user_data); ++ void (*fork_cb) (int pid, void *user_data); ++ ++ svc_action_t *svc_action; ++ ++ /*! internal timing information */ ++ time_t initial_start_time; ++ int tries; ++ int remaining_timeout; ++ int max_retries; ++ ++ int pid; ++ pcmk__action_result_t result; ++}; ++ ++static int internal_stonith_action_execute(stonith_action_t *action); ++static void log_action(stonith_action_t *action, pid_t pid); ++ ++/*! ++ * \internal ++ * \brief Set an action's result based on services library result ++ * ++ * \param[in] action Fence action to set result for ++ * \param[in] svc_action Service action to get result from ++ */ ++static void ++set_result_from_svc_action(stonith_action_t *action, svc_action_t *svc_action) ++{ ++ pcmk__set_result(&(action->result), svc_action->rc, svc_action->status, ++ services__exit_reason(svc_action)); ++ pcmk__set_result_output(&(action->result), ++ services__grab_stdout(svc_action), ++ services__grab_stderr(svc_action)); ++} ++ ++static void ++log_action(stonith_action_t *action, pid_t pid) ++{ ++ /* The services library has already logged the output at info or debug ++ * level, so just raise to warning for stderr. 
++ */ ++ if (action->result.action_stderr != NULL) { ++ /* Logging the whole string confuses syslog when the string is xml */ ++ char *prefix = crm_strdup_printf("%s[%d] stderr:", action->agent, pid); ++ ++ crm_log_output(LOG_WARNING, prefix, action->result.action_stderr); ++ free(prefix); ++ } ++} ++ ++static void ++append_config_arg(gpointer key, gpointer value, gpointer user_data) ++{ ++ /* The fencer will filter "action" out when it registers the device, ++ * but ignore it here in case any external API users don't. ++ * ++ * Also filter out parameters handled directly by Pacemaker. ++ */ ++ if (!pcmk__str_eq(key, STONITH_ATTR_ACTION_OP, pcmk__str_casei) ++ && !pcmk_stonith_param(key) ++ && (strstr(key, CRM_META) == NULL) ++ && !pcmk__str_eq(key, "crm_feature_set", pcmk__str_casei)) { ++ ++ crm_trace("Passing %s=%s with fence action", ++ (const char *) key, (const char *) (value? value : "")); ++ g_hash_table_insert((GHashTable *) user_data, ++ strdup(key), strdup(value? value : "")); ++ } ++} ++ ++static GHashTable * ++make_args(const char *agent, const char *action, const char *victim, ++ uint32_t victim_nodeid, GHashTable * device_args, ++ GHashTable * port_map, const char *host_arg) ++{ ++ GHashTable *arg_list = NULL; ++ const char *value = NULL; ++ ++ CRM_CHECK(action != NULL, return NULL); ++ ++ arg_list = pcmk__strkey_table(free, free); ++ ++ // Add action to arguments (using an alias if requested) ++ if (device_args) { ++ char buffer[512]; ++ ++ snprintf(buffer, sizeof(buffer), "pcmk_%s_action", action); ++ value = g_hash_table_lookup(device_args, buffer); ++ if (value) { ++ crm_debug("Substituting '%s' for fence action %s targeting %s", ++ value, action, victim); ++ action = value; ++ } ++ } ++ g_hash_table_insert(arg_list, strdup(STONITH_ATTR_ACTION_OP), ++ strdup(action)); ++ ++ /* If this is a fencing operation against another node, add more standard ++ * arguments. 
++ */ ++ if (victim && device_args) { ++ const char *param = NULL; ++ ++ /* Always pass the target's name, per ++ * https://github.com/ClusterLabs/fence-agents/blob/master/doc/FenceAgentAPI.md ++ */ ++ g_hash_table_insert(arg_list, strdup("nodename"), strdup(victim)); ++ ++ // If the target's node ID was specified, pass it, too ++ if (victim_nodeid) { ++ char *nodeid = crm_strdup_printf("%" PRIu32, victim_nodeid); ++ ++ // cts-fencing looks for this log message ++ crm_info("Passing '%s' as nodeid with fence action '%s' targeting %s", ++ nodeid, action, victim); ++ g_hash_table_insert(arg_list, strdup("nodeid"), nodeid); ++ } ++ ++ // Check whether target must be specified in some other way ++ param = g_hash_table_lookup(device_args, PCMK_STONITH_HOST_ARGUMENT); ++ if (!pcmk__str_eq(agent, "fence_legacy", pcmk__str_none) ++ && !pcmk__str_eq(param, "none", pcmk__str_casei)) { ++ ++ if (param == NULL) { ++ /* Use the caller's default for pcmk_host_argument, or "port" if ++ * none was given ++ */ ++ param = (host_arg == NULL)? "port" : host_arg; ++ } ++ value = g_hash_table_lookup(device_args, param); ++ ++ if (pcmk__str_eq(value, "dynamic", ++ pcmk__str_casei|pcmk__str_null_matches)) { ++ /* If the host argument was "dynamic" or not explicitly specified, ++ * add it with the target ++ */ ++ const char *alias = NULL; ++ ++ if (port_map) { ++ alias = g_hash_table_lookup(port_map, victim); ++ } ++ if (alias == NULL) { ++ alias = victim; ++ } ++ crm_debug("Passing %s='%s' with fence action %s targeting %s", ++ param, alias, action, victim); ++ g_hash_table_insert(arg_list, strdup(param), strdup(alias)); ++ } ++ } ++ } ++ ++ if (device_args) { ++ g_hash_table_foreach(device_args, append_config_arg, arg_list); ++ } ++ ++ return arg_list; ++} ++ ++/*! 
++ * \internal ++ * \brief Free all memory used by a stonith action ++ * ++ * \param[in,out] action Action to free ++ */ ++void ++stonith__destroy_action(stonith_action_t *action) ++{ ++ if (action) { ++ free(action->agent); ++ if (action->args) { ++ g_hash_table_destroy(action->args); ++ } ++ free(action->action); ++ free(action->victim); ++ if (action->svc_action) { ++ services_action_free(action->svc_action); ++ } ++ pcmk__reset_result(&(action->result)); ++ free(action); ++ } ++} ++ ++/*! ++ * \internal ++ * \brief Get the result of an executed stonith action ++ * ++ * \param[in] action Executed action ++ * ++ * \return Pointer to action's result (or NULL if \p action is NULL) ++ */ ++pcmk__action_result_t * ++stonith__action_result(stonith_action_t *action) ++{ ++ return (action == NULL)? NULL : &(action->result); ++} ++ ++#define FAILURE_MAX_RETRIES 2 ++stonith_action_t * ++stonith_action_create(const char *agent, ++ const char *_action, ++ const char *victim, ++ uint32_t victim_nodeid, ++ int timeout, GHashTable * device_args, ++ GHashTable * port_map, const char *host_arg) ++{ ++ stonith_action_t *action; ++ ++ action = calloc(1, sizeof(stonith_action_t)); ++ action->args = make_args(agent, _action, victim, victim_nodeid, ++ device_args, port_map, host_arg); ++ crm_debug("Preparing '%s' action for %s using agent %s", ++ _action, (victim? 
victim : "no target"), agent); ++ action->agent = strdup(agent); ++ action->action = strdup(_action); ++ if (victim) { ++ action->victim = strdup(victim); ++ } ++ action->timeout = action->remaining_timeout = timeout; ++ action->max_retries = FAILURE_MAX_RETRIES; ++ ++ pcmk__set_result(&(action->result), PCMK_OCF_UNKNOWN, PCMK_EXEC_UNKNOWN, ++ "Initialization bug in fencing library"); ++ ++ if (device_args) { ++ char buffer[512]; ++ const char *value = NULL; ++ ++ snprintf(buffer, sizeof(buffer), "pcmk_%s_retries", _action); ++ value = g_hash_table_lookup(device_args, buffer); ++ ++ if (value) { ++ action->max_retries = atoi(value); ++ } ++ } ++ ++ return action; ++} ++ ++static gboolean ++update_remaining_timeout(stonith_action_t * action) ++{ ++ int diff = time(NULL) - action->initial_start_time; ++ ++ if (action->tries >= action->max_retries) { ++ crm_info("Attempted to execute agent %s (%s) the maximum number of times (%d) allowed", ++ action->agent, action->action, action->max_retries); ++ action->remaining_timeout = 0; ++ } else if ((action->result.execution_status != PCMK_EXEC_TIMEOUT) ++ && (diff < (action->timeout * 0.7))) { ++ /* only set remaining timeout period if there is 30% ++ * or greater of the original timeout period left */ ++ action->remaining_timeout = action->timeout - diff; ++ } else { ++ action->remaining_timeout = 0; ++ } ++ return action->remaining_timeout ? TRUE : FALSE; ++} ++ ++/*! 
++ * \internal ++ * \brief Map a fencing action result to a standard return code ++ * ++ * \param[in] result Fencing action result to map ++ * ++ * \return Standard Pacemaker return code that best corresponds to \p result ++ */ ++int ++stonith__result2rc(const pcmk__action_result_t *result) ++{ ++ if (pcmk__result_ok(result)) { ++ return pcmk_rc_ok; ++ } ++ ++ switch (result->execution_status) { ++ case PCMK_EXEC_PENDING: return EINPROGRESS; ++ case PCMK_EXEC_CANCELLED: return ECANCELED; ++ case PCMK_EXEC_TIMEOUT: return ETIME; ++ case PCMK_EXEC_NOT_INSTALLED: return ENOENT; ++ case PCMK_EXEC_NOT_SUPPORTED: return EOPNOTSUPP; ++ case PCMK_EXEC_NOT_CONNECTED: return ENOTCONN; ++ case PCMK_EXEC_NO_FENCE_DEVICE: return ENODEV; ++ case PCMK_EXEC_NO_SECRETS: return EACCES; ++ ++ /* For the fencing API, PCMK_EXEC_INVALID is used with fencer API ++ * operations that don't involve executing an agent (for example, ++ * registering devices). This allows us to use the CRM_EX_* codes in the ++ * exit status for finer-grained responses. ++ */ ++ case PCMK_EXEC_INVALID: ++ switch (result->exit_status) { ++ case CRM_EX_INSUFFICIENT_PRIV: return EACCES; ++ case CRM_EX_PROTOCOL: return EPROTO; ++ ++ /* CRM_EX_EXPIRED is used for orphaned fencing operations left ++ * over from a previous instance of the fencer. For API backward ++ * compatibility, this is mapped to the previously used code for ++ * this case, EHOSTUNREACH. 
++ */ ++ case CRM_EX_EXPIRED: return EHOSTUNREACH; ++ default: break; ++ } ++ ++ default: ++ break; ++ } ++ ++ // Try to provide useful error code based on result's error output ++ ++ if (result->action_stderr == NULL) { ++ return ENODATA; ++ ++ } else if (strcasestr(result->action_stderr, "timed out") ++ || strcasestr(result->action_stderr, "timeout")) { ++ return ETIME; ++ ++ } else if (strcasestr(result->action_stderr, "unrecognised action") ++ || strcasestr(result->action_stderr, "unrecognized action") ++ || strcasestr(result->action_stderr, "unsupported action")) { ++ return EOPNOTSUPP; ++ } ++ ++ // Oh well, we tried ++ return pcmk_rc_error; ++} ++ ++static void ++stonith_action_async_done(svc_action_t *svc_action) ++{ ++ stonith_action_t *action = (stonith_action_t *) svc_action->cb_data; ++ ++ set_result_from_svc_action(action, svc_action); ++ ++ svc_action->params = NULL; ++ ++ crm_debug("Child process %d performing action '%s' exited with rc %d", ++ action->pid, action->action, svc_action->rc); ++ ++ log_action(action, action->pid); ++ ++ if (!pcmk__result_ok(&(action->result)) ++ && update_remaining_timeout(action)) { ++ ++ int rc = internal_stonith_action_execute(action); ++ if (rc == pcmk_ok) { ++ return; ++ } ++ } ++ ++ if (action->done_cb) { ++ action->done_cb(action->pid, &(action->result), action->userdata); ++ } ++ ++ action->svc_action = NULL; // don't remove our caller ++ stonith__destroy_action(action); ++} ++ ++static void ++stonith_action_async_forked(svc_action_t *svc_action) ++{ ++ stonith_action_t *action = (stonith_action_t *) svc_action->cb_data; ++ ++ action->pid = svc_action->pid; ++ action->svc_action = svc_action; ++ ++ if (action->fork_cb) { ++ (action->fork_cb) (svc_action->pid, action->userdata); ++ } ++ ++ crm_trace("Child process %d performing action '%s' successfully forked", ++ action->pid, action->action); ++} ++ ++static int ++internal_stonith_action_execute(stonith_action_t * action) ++{ ++ int rc = -EPROTO; ++ int is_retry 
= 0; ++ svc_action_t *svc_action = NULL; ++ static int stonith_sequence = 0; ++ char *buffer = NULL; ++ ++ CRM_CHECK(action != NULL, return -EINVAL); ++ ++ if ((action->action == NULL) || (action->args == NULL) ++ || (action->agent == NULL)) { ++ pcmk__set_result(&(action->result), PCMK_OCF_UNKNOWN_ERROR, ++ PCMK_EXEC_ERROR_FATAL, "Bug in fencing library"); ++ return -EINVAL; ++ } ++ ++ if (!action->tries) { ++ action->initial_start_time = time(NULL); ++ } ++ action->tries++; ++ ++ if (action->tries > 1) { ++ crm_info("Attempt %d to execute %s (%s). remaining timeout is %d", ++ action->tries, action->agent, action->action, action->remaining_timeout); ++ is_retry = 1; ++ } ++ ++ buffer = crm_strdup_printf(PCMK__FENCE_BINDIR "/%s", ++ basename(action->agent)); ++ svc_action = services_action_create_generic(buffer, NULL); ++ free(buffer); ++ ++ if (svc_action->rc != PCMK_OCF_UNKNOWN) { ++ set_result_from_svc_action(action, svc_action); ++ services_action_free(svc_action); ++ return -E2BIG; ++ } ++ ++ svc_action->timeout = 1000 * action->remaining_timeout; ++ svc_action->standard = strdup(PCMK_RESOURCE_CLASS_STONITH); ++ svc_action->id = crm_strdup_printf("%s_%s_%d", basename(action->agent), ++ action->action, action->tries); ++ svc_action->agent = strdup(action->agent); ++ svc_action->sequence = stonith_sequence++; ++ svc_action->params = action->args; ++ svc_action->cb_data = (void *) action; ++ svc_action->flags = pcmk__set_flags_as(__func__, __LINE__, ++ LOG_TRACE, "Action", ++ svc_action->id, svc_action->flags, ++ SVC_ACTION_NON_BLOCKED, ++ "SVC_ACTION_NON_BLOCKED"); ++ ++ /* keep retries from executing out of control and free previous results */ ++ if (is_retry) { ++ pcmk__reset_result(&(action->result)); ++ sleep(1); ++ } ++ ++ if (action->async) { ++ /* async */ ++ if (services_action_async_fork_notify(svc_action, ++ &stonith_action_async_done, ++ &stonith_action_async_forked)) { ++ pcmk__set_result(&(action->result), PCMK_OCF_UNKNOWN, ++ PCMK_EXEC_PENDING, 
NULL); ++ return pcmk_ok; ++ } ++ ++ } else if (services_action_sync(svc_action)) { // sync success ++ rc = pcmk_ok; ++ ++ } else { // sync failure ++ rc = -ECONNABORTED; ++ } ++ ++ set_result_from_svc_action(action, svc_action); ++ svc_action->params = NULL; ++ services_action_free(svc_action); ++ return rc; ++} ++ ++/*! ++ * \internal ++ * \brief Kick off execution of an async stonith action ++ * ++ * \param[in,out] action Action to be executed ++ * \param[in,out] userdata Datapointer to be passed to callbacks ++ * \param[in] done Callback to notify action has failed/succeeded ++ * \param[in] fork_callback Callback to notify successful fork of child ++ * ++ * \return pcmk_ok if ownership of action has been taken, -errno otherwise ++ */ ++int ++stonith_action_execute_async(stonith_action_t * action, ++ void *userdata, ++ void (*done) (int pid, ++ const pcmk__action_result_t *result, ++ void *user_data), ++ void (*fork_cb) (int pid, void *user_data)) ++{ ++ if (!action) { ++ return -EINVAL; ++ } ++ ++ action->userdata = userdata; ++ action->done_cb = done; ++ action->fork_cb = fork_cb; ++ action->async = 1; ++ ++ return internal_stonith_action_execute(action); ++} ++ ++/*! 
++ * \internal ++ * \brief Execute a stonith action ++ * ++ * \param[in,out] action Action to execute ++ * ++ * \return pcmk_ok on success, -errno otherwise ++ */ ++int ++stonith__execute(stonith_action_t *action) ++{ ++ int rc = pcmk_ok; ++ ++ CRM_CHECK(action != NULL, return -EINVAL); ++ ++ // Keep trying until success, max retries, or timeout ++ do { ++ rc = internal_stonith_action_execute(action); ++ } while ((rc != pcmk_ok) && update_remaining_timeout(action)); ++ ++ return rc; ++} +diff --git a/lib/fencing/st_client.c b/lib/fencing/st_client.c +index 93513e9f3..944cd1863 100644 +--- a/lib/fencing/st_client.c ++++ b/lib/fencing/st_client.c +@@ -8,28 +8,20 @@ + */ + + #include +-#include ++ + #include + #include + #include + #include + #include +-#include + #include +- +-#include + #include +-#include +- + #include + + #include + #include + #include + #include +-#include +-#include +-#include + + #include + +@@ -37,31 +29,6 @@ + + CRM_TRACE_INIT_DATA(stonith); + +-struct stonith_action_s { +- /*! user defined data */ +- char *agent; +- char *action; +- char *victim; +- GHashTable *args; +- int timeout; +- int async; +- void *userdata; +- void (*done_cb) (int pid, const pcmk__action_result_t *result, +- void *user_data); +- void (*fork_cb) (int pid, void *user_data); +- +- svc_action_t *svc_action; +- +- /*! internal timing information */ +- time_t initial_start_time; +- int tries; +- int remaining_timeout; +- int max_retries; +- +- int pid; +- pcmk__action_result_t result; +-}; +- + typedef struct stonith_private_s { + char *token; + crm_ipc_t *ipc; +@@ -118,8 +85,6 @@ static int stonith_send_command(stonith_t *stonith, const char *op, + + static void stonith_connection_destroy(gpointer user_data); + static void stonith_send_notification(gpointer data, gpointer user_data); +-static int internal_stonith_action_execute(stonith_action_t * action); +-static void log_action(stonith_action_t *action, pid_t pid); + + /*! 
+ * \brief Get agent namespace by name +@@ -196,23 +161,6 @@ stonith_get_namespace(const char *agent, const char *namespace_s) + return st_namespace_invalid; + } + +-/*! +- * \internal +- * \brief Set an action's result based on services library result +- * +- * \param[in] action Fence action to set result for +- * \param[in] svc_action Service action to get result from +- */ +-static void +-set_result_from_svc_action(stonith_action_t *action, svc_action_t *svc_action) +-{ +- pcmk__set_result(&(action->result), svc_action->rc, svc_action->status, +- services__exit_reason(svc_action)); +- pcmk__set_result_output(&(action->result), +- services__grab_stdout(svc_action), +- services__grab_stderr(svc_action)); +-} +- + gboolean + stonith__watchdog_fencing_enabled_for_node_api(stonith_t *st, const char *node) + { +@@ -273,21 +221,6 @@ stonith__watchdog_fencing_enabled_for_node(const char *node) + return stonith__watchdog_fencing_enabled_for_node_api(NULL, node); + } + +-static void +-log_action(stonith_action_t *action, pid_t pid) +-{ +- /* The services library has already logged the output at info or debug +- * level, so just raise to warning for stderr. +- */ +- if (action->result.action_stderr != NULL) { +- /* Logging the whole string confuses syslog when the string is xml */ +- char *prefix = crm_strdup_printf("%s[%d] stderr:", action->agent, pid); +- +- crm_log_output(LOG_WARNING, prefix, action->result.action_stderr); +- free(prefix); +- } +-} +- + /* when cycling through the list we don't want to delete items + so just mark them and when we know nobody is using the list + loop over it to remove the marked items +@@ -530,465 +463,6 @@ stonith_api_register_level(stonith_t * st, int options, const char *node, int le + level, device_list); + } + +-static void +-append_config_arg(gpointer key, gpointer value, gpointer user_data) +-{ +- /* The fencer will filter "action" out when it registers the device, +- * but ignore it here in case any external API users don't. 
+- * +- * Also filter out parameters handled directly by Pacemaker. +- */ +- if (!pcmk__str_eq(key, STONITH_ATTR_ACTION_OP, pcmk__str_casei) +- && !pcmk_stonith_param(key) +- && (strstr(key, CRM_META) == NULL) +- && !pcmk__str_eq(key, "crm_feature_set", pcmk__str_casei)) { +- +- crm_trace("Passing %s=%s with fence action", +- (const char *) key, (const char *) (value? value : "")); +- g_hash_table_insert((GHashTable *) user_data, +- strdup(key), strdup(value? value : "")); +- } +-} +- +-static GHashTable * +-make_args(const char *agent, const char *action, const char *victim, +- uint32_t victim_nodeid, GHashTable * device_args, +- GHashTable * port_map, const char *host_arg) +-{ +- GHashTable *arg_list = NULL; +- const char *value = NULL; +- +- CRM_CHECK(action != NULL, return NULL); +- +- arg_list = pcmk__strkey_table(free, free); +- +- // Add action to arguments (using an alias if requested) +- if (device_args) { +- char buffer[512]; +- +- snprintf(buffer, sizeof(buffer), "pcmk_%s_action", action); +- value = g_hash_table_lookup(device_args, buffer); +- if (value) { +- crm_debug("Substituting '%s' for fence action %s targeting %s", +- value, action, victim); +- action = value; +- } +- } +- g_hash_table_insert(arg_list, strdup(STONITH_ATTR_ACTION_OP), +- strdup(action)); +- +- /* If this is a fencing operation against another node, add more standard +- * arguments. 
+- */ +- if (victim && device_args) { +- const char *param = NULL; +- +- /* Always pass the target's name, per +- * https://github.com/ClusterLabs/fence-agents/blob/master/doc/FenceAgentAPI.md +- */ +- g_hash_table_insert(arg_list, strdup("nodename"), strdup(victim)); +- +- // If the target's node ID was specified, pass it, too +- if (victim_nodeid) { +- char *nodeid = crm_strdup_printf("%" PRIu32, victim_nodeid); +- +- // cts-fencing looks for this log message +- crm_info("Passing '%s' as nodeid with fence action '%s' targeting %s", +- nodeid, action, victim); +- g_hash_table_insert(arg_list, strdup("nodeid"), nodeid); +- } +- +- // Check whether target must be specified in some other way +- param = g_hash_table_lookup(device_args, PCMK_STONITH_HOST_ARGUMENT); +- if (!pcmk__str_eq(agent, "fence_legacy", pcmk__str_none) +- && !pcmk__str_eq(param, "none", pcmk__str_casei)) { +- +- if (param == NULL) { +- /* Use the caller's default for pcmk_host_argument, or "port" if +- * none was given +- */ +- param = (host_arg == NULL)? "port" : host_arg; +- } +- value = g_hash_table_lookup(device_args, param); +- +- if (pcmk__str_eq(value, "dynamic", +- pcmk__str_casei|pcmk__str_null_matches)) { +- /* If the host argument was "dynamic" or not explicitly specified, +- * add it with the target +- */ +- const char *alias = NULL; +- +- if (port_map) { +- alias = g_hash_table_lookup(port_map, victim); +- } +- if (alias == NULL) { +- alias = victim; +- } +- crm_debug("Passing %s='%s' with fence action %s targeting %s", +- param, alias, action, victim); +- g_hash_table_insert(arg_list, strdup(param), strdup(alias)); +- } +- } +- } +- +- if (device_args) { +- g_hash_table_foreach(device_args, append_config_arg, arg_list); +- } +- +- return arg_list; +-} +- +-/*! 
+- * \internal +- * \brief Free all memory used by a stonith action +- * +- * \param[in,out] action Action to free +- */ +-void +-stonith__destroy_action(stonith_action_t *action) +-{ +- if (action) { +- free(action->agent); +- if (action->args) { +- g_hash_table_destroy(action->args); +- } +- free(action->action); +- free(action->victim); +- if (action->svc_action) { +- services_action_free(action->svc_action); +- } +- pcmk__reset_result(&(action->result)); +- free(action); +- } +-} +- +-/*! +- * \internal +- * \brief Get the result of an executed stonith action +- * +- * \param[in] action Executed action +- * +- * \return Pointer to action's result (or NULL if \p action is NULL) +- */ +-pcmk__action_result_t * +-stonith__action_result(stonith_action_t *action) +-{ +- return (action == NULL)? NULL : &(action->result); +-} +- +-#define FAILURE_MAX_RETRIES 2 +-stonith_action_t * +-stonith_action_create(const char *agent, +- const char *_action, +- const char *victim, +- uint32_t victim_nodeid, +- int timeout, GHashTable * device_args, +- GHashTable * port_map, const char *host_arg) +-{ +- stonith_action_t *action; +- +- action = calloc(1, sizeof(stonith_action_t)); +- action->args = make_args(agent, _action, victim, victim_nodeid, +- device_args, port_map, host_arg); +- crm_debug("Preparing '%s' action for %s using agent %s", +- _action, (victim? 
victim : "no target"), agent); +- action->agent = strdup(agent); +- action->action = strdup(_action); +- if (victim) { +- action->victim = strdup(victim); +- } +- action->timeout = action->remaining_timeout = timeout; +- action->max_retries = FAILURE_MAX_RETRIES; +- +- pcmk__set_result(&(action->result), PCMK_OCF_UNKNOWN, PCMK_EXEC_UNKNOWN, +- "Initialization bug in fencing library"); +- +- if (device_args) { +- char buffer[512]; +- const char *value = NULL; +- +- snprintf(buffer, sizeof(buffer), "pcmk_%s_retries", _action); +- value = g_hash_table_lookup(device_args, buffer); +- +- if (value) { +- action->max_retries = atoi(value); +- } +- } +- +- return action; +-} +- +-static gboolean +-update_remaining_timeout(stonith_action_t * action) +-{ +- int diff = time(NULL) - action->initial_start_time; +- +- if (action->tries >= action->max_retries) { +- crm_info("Attempted to execute agent %s (%s) the maximum number of times (%d) allowed", +- action->agent, action->action, action->max_retries); +- action->remaining_timeout = 0; +- } else if ((action->result.execution_status != PCMK_EXEC_TIMEOUT) +- && (diff < (action->timeout * 0.7))) { +- /* only set remaining timeout period if there is 30% +- * or greater of the original timeout period left */ +- action->remaining_timeout = action->timeout - diff; +- } else { +- action->remaining_timeout = 0; +- } +- return action->remaining_timeout ? TRUE : FALSE; +-} +- +-/*! 
+- * \internal +- * \brief Map a fencing action result to a standard return code +- * +- * \param[in] result Fencing action result to map +- * +- * \return Standard Pacemaker return code that best corresponds to \p result +- */ +-int +-stonith__result2rc(const pcmk__action_result_t *result) +-{ +- if (pcmk__result_ok(result)) { +- return pcmk_rc_ok; +- } +- +- switch (result->execution_status) { +- case PCMK_EXEC_PENDING: return EINPROGRESS; +- case PCMK_EXEC_CANCELLED: return ECANCELED; +- case PCMK_EXEC_TIMEOUT: return ETIME; +- case PCMK_EXEC_NOT_INSTALLED: return ENOENT; +- case PCMK_EXEC_NOT_SUPPORTED: return EOPNOTSUPP; +- case PCMK_EXEC_NOT_CONNECTED: return ENOTCONN; +- case PCMK_EXEC_NO_FENCE_DEVICE: return ENODEV; +- case PCMK_EXEC_NO_SECRETS: return EACCES; +- +- /* For the fencing API, PCMK_EXEC_INVALID is used with fencer API +- * operations that don't involve executing an agent (for example, +- * registering devices). This allows us to use the CRM_EX_* codes in the +- * exit status for finer-grained responses. +- */ +- case PCMK_EXEC_INVALID: +- switch (result->exit_status) { +- case CRM_EX_INSUFFICIENT_PRIV: return EACCES; +- case CRM_EX_PROTOCOL: return EPROTO; +- +- /* CRM_EX_EXPIRED is used for orphaned fencing operations left +- * over from a previous instance of the fencer. For API backward +- * compatibility, this is mapped to the previously used code for +- * this case, EHOSTUNREACH. 
+- */ +- case CRM_EX_EXPIRED: return EHOSTUNREACH; +- default: break; +- } +- +- default: +- break; +- } +- +- // Try to provide useful error code based on result's error output +- +- if (result->action_stderr == NULL) { +- return ENODATA; +- +- } else if (strcasestr(result->action_stderr, "timed out") +- || strcasestr(result->action_stderr, "timeout")) { +- return ETIME; +- +- } else if (strcasestr(result->action_stderr, "unrecognised action") +- || strcasestr(result->action_stderr, "unrecognized action") +- || strcasestr(result->action_stderr, "unsupported action")) { +- return EOPNOTSUPP; +- } +- +- // Oh well, we tried +- return pcmk_rc_error; +-} +- +-static void +-stonith_action_async_done(svc_action_t *svc_action) +-{ +- stonith_action_t *action = (stonith_action_t *) svc_action->cb_data; +- +- set_result_from_svc_action(action, svc_action); +- +- svc_action->params = NULL; +- +- crm_debug("Child process %d performing action '%s' exited with rc %d", +- action->pid, action->action, svc_action->rc); +- +- log_action(action, action->pid); +- +- if (!pcmk__result_ok(&(action->result)) +- && update_remaining_timeout(action)) { +- +- int rc = internal_stonith_action_execute(action); +- if (rc == pcmk_ok) { +- return; +- } +- } +- +- if (action->done_cb) { +- action->done_cb(action->pid, &(action->result), action->userdata); +- } +- +- action->svc_action = NULL; // don't remove our caller +- stonith__destroy_action(action); +-} +- +-static void +-stonith_action_async_forked(svc_action_t *svc_action) +-{ +- stonith_action_t *action = (stonith_action_t *) svc_action->cb_data; +- +- action->pid = svc_action->pid; +- action->svc_action = svc_action; +- +- if (action->fork_cb) { +- (action->fork_cb) (svc_action->pid, action->userdata); +- } +- +- crm_trace("Child process %d performing action '%s' successfully forked", +- action->pid, action->action); +-} +- +-static int +-internal_stonith_action_execute(stonith_action_t * action) +-{ +- int rc = -EPROTO; +- int is_retry 
= 0; +- svc_action_t *svc_action = NULL; +- static int stonith_sequence = 0; +- char *buffer = NULL; +- +- CRM_CHECK(action != NULL, return -EINVAL); +- +- if ((action->action == NULL) || (action->args == NULL) +- || (action->agent == NULL)) { +- pcmk__set_result(&(action->result), PCMK_OCF_UNKNOWN_ERROR, +- PCMK_EXEC_ERROR_FATAL, "Bug in fencing library"); +- return -EINVAL; +- } +- +- if (!action->tries) { +- action->initial_start_time = time(NULL); +- } +- action->tries++; +- +- if (action->tries > 1) { +- crm_info("Attempt %d to execute %s (%s). remaining timeout is %d", +- action->tries, action->agent, action->action, action->remaining_timeout); +- is_retry = 1; +- } +- +- buffer = crm_strdup_printf(PCMK__FENCE_BINDIR "/%s", +- basename(action->agent)); +- svc_action = services_action_create_generic(buffer, NULL); +- free(buffer); +- +- if (svc_action->rc != PCMK_OCF_UNKNOWN) { +- set_result_from_svc_action(action, svc_action); +- services_action_free(svc_action); +- return -E2BIG; +- } +- +- svc_action->timeout = 1000 * action->remaining_timeout; +- svc_action->standard = strdup(PCMK_RESOURCE_CLASS_STONITH); +- svc_action->id = crm_strdup_printf("%s_%s_%d", basename(action->agent), +- action->action, action->tries); +- svc_action->agent = strdup(action->agent); +- svc_action->sequence = stonith_sequence++; +- svc_action->params = action->args; +- svc_action->cb_data = (void *) action; +- svc_action->flags = pcmk__set_flags_as(__func__, __LINE__, +- LOG_TRACE, "Action", +- svc_action->id, svc_action->flags, +- SVC_ACTION_NON_BLOCKED, +- "SVC_ACTION_NON_BLOCKED"); +- +- /* keep retries from executing out of control and free previous results */ +- if (is_retry) { +- pcmk__reset_result(&(action->result)); +- sleep(1); +- } +- +- if (action->async) { +- /* async */ +- if (services_action_async_fork_notify(svc_action, +- &stonith_action_async_done, +- &stonith_action_async_forked)) { +- pcmk__set_result(&(action->result), PCMK_OCF_UNKNOWN, +- PCMK_EXEC_PENDING, 
NULL); +- return pcmk_ok; +- } +- +- } else if (services_action_sync(svc_action)) { // sync success +- rc = pcmk_ok; +- +- } else { // sync failure +- rc = -ECONNABORTED; +- } +- +- set_result_from_svc_action(action, svc_action); +- svc_action->params = NULL; +- services_action_free(svc_action); +- return rc; +-} +- +-/*! +- * \internal +- * \brief Kick off execution of an async stonith action +- * +- * \param[in,out] action Action to be executed +- * \param[in,out] userdata Datapointer to be passed to callbacks +- * \param[in] done Callback to notify action has failed/succeeded +- * \param[in] fork_callback Callback to notify successful fork of child +- * +- * \return pcmk_ok if ownership of action has been taken, -errno otherwise +- */ +-int +-stonith_action_execute_async(stonith_action_t * action, +- void *userdata, +- void (*done) (int pid, +- const pcmk__action_result_t *result, +- void *user_data), +- void (*fork_cb) (int pid, void *user_data)) +-{ +- if (!action) { +- return -EINVAL; +- } +- +- action->userdata = userdata; +- action->done_cb = done; +- action->fork_cb = fork_cb; +- action->async = 1; +- +- return internal_stonith_action_execute(action); +-} +- +-/*! 
+- * \internal +- * \brief Execute a stonith action +- * +- * \param[in,out] action Action to execute +- * +- * \return pcmk_ok on success, -errno otherwise +- */ +-int +-stonith__execute(stonith_action_t *action) +-{ +- int rc = pcmk_ok; +- +- CRM_CHECK(action != NULL, return -EINVAL); +- +- // Keep trying until success, max retries, or timeout +- do { +- rc = internal_stonith_action_execute(action); +- } while ((rc != pcmk_ok) && update_remaining_timeout(action)); +- +- return rc; +-} +- + static int + stonith_api_device_list(stonith_t * stonith, int call_options, const char *namespace, + stonith_key_value_t ** devices, int timeout) +-- +2.27.0 + + +From 883a3cf7d3f73d02417d3997a7885dd5a7bebac7 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Wed, 10 Nov 2021 15:39:17 -0600 +Subject: [PATCH 05/13] Low: fencing,executor: improve mapping of legacy return + code to execution status + +Move stonith_rc2status() from the executor to the fencing library for future +reuse, exposing it internally as stonith__legacy2status(). Update it to use +recently added execution status codes. +--- + daemons/execd/execd_commands.c | 66 ++++++++-------------------------- + include/crm/fencing/internal.h | 2 ++ + lib/fencing/st_actions.c | 36 +++++++++++++++++++ + 3 files changed, 52 insertions(+), 52 deletions(-) + +diff --git a/daemons/execd/execd_commands.c b/daemons/execd/execd_commands.c +index 02070bf11..0ccaa1ced 100644 +--- a/daemons/execd/execd_commands.c ++++ b/daemons/execd/execd_commands.c +@@ -21,6 +21,7 @@ + #include + + #include ++#include + #include + #include + #include +@@ -999,56 +1000,6 @@ action_complete(svc_action_t * action) + cmd_finalize(cmd, rsc); + } + +-/*! 
+- * \internal +- * \brief Determine operation status of a stonith operation +- * +- * Non-stonith resource operations get their operation status directly from the +- * service library, but the fencer does not have an equivalent, so we must infer +- * an operation status from the fencer API's return code. +- * +- * \param[in] action Name of action performed on stonith resource +- * \param[in] interval_ms Action interval +- * \param[in] rc Action result from fencer +- * +- * \return Operation status corresponding to fencer API return code +- */ +-static int +-stonith_rc2status(const char *action, guint interval_ms, int rc) +-{ +- int status = PCMK_EXEC_DONE; +- +- switch (rc) { +- case pcmk_ok: +- break; +- +- case -EOPNOTSUPP: +- case -EPROTONOSUPPORT: +- status = PCMK_EXEC_NOT_SUPPORTED; +- break; +- +- case -ETIME: +- case -ETIMEDOUT: +- status = PCMK_EXEC_TIMEOUT; +- break; +- +- case -ENOTCONN: +- case -ECOMM: +- // Couldn't talk to fencer +- status = PCMK_EXEC_ERROR; +- break; +- +- case -ENODEV: +- // The device is not registered with the fencer +- status = PCMK_EXEC_ERROR; +- break; +- +- default: +- break; +- } +- return status; +-} +- + static void + stonith_action_complete(lrmd_cmd_t * cmd, int rc) + { +@@ -1062,8 +1013,19 @@ stonith_action_complete(lrmd_cmd_t * cmd, int rc) + * the fencer return code. 
+ */ + if (cmd->result.execution_status != PCMK_EXEC_CANCELLED) { +- cmd->result.execution_status = stonith_rc2status(cmd->action, +- cmd->interval_ms, rc); ++ cmd->result.execution_status = stonith__legacy2status(rc); ++ ++ // Simplify status codes from fencer ++ switch (cmd->result.execution_status) { ++ case PCMK_EXEC_NOT_CONNECTED: ++ case PCMK_EXEC_INVALID: ++ case PCMK_EXEC_NO_FENCE_DEVICE: ++ case PCMK_EXEC_NO_SECRETS: ++ cmd->result.execution_status = PCMK_EXEC_ERROR; ++ break; ++ default: ++ break; ++ } + + // Certain successful actions change the known state of the resource + if ((rsc != NULL) && pcmk__result_ok(&(cmd->result))) { +diff --git a/include/crm/fencing/internal.h b/include/crm/fencing/internal.h +index 6a7e4232c..80f6443be 100644 +--- a/include/crm/fencing/internal.h ++++ b/include/crm/fencing/internal.h +@@ -182,6 +182,8 @@ bool stonith__event_state_pending(stonith_history_t *history, void *user_data); + bool stonith__event_state_eq(stonith_history_t *history, void *user_data); + bool stonith__event_state_neq(stonith_history_t *history, void *user_data); + ++int stonith__legacy2status(int rc); ++ + /*! + * \internal + * \brief Is a fencing operation in pending state? +diff --git a/lib/fencing/st_actions.c b/lib/fencing/st_actions.c +index 64d3afd5d..9e785595a 100644 +--- a/lib/fencing/st_actions.c ++++ b/lib/fencing/st_actions.c +@@ -360,6 +360,42 @@ stonith__result2rc(const pcmk__action_result_t *result) + return pcmk_rc_error; + } + ++/*! ++ * \internal ++ * \brief Determine execution status equivalent of legacy fencer return code ++ * ++ * Fence action notifications, and fence action callbacks from older fencers ++ * (<=2.1.2) in a rolling upgrade, will have only a legacy return code. Map this ++ * to an execution status as best as possible (essentially, the inverse of ++ * stonith__result2rc()). 
++ * ++ * \param[in] rc Legacy return code from fencer ++ * ++ * \return Execution status best corresponding to \p rc ++ */ ++int ++stonith__legacy2status(int rc) ++{ ++ if (rc >= 0) { ++ return PCMK_EXEC_DONE; ++ } ++ switch (-rc) { ++ case EACCES: return PCMK_EXEC_NO_SECRETS; ++ case ECANCELED: return PCMK_EXEC_CANCELLED; ++ case EHOSTUNREACH: return PCMK_EXEC_INVALID; ++ case EINPROGRESS: return PCMK_EXEC_PENDING; ++ case ENODEV: return PCMK_EXEC_NO_FENCE_DEVICE; ++ case ENOENT: return PCMK_EXEC_NOT_INSTALLED; ++ case ENOTCONN: return PCMK_EXEC_NOT_CONNECTED; ++ case EOPNOTSUPP: return PCMK_EXEC_NOT_SUPPORTED; ++ case EPROTO: return PCMK_EXEC_INVALID; ++ case EPROTONOSUPPORT: return PCMK_EXEC_NOT_SUPPORTED; ++ case ETIME: return PCMK_EXEC_TIMEOUT; ++ case ETIMEDOUT: return PCMK_EXEC_TIMEOUT; ++ default: return PCMK_EXEC_ERROR; ++ } ++} ++ + static void + stonith_action_async_done(svc_action_t *svc_action) + { +-- +2.27.0 + + +From 639a9f4a2cbeb6cc41b754a1dcb1f360a9500e03 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Thu, 11 Nov 2021 16:54:32 -0600 +Subject: [PATCH 06/13] Refactor: fencing: add functions for getting/setting + result via XML + +These will come in handy as we update the various fencer messages to include a +full result rather than just a legacy return code. The functions are in a new +source file fenced_messages.c which can have other stuff moved to it later. 
+--- + include/crm/fencing/internal.h | 3 + + lib/fencing/st_actions.c | 107 +++++++++++++++++++++++++++++++++ + 2 files changed, 110 insertions(+) + +diff --git a/include/crm/fencing/internal.h b/include/crm/fencing/internal.h +index 80f6443be..4b5fd3959 100644 +--- a/include/crm/fencing/internal.h ++++ b/include/crm/fencing/internal.h +@@ -60,6 +60,9 @@ stonith_action_t *stonith_action_create(const char *agent, + void stonith__destroy_action(stonith_action_t *action); + pcmk__action_result_t *stonith__action_result(stonith_action_t *action); + int stonith__result2rc(const pcmk__action_result_t *result); ++void stonith__xe_set_result(xmlNode *xml, const pcmk__action_result_t *result); ++void stonith__xe_get_result(xmlNode *xml, pcmk__action_result_t *result); ++xmlNode *stonith__find_xe_with_result(xmlNode *xml); + + int + stonith_action_execute_async(stonith_action_t * action, +diff --git a/lib/fencing/st_actions.c b/lib/fencing/st_actions.c +index 9e785595a..d4fc3f5ed 100644 +--- a/lib/fencing/st_actions.c ++++ b/lib/fencing/st_actions.c +@@ -396,6 +396,113 @@ stonith__legacy2status(int rc) + } + } + ++/*! 
++ * \internal ++ * \brief Add a fencing result to an XML element as attributes ++ * ++ * \param[in] xml XML element to add result to ++ * \param[in] result Fencing result to add (assume success if NULL) ++ */ ++void ++stonith__xe_set_result(xmlNode *xml, const pcmk__action_result_t *result) ++{ ++ int exit_status = CRM_EX_OK; ++ enum pcmk_exec_status execution_status = PCMK_EXEC_DONE; ++ const char *exit_reason = NULL; ++ const char *action_stdout = NULL; ++ int rc = pcmk_ok; ++ ++ CRM_CHECK(xml != NULL, return); ++ ++ if (result != NULL) { ++ exit_status = result->exit_status; ++ execution_status = result->execution_status; ++ exit_reason = result->exit_reason; ++ action_stdout = result->action_stdout; ++ rc = pcmk_rc2legacy(stonith__result2rc(result)); ++ } ++ ++ crm_xml_add_int(xml, XML_LRM_ATTR_OPSTATUS, (int) execution_status); ++ crm_xml_add_int(xml, XML_LRM_ATTR_RC, exit_status); ++ crm_xml_add(xml, XML_LRM_ATTR_EXIT_REASON, exit_reason); ++ crm_xml_add(xml, "st_output", action_stdout); ++ ++ /* @COMPAT Peers in rolling upgrades, Pacemaker Remote nodes, and external ++ * code that use libstonithd <=2.1.2 don't check for the full result, and ++ * need a legacy return code instead. ++ */ ++ crm_xml_add_int(xml, F_STONITH_RC, rc); ++} ++ ++/*! ++ * \internal ++ * \brief Find a fencing result beneath an XML element ++ * ++ * \param[in] xml XML element to search ++ * ++ * \return \p xml or descendent of it that contains a fencing result, else NULL ++ */ ++xmlNode * ++stonith__find_xe_with_result(xmlNode *xml) ++{ ++ xmlNode *match = get_xpath_object("//@" XML_LRM_ATTR_RC, xml, LOG_NEVER); ++ ++ if (match == NULL) { ++ /* @COMPAT Peers <=2.1.2 in a rolling upgrade provide only a legacy ++ * return code, not a full result, so check for that. ++ */ ++ match = get_xpath_object("//@" F_STONITH_RC, xml, LOG_ERR); ++ } ++ return match; ++} ++ ++/*! 
++ * \internal ++ * \brief Get a fencing result from an XML element's attributes ++ * ++ * \param[in] xml XML element with fencing result ++ * \param[out] result Where to store fencing result ++ */ ++void ++stonith__xe_get_result(xmlNode *xml, pcmk__action_result_t *result) ++{ ++ int exit_status = CRM_EX_OK; ++ int execution_status = PCMK_EXEC_DONE; ++ const char *exit_reason = NULL; ++ char *action_stdout = NULL; ++ ++ CRM_CHECK((xml != NULL) && (result != NULL), return); ++ ++ exit_reason = crm_element_value(xml, XML_LRM_ATTR_EXIT_REASON); ++ action_stdout = crm_element_value_copy(xml, "st_output"); ++ ++ // A result must include an exit status and execution status ++ if ((crm_element_value_int(xml, XML_LRM_ATTR_RC, &exit_status) < 0) ++ || (crm_element_value_int(xml, XML_LRM_ATTR_OPSTATUS, ++ &execution_status) < 0)) { ++ int rc = pcmk_ok; ++ exit_status = CRM_EX_ERROR; ++ ++ /* @COMPAT Peers <=2.1.2 in rolling upgrades provide only a legacy ++ * return code, not a full result, so check for that. ++ */ ++ if (crm_element_value_int(xml, F_STONITH_RC, &rc) == 0) { ++ if ((rc == pcmk_ok) || (rc == -EINPROGRESS)) { ++ exit_status = CRM_EX_OK; ++ } ++ execution_status = stonith__legacy2status(rc); ++ exit_reason = pcmk_strerror(rc); ++ ++ } else { ++ execution_status = PCMK_EXEC_ERROR; ++ exit_reason = "Fencer reply contained neither a full result " ++ "nor a legacy return code (bug?)"; ++ } ++ } ++ pcmk__set_result(result, exit_status, execution_status, exit_reason); ++ pcmk__set_result_output(result, action_stdout, NULL); ++} ++ + static void + stonith_action_async_done(svc_action_t *svc_action) + { +-- +2.27.0 + + +From 1f0121c6ad0d0235bcf01c8b60f9153592b3db83 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Thu, 11 Nov 2021 10:10:53 -0600 +Subject: [PATCH 07/13] Refactor: fencing: rename functions for invoking fence + callbacks + +... 
to make it clearer what the difference between them is +--- + lib/fencing/st_client.c | 44 +++++++++++++++++++++++++++++++++-------- + 1 file changed, 36 insertions(+), 8 deletions(-) + +diff --git a/lib/fencing/st_client.c b/lib/fencing/st_client.c +index 944cd1863..dfc5860fc 100644 +--- a/lib/fencing/st_client.c ++++ b/lib/fencing/st_client.c +@@ -847,9 +847,21 @@ stonith_api_del_callback(stonith_t * stonith, int call_id, bool all_callbacks) + return pcmk_ok; + } + ++/*! ++ * \internal ++ * \brief Invoke a (single) specified fence action callback ++ * ++ * \param[in] st Fencer API connection ++ * \param[in] call_id If positive, call ID of completed fence action, otherwise ++ * legacy return code for early action failure ++ * \param[in] rc Legacy return code for action result ++ * \param[in] userdata User data to pass to callback ++ * \param[in] callback Fence action callback to invoke ++ */ + static void +-invoke_callback(stonith_t * st, int call_id, int rc, void *userdata, +- void (*callback) (stonith_t * st, stonith_callback_data_t * data)) ++invoke_fence_action_callback(stonith_t *st, int call_id, int rc, void *userdata, ++ void (*callback) (stonith_t *st, ++ stonith_callback_data_t *data)) + { + stonith_callback_data_t data = { 0, }; + +@@ -860,8 +872,21 @@ invoke_callback(stonith_t * st, int call_id, int rc, void *userdata, + callback(st, &data); + } + ++/*! ++ * \internal ++ * \brief Invoke any callbacks registered for a specified fence action result ++ * ++ * Given a fence action result from the fencer, invoke any callback registered ++ * for that action, as well as any global callback registered. 
++ * ++ * \param[in] st Fencer API connection ++ * \param[in] msg If non-NULL, fencer reply ++ * \param[in] call_id If \p msg is NULL, call ID of action that timed out ++ * \param[in] rc Legacy return code for result of action ++ */ + static void +-stonith_perform_callback(stonith_t * stonith, xmlNode * msg, int call_id, int rc) ++invoke_registered_callbacks(stonith_t *stonith, xmlNode *msg, int call_id, ++ int rc) + { + stonith_private_t *private = NULL; + stonith_callback_client_t *blob = NULL; +@@ -899,7 +924,8 @@ stonith_perform_callback(stonith_t * stonith, xmlNode * msg, int call_id, int rc + + if (local_blob.callback != NULL && (rc == pcmk_ok || local_blob.only_success == FALSE)) { + crm_trace("Invoking callback %s for call %d", crm_str(local_blob.id), call_id); +- invoke_callback(stonith, call_id, rc, local_blob.user_data, local_blob.callback); ++ invoke_fence_action_callback(stonith, call_id, rc, local_blob.user_data, ++ local_blob.callback); + + } else if (private->op_callback == NULL && rc != pcmk_ok) { + crm_warn("Fencing command failed: %s", pcmk_strerror(rc)); +@@ -908,7 +934,8 @@ stonith_perform_callback(stonith_t * stonith, xmlNode * msg, int call_id, int rc + + if (private->op_callback != NULL) { + crm_trace("Invoking global callback for call %d", call_id); +- invoke_callback(stonith, call_id, rc, NULL, private->op_callback); ++ invoke_fence_action_callback(stonith, call_id, rc, NULL, ++ private->op_callback); + } + crm_trace("OP callback activated."); + } +@@ -919,7 +946,7 @@ stonith_async_timeout_handler(gpointer data) + struct timer_rec_s *timer = data; + + crm_err("Async call %d timed out after %dms", timer->call_id, timer->timeout); +- stonith_perform_callback(timer->stonith, NULL, timer->call_id, -ETIME); ++ invoke_registered_callbacks(timer->stonith, NULL, timer->call_id, -ETIME); + + /* Always return TRUE, never remove the handler + * We do that in stonith_del_callback() +@@ -994,7 +1021,7 @@ stonith_dispatch_internal(const char *buffer, 
ssize_t length, gpointer userdata) + crm_trace("Activating %s callbacks...", type); + + if (pcmk__str_eq(type, T_STONITH_NG, pcmk__str_casei)) { +- stonith_perform_callback(st, blob.xml, 0, 0); ++ invoke_registered_callbacks(st, blob.xml, 0, 0); + + } else if (pcmk__str_eq(type, T_STONITH_NOTIFY, pcmk__str_casei)) { + foreach_notify_entry(private, stonith_send_notification, &blob); +@@ -1229,7 +1256,8 @@ stonith_api_add_callback(stonith_t * stonith, int call_id, int timeout, int opti + } else if (call_id < 0) { + if (!(options & st_opt_report_only_success)) { + crm_trace("Call failed, calling %s: %s", callback_name, pcmk_strerror(call_id)); +- invoke_callback(stonith, call_id, call_id, user_data, callback); ++ invoke_fence_action_callback(stonith, call_id, call_id, user_data, ++ callback); + } else { + crm_warn("Fencer call failed: %s", pcmk_strerror(call_id)); + } +-- +2.27.0 + + +From c32f11e70a88244f5a3217608055a4eaf8d28231 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Thu, 11 Nov 2021 10:21:00 -0600 +Subject: [PATCH 08/13] Refactor: fencing: drop unnecessary argument when + invoking callbacks + +Refactor invoke_registered_callbacks() to treat a NULL message as a timeout, so +we can drop the rc argument. 
+--- + lib/fencing/st_client.c | 17 +++++++++++------ + 1 file changed, 11 insertions(+), 6 deletions(-) + +diff --git a/lib/fencing/st_client.c b/lib/fencing/st_client.c +index dfc5860fc..9f2b0c1c1 100644 +--- a/lib/fencing/st_client.c ++++ b/lib/fencing/st_client.c +@@ -882,15 +882,14 @@ invoke_fence_action_callback(stonith_t *st, int call_id, int rc, void *userdata, + * \param[in] st Fencer API connection + * \param[in] msg If non-NULL, fencer reply + * \param[in] call_id If \p msg is NULL, call ID of action that timed out +- * \param[in] rc Legacy return code for result of action + */ + static void +-invoke_registered_callbacks(stonith_t *stonith, xmlNode *msg, int call_id, +- int rc) ++invoke_registered_callbacks(stonith_t *stonith, xmlNode *msg, int call_id) + { + stonith_private_t *private = NULL; + stonith_callback_client_t *blob = NULL; + stonith_callback_client_t local_blob; ++ int rc = pcmk_ok; + + CRM_CHECK(stonith != NULL, return); + CRM_CHECK(stonith->st_private != NULL, return); +@@ -902,7 +901,13 @@ invoke_registered_callbacks(stonith_t *stonith, xmlNode *msg, int call_id, + local_blob.user_data = NULL; + local_blob.only_success = FALSE; + +- if (msg != NULL) { ++ if (msg == NULL) { ++ // Fencer didn't reply in time ++ rc = -ETIME; ++ ++ } else { ++ // We have the fencer reply ++ + crm_element_value_int(msg, F_STONITH_RC, &rc); + crm_element_value_int(msg, F_STONITH_CALLID, &call_id); + } +@@ -946,7 +951,7 @@ stonith_async_timeout_handler(gpointer data) + struct timer_rec_s *timer = data; + + crm_err("Async call %d timed out after %dms", timer->call_id, timer->timeout); +- invoke_registered_callbacks(timer->stonith, NULL, timer->call_id, -ETIME); ++ invoke_registered_callbacks(timer->stonith, NULL, timer->call_id); + + /* Always return TRUE, never remove the handler + * We do that in stonith_del_callback() +@@ -1021,7 +1026,7 @@ stonith_dispatch_internal(const char *buffer, ssize_t length, gpointer userdata) + crm_trace("Activating %s callbacks...", 
type); + + if (pcmk__str_eq(type, T_STONITH_NG, pcmk__str_casei)) { +- invoke_registered_callbacks(st, blob.xml, 0, 0); ++ invoke_registered_callbacks(st, blob.xml, 0); + + } else if (pcmk__str_eq(type, T_STONITH_NOTIFY, pcmk__str_casei)) { + foreach_notify_entry(private, stonith_send_notification, &blob); +-- +2.27.0 + + +From 5d8279b51ea9df738354649e4065663f2c16f1e6 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Thu, 11 Nov 2021 10:21:57 -0600 +Subject: [PATCH 09/13] Log: fencing: improve message for callback errors + +Improve checking of fencer replies, which also allows us to distinguish an +internal bug from a bad fencer reply in logs. Lower the bad reply message to +warning. +--- + lib/fencing/st_client.c | 13 +++++++++---- + 1 file changed, 9 insertions(+), 4 deletions(-) + +diff --git a/lib/fencing/st_client.c b/lib/fencing/st_client.c +index 9f2b0c1c1..170b9d450 100644 +--- a/lib/fencing/st_client.c ++++ b/lib/fencing/st_client.c +@@ -904,15 +904,20 @@ invoke_registered_callbacks(stonith_t *stonith, xmlNode *msg, int call_id) + if (msg == NULL) { + // Fencer didn't reply in time + rc = -ETIME; ++ CRM_LOG_ASSERT(call_id > 0); + + } else { + // We have the fencer reply + +- crm_element_value_int(msg, F_STONITH_RC, &rc); +- crm_element_value_int(msg, F_STONITH_CALLID, &call_id); +- } ++ if (crm_element_value_int(msg, F_STONITH_RC, &rc) != 0) { ++ rc = -pcmk_err_generic; ++ } + +- CRM_CHECK(call_id > 0, crm_log_xml_err(msg, "Bad result")); ++ if ((crm_element_value_int(msg, F_STONITH_CALLID, &call_id) != 0) ++ || (call_id <= 0)) { ++ crm_log_xml_warn(msg, "Bad fencer reply"); ++ } ++ } + + blob = pcmk__intkey_table_lookup(private->stonith_op_callback_table, + call_id); +-- +2.27.0 + + +From e03c14d24e8cb011e870b9460930d139705bf0a2 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Tue, 9 Nov 2021 14:59:12 -0600 +Subject: [PATCH 10/13] Doc: fencing: correct stonith_api_operations_t method + descriptions + +Many of the methods return a positive call ID on 
success +--- + include/crm/stonith-ng.h | 60 ++++++++++++++++++++++------------------ + 1 file changed, 33 insertions(+), 27 deletions(-) + +diff --git a/include/crm/stonith-ng.h b/include/crm/stonith-ng.h +index 8d6ad477d..9643820e9 100644 +--- a/include/crm/stonith-ng.h ++++ b/include/crm/stonith-ng.h +@@ -164,39 +164,38 @@ typedef struct stonith_api_operations_s + int (*disconnect)(stonith_t *st); + + /*! +- * \brief Remove a registered stonith device with the local stonith daemon. ++ * \brief Unregister a fence device with the local fencer + * +- * \note Synchronous, guaranteed to occur in daemon before function returns. +- * +- * \return Legacy Pacemaker return code ++ * \return pcmk_ok (if synchronous) or positive call ID (if asynchronous) ++ * on success, otherwise a negative legacy Pacemaker return code + */ + int (*remove_device)( + stonith_t *st, int options, const char *name); + + /*! +- * \brief Register a stonith device with the local stonith daemon. ++ * \brief Register a fence device with the local fencer + * +- * \note Synchronous, guaranteed to occur in daemon before function returns. +- * +- * \return Legacy Pacemaker return code ++ * \return pcmk_ok (if synchronous) or positive call ID (if asynchronous) ++ * on success, otherwise a negative legacy Pacemaker return code + */ + int (*register_device)( + stonith_t *st, int options, const char *id, + const char *provider, const char *agent, stonith_key_value_t *params); + + /*! +- * \brief Remove a fencing level for a specific node. ++ * \brief Unregister a fencing level for specified node with local fencer + * +- * \return Legacy Pacemaker return code ++ * \return pcmk_ok (if synchronous) or positive call ID (if asynchronous) ++ * on success, otherwise a negative legacy Pacemaker return code + */ + int (*remove_level)( + stonith_t *st, int options, const char *node, int level); + + /*! 
+- * \brief Register a fencing level containing the fencing devices to be used +- * at that level for a specific node. ++ * \brief Register a fencing level for specified node with local fencer + * +- * \return Legacy Pacemaker return code ++ * \return pcmk_ok (if synchronous) or positive call ID (if asynchronous) ++ * on success, otherwise a negative legacy Pacemaker return code + */ + int (*register_level)( + stonith_t *st, int options, const char *node, int level, stonith_key_value_t *device_list); +@@ -226,21 +225,24 @@ typedef struct stonith_api_operations_s + /*! + * \brief Retrieve string listing hosts and port assignments from a local stonith device. + * +- * \return Legacy Pacemaker return code ++ * \return pcmk_ok (if synchronous) or positive call ID (if asynchronous) ++ * on success, otherwise a negative legacy Pacemaker return code + */ + int (*list)(stonith_t *st, int options, const char *id, char **list_output, int timeout); + + /*! + * \brief Check to see if a local stonith device is reachable + * +- * \return Legacy Pacemaker return code ++ * \return pcmk_ok (if synchronous) or positive call ID (if asynchronous) ++ * on success, otherwise a negative legacy Pacemaker return code + */ + int (*monitor)(stonith_t *st, int options, const char *id, int timeout); + + /*! + * \brief Check to see if a local stonith device's port is reachable + * +- * \return Legacy Pacemaker return code ++ * \return pcmk_ok (if synchronous) or positive call ID (if asynchronous) ++ * on success, otherwise a negative legacy Pacemaker return code + */ + int (*status)(stonith_t *st, int options, const char *id, const char *port, int timeout); + +@@ -267,7 +269,8 @@ typedef struct stonith_api_operations_s + * \param timeout, The default per device timeout to use with each device + * capable of fencing the target. 
+ * +- * \return Legacy Pacemaker return code ++ * \return pcmk_ok (if synchronous) or positive call ID (if asynchronous) ++ * on success, otherwise a negative legacy Pacemaker return code + */ + int (*fence)(stonith_t *st, int options, const char *node, const char *action, + int timeout, int tolerance); +@@ -275,7 +278,8 @@ typedef struct stonith_api_operations_s + /*! + * \brief Manually confirm that a node is down. + * +- * \return Legacy Pacemaker return code ++ * \return pcmk_ok (if synchronous) or positive call ID (if asynchronous) ++ * on success, otherwise a negative legacy Pacemaker return code + */ + int (*confirm)(stonith_t *st, int options, const char *node); + +@@ -304,9 +308,6 @@ typedef struct stonith_api_operations_s + * \param[in] callback The callback function to register + * + * \return \c TRUE on success, \c FALSE if call_id is negative, -errno otherwise +- * +- * \todo This function should return \c pcmk_ok on success, and \c call_id +- * when negative, but that would break backward compatibility. + */ + int (*register_callback)(stonith_t *st, + int call_id, +@@ -317,12 +318,14 @@ typedef struct stonith_api_operations_s + void (*callback)(stonith_t *st, stonith_callback_data_t *data)); + + /*! +- * \brief Remove a registered callback for a given call id. ++ * \brief Remove a registered callback for a given call id ++ * ++ * \return pcmk_ok + */ + int (*remove_callback)(stonith_t *st, int call_id, bool all_callbacks); + + /*! 
+- * \brief Remove fencing level for specific node, node regex or attribute ++ * \brief Unregister fencing level for specified node, pattern or attribute + * + * \param[in] st Fencer connection to use + * \param[in] options Bitmask of stonith_call_options to pass to the fencer +@@ -332,7 +335,8 @@ typedef struct stonith_api_operations_s + * \param[in] value If not NULL, target by this node attribute value + * \param[in] level Index number of level to remove + * +- * \return 0 on success, negative error code otherwise ++ * \return pcmk_ok (if synchronous) or positive call ID (if asynchronous) ++ * on success, otherwise a negative legacy Pacemaker return code + * + * \note The caller should set only one of node, pattern or attr/value. + */ +@@ -341,7 +345,7 @@ typedef struct stonith_api_operations_s + const char *attr, const char *value, int level); + + /*! +- * \brief Register fencing level for specific node, node regex or attribute ++ * \brief Register fencing level for specified node, pattern or attribute + * + * \param[in] st Fencer connection to use + * \param[in] options Bitmask of stonith_call_options to pass to fencer +@@ -352,7 +356,8 @@ typedef struct stonith_api_operations_s + * \param[in] level Index number of level to add + * \param[in] device_list Devices to use in level + * +- * \return 0 on success, negative error code otherwise ++ * \return pcmk_ok (if synchronous) or positive call ID (if asynchronous) ++ * on success, otherwise a negative legacy Pacemaker return code + * + * \note The caller should set only one of node, pattern or attr/value. + */ +@@ -398,7 +403,8 @@ typedef struct stonith_api_operations_s + * \param delay, Apply a fencing delay. 
Value -1 means disable also any + * static/random fencing delays from pcmk_delay_base/max + * +- * \return Legacy Pacemaker return code ++ * \return pcmk_ok (if synchronous) or positive call ID (if asynchronous) ++ * on success, otherwise a negative legacy Pacemaker return code + */ + int (*fence_with_delay)(stonith_t *st, int options, const char *node, const char *action, + int timeout, int tolerance, int delay); +-- +2.27.0 + + +From 18c382731889b626b21ba6a14f9213ef1e45a524 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Tue, 23 Nov 2021 11:14:24 -0600 +Subject: [PATCH 11/13] Refactor: fencing: define constant for XML attribute + for action output + +--- + daemons/fenced/fenced_commands.c | 4 ++-- + include/crm/fencing/internal.h | 1 + + lib/fencing/st_actions.c | 4 ++-- + lib/fencing/st_client.c | 2 +- + 4 files changed, 6 insertions(+), 5 deletions(-) + +diff --git a/daemons/fenced/fenced_commands.c b/daemons/fenced/fenced_commands.c +index 26501a4b3..aa14c52af 100644 +--- a/daemons/fenced/fenced_commands.c ++++ b/daemons/fenced/fenced_commands.c +@@ -2677,7 +2677,7 @@ stonith_construct_reply(xmlNode * request, const char *output, xmlNode * data, i + + crm_xml_add(reply, "st_origin", __func__); + crm_xml_add(reply, F_TYPE, T_STONITH_NG); +- crm_xml_add(reply, "st_output", output); ++ crm_xml_add(reply, F_STONITH_OUTPUT, output); + crm_xml_add_int(reply, F_STONITH_RC, rc); + + if (request == NULL) { +@@ -2743,7 +2743,7 @@ construct_async_reply(async_command_t *cmd, const pcmk__action_result_t *result) + crm_xml_add_int(reply, F_STONITH_CALLOPTS, cmd->options); + crm_xml_add_int(reply, F_STONITH_RC, + pcmk_rc2legacy(stonith__result2rc(result))); +- crm_xml_add(reply, "st_output", result->action_stdout); ++ crm_xml_add(reply, F_STONITH_OUTPUT, result->action_stdout); + return reply; + } + +diff --git a/include/crm/fencing/internal.h b/include/crm/fencing/internal.h +index 4b5fd3959..f0d294a0b 100644 +--- a/include/crm/fencing/internal.h ++++ 
b/include/crm/fencing/internal.h +@@ -105,6 +105,7 @@ void stonith__device_parameter_flags(uint32_t *device_flags, + # define F_STONITH_REMOTE_OP_ID "st_remote_op" + # define F_STONITH_REMOTE_OP_ID_RELAY "st_remote_op_relay" + # define F_STONITH_RC "st_rc" ++# define F_STONITH_OUTPUT "st_output" + /*! Timeout period per a device execution */ + # define F_STONITH_TIMEOUT "st_timeout" + # define F_STONITH_TOLERANCE "st_tolerance" +diff --git a/lib/fencing/st_actions.c b/lib/fencing/st_actions.c +index d4fc3f5ed..5636810a5 100644 +--- a/lib/fencing/st_actions.c ++++ b/lib/fencing/st_actions.c +@@ -425,7 +425,7 @@ stonith__xe_set_result(xmlNode *xml, const pcmk__action_result_t *result) + crm_xml_add_int(xml, XML_LRM_ATTR_OPSTATUS, (int) execution_status); + crm_xml_add_int(xml, XML_LRM_ATTR_RC, exit_status); + crm_xml_add(xml, XML_LRM_ATTR_EXIT_REASON, exit_reason); +- crm_xml_add(xml, "st_output", action_stdout); ++ crm_xml_add(xml, F_STONITH_OUTPUT, action_stdout); + + /* @COMPAT Peers in rolling upgrades, Pacemaker Remote nodes, and external + * code that use libstonithd <=2.1.2 don't check for the full result, and +@@ -474,7 +474,7 @@ stonith__xe_get_result(xmlNode *xml, pcmk__action_result_t *result) + CRM_CHECK((xml != NULL) && (result != NULL), return); + + exit_reason = crm_element_value(xml, XML_LRM_ATTR_EXIT_REASON); +- action_stdout = crm_element_value_copy(xml, "st_output"); ++ action_stdout = crm_element_value_copy(xml, F_STONITH_OUTPUT); + + // A result must include an exit status and execution status + if ((crm_element_value_int(xml, XML_LRM_ATTR_RC, &exit_status) < 0) +diff --git a/lib/fencing/st_client.c b/lib/fencing/st_client.c +index 170b9d450..2dfadf922 100644 +--- a/lib/fencing/st_client.c ++++ b/lib/fencing/st_client.c +@@ -600,7 +600,7 @@ stonith_api_list(stonith_t * stonith, int call_options, const char *id, char **l + if (output && list_info) { + const char *list_str; + +- list_str = crm_element_value(output, "st_output"); ++ list_str = 
crm_element_value(output, F_STONITH_OUTPUT); + + if (list_str) { + *list_info = strdup(list_str); +-- +2.27.0 + + +From 9fe9ed5d46c810cb9c12eb07271373ab92d271cd Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Tue, 23 Nov 2021 11:39:32 -0600 +Subject: [PATCH 12/13] Refactor: fencing: simplify invoking callbacks + +--- + lib/fencing/st_client.c | 42 +++++++++++++++++------------------------ + 1 file changed, 17 insertions(+), 25 deletions(-) + +diff --git a/lib/fencing/st_client.c b/lib/fencing/st_client.c +index 2dfadf922..2ca094566 100644 +--- a/lib/fencing/st_client.c ++++ b/lib/fencing/st_client.c +@@ -887,8 +887,7 @@ static void + invoke_registered_callbacks(stonith_t *stonith, xmlNode *msg, int call_id) + { + stonith_private_t *private = NULL; +- stonith_callback_client_t *blob = NULL; +- stonith_callback_client_t local_blob; ++ stonith_callback_client_t *cb_info = NULL; + int rc = pcmk_ok; + + CRM_CHECK(stonith != NULL, return); +@@ -896,11 +895,6 @@ invoke_registered_callbacks(stonith_t *stonith, xmlNode *msg, int call_id) + + private = stonith->st_private; + +- local_blob.id = NULL; +- local_blob.callback = NULL; +- local_blob.user_data = NULL; +- local_blob.only_success = FALSE; +- + if (msg == NULL) { + // Fencer didn't reply in time + rc = -ETIME; +@@ -919,26 +913,21 @@ invoke_registered_callbacks(stonith_t *stonith, xmlNode *msg, int call_id) + } + } + +- blob = pcmk__intkey_table_lookup(private->stonith_op_callback_table, +- call_id); +- if (blob != NULL) { +- local_blob = *blob; +- blob = NULL; +- +- stonith_api_del_callback(stonith, call_id, FALSE); +- +- } else { +- crm_trace("No callback found for call %d", call_id); +- local_blob.callback = NULL; ++ if (call_id > 0) { ++ cb_info = pcmk__intkey_table_lookup(private->stonith_op_callback_table, ++ call_id); + } + +- if (local_blob.callback != NULL && (rc == pcmk_ok || local_blob.only_success == FALSE)) { +- crm_trace("Invoking callback %s for call %d", crm_str(local_blob.id), call_id); +- 
invoke_fence_action_callback(stonith, call_id, rc, local_blob.user_data, +- local_blob.callback); ++ if ((cb_info != NULL) && (cb_info->callback != NULL) ++ && (rc == pcmk_ok || !(cb_info->only_success))) { ++ crm_trace("Invoking callback %s for call %d", ++ crm_str(cb_info->id), call_id); ++ invoke_fence_action_callback(stonith, call_id, rc, cb_info->user_data, ++ cb_info->callback); + +- } else if (private->op_callback == NULL && rc != pcmk_ok) { +- crm_warn("Fencing command failed: %s", pcmk_strerror(rc)); ++ } else if ((private->op_callback == NULL) && (rc != pcmk_ok)) { ++ crm_warn("Fencing action without registered callback failed: %s", ++ pcmk_strerror(rc)); + crm_log_xml_debug(msg, "Failed fence update"); + } + +@@ -947,7 +936,10 @@ invoke_registered_callbacks(stonith_t *stonith, xmlNode *msg, int call_id) + invoke_fence_action_callback(stonith, call_id, rc, NULL, + private->op_callback); + } +- crm_trace("OP callback activated."); ++ ++ if (cb_info != NULL) { ++ stonith_api_del_callback(stonith, call_id, FALSE); ++ } + } + + static gboolean +-- +2.27.0 + + +From 8113b800ce677ba17a16ca176e8f6f9b4a042316 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Tue, 23 Nov 2021 18:14:48 -0600 +Subject: [PATCH 13/13] Refactor: fencing: add a missing "break" statement + +No effect, but more correct +--- + lib/fencing/st_actions.c | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/lib/fencing/st_actions.c b/lib/fencing/st_actions.c +index 5636810a5..7eaa8b0f2 100644 +--- a/lib/fencing/st_actions.c ++++ b/lib/fencing/st_actions.c +@@ -336,6 +336,7 @@ stonith__result2rc(const pcmk__action_result_t *result) + case CRM_EX_EXPIRED: return EHOSTUNREACH; + default: break; + } ++ break; + + default: + break; +-- +2.27.0 + diff --git a/SOURCES/003-pacemakerd-output.patch b/SOURCES/003-pacemakerd-output.patch deleted file mode 100644 index 167e22b..0000000 --- a/SOURCES/003-pacemakerd-output.patch +++ /dev/null @@ -1,343 +0,0 @@ -From 
7c35387a9896cb968cf4087b5cbed94af44e1ea5 Mon Sep 17 00:00:00 2001 -From: Chris Lumens -Date: Fri, 14 May 2021 12:03:46 -0400 -Subject: [PATCH 1/5] Feature: daemons: Convert pacemakerd to formatted output. - -The main purpose of this is to finish getting pacemakerd moved off the -existing command line handling code (pcmk__cli_help in particular) so -that code can eventually be deprecated or removed. pacemakerd itself -does fairly little printing. ---- - daemons/pacemakerd/pacemakerd.c | 58 ++++++++++++++++++++++++++++++----------- - 1 file changed, 43 insertions(+), 15 deletions(-) - -diff --git a/daemons/pacemakerd/pacemakerd.c b/daemons/pacemakerd/pacemakerd.c -index ce194bf..bd59729 100644 ---- a/daemons/pacemakerd/pacemakerd.c -+++ b/daemons/pacemakerd/pacemakerd.c -@@ -25,6 +25,7 @@ - #include - #include - #include -+#include - #include - #include - -@@ -37,6 +38,14 @@ struct { - gboolean standby; - } options; - -+static pcmk__output_t *out = NULL; -+ -+static pcmk__supported_format_t formats[] = { -+ PCMK__SUPPORTED_FORMAT_NONE, -+ PCMK__SUPPORTED_FORMAT_TEXT, -+ { NULL, NULL, NULL } -+}; -+ - static gboolean - pid_cb(const gchar *option_name, const gchar *optarg, gpointer data, GError **err) { - return TRUE; -@@ -1167,10 +1176,10 @@ pacemakerd_event_cb(pcmk_ipc_api_t *pacemakerd_api, - } - - static GOptionContext * --build_arg_context(pcmk__common_args_t *args) { -+build_arg_context(pcmk__common_args_t *args, GOptionGroup **group) { - GOptionContext *context = NULL; - -- context = pcmk__build_arg_context(args, NULL, NULL, NULL); -+ context = pcmk__build_arg_context(args, "text", group, NULL); - pcmk__add_main_args(context, entries); - return context; - } -@@ -1182,9 +1191,11 @@ main(int argc, char **argv) - - GError *error = NULL; - -+ int rc = pcmk_rc_ok; -+ GOptionGroup *output_group = NULL; - pcmk__common_args_t *args = pcmk__new_common_args(SUMMARY); - gchar **processed_args = pcmk__cmdline_preproc(argv, "p"); -- GOptionContext *context = 
build_arg_context(args); -+ GOptionContext *context = build_arg_context(args, &output_group); - - bool old_instance_connected = false; - -@@ -1195,23 +1205,30 @@ main(int argc, char **argv) - mainloop_add_signal(SIGHUP, pcmk_ignore); - mainloop_add_signal(SIGQUIT, pcmk_sigquit); - -+ pcmk__register_formats(output_group, formats); - if (!g_option_context_parse_strv(context, &processed_args, &error)) { - exit_code = CRM_EX_USAGE; - goto done; - } - -+ rc = pcmk__output_new(&out, args->output_ty, args->output_dest, argv); -+ if (rc != pcmk_rc_ok) { -+ exit_code = CRM_EX_ERROR; -+ g_set_error(&error, PCMK__EXITC_ERROR, exit_code, "Error creating output format %s: %s", -+ args->output_ty, pcmk_rc_str(rc)); -+ goto done; -+ } -+ - if (options.features) { -- printf("Pacemaker %s (Build: %s)\n Supporting v%s: %s\n", PACEMAKER_VERSION, BUILD_VERSION, -- CRM_FEATURE_SET, CRM_FEATURES); -+ out->info(out, "Pacemaker %s (Build: %s)\n Supporting v%s: %s", PACEMAKER_VERSION, -+ BUILD_VERSION, CRM_FEATURE_SET, CRM_FEATURES); - exit_code = CRM_EX_OK; - goto done; - } - - if (args->version) { -- g_strfreev(processed_args); -- pcmk__free_arg_context(context); -- /* FIXME: When pacemakerd is converted to use formatted output, this can go. */ -- pcmk__cli_help('v', CRM_EX_USAGE); -+ out->version(out, false); -+ goto done; - } - - setenv("LC_ALL", "C", 1); -@@ -1248,6 +1265,13 @@ main(int argc, char **argv) - crm_ipc_close(old_instance); - crm_ipc_destroy(old_instance); - -+ /* Don't allow any accidental output after this point. 
*/ -+ if (out != NULL) { -+ out->finish(out, exit_code, true, NULL); -+ pcmk__output_free(out); -+ out = NULL; -+ } -+ - #ifdef SUPPORT_COROSYNC - if (mcp_read_config() == FALSE) { - exit_code = CRM_EX_UNAVAILABLE; -@@ -1333,6 +1357,11 @@ done: - g_strfreev(processed_args); - pcmk__free_arg_context(context); - -- pcmk__output_and_clear_error(error, NULL); -+ pcmk__output_and_clear_error(error, out); -+ -+ if (out != NULL) { -+ out->finish(out, exit_code, true, NULL); -+ pcmk__output_free(out); -+ } - crm_exit(exit_code); - } --- -1.8.3.1 - - -From 35e6da64381fcb092d81ce16835cc28670b077cb Mon Sep 17 00:00:00 2001 -From: Chris Lumens -Date: Mon, 17 May 2021 10:04:04 -0400 -Subject: [PATCH 2/5] Features: daemons: Output the pacemakerd feature list in - XML. - ---- - daemons/pacemakerd/pacemakerd.c | 45 ++++++++++++++++++++++++++++++++++++++--- - 1 file changed, 42 insertions(+), 3 deletions(-) - -diff --git a/daemons/pacemakerd/pacemakerd.c b/daemons/pacemakerd/pacemakerd.c -index bd59729..93cf743 100644 ---- a/daemons/pacemakerd/pacemakerd.c -+++ b/daemons/pacemakerd/pacemakerd.c -@@ -43,6 +43,42 @@ static pcmk__output_t *out = NULL; - static pcmk__supported_format_t formats[] = { - PCMK__SUPPORTED_FORMAT_NONE, - PCMK__SUPPORTED_FORMAT_TEXT, -+ PCMK__SUPPORTED_FORMAT_XML, -+ { NULL, NULL, NULL } -+}; -+ -+static int -+pacemakerd_features(pcmk__output_t *out, va_list args) { -+ out->info(out, "Pacemaker %s (Build: %s)\n Supporting v%s: %s", PACEMAKER_VERSION, -+ BUILD_VERSION, CRM_FEATURE_SET, CRM_FEATURES); -+ return pcmk_rc_ok; -+} -+ -+static int -+pacemakerd_features_xml(pcmk__output_t *out, va_list args) { -+ gchar **feature_list = g_strsplit(CRM_FEATURES, " ", 0); -+ -+ pcmk__output_xml_create_parent(out, "pacemakerd", -+ "version", PACEMAKER_VERSION, -+ "build", BUILD_VERSION, -+ "feature_set", CRM_FEATURE_SET, -+ NULL); -+ out->begin_list(out, NULL, NULL, "features"); -+ -+ for (char **s = feature_list; *s != NULL; s++) { -+ 
pcmk__output_create_xml_text_node(out, "feature", *s); -+ } -+ -+ out->end_list(out); -+ -+ g_strfreev(feature_list); -+ return pcmk_rc_ok; -+} -+ -+static pcmk__message_entry_t fmt_functions[] = { -+ { "features", "default", pacemakerd_features }, -+ { "features", "xml", pacemakerd_features_xml }, -+ - { NULL, NULL, NULL } - }; - -@@ -200,7 +236,7 @@ static GOptionContext * - build_arg_context(pcmk__common_args_t *args, GOptionGroup **group) { - GOptionContext *context = NULL; - -- context = pcmk__build_arg_context(args, "text", group, NULL); -+ context = pcmk__build_arg_context(args, "text (default), xml", group, NULL); - pcmk__add_main_args(context, entries); - return context; - } -@@ -241,9 +277,12 @@ main(int argc, char **argv) - goto done; - } - -+ pcmk__force_args(context, &error, "%s --xml-simple-list", g_get_prgname()); -+ -+ pcmk__register_messages(out, fmt_functions); -+ - if (options.features) { -- out->info(out, "Pacemaker %s (Build: %s)\n Supporting v%s: %s", PACEMAKER_VERSION, -- BUILD_VERSION, CRM_FEATURE_SET, CRM_FEATURES); -+ out->message(out, "features"); - exit_code = CRM_EX_OK; - goto done; - } --- -1.8.3.1 - - -From 5b7f5eb35b025b59805cf3c7c3dcb6a3cf4b71b3 Mon Sep 17 00:00:00 2001 -From: Chris Lumens -Date: Mon, 17 May 2021 11:09:53 -0400 -Subject: [PATCH 3/5] Low: daemons: Conditionally enable logging in pacemakerd. - -If we're doing an interactive command-line call, use -pcmk__cli_init_logging. At the moment, all command line calls except -for --shutdown do their work before logging would even come up, so we -really only need to do this for --shutdown. - -If we're doing a daemon call, use crm_log_init. 
---- - daemons/pacemakerd/pacemakerd.c | 7 +++++-- - 1 file changed, 5 insertions(+), 2 deletions(-) - -diff --git a/daemons/pacemakerd/pacemakerd.c b/daemons/pacemakerd/pacemakerd.c -index 93cf743..c20bde7 100644 ---- a/daemons/pacemakerd/pacemakerd.c -+++ b/daemons/pacemakerd/pacemakerd.c -@@ -296,8 +296,11 @@ main(int argc, char **argv) - - pcmk__set_env_option("mcp", "true"); - -- pcmk__cli_init_logging("pacemakerd", args->verbosity); -- crm_log_init(NULL, LOG_INFO, TRUE, FALSE, argc, argv, FALSE); -+ if (options.shutdown) { -+ pcmk__cli_init_logging("pacemakerd", args->verbosity); -+ } else { -+ crm_log_init(NULL, LOG_INFO, TRUE, FALSE, argc, argv, FALSE); -+ } - - crm_debug("Checking for existing Pacemaker instance"); - old_instance = crm_ipc_new(CRM_SYSTEM_MCP, 0); --- -1.8.3.1 - - -From 2393362bb7489e86d937ed46a1c5cfb93d9bf3ab Mon Sep 17 00:00:00 2001 -From: Chris Lumens -Date: Mon, 17 May 2021 11:58:06 -0400 -Subject: [PATCH 4/5] Fix: include: Bump CRM_FEATURE_SET for new pacemakerd - args. - ---- - include/crm/crm.h | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/include/crm/crm.h b/include/crm/crm.h -index fdfc825..92a98fa 100644 ---- a/include/crm/crm.h -+++ b/include/crm/crm.h -@@ -66,7 +66,7 @@ extern "C" { - * >=3.0.13: Fail counts include operation name and interval - * >=3.2.0: DC supports PCMK_LRM_OP_INVALID and PCMK_LRM_OP_NOT_CONNECTED - */ --# define CRM_FEATURE_SET "3.10.0" -+# define CRM_FEATURE_SET "3.10.1" - - /* Pacemaker's CPG protocols use fixed-width binary fields for the sender and - * recipient of a CPG message. This imposes an arbitrary limit on cluster node --- -1.8.3.1 - - -From 3ad8edbd91631b87ef5f53fa2d68f0c8bbb9ee2b Mon Sep 17 00:00:00 2001 -From: Chris Lumens -Date: Mon, 17 May 2021 11:57:09 -0400 -Subject: [PATCH 5/5] Feature: xml: Add schema for pacemakerd. 
- ---- - xml/Makefile.am | 1 + - xml/api/pacemakerd-2.10.rng | 28 ++++++++++++++++++++++++++++ - 2 files changed, 29 insertions(+) - create mode 100644 xml/api/pacemakerd-2.10.rng - -diff --git a/xml/Makefile.am b/xml/Makefile.am -index 12a51c5..b9448d4 100644 ---- a/xml/Makefile.am -+++ b/xml/Makefile.am -@@ -56,6 +56,7 @@ API_request_base = command-output \ - crm_simulate \ - crmadmin \ - digests \ -+ pacemakerd \ - stonith_admin \ - version - -diff --git a/xml/api/pacemakerd-2.10.rng b/xml/api/pacemakerd-2.10.rng -new file mode 100644 -index 0000000..41a11e7 ---- /dev/null -+++ b/xml/api/pacemakerd-2.10.rng -@@ -0,0 +1,28 @@ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ --- -1.8.3.1 - diff --git a/SOURCES/004-check-level.patch b/SOURCES/004-check-level.patch deleted file mode 100644 index f2abb5f..0000000 --- a/SOURCES/004-check-level.patch +++ /dev/null @@ -1,199 +0,0 @@ -From 3905e7eac11298fc20efd567a773666f948edf61 Mon Sep 17 00:00:00 2001 -From: Chris Lumens -Date: Mon, 3 May 2021 11:19:04 -0400 -Subject: [PATCH 1/2] Feature: tools: Add OCF_CHECK_LEVEL to crm_resource - environment. - -If --validate= or --force-check= are given with a level, pass that along -as OCF_CHECK_LEVEL. This argument is optional, and if no value is given -then the environment variable will not be set and whatever's the default -on the resource agent will be used. - -See: rhbz#1955792. 
---- - tools/crm_resource.c | 29 +++++++++++++++++++++-------- - tools/crm_resource.h | 4 ++-- - tools/crm_resource_runtime.c | 13 ++++++++++--- - 3 files changed, 33 insertions(+), 13 deletions(-) - -diff --git a/tools/crm_resource.c b/tools/crm_resource.c -index 45db2b2..6ca96f8 100644 ---- a/tools/crm_resource.c -+++ b/tools/crm_resource.c -@@ -100,6 +100,7 @@ struct { - int timeout_ms; // Parsed from --timeout value - char *agent_spec; // Standard and/or provider and/or agent - gchar *xml_file; // Value of (deprecated) --xml-file -+ int check_level; // Optional value of --validate or --force-check - - // Resource configuration specified via command-line arguments - gboolean cmdline_config; // Resource configuration was via arguments -@@ -113,6 +114,7 @@ struct { - GHashTable *override_params; // Resource parameter values that override config - } options = { - .attr_set_type = XML_TAG_ATTR_SETS, -+ .check_level = -1, - .cib_options = cib_sync_call, - .require_cib = TRUE, - .require_dataset = TRUE, -@@ -402,14 +404,15 @@ static GOptionEntry query_entries[] = { - }; - - static GOptionEntry command_entries[] = { -- { "validate", 0, G_OPTION_FLAG_NO_ARG, G_OPTION_ARG_CALLBACK, -+ { "validate", 0, G_OPTION_FLAG_OPTIONAL_ARG, G_OPTION_ARG_CALLBACK, - validate_or_force_cb, - "Validate resource configuration by calling agent's validate-all\n" - INDENT "action. The configuration may be specified either by giving an\n" - INDENT "existing resource name with -r, or by specifying --class,\n" - INDENT "--agent, and --provider arguments, along with any number of\n" -- INDENT "--option arguments.", -- NULL }, -+ INDENT "--option arguments. An optional LEVEL argument can be given\n" -+ INDENT "to control the level of checking performed.", -+ "LEVEL" }, - { "cleanup", 'C', G_OPTION_FLAG_NO_ARG, G_OPTION_ARG_CALLBACK, cleanup_refresh_cb, - "If resource has any past failures, clear its history and fail\n" - INDENT "count. 
Optionally filtered by --resource, --node, --operation\n" -@@ -546,11 +549,12 @@ static GOptionEntry advanced_entries[] = { - INDENT "the cluster believes the resource is a clone instance already\n" - INDENT "running on the local node.", - NULL }, -- { "force-check", 0, G_OPTION_FLAG_NO_ARG, G_OPTION_ARG_CALLBACK, -+ { "force-check", 0, G_OPTION_FLAG_OPTIONAL_ARG, G_OPTION_ARG_CALLBACK, - validate_or_force_cb, - "(Advanced) Bypass the cluster and check the state of a resource on\n" -- INDENT "the local node", -- NULL }, -+ INDENT "the local node. An optional LEVEL argument can be given\n" -+ INDENT "to control the level of checking performed.", -+ "LEVEL" }, - - { NULL } - }; -@@ -910,6 +914,15 @@ validate_or_force_cb(const gchar *option_name, const gchar *optarg, - if (options.override_params == NULL) { - options.override_params = pcmk__strkey_table(free, free); - } -+ -+ if (optarg != NULL) { -+ if (pcmk__scan_min_int(optarg, &options.check_level, 0) != pcmk_rc_ok) { -+ g_set_error(error, G_OPTION_ERROR, CRM_EX_INVALID_PARAM, -+ "Invalid check level setting: %s", optarg); -+ return FALSE; -+ } -+ } -+ - return TRUE; - } - -@@ -1826,12 +1839,12 @@ main(int argc, char **argv) - options.v_class, options.v_provider, options.v_agent, - "validate-all", options.cmdline_params, - options.override_params, options.timeout_ms, -- args->verbosity, options.force); -+ args->verbosity, options.force, options.check_level); - } else { - exit_code = cli_resource_execute(rsc, options.rsc_id, - options.operation, options.override_params, - options.timeout_ms, cib_conn, data_set, -- args->verbosity, options.force); -+ args->verbosity, options.force, options.check_level); - } - goto done; - -diff --git a/tools/crm_resource.h b/tools/crm_resource.h -index 3560377..5ab10d6 100644 ---- a/tools/crm_resource.h -+++ b/tools/crm_resource.h -@@ -88,11 +88,11 @@ crm_exit_t cli_resource_execute_from_params(pcmk__output_t *out, const char *rsc - const char *rsc_type, const char *rsc_action, - 
GHashTable *params, GHashTable *override_hash, - int timeout_ms, int resource_verbose, -- gboolean force); -+ gboolean force, int check_level); - crm_exit_t cli_resource_execute(pe_resource_t *rsc, const char *requested_name, - const char *rsc_action, GHashTable *override_hash, - int timeout_ms, cib_t *cib, pe_working_set_t *data_set, -- int resource_verbose, gboolean force); -+ int resource_verbose, gboolean force, int check_level); - - int cli_resource_update_attribute(pe_resource_t *rsc, const char *requested_name, - const char *attr_set, const char *attr_set_type, -diff --git a/tools/crm_resource_runtime.c b/tools/crm_resource_runtime.c -index fe0ec98..bde83b6 100644 ---- a/tools/crm_resource_runtime.c -+++ b/tools/crm_resource_runtime.c -@@ -1679,7 +1679,8 @@ cli_resource_execute_from_params(pcmk__output_t *out, const char *rsc_name, - const char *rsc_class, const char *rsc_prov, - const char *rsc_type, const char *action, - GHashTable *params, GHashTable *override_hash, -- int timeout_ms, int resource_verbose, gboolean force) -+ int timeout_ms, int resource_verbose, gboolean force, -+ int check_level) - { - GHashTable *params_copy = NULL; - crm_exit_t exit_code = CRM_EX_OK; -@@ -1703,6 +1704,12 @@ cli_resource_execute_from_params(pcmk__output_t *out, const char *rsc_name, - /* add crm_feature_set env needed by some resource agents */ - g_hash_table_insert(params, strdup(XML_ATTR_CRM_VERSION), strdup(CRM_FEATURE_SET)); - -+ if (check_level >= 0) { -+ char *level = crm_strdup_printf("%d", check_level); -+ setenv("OCF_CHECK_LEVEL", level, 1); -+ free(level); -+ } -+ - /* resources_action_create frees the params hash table it's passed, but we - * may need to reuse it in a second call to resources_action_create. 
Thus - * we'll make a copy here so that gets freed and the original remains for -@@ -1790,7 +1797,7 @@ crm_exit_t - cli_resource_execute(pe_resource_t *rsc, const char *requested_name, - const char *rsc_action, GHashTable *override_hash, - int timeout_ms, cib_t * cib, pe_working_set_t *data_set, -- int resource_verbose, gboolean force) -+ int resource_verbose, gboolean force, int check_level) - { - pcmk__output_t *out = data_set->priv; - crm_exit_t exit_code = CRM_EX_OK; -@@ -1856,7 +1863,7 @@ cli_resource_execute(pe_resource_t *rsc, const char *requested_name, - - exit_code = cli_resource_execute_from_params(out, rid, rclass, rprov, rtype, action, - params, override_hash, timeout_ms, -- resource_verbose, force); -+ resource_verbose, force, check_level); - return exit_code; - } - --- -1.8.3.1 - - -From d13ba4bd6defe0dd81fdf8ab39ae5b889513c0c0 Mon Sep 17 00:00:00 2001 -From: Chris Lumens -Date: Thu, 20 May 2021 10:59:23 -0400 -Subject: [PATCH 2/2] Fix: include: Bump feature set to 3.10.2. - -This is for the OCF_CHECK_LEVEL environment variable. - -See: rhbz#1955792. ---- - include/crm/crm.h | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/include/crm/crm.h b/include/crm/crm.h -index 92a98fa..ee52c36 100644 ---- a/include/crm/crm.h -+++ b/include/crm/crm.h -@@ -66,7 +66,7 @@ extern "C" { - * >=3.0.13: Fail counts include operation name and interval - * >=3.2.0: DC supports PCMK_LRM_OP_INVALID and PCMK_LRM_OP_NOT_CONNECTED - */ --# define CRM_FEATURE_SET "3.10.1" -+# define CRM_FEATURE_SET "3.10.2" - - /* Pacemaker's CPG protocols use fixed-width binary fields for the sender and - * recipient of a CPG message. 
This imposes an arbitrary limit on cluster node --- -1.8.3.1 - diff --git a/SOURCES/004-systemd-metadata.patch b/SOURCES/004-systemd-metadata.patch new file mode 100644 index 0000000..142ef6a --- /dev/null +++ b/SOURCES/004-systemd-metadata.patch @@ -0,0 +1,73 @@ +From 09ef95a2eed48b4eb7488788a1b655d67eafe783 Mon Sep 17 00:00:00 2001 +From: Chris Lumens +Date: Tue, 30 Nov 2021 14:47:12 -0500 +Subject: [PATCH] Low: libcrmservice: Handle systemd service templates. + +These unit files (which have an @ sign at the end) expect to be +parameterized by an instance name. Not providing an instance name +causes the dbus lookup to fail, and we fall back to assume this is an +LSB service. If the user doesn't provide an instance name, just add a +fake one. It doesn't seem to matter what name is given for the lookup. + +See: rhbz#2003151 +--- + lib/services/systemd.c | 22 ++++++++++++++++------ + 1 file changed, 16 insertions(+), 6 deletions(-) + +diff --git a/lib/services/systemd.c b/lib/services/systemd.c +index 8e9fff484..27a3b376d 100644 +--- a/lib/services/systemd.c ++++ b/lib/services/systemd.c +@@ -206,17 +206,27 @@ systemd_unit_extension(const char *name) + } + + static char * +-systemd_service_name(const char *name) ++systemd_service_name(const char *name, bool add_instance_name) + { +- if (name == NULL) { ++ if (pcmk__str_empty(name)) { + return NULL; + } + + if (systemd_unit_extension(name)) { + return strdup(name); +- } + +- return crm_strdup_printf("%s.service", name); ++ /* Services that end with an @ sign are systemd templates. They expect an ++ * instance name to follow the service name. If no instance name was ++ * provided, just add "x" to the string as the instance name. It doesn't ++ * seem to matter for purposes of looking up whether a service exists or ++ * not. 
++ */ ++ } else if (add_instance_name && *(name+strlen(name)-1) == '@') { ++ return crm_strdup_printf("%sx.service", name); ++ ++ } else { ++ return crm_strdup_printf("%s.service", name); ++ } + } + + static void +@@ -427,7 +437,7 @@ invoke_unit_by_name(const char *arg_name, svc_action_t *op, char **path) + CRM_ASSERT(msg != NULL); + + // Add the (expanded) unit name as the argument +- name = systemd_service_name(arg_name); ++ name = systemd_service_name(arg_name, op == NULL || pcmk__str_eq(op->action, "meta-data", pcmk__str_none)); + CRM_LOG_ASSERT(dbus_message_append_args(msg, DBUS_TYPE_STRING, &name, + DBUS_TYPE_INVALID)); + free(name); +@@ -944,7 +954,7 @@ invoke_unit_by_path(svc_action_t *op, const char *unit) + /* (ss) */ + { + const char *replace_s = "replace"; +- char *name = systemd_service_name(op->agent); ++ char *name = systemd_service_name(op->agent, pcmk__str_eq(op->action, "meta-data", pcmk__str_none)); + + CRM_LOG_ASSERT(dbus_message_append_args(msg, DBUS_TYPE_STRING, &name, DBUS_TYPE_INVALID)); + CRM_LOG_ASSERT(dbus_message_append_args(msg, DBUS_TYPE_STRING, &replace_s, DBUS_TYPE_INVALID)); +-- +2.27.0 + diff --git a/SOURCES/005-crm_resource.patch b/SOURCES/005-crm_resource.patch deleted file mode 100644 index 1683026..0000000 --- a/SOURCES/005-crm_resource.patch +++ /dev/null @@ -1,866 +0,0 @@ -From a5a507d4e1abf242903472719a19977811e6f164 Mon Sep 17 00:00:00 2001 -From: Chris Lumens -Date: Thu, 20 May 2021 11:59:36 -0400 -Subject: [PATCH 01/10] Feature: libcrmcommon: Add OCF_OUTPUT_FORMAT to - crm_resource environment. 
- -See: rhbz#1644628 ---- - lib/common/output.c | 2 ++ - 1 file changed, 2 insertions(+) - -diff --git a/lib/common/output.c b/lib/common/output.c -index 6cb49b5..58872e0 100644 ---- a/lib/common/output.c -+++ b/lib/common/output.c -@@ -71,6 +71,8 @@ pcmk__output_new(pcmk__output_t **out, const char *fmt_name, const char *filenam - return ENOMEM; - } - -+ setenv("OCF_OUTPUT_FORMAT", (*out)->fmt_name, 1); -+ - return pcmk_rc_ok; - } - --- -1.8.3.1 - - -From acc6ecdbfb797d69794e68f75a734d6252434e01 Mon Sep 17 00:00:00 2001 -From: Chris Lumens -Date: Fri, 21 May 2021 14:20:30 -0400 -Subject: [PATCH 02/10] Feature: schemas: Copy crm_resource schema in - preparation for changes. - -See: rhbz#1644628 ---- - xml/api/crm_resource-2.11.rng | 238 ++++++++++++++++++++++++++++++++++++++++++ - 1 file changed, 238 insertions(+) - create mode 100644 xml/api/crm_resource-2.11.rng - -diff --git a/xml/api/crm_resource-2.11.rng b/xml/api/crm_resource-2.11.rng -new file mode 100644 -index 0000000..8e386db ---- /dev/null -+++ b/xml/api/crm_resource-2.11.rng -@@ -0,0 +1,238 @@ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ promoted -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ ocf -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ true -+ false -+ -+ -+ -+ true -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ Stopped -+ Started -+ Master -+ Slave -+ -+ -+ --- -1.8.3.1 - - -From 1bbdf2149a111e9e19c388834f82001e0d31c427 Mon Sep 17 00:00:00 2001 -From: Chris Lumens -Date: Mon, 24 May 2021 12:23:55 -0400 -Subject: 
[PATCH 03/10] Feature: xml: Update the crm_resource schema for XML - output. - -See: rhbz#1644628 ---- - xml/api/crm_resource-2.11.rng | 50 +++++++++++++++++++++++++++++++++++++++++++ - 1 file changed, 50 insertions(+) - -diff --git a/xml/api/crm_resource-2.11.rng b/xml/api/crm_resource-2.11.rng -index 8e386db..aaa54d6 100644 ---- a/xml/api/crm_resource-2.11.rng -+++ b/xml/api/crm_resource-2.11.rng -@@ -20,6 +20,7 @@ - - - -+ - - - -@@ -227,6 +228,55 @@ - - - -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ - - - Stopped --- -1.8.3.1 - - -From d89f5bc7fec856fdcd32fa14edbd0019507d5d15 Mon Sep 17 00:00:00 2001 -From: Chris Lumens -Date: Tue, 1 Jun 2021 15:26:58 -0400 -Subject: [PATCH 04/10] Low: libcrmcommon: Increase PCMK__API_VERSION for new - crm_resource output. - -See: rhbz#1644628 ---- - include/crm/common/output_internal.h | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/include/crm/common/output_internal.h b/include/crm/common/output_internal.h -index 10b315b..0436cde 100644 ---- a/include/crm/common/output_internal.h -+++ b/include/crm/common/output_internal.h -@@ -27,7 +27,7 @@ extern "C" { - # include - # include - --# define PCMK__API_VERSION "2.9" -+# define PCMK__API_VERSION "2.11" - - #if defined(PCMK__WITH_ATTRIBUTE_OUTPUT_ARGS) - # define PCMK__OUTPUT_ARGS(ARGS...) __attribute__((output_args(ARGS))) --- -1.8.3.1 - - -From 30bd2ddf43ee2a911681e51f40ed9ba20ec250b0 Mon Sep 17 00:00:00 2001 -From: Chris Lumens -Date: Thu, 27 May 2021 13:57:12 -0400 -Subject: [PATCH 05/10] Low: tools: Pass NULL to - cli_resource_execute_from_params... - -if no resource name is given. This happens if we are validating based -on the --class/--agent/--provider command line options instead. 
---- - tools/crm_resource.c | 2 +- - tools/crm_resource_runtime.c | 8 ++++---- - 2 files changed, 5 insertions(+), 5 deletions(-) - -diff --git a/tools/crm_resource.c b/tools/crm_resource.c -index 24f1121..37a0bb0 100644 ---- a/tools/crm_resource.c -+++ b/tools/crm_resource.c -@@ -1840,7 +1840,7 @@ main(int argc, char **argv) - - case cmd_execute_agent: - if (options.cmdline_config) { -- exit_code = cli_resource_execute_from_params(out, "test", -+ exit_code = cli_resource_execute_from_params(out, NULL, - options.v_class, options.v_provider, options.v_agent, - "validate-all", options.cmdline_params, - options.override_params, options.timeout_ms, -diff --git a/tools/crm_resource_runtime.c b/tools/crm_resource_runtime.c -index 48a4b40..ebf48bb 100644 ---- a/tools/crm_resource_runtime.c -+++ b/tools/crm_resource_runtime.c -@@ -1717,14 +1717,14 @@ cli_resource_execute_from_params(pcmk__output_t *out, const char *rsc_name, - */ - params_copy = pcmk__str_table_dup(params); - -- op = resources_action_create(rsc_name, rsc_class, rsc_prov, rsc_type, action, 0, -- timeout_ms, params_copy, 0); -+ op = resources_action_create(rsc_name ? rsc_name : "test", rsc_class, rsc_prov, -+ rsc_type, action, 0, timeout_ms, params_copy, 0); - if (op == NULL) { - /* Re-run with stderr enabled so we can display a sane error message */ - crm_enable_stderr(TRUE); - params_copy = pcmk__str_table_dup(params); -- op = resources_action_create(rsc_name, rsc_class, rsc_prov, rsc_type, action, 0, -- timeout_ms, params_copy, 0); -+ op = resources_action_create(rsc_name ? rsc_name : "test", rsc_class, rsc_prov, -+ rsc_type, action, 0, timeout_ms, params_copy, 0); - - /* Callers of cli_resource_execute expect that the params hash table will - * be freed. 
That function uses this one, so for that reason and for --- -1.8.3.1 - - -From ee56efd53d14cfc4f902769540b72b3bb6096a73 Mon Sep 17 00:00:00 2001 -From: Chris Lumens -Date: Mon, 24 May 2021 12:08:52 -0400 -Subject: [PATCH 06/10] Feature: tools: Add an agent-status message for - crm_resource. - -This moves what was previously only done in an out->info call to its own -output message, which means it will appear in XML output as well. Also, -note that if --class/--agent/--provider are given, the resource name -will be set to "test". In that case, do not display the resource name -in the output. - -This message will be used for --validate and the --force-* command line -options to crm_resource. - -See: rhbz#1644628 ---- - tools/crm_resource_print.c | 53 ++++++++++++++++++++++++++++++++++++++++++++++ - 1 file changed, 53 insertions(+) - -diff --git a/tools/crm_resource_print.c b/tools/crm_resource_print.c -index 9d82cf8..88d5878 100644 ---- a/tools/crm_resource_print.c -+++ b/tools/crm_resource_print.c -@@ -152,6 +152,57 @@ attribute_list_default(pcmk__output_t *out, va_list args) { - return pcmk_rc_ok; - } - -+PCMK__OUTPUT_ARGS("agent-status", "int", "const char *", "const char *", "const char *", -+ "const char *", "const char *", "int") -+static int -+agent_status_default(pcmk__output_t *out, va_list args) { -+ int status = va_arg(args, int); -+ const char *action = va_arg(args, const char *); -+ const char *name = va_arg(args, const char *); -+ const char *class = va_arg(args, const char *); -+ const char *provider = va_arg(args, const char *); -+ const char *type = va_arg(args, const char *); -+ int rc = va_arg(args, int); -+ -+ if (status == PCMK_LRM_OP_DONE) { -+ out->info(out, "Operation %s%s%s (%s%s%s:%s) returned: '%s' (%d)", -+ action, name ? " for " : "", name ? name : "", -+ class, provider ? ":" : "", provider ? 
provider : "", type, -+ services_ocf_exitcode_str(rc), rc); -+ } else { -+ out->err(out, "Operation %s%s%s (%s%s%s:%s) failed: '%s' (%d)", -+ action, name ? " for " : "", name ? name : "", -+ class, provider ? ":" : "", provider ? provider : "", type, -+ services_lrm_status_str(status), status); -+ } -+ -+ return pcmk_rc_ok; -+} -+ -+PCMK__OUTPUT_ARGS("agent-status", "int", "const char *", "const char *", "const char *", -+ "const char *", "const char *", "int") -+static int -+agent_status_xml(pcmk__output_t *out, va_list args) { -+ int status G_GNUC_UNUSED = va_arg(args, int); -+ const char *action G_GNUC_UNUSED = va_arg(args, const char *); -+ const char *name G_GNUC_UNUSED = va_arg(args, const char *); -+ const char *class G_GNUC_UNUSED = va_arg(args, const char *); -+ const char *provider G_GNUC_UNUSED = va_arg(args, const char *); -+ const char *type G_GNUC_UNUSED = va_arg(args, const char *); -+ int rc = va_arg(args, int); -+ -+ char *status_str = pcmk__itoa(rc); -+ -+ pcmk__output_create_xml_node(out, "agent-status", -+ "code", status_str, -+ "message", services_ocf_exitcode_str(rc), -+ NULL); -+ -+ free(status_str); -+ -+ return pcmk_rc_ok; -+} -+ - PCMK__OUTPUT_ARGS("attribute-list", "pe_resource_t *", "char *", "GHashTable *") - static int - attribute_list_text(pcmk__output_t *out, va_list args) { -@@ -562,6 +613,8 @@ resource_names(pcmk__output_t *out, va_list args) { - } - - static pcmk__message_entry_t fmt_functions[] = { -+ { "agent-status", "default", agent_status_default }, -+ { "agent-status", "xml", agent_status_xml }, - { "attribute-list", "default", attribute_list_default }, - { "attribute-list", "text", attribute_list_text }, - { "property-list", "default", property_list_default }, --- -1.8.3.1 - - -From 85cb6b6bff96b18c5174d11e4de4d49cbfb20bb7 Mon Sep 17 00:00:00 2001 -From: Chris Lumens -Date: Tue, 1 Jun 2021 14:47:30 -0400 -Subject: [PATCH 07/10] Feature: tools: Add an overridden params output - message. 
- -This also replaces what was previously being done in an out->info call -with an output message. This means it shows up in XML output as well. -Also, note that if --class/--agent/--provider are given, the resource -name will be set to "test". In that case, do not display the resource -name in the output. - -See: rhbz#1644628 ---- - tools/crm_resource_print.c | 39 +++++++++++++++++++++++++++++++++++++++ - 1 file changed, 39 insertions(+) - -diff --git a/tools/crm_resource_print.c b/tools/crm_resource_print.c -index 88d5878..119d83f 100644 ---- a/tools/crm_resource_print.c -+++ b/tools/crm_resource_print.c -@@ -224,6 +224,43 @@ attribute_list_text(pcmk__output_t *out, va_list args) { - return pcmk_rc_ok; - } - -+PCMK__OUTPUT_ARGS("override", "const char *", "const char *", "const char *") -+static int -+override_default(pcmk__output_t *out, va_list args) { -+ const char *rsc_name = va_arg(args, const char *); -+ const char *name = va_arg(args, const char *); -+ const char *value = va_arg(args, const char *); -+ -+ if (rsc_name == NULL) { -+ out->list_item(out, NULL, "Overriding the cluster configuration with '%s' = '%s'", -+ name, value); -+ } else { -+ out->list_item(out, NULL, "Overriding the cluster configuration for '%s' with '%s' = '%s'", -+ rsc_name, name, value); -+ } -+ -+ return pcmk_rc_ok; -+} -+ -+PCMK__OUTPUT_ARGS("override", "const char *", "const char *", "const char *") -+static int -+override_xml(pcmk__output_t *out, va_list args) { -+ const char *rsc_name = va_arg(args, const char *); -+ const char *name = va_arg(args, const char *); -+ const char *value = va_arg(args, const char *); -+ -+ xmlNodePtr node = pcmk__output_create_xml_node(out, "override", -+ "name", name, -+ "value", value, -+ NULL); -+ -+ if (rsc_name != NULL) { -+ crm_xml_add(node, "rsc", rsc_name); -+ } -+ -+ return pcmk_rc_ok; -+} -+ - PCMK__OUTPUT_ARGS("property-list", "pe_resource_t *", "char *") - static int - property_list_default(pcmk__output_t *out, va_list args) { -@@ 
-617,6 +654,8 @@ static pcmk__message_entry_t fmt_functions[] = { - { "agent-status", "xml", agent_status_xml }, - { "attribute-list", "default", attribute_list_default }, - { "attribute-list", "text", attribute_list_text }, -+ { "override", "default", override_default }, -+ { "override", "xml", override_xml }, - { "property-list", "default", property_list_default }, - { "property-list", "text", property_list_text }, - { "resource-check-list", "default", resource_check_list_default }, --- -1.8.3.1 - - -From e5e24592c7c3231c619fb5253e7925ffbc634a99 Mon Sep 17 00:00:00 2001 -From: Chris Lumens -Date: Fri, 4 Jun 2021 10:24:51 -0400 -Subject: [PATCH 08/10] Low: tools: Use simple XML lists for resource actions - as well. - -See: rhbz#1644628 ---- - tools/crm_resource.c | 1 + - 1 file changed, 1 insertion(+) - -diff --git a/tools/crm_resource.c b/tools/crm_resource.c -index 37a0bb0..e957011 100644 ---- a/tools/crm_resource.c -+++ b/tools/crm_resource.c -@@ -1643,6 +1643,7 @@ main(int argc, char **argv) - * saves from having to write custom messages to build the lists around all these things - */ - switch (options.rsc_cmd) { -+ case cmd_execute_agent: - case cmd_list_resources: - case cmd_query_xml: - case cmd_query_raw_xml: --- -1.8.3.1 - - -From 3e75174d0bc31b261adb1994214a5878b79da85b Mon Sep 17 00:00:00 2001 -From: Chris Lumens -Date: Fri, 4 Jun 2021 10:30:10 -0400 -Subject: [PATCH 09/10] Feature: tools: Add an output message for resource - actions. - -This wraps up the override and agent-status messages into a single -message, along with any stdout/stderr from the resource action. This -message should be called after taking the action. - -This also implements handling XML output from resource actions. Check -to see if the validate-all action returns XML. If so, output it as a -CDATA block under a "command" element. If not, treat it as plain text -and output it as stdout/stderr from a command. 
- -See: rhbz#1644628 ---- - tools/crm_resource_print.c | 122 +++++++++++++++++++++++++++++++++++++++++++++ - 1 file changed, 122 insertions(+) - -diff --git a/tools/crm_resource_print.c b/tools/crm_resource_print.c -index 119d83f..19a366d 100644 ---- a/tools/crm_resource_print.c -+++ b/tools/crm_resource_print.c -@@ -293,6 +293,126 @@ property_list_text(pcmk__output_t *out, va_list args) { - return pcmk_rc_ok; - } - -+PCMK__OUTPUT_ARGS("resource-agent-action", "int", "const char *", "const char *", -+ "const char *", "const char *", "const char *", "GHashTable *", -+ "int", "int", "char *", "char *") -+static int -+resource_agent_action_default(pcmk__output_t *out, va_list args) { -+ int verbose = va_arg(args, int); -+ -+ const char *class = va_arg(args, const char *); -+ const char *provider = va_arg(args, const char *); -+ const char *type = va_arg(args, const char *); -+ const char *rsc_name = va_arg(args, const char *); -+ const char *action = va_arg(args, const char *); -+ GHashTable *overrides = va_arg(args, GHashTable *); -+ int rc = va_arg(args, int); -+ int status = va_arg(args, int); -+ char *stdout_data = va_arg(args, char *); -+ char *stderr_data = va_arg(args, char *); -+ -+ if (overrides) { -+ GHashTableIter iter; -+ char *name = NULL; -+ char *value = NULL; -+ -+ out->begin_list(out, NULL, NULL, "overrides"); -+ -+ g_hash_table_iter_init(&iter, overrides); -+ while (g_hash_table_iter_next(&iter, (gpointer *) &name, (gpointer *) &value)) { -+ out->message(out, "override", rsc_name, name, value); -+ } -+ -+ out->end_list(out); -+ } -+ -+ out->message(out, "agent-status", status, action, rsc_name, class, provider, -+ type, rc); -+ -+ /* hide output for validate-all if not in verbose */ -+ if (verbose == 0 && pcmk__str_eq(action, "validate-all", pcmk__str_casei)) { -+ return pcmk_rc_ok; -+ } -+ -+ if (stdout_data || stderr_data) { -+ xmlNodePtr doc = string2xml(stdout_data); -+ -+ if (doc != NULL) { -+ out->output_xml(out, "command", stdout_data); -+ 
xmlFreeNode(doc); -+ } else { -+ out->subprocess_output(out, rc, stdout_data, stderr_data); -+ } -+ } -+ -+ return pcmk_rc_ok; -+} -+ -+PCMK__OUTPUT_ARGS("resource-agent-action", "int", "const char *", "const char *", -+ "const char *", "const char *", "const char *", "GHashTable *", -+ "int", "int", "char *", "char *") -+static int -+resource_agent_action_xml(pcmk__output_t *out, va_list args) { -+ int verbose G_GNUC_UNUSED = va_arg(args, int); -+ -+ const char *class = va_arg(args, const char *); -+ const char *provider = va_arg(args, const char *); -+ const char *type = va_arg(args, const char *); -+ const char *rsc_name = va_arg(args, const char *); -+ const char *action = va_arg(args, const char *); -+ GHashTable *overrides = va_arg(args, GHashTable *); -+ int rc = va_arg(args, int); -+ int status = va_arg(args, int); -+ char *stdout_data = va_arg(args, char *); -+ char *stderr_data = va_arg(args, char *); -+ -+ xmlNodePtr node = pcmk__output_xml_create_parent(out, "resource-agent-action", -+ "action", action, -+ "class", class, -+ "type", type, -+ NULL); -+ -+ if (rsc_name) { -+ crm_xml_add(node, "rsc", rsc_name); -+ } -+ -+ if (provider) { -+ crm_xml_add(node, "provider", provider); -+ } -+ -+ if (overrides) { -+ GHashTableIter iter; -+ char *name = NULL; -+ char *value = NULL; -+ -+ out->begin_list(out, NULL, NULL, "overrides"); -+ -+ g_hash_table_iter_init(&iter, overrides); -+ while (g_hash_table_iter_next(&iter, (gpointer *) &name, (gpointer *) &value)) { -+ out->message(out, "override", rsc_name, name, value); -+ } -+ -+ out->end_list(out); -+ } -+ -+ out->message(out, "agent-status", status, action, rsc_name, class, provider, -+ type, rc); -+ -+ if (stdout_data || stderr_data) { -+ xmlNodePtr doc = string2xml(stdout_data); -+ -+ if (doc != NULL) { -+ out->output_xml(out, "command", stdout_data); -+ xmlFreeNode(doc); -+ } else { -+ out->subprocess_output(out, rc, stdout_data, stderr_data); -+ } -+ } -+ -+ pcmk__output_xml_pop_parent(out); -+ return 
pcmk_rc_ok; -+} -+ - PCMK__OUTPUT_ARGS("resource-check-list", "resource_checks_t *") - static int - resource_check_list_default(pcmk__output_t *out, va_list args) { -@@ -658,6 +778,8 @@ static pcmk__message_entry_t fmt_functions[] = { - { "override", "xml", override_xml }, - { "property-list", "default", property_list_default }, - { "property-list", "text", property_list_text }, -+ { "resource-agent-action", "default", resource_agent_action_default }, -+ { "resource-agent-action", "xml", resource_agent_action_xml }, - { "resource-check-list", "default", resource_check_list_default }, - { "resource-check-list", "xml", resource_check_list_xml }, - { "resource-search-list", "default", resource_search_list_default }, --- -1.8.3.1 - - -From b50b2418e1e997b42f5370b4672a3f105d74634f Mon Sep 17 00:00:00 2001 -From: Chris Lumens -Date: Fri, 4 Jun 2021 10:40:16 -0400 -Subject: [PATCH 10/10] Feature: tools: Use the new resource-agent-action - message. - -See: rhbz#1644628 ---- - tools/crm_resource_runtime.c | 21 +++------------------ - 1 file changed, 3 insertions(+), 18 deletions(-) - -diff --git a/tools/crm_resource_runtime.c b/tools/crm_resource_runtime.c -index ebf48bb..755be9f 100644 ---- a/tools/crm_resource_runtime.c -+++ b/tools/crm_resource_runtime.c -@@ -1765,28 +1765,13 @@ cli_resource_execute_from_params(pcmk__output_t *out, const char *rsc_name, - if (services_action_sync(op)) { - exit_code = op->rc; - -- if (op->status == PCMK_LRM_OP_DONE) { -- out->info(out, "Operation %s for %s (%s:%s:%s) returned: '%s' (%d)", -- action, rsc_name, rsc_class, rsc_prov ? rsc_prov : "", rsc_type, -- services_ocf_exitcode_str(op->rc), op->rc); -- } else { -- out->err(out, "Operation %s for %s (%s:%s:%s) failed: '%s' (%d)", -- action, rsc_name, rsc_class, rsc_prov ? 
rsc_prov : "", rsc_type, -- services_lrm_status_str(op->status), op->status); -- } -- -- /* hide output for validate-all if not in verbose */ -- if (resource_verbose == 0 && pcmk__str_eq(action, "validate-all", pcmk__str_casei)) -- goto done; -- -- if (op->stdout_data || op->stderr_data) { -- out->subprocess_output(out, op->rc, op->stdout_data, op->stderr_data); -- } -+ out->message(out, "resource-agent-action", resource_verbose, rsc_class, -+ rsc_prov, rsc_type, rsc_name, action, override_hash, op->rc, -+ op->status, op->stdout_data, op->stderr_data); - } else { - exit_code = op->rc == 0 ? CRM_EX_ERROR : op->rc; - } - --done: - services_action_free(op); - /* See comment above about why we free params here. */ - g_hash_table_destroy(params); --- -1.8.3.1 - diff --git a/SOURCES/005-fencing-reasons.patch b/SOURCES/005-fencing-reasons.patch new file mode 100644 index 0000000..e0772c6 --- /dev/null +++ b/SOURCES/005-fencing-reasons.patch @@ -0,0 +1,2200 @@ +From 3d10dad9a555aae040d8473edfe31a4e4279c066 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Thu, 11 Nov 2021 12:34:03 -0600 +Subject: [PATCH 01/19] Refactor: libcrmcommon: add internal API for checking + for fencing action + +The naming is a little awkward -- "fencing action" has multiple meanings +depending on the context. It can refer to fencer API requests, fence device +actions, fence agent actions, or just those actions that fence a node (off and +reboot). + +This new function pcmk__is_fencing_action() uses the last meaning, so it does +*not* return true for unfencing ("on" actions). 
+--- + include/crm/common/internal.h | 1 + + lib/common/operations.c | 14 ++++++++++++++ + 2 files changed, 15 insertions(+) + +diff --git a/include/crm/common/internal.h b/include/crm/common/internal.h +index a35c5769a..694fc6cd4 100644 +--- a/include/crm/common/internal.h ++++ b/include/crm/common/internal.h +@@ -218,6 +218,7 @@ char *pcmk__notify_key(const char *rsc_id, const char *notify_type, + char *pcmk__transition_key(int transition_id, int action_id, int target_rc, + const char *node); + void pcmk__filter_op_for_digest(xmlNode *param_set); ++bool pcmk__is_fencing_action(const char *action); + + + // bitwise arithmetic utilities +diff --git a/lib/common/operations.c b/lib/common/operations.c +index aa7106ce6..366c18970 100644 +--- a/lib/common/operations.c ++++ b/lib/common/operations.c +@@ -523,3 +523,17 @@ crm_op_needs_metadata(const char *rsc_class, const char *op) + CRMD_ACTION_MIGRATE, CRMD_ACTION_MIGRATED, + CRMD_ACTION_NOTIFY, NULL); + } ++ ++/*! ++ * \internal ++ * \brief Check whether an action name is for a fencing action ++ * ++ * \param[in] action Action name to check ++ * ++ * \return true if \p action is "off", "reboot", or "poweroff", otherwise false ++ */ ++bool ++pcmk__is_fencing_action(const char *action) ++{ ++ return pcmk__str_any_of(action, "off", "reboot", "poweroff", NULL); ++} +-- +2.27.0 + + +From 86ac00fb3e99d79ca2c442ae1670fe850146f734 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Thu, 11 Nov 2021 12:38:58 -0600 +Subject: [PATCH 02/19] Low: fencer,scheduler: compare fence action names + case-sensitively + +Use the new convenience function pcmk__is_fencing_action() to check whether +an action name is a fencing action ("off", "reboot", or "poweroff"). This +changes the behavior from case-insensitive to case-sensitive, which is more +appropriate (the case-insensitivity was inherited from lazy use of the old +safe_str_eq() function which was always case-insensitive). 
+--- + daemons/fenced/fenced_commands.c | 6 +++--- + daemons/fenced/fenced_remote.c | 2 +- + lib/pacemaker/pcmk_graph_producer.c | 2 +- + lib/pengine/common.c | 8 +------- + 4 files changed, 6 insertions(+), 12 deletions(-) + +diff --git a/daemons/fenced/fenced_commands.c b/daemons/fenced/fenced_commands.c +index 63bfad3a9..46c840f2a 100644 +--- a/daemons/fenced/fenced_commands.c ++++ b/daemons/fenced/fenced_commands.c +@@ -128,7 +128,7 @@ get_action_delay_max(stonith_device_t * device, const char * action) + const char *value = NULL; + int delay_max = 0; + +- if (!pcmk__strcase_any_of(action, "off", "reboot", NULL)) { ++ if (!pcmk__is_fencing_action(action)) { + return 0; + } + +@@ -146,7 +146,7 @@ get_action_delay_base(stonith_device_t *device, const char *action, const char * + char *hash_value = NULL; + int delay_base = 0; + +- if (!pcmk__strcase_any_of(action, "off", "reboot", NULL)) { ++ if (!pcmk__is_fencing_action(action)) { + return 0; + } + +@@ -448,7 +448,7 @@ stonith_device_execute(stonith_device_t * device) + + if (pcmk__str_any_of(device->agent, STONITH_WATCHDOG_AGENT, + STONITH_WATCHDOG_AGENT_INTERNAL, NULL)) { +- if (pcmk__strcase_any_of(cmd->action, "reboot", "off", NULL)) { ++ if (pcmk__is_fencing_action(cmd->action)) { + if (node_does_watchdog_fencing(stonith_our_uname)) { + pcmk__panic(__func__); + goto done; +diff --git a/daemons/fenced/fenced_remote.c b/daemons/fenced/fenced_remote.c +index 963433bf3..358ea3aa7 100644 +--- a/daemons/fenced/fenced_remote.c ++++ b/daemons/fenced/fenced_remote.c +@@ -1758,7 +1758,7 @@ all_topology_devices_found(remote_fencing_op_t * op) + if (!tp) { + return FALSE; + } +- if (pcmk__strcase_any_of(op->action, "off", "reboot", NULL)) { ++ if (pcmk__is_fencing_action(op->action)) { + /* Don't count the devices on the target node if we are killing + * the target node. 
*/ + skip_target = TRUE; +diff --git a/lib/pacemaker/pcmk_graph_producer.c b/lib/pacemaker/pcmk_graph_producer.c +index ffcbd1274..5bec9d8ce 100644 +--- a/lib/pacemaker/pcmk_graph_producer.c ++++ b/lib/pacemaker/pcmk_graph_producer.c +@@ -721,7 +721,7 @@ add_downed_nodes(xmlNode *xml, const pe_action_t *action, + /* Fencing makes the action's node and any hosted guest nodes down */ + const char *fence = g_hash_table_lookup(action->meta, "stonith_action"); + +- if (pcmk__strcase_any_of(fence, "off", "reboot", NULL)) { ++ if (pcmk__is_fencing_action(fence)) { + xmlNode *downed = create_xml_node(xml, XML_GRAPH_TAG_DOWNED); + add_node_to_xml_by_id(action->node->details->id, downed); + pe_foreach_guest_node(data_set, action->node, add_node_to_xml, downed); +diff --git a/lib/pengine/common.c b/lib/pengine/common.c +index 236fc26b1..fe4223816 100644 +--- a/lib/pengine/common.c ++++ b/lib/pengine/common.c +@@ -27,12 +27,6 @@ check_health(const char *value) + "migrate-on-red", NULL); + } + +-static bool +-check_stonith_action(const char *value) +-{ +- return pcmk__strcase_any_of(value, "reboot", "poweroff", "off", NULL); +-} +- + static bool + check_placement_strategy(const char *value) + { +@@ -114,7 +108,7 @@ static pcmk__cluster_option_t pe_opts[] = { + }, + { + "stonith-action", NULL, "select", "reboot, off, poweroff", +- "reboot", check_stonith_action, ++ "reboot", pcmk__is_fencing_action, + "Action to send to fence device when a node needs to be fenced " + "(\"poweroff\" is a deprecated alias for \"off\")", + NULL +-- +2.27.0 + + +From c8f6e8a04c4fa4271db817af0a23aa941c9d7689 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Fri, 12 Nov 2021 17:42:21 -0600 +Subject: [PATCH 03/19] Refactor: fencing: rename type for peer query replies + +st_query_result_t contains the device information parsed from a peer's query +reply, but the name could easily be confused with the actual success/failure +result of the query action itself. Rename it to peer_device_info_t. 
+--- + daemons/fenced/fenced_remote.c | 103 +++++++++++++++++---------------- + 1 file changed, 52 insertions(+), 51 deletions(-) + +diff --git a/daemons/fenced/fenced_remote.c b/daemons/fenced/fenced_remote.c +index 358ea3aa7..9e2f62804 100644 +--- a/daemons/fenced/fenced_remote.c ++++ b/daemons/fenced/fenced_remote.c +@@ -41,7 +41,7 @@ + + /* When one fencer queries its peers for devices able to handle a fencing + * request, each peer will reply with a list of such devices available to it. +- * Each reply will be parsed into a st_query_result_t, with each device's ++ * Each reply will be parsed into a peer_device_info_t, with each device's + * information kept in a device_properties_t. + */ + +@@ -72,18 +72,19 @@ typedef struct st_query_result_s { + int ndevices; + /* Devices available to this host that are capable of fencing the target */ + GHashTable *devices; +-} st_query_result_t; ++} peer_device_info_t; + + GHashTable *stonith_remote_op_list = NULL; + +-void call_remote_stonith(remote_fencing_op_t * op, st_query_result_t * peer, int rc); ++void call_remote_stonith(remote_fencing_op_t *op, peer_device_info_t *peer, ++ int rc); + static void remote_op_done(remote_fencing_op_t * op, xmlNode * data, int rc, int dup); + extern xmlNode *stonith_create_op(int call_id, const char *token, const char *op, xmlNode * data, + int call_options); + + static void report_timeout_period(remote_fencing_op_t * op, int op_timeout); + static int get_op_total_timeout(const remote_fencing_op_t *op, +- const st_query_result_t *chosen_peer); ++ const peer_device_info_t *chosen_peer); + + static gint + sort_strings(gconstpointer a, gconstpointer b) +@@ -95,7 +96,7 @@ static void + free_remote_query(gpointer data) + { + if (data) { +- st_query_result_t *query = data; ++ peer_device_info_t *query = data; + + crm_trace("Free'ing query result from %s", query->host); + g_hash_table_destroy(query->devices); +@@ -150,8 +151,8 @@ count_peer_device(gpointer key, gpointer value, gpointer 
user_data) + * \return Number of devices available to peer that were not already executed + */ + static int +-count_peer_devices(const remote_fencing_op_t *op, const st_query_result_t *peer, +- gboolean verified_only) ++count_peer_devices(const remote_fencing_op_t *op, ++ const peer_device_info_t *peer, gboolean verified_only) + { + struct peer_count_data data; + +@@ -175,7 +176,7 @@ count_peer_devices(const remote_fencing_op_t *op, const st_query_result_t *peer, + * \return Device properties if found, NULL otherwise + */ + static device_properties_t * +-find_peer_device(const remote_fencing_op_t *op, const st_query_result_t *peer, ++find_peer_device(const remote_fencing_op_t *op, const peer_device_info_t *peer, + const char *device) + { + device_properties_t *props = g_hash_table_lookup(peer->devices, device); +@@ -196,7 +197,7 @@ find_peer_device(const remote_fencing_op_t *op, const st_query_result_t *peer, + * \return TRUE if device was found and marked, FALSE otherwise + */ + static gboolean +-grab_peer_device(const remote_fencing_op_t *op, st_query_result_t *peer, ++grab_peer_device(const remote_fencing_op_t *op, peer_device_info_t *peer, + const char *device, gboolean verified_devices_only) + { + device_properties_t *props = find_peer_device(op, peer, device); +@@ -1216,7 +1217,7 @@ enum find_best_peer_options { + FIND_PEER_VERIFIED_ONLY = 0x0004, + }; + +-static st_query_result_t * ++static peer_device_info_t * + find_best_peer(const char *device, remote_fencing_op_t * op, enum find_best_peer_options options) + { + GList *iter = NULL; +@@ -1227,7 +1228,7 @@ find_best_peer(const char *device, remote_fencing_op_t * op, enum find_best_peer + } + + for (iter = op->query_results; iter != NULL; iter = iter->next) { +- st_query_result_t *peer = iter->data; ++ peer_device_info_t *peer = iter->data; + + crm_trace("Testing result from %s targeting %s with %d device%s: %d %x", + peer->host, op->target, peer->ndevices, +@@ -1257,11 +1258,11 @@ find_best_peer(const char 
*device, remote_fencing_op_t * op, enum find_best_peer + return NULL; + } + +-static st_query_result_t * ++static peer_device_info_t * + stonith_choose_peer(remote_fencing_op_t * op) + { + const char *device = NULL; +- st_query_result_t *peer = NULL; ++ peer_device_info_t *peer = NULL; + uint32_t active = fencing_active_peers(); + + do { +@@ -1317,8 +1318,8 @@ stonith_choose_peer(remote_fencing_op_t * op) + } + + static int +-get_device_timeout(const remote_fencing_op_t *op, const st_query_result_t *peer, +- const char *device) ++get_device_timeout(const remote_fencing_op_t *op, ++ const peer_device_info_t *peer, const char *device) + { + device_properties_t *props; + +@@ -1338,7 +1339,7 @@ get_device_timeout(const remote_fencing_op_t *op, const st_query_result_t *peer, + + struct timeout_data { + const remote_fencing_op_t *op; +- const st_query_result_t *peer; ++ const peer_device_info_t *peer; + int total_timeout; + }; + +@@ -1365,7 +1366,7 @@ add_device_timeout(gpointer key, gpointer value, gpointer user_data) + } + + static int +-get_peer_timeout(const remote_fencing_op_t *op, const st_query_result_t *peer) ++get_peer_timeout(const remote_fencing_op_t *op, const peer_device_info_t *peer) + { + struct timeout_data timeout; + +@@ -1380,7 +1381,7 @@ get_peer_timeout(const remote_fencing_op_t *op, const st_query_result_t *peer) + + static int + get_op_total_timeout(const remote_fencing_op_t *op, +- const st_query_result_t *chosen_peer) ++ const peer_device_info_t *chosen_peer) + { + int total_timeout = 0; + stonith_topology_t *tp = find_topology_for_host(op->target); +@@ -1403,7 +1404,7 @@ get_op_total_timeout(const remote_fencing_op_t *op, + } + for (device_list = tp->levels[i]; device_list; device_list = device_list->next) { + for (iter = op->query_results; iter != NULL; iter = iter->next) { +- const st_query_result_t *peer = iter->data; ++ const peer_device_info_t *peer = iter->data; + + if (find_peer_device(op, peer, device_list->data)) { + total_timeout += 
get_device_timeout(op, peer, +@@ -1555,7 +1556,7 @@ check_watchdog_fencing_and_wait(remote_fencing_op_t * op) + } + + void +-call_remote_stonith(remote_fencing_op_t * op, st_query_result_t * peer, int rc) ++call_remote_stonith(remote_fencing_op_t *op, peer_device_info_t *peer, int rc) + { + const char *device = NULL; + int timeout = op->base_timeout; +@@ -1734,8 +1735,8 @@ call_remote_stonith(remote_fencing_op_t * op, st_query_result_t * peer, int rc) + static gint + sort_peers(gconstpointer a, gconstpointer b) + { +- const st_query_result_t *peer_a = a; +- const st_query_result_t *peer_b = b; ++ const peer_device_info_t *peer_a = a; ++ const peer_device_info_t *peer_b = b; + + return (peer_b->ndevices - peer_a->ndevices); + } +@@ -1768,7 +1769,7 @@ all_topology_devices_found(remote_fencing_op_t * op) + for (device = tp->levels[i]; device; device = device->next) { + match = NULL; + for (iter = op->query_results; iter && !match; iter = iter->next) { +- st_query_result_t *peer = iter->data; ++ peer_device_info_t *peer = iter->data; + + if (skip_target && pcmk__str_eq(peer->host, op->target, pcmk__str_casei)) { + continue; +@@ -1850,31 +1851,31 @@ parse_action_specific(xmlNode *xml, const char *peer, const char *device, + * + * \param[in] xml XML node containing device properties + * \param[in,out] op Operation that query and reply relate to +- * \param[in,out] result Peer's results ++ * \param[in,out] peer Peer's device information + * \param[in] device ID of device being parsed + */ + static void + add_device_properties(xmlNode *xml, remote_fencing_op_t *op, +- st_query_result_t *result, const char *device) ++ peer_device_info_t *peer, const char *device) + { + xmlNode *child; + int verified = 0; + device_properties_t *props = calloc(1, sizeof(device_properties_t)); + +- /* Add a new entry to this result's devices list */ ++ /* Add a new entry to this peer's devices list */ + CRM_ASSERT(props != NULL); +- g_hash_table_insert(result->devices, strdup(device), props); 
++ g_hash_table_insert(peer->devices, strdup(device), props); + + /* Peers with verified (monitored) access will be preferred */ + crm_element_value_int(xml, F_STONITH_DEVICE_VERIFIED, &verified); + if (verified) { + crm_trace("Peer %s has confirmed a verified device %s", +- result->host, device); ++ peer->host, device); + props->verified = TRUE; + } + + /* Parse action-specific device properties */ +- parse_action_specific(xml, result->host, device, op_requested_action(op), ++ parse_action_specific(xml, peer->host, device, op_requested_action(op), + op, st_phase_requested, props); + for (child = pcmk__xml_first_child(xml); child != NULL; + child = pcmk__xml_next(child)) { +@@ -1883,10 +1884,10 @@ add_device_properties(xmlNode *xml, remote_fencing_op_t *op, + * winds up getting remapped. + */ + if (pcmk__str_eq(ID(child), "off", pcmk__str_casei)) { +- parse_action_specific(child, result->host, device, "off", ++ parse_action_specific(child, peer->host, device, "off", + op, st_phase_off, props); + } else if (pcmk__str_eq(ID(child), "on", pcmk__str_casei)) { +- parse_action_specific(child, result->host, device, "on", ++ parse_action_specific(child, peer->host, device, "on", + op, st_phase_on, props); + } + } +@@ -1903,17 +1904,17 @@ add_device_properties(xmlNode *xml, remote_fencing_op_t *op, + * + * \return Newly allocated result structure with parsed reply + */ +-static st_query_result_t * ++static peer_device_info_t * + add_result(remote_fencing_op_t *op, const char *host, int ndevices, xmlNode *xml) + { +- st_query_result_t *result = calloc(1, sizeof(st_query_result_t)); ++ peer_device_info_t *peer = calloc(1, sizeof(peer_device_info_t)); + xmlNode *child; + + // cppcheck seems not to understand the abort logic in CRM_CHECK + // cppcheck-suppress memleak +- CRM_CHECK(result != NULL, return NULL); +- result->host = strdup(host); +- result->devices = pcmk__strkey_table(free, free); ++ CRM_CHECK(peer != NULL, return NULL); ++ peer->host = strdup(host); ++ 
peer->devices = pcmk__strkey_table(free, free); + + /* Each child element describes one capable device available to the peer */ + for (child = pcmk__xml_first_child(xml); child != NULL; +@@ -1921,17 +1922,17 @@ add_result(remote_fencing_op_t *op, const char *host, int ndevices, xmlNode *xml + const char *device = ID(child); + + if (device) { +- add_device_properties(child, op, result, device); ++ add_device_properties(child, op, peer, device); + } + } + +- result->ndevices = g_hash_table_size(result->devices); +- CRM_CHECK(ndevices == result->ndevices, ++ peer->ndevices = g_hash_table_size(peer->devices); ++ CRM_CHECK(ndevices == peer->ndevices, + crm_err("Query claimed to have %d device%s but %d found", +- ndevices, pcmk__plural_s(ndevices), result->ndevices)); ++ ndevices, pcmk__plural_s(ndevices), peer->ndevices)); + +- op->query_results = g_list_insert_sorted(op->query_results, result, sort_peers); +- return result; ++ op->query_results = g_list_insert_sorted(op->query_results, peer, sort_peers); ++ return peer; + } + + /*! +@@ -1957,7 +1958,7 @@ process_remote_stonith_query(xmlNode * msg) + const char *id = NULL; + const char *host = NULL; + remote_fencing_op_t *op = NULL; +- st_query_result_t *result = NULL; ++ peer_device_info_t *peer = NULL; + uint32_t replies_expected; + xmlNode *dev = get_xpath_object("//@" F_STONITH_REMOTE_OP_ID, msg, LOG_ERR); + +@@ -1991,7 +1992,7 @@ process_remote_stonith_query(xmlNode * msg) + op->replies, replies_expected, host, + op->target, op->action, ndevices, pcmk__plural_s(ndevices), id); + if (ndevices > 0) { +- result = add_result(op, host, ndevices, dev); ++ peer = add_result(op, host, ndevices, dev); + } + + if (pcmk_is_set(op->call_options, st_opt_topology)) { +@@ -2001,7 +2002,7 @@ process_remote_stonith_query(xmlNode * msg) + if (op->state == st_query && all_topology_devices_found(op)) { + /* All the query results are in for the topology, start the fencing ops. 
*/ + crm_trace("All topology devices found"); +- call_remote_stonith(op, result, pcmk_ok); ++ call_remote_stonith(op, peer, pcmk_ok); + + } else if (have_all_replies) { + crm_info("All topology query replies have arrived, continuing (%d expected/%d received) ", +@@ -2010,15 +2011,15 @@ process_remote_stonith_query(xmlNode * msg) + } + + } else if (op->state == st_query) { +- int nverified = count_peer_devices(op, result, TRUE); ++ int nverified = count_peer_devices(op, peer, TRUE); + + /* We have a result for a non-topology fencing op that looks promising, + * go ahead and start fencing before query timeout */ +- if (result && (host_is_target == FALSE) && nverified) { ++ if ((peer != NULL) && !host_is_target && nverified) { + /* we have a verified device living on a peer that is not the target */ + crm_trace("Found %d verified device%s", + nverified, pcmk__plural_s(nverified)); +- call_remote_stonith(op, result, pcmk_ok); ++ call_remote_stonith(op, peer, pcmk_ok); + + } else if (have_all_replies) { + crm_info("All query replies have arrived, continuing (%d expected/%d received) ", +@@ -2029,10 +2030,10 @@ process_remote_stonith_query(xmlNode * msg) + crm_trace("Waiting for more peer results before launching fencing operation"); + } + +- } else if (result && (op->state == st_done)) { ++ } else if ((peer != NULL) && (op->state == st_done)) { + crm_info("Discarding query result from %s (%d device%s): " +- "Operation is %s", result->host, +- result->ndevices, pcmk__plural_s(result->ndevices), ++ "Operation is %s", peer->host, ++ peer->ndevices, pcmk__plural_s(peer->ndevices), + stonith_op_state_str(op->state)); + } + +-- +2.27.0 + + +From 913e0620310089d2250e9ecde383df757f8e8063 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Thu, 11 Nov 2021 12:46:37 -0600 +Subject: [PATCH 04/19] Low: fencer: improve broadcasting replies for fenced + originators + +If the target of a fencing action was also the originator, the executioner +broadcasts the result on their behalf. 
+ +Previously, it would check if the action was not in a list of actions that are +never broadcasted. However we really only want to broadcast off/reboot results +so just check for that instead. + +This also rearranges reply creation slightly so we don't trace-log the reply +until it is fully created. +--- + daemons/fenced/fenced_commands.c | 19 +++++++++---------- + 1 file changed, 9 insertions(+), 10 deletions(-) + +diff --git a/daemons/fenced/fenced_commands.c b/daemons/fenced/fenced_commands.c +index 46c840f2a..e4185f6e1 100644 +--- a/daemons/fenced/fenced_commands.c ++++ b/daemons/fenced/fenced_commands.c +@@ -2385,32 +2385,31 @@ send_async_reply(async_command_t *cmd, const pcmk__action_result_t *result, + int pid, bool merged) + { + xmlNode *reply = NULL; +- gboolean bcast = FALSE; ++ bool bcast = false; + + CRM_CHECK((cmd != NULL) && (result != NULL), return); + + reply = construct_async_reply(cmd, result); + +- // Only replies for certain actions are broadcast +- if (pcmk__str_any_of(cmd->action, "metadata", "monitor", "list", "status", +- NULL)) { +- crm_trace("Never broadcast '%s' replies", cmd->action); ++ // If target was also the originator, broadcast fencing results for it ++ if (!stand_alone && pcmk__is_fencing_action(cmd->action) ++ && pcmk__str_eq(cmd->origin, cmd->victim, pcmk__str_casei)) { + +- } else if (!stand_alone && pcmk__str_eq(cmd->origin, cmd->victim, pcmk__str_casei) && !pcmk__str_eq(cmd->action, "on", pcmk__str_casei)) { +- crm_trace("Broadcast '%s' reply for %s", cmd->action, cmd->victim); ++ crm_trace("Broadcast '%s' result for %s (target was also originator)", ++ cmd->action, cmd->victim); + crm_xml_add(reply, F_SUBTYPE, "broadcast"); +- bcast = TRUE; ++ crm_xml_add(reply, F_STONITH_OPERATION, T_STONITH_NOTIFY); ++ bcast = true; + } + + log_async_result(cmd, result, pid, NULL, merged); +- crm_log_xml_trace(reply, "Reply"); + + if (merged) { + crm_xml_add(reply, F_STONITH_MERGED, "true"); + } ++ crm_log_xml_trace(reply, "Reply"); + + 
if (bcast) { +- crm_xml_add(reply, F_STONITH_OPERATION, T_STONITH_NOTIFY); + send_cluster_message(NULL, crm_msg_stonith_ng, reply, FALSE); + + } else if (cmd->origin) { +-- +2.27.0 + + +From 8b8f94fd9ca5e61922cb81e32c8a3d0f1d75fb0b Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Thu, 11 Nov 2021 14:40:49 -0600 +Subject: [PATCH 05/19] Refactor: fencer: avoid code duplication when sending + async reply + +... and clean up reply function +--- + daemons/fenced/fenced_commands.c | 33 ++++++++++++++++++-------------- + 1 file changed, 19 insertions(+), 14 deletions(-) + +diff --git a/daemons/fenced/fenced_commands.c b/daemons/fenced/fenced_commands.c +index e4185f6e1..4ea0a337a 100644 +--- a/daemons/fenced/fenced_commands.c ++++ b/daemons/fenced/fenced_commands.c +@@ -2411,15 +2411,8 @@ send_async_reply(async_command_t *cmd, const pcmk__action_result_t *result, + + if (bcast) { + send_cluster_message(NULL, crm_msg_stonith_ng, reply, FALSE); +- +- } else if (cmd->origin) { +- crm_trace("Directed reply to %s", cmd->origin); +- send_cluster_message(crm_get_peer(0, cmd->origin), crm_msg_stonith_ng, reply, FALSE); +- + } else { +- crm_trace("Directed local %ssync reply to %s", +- (cmd->options & st_opt_sync_call) ? "" : "a-", cmd->client_name); +- do_local_reply(reply, cmd->client, cmd->options & st_opt_sync_call, FALSE); ++ stonith_send_reply(reply, cmd->options, cmd->origin, cmd->client); + } + + if (stand_alone) { +@@ -2814,16 +2807,28 @@ check_alternate_host(const char *target) + return alternate_host; + } + ++/*! 
++ * \internal ++ * \brief Send a reply to a CPG peer or IPC client ++ * ++ * \param[in] reply XML reply to send ++ * \param[in] call_options Send synchronously if st_opt_sync_call is set here ++ * \param[in] remote_peer If not NULL, name of peer node to send CPG reply ++ * \param[in] client_id If not NULL, name of client to send IPC reply ++ */ + static void +-stonith_send_reply(xmlNode * reply, int call_options, const char *remote_peer, ++stonith_send_reply(xmlNode *reply, int call_options, const char *remote_peer, + const char *client_id) + { +- if (remote_peer) { +- send_cluster_message(crm_get_peer(0, remote_peer), crm_msg_stonith_ng, reply, FALSE); +- } else { ++ CRM_CHECK((reply != NULL) && ((remote_peer != NULL) || (client_id != NULL)), ++ return); ++ ++ if (remote_peer == NULL) { + do_local_reply(reply, client_id, +- pcmk_is_set(call_options, st_opt_sync_call), +- (remote_peer != NULL)); ++ pcmk_is_set(call_options, st_opt_sync_call), FALSE); ++ } else { ++ send_cluster_message(crm_get_peer(0, remote_peer), crm_msg_stonith_ng, ++ reply, FALSE); + } + } + +-- +2.27.0 + + +From 2cdbda58f0e9f38a0e302506107fd933cb415144 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Tue, 23 Nov 2021 17:24:09 -0600 +Subject: [PATCH 06/19] Refactor: fencer: ensure all requests get clean-up + +handle_request() has if-else blocks for each type of request. Previously, if a +request didn't need a reply, the function would do any clean-up needed and +return immediately. Now, we track whether a reply is needed, and all request +types flow to the end of the function for consistent clean-up. + +This doesn't change any behavior at this point, but allows us to do more at the +end of request handling. 
+--- + daemons/fenced/fenced_commands.c | 46 ++++++++++++++++++-------------- + 1 file changed, 26 insertions(+), 20 deletions(-) + +diff --git a/daemons/fenced/fenced_commands.c b/daemons/fenced/fenced_commands.c +index 4ea0a337a..19477b49b 100644 +--- a/daemons/fenced/fenced_commands.c ++++ b/daemons/fenced/fenced_commands.c +@@ -2892,6 +2892,7 @@ handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, + + xmlNode *data = NULL; + xmlNode *reply = NULL; ++ bool need_reply = true; + + char *output = NULL; + const char *op = crm_element_value(request, F_STONITH_OPERATION); +@@ -2921,10 +2922,12 @@ handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, + pcmk__ipc_send_xml(client, id, reply, flags); + client->request_id = 0; + free_xml(reply); +- return 0; ++ rc = pcmk_ok; ++ need_reply = false; + + } else if (pcmk__str_eq(op, STONITH_OP_EXEC, pcmk__str_none)) { + rc = stonith_device_action(request, &output); ++ need_reply = (rc != -EINPROGRESS); + + } else if (pcmk__str_eq(op, STONITH_OP_TIMEOUT_UPDATE, pcmk__str_none)) { + const char *call_id = crm_element_value(request, F_STONITH_CALLID); +@@ -2933,7 +2936,8 @@ handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, + + crm_element_value_int(request, F_STONITH_TIMEOUT, &op_timeout); + do_stonith_async_timeout_update(client_id, call_id, op_timeout); +- return 0; ++ rc = pcmk_ok; ++ need_reply = false; + + } else if (pcmk__str_eq(op, STONITH_OP_QUERY, pcmk__str_none)) { + if (remote_peer) { +@@ -2944,7 +2948,8 @@ handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, + remove_relay_op(request); + + stonith_query(request, remote_peer, client_id, call_options); +- return 0; ++ rc = pcmk_ok; ++ need_reply = false; + + } else if (pcmk__str_eq(op, T_STONITH_NOTIFY, pcmk__str_none)) { + const char *flag_name = NULL; +@@ -2965,7 +2970,8 @@ handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, + } + + pcmk__ipc_send_ack(client, id, flags, "ack", CRM_EX_OK); +- 
return 0; ++ rc = pcmk_ok; ++ need_reply = false; + + } else if (pcmk__str_eq(op, STONITH_OP_RELAY, pcmk__str_none)) { + xmlNode *dev = get_xpath_object("//@" F_STONITH_TARGET, request, LOG_TRACE); +@@ -2977,8 +2983,11 @@ handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, + crm_element_value(dev, F_STONITH_ACTION), + crm_element_value(dev, F_STONITH_TARGET)); + +- if (initiate_remote_stonith_op(NULL, request, FALSE) != NULL) { ++ if (initiate_remote_stonith_op(NULL, request, FALSE) == NULL) { ++ rc = -EPROTO; ++ } else { + rc = -EINPROGRESS; ++ need_reply = false; + } + + } else if (pcmk__str_eq(op, STONITH_OP_FENCE, pcmk__str_none)) { +@@ -3012,7 +3021,7 @@ handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, + crm_element_value_int(dev, F_STONITH_TOLERANCE, &tolerance); + + if (stonith_check_fence_tolerance(tolerance, target, action)) { +- rc = 0; ++ rc = pcmk_ok; + goto done; + } + +@@ -3047,10 +3056,13 @@ handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, + FALSE); + rc = -EINPROGRESS; + +- } else if (initiate_remote_stonith_op(client, request, FALSE) != NULL) { ++ } else if (initiate_remote_stonith_op(client, request, FALSE) == NULL) { ++ rc = -EPROTO; ++ } else { + rc = -EINPROGRESS; + } + } ++ need_reply = (rc != -EINPROGRESS); + + } else if (pcmk__str_eq(op, STONITH_OP_FENCE_HISTORY, pcmk__str_none)) { + rc = stonith_fence_history(request, &data, remote_peer, call_options); +@@ -3058,8 +3070,8 @@ handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, + /* we don't expect answers to the broadcast + * we might have sent out + */ +- free_xml(data); +- return pcmk_ok; ++ rc = pcmk_ok; ++ need_reply = false; + } + + } else if (pcmk__str_eq(op, STONITH_OP_DEVICE_ADD, pcmk__str_none)) { +@@ -3111,8 +3123,8 @@ handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, + crm_element_value_int(request, XML_ATTR_ID, &node_id); + name = crm_element_value(request, XML_ATTR_UNAME); + 
reap_crm_member(node_id, name); +- +- return pcmk_ok; ++ rc = pcmk_ok; ++ need_reply = false; + + } else { + crm_err("Unknown IPC request %s from %s %s", op, +@@ -3120,20 +3132,14 @@ handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, + ((client == NULL)? remote_peer : pcmk__client_name(client))); + } + +- done: +- ++done: + if (rc == -EACCES) { + crm_warn("Rejecting IPC request '%s' from unprivileged client %s", + crm_str(op), pcmk__client_name(client)); + } + +- /* Always reply unless the request is in process still. +- * If in progress, a reply will happen async after the request +- * processing is finished */ +- if (rc != -EINPROGRESS) { +- crm_trace("Reply handling: %p %u %u %d %d %s", client, client?client->request_id:0, +- id, pcmk_is_set(call_options, st_opt_sync_call), call_options, +- crm_element_value(request, F_STONITH_CALLOPTS)); ++ // Reply if result is known ++ if (need_reply) { + + if (pcmk_is_set(call_options, st_opt_sync_call)) { + CRM_ASSERT(client == NULL || client->request_id == id); +-- +2.27.0 + + +From 067d655ebd3fbb0ed27f4e7426db4c3b661ba777 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Tue, 23 Nov 2021 17:26:32 -0600 +Subject: [PATCH 07/19] Log: fencer: improve debug logs when processing CPG/IPC + messages + +By moving the result log messages from stonith_command() to handle_reply() and +handle_request(), we can simplify stonith_command() and give slightly better +messages. 
+--- + daemons/fenced/fenced_commands.c | 80 +++++++++++++++----------------- + 1 file changed, 38 insertions(+), 42 deletions(-) + +diff --git a/daemons/fenced/fenced_commands.c b/daemons/fenced/fenced_commands.c +index 19477b49b..98af0e04f 100644 +--- a/daemons/fenced/fenced_commands.c ++++ b/daemons/fenced/fenced_commands.c +@@ -2883,7 +2883,7 @@ remove_relay_op(xmlNode * request) + } + } + +-static int ++static void + handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, + xmlNode *request, const char *remote_peer) + { +@@ -3152,73 +3152,69 @@ done: + free_xml(data); + free_xml(reply); + +- return rc; ++ crm_debug("Processed %s request from %s %s: %s (rc=%d)", ++ op, ((client == NULL)? "peer" : "client"), ++ ((client == NULL)? remote_peer : pcmk__client_name(client)), ++ ((rc > 0)? "" : pcmk_strerror(rc)), rc); + } + + static void + handle_reply(pcmk__client_t *client, xmlNode *request, const char *remote_peer) + { +- const char *op = crm_element_value(request, F_STONITH_OPERATION); ++ // Copy, because request might be freed before we want to log this ++ char *op = crm_element_value_copy(request, F_STONITH_OPERATION); + + if (pcmk__str_eq(op, STONITH_OP_QUERY, pcmk__str_none)) { + process_remote_stonith_query(request); +- } else if (pcmk__str_eq(op, T_STONITH_NOTIFY, pcmk__str_none)) { +- process_remote_stonith_exec(request); +- } else if (pcmk__str_eq(op, STONITH_OP_FENCE, pcmk__str_none)) { +- /* Reply to a complex fencing op */ ++ } else if (pcmk__str_any_of(op, T_STONITH_NOTIFY, STONITH_OP_FENCE, NULL)) { + process_remote_stonith_exec(request); + } else { +- crm_err("Unknown %s reply from %s %s", op, +- ((client == NULL)? "peer" : "client"), ++ crm_err("Ignoring unknown %s reply from %s %s", ++ crm_str(op), ((client == NULL)? "peer" : "client"), + ((client == NULL)? 
remote_peer : pcmk__client_name(client))); + crm_log_xml_warn(request, "UnknownOp"); ++ free(op); ++ return; + } ++ crm_debug("Processed %s reply from %s %s", ++ op, ((client == NULL)? "peer" : "client"), ++ ((client == NULL)? remote_peer : pcmk__client_name(client))); ++ free(op); + } + ++/*! ++ * \internal ++ * \brief Handle a message from an IPC client or CPG peer ++ * ++ * \param[in] client If not NULL, IPC client that sent message ++ * \param[in] id If from IPC client, IPC message ID ++ * \param[in] flags Message flags ++ * \param[in] message Message XML ++ * \param[in] remote_peer If not NULL, CPG peer that sent message ++ */ + void + stonith_command(pcmk__client_t *client, uint32_t id, uint32_t flags, +- xmlNode *request, const char *remote_peer) ++ xmlNode *message, const char *remote_peer) + { +- int call_options = 0; +- int rc = 0; +- gboolean is_reply = FALSE; +- +- /* Copy op for reporting. The original might get freed by handle_reply() +- * before we use it in crm_debug(): +- * handle_reply() +- * |- process_remote_stonith_exec() +- * |-- remote_op_done() +- * |--- handle_local_reply_and_notify() +- * |---- crm_xml_add(...F_STONITH_OPERATION...) +- * |--- free_xml(op->request) +- */ +- char *op = crm_element_value_copy(request, F_STONITH_OPERATION); +- +- if (get_xpath_object("//" T_STONITH_REPLY, request, LOG_NEVER)) { +- is_reply = TRUE; +- } ++ int call_options = st_opt_none; ++ bool is_reply = get_xpath_object("//" T_STONITH_REPLY, message, ++ LOG_NEVER) != NULL; + +- crm_element_value_int(request, F_STONITH_CALLOPTS, &call_options); +- crm_debug("Processing %s%s %u from %s %s with call options 0x%08x", +- op, (is_reply? " reply" : ""), id, ++ crm_element_value_int(message, F_STONITH_CALLOPTS, &call_options); ++ crm_debug("Processing %ssynchronous %s %s %u from %s %s", ++ pcmk_is_set(call_options, st_opt_sync_call)? "" : "a", ++ crm_element_value(message, F_STONITH_OPERATION), ++ (is_reply? "reply" : "request"), id, + ((client == NULL)? 
"peer" : "client"), +- ((client == NULL)? remote_peer : pcmk__client_name(client)), +- call_options); ++ ((client == NULL)? remote_peer : pcmk__client_name(client))); + + if (pcmk_is_set(call_options, st_opt_sync_call)) { + CRM_ASSERT(client == NULL || client->request_id == id); + } + + if (is_reply) { +- handle_reply(client, request, remote_peer); ++ handle_reply(client, message, remote_peer); + } else { +- rc = handle_request(client, id, flags, request, remote_peer); ++ handle_request(client, id, flags, message, remote_peer); + } +- +- crm_debug("Processed %s%s from %s %s: %s (rc=%d)", +- op, (is_reply? " reply" : ""), +- ((client == NULL)? "peer" : "client"), +- ((client == NULL)? remote_peer : pcmk__client_name(client)), +- ((rc > 0)? "" : pcmk_strerror(rc)), rc); +- free(op); + } +-- +2.27.0 + + +From 44cb340c11b4652f452a47eb2b0050b4a459382b Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Mon, 15 Nov 2021 16:29:09 -0600 +Subject: [PATCH 08/19] Refactor: fencer: drop unused argument from + notification functions + +--- + daemons/fenced/fenced_commands.c | 12 ++++++------ + daemons/fenced/fenced_history.c | 6 +++--- + daemons/fenced/fenced_remote.c | 6 +++--- + daemons/fenced/pacemaker-fenced.c | 18 +++++++++--------- + daemons/fenced/pacemaker-fenced.h | 6 +++--- + 5 files changed, 24 insertions(+), 24 deletions(-) + +diff --git a/daemons/fenced/fenced_commands.c b/daemons/fenced/fenced_commands.c +index 98af0e04f..946ce4042 100644 +--- a/daemons/fenced/fenced_commands.c ++++ b/daemons/fenced/fenced_commands.c +@@ -2428,8 +2428,8 @@ send_async_reply(async_command_t *cmd, const pcmk__action_result_t *result, + crm_xml_add(notify_data, F_STONITH_REMOTE_OP_ID, cmd->remote_op_id); + crm_xml_add(notify_data, F_STONITH_ORIGIN, cmd->client); + +- do_stonith_notify(0, T_STONITH_NOTIFY_FENCE, rc, notify_data); +- do_stonith_notify(0, T_STONITH_NOTIFY_HISTORY, 0, NULL); ++ do_stonith_notify(T_STONITH_NOTIFY_FENCE, rc, notify_data); ++ 
do_stonith_notify(T_STONITH_NOTIFY_HISTORY, pcmk_ok, NULL); + } + + free_xml(reply); +@@ -3082,7 +3082,7 @@ handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, + } else { + rc = -EACCES; + } +- do_stonith_notify_device(call_options, op, rc, device_id); ++ do_stonith_notify_device(op, rc, device_id); + + } else if (pcmk__str_eq(op, STONITH_OP_DEVICE_DEL, pcmk__str_none)) { + xmlNode *dev = get_xpath_object("//" F_STONITH_DEVICE, request, LOG_ERR); +@@ -3093,7 +3093,7 @@ handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, + } else { + rc = -EACCES; + } +- do_stonith_notify_device(call_options, op, rc, device_id); ++ do_stonith_notify_device(op, rc, device_id); + + } else if (pcmk__str_eq(op, STONITH_OP_LEVEL_ADD, pcmk__str_none)) { + char *device_id = NULL; +@@ -3103,7 +3103,7 @@ handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, + } else { + rc = -EACCES; + } +- do_stonith_notify_level(call_options, op, rc, device_id); ++ do_stonith_notify_level(op, rc, device_id); + free(device_id); + + } else if (pcmk__str_eq(op, STONITH_OP_LEVEL_DEL, pcmk__str_none)) { +@@ -3114,7 +3114,7 @@ handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, + } else { + rc = -EACCES; + } +- do_stonith_notify_level(call_options, op, rc, device_id); ++ do_stonith_notify_level(op, rc, device_id); + + } else if(pcmk__str_eq(op, CRM_OP_RM_NODE_CACHE, pcmk__str_casei)) { + int node_id = 0; +diff --git a/daemons/fenced/fenced_history.c b/daemons/fenced/fenced_history.c +index 1ba034ba9..7127593b6 100644 +--- a/daemons/fenced/fenced_history.c ++++ b/daemons/fenced/fenced_history.c +@@ -100,7 +100,7 @@ stonith_fence_history_cleanup(const char *target, + g_hash_table_foreach_remove(stonith_remote_op_list, + stonith_remove_history_entry, + (gpointer) target); +- do_stonith_notify(0, T_STONITH_NOTIFY_HISTORY, 0, NULL); ++ do_stonith_notify(T_STONITH_NOTIFY_HISTORY, pcmk_ok, NULL); + } + } + +@@ -396,7 +396,7 @@ 
stonith_local_history_diff_and_merge(GHashTable *remote_history, + + if (updated) { + stonith_fence_history_trim(); +- do_stonith_notify(0, T_STONITH_NOTIFY_HISTORY, 0, NULL); ++ do_stonith_notify(T_STONITH_NOTIFY_HISTORY, pcmk_ok, NULL); + } + + if (cnt == 0) { +@@ -470,7 +470,7 @@ stonith_fence_history(xmlNode *msg, xmlNode **output, + is done so send a notification for anything + that smells like history-sync + */ +- do_stonith_notify(0, T_STONITH_NOTIFY_HISTORY_SYNCED, 0, NULL); ++ do_stonith_notify(T_STONITH_NOTIFY_HISTORY_SYNCED, pcmk_ok, NULL); + if (crm_element_value(msg, F_STONITH_CALLID)) { + /* this is coming from the stonith-API + * +diff --git a/daemons/fenced/fenced_remote.c b/daemons/fenced/fenced_remote.c +index 9e2f62804..c907cd120 100644 +--- a/daemons/fenced/fenced_remote.c ++++ b/daemons/fenced/fenced_remote.c +@@ -423,8 +423,8 @@ handle_local_reply_and_notify(remote_fencing_op_t * op, xmlNode * data, int rc) + do_local_reply(reply, op->client_id, op->call_options & st_opt_sync_call, FALSE); + + /* bcast to all local clients that the fencing operation happend */ +- do_stonith_notify(0, T_STONITH_NOTIFY_FENCE, rc, notify_data); +- do_stonith_notify(0, T_STONITH_NOTIFY_HISTORY, 0, NULL); ++ do_stonith_notify(T_STONITH_NOTIFY_FENCE, rc, notify_data); ++ do_stonith_notify(T_STONITH_NOTIFY_HISTORY, pcmk_ok, NULL); + + /* mark this op as having notify's already sent */ + op->notify_sent = TRUE; +@@ -1119,7 +1119,7 @@ create_remote_stonith_op(const char *client, xmlNode * request, gboolean peer) + + if (op->state != st_duplicate) { + /* kick history readers */ +- do_stonith_notify(0, T_STONITH_NOTIFY_HISTORY, 0, NULL); ++ do_stonith_notify(T_STONITH_NOTIFY_HISTORY, pcmk_ok, NULL); + } + + /* safe to trim as long as that doesn't touch pending ops */ +diff --git a/daemons/fenced/pacemaker-fenced.c b/daemons/fenced/pacemaker-fenced.c +index a64004ce1..a290e1670 100644 +--- a/daemons/fenced/pacemaker-fenced.c ++++ b/daemons/fenced/pacemaker-fenced.c +@@ 
-357,7 +357,7 @@ do_stonith_async_timeout_update(const char *client_id, const char *call_id, int + } + + void +-do_stonith_notify(int options, const char *type, int result, xmlNode * data) ++do_stonith_notify(const char *type, int result, xmlNode *data) + { + /* TODO: Standardize the contents of data */ + xmlNode *update_msg = create_xml_node(NULL, "notify"); +@@ -380,7 +380,7 @@ do_stonith_notify(int options, const char *type, int result, xmlNode * data) + } + + static void +-do_stonith_notify_config(int options, const char *op, int rc, ++do_stonith_notify_config(const char *op, int rc, + const char *desc, int active) + { + xmlNode *notify_data = create_xml_node(NULL, op); +@@ -390,20 +390,20 @@ do_stonith_notify_config(int options, const char *op, int rc, + crm_xml_add(notify_data, F_STONITH_DEVICE, desc); + crm_xml_add_int(notify_data, F_STONITH_ACTIVE, active); + +- do_stonith_notify(options, op, rc, notify_data); ++ do_stonith_notify(op, rc, notify_data); + free_xml(notify_data); + } + + void +-do_stonith_notify_device(int options, const char *op, int rc, const char *desc) ++do_stonith_notify_device(const char *op, int rc, const char *desc) + { +- do_stonith_notify_config(options, op, rc, desc, g_hash_table_size(device_list)); ++ do_stonith_notify_config(op, rc, desc, g_hash_table_size(device_list)); + } + + void +-do_stonith_notify_level(int options, const char *op, int rc, const char *desc) ++do_stonith_notify_level(const char *op, int rc, const char *desc) + { +- do_stonith_notify_config(options, op, rc, desc, g_hash_table_size(topology)); ++ do_stonith_notify_config(op, rc, desc, g_hash_table_size(topology)); + } + + static void +@@ -418,7 +418,7 @@ topology_remove_helper(const char *node, int level) + crm_xml_add(data, XML_ATTR_STONITH_TARGET, node); + + rc = stonith_level_remove(data, &desc); +- do_stonith_notify_level(0, STONITH_OP_LEVEL_DEL, rc, desc); ++ do_stonith_notify_level(STONITH_OP_LEVEL_DEL, rc, desc); + + free_xml(data); + free(desc); +@@ 
-468,7 +468,7 @@ handle_topology_change(xmlNode *match, bool remove) + } + + rc = stonith_level_register(match, &desc); +- do_stonith_notify_level(0, STONITH_OP_LEVEL_ADD, rc, desc); ++ do_stonith_notify_level(STONITH_OP_LEVEL_ADD, rc, desc); + + free(desc); + } +diff --git a/daemons/fenced/pacemaker-fenced.h b/daemons/fenced/pacemaker-fenced.h +index a64b57693..3e41d867e 100644 +--- a/daemons/fenced/pacemaker-fenced.h ++++ b/daemons/fenced/pacemaker-fenced.h +@@ -233,9 +233,9 @@ xmlNode *stonith_construct_reply(xmlNode * request, const char *output, xmlNode + void + do_stonith_async_timeout_update(const char *client, const char *call_id, int timeout); + +-void do_stonith_notify(int options, const char *type, int result, xmlNode * data); +-void do_stonith_notify_device(int options, const char *op, int rc, const char *desc); +-void do_stonith_notify_level(int options, const char *op, int rc, const char *desc); ++void do_stonith_notify(const char *type, int result, xmlNode *data); ++void do_stonith_notify_device(const char *op, int rc, const char *desc); ++void do_stonith_notify_level(const char *op, int rc, const char *desc); + + remote_fencing_op_t *initiate_remote_stonith_op(pcmk__client_t *client, + xmlNode *request, +-- +2.27.0 + + +From a49df4901b663b3366634c1d58f04625ecba4005 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Tue, 16 Nov 2021 11:57:14 -0600 +Subject: [PATCH 09/19] Refactor: fencer: functionize checking for privileged + client + +... for readability and to make planned changes easier +--- + daemons/fenced/fenced_commands.c | 49 +++++++++++++++++++------------- + 1 file changed, 30 insertions(+), 19 deletions(-) + +diff --git a/daemons/fenced/fenced_commands.c b/daemons/fenced/fenced_commands.c +index 946ce4042..34c956f5c 100644 +--- a/daemons/fenced/fenced_commands.c ++++ b/daemons/fenced/fenced_commands.c +@@ -2883,6 +2883,32 @@ remove_relay_op(xmlNode * request) + } + } + ++/*! 
++ * \internal ++ * \brief Check whether an API request was sent by a privileged user ++ * ++ * API commands related to fencing configuration may be done only by privileged ++ * IPC users (i.e. root or hacluster), because all other users should go through ++ * the CIB to have ACLs applied. If no client was given, this is a peer request, ++ * which is always allowed. ++ * ++ * \param[in] c IPC client that sent request (or NULL if sent by CPG peer) ++ * \param[in] op Requested API operation (for logging only) ++ * ++ * \return true if sender is peer or privileged client, otherwise false ++ */ ++static inline bool ++is_privileged(pcmk__client_t *c, const char *op) ++{ ++ if ((c == NULL) || pcmk_is_set(c->flags, pcmk__client_privileged)) { ++ return true; ++ } else { ++ crm_warn("Rejecting IPC request '%s' from unprivileged client %s", ++ crm_str(op), pcmk__client_name(c)); ++ return false; ++ } ++} ++ + static void + handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, + xmlNode *request, const char *remote_peer) +@@ -2898,15 +2924,6 @@ handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, + const char *op = crm_element_value(request, F_STONITH_OPERATION); + const char *client_id = crm_element_value(request, F_STONITH_CLIENTID); + +- /* IPC commands related to fencing configuration may be done only by +- * privileged users (i.e. root or hacluster), because all other users should +- * go through the CIB to have ACLs applied. +- * +- * If no client was given, this is a peer request, which is always allowed. 
+- */ +- bool allowed = (client == NULL) +- || pcmk_is_set(client->flags, pcmk__client_privileged); +- + crm_element_value_int(request, F_STONITH_CALLOPTS, &call_options); + + if (pcmk_is_set(call_options, st_opt_sync_call)) { +@@ -3077,7 +3094,7 @@ handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, + } else if (pcmk__str_eq(op, STONITH_OP_DEVICE_ADD, pcmk__str_none)) { + const char *device_id = NULL; + +- if (allowed) { ++ if (is_privileged(client, op)) { + rc = stonith_device_register(request, &device_id, FALSE); + } else { + rc = -EACCES; +@@ -3088,7 +3105,7 @@ handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, + xmlNode *dev = get_xpath_object("//" F_STONITH_DEVICE, request, LOG_ERR); + const char *device_id = crm_element_value(dev, XML_ATTR_ID); + +- if (allowed) { ++ if (is_privileged(client, op)) { + rc = stonith_device_remove(device_id, FALSE); + } else { + rc = -EACCES; +@@ -3098,7 +3115,7 @@ handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, + } else if (pcmk__str_eq(op, STONITH_OP_LEVEL_ADD, pcmk__str_none)) { + char *device_id = NULL; + +- if (allowed) { ++ if (is_privileged(client, op)) { + rc = stonith_level_register(request, &device_id); + } else { + rc = -EACCES; +@@ -3109,7 +3126,7 @@ handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, + } else if (pcmk__str_eq(op, STONITH_OP_LEVEL_DEL, pcmk__str_none)) { + char *device_id = NULL; + +- if (allowed) { ++ if (is_privileged(client, op)) { + rc = stonith_level_remove(request, &device_id); + } else { + rc = -EACCES; +@@ -3133,14 +3150,8 @@ handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, + } + + done: +- if (rc == -EACCES) { +- crm_warn("Rejecting IPC request '%s' from unprivileged client %s", +- crm_str(op), pcmk__client_name(client)); +- } +- + // Reply if result is known + if (need_reply) { +- + if (pcmk_is_set(call_options, st_opt_sync_call)) { + CRM_ASSERT(client == NULL || client->request_id == id); + } +-- 
+2.27.0 + + +From 10ca8a5ef5266159bc3f993802aeae6537ceeb11 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Tue, 16 Nov 2021 16:59:03 -0600 +Subject: [PATCH 10/19] Low: fencer: return -ETIME for peer fencing timeouts + +94c55684 set the result as pcmk_ok, but it appears that the intent was just to +keep the delegate from being set, and -ETIME should still do that, while being +more appropriate. +--- + daemons/fenced/fenced_remote.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/daemons/fenced/fenced_remote.c b/daemons/fenced/fenced_remote.c +index c907cd120..dc7b802da 100644 +--- a/daemons/fenced/fenced_remote.c ++++ b/daemons/fenced/fenced_remote.c +@@ -608,7 +608,7 @@ remote_op_timeout_one(gpointer userdata) + + crm_notice("Peer's '%s' action targeting %s for client %s timed out " CRM_XS + " id=%.8s", op->action, op->target, op->client_name, op->id); +- call_remote_stonith(op, NULL, pcmk_ok); ++ call_remote_stonith(op, NULL, -ETIME); + return FALSE; + } + +-- +2.27.0 + + +From fb2eefeb695cc92e1a2aed6f1f1d2b900d4fb83e Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Tue, 16 Nov 2021 17:54:56 -0600 +Subject: [PATCH 11/19] Refactor: fencer: functionize common part of timeout + handling + +Previously, remote_op_timeout() was called from multiple places, but only one +of those places needed the full processing. The common part is now in a new +function finalize_timed_out_op() called from all the places, and +remote_op_timeout() now has just the additional processing needed by the one +place plus a call to the new function. + +This will allow a future change to set a different exit reason depending on +which step timed out. 
+--- + daemons/fenced/fenced_remote.c | 49 +++++++++++++++++++++++----------- + 1 file changed, 34 insertions(+), 15 deletions(-) + +diff --git a/daemons/fenced/fenced_remote.c b/daemons/fenced/fenced_remote.c +index dc7b802da..22c4b0772 100644 +--- a/daemons/fenced/fenced_remote.c ++++ b/daemons/fenced/fenced_remote.c +@@ -612,20 +612,18 @@ remote_op_timeout_one(gpointer userdata) + return FALSE; + } + +-static gboolean +-remote_op_timeout(gpointer userdata) ++/*! ++ * \internal ++ * \brief Finalize a remote fencer operation that timed out ++ * ++ * \param[in] op Fencer operation that timed out ++ */ ++static void ++finalize_timed_out_op(remote_fencing_op_t *op) + { +- remote_fencing_op_t *op = userdata; + + op->op_timer_total = 0; + +- if (op->state == st_done) { +- crm_debug("Action '%s' targeting %s for client %s already completed " +- CRM_XS " id=%.8s", +- op->action, op->target, op->client_name, op->id); +- return FALSE; +- } +- + crm_debug("Action '%s' targeting %s for client %s timed out " + CRM_XS " id=%.8s", + op->action, op->target, op->client_name, op->id); +@@ -637,14 +635,35 @@ remote_op_timeout(gpointer userdata) + */ + op->state = st_done; + remote_op_done(op, NULL, pcmk_ok, FALSE); +- return FALSE; ++ return; + } + + op->state = st_failed; + + remote_op_done(op, NULL, -ETIME, FALSE); ++} + +- return FALSE; ++/*! 
++ * \internal ++ * \brief Finalize a remote fencer operation that timed out ++ * ++ * \param[in] userdata Fencer operation that timed out ++ * ++ * \return G_SOURCE_REMOVE (which tells glib not to restart timer) ++ */ ++static gboolean ++remote_op_timeout(gpointer userdata) ++{ ++ remote_fencing_op_t *op = userdata; ++ ++ if (op->state == st_done) { ++ crm_debug("Action '%s' targeting %s for client %s already completed " ++ CRM_XS " id=%.8s", ++ op->action, op->target, op->client_name, op->id); ++ } else { ++ finalize_timed_out_op(userdata); ++ } ++ return G_SOURCE_REMOVE; + } + + static gboolean +@@ -670,7 +689,7 @@ remote_op_query_timeout(gpointer data) + g_source_remove(op->op_timer_total); + op->op_timer_total = 0; + } +- remote_op_timeout(op); ++ finalize_timed_out_op(op); + } + + return FALSE; +@@ -1675,8 +1694,8 @@ call_remote_stonith(remote_fencing_op_t *op, peer_device_info_t *peer, int rc) + crm_info("No remaining peers capable of fencing (%s) %s for client %s " + CRM_XS " state=%s", op->action, op->target, op->client_name, + stonith_op_state_str(op->state)); +- CRM_LOG_ASSERT(op->state < st_done); +- remote_op_timeout(op); ++ CRM_CHECK(op->state < st_done, return); ++ finalize_timed_out_op(op); + + } else if(op->replies >= op->replies_expected || op->replies >= fencing_active_peers()) { + // int rc = -EHOSTUNREACH; +-- +2.27.0 + + +From c047005a112ac7da5ba62084e39c79db739f0923 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Thu, 18 Nov 2021 10:05:18 -0600 +Subject: [PATCH 12/19] Low: fencer: handle malformed manual confirmation + requests better + +Rename stonith_manual_ack() to fenced_handle_manual_confirmation(), and move +more of the manual confirmation handling in handle_request() into it, for +better code isolation. This will also make planned changes easier. + +The one behavioral difference is that a failure of initiate_remote_stonith_op() +will now be ignored rather than segmentation fault trying to dereference NULL. 
+--- + daemons/fenced/fenced_commands.c | 20 ++++++++++++-------- + daemons/fenced/fenced_remote.c | 29 ++++++++++++++++++++++++----- + daemons/fenced/pacemaker-fenced.h | 2 +- + 3 files changed, 37 insertions(+), 14 deletions(-) + +diff --git a/daemons/fenced/fenced_commands.c b/daemons/fenced/fenced_commands.c +index 34c956f5c..6f325b9e8 100644 +--- a/daemons/fenced/fenced_commands.c ++++ b/daemons/fenced/fenced_commands.c +@@ -3012,14 +3012,18 @@ handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, + if (remote_peer || stand_alone) { + rc = stonith_fence(request); + +- } else if (call_options & st_opt_manual_ack) { +- remote_fencing_op_t *rop = NULL; +- xmlNode *dev = get_xpath_object("//@" F_STONITH_TARGET, request, LOG_TRACE); +- const char *target = crm_element_value(dev, F_STONITH_TARGET); +- +- crm_notice("Received manual confirmation that %s is fenced", target); +- rop = initiate_remote_stonith_op(client, request, TRUE); +- rc = stonith_manual_ack(request, rop); ++ } else if (pcmk_is_set(call_options, st_opt_manual_ack)) { ++ switch (fenced_handle_manual_confirmation(client, request)) { ++ case pcmk_rc_ok: ++ rc = pcmk_ok; ++ break; ++ case EINPROGRESS: ++ rc = -EINPROGRESS; ++ break; ++ default: ++ rc = -EPROTO; ++ break; ++ } + + } else { + const char *alternate_host = NULL; +diff --git a/daemons/fenced/fenced_remote.c b/daemons/fenced/fenced_remote.c +index 22c4b0772..60ee5e32e 100644 +--- a/daemons/fenced/fenced_remote.c ++++ b/daemons/fenced/fenced_remote.c +@@ -1003,22 +1003,41 @@ static uint32_t fencing_active_peers(void) + return count; + } + ++/*! 
++ * \internal ++ * \brief Process a manual confirmation of a pending fence action ++ * ++ * \param[in] client IPC client that sent confirmation ++ * \param[in] msg Request XML with manual confirmation ++ * ++ * \return Standard Pacemaker return code ++ */ + int +-stonith_manual_ack(xmlNode * msg, remote_fencing_op_t * op) ++fenced_handle_manual_confirmation(pcmk__client_t *client, xmlNode *msg) + { ++ remote_fencing_op_t *op = NULL; + xmlNode *dev = get_xpath_object("//@" F_STONITH_TARGET, msg, LOG_ERR); + ++ CRM_CHECK(dev != NULL, return EPROTO); ++ ++ crm_notice("Received manual confirmation that %s has been fenced", ++ crm_str(crm_element_value(dev, F_STONITH_TARGET))); ++ op = initiate_remote_stonith_op(client, msg, TRUE); ++ if (op == NULL) { ++ return EPROTO; ++ } + op->state = st_done; + set_fencing_completed(op); + op->delegate = strdup("a human"); + +- crm_notice("Injecting manual confirmation that %s is safely off/down", +- crm_element_value(dev, F_STONITH_TARGET)); ++ // For the fencer's purposes, the fencing operation is done + + remote_op_done(op, msg, pcmk_ok, FALSE); + +- // Replies are sent via done_cb -> send_async_reply() -> do_local_reply() +- return -EINPROGRESS; ++ /* For the requester's purposes, the operation is still pending. The ++ * actual result will be sent asynchronously via the operation's done_cb(). ++ */ ++ return EINPROGRESS; + } + + /*! 
+diff --git a/daemons/fenced/pacemaker-fenced.h b/daemons/fenced/pacemaker-fenced.h +index 3e41d867e..cf88644f1 100644 +--- a/daemons/fenced/pacemaker-fenced.h ++++ b/daemons/fenced/pacemaker-fenced.h +@@ -256,7 +256,7 @@ bool fencing_peer_active(crm_node_t *peer); + + void set_fencing_completed(remote_fencing_op_t * op); + +-int stonith_manual_ack(xmlNode * msg, remote_fencing_op_t * op); ++int fenced_handle_manual_confirmation(pcmk__client_t *client, xmlNode *msg); + + gboolean node_has_attr(const char *node, const char *name, const char *value); + +-- +2.27.0 + + +From ec60f014b5a8f774aa57a26e40a2b1b94a7e3d3a Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Thu, 18 Nov 2021 10:35:31 -0600 +Subject: [PATCH 13/19] Low: fencer: handle malformed topology level removal + requests better + +Log the malformed request, and return -EPROTO instead of -EINVAL. If a request +is missing a level number, treat it as malformed instead of as a request to +remove all. +--- + daemons/fenced/fenced_commands.c | 18 +++++++++--------- + 1 file changed, 9 insertions(+), 9 deletions(-) + +diff --git a/daemons/fenced/fenced_commands.c b/daemons/fenced/fenced_commands.c +index 6f325b9e8..358844203 100644 +--- a/daemons/fenced/fenced_commands.c ++++ b/daemons/fenced/fenced_commands.c +@@ -1678,27 +1678,27 @@ stonith_level_register(xmlNode *msg, char **desc) + int + stonith_level_remove(xmlNode *msg, char **desc) + { +- int id = 0; ++ int id = -1; + stonith_topology_t *tp; + char *target; + + /* Unlike additions, removal requests should always have one level tag */ + xmlNode *level = get_xpath_object("//" XML_TAG_FENCING_LEVEL, msg, LOG_ERR); + +- CRM_CHECK(level != NULL, return -EINVAL); ++ CRM_CHECK(level != NULL, return -EPROTO); + + target = stonith_level_key(level, -1); + crm_element_value_int(level, XML_ATTR_STONITH_INDEX, &id); ++ ++ CRM_CHECK((id >= 0) && (id < ST_LEVEL_MAX), ++ crm_log_xml_warn(msg, "invalid level"); ++ free(target); ++ return -EPROTO); ++ + if (desc) { + *desc 
= crm_strdup_printf("%s[%d]", target, id); + } + +- /* Sanity-check arguments */ +- if (id >= ST_LEVEL_MAX) { +- free(target); +- return -EINVAL; +- } +- + tp = g_hash_table_lookup(topology, target); + if (tp == NULL) { + guint nentries = g_hash_table_size(topology); +@@ -1714,7 +1714,7 @@ stonith_level_remove(xmlNode *msg, char **desc) + "(%d active %s remaining)", target, nentries, + pcmk__plural_alt(nentries, "entry", "entries")); + +- } else if (id > 0 && tp->levels[id] != NULL) { ++ } else if (tp->levels[id] != NULL) { + guint nlevels; + + g_list_free_full(tp->levels[id], free); +-- +2.27.0 + + +From ee0cfb6b284c2d6d21f8e77bf6ff286b1364235d Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Thu, 18 Nov 2021 12:33:05 -0600 +Subject: [PATCH 14/19] Refactor: fencer: avoid obscuring a variable + +handle_request() declared a xmlNode *reply variable, and then one of its "if" +blocks defined another one, obscuring the first. Drop the first declaration, +and instead move it to the one other place that needed it. + +Also remove a redundant assertion. 
+--- + daemons/fenced/fenced_commands.c | 13 +++++-------- + 1 file changed, 5 insertions(+), 8 deletions(-) + +diff --git a/daemons/fenced/fenced_commands.c b/daemons/fenced/fenced_commands.c +index 358844203..af0a92450 100644 +--- a/daemons/fenced/fenced_commands.c ++++ b/daemons/fenced/fenced_commands.c +@@ -2917,7 +2917,6 @@ handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, + int rc = -EOPNOTSUPP; + + xmlNode *data = NULL; +- xmlNode *reply = NULL; + bool need_reply = true; + + char *output = NULL; +@@ -2926,8 +2925,8 @@ handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, + + crm_element_value_int(request, F_STONITH_CALLOPTS, &call_options); + +- if (pcmk_is_set(call_options, st_opt_sync_call)) { +- CRM_ASSERT(client == NULL || client->request_id == id); ++ if (pcmk_is_set(call_options, st_opt_sync_call) && (client != NULL)) { ++ CRM_ASSERT(client->request_id == id); + } + + if (pcmk__str_eq(op, CRM_OP_REGISTER, pcmk__str_none)) { +@@ -3156,16 +3155,14 @@ handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, + done: + // Reply if result is known + if (need_reply) { +- if (pcmk_is_set(call_options, st_opt_sync_call)) { +- CRM_ASSERT(client == NULL || client->request_id == id); +- } +- reply = stonith_construct_reply(request, output, data, rc); ++ xmlNode *reply = stonith_construct_reply(request, output, data, rc); ++ + stonith_send_reply(reply, call_options, remote_peer, client_id); ++ free_xml(reply); + } + + free(output); + free_xml(data); +- free_xml(reply); + + crm_debug("Processed %s request from %s %s: %s (rc=%d)", + op, ((client == NULL)? 
"peer" : "client"), +-- +2.27.0 + + +From a5fef7b95b7541860e29c1ff33be38db327208fb Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Thu, 18 Nov 2021 12:37:10 -0600 +Subject: [PATCH 15/19] Refactor: fencer: add convenience function for setting + protocol error result + +The fencer will soon track and return the full result (rather than just a +legacy return code) for fencing actions, for callbacks and notifications. +To simplify that process as well as move away from the legacy codes in general, +all fencer API operations will be modified to return a full result. + +This convenience function will come in handy for that. +--- + daemons/fenced/pacemaker-fenced.h | 7 +++++++ + 1 file changed, 7 insertions(+) + +diff --git a/daemons/fenced/pacemaker-fenced.h b/daemons/fenced/pacemaker-fenced.h +index cf88644f1..3bc5dc3d1 100644 +--- a/daemons/fenced/pacemaker-fenced.h ++++ b/daemons/fenced/pacemaker-fenced.h +@@ -262,6 +262,13 @@ gboolean node_has_attr(const char *node, const char *name, const char *value); + + gboolean node_does_watchdog_fencing(const char *node); + ++static inline void ++fenced_set_protocol_error(pcmk__action_result_t *result) ++{ ++ pcmk__set_result(result, CRM_EX_PROTOCOL, PCMK_EXEC_INVALID, ++ "Fencer API request missing required information (bug?)"); ++} ++ + extern char *stonith_our_uname; + extern gboolean stand_alone; + extern GHashTable *device_list; +-- +2.27.0 + + +From ed770d36fb34dc7b3344cd326830a6c06cc789ce Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Fri, 19 Nov 2021 09:59:51 -0600 +Subject: [PATCH 16/19] Refactor: fencer: make a few functions return void + +... to make planned changes easier. The return values were previously ignored. 
+--- + daemons/fenced/fenced_commands.c | 17 ++++++++------- + daemons/fenced/fenced_history.c | 6 +----- + daemons/fenced/fenced_remote.c | 35 ++++++++++++++----------------- + daemons/fenced/pacemaker-fenced.c | 6 +++--- + daemons/fenced/pacemaker-fenced.h | 8 +++---- + 5 files changed, 33 insertions(+), 39 deletions(-) + +diff --git a/daemons/fenced/fenced_commands.c b/daemons/fenced/fenced_commands.c +index af0a92450..ea7d281ce 100644 +--- a/daemons/fenced/fenced_commands.c ++++ b/daemons/fenced/fenced_commands.c +@@ -1411,8 +1411,8 @@ stonith_device_register(xmlNode * msg, const char **desc, gboolean from_cib) + return pcmk_ok; + } + +-int +-stonith_device_remove(const char *id, gboolean from_cib) ++void ++stonith_device_remove(const char *id, bool from_cib) + { + stonith_device_t *device = g_hash_table_lookup(device_list, id); + guint ndevices = 0; +@@ -1421,7 +1421,7 @@ stonith_device_remove(const char *id, gboolean from_cib) + ndevices = g_hash_table_size(device_list); + crm_info("Device '%s' not found (%d active device%s)", + id, ndevices, pcmk__plural_s(ndevices)); +- return pcmk_ok; ++ return; + } + + if (from_cib) { +@@ -1443,7 +1443,6 @@ stonith_device_remove(const char *id, gboolean from_cib) + (device->cib_registered? " cib" : ""), + (device->api_registered? " api" : "")); + } +- return pcmk_ok; + } + + /*! 
+@@ -3085,8 +3084,9 @@ handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, + need_reply = (rc != -EINPROGRESS); + + } else if (pcmk__str_eq(op, STONITH_OP_FENCE_HISTORY, pcmk__str_none)) { +- rc = stonith_fence_history(request, &data, remote_peer, call_options); +- if (call_options & st_opt_discard_reply) { ++ stonith_fence_history(request, &data, remote_peer, call_options); ++ rc = pcmk_ok; ++ if (pcmk_is_set(call_options, st_opt_discard_reply)) { + /* we don't expect answers to the broadcast + * we might have sent out + */ +@@ -3109,7 +3109,8 @@ handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, + const char *device_id = crm_element_value(dev, XML_ATTR_ID); + + if (is_privileged(client, op)) { +- rc = stonith_device_remove(device_id, FALSE); ++ stonith_device_remove(device_id, false); ++ rc = pcmk_ok; + } else { + rc = -EACCES; + } +@@ -3179,7 +3180,7 @@ handle_reply(pcmk__client_t *client, xmlNode *request, const char *remote_peer) + if (pcmk__str_eq(op, STONITH_OP_QUERY, pcmk__str_none)) { + process_remote_stonith_query(request); + } else if (pcmk__str_any_of(op, T_STONITH_NOTIFY, STONITH_OP_FENCE, NULL)) { +- process_remote_stonith_exec(request); ++ fenced_process_fencing_reply(request); + } else { + crm_err("Ignoring unknown %s reply from %s %s", + crm_str(op), ((client == NULL)? 
"peer" : "client"), +diff --git a/daemons/fenced/fenced_history.c b/daemons/fenced/fenced_history.c +index 7127593b6..bc159383c 100644 +--- a/daemons/fenced/fenced_history.c ++++ b/daemons/fenced/fenced_history.c +@@ -433,14 +433,11 @@ stonith_local_history(gboolean add_id, const char *target) + * a reply from + * \param[in] remote_peer + * \param[in] options call-options from the request +- * +- * \return always success as there is actully nothing that can go really wrong + */ +-int ++void + stonith_fence_history(xmlNode *msg, xmlNode **output, + const char *remote_peer, int options) + { +- int rc = 0; + const char *target = NULL; + xmlNode *dev = get_xpath_object("//@" F_STONITH_TARGET, msg, LOG_NEVER); + xmlNode *out_history = NULL; +@@ -525,5 +522,4 @@ stonith_fence_history(xmlNode *msg, xmlNode **output, + *output = stonith_local_history(FALSE, target); + } + free_xml(out_history); +- return rc; + } +diff --git a/daemons/fenced/fenced_remote.c b/daemons/fenced/fenced_remote.c +index 60ee5e32e..6338aebde 100644 +--- a/daemons/fenced/fenced_remote.c ++++ b/daemons/fenced/fenced_remote.c +@@ -2086,11 +2086,9 @@ process_remote_stonith_query(xmlNode * msg) + * or attempt another device as appropriate. 
+ * + * \param[in] msg XML reply received +- * +- * \return pcmk_ok on success, -errno on error + */ +-int +-process_remote_stonith_exec(xmlNode * msg) ++void ++fenced_process_fencing_reply(xmlNode *msg) + { + int rc = 0; + const char *id = NULL; +@@ -2098,13 +2096,13 @@ process_remote_stonith_exec(xmlNode * msg) + remote_fencing_op_t *op = NULL; + xmlNode *dev = get_xpath_object("//@" F_STONITH_REMOTE_OP_ID, msg, LOG_ERR); + +- CRM_CHECK(dev != NULL, return -EPROTO); ++ CRM_CHECK(dev != NULL, return); + + id = crm_element_value(dev, F_STONITH_REMOTE_OP_ID); +- CRM_CHECK(id != NULL, return -EPROTO); ++ CRM_CHECK(id != NULL, return); + + dev = get_xpath_object("//@" F_STONITH_RC, msg, LOG_ERR); +- CRM_CHECK(dev != NULL, return -EPROTO); ++ CRM_CHECK(dev != NULL, return); + + crm_element_value_int(dev, F_STONITH_RC, &rc); + +@@ -2125,35 +2123,35 @@ process_remote_stonith_exec(xmlNode * msg) + /* Could be for an event that began before we started */ + /* TODO: Record the op for later querying */ + crm_info("Received peer result of unknown or expired operation %s", id); +- return -EOPNOTSUPP; ++ return; + } + + if (op->devices && device && !pcmk__str_eq(op->devices->data, device, pcmk__str_casei)) { + crm_err("Received outdated reply for device %s (instead of %s) to " + "fence (%s) %s. 
Operation already timed out at peer level.", + device, (const char *) op->devices->data, op->action, op->target); +- return rc; ++ return; + } + + if (pcmk__str_eq(crm_element_value(msg, F_SUBTYPE), "broadcast", pcmk__str_casei)) { + crm_debug("Finalizing action '%s' targeting %s on behalf of %s@%s: %s " +- CRM_XS " rc=%d id=%.8s", ++ CRM_XS " id=%.8s", + op->action, op->target, op->client_name, op->originator, +- pcmk_strerror(rc), rc, op->id); ++ pcmk_strerror(rc), op->id); + if (rc == pcmk_ok) { + op->state = st_done; + } else { + op->state = st_failed; + } + remote_op_done(op, msg, rc, FALSE); +- return pcmk_ok; ++ return; + } else if (!pcmk__str_eq(op->originator, stonith_our_uname, pcmk__str_casei)) { + /* If this isn't a remote level broadcast, and we are not the + * originator of the operation, we should not be receiving this msg. */ + crm_err("Received non-broadcast fencing result for operation %.8s " + "we do not own (device %s targeting %s)", + op->id, device, op->target); +- return rc; ++ return; + } + + if (pcmk_is_set(op->call_options, st_opt_topology)) { +@@ -2168,7 +2166,7 @@ process_remote_stonith_exec(xmlNode * msg) + * and notify our local clients. */ + if (op->state == st_done) { + remote_op_done(op, msg, rc, FALSE); +- return rc; ++ return; + } + + if ((op->phase == 2) && (rc != pcmk_ok)) { +@@ -2184,14 +2182,14 @@ process_remote_stonith_exec(xmlNode * msg) + /* An operation completed successfully. Try another device if + * necessary, otherwise mark the operation as done. */ + advance_topology_device_in_level(op, device, msg, rc); +- return rc; ++ return; + } else { + /* This device failed, time to try another topology level. If no other + * levels are available, mark this operation as failed and report results. 
*/ + if (advance_topology_level(op, false) != pcmk_rc_ok) { + op->state = st_failed; + remote_op_done(op, msg, rc, FALSE); +- return rc; ++ return; + } + } + } else if (rc == pcmk_ok && op->devices == NULL) { +@@ -2199,12 +2197,12 @@ process_remote_stonith_exec(xmlNode * msg) + + op->state = st_done; + remote_op_done(op, msg, rc, FALSE); +- return rc; ++ return; + } else if (rc == -ETIME && op->devices == NULL) { + /* If the operation timed out don't bother retrying other peers. */ + op->state = st_failed; + remote_op_done(op, msg, rc, FALSE); +- return rc; ++ return; + } else { + /* fall-through and attempt other fencing action using another peer */ + } +@@ -2213,7 +2211,6 @@ process_remote_stonith_exec(xmlNode * msg) + crm_trace("Next for %s on behalf of %s@%s (rc was %d)", op->target, op->originator, + op->client_name, rc); + call_remote_stonith(op, NULL, rc); +- return rc; + } + + gboolean +diff --git a/daemons/fenced/pacemaker-fenced.c b/daemons/fenced/pacemaker-fenced.c +index a290e1670..0a8b3bf6f 100644 +--- a/daemons/fenced/pacemaker-fenced.c ++++ b/daemons/fenced/pacemaker-fenced.c +@@ -445,7 +445,7 @@ remove_cib_device(xmlXPathObjectPtr xpathObj) + + rsc_id = crm_element_value(match, XML_ATTR_ID); + +- stonith_device_remove(rsc_id, TRUE); ++ stonith_device_remove(rsc_id, true); + } + } + +@@ -610,7 +610,7 @@ watchdog_device_update(void) + } else { + /* be silent if no device - todo parameter to stonith_device_remove */ + if (g_hash_table_lookup(device_list, STONITH_WATCHDOG_ID)) { +- stonith_device_remove(STONITH_WATCHDOG_ID, TRUE); ++ stonith_device_remove(STONITH_WATCHDOG_ID, true); + } + } + } +@@ -847,7 +847,7 @@ update_cib_stonith_devices_v2(const char *event, xmlNode * msg) + } + if (search != NULL) { + *search = 0; +- stonith_device_remove(rsc_id, TRUE); ++ stonith_device_remove(rsc_id, true); + /* watchdog_device_update called afterwards + to fall back to implicit definition if needed */ + } else { +diff --git a/daemons/fenced/pacemaker-fenced.h 
b/daemons/fenced/pacemaker-fenced.h +index 3bc5dc3d1..5162ada75 100644 +--- a/daemons/fenced/pacemaker-fenced.h ++++ b/daemons/fenced/pacemaker-fenced.h +@@ -214,7 +214,7 @@ void stonith_command(pcmk__client_t *client, uint32_t id, uint32_t flags, + + int stonith_device_register(xmlNode * msg, const char **desc, gboolean from_cib); + +-int stonith_device_remove(const char *id, gboolean from_cib); ++void stonith_device_remove(const char *id, bool from_cib); + + char *stonith_level_key(xmlNode * msg, int mode); + int stonith_level_kind(xmlNode * msg); +@@ -241,14 +241,14 @@ remote_fencing_op_t *initiate_remote_stonith_op(pcmk__client_t *client, + xmlNode *request, + gboolean manual_ack); + +-int process_remote_stonith_exec(xmlNode * msg); ++void fenced_process_fencing_reply(xmlNode *msg); + + int process_remote_stonith_query(xmlNode * msg); + + void *create_remote_stonith_op(const char *client, xmlNode * request, gboolean peer); + +-int stonith_fence_history(xmlNode *msg, xmlNode **output, +- const char *remote_peer, int options); ++void stonith_fence_history(xmlNode *msg, xmlNode **output, ++ const char *remote_peer, int options); + + void stonith_fence_history_trim(void); + +-- +2.27.0 + + +From 27df49460930738e77f5ca42536aff1d3bdfcae7 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Fri, 19 Nov 2021 10:06:43 -0600 +Subject: [PATCH 17/19] Refactor: fencer: drop unnecessary argument when + advancing topology device + +If we're advancing to the next device in a topology level, by necessity that +means any previous device succeeded. 
+--- + daemons/fenced/fenced_remote.c | 19 +++++++++---------- + 1 file changed, 9 insertions(+), 10 deletions(-) + +diff --git a/daemons/fenced/fenced_remote.c b/daemons/fenced/fenced_remote.c +index 6338aebde..d54e6a4ef 100644 +--- a/daemons/fenced/fenced_remote.c ++++ b/daemons/fenced/fenced_remote.c +@@ -1519,14 +1519,13 @@ report_timeout_period(remote_fencing_op_t * op, int op_timeout) + * \internal + * \brief Advance an operation to the next device in its topology + * +- * \param[in,out] op Operation to advance +- * \param[in] device ID of device just completed +- * \param[in] msg XML reply that contained device result (if available) +- * \param[in] rc Return code of device's execution ++ * \param[in] op Fencer operation to advance ++ * \param[in] device ID of device that just completed ++ * \param[in] msg If not NULL, XML reply of last delegated fencing operation + */ + static void + advance_topology_device_in_level(remote_fencing_op_t *op, const char *device, +- xmlNode *msg, int rc) ++ xmlNode *msg) + { + /* Advance to the next device at this topology level, if any */ + if (op->devices) { +@@ -1556,8 +1555,8 @@ advance_topology_device_in_level(remote_fencing_op_t *op, const char *device, + + if (op->devices) { + /* Necessary devices remain, so execute the next one */ +- crm_trace("Next targeting %s on behalf of %s@%s (rc was %d)", +- op->target, op->client_name, op->originator, rc); ++ crm_trace("Next targeting %s on behalf of %s@%s", ++ op->target, op->client_name, op->originator); + + // The requested delay has been applied for the first device + if (op->delay > 0) { +@@ -1570,7 +1569,7 @@ advance_topology_device_in_level(remote_fencing_op_t *op, const char *device, + crm_trace("Marking complex fencing op targeting %s as complete", + op->target); + op->state = st_done; +- remote_op_done(op, msg, rc, FALSE); ++ remote_op_done(op, msg, pcmk_ok, FALSE); + } + } + +@@ -1701,7 +1700,7 @@ call_remote_stonith(remote_fencing_op_t *op, peer_device_info_t *peer, 
int rc) + */ + crm_warn("Ignoring %s 'on' failure (no capable peers) targeting %s " + "after successful 'off'", device, op->target); +- advance_topology_device_in_level(op, device, NULL, pcmk_ok); ++ advance_topology_device_in_level(op, device, NULL); + return; + + } else if (op->owner == FALSE) { +@@ -2181,7 +2180,7 @@ fenced_process_fencing_reply(xmlNode *msg) + if (rc == pcmk_ok) { + /* An operation completed successfully. Try another device if + * necessary, otherwise mark the operation as done. */ +- advance_topology_device_in_level(op, device, msg, rc); ++ advance_topology_device_in_level(op, device, msg); + return; + } else { + /* This device failed, time to try another topology level. If no other +-- +2.27.0 + + +From 05437e1339bc1f9071b43e97d5846a939687951d Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Mon, 29 Nov 2021 11:59:17 -0600 +Subject: [PATCH 18/19] Refactor: fencer: minor renames for consistency + +... per review +--- + daemons/fenced/fenced_remote.c | 13 ++++++------- + 1 file changed, 6 insertions(+), 7 deletions(-) + +diff --git a/daemons/fenced/fenced_remote.c b/daemons/fenced/fenced_remote.c +index d54e6a4ef..8feb40147 100644 +--- a/daemons/fenced/fenced_remote.c ++++ b/daemons/fenced/fenced_remote.c +@@ -63,7 +63,7 @@ typedef struct device_properties_s { + int delay_base[st_phase_max]; + } device_properties_t; + +-typedef struct st_query_result_s { ++typedef struct { + /* Name of peer that sent this result */ + char *host; + /* Only try peers for non-topology based operations once */ +@@ -95,13 +95,12 @@ sort_strings(gconstpointer a, gconstpointer b) + static void + free_remote_query(gpointer data) + { +- if (data) { +- peer_device_info_t *query = data; ++ if (data != NULL) { ++ peer_device_info_t *peer = data; + +- crm_trace("Free'ing query result from %s", query->host); +- g_hash_table_destroy(query->devices); +- free(query->host); +- free(query); ++ g_hash_table_destroy(peer->devices); ++ free(peer->host); ++ free(peer); + } + } + 
+-- +2.27.0 + + +From 86974d7cef05bafbed540d02e59514292581ae65 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Tue, 30 Nov 2021 08:33:41 -0600 +Subject: [PATCH 19/19] Refactor: fencer: simplify send_async_reply() + +... as suggested in review +--- + daemons/fenced/fenced_commands.c | 28 ++++++++++++---------------- + 1 file changed, 12 insertions(+), 16 deletions(-) + +diff --git a/daemons/fenced/fenced_commands.c b/daemons/fenced/fenced_commands.c +index ea7d281ce..f34cb4f13 100644 +--- a/daemons/fenced/fenced_commands.c ++++ b/daemons/fenced/fenced_commands.c +@@ -2384,36 +2384,34 @@ send_async_reply(async_command_t *cmd, const pcmk__action_result_t *result, + int pid, bool merged) + { + xmlNode *reply = NULL; +- bool bcast = false; + + CRM_CHECK((cmd != NULL) && (result != NULL), return); + ++ log_async_result(cmd, result, pid, NULL, merged); ++ + reply = construct_async_reply(cmd, result); ++ if (merged) { ++ crm_xml_add(reply, F_STONITH_MERGED, "true"); ++ } + +- // If target was also the originator, broadcast fencing results for it + if (!stand_alone && pcmk__is_fencing_action(cmd->action) + && pcmk__str_eq(cmd->origin, cmd->victim, pcmk__str_casei)) { +- ++ /* The target was also the originator, so broadcast the result on its ++ * behalf (since it will be unable to). 
++ */ + crm_trace("Broadcast '%s' result for %s (target was also originator)", + cmd->action, cmd->victim); + crm_xml_add(reply, F_SUBTYPE, "broadcast"); + crm_xml_add(reply, F_STONITH_OPERATION, T_STONITH_NOTIFY); +- bcast = true; +- } +- +- log_async_result(cmd, result, pid, NULL, merged); +- +- if (merged) { +- crm_xml_add(reply, F_STONITH_MERGED, "true"); +- } +- crm_log_xml_trace(reply, "Reply"); +- +- if (bcast) { + send_cluster_message(NULL, crm_msg_stonith_ng, reply, FALSE); + } else { ++ // Reply only to the originator + stonith_send_reply(reply, cmd->options, cmd->origin, cmd->client); + } + ++ crm_log_xml_trace(reply, "Reply"); ++ free_xml(reply); ++ + if (stand_alone) { + /* Do notification with a clean data object */ + xmlNode *notify_data = create_xml_node(NULL, T_STONITH_NOTIFY_FENCE); +@@ -2430,8 +2428,6 @@ send_async_reply(async_command_t *cmd, const pcmk__action_result_t *result, + do_stonith_notify(T_STONITH_NOTIFY_FENCE, rc, notify_data); + do_stonith_notify(T_STONITH_NOTIFY_HISTORY, pcmk_ok, NULL); + } +- +- free_xml(reply); + } + + static void +-- +2.27.0 + diff --git a/SOURCES/006-crm_simulate.patch b/SOURCES/006-crm_simulate.patch deleted file mode 100644 index c8d4e3f..0000000 --- a/SOURCES/006-crm_simulate.patch +++ /dev/null @@ -1,896 +0,0 @@ -From 97571e6ccc9b7fa339a7e27d9b0b9ab782ff3003 Mon Sep 17 00:00:00 2001 -From: Chris Lumens -Date: Wed, 16 Jun 2021 13:54:10 -0400 -Subject: [PATCH 1/5] Low: schemas: Copy crm_mon.rng in preparation for - changes. 
- ---- - xml/api/crm_mon-2.12.rng | 243 +++++++++++++++++++++++++++++++++++++++++++++++ - 1 file changed, 243 insertions(+) - create mode 100644 xml/api/crm_mon-2.12.rng - -diff --git a/xml/api/crm_mon-2.12.rng b/xml/api/crm_mon-2.12.rng -new file mode 100644 -index 0000000..ffec923 ---- /dev/null -+++ b/xml/api/crm_mon-2.12.rng -@@ -0,0 +1,243 @@ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ granted -+ revoked -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ --- -1.8.3.1 - - -From da394983f106f974274ddd94675a04c85086010e Mon Sep 17 00:00:00 2001 -From: Chris Lumens -Date: Fri, 18 Jun 2021 15:06:34 -0400 -Subject: [PATCH 2/5] Refactor: Split node history out into its own XML schema. - -This allows for sharing it between crm_mon and crm_simulate. 
---- - xml/Makefile.am | 2 +- - xml/api/crm_mon-2.12.rng | 64 +-------------------------------------- - xml/api/node-history-2.12.rng | 70 +++++++++++++++++++++++++++++++++++++++++++ - 3 files changed, 72 insertions(+), 64 deletions(-) - create mode 100644 xml/api/node-history-2.12.rng - -diff --git a/xml/Makefile.am b/xml/Makefile.am -index b9448d4..8e7b6d3 100644 ---- a/xml/Makefile.am -+++ b/xml/Makefile.am -@@ -64,7 +64,7 @@ API_request_base = command-output \ - CIB_cfg_base = options nodes resources constraints fencing acls tags alerts - - # Names of all schemas (including top level and those included by others) --API_base = $(API_request_base) fence-event failure generic-list item node-attrs nodes resources status -+API_base = $(API_request_base) fence-event failure generic-list item node-attrs node-history nodes resources status - CIB_base = cib $(CIB_cfg_base) status score rule nvset - - # Static schema files and transforms (only CIB has transforms) -diff --git a/xml/api/crm_mon-2.12.rng b/xml/api/crm_mon-2.12.rng -index ffec923..be14412 100644 ---- a/xml/api/crm_mon-2.12.rng -+++ b/xml/api/crm_mon-2.12.rng -@@ -20,7 +20,7 @@ - - - -- -+ - - - -@@ -113,14 +113,6 @@ - - - -- -- -- -- -- -- -- -- - - - -@@ -156,60 +148,6 @@ - - - -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- - - - -diff --git a/xml/api/node-history-2.12.rng b/xml/api/node-history-2.12.rng -new file mode 100644 -index 0000000..9628000 ---- /dev/null -+++ b/xml/api/node-history-2.12.rng -@@ -0,0 +1,70 @@ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ --- -1.8.3.1 - - -From bf72b2615630eef7876e443d60b34d5a316de847 Mon Sep 17 00:00:00 2001 -From: Chris Lumens -Date: Wed, 16 Jun 2021 14:09:31 -0400 -Subject: [PATCH 3/5] Low: 
schemas: Copy crm_simulate.rng in preparation for - changes. - ---- - xml/api/crm_simulate-2.12.rng | 335 ++++++++++++++++++++++++++++++++++++++++++ - 1 file changed, 335 insertions(+) - create mode 100644 xml/api/crm_simulate-2.12.rng - -diff --git a/xml/api/crm_simulate-2.12.rng b/xml/api/crm_simulate-2.12.rng -new file mode 100644 -index 0000000..9a7612d ---- /dev/null -+++ b/xml/api/crm_simulate-2.12.rng -@@ -0,0 +1,335 @@ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ --- -1.8.3.1 - - -From c46e07788788acf5669e3f89b9344190a91c7331 Mon Sep 17 00:00:00 2001 -From: Chris Lumens -Date: Fri, 18 Jun 2021 15:10:19 -0400 -Subject: [PATCH 4/5] Feature: tools: Add the node-summary to crm_simulate - output. - -If --show-failcounts is given to crm_simulate, it should also display -the node-summary message. 
- -See: rhbz#1686426 ---- - tools/crm_simulate.c | 7 +++++-- - xml/api/crm_simulate-2.12.rng | 3 +++ - 2 files changed, 8 insertions(+), 2 deletions(-) - -diff --git a/tools/crm_simulate.c b/tools/crm_simulate.c -index b4aa9d1..2ea292c 100644 ---- a/tools/crm_simulate.c -+++ b/tools/crm_simulate.c -@@ -409,11 +409,14 @@ print_cluster_status(pe_working_set_t * data_set, unsigned int print_opts) - FALSE, FALSE, all, all, FALSE); - - if (options.show_attrs) { -- out->message(out, "node-attribute-list", data_set, -- 0, rc == pcmk_rc_ok, FALSE, FALSE, FALSE, all, all); -+ rc = out->message(out, "node-attribute-list", data_set, -+ 0, rc == pcmk_rc_ok, FALSE, FALSE, FALSE, all, all); - } - - if (options.show_failcounts) { -+ rc = out->message(out, "node-summary", data_set, all, all, -+ 0, print_opts, FALSE, FALSE, FALSE, FALSE, rc == pcmk_rc_ok); -+ - out->message(out, "failed-action-list", data_set, all, all, - rc == pcmk_rc_ok); - } -diff --git a/xml/api/crm_simulate-2.12.rng b/xml/api/crm_simulate-2.12.rng -index 9a7612d..f90bd36 100644 ---- a/xml/api/crm_simulate-2.12.rng -+++ b/xml/api/crm_simulate-2.12.rng -@@ -67,6 +67,9 @@ - - - -+ -+ -+ - - - --- -1.8.3.1 - - -From bac50336e0264604716e5997b87ee7e65311b982 Mon Sep 17 00:00:00 2001 -From: Chris Lumens -Date: Fri, 18 Jun 2021 15:21:52 -0400 -Subject: [PATCH 5/5] Low: libcrmcommon: Increase PCMK__API_VERSION for new - crm_resource output. - -See: rhbz#1686426 ---- - include/crm/common/output_internal.h | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/include/crm/common/output_internal.h b/include/crm/common/output_internal.h -index 0436cde..ba9c423 100644 ---- a/include/crm/common/output_internal.h -+++ b/include/crm/common/output_internal.h -@@ -27,7 +27,7 @@ extern "C" { - # include - # include - --# define PCMK__API_VERSION "2.11" -+# define PCMK__API_VERSION "2.12" - - #if defined(PCMK__WITH_ATTRIBUTE_OUTPUT_ARGS) - # define PCMK__OUTPUT_ARGS(ARGS...) 
__attribute__((output_args(ARGS))) --- -1.8.3.1 - diff --git a/SOURCES/006-stateful-metadata.patch b/SOURCES/006-stateful-metadata.patch new file mode 100644 index 0000000..a9ea6f4 --- /dev/null +++ b/SOURCES/006-stateful-metadata.patch @@ -0,0 +1,143 @@ +From b52fe799c89637e2a761a5725c2376db5c05f2d1 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Tue, 30 Nov 2021 15:51:54 -0600 +Subject: [PATCH 1/2] Low: resources: remove DOCTYPE from OCF 1.1-compliant + agents + +OCF 1.1 replaced the DTD schema with RNG, but DOCTYPE still refers to the DTD. +There's no DOCTYPE for RNG, and DOCTYPE is optional, so just remove it. +--- + extra/resources/Dummy | 3 +-- + extra/resources/HealthIOWait | 3 +-- + extra/resources/Stateful | 3 +-- + extra/resources/attribute | 3 +-- + extra/resources/ping | 3 +-- + extra/resources/remote | 3 +-- + 6 files changed, 6 insertions(+), 12 deletions(-) + +diff --git a/extra/resources/Dummy b/extra/resources/Dummy +index a344deac0..56584e564 100755 +--- a/extra/resources/Dummy ++++ b/extra/resources/Dummy +@@ -58,8 +58,7 @@ + meta_data() { + cat < +- +- ++ + 1.1 + + +diff --git a/extra/resources/HealthIOWait b/extra/resources/HealthIOWait +index 43a8b70c4..5f1483ef7 100755 +--- a/extra/resources/HealthIOWait ++++ b/extra/resources/HealthIOWait +@@ -25,8 +25,7 @@ + meta_data() { + cat < +- +- ++ + 1.1 + + +diff --git a/extra/resources/Stateful b/extra/resources/Stateful +index ae3424bbf..0d2062d51 100755 +--- a/extra/resources/Stateful ++++ b/extra/resources/Stateful +@@ -39,8 +39,7 @@ SCORE_PROMOTED=10 + meta_data() { + cat < +- +- ++ + 1.1 + + +diff --git a/extra/resources/attribute b/extra/resources/attribute +index 1800dff8f..a2bd353e0 100755 +--- a/extra/resources/attribute ++++ b/extra/resources/attribute +@@ -57,8 +57,7 @@ END + meta_data() { + cat < +- +- ++ + 1.1 + Manages a node attribute + +diff --git a/extra/resources/ping b/extra/resources/ping +index 6e296979f..7cc6b802d 100755 +--- a/extra/resources/ping ++++ 
b/extra/resources/ping +@@ -36,8 +36,7 @@ + meta_data() { + cat < +- +- ++ + 1.1 + + +diff --git a/extra/resources/remote b/extra/resources/remote +index a53262bb6..f7e40dc81 100755 +--- a/extra/resources/remote ++++ b/extra/resources/remote +@@ -24,8 +24,7 @@ + meta_data() { + cat < +- +- ++ + 1.1 + Pacemaker Remote connection + +-- +2.27.0 + + +From 70f469120f8db6a024c786466ee74a6c7fbd1f43 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Tue, 30 Nov 2021 15:53:39 -0600 +Subject: [PATCH 2/2] Fix: resources: use correct syntax in Stateful meta-data + +The OCF standard only allows "0" or "1" for booleans. + +This fixes incorrect ocf:pacemaker:Stateful meta-data syntax introduced by +7024398 as a regression in the 2.1.0 release. +--- + extra/resources/Stateful | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/extra/resources/Stateful b/extra/resources/Stateful +index 0d2062d51..2ebe6725f 100755 +--- a/extra/resources/Stateful ++++ b/extra/resources/Stateful +@@ -57,7 +57,7 @@ Location to store the resource state in + + + +- ++ + + If this is set, the environment will be dumped to this file for every call. + +@@ -65,7 +65,7 @@ If this is set, the environment will be dumped to this file for every call. + + + +- ++ + + The notify action will sleep for this many seconds before returning, + to simulate a long-running notify. 
+-- +2.27.0 + diff --git a/SOURCES/007-memory-leak.patch b/SOURCES/007-memory-leak.patch new file mode 100644 index 0000000..38ad3a2 --- /dev/null +++ b/SOURCES/007-memory-leak.patch @@ -0,0 +1,39 @@ +From f491d9d5a7ed554fed985de356bb085fdec3421c Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Tue, 7 Dec 2021 09:01:00 -0600 +Subject: [PATCH] Fix: fencer: avoid memory leak when broadcasting history + differences + +Regression introduced in 2.1.0 by dbc27b2 +--- + daemons/fenced/fenced_history.c | 7 +++++-- + 1 file changed, 5 insertions(+), 2 deletions(-) + +diff --git a/daemons/fenced/fenced_history.c b/daemons/fenced/fenced_history.c +index bc159383c..a9c57dc86 100644 +--- a/daemons/fenced/fenced_history.c ++++ b/daemons/fenced/fenced_history.c +@@ -484,8 +484,6 @@ stonith_fence_history(xmlNode *msg, xmlNode **output, + !pcmk__str_eq(remote_peer, stonith_our_uname, pcmk__str_casei)) { + xmlNode *history = get_xpath_object("//" F_STONITH_HISTORY_LIST, + msg, LOG_NEVER); +- GHashTable *received_history = +- history?stonith_xml_history_to_list(history):NULL; + + /* either a broadcast created directly upon stonith-API request + * or a diff as response to such a thing +@@ -497,6 +495,11 @@ stonith_fence_history(xmlNode *msg, xmlNode **output, + if (!history || + !crm_is_true(crm_element_value(history, + F_STONITH_DIFFERENTIAL))) { ++ GHashTable *received_history = NULL; ++ ++ if (history != NULL) { ++ received_history = stonith_xml_history_to_list(history); ++ } + out_history = + stonith_local_history_diff_and_merge(received_history, TRUE, NULL); + if (out_history) { +-- +2.27.0 + diff --git a/SOURCES/007-unfencing-loop.patch b/SOURCES/007-unfencing-loop.patch deleted file mode 100644 index d4950c8..0000000 --- a/SOURCES/007-unfencing-loop.patch +++ /dev/null @@ -1,733 +0,0 @@ -From 6dcd6b51d7d3993bc483588d6ed75077518ed600 Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Fri, 4 Jun 2021 16:30:55 -0500 -Subject: [PATCH 01/11] Low: controller: check whether unfenced 
node was remote - node - -... so the controller can indicate the node is remote (if known at that point, -which is not guaranteed) when setting unfencing-related node attributes. ---- - daemons/controld/controld_fencing.c | 21 ++++++++++++++++++--- - 1 file changed, 18 insertions(+), 3 deletions(-) - -diff --git a/daemons/controld/controld_fencing.c b/daemons/controld/controld_fencing.c -index 23dff28..0fba661 100644 ---- a/daemons/controld/controld_fencing.c -+++ b/daemons/controld/controld_fencing.c -@@ -757,15 +757,30 @@ tengine_stonith_callback(stonith_t *stonith, stonith_callback_data_t *data) - if (pcmk__str_eq("on", op, pcmk__str_casei)) { - const char *value = NULL; - char *now = pcmk__ttoa(time(NULL)); -+ gboolean is_remote_node = FALSE; -+ -+ /* This check is not 100% reliable, since this node is not -+ * guaranteed to have the remote node cached. However, it -+ * doesn't have to be reliable, since the attribute manager can -+ * learn a node's "remoteness" by other means sooner or later. -+ * This allows it to learn more quickly if this node does have -+ * the information. 
-+ */ -+ if (g_hash_table_lookup(crm_remote_peer_cache, uuid) != NULL) { -+ is_remote_node = TRUE; -+ } - -- update_attrd(target, CRM_ATTR_UNFENCED, now, NULL, FALSE); -+ update_attrd(target, CRM_ATTR_UNFENCED, now, NULL, -+ is_remote_node); - free(now); - - value = crm_meta_value(action->params, XML_OP_ATTR_DIGESTS_ALL); -- update_attrd(target, CRM_ATTR_DIGESTS_ALL, value, NULL, FALSE); -+ update_attrd(target, CRM_ATTR_DIGESTS_ALL, value, NULL, -+ is_remote_node); - - value = crm_meta_value(action->params, XML_OP_ATTR_DIGESTS_SECURE); -- update_attrd(target, CRM_ATTR_DIGESTS_SECURE, value, NULL, FALSE); -+ update_attrd(target, CRM_ATTR_DIGESTS_SECURE, value, NULL, -+ is_remote_node); - - } else if (action->sent_update == FALSE) { - send_stonith_update(action, target, uuid); --- -1.8.3.1 - - -From 3ef6d9403f68ab8559c45cc99f5a8da05ca6420b Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Mon, 7 Jun 2021 10:50:36 -0500 -Subject: [PATCH 02/11] Refactor: pacemaker-attrd: functionize adding remote - node to cache - -... for future reuse ---- - daemons/attrd/attrd_commands.c | 34 +++++++++++++++++++++++----------- - 1 file changed, 23 insertions(+), 11 deletions(-) - -diff --git a/daemons/attrd/attrd_commands.c b/daemons/attrd/attrd_commands.c -index 731c243..93a165b 100644 ---- a/daemons/attrd/attrd_commands.c -+++ b/daemons/attrd/attrd_commands.c -@@ -102,6 +102,28 @@ free_attribute(gpointer data) - } - } - -+/*! -+ * \internal -+ * \brief Ensure a Pacemaker Remote node is in the correct peer cache -+ * -+ * \param[in] -+ */ -+static void -+cache_remote_node(const char *node_name) -+{ -+ /* If we previously assumed this node was an unseen cluster node, -+ * remove its entry from the cluster peer cache. 
-+ */ -+ crm_node_t *dup = pcmk__search_cluster_node_cache(0, node_name); -+ -+ if (dup && (dup->uuid == NULL)) { -+ reap_crm_member(0, node_name); -+ } -+ -+ // Ensure node is in the remote peer cache -+ CRM_ASSERT(crm_remote_peer_get(node_name) != NULL); -+} -+ - static xmlNode * - build_attribute_xml( - xmlNode *parent, const char *name, const char *set, const char *uuid, unsigned int timeout_ms, const char *user, -@@ -709,17 +731,7 @@ attrd_lookup_or_create_value(GHashTable *values, const char *host, xmlNode *xml) - - crm_element_value_int(xml, PCMK__XA_ATTR_IS_REMOTE, &is_remote); - if (is_remote) { -- /* If we previously assumed this node was an unseen cluster node, -- * remove its entry from the cluster peer cache. -- */ -- crm_node_t *dup = pcmk__search_cluster_node_cache(0, host); -- -- if (dup && (dup->uuid == NULL)) { -- reap_crm_member(0, host); -- } -- -- /* Ensure this host is in the remote peer cache */ -- CRM_ASSERT(crm_remote_peer_get(host) != NULL); -+ cache_remote_node(host); - } - - if (v == NULL) { --- -1.8.3.1 - - -From 6fac2c71bc2c56870ac828d7cd7b7c799279c47e Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Mon, 7 Jun 2021 10:39:34 -0500 -Subject: [PATCH 03/11] Refactor: pacemaker-attrd: don't try to remove votes - for remote nodes - -Remote nodes never vote. - -This has no effect in practice since the removal would simply do nothing, -but we might as well not waste time trying. 
---- - daemons/attrd/attrd_commands.c | 11 ++++++----- - 1 file changed, 6 insertions(+), 5 deletions(-) - -diff --git a/daemons/attrd/attrd_commands.c b/daemons/attrd/attrd_commands.c -index 93a165b..dbe777e 100644 ---- a/daemons/attrd/attrd_commands.c -+++ b/daemons/attrd/attrd_commands.c -@@ -976,7 +976,8 @@ attrd_election_cb(gpointer user_data) - void - attrd_peer_change_cb(enum crm_status_type kind, crm_node_t *peer, const void *data) - { -- bool remove_voter = FALSE; -+ bool gone = false; -+ bool is_remote = pcmk_is_set(peer->flags, crm_remote_node); - - switch (kind) { - case crm_status_uname: -@@ -984,7 +985,7 @@ attrd_peer_change_cb(enum crm_status_type kind, crm_node_t *peer, const void *da - - case crm_status_processes: - if (!pcmk_is_set(peer->processes, crm_get_cluster_proc())) { -- remove_voter = TRUE; -+ gone = true; - } - break; - -@@ -1000,13 +1001,13 @@ attrd_peer_change_cb(enum crm_status_type kind, crm_node_t *peer, const void *da - } else { - // Remove all attribute values associated with lost nodes - attrd_peer_remove(peer->uname, FALSE, "loss"); -- remove_voter = TRUE; -+ gone = true; - } - break; - } - -- // In case an election is in progress, remove any vote by the node -- if (remove_voter) { -+ // Remove votes from cluster nodes that leave, in case election in progress -+ if (gone && !is_remote) { - attrd_remove_voter(peer); - } - } --- -1.8.3.1 - - -From 54089fc663d6aaf10ca164c6c94b3b17237788de Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Mon, 7 Jun 2021 10:40:06 -0500 -Subject: [PATCH 04/11] Low: pacemaker-attrd: check for remote nodes in peer - update callback - -If a remote node was started before the local cluster node joined the cluster, -the cluster node will assume its node attributes are for a cluster node until -it learns otherwise. Check for remoteness in the peer update callback, to have -another way we can learn it. 
---- - daemons/attrd/attrd_commands.c | 4 ++++ - 1 file changed, 4 insertions(+) - -diff --git a/daemons/attrd/attrd_commands.c b/daemons/attrd/attrd_commands.c -index dbe777e..5f6a754 100644 ---- a/daemons/attrd/attrd_commands.c -+++ b/daemons/attrd/attrd_commands.c -@@ -1009,6 +1009,10 @@ attrd_peer_change_cb(enum crm_status_type kind, crm_node_t *peer, const void *da - // Remove votes from cluster nodes that leave, in case election in progress - if (gone && !is_remote) { - attrd_remove_voter(peer); -+ -+ // Ensure remote nodes that come up are in the remote node cache -+ } else if (!gone && is_remote) { -+ cache_remote_node(peer->uname); - } - } - --- -1.8.3.1 - - -From 8c048df0312d0d9c857d87b570a352429a710928 Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Mon, 7 Jun 2021 11:29:12 -0500 -Subject: [PATCH 05/11] Log: pacemaker-attrd: log peer status changes - ---- - daemons/attrd/attrd_commands.c | 9 +++++++++ - 1 file changed, 9 insertions(+) - -diff --git a/daemons/attrd/attrd_commands.c b/daemons/attrd/attrd_commands.c -index 5f6a754..d6d179b 100644 ---- a/daemons/attrd/attrd_commands.c -+++ b/daemons/attrd/attrd_commands.c -@@ -972,6 +972,7 @@ attrd_election_cb(gpointer user_data) - return FALSE; - } - -+#define state_text(state) ((state)? (const char *)(state) : "in unknown state") - - void - attrd_peer_change_cb(enum crm_status_type kind, crm_node_t *peer, const void *data) -@@ -981,15 +982,23 @@ attrd_peer_change_cb(enum crm_status_type kind, crm_node_t *peer, const void *da - - switch (kind) { - case crm_status_uname: -+ crm_debug("%s node %s is now %s", -+ (is_remote? "Remote" : "Cluster"), -+ peer->uname, state_text(peer->state)); - break; - - case crm_status_processes: - if (!pcmk_is_set(peer->processes, crm_get_cluster_proc())) { - gone = true; - } -+ crm_debug("Node %s is %s a peer", -+ peer->uname, (gone? "no longer" : "now")); - break; - - case crm_status_nstate: -+ crm_debug("%s node %s is now %s (was %s)", -+ (is_remote? 
"Remote" : "Cluster"), -+ peer->uname, state_text(peer->state), state_text(data)); - if (pcmk__str_eq(peer->state, CRM_NODE_MEMBER, pcmk__str_casei)) { - /* If we're the writer, send new peers a list of all attributes - * (unless it's a remote node, which doesn't run its own attrd) --- -1.8.3.1 - - -From 1dcc8dee4990cf0dbdec0e14db6d9a3ad67a41d5 Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Mon, 7 Jun 2021 11:13:53 -0500 -Subject: [PATCH 06/11] Low: pacemaker-attrd: ensure node ID is only set for - attributes when known - -In most cases, attribute updates contained the node ID, and the node ID was -used by other code, only if known (i.e. positive). However a couple places did -not check this, so add that. - -I am unsure whether the missing check caused problems in practice, but there -appears to be the possibility that a remote node would wrongly be added to the -cluster node cache. ---- - daemons/attrd/attrd_commands.c | 6 ++++-- - 1 file changed, 4 insertions(+), 2 deletions(-) - -diff --git a/daemons/attrd/attrd_commands.c b/daemons/attrd/attrd_commands.c -index d6d179b..b3f441c 100644 ---- a/daemons/attrd/attrd_commands.c -+++ b/daemons/attrd/attrd_commands.c -@@ -136,7 +136,9 @@ build_attribute_xml( - crm_xml_add(xml, PCMK__XA_ATTR_UUID, uuid); - crm_xml_add(xml, PCMK__XA_ATTR_USER, user); - crm_xml_add(xml, PCMK__XA_ATTR_NODE_NAME, peer); -- crm_xml_add_int(xml, PCMK__XA_ATTR_NODE_ID, peerid); -+ if (peerid > 0) { -+ crm_xml_add_int(xml, PCMK__XA_ATTR_NODE_ID, peerid); -+ } - crm_xml_add(xml, PCMK__XA_ATTR_VALUE, value); - crm_xml_add_int(xml, PCMK__XA_ATTR_DAMPENING, timeout_ms/1000); - crm_xml_add_int(xml, PCMK__XA_ATTR_IS_PRIVATE, is_private); -@@ -937,7 +939,7 @@ attrd_peer_update(crm_node_t *peer, xmlNode *xml, const char *host, bool filter) - /* If this is a cluster node whose node ID we are learning, remember it */ - if ((v->nodeid == 0) && (v->is_remote == FALSE) - && (crm_element_value_int(xml, PCMK__XA_ATTR_NODE_ID, -- (int*)&v->nodeid) == 0)) { 
-+ (int*)&v->nodeid) == 0) && (v->nodeid > 0)) { - - crm_node_t *known_peer = crm_get_peer(v->nodeid, host); - --- -1.8.3.1 - - -From 8d12490e88b558d01db37a38f7d35175c6d2d69a Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Thu, 10 Jun 2021 17:25:57 -0500 -Subject: [PATCH 07/11] Refactor: pacemaker-attrd: functionize processing a - sync response - -... for code isolation, and because we need to add more to it ---- - daemons/attrd/attrd_commands.c | 59 ++++++++++++++++++++++++++++-------------- - 1 file changed, 39 insertions(+), 20 deletions(-) - -diff --git a/daemons/attrd/attrd_commands.c b/daemons/attrd/attrd_commands.c -index b3f441c..d02d3e6 100644 ---- a/daemons/attrd/attrd_commands.c -+++ b/daemons/attrd/attrd_commands.c -@@ -572,6 +572,43 @@ attrd_peer_clear_failure(crm_node_t *peer, xmlNode *xml) - } - - /*! -+ * \internal -+ * \brief Load attributes from a peer sync response -+ * -+ * \param[in] peer Peer that sent clear request -+ * \param[in] peer_won Whether peer is the attribute writer -+ * \param[in] xml Request XML -+ */ -+static void -+process_peer_sync_response(crm_node_t *peer, bool peer_won, xmlNode *xml) -+{ -+ crm_info("Processing " PCMK__ATTRD_CMD_SYNC_RESPONSE " from %s", -+ peer->uname); -+ -+ if (peer_won) { -+ /* Initialize the "seen" flag for all attributes to cleared, so we can -+ * detect attributes that local node has but the writer doesn't. -+ */ -+ clear_attribute_value_seen(); -+ } -+ -+ // Process each attribute update in the sync response -+ for (xmlNode *child = pcmk__xml_first_child(xml); child != NULL; -+ child = pcmk__xml_next(child)) { -+ attrd_peer_update(peer, child, -+ crm_element_value(child, PCMK__XA_ATTR_NODE_NAME), -+ TRUE); -+ } -+ -+ if (peer_won) { -+ /* If any attributes are still not marked as seen, the writer doesn't -+ * know about them, so send all peers an update with them. -+ */ -+ attrd_current_only_attribute_update(peer, xml); -+ } -+} -+ -+/*! 
- \internal - \brief Broadcast private attribute for local node with protocol version - */ -@@ -596,7 +633,7 @@ attrd_peer_message(crm_node_t *peer, xmlNode *xml) - const char *op = crm_element_value(xml, PCMK__XA_TASK); - const char *election_op = crm_element_value(xml, F_CRM_TASK); - const char *host = crm_element_value(xml, PCMK__XA_ATTR_NODE_NAME); -- bool peer_won = FALSE; -+ bool peer_won = false; - - if (election_op) { - attrd_handle_election_op(peer, xml); -@@ -631,25 +668,7 @@ attrd_peer_message(crm_node_t *peer, xmlNode *xml) - - } else if (pcmk__str_eq(op, PCMK__ATTRD_CMD_SYNC_RESPONSE, pcmk__str_casei) - && !pcmk__str_eq(peer->uname, attrd_cluster->uname, pcmk__str_casei)) { -- xmlNode *child = NULL; -- -- crm_info("Processing %s from %s", op, peer->uname); -- -- /* Clear the seen flag for attribute processing held only in the own node. */ -- if (peer_won) { -- clear_attribute_value_seen(); -- } -- -- for (child = pcmk__xml_first_child(xml); child != NULL; -- child = pcmk__xml_next(child)) { -- host = crm_element_value(child, PCMK__XA_ATTR_NODE_NAME); -- attrd_peer_update(peer, child, host, TRUE); -- } -- -- if (peer_won) { -- /* Synchronize if there is an attribute held only by own node that Writer does not have. */ -- attrd_current_only_attribute_update(peer, xml); -- } -+ process_peer_sync_response(peer, peer_won, xml); - - } else if (pcmk__str_eq(op, PCMK__ATTRD_CMD_FLUSH, pcmk__str_casei)) { - /* Ignore. The flush command was removed in 2.0.0 but may be --- -1.8.3.1 - - -From a890a0e5bbbcabf907f51ed0460868035f72464d Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Fri, 11 Jun 2021 14:40:39 -0500 -Subject: [PATCH 08/11] Refactor: pacemaker-attrd: functionize broadcasting - local override - -... 
for code isolation ---- - daemons/attrd/attrd_commands.c | 42 +++++++++++++++++++++++++++++------------- - 1 file changed, 29 insertions(+), 13 deletions(-) - -diff --git a/daemons/attrd/attrd_commands.c b/daemons/attrd/attrd_commands.c -index d02d3e6..4783427 100644 ---- a/daemons/attrd/attrd_commands.c -+++ b/daemons/attrd/attrd_commands.c -@@ -804,6 +804,34 @@ attrd_current_only_attribute_update(crm_node_t *peer, xmlNode *xml) - free_xml(sync); - } - -+/*! -+ * \internal -+ * \brief Override an attribute sync with a local value -+ * -+ * Broadcast the local node's value for an attribute that's different from the -+ * value provided in a peer's attribute synchronization response. This ensures a -+ * node's values for itself take precedence and all peers are kept in sync. -+ * -+ * \param[in] a Attribute entry to override -+ * -+ * \return Local instance of attribute value -+ */ -+static attribute_value_t * -+broadcast_local_value(attribute_t *a) -+{ -+ attribute_value_t *v = g_hash_table_lookup(a->values, attrd_cluster->uname); -+ xmlNode *sync = create_xml_node(NULL, __func__); -+ -+ crm_xml_add(sync, PCMK__XA_TASK, PCMK__ATTRD_CMD_SYNC_RESPONSE); -+ build_attribute_xml(sync, a->id, a->set, a->uuid, a->timeout_ms, -+ a->user, a->is_private, v->nodename, v->nodeid, -+ v->current, FALSE); -+ attrd_xml_add_writer(sync); -+ send_attrd_message(NULL, sync); -+ free_xml(sync); -+ return v; -+} -+ - void - attrd_peer_update(crm_node_t *peer, xmlNode *xml, const char *host, bool filter) - { -@@ -899,21 +927,9 @@ attrd_peer_update(crm_node_t *peer, xmlNode *xml, const char *host, bool filter) - if (filter && !pcmk__str_eq(v->current, value, pcmk__str_casei) - && pcmk__str_eq(host, attrd_cluster->uname, pcmk__str_casei)) { - -- xmlNode *sync = create_xml_node(NULL, __func__); -- - crm_notice("%s[%s]: local value '%s' takes priority over '%s' from %s", - attr, host, v->current, value, peer->uname); -- -- crm_xml_add(sync, PCMK__XA_TASK, PCMK__ATTRD_CMD_SYNC_RESPONSE); -- v 
= g_hash_table_lookup(a->values, host); -- build_attribute_xml(sync, attr, a->set, a->uuid, a->timeout_ms, a->user, -- a->is_private, v->nodename, v->nodeid, v->current, FALSE); -- -- attrd_xml_add_writer(sync); -- -- /* Broadcast in case any other nodes had the inconsistent value */ -- send_attrd_message(NULL, sync); -- free_xml(sync); -+ v = broadcast_local_value(a); - - } else if (!pcmk__str_eq(v->current, value, pcmk__str_casei)) { - crm_notice("Setting %s[%s]: %s -> %s " CRM_XS " from %s", --- -1.8.3.1 - - -From f6f65e3dab070f1bbdf6d1383f4d6173a8840bc9 Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Fri, 11 Jun 2021 14:50:29 -0500 -Subject: [PATCH 09/11] Log: pacemaker-attrd: improve messages when - broadcasting local-only values - -The traces aren't necessary since build_attribute_xml() already logs the same -info at debug. Also, rename function for clarity, and make static. ---- - daemons/attrd/attrd_commands.c | 35 ++++++++++++++++------------------- - 1 file changed, 16 insertions(+), 19 deletions(-) - -diff --git a/daemons/attrd/attrd_commands.c b/daemons/attrd/attrd_commands.c -index 4783427..356defb 100644 ---- a/daemons/attrd/attrd_commands.c -+++ b/daemons/attrd/attrd_commands.c -@@ -51,11 +51,12 @@ GHashTable *attributes = NULL; - - void write_attribute(attribute_t *a, bool ignore_delay); - void write_or_elect_attribute(attribute_t *a); --void attrd_current_only_attribute_update(crm_node_t *peer, xmlNode *xml); - void attrd_peer_update(crm_node_t *peer, xmlNode *xml, const char *host, bool filter); - void attrd_peer_sync(crm_node_t *peer, xmlNode *xml); - void attrd_peer_remove(const char *host, gboolean uncache, const char *source); - -+static void broadcast_unseen_local_values(crm_node_t *peer, xmlNode *xml); -+ - static gboolean - send_attrd_message(crm_node_t * node, xmlNode * data) - { -@@ -604,7 +605,7 @@ process_peer_sync_response(crm_node_t *peer, bool peer_won, xmlNode *xml) - /* If any attributes are still not marked as seen, the writer 
doesn't - * know about them, so send all peers an update with them. - */ -- attrd_current_only_attribute_update(peer, xml); -+ broadcast_unseen_local_values(peer, xml); - } - } - -@@ -768,40 +769,36 @@ attrd_lookup_or_create_value(GHashTable *values, const char *host, xmlNode *xml) - return(v); - } - --void --attrd_current_only_attribute_update(crm_node_t *peer, xmlNode *xml) -+void -+broadcast_unseen_local_values(crm_node_t *peer, xmlNode *xml) - { - GHashTableIter aIter; - GHashTableIter vIter; -- attribute_t *a; -+ attribute_t *a = NULL; - attribute_value_t *v = NULL; -- xmlNode *sync = create_xml_node(NULL, __func__); -- gboolean build = FALSE; -- -- crm_xml_add(sync, PCMK__XA_TASK, PCMK__ATTRD_CMD_SYNC_RESPONSE); -+ xmlNode *sync = NULL; - - g_hash_table_iter_init(&aIter, attributes); - while (g_hash_table_iter_next(&aIter, NULL, (gpointer *) & a)) { - g_hash_table_iter_init(&vIter, a->values); - while (g_hash_table_iter_next(&vIter, NULL, (gpointer *) & v)) { -- if (pcmk__str_eq(v->nodename, attrd_cluster->uname, pcmk__str_casei) && v->seen == FALSE) { -- crm_trace("Syncing %s[%s] = %s to everyone.(from local only attributes)", a->id, v->nodename, v->current); -- -- build = TRUE; -+ if (!(v->seen) && pcmk__str_eq(v->nodename, attrd_cluster->uname, -+ pcmk__str_casei)) { -+ if (sync == NULL) { -+ sync = create_xml_node(NULL, __func__); -+ crm_xml_add(sync, PCMK__XA_TASK, PCMK__ATTRD_CMD_SYNC_RESPONSE); -+ } - build_attribute_xml(sync, a->id, a->set, a->uuid, a->timeout_ms, a->user, a->is_private, - v->nodename, v->nodeid, v->current, (a->timeout_ms && a->timer ? 
TRUE : FALSE)); -- } else { -- crm_trace("Local attribute(%s[%s] = %s) was ignore.(another host) : [%s]", a->id, v->nodename, v->current, attrd_cluster->uname); -- continue; - } - } - } - -- if (build) { -- crm_debug("Syncing values to everyone.(from local only attributes)"); -+ if (sync != NULL) { -+ crm_debug("Broadcasting local-only values"); - send_attrd_message(NULL, sync); -+ free_xml(sync); - } -- free_xml(sync); - } - - /*! --- -1.8.3.1 - - -From ab90ffb785ea018556f216b8f540f8c3429a3947 Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Fri, 11 Jun 2021 15:04:20 -0500 -Subject: [PATCH 10/11] Refactor: pacemaker-attrd: simplify attribute XML - creation function - -... and rename for clarity ---- - daemons/attrd/attrd_commands.c | 48 ++++++++++++++++++++++++------------------ - 1 file changed, 27 insertions(+), 21 deletions(-) - -diff --git a/daemons/attrd/attrd_commands.c b/daemons/attrd/attrd_commands.c -index 356defb..5b32a77 100644 ---- a/daemons/attrd/attrd_commands.c -+++ b/daemons/attrd/attrd_commands.c -@@ -125,25 +125,35 @@ cache_remote_node(const char *node_name) - CRM_ASSERT(crm_remote_peer_get(node_name) != NULL); - } - -+/*! 
-+ * \internal -+ * \brief Create an XML representation of an attribute for use in peer messages -+ * -+ * \param[in] parent Create attribute XML as child element of this element -+ * \param[in] a Attribute to represent -+ * \param[in] v Attribute value to represent -+ * \param[in] force_write If true, value should be written even if unchanged -+ * -+ * \return XML representation of attribute -+ */ - static xmlNode * --build_attribute_xml( -- xmlNode *parent, const char *name, const char *set, const char *uuid, unsigned int timeout_ms, const char *user, -- gboolean is_private, const char *peer, uint32_t peerid, const char *value, gboolean is_force_write) -+add_attribute_value_xml(xmlNode *parent, attribute_t *a, attribute_value_t *v, -+ bool force_write) - { - xmlNode *xml = create_xml_node(parent, __func__); - -- crm_xml_add(xml, PCMK__XA_ATTR_NAME, name); -- crm_xml_add(xml, PCMK__XA_ATTR_SET, set); -- crm_xml_add(xml, PCMK__XA_ATTR_UUID, uuid); -- crm_xml_add(xml, PCMK__XA_ATTR_USER, user); -- crm_xml_add(xml, PCMK__XA_ATTR_NODE_NAME, peer); -- if (peerid > 0) { -- crm_xml_add_int(xml, PCMK__XA_ATTR_NODE_ID, peerid); -+ crm_xml_add(xml, PCMK__XA_ATTR_NAME, a->id); -+ crm_xml_add(xml, PCMK__XA_ATTR_SET, a->set); -+ crm_xml_add(xml, PCMK__XA_ATTR_UUID, a->uuid); -+ crm_xml_add(xml, PCMK__XA_ATTR_USER, a->user); -+ crm_xml_add(xml, PCMK__XA_ATTR_NODE_NAME, v->nodename); -+ if (v->nodeid > 0) { -+ crm_xml_add_int(xml, PCMK__XA_ATTR_NODE_ID, v->nodeid); - } -- crm_xml_add(xml, PCMK__XA_ATTR_VALUE, value); -- crm_xml_add_int(xml, PCMK__XA_ATTR_DAMPENING, timeout_ms/1000); -- crm_xml_add_int(xml, PCMK__XA_ATTR_IS_PRIVATE, is_private); -- crm_xml_add_int(xml, PCMK__XA_ATTR_FORCE, is_force_write); -+ crm_xml_add(xml, PCMK__XA_ATTR_VALUE, v->current); -+ crm_xml_add_int(xml, PCMK__XA_ATTR_DAMPENING, a->timeout_ms / 1000); -+ crm_xml_add_int(xml, PCMK__XA_ATTR_IS_PRIVATE, a->is_private); -+ crm_xml_add_int(xml, PCMK__XA_ATTR_FORCE, force_write); - - return xml; - } -@@ 
-695,8 +705,7 @@ attrd_peer_sync(crm_node_t *peer, xmlNode *xml) - g_hash_table_iter_init(&vIter, a->values); - while (g_hash_table_iter_next(&vIter, NULL, (gpointer *) & v)) { - crm_debug("Syncing %s[%s] = %s to %s", a->id, v->nodename, v->current, peer?peer->uname:"everyone"); -- build_attribute_xml(sync, a->id, a->set, a->uuid, a->timeout_ms, a->user, a->is_private, -- v->nodename, v->nodeid, v->current, FALSE); -+ add_attribute_value_xml(sync, a, v, false); - } - } - -@@ -788,8 +797,7 @@ broadcast_unseen_local_values(crm_node_t *peer, xmlNode *xml) - sync = create_xml_node(NULL, __func__); - crm_xml_add(sync, PCMK__XA_TASK, PCMK__ATTRD_CMD_SYNC_RESPONSE); - } -- build_attribute_xml(sync, a->id, a->set, a->uuid, a->timeout_ms, a->user, a->is_private, -- v->nodename, v->nodeid, v->current, (a->timeout_ms && a->timer ? TRUE : FALSE)); -+ add_attribute_value_xml(sync, a, v, a->timeout_ms && a->timer); - } - } - } -@@ -820,9 +828,7 @@ broadcast_local_value(attribute_t *a) - xmlNode *sync = create_xml_node(NULL, __func__); - - crm_xml_add(sync, PCMK__XA_TASK, PCMK__ATTRD_CMD_SYNC_RESPONSE); -- build_attribute_xml(sync, a->id, a->set, a->uuid, a->timeout_ms, -- a->user, a->is_private, v->nodename, v->nodeid, -- v->current, FALSE); -+ add_attribute_value_xml(sync, a, v, false); - attrd_xml_add_writer(sync); - send_attrd_message(NULL, sync); - free_xml(sync); --- -1.8.3.1 - - -From 540d74130c5c8d9c626d6c50475e4dc4f64234e7 Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Fri, 4 Jun 2021 16:34:26 -0500 -Subject: [PATCH 11/11] Fix: pacemaker-attrd: avoid repeated unfencing of - remote nodes - -The attribute manager can't record a remote node's attributes to the CIB until -it knows the node is remote. Normally, this is learned when the remote node -starts, because the controller clears the CRM_OP_PROBED attribute and indicates -that it is for a remote node. 
- -However, if a cluster node is down when a remote node starts, and later comes -up, it learns the remote node's existing attributes as part of the attribute -sync. Previously, this did not include whether each value is for a cluster or -remote node, so the newly joined attribute manager couldn't write out remote -nodes' attributes until it learned that via some other event -- which might not -happen before the node becomes DC, in which case its scheduler will not see any -unfencing-related node attributes and may wrongly schedule unfencing. - -The sync response handling already calls attrd_lookup_or_create_value(), which -checks PCMK__XA_ATTR_IS_REMOTE, so all we need to do is add that to the sync -response. ---- - daemons/attrd/attrd_commands.c | 6 +++++- - 1 file changed, 5 insertions(+), 1 deletion(-) - -diff --git a/daemons/attrd/attrd_commands.c b/daemons/attrd/attrd_commands.c -index 5b32a77..0142383 100644 ---- a/daemons/attrd/attrd_commands.c -+++ b/daemons/attrd/attrd_commands.c -@@ -43,8 +43,9 @@ - * 1 1.1.15 PCMK__ATTRD_CMD_UPDATE_BOTH, - * PCMK__ATTRD_CMD_UPDATE_DELAY - * 2 1.1.17 PCMK__ATTRD_CMD_CLEAR_FAILURE -+ * 3 2.1.1 PCMK__ATTRD_CMD_SYNC_RESPONSE indicates remote nodes - */ --#define ATTRD_PROTOCOL_VERSION "2" -+#define ATTRD_PROTOCOL_VERSION "3" - - int last_cib_op_done = 0; - GHashTable *attributes = NULL; -@@ -150,6 +151,9 @@ add_attribute_value_xml(xmlNode *parent, attribute_t *a, attribute_value_t *v, - if (v->nodeid > 0) { - crm_xml_add_int(xml, PCMK__XA_ATTR_NODE_ID, v->nodeid); - } -+ if (v->is_remote != 0) { -+ crm_xml_add_int(xml, PCMK__XA_ATTR_IS_REMOTE, 1); -+ } - crm_xml_add(xml, PCMK__XA_ATTR_VALUE, v->current); - crm_xml_add_int(xml, PCMK__XA_ATTR_DAMPENING, a->timeout_ms / 1000); - crm_xml_add_int(xml, PCMK__XA_ATTR_IS_PRIVATE, a->is_private); --- -1.8.3.1 - diff --git a/SOURCES/008-dynamic-list-fencing.patch b/SOURCES/008-dynamic-list-fencing.patch deleted file mode 100644 index 4a56117..0000000 --- 
a/SOURCES/008-dynamic-list-fencing.patch +++ /dev/null @@ -1,140 +0,0 @@ -From 2d15fb37525f88ec8d5acb689b698044c4bb69b1 Mon Sep 17 00:00:00 2001 -From: Hideo Yamauchi -Date: Thu, 17 Jun 2021 22:39:12 +0900 -Subject: [PATCH 1/2] Low: fenced: Low: fenced: Remove unnecessary release. - ---- - daemons/fenced/fenced_commands.c | 3 --- - 1 file changed, 3 deletions(-) - -diff --git a/daemons/fenced/fenced_commands.c b/daemons/fenced/fenced_commands.c -index fee55a7..35aec06 100644 ---- a/daemons/fenced/fenced_commands.c -+++ b/daemons/fenced/fenced_commands.c -@@ -1104,9 +1104,6 @@ dynamic_list_search_cb(GPid pid, int rc, const char *output, gpointer user_data) - /* Fall back to status */ - g_hash_table_replace(dev->params, - strdup(PCMK_STONITH_HOST_CHECK), strdup("status")); -- -- g_list_free_full(dev->targets, free); -- dev->targets = NULL; - } else if (!rc) { - crm_info("Refreshing port list for %s", dev->id); - g_list_free_full(dev->targets, free); --- -1.8.3.1 - - -From a29f88f6020aac5f1ac32072942eb5713d7be50d Mon Sep 17 00:00:00 2001 -From: Hideo Yamauchi -Date: Thu, 17 Jun 2021 22:40:40 +0900 -Subject: [PATCH 2/2] High: fenced: Wrong device may be selected when - "dynamic-list" is specified. 
- ---- - daemons/fenced/fenced_commands.c | 67 +++++++++++++++++++++++----------------- - 1 file changed, 38 insertions(+), 29 deletions(-) - -diff --git a/daemons/fenced/fenced_commands.c b/daemons/fenced/fenced_commands.c -index 35aec06..da076fb 100644 ---- a/daemons/fenced/fenced_commands.c -+++ b/daemons/fenced/fenced_commands.c -@@ -904,6 +904,31 @@ xml2device_params(const char *name, xmlNode *dev) - return params; - } - -+static const char * -+target_list_type(stonith_device_t * dev) -+{ -+ const char *check_type = NULL; -+ -+ check_type = g_hash_table_lookup(dev->params, PCMK_STONITH_HOST_CHECK); -+ -+ if (check_type == NULL) { -+ -+ if (g_hash_table_lookup(dev->params, PCMK_STONITH_HOST_LIST)) { -+ check_type = "static-list"; -+ } else if (g_hash_table_lookup(dev->params, PCMK_STONITH_HOST_MAP)) { -+ check_type = "static-list"; -+ } else if (pcmk_is_set(dev->flags, st_device_supports_list)) { -+ check_type = "dynamic-list"; -+ } else if (pcmk_is_set(dev->flags, st_device_supports_status)) { -+ check_type = "status"; -+ } else { -+ check_type = "none"; -+ } -+ } -+ -+ return check_type; -+} -+ - static stonith_device_t * - build_device_from_xml(xmlNode * msg) - { -@@ -931,6 +956,12 @@ build_device_from_xml(xmlNode * msg) - value = g_hash_table_lookup(device->params, PCMK_STONITH_HOST_MAP); - device->aliases = build_port_aliases(value, &(device->targets)); - -+ value = target_list_type(device); -+ if (!pcmk__str_eq(value, "static-list", pcmk__str_casei) && device->targets) { -+ /* Other than "static-list", dev-> targets is unnecessary. 
*/ -+ g_list_free_full(device->targets, free); -+ device->targets = NULL; -+ } - device->agent_metadata = get_agent_metadata(device->agent); - if (device->agent_metadata) { - read_action_metadata(device); -@@ -971,31 +1002,6 @@ build_device_from_xml(xmlNode * msg) - return device; - } - --static const char * --target_list_type(stonith_device_t * dev) --{ -- const char *check_type = NULL; -- -- check_type = g_hash_table_lookup(dev->params, PCMK_STONITH_HOST_CHECK); -- -- if (check_type == NULL) { -- -- if (g_hash_table_lookup(dev->params, PCMK_STONITH_HOST_LIST)) { -- check_type = "static-list"; -- } else if (g_hash_table_lookup(dev->params, PCMK_STONITH_HOST_MAP)) { -- check_type = "static-list"; -- } else if (pcmk_is_set(dev->flags, st_device_supports_list)) { -- check_type = "dynamic-list"; -- } else if (pcmk_is_set(dev->flags, st_device_supports_status)) { -- check_type = "status"; -- } else { -- check_type = "none"; -- } -- } -- -- return check_type; --} -- - static void - schedule_internal_command(const char *origin, - stonith_device_t * device, -@@ -1099,11 +1105,14 @@ dynamic_list_search_cb(GPid pid, int rc, const char *output, gpointer user_data) - - /* If we successfully got the targets earlier, don't disable. */ - if (rc != 0 && !dev->targets) { -- crm_notice("Disabling port list queries for %s: %s " -- CRM_XS " rc=%d", dev->id, output, rc); -- /* Fall back to status */ -- g_hash_table_replace(dev->params, -+ if (g_hash_table_lookup(dev->params, PCMK_STONITH_HOST_CHECK) == NULL) { -+ /* -+ If the operation fails if the user does not explicitly specify "dynamic-list", it will fall back to "status". 
-+ */ -+ crm_notice("Disabling port list queries for %s (%d): %s", dev->id, rc, output); -+ g_hash_table_replace(dev->params, - strdup(PCMK_STONITH_HOST_CHECK), strdup("status")); -+ } - } else if (!rc) { - crm_info("Refreshing port list for %s", dev->id); - g_list_free_full(dev->targets, free); --- -1.8.3.1 - diff --git a/SOURCES/008-fencing-history.patch b/SOURCES/008-fencing-history.patch new file mode 100644 index 0000000..1ea9ac7 --- /dev/null +++ b/SOURCES/008-fencing-history.patch @@ -0,0 +1,43 @@ +From 0339e89f3238b31df78b864dae8684b82c370741 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Mon, 13 Dec 2021 15:22:40 -0600 +Subject: [PATCH] Fix: fencer: get current time correctly + +f52bc8e1ce (2.1.2) introduced a regression by using clock_gettime() with +CLOCK_MONOTONIC to get the current time. Use qb_util_timespec_from_epoch_get() +instead (which as of this writing uses clock_gettime() with CLOCK_REALTIME if +available, and falls back to gettimeofday() if not). +--- + daemons/fenced/fenced_commands.c | 11 +++-------- + 1 file changed, 3 insertions(+), 8 deletions(-) + +diff --git a/daemons/fenced/fenced_commands.c b/daemons/fenced/fenced_commands.c +index f34cb4f13..7685cb8c3 100644 +--- a/daemons/fenced/fenced_commands.c ++++ b/daemons/fenced/fenced_commands.c +@@ -2746,19 +2746,14 @@ bool fencing_peer_active(crm_node_t *peer) + return FALSE; + } + +-void set_fencing_completed(remote_fencing_op_t * op) ++void ++set_fencing_completed(remote_fencing_op_t *op) + { +-#ifdef CLOCK_MONOTONIC + struct timespec tv; + +- clock_gettime(CLOCK_MONOTONIC, &tv); +- ++ qb_util_timespec_from_epoch_get(&tv); + op->completed = tv.tv_sec; + op->completed_nsec = tv.tv_nsec; +-#else +- op->completed = time(NULL); +- op->completed_nsec = 0L; +-#endif + } + + /*! 
+-- +2.27.0 + diff --git a/SOURCES/009-crm_resource-messages.patch b/SOURCES/009-crm_resource-messages.patch deleted file mode 100644 index bdbcf03..0000000 --- a/SOURCES/009-crm_resource-messages.patch +++ /dev/null @@ -1,229 +0,0 @@ -From 5bcab230ad4c647ca78b18bd4a66e30a4bb4417f Mon Sep 17 00:00:00 2001 -From: Oyvind Albrigtsen -Date: Wed, 16 Jun 2021 11:19:03 +0200 -Subject: [PATCH 1/2] Feature: crm_resource: report not supported for --force-* - w/systemd, upstart, nagios and bundled resources - ---- - tools/crm_resource.c | 21 ++++---------- - tools/crm_resource_runtime.c | 67 +++++++++++++++++++++++++++++--------------- - 2 files changed, 51 insertions(+), 37 deletions(-) - -diff --git a/tools/crm_resource.c b/tools/crm_resource.c -index 4abdd03..fa7902c 100644 ---- a/tools/crm_resource.c -+++ b/tools/crm_resource.c -@@ -660,21 +660,12 @@ attr_set_type_cb(const gchar *option_name, const gchar *optarg, gpointer data, G - - gboolean - class_cb(const gchar *option_name, const gchar *optarg, gpointer data, GError **error) { -- if (!(pcmk_get_ra_caps(optarg) & pcmk_ra_cap_params)) { -- if (!args->quiet) { -- g_set_error(error, G_OPTION_ERROR, CRM_EX_INVALID_PARAM, -- "Standard %s does not support parameters\n", optarg); -- } -- return FALSE; -- -- } else { -- if (options.v_class != NULL) { -- free(options.v_class); -- } -- -- options.v_class = strdup(optarg); -+ if (options.v_class != NULL) { -+ free(options.v_class); - } - -+ options.v_class = strdup(optarg); -+ - options.cmdline_config = TRUE; - options.require_resource = FALSE; - return TRUE; -@@ -1422,7 +1413,7 @@ validate_cmdline_config(void) - } else if (options.rsc_cmd != cmd_execute_agent) { - g_set_error(&error, PCMK__EXITC_ERROR, CRM_EX_USAGE, - "--class, --agent, and --provider can only be used with " -- "--validate"); -+ "--validate and --force-*"); - - // Not all of --class, --agent, and --provider need to be given. Not all - // classes support the concept of a provider. 
Check that what we were given -@@ -1841,7 +1832,7 @@ main(int argc, char **argv) - if (options.cmdline_config) { - exit_code = cli_resource_execute_from_params(out, NULL, - options.v_class, options.v_provider, options.v_agent, -- "validate-all", options.cmdline_params, -+ options.operation, options.cmdline_params, - options.override_params, options.timeout_ms, - args->verbosity, options.force, options.check_level); - } else { -diff --git a/tools/crm_resource_runtime.c b/tools/crm_resource_runtime.c -index fe42e60..59e6df5 100644 ---- a/tools/crm_resource_runtime.c -+++ b/tools/crm_resource_runtime.c -@@ -1674,24 +1674,59 @@ wait_till_stable(pcmk__output_t *out, int timeout_ms, cib_t * cib) - return rc; - } - -+static const char * -+get_action(const char *rsc_action) { -+ const char *action = NULL; -+ -+ if (pcmk__str_eq(rsc_action, "validate", pcmk__str_casei)) { -+ action = "validate-all"; -+ -+ } else if (pcmk__str_eq(rsc_action, "force-check", pcmk__str_casei)) { -+ action = "monitor"; -+ -+ } else if (pcmk__strcase_any_of(rsc_action, "force-start", "force-stop", -+ "force-demote", "force-promote", NULL)) { -+ action = rsc_action+6; -+ } else { -+ action = rsc_action; -+ } -+ -+ return action; -+} -+ - crm_exit_t - cli_resource_execute_from_params(pcmk__output_t *out, const char *rsc_name, - const char *rsc_class, const char *rsc_prov, -- const char *rsc_type, const char *action, -+ const char *rsc_type, const char *rsc_action, - GHashTable *params, GHashTable *override_hash, - int timeout_ms, int resource_verbose, gboolean force, - int check_level) - { -+ const char *action = NULL; - GHashTable *params_copy = NULL; - crm_exit_t exit_code = CRM_EX_OK; - svc_action_t *op = NULL; - - if (pcmk__str_eq(rsc_class, PCMK_RESOURCE_CLASS_STONITH, pcmk__str_casei)) { - out->err(out, "Sorry, the %s option doesn't support %s resources yet", -- action, rsc_class); -+ rsc_action, rsc_class); -+ crm_exit(CRM_EX_UNIMPLEMENT_FEATURE); -+ } else if 
(pcmk__strcase_any_of(rsc_class, PCMK_RESOURCE_CLASS_SYSTEMD, -+ PCMK_RESOURCE_CLASS_UPSTART, PCMK_RESOURCE_CLASS_NAGIOS, NULL)) { -+ out->err(out, "Sorry, the %s option doesn't support %s resources", -+ rsc_action, rsc_class); -+ crm_exit(CRM_EX_UNIMPLEMENT_FEATURE); -+ } else if (pcmk__str_eq(rsc_class, PCMK_RESOURCE_CLASS_SERVICE, -+ pcmk__str_casei) && !pcmk__str_eq( -+ resources_find_service_class(rsc_name), PCMK_RESOURCE_CLASS_LSB, -+ pcmk__str_casei)) { -+ out->err(out, "Sorry, the %s option doesn't support %s resources", -+ rsc_action, resources_find_service_class(rsc_name)); - crm_exit(CRM_EX_UNIMPLEMENT_FEATURE); - } - -+ action = get_action(rsc_action); -+ - /* If no timeout was provided, grab the default. */ - if (timeout_ms == 0) { - timeout_ms = crm_get_msec(CRM_DEFAULT_OP_TIMEOUT_S); -@@ -1766,7 +1801,7 @@ cli_resource_execute_from_params(pcmk__output_t *out, const char *rsc_name, - exit_code = op->rc; - - out->message(out, "resource-agent-action", resource_verbose, rsc_class, -- rsc_prov, rsc_type, rsc_name, action, override_hash, op->rc, -+ rsc_prov, rsc_type, rsc_name, rsc_action, override_hash, op->rc, - op->status, op->stdout_data, op->stderr_data); - } else { - exit_code = op->rc == 0 ? 
CRM_EX_ERROR : op->rc; -@@ -1790,27 +1825,15 @@ cli_resource_execute(pe_resource_t *rsc, const char *requested_name, - const char *rtype = NULL; - const char *rprov = NULL; - const char *rclass = NULL; -- const char *action = NULL; - GHashTable *params = NULL; - -- if (pcmk__str_eq(rsc_action, "validate", pcmk__str_casei)) { -- action = "validate-all"; -- -- } else if (pcmk__str_eq(rsc_action, "force-check", pcmk__str_casei)) { -- action = "monitor"; -- -- } else if (pcmk__str_eq(rsc_action, "force-stop", pcmk__str_casei)) { -- action = rsc_action+6; -- -- } else if (pcmk__strcase_any_of(rsc_action, "force-start", "force-demote", -+ if (pcmk__strcase_any_of(rsc_action, "force-start", "force-demote", - "force-promote", NULL)) { -- action = rsc_action+6; -- - if(pe_rsc_is_clone(rsc)) { - GList *nodes = cli_resource_search(rsc, requested_name, data_set); - if(nodes != NULL && force == FALSE) { - out->err(out, "It is not safe to %s %s here: the cluster claims it is already active", -- action, rsc->id); -+ rsc_action, rsc->id); - out->err(out, "Try setting target-role=Stopped first or specifying " - "the force option"); - return CRM_EX_UNSAFE; -@@ -1818,9 +1841,6 @@ cli_resource_execute(pe_resource_t *rsc, const char *requested_name, - - g_list_free_full(nodes, free); - } -- -- } else { -- action = rsc_action; - } - - if(pe_rsc_is_clone(rsc)) { -@@ -1831,6 +1851,9 @@ cli_resource_execute(pe_resource_t *rsc, const char *requested_name, - if(rsc->variant == pe_group) { - out->err(out, "Sorry, the %s option doesn't support group resources", rsc_action); - return CRM_EX_UNIMPLEMENT_FEATURE; -+ } else if (rsc->variant == pe_container || pe_rsc_is_bundled(rsc)) { -+ out->err(out, "Sorry, the %s option doesn't support bundled resources", rsc_action); -+ return CRM_EX_UNIMPLEMENT_FEATURE; - } - - rclass = crm_element_value(rsc->xml, XML_AGENT_ATTR_CLASS); -@@ -1841,12 +1864,12 @@ cli_resource_execute(pe_resource_t *rsc, const char *requested_name, - data_set); - - if 
(timeout_ms == 0) { -- timeout_ms = pe_get_configured_timeout(rsc, action, data_set); -+ timeout_ms = pe_get_configured_timeout(rsc, get_action(rsc_action), data_set); - } - - rid = pe_rsc_is_anon_clone(rsc->parent)? requested_name : rsc->id; - -- exit_code = cli_resource_execute_from_params(out, rid, rclass, rprov, rtype, action, -+ exit_code = cli_resource_execute_from_params(out, rid, rclass, rprov, rtype, rsc_action, - params, override_hash, timeout_ms, - resource_verbose, force, check_level); - return exit_code; --- -1.8.3.1 - - -From 289cd231186755d99c1262eb9f968dc852409588 Mon Sep 17 00:00:00 2001 -From: Oyvind Albrigtsen -Date: Fri, 16 Jul 2021 13:20:55 +0200 -Subject: [PATCH 2/2] Refactor: crm_resource: remove duplicate Overriding - message that's handled elsewhere - ---- - tools/crm_resource_runtime.c | 2 -- - 1 file changed, 2 deletions(-) - -diff --git a/tools/crm_resource_runtime.c b/tools/crm_resource_runtime.c -index 59e6df5..ce037c5 100644 ---- a/tools/crm_resource_runtime.c -+++ b/tools/crm_resource_runtime.c -@@ -1791,8 +1791,6 @@ cli_resource_execute_from_params(pcmk__output_t *out, const char *rsc_name, - - g_hash_table_iter_init(&iter, override_hash); - while (g_hash_table_iter_next(&iter, (gpointer *) & name, (gpointer *) & value)) { -- out->info(out, "Overriding the cluster configuration for '%s' with '%s' = '%s'", -- rsc_name, name, value); - g_hash_table_replace(op->params, strdup(name), strdup(value)); - } - } --- -1.8.3.1 - diff --git a/SOURCES/009-fencing-reasons.patch b/SOURCES/009-fencing-reasons.patch new file mode 100644 index 0000000..3fb5bc7 --- /dev/null +++ b/SOURCES/009-fencing-reasons.patch @@ -0,0 +1,2985 @@ +From fcd42a5926e9a63d425586552ecc7b543838d352 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Thu, 11 Nov 2021 16:57:03 -0600 +Subject: [PATCH 01/23] Feature: fencer: pass full result in async command + replies + +The services library callbacks for async commands, which call +send_async_reply() -> 
construct_async_reply() to create the reply, now add +fields for exit status, operation status, and exit reason, in addition to the +existing action standard output and legacy return code. + +Nothing uses the new fields yet. +--- + daemons/fenced/fenced_commands.c | 10 ++++------ + 1 file changed, 4 insertions(+), 6 deletions(-) + +diff --git a/daemons/fenced/fenced_commands.c b/daemons/fenced/fenced_commands.c +index f34cb4f136..3497428c18 100644 +--- a/daemons/fenced/fenced_commands.c ++++ b/daemons/fenced/fenced_commands.c +@@ -2415,9 +2415,8 @@ send_async_reply(async_command_t *cmd, const pcmk__action_result_t *result, + if (stand_alone) { + /* Do notification with a clean data object */ + xmlNode *notify_data = create_xml_node(NULL, T_STONITH_NOTIFY_FENCE); +- int rc = pcmk_rc2legacy(stonith__result2rc(result)); + +- crm_xml_add_int(notify_data, F_STONITH_RC, rc); ++ stonith__xe_set_result(notify_data, result); + crm_xml_add(notify_data, F_STONITH_TARGET, cmd->victim); + crm_xml_add(notify_data, F_STONITH_OPERATION, cmd->op); + crm_xml_add(notify_data, F_STONITH_DELEGATE, "localhost"); +@@ -2425,7 +2424,7 @@ send_async_reply(async_command_t *cmd, const pcmk__action_result_t *result, + crm_xml_add(notify_data, F_STONITH_REMOTE_OP_ID, cmd->remote_op_id); + crm_xml_add(notify_data, F_STONITH_ORIGIN, cmd->client); + +- do_stonith_notify(T_STONITH_NOTIFY_FENCE, rc, notify_data); ++ do_stonith_notify(T_STONITH_NOTIFY_FENCE, pcmk_rc2legacy(stonith__result2rc(result)), notify_data); + do_stonith_notify(T_STONITH_NOTIFY_HISTORY, pcmk_ok, NULL); + } + } +@@ -2728,9 +2727,8 @@ construct_async_reply(async_command_t *cmd, const pcmk__action_result_t *result) + crm_xml_add(reply, F_STONITH_ORIGIN, cmd->origin); + crm_xml_add_int(reply, F_STONITH_CALLID, cmd->id); + crm_xml_add_int(reply, F_STONITH_CALLOPTS, cmd->options); +- crm_xml_add_int(reply, F_STONITH_RC, +- pcmk_rc2legacy(stonith__result2rc(result))); +- crm_xml_add(reply, F_STONITH_OUTPUT, result->action_stdout); ++ 
++ stonith__xe_set_result(reply, result); + return reply; + } + +-- +2.27.0 + + +From 4bac2e9811872f92571e4f5a47d8c5032cfc3016 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Thu, 18 Nov 2021 12:41:29 -0600 +Subject: [PATCH 02/23] Refactor: fencer: track full result for direct agent + actions + +This renames stonith_device_action() to execute_agent_action() for readability, +and has it set a full result rather than return a legacy return code. + +As of this commit, handle_request() just maps the result back to a legacy code, +but it will make better use of it with planned changes. +--- + daemons/fenced/fenced_commands.c | 95 +++++++++++++++++++------------- + 1 file changed, 56 insertions(+), 39 deletions(-) + +diff --git a/daemons/fenced/fenced_commands.c b/daemons/fenced/fenced_commands.c +index 3497428c18..2f59ef84b7 100644 +--- a/daemons/fenced/fenced_commands.c ++++ b/daemons/fenced/fenced_commands.c +@@ -1729,23 +1729,6 @@ stonith_level_remove(xmlNode *msg, char **desc) + return pcmk_ok; + } + +-/*! +- * \internal +- * \brief Schedule an (asynchronous) action directly on a stonith device +- * +- * Handle a STONITH_OP_EXEC API message by scheduling a requested agent action +- * directly on a specified device. Only list, monitor, and status actions are +- * expected to use this call, though it should work with any agent command. +- * +- * \param[in] msg API message XML with desired action +- * \param[out] output Unused +- * +- * \return -EINPROGRESS on success, -errno otherwise +- * \note If the action is monitor, the device must be registered via the API +- * (CIB registration is not sufficient), because monitor should not be +- * possible unless the device is "started" (API registered). 
+- */ +- + static char * + list_to_string(GList *list, const char *delim, gboolean terminate_with_delim) + { +@@ -1778,8 +1761,23 @@ list_to_string(GList *list, const char *delim, gboolean terminate_with_delim) + return rv; + } + +-static int +-stonith_device_action(xmlNode * msg, char **output) ++/*! ++ * \internal ++ * \brief Execute a fence agent action directly (and asynchronously) ++ * ++ * Handle a STONITH_OP_EXEC API message by scheduling a requested agent action ++ * directly on a specified device. Only list, monitor, and status actions are ++ * expected to use this call, though it should work with any agent command. ++ * ++ * \param[in] msg Request XML specifying action ++ * \param[out] result Where to store result of action ++ * ++ * \note If the action is monitor, the device must be registered via the API ++ * (CIB registration is not sufficient), because monitor should not be ++ * possible unless the device is "started" (API registered). ++ */ ++static void ++execute_agent_action(xmlNode *msg, pcmk__action_result_t *result) + { + xmlNode *dev = get_xpath_object("//" F_STONITH_DEVICE, msg, LOG_ERR); + xmlNode *op = get_xpath_object("//@" F_STONITH_ACTION, msg, LOG_ERR); +@@ -1792,39 +1790,56 @@ stonith_device_action(xmlNode * msg, char **output) + crm_info("Malformed API action request: device %s, action %s", + (id? id : "not specified"), + (action? 
action : "not specified")); +- return -EPROTO; ++ fenced_set_protocol_error(result); ++ return; + } + + if (pcmk__str_eq(id, STONITH_WATCHDOG_ID, pcmk__str_none)) { ++ // Watchdog agent actions are implemented internally + if (stonith_watchdog_timeout_ms <= 0) { +- return -ENODEV; +- } else { +- if (pcmk__str_eq(action, "list", pcmk__str_casei)) { +- *output = list_to_string(stonith_watchdog_targets, "\n", TRUE); +- return pcmk_ok; +- } else if (pcmk__str_eq(action, "monitor", pcmk__str_casei)) { +- return pcmk_ok; +- } ++ pcmk__set_result(result, CRM_EX_ERROR, PCMK_EXEC_NO_FENCE_DEVICE, ++ "Watchdog fence device not configured"); ++ return; ++ ++ } else if (pcmk__str_eq(action, "list", pcmk__str_casei)) { ++ pcmk__set_result(result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); ++ pcmk__set_result_output(result, ++ list_to_string(stonith_watchdog_targets, ++ "\n", TRUE), ++ NULL); ++ return; ++ ++ } else if (pcmk__str_eq(action, "monitor", pcmk__str_casei)) { ++ pcmk__set_result(result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); ++ return; + } + } + + device = g_hash_table_lookup(device_list, id); +- if ((device == NULL) +- || (!device->api_registered && !strcmp(action, "monitor"))) { ++ if (device == NULL) { ++ crm_info("Ignoring API '%s' action request because device %s not found", ++ action, id); ++ pcmk__set_result(result, CRM_EX_ERROR, PCMK_EXEC_NO_FENCE_DEVICE, ++ NULL); ++ return; + ++ } else if (!device->api_registered && !strcmp(action, "monitor")) { + // Monitors may run only on "started" (API-registered) devices +- crm_info("Ignoring API '%s' action request because device %s not found", ++ crm_info("Ignoring API '%s' action request because device %s not active", + action, id); +- return -ENODEV; ++ pcmk__set_result(result, CRM_EX_ERROR, PCMK_EXEC_NO_FENCE_DEVICE, ++ "Fence device not active"); ++ return; + } + + cmd = create_async_command(msg); + if (cmd == NULL) { +- return -EPROTO; ++ fenced_set_protocol_error(result); ++ return; + } + + schedule_stonith_command(cmd, 
device); +- return -EINPROGRESS; ++ pcmk__set_result(result, CRM_EX_OK, PCMK_EXEC_PENDING, NULL); + } + + static void +@@ -2911,8 +2926,8 @@ handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, + + xmlNode *data = NULL; + bool need_reply = true; ++ pcmk__action_result_t result = PCMK__UNKNOWN_RESULT; + +- char *output = NULL; + const char *op = crm_element_value(request, F_STONITH_OPERATION); + const char *client_id = crm_element_value(request, F_STONITH_CLIENTID); + +@@ -2935,8 +2950,9 @@ handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, + need_reply = false; + + } else if (pcmk__str_eq(op, STONITH_OP_EXEC, pcmk__str_none)) { +- rc = stonith_device_action(request, &output); +- need_reply = (rc != -EINPROGRESS); ++ execute_agent_action(request, &result); ++ need_reply = (result.execution_status != PCMK_EXEC_PENDING); ++ rc = pcmk_rc2legacy(stonith__result2rc(&result)); + + } else if (pcmk__str_eq(op, STONITH_OP_TIMEOUT_UPDATE, pcmk__str_none)) { + const char *call_id = crm_element_value(request, F_STONITH_CALLID); +@@ -3150,19 +3166,20 @@ handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, + done: + // Reply if result is known + if (need_reply) { +- xmlNode *reply = stonith_construct_reply(request, output, data, rc); ++ xmlNode *reply = stonith_construct_reply(request, result.action_stdout, data, rc); + + stonith_send_reply(reply, call_options, remote_peer, client_id); + free_xml(reply); + } + +- free(output); + free_xml(data); + + crm_debug("Processed %s request from %s %s: %s (rc=%d)", + op, ((client == NULL)? "peer" : "client"), + ((client == NULL)? remote_peer : pcmk__client_name(client)), + ((rc > 0)? 
"" : pcmk_strerror(rc)), rc); ++ ++ pcmk__reset_result(&result); + } + + static void +-- +2.27.0 + + +From 9601b2aff1ea6a4eef0bb2701c22c1e971a657eb Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Thu, 18 Nov 2021 17:31:20 -0600 +Subject: [PATCH 03/23] Refactor: fencer: track full result for local fencing + +This renames stonith_fence() to fence_locally() for readability, and has it set +a full result rather than return a legacy return code. + +As of this commit, handle_request() just maps the result back to a legacy code, +but it will make better use of it with planned changes. +--- + daemons/fenced/fenced_commands.c | 38 +++++++++++++++++++++----------- + 1 file changed, 25 insertions(+), 13 deletions(-) + +diff --git a/daemons/fenced/fenced_commands.c b/daemons/fenced/fenced_commands.c +index 2f59ef84b7..bfb0d71e5f 100644 +--- a/daemons/fenced/fenced_commands.c ++++ b/daemons/fenced/fenced_commands.c +@@ -2626,37 +2626,49 @@ stonith_fence_get_devices_cb(GList * devices, void *user_data) + } + } + +-static int +-stonith_fence(xmlNode * msg) ++/*! 
++ * \internal ++ * \brief Execute a fence action via the local node ++ * ++ * \param[in] msg Fencing request ++ * \param[out] result Where to store result of fence action ++ */ ++static void ++fence_locally(xmlNode *msg, pcmk__action_result_t *result) + { + const char *device_id = NULL; + stonith_device_t *device = NULL; + async_command_t *cmd = create_async_command(msg); + xmlNode *dev = get_xpath_object("//@" F_STONITH_TARGET, msg, LOG_ERR); + ++ CRM_CHECK(result != NULL, return); ++ + if (cmd == NULL) { +- return -EPROTO; ++ fenced_set_protocol_error(result); ++ return; + } + + device_id = crm_element_value(dev, F_STONITH_DEVICE); +- if (device_id) { ++ if (device_id != NULL) { + device = g_hash_table_lookup(device_list, device_id); + if (device == NULL) { + crm_err("Requested device '%s' is not available", device_id); +- return -ENODEV; ++ pcmk__set_result(result, CRM_EX_ERROR, PCMK_EXEC_NO_FENCE_DEVICE, ++ "Requested fence device not found"); ++ return; + } + schedule_stonith_command(cmd, device); + + } else { + const char *host = crm_element_value(dev, F_STONITH_TARGET); + +- if (cmd->options & st_opt_cs_nodeid) { +- int nodeid; +- crm_node_t *node; ++ if (pcmk_is_set(cmd->options, st_opt_cs_nodeid)) { ++ int nodeid = 0; ++ crm_node_t *node = NULL; + + pcmk__scan_min_int(host, &nodeid, 0); + node = pcmk__search_known_node_cache(nodeid, NULL, CRM_GET_PEER_ANY); +- if (node) { ++ if (node != NULL) { + host = node->uname; + } + } +@@ -2666,7 +2678,7 @@ stonith_fence(xmlNode * msg) + TRUE, cmd, stonith_fence_get_devices_cb); + } + +- return -EINPROGRESS; ++ pcmk__set_result(result, CRM_EX_OK, PCMK_EXEC_PENDING, NULL); + } + + xmlNode * +@@ -3016,9 +3028,9 @@ handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, + } + + } else if (pcmk__str_eq(op, STONITH_OP_FENCE, pcmk__str_none)) { +- +- if (remote_peer || stand_alone) { +- rc = stonith_fence(request); ++ if ((remote_peer != NULL) || stand_alone) { ++ fence_locally(request, &result); ++ rc = 
pcmk_rc2legacy(stonith__result2rc(&result)); + + } else if (pcmk_is_set(call_options, st_opt_manual_ack)) { + switch (fenced_handle_manual_confirmation(client, request)) { +-- +2.27.0 + + +From b7c7676cfd36fd72d3b29e86a23db97081e19b03 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Mon, 29 Nov 2021 17:06:52 -0600 +Subject: [PATCH 04/23] Low: fencer: handle topology level registration errors + better + +Rename stonith_level_register() to fenced_register_level() for consistency, and +refactor it to return a full result rather than a legacy return code. + +Return a protocol error for missing information in the request XML, and log +invalid level numbers at warning level. Use a new combination of +PCMK_EXEC_INVALID with CRM_EX_INVALID_PARAM for invalid levels, so it gets +mapped back to the legacy code -EINVAL (which was returned before). +--- + daemons/fenced/fenced_commands.c | 52 +++++++++++++++++++++---------- + daemons/fenced/pacemaker-fenced.c | 9 +++--- + daemons/fenced/pacemaker-fenced.h | 3 +- + lib/fencing/st_actions.c | 1 + + 4 files changed, 44 insertions(+), 21 deletions(-) + +diff --git a/daemons/fenced/fenced_commands.c b/daemons/fenced/fenced_commands.c +index bfb0d71e5f..975f8633a4 100644 +--- a/daemons/fenced/fenced_commands.c ++++ b/daemons/fenced/fenced_commands.c +@@ -1583,20 +1583,19 @@ parse_device_list(const char *devices) + + /*! + * \internal +- * \brief Register a STONITH level for a target ++ * \brief Register a fencing topology level for a target + * + * Given an XML request specifying the target name, level index, and device IDs + * for the level, this will create an entry for the target in the global topology + * table if one does not already exist, then append the specified device IDs to + * the entry's device list for the specified level. 
+ * +- * \param[in] msg XML request for STONITH level registration +- * \param[out] desc If not NULL, will be set to string representation ("TARGET[LEVEL]") +- * +- * \return pcmk_ok on success, -EINVAL if XML does not specify valid level index ++ * \param[in] msg XML request for STONITH level registration ++ * \param[out] desc If not NULL, set to string representation "TARGET[LEVEL]" ++ * \param[out] result Where to set result of registration + */ +-int +-stonith_level_register(xmlNode *msg, char **desc) ++void ++fenced_register_level(xmlNode *msg, char **desc, pcmk__action_result_t *result) + { + int id = 0; + xmlNode *level; +@@ -1607,6 +1606,13 @@ stonith_level_register(xmlNode *msg, char **desc) + stonith_key_value_t *dIter = NULL; + stonith_key_value_t *devices = NULL; + ++ CRM_CHECK(result != NULL, return); ++ ++ if (msg == NULL) { ++ fenced_set_protocol_error(result); ++ return; ++ } ++ + /* Allow the XML here to point to the level tag directly, or wrapped in + * another tag. If directly, don't search by xpath, because it might give + * multiple hits (e.g. if the XML is the CIB). 
+@@ -1614,11 +1620,15 @@ stonith_level_register(xmlNode *msg, char **desc) + if (pcmk__str_eq(TYPE(msg), XML_TAG_FENCING_LEVEL, pcmk__str_casei)) { + level = msg; + } else { +- level = get_xpath_object("//" XML_TAG_FENCING_LEVEL, msg, LOG_ERR); ++ level = get_xpath_object("//" XML_TAG_FENCING_LEVEL, msg, LOG_WARNING); ++ } ++ if (level == NULL) { ++ fenced_set_protocol_error(result); ++ return; + } +- CRM_CHECK(level != NULL, return -EINVAL); + + mode = stonith_level_kind(level); ++ + target = stonith_level_key(level, mode); + crm_element_value_int(level, XML_ATTR_STONITH_INDEX, &id); + +@@ -1626,18 +1636,26 @@ stonith_level_register(xmlNode *msg, char **desc) + *desc = crm_strdup_printf("%s[%d]", target, id); + } + +- /* Sanity-check arguments */ +- if (mode >= 3 || (id <= 0) || (id >= ST_LEVEL_MAX)) { +- crm_trace("Could not add %s[%d] (%d) to the topology (%d active entries)", target, id, mode, g_hash_table_size(topology)); ++ // Ensure level ID is in allowed range ++ if ((id <= 0) || (id >= ST_LEVEL_MAX)) { ++ crm_warn("Ignoring topology registration for %s with invalid level %d", ++ target, id); + free(target); +- crm_log_xml_err(level, "Bad topology"); +- return -EINVAL; ++ crm_log_xml_warn(level, "Bad level"); ++ pcmk__set_result(result, CRM_EX_INVALID_PARAM, PCMK_EXEC_INVALID, ++ "Invalid topology level"); ++ return; + } + + /* Find or create topology table entry */ + tp = g_hash_table_lookup(topology, target); + if (tp == NULL) { + tp = calloc(1, sizeof(stonith_topology_t)); ++ if (tp == NULL) { ++ pcmk__set_result(result, CRM_EX_ERROR, PCMK_EXEC_ERROR, ++ strerror(ENOMEM)); ++ return; ++ } + tp->kind = mode; + tp->target = target; + tp->target_value = crm_element_value_copy(level, XML_ATTR_STONITH_TARGET_VALUE); +@@ -1671,7 +1689,8 @@ stonith_level_register(xmlNode *msg, char **desc) + crm_info("Target %s has %d active fencing level%s", + tp->target, nlevels, pcmk__plural_s(nlevels)); + } +- return pcmk_ok; ++ ++ pcmk__set_result(result, CRM_EX_OK, 
PCMK_EXEC_DONE, NULL); + } + + int +@@ -3142,7 +3161,8 @@ handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, + char *device_id = NULL; + + if (is_privileged(client, op)) { +- rc = stonith_level_register(request, &device_id); ++ fenced_register_level(request, &device_id, &result); ++ rc = pcmk_rc2legacy(stonith__result2rc(&result)); + } else { + rc = -EACCES; + } +diff --git a/daemons/fenced/pacemaker-fenced.c b/daemons/fenced/pacemaker-fenced.c +index 0a8b3bf6f2..469304f67c 100644 +--- a/daemons/fenced/pacemaker-fenced.c ++++ b/daemons/fenced/pacemaker-fenced.c +@@ -452,8 +452,8 @@ remove_cib_device(xmlXPathObjectPtr xpathObj) + static void + handle_topology_change(xmlNode *match, bool remove) + { +- int rc; + char *desc = NULL; ++ pcmk__action_result_t result = PCMK__UNKNOWN_RESULT; + + CRM_CHECK(match != NULL, return); + crm_trace("Updating %s", ID(match)); +@@ -467,9 +467,10 @@ handle_topology_change(xmlNode *match, bool remove) + free(key); + } + +- rc = stonith_level_register(match, &desc); +- do_stonith_notify_level(STONITH_OP_LEVEL_ADD, rc, desc); +- ++ fenced_register_level(match, &desc, &result); ++ do_stonith_notify_level(STONITH_OP_LEVEL_ADD, ++ pcmk_rc2legacy(stonith__result2rc(&result)), desc); ++ pcmk__reset_result(&result); + free(desc); + } + +diff --git a/daemons/fenced/pacemaker-fenced.h b/daemons/fenced/pacemaker-fenced.h +index 5162ada75d..cf114fb979 100644 +--- a/daemons/fenced/pacemaker-fenced.h ++++ b/daemons/fenced/pacemaker-fenced.h +@@ -218,7 +218,8 @@ void stonith_device_remove(const char *id, bool from_cib); + + char *stonith_level_key(xmlNode * msg, int mode); + int stonith_level_kind(xmlNode * msg); +-int stonith_level_register(xmlNode * msg, char **desc); ++void fenced_register_level(xmlNode *msg, char **desc, ++ pcmk__action_result_t *result); + + int stonith_level_remove(xmlNode * msg, char **desc); + +diff --git a/lib/fencing/st_actions.c b/lib/fencing/st_actions.c +index 7eaa8b0f2b..37fa849847 100644 +--- 
a/lib/fencing/st_actions.c ++++ b/lib/fencing/st_actions.c +@@ -325,6 +325,7 @@ stonith__result2rc(const pcmk__action_result_t *result) + */ + case PCMK_EXEC_INVALID: + switch (result->exit_status) { ++ case CRM_EX_INVALID_PARAM: return EINVAL; + case CRM_EX_INSUFFICIENT_PRIV: return EACCES; + case CRM_EX_PROTOCOL: return EPROTO; + +-- +2.27.0 + + +From 27cedca4070328ecac1761f81c2890059af19dcf Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Mon, 29 Nov 2021 17:29:38 -0600 +Subject: [PATCH 05/23] Low: fencer: handle topology level unregistration + errors better + +Rename stonith_level_remove() to fenced_unregister_level() for consistency, and +refactor it to return a full result rather than a legacy return code. + +Return a protocol error for missing information in the request XML, and log +invalid level numbers at warning level. Use PCMK_EXEC_INVALID with +CRM_EX_INVALID_PARAM for invalid levels, so it gets mapped back to the legacy +code -EINVAL (which reverses the recent change in ec60f014b, both for backward +compatibility and because it makes sense -- a missing parameter is a protocol +error, while an invalid parameter is an invalid parameter error). +--- + daemons/fenced/fenced_commands.c | 52 ++++++++++++++++++++++++------- + daemons/fenced/pacemaker-fenced.c | 9 +++--- + daemons/fenced/pacemaker-fenced.h | 4 +-- + 3 files changed, 48 insertions(+), 17 deletions(-) + +diff --git a/daemons/fenced/fenced_commands.c b/daemons/fenced/fenced_commands.c +index 975f8633a4..ef41dc0e52 100644 +--- a/daemons/fenced/fenced_commands.c ++++ b/daemons/fenced/fenced_commands.c +@@ -1693,25 +1693,54 @@ fenced_register_level(xmlNode *msg, char **desc, pcmk__action_result_t *result) + pcmk__set_result(result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); + } + +-int +-stonith_level_remove(xmlNode *msg, char **desc) ++/*! 
++ * \internal ++ * \brief Unregister a fencing topology level for a target ++ * ++ * Given an XML request specifying the target name and level index (or 0 for all ++ * levels), this will remove any corresponding entry for the target from the ++ * global topology table. ++ * ++ * \param[in] msg XML request for STONITH level registration ++ * \param[out] desc If not NULL, set to string representation "TARGET[LEVEL]" ++ * \param[out] result Where to set result of unregistration ++ */ ++void ++fenced_unregister_level(xmlNode *msg, char **desc, ++ pcmk__action_result_t *result) + { + int id = -1; + stonith_topology_t *tp; + char *target; ++ xmlNode *level = NULL; ++ ++ CRM_CHECK(result != NULL, return); + +- /* Unlike additions, removal requests should always have one level tag */ +- xmlNode *level = get_xpath_object("//" XML_TAG_FENCING_LEVEL, msg, LOG_ERR); ++ if (msg == NULL) { ++ fenced_set_protocol_error(result); ++ return; ++ } + +- CRM_CHECK(level != NULL, return -EPROTO); ++ // Unlike additions, removal requests should always have one level tag ++ level = get_xpath_object("//" XML_TAG_FENCING_LEVEL, msg, LOG_WARNING); ++ if (level == NULL) { ++ fenced_set_protocol_error(result); ++ return; ++ } + + target = stonith_level_key(level, -1); + crm_element_value_int(level, XML_ATTR_STONITH_INDEX, &id); + +- CRM_CHECK((id >= 0) && (id < ST_LEVEL_MAX), +- crm_log_xml_warn(msg, "invalid level"); +- free(target); +- return -EPROTO); ++ // Ensure level ID is in allowed range ++ if ((id < 0) || (id >= ST_LEVEL_MAX)) { ++ crm_warn("Ignoring topology unregistration for %s with invalid level %d", ++ target, id); ++ free(target); ++ crm_log_xml_warn(level, "Bad level"); ++ pcmk__set_result(result, CRM_EX_INVALID_PARAM, PCMK_EXEC_INVALID, ++ "Invalid topology level"); ++ return; ++ } + + if (desc) { + *desc = crm_strdup_printf("%s[%d]", target, id); +@@ -1745,7 +1774,7 @@ stonith_level_remove(xmlNode *msg, char **desc) + } + + free(target); +- return pcmk_ok; ++ 
pcmk__set_result(result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); + } + + static char * +@@ -3173,7 +3202,8 @@ handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, + char *device_id = NULL; + + if (is_privileged(client, op)) { +- rc = stonith_level_remove(request, &device_id); ++ fenced_unregister_level(request, &device_id, &result); ++ rc = pcmk_rc2legacy(stonith__result2rc(&result)); + } else { + rc = -EACCES; + } +diff --git a/daemons/fenced/pacemaker-fenced.c b/daemons/fenced/pacemaker-fenced.c +index 469304f67c..56acc93f31 100644 +--- a/daemons/fenced/pacemaker-fenced.c ++++ b/daemons/fenced/pacemaker-fenced.c +@@ -409,17 +409,18 @@ do_stonith_notify_level(const char *op, int rc, const char *desc) + static void + topology_remove_helper(const char *node, int level) + { +- int rc; + char *desc = NULL; ++ pcmk__action_result_t result = PCMK__UNKNOWN_RESULT; + xmlNode *data = create_xml_node(NULL, XML_TAG_FENCING_LEVEL); + + crm_xml_add(data, F_STONITH_ORIGIN, __func__); + crm_xml_add_int(data, XML_ATTR_STONITH_INDEX, level); + crm_xml_add(data, XML_ATTR_STONITH_TARGET, node); + +- rc = stonith_level_remove(data, &desc); +- do_stonith_notify_level(STONITH_OP_LEVEL_DEL, rc, desc); +- ++ fenced_unregister_level(data, &desc, &result); ++ do_stonith_notify_level(STONITH_OP_LEVEL_DEL, ++ pcmk_rc2legacy(stonith__result2rc(&result)), desc); ++ pcmk__reset_result(&result); + free_xml(data); + free(desc); + } +diff --git a/daemons/fenced/pacemaker-fenced.h b/daemons/fenced/pacemaker-fenced.h +index cf114fb979..0006e02e7d 100644 +--- a/daemons/fenced/pacemaker-fenced.h ++++ b/daemons/fenced/pacemaker-fenced.h +@@ -220,8 +220,8 @@ char *stonith_level_key(xmlNode * msg, int mode); + int stonith_level_kind(xmlNode * msg); + void fenced_register_level(xmlNode *msg, char **desc, + pcmk__action_result_t *result); +- +-int stonith_level_remove(xmlNode * msg, char **desc); ++void fenced_unregister_level(xmlNode *msg, char **desc, ++ pcmk__action_result_t *result); + + 
stonith_topology_t *find_topology_for_host(const char *host); + +-- +2.27.0 + + +From 3f603defca78eb2bdd46c51a80ed04a4c773442b Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Fri, 19 Nov 2021 12:22:33 -0600 +Subject: [PATCH 06/23] Log: fencer: track and log full result when handling + requests + +handle_request() now tracks and logs a full result rather than just a +legacy return code. +--- + daemons/fenced/fenced_commands.c | 95 ++++++++++++++++++-------------- + 1 file changed, 53 insertions(+), 42 deletions(-) + +diff --git a/daemons/fenced/fenced_commands.c b/daemons/fenced/fenced_commands.c +index ef41dc0e52..996c18faaa 100644 +--- a/daemons/fenced/fenced_commands.c ++++ b/daemons/fenced/fenced_commands.c +@@ -2981,9 +2981,7 @@ static void + handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, + xmlNode *request, const char *remote_peer) + { +- int call_options = 0; +- int rc = -EOPNOTSUPP; +- ++ int call_options = st_opt_none; + xmlNode *data = NULL; + bool need_reply = true; + pcmk__action_result_t result = PCMK__UNKNOWN_RESULT; +@@ -3006,13 +3004,12 @@ handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, + pcmk__ipc_send_xml(client, id, reply, flags); + client->request_id = 0; + free_xml(reply); +- rc = pcmk_ok; ++ pcmk__set_result(&result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); + need_reply = false; + + } else if (pcmk__str_eq(op, STONITH_OP_EXEC, pcmk__str_none)) { + execute_agent_action(request, &result); + need_reply = (result.execution_status != PCMK_EXEC_PENDING); +- rc = pcmk_rc2legacy(stonith__result2rc(&result)); + + } else if (pcmk__str_eq(op, STONITH_OP_TIMEOUT_UPDATE, pcmk__str_none)) { + const char *call_id = crm_element_value(request, F_STONITH_CALLID); +@@ -3021,7 +3018,7 @@ handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, + + crm_element_value_int(request, F_STONITH_TIMEOUT, &op_timeout); + do_stonith_async_timeout_update(client_id, call_id, op_timeout); +- rc = pcmk_ok; ++ 
pcmk__set_result(&result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); + need_reply = false; + + } else if (pcmk__str_eq(op, STONITH_OP_QUERY, pcmk__str_none)) { +@@ -3033,7 +3030,7 @@ handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, + remove_relay_op(request); + + stonith_query(request, remote_peer, client_id, call_options); +- rc = pcmk_ok; ++ pcmk__set_result(&result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); + need_reply = false; + + } else if (pcmk__str_eq(op, T_STONITH_NOTIFY, pcmk__str_none)) { +@@ -3055,7 +3052,7 @@ handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, + } + + pcmk__ipc_send_ack(client, id, flags, "ack", CRM_EX_OK); +- rc = pcmk_ok; ++ pcmk__set_result(&result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); + need_reply = false; + + } else if (pcmk__str_eq(op, STONITH_OP_RELAY, pcmk__str_none)) { +@@ -3069,27 +3066,27 @@ handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, + crm_element_value(dev, F_STONITH_TARGET)); + + if (initiate_remote_stonith_op(NULL, request, FALSE) == NULL) { +- rc = -EPROTO; ++ fenced_set_protocol_error(&result); + } else { +- rc = -EINPROGRESS; ++ pcmk__set_result(&result, CRM_EX_OK, PCMK_EXEC_PENDING, NULL); + need_reply = false; + } + + } else if (pcmk__str_eq(op, STONITH_OP_FENCE, pcmk__str_none)) { + if ((remote_peer != NULL) || stand_alone) { + fence_locally(request, &result); +- rc = pcmk_rc2legacy(stonith__result2rc(&result)); + + } else if (pcmk_is_set(call_options, st_opt_manual_ack)) { + switch (fenced_handle_manual_confirmation(client, request)) { + case pcmk_rc_ok: +- rc = pcmk_ok; ++ pcmk__set_result(&result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); + break; + case EINPROGRESS: +- rc = -EINPROGRESS; ++ pcmk__set_result(&result, CRM_EX_OK, PCMK_EXEC_PENDING, ++ NULL); + break; + default: +- rc = -EPROTO; ++ fenced_set_protocol_error(&result); + break; + } + +@@ -3100,17 +3097,15 @@ handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, + const char *action = crm_element_value(dev, 
F_STONITH_ACTION); + const char *device = crm_element_value(dev, F_STONITH_DEVICE); + +- if (client) { ++ if (client != NULL) { + int tolerance = 0; + + crm_notice("Client %s wants to fence (%s) %s using %s", + pcmk__client_name(client), action, + target, (device? device : "any device")); +- + crm_element_value_int(dev, F_STONITH_TOLERANCE, &tolerance); +- + if (stonith_check_fence_tolerance(tolerance, target, action)) { +- rc = pcmk_ok; ++ pcmk__set_result(&result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); + goto done; + } + +@@ -3143,24 +3138,24 @@ handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, + crm_xml_add(request, F_STONITH_REMOTE_OP_ID, op->id); + send_cluster_message(crm_get_peer(0, alternate_host), crm_msg_stonith_ng, request, + FALSE); +- rc = -EINPROGRESS; ++ pcmk__set_result(&result, CRM_EX_OK, PCMK_EXEC_PENDING, NULL); + + } else if (initiate_remote_stonith_op(client, request, FALSE) == NULL) { +- rc = -EPROTO; ++ fenced_set_protocol_error(&result); ++ + } else { +- rc = -EINPROGRESS; ++ pcmk__set_result(&result, CRM_EX_OK, PCMK_EXEC_PENDING, NULL); + } + } +- need_reply = (rc != -EINPROGRESS); ++ need_reply = (result.execution_status != PCMK_EXEC_PENDING); + + } else if (pcmk__str_eq(op, STONITH_OP_FENCE_HISTORY, pcmk__str_none)) { + stonith_fence_history(request, &data, remote_peer, call_options); +- rc = pcmk_ok; ++ pcmk__set_result(&result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); + if (pcmk_is_set(call_options, st_opt_discard_reply)) { + /* we don't expect answers to the broadcast + * we might have sent out + */ +- rc = pcmk_ok; + need_reply = false; + } + +@@ -3168,11 +3163,18 @@ handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, + const char *device_id = NULL; + + if (is_privileged(client, op)) { +- rc = stonith_device_register(request, &device_id, FALSE); ++ int rc = stonith_device_register(request, &device_id, FALSE); ++ ++ pcmk__set_result(&result, ++ ((rc == pcmk_ok)? 
CRM_EX_OK : CRM_EX_ERROR), ++ stonith__legacy2status(rc), ++ ((rc == pcmk_ok)? NULL : pcmk_strerror(rc))); + } else { +- rc = -EACCES; ++ pcmk__set_result(&result, CRM_EX_INSUFFICIENT_PRIV, ++ PCMK_EXEC_INVALID, ++ "Unprivileged users must register device via CIB"); + } +- do_stonith_notify_device(op, rc, device_id); ++ do_stonith_notify_device(op, pcmk_rc2legacy(stonith__result2rc(&result)), device_id); + + } else if (pcmk__str_eq(op, STONITH_OP_DEVICE_DEL, pcmk__str_none)) { + xmlNode *dev = get_xpath_object("//" F_STONITH_DEVICE, request, LOG_ERR); +@@ -3180,22 +3182,25 @@ handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, + + if (is_privileged(client, op)) { + stonith_device_remove(device_id, false); +- rc = pcmk_ok; ++ pcmk__set_result(&result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); + } else { +- rc = -EACCES; ++ pcmk__set_result(&result, CRM_EX_INSUFFICIENT_PRIV, ++ PCMK_EXEC_INVALID, ++ "Unprivileged users must delete device via CIB"); + } +- do_stonith_notify_device(op, rc, device_id); ++ do_stonith_notify_device(op, pcmk_rc2legacy(stonith__result2rc(&result)), device_id); + + } else if (pcmk__str_eq(op, STONITH_OP_LEVEL_ADD, pcmk__str_none)) { + char *device_id = NULL; + + if (is_privileged(client, op)) { + fenced_register_level(request, &device_id, &result); +- rc = pcmk_rc2legacy(stonith__result2rc(&result)); + } else { +- rc = -EACCES; ++ pcmk__set_result(&result, CRM_EX_INSUFFICIENT_PRIV, ++ PCMK_EXEC_INVALID, ++ "Unprivileged users must add level via CIB"); + } +- do_stonith_notify_level(op, rc, device_id); ++ do_stonith_notify_level(op, pcmk_rc2legacy(stonith__result2rc(&result)), device_id); + free(device_id); + + } else if (pcmk__str_eq(op, STONITH_OP_LEVEL_DEL, pcmk__str_none)) { +@@ -3203,11 +3208,12 @@ handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, + + if (is_privileged(client, op)) { + fenced_unregister_level(request, &device_id, &result); +- rc = pcmk_rc2legacy(stonith__result2rc(&result)); + } else { +- rc = 
-EACCES; ++ pcmk__set_result(&result, CRM_EX_INSUFFICIENT_PRIV, ++ PCMK_EXEC_INVALID, ++ "Unprivileged users must delete level via CIB"); + } +- do_stonith_notify_level(op, rc, device_id); ++ do_stonith_notify_level(op, pcmk_rc2legacy(stonith__result2rc(&result)), device_id); + + } else if(pcmk__str_eq(op, CRM_OP_RM_NODE_CACHE, pcmk__str_casei)) { + int node_id = 0; +@@ -3216,31 +3222,36 @@ handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, + crm_element_value_int(request, XML_ATTR_ID, &node_id); + name = crm_element_value(request, XML_ATTR_UNAME); + reap_crm_member(node_id, name); +- rc = pcmk_ok; ++ pcmk__set_result(&result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); + need_reply = false; + + } else { + crm_err("Unknown IPC request %s from %s %s", op, + ((client == NULL)? "peer" : "client"), + ((client == NULL)? remote_peer : pcmk__client_name(client))); ++ pcmk__set_result(&result, CRM_EX_PROTOCOL, PCMK_EXEC_INVALID, ++ "Unknown IPC request type (bug?)"); + } + + done: + // Reply if result is known + if (need_reply) { +- xmlNode *reply = stonith_construct_reply(request, result.action_stdout, data, rc); ++ xmlNode *reply = stonith_construct_reply(request, result.action_stdout, data, ++ pcmk_rc2legacy(stonith__result2rc(&result))); + + stonith_send_reply(reply, call_options, remote_peer, client_id); + free_xml(reply); + } + +- free_xml(data); +- +- crm_debug("Processed %s request from %s %s: %s (rc=%d)", ++ crm_debug("Processed %s request from %s %s: %s%s%s%s", + op, ((client == NULL)? "peer" : "client"), + ((client == NULL)? remote_peer : pcmk__client_name(client)), +- ((rc > 0)? "" : pcmk_strerror(rc)), rc); ++ pcmk_exec_status_str(result.execution_status), ++ (result.exit_reason == NULL)? "" : " (", ++ (result.exit_reason == NULL)? "" : result.exit_reason, ++ (result.exit_reason == NULL)? 
"" : ")"); + ++ free_xml(data); + pcmk__reset_result(&result); + } + +-- +2.27.0 + + +From 5e13199699a4e9279520b3668c072e3db49c9782 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Fri, 19 Nov 2021 15:10:36 -0600 +Subject: [PATCH 07/23] Feature: fencer: pass full result in replies to + requests + +Rename stonith_construct_reply() to fenced_construct_reply() for consistency, +make it take a full result as an argument rather than separate arguments for +legacy return code and output, and add the full result to the reply (along with +the legacy return code, for backward compatibility). + +This is used for peer query replies and some request replies (including replies +to local clients who requested fencing). Other replies, such as those built by +construct_async_reply(), are not affected by this commit. +--- + daemons/fenced/fenced_commands.c | 33 ++++++++++++++++++++++--------- + daemons/fenced/fenced_remote.c | 9 ++++++++- + daemons/fenced/pacemaker-fenced.h | 4 ++-- + 3 files changed, 34 insertions(+), 12 deletions(-) + +diff --git a/daemons/fenced/fenced_commands.c b/daemons/fenced/fenced_commands.c +index 996c18faaa..84f89e8daf 100644 +--- a/daemons/fenced/fenced_commands.c ++++ b/daemons/fenced/fenced_commands.c +@@ -2322,6 +2322,7 @@ stonith_query(xmlNode * msg, const char *remote_peer, const char *client_id, int + const char *target = NULL; + int timeout = 0; + xmlNode *dev = get_xpath_object("//@" F_STONITH_ACTION, msg, LOG_NEVER); ++ pcmk__action_result_t result = PCMK__UNKNOWN_RESULT; + + crm_element_value_int(msg, F_STONITH_TIMEOUT, &timeout); + if (dev) { +@@ -2338,7 +2339,8 @@ stonith_query(xmlNode * msg, const char *remote_peer, const char *client_id, int + crm_log_xml_debug(msg, "Query"); + query = calloc(1, sizeof(struct st_query_data)); + +- query->reply = stonith_construct_reply(msg, NULL, NULL, pcmk_ok); ++ pcmk__set_result(&result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); ++ query->reply = fenced_construct_reply(msg, NULL, &result); + 
query->remote_peer = remote_peer ? strdup(remote_peer) : NULL; + query->client_id = client_id ? strdup(client_id) : NULL; + query->target = target ? strdup(target) : NULL; +@@ -2729,8 +2731,23 @@ fence_locally(xmlNode *msg, pcmk__action_result_t *result) + pcmk__set_result(result, CRM_EX_OK, PCMK_EXEC_PENDING, NULL); + } + ++/*! ++ * \internal ++ * \brief Build an XML reply for a fencing operation ++ * ++ * \param[in] request Request that reply is for ++ * \param[in] data If not NULL, add to reply as call data ++ * \param[in] result Full result of fencing operation ++ * ++ * \return Newly created XML reply ++ * \note The caller is responsible for freeing the result. ++ * \note This has some overlap with construct_async_reply(), but that copies ++ * values from an async_command_t, whereas this one copies them from the ++ * request. ++ */ + xmlNode * +-stonith_construct_reply(xmlNode * request, const char *output, xmlNode * data, int rc) ++fenced_construct_reply(xmlNode *request, xmlNode *data, ++ pcmk__action_result_t *result) + { + xmlNode *reply = NULL; + +@@ -2738,8 +2755,7 @@ stonith_construct_reply(xmlNode * request, const char *output, xmlNode * data, i + + crm_xml_add(reply, "st_origin", __func__); + crm_xml_add(reply, F_TYPE, T_STONITH_NG); +- crm_xml_add(reply, F_STONITH_OUTPUT, output); +- crm_xml_add_int(reply, F_STONITH_RC, rc); ++ stonith__xe_set_result(reply, result); + + if (request == NULL) { + /* Most likely, this is the result of a stonith operation that was +@@ -2749,12 +2765,14 @@ stonith_construct_reply(xmlNode * request, const char *output, xmlNode * data, i + * @TODO Maybe synchronize this information at start-up? 
+ */ + crm_warn("Missing request information for client notifications for " +- "operation with result %d (initiated before we came up?)", rc); ++ "operation with result '%s' (initiated before we came up?)", ++ pcmk_exec_status_str(result->execution_status)); + + } else { + const char *name = NULL; + const char *value = NULL; + ++ // Attributes to copy from request to reply + const char *names[] = { + F_STONITH_OPERATION, + F_STONITH_CALLID, +@@ -2764,8 +2782,6 @@ stonith_construct_reply(xmlNode * request, const char *output, xmlNode * data, i + F_STONITH_CALLOPTS + }; + +- crm_trace("Creating a result reply with%s reply output (rc=%d)", +- (data? "" : "out"), rc); + for (int lpc = 0; lpc < PCMK__NELEM(names); lpc++) { + name = names[lpc]; + value = crm_element_value(request, name); +@@ -3236,8 +3252,7 @@ handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, + done: + // Reply if result is known + if (need_reply) { +- xmlNode *reply = stonith_construct_reply(request, result.action_stdout, data, +- pcmk_rc2legacy(stonith__result2rc(&result))); ++ xmlNode *reply = fenced_construct_reply(request, data, &result); + + stonith_send_reply(reply, call_options, remote_peer, client_id); + free_xml(reply); +diff --git a/daemons/fenced/fenced_remote.c b/daemons/fenced/fenced_remote.c +index 8feb401477..baa07d9e78 100644 +--- a/daemons/fenced/fenced_remote.c ++++ b/daemons/fenced/fenced_remote.c +@@ -415,7 +415,14 @@ handle_local_reply_and_notify(remote_fencing_op_t * op, xmlNode * data, int rc) + crm_xml_add(data, F_STONITH_TARGET, op->target); + crm_xml_add(data, F_STONITH_OPERATION, op->action); + +- reply = stonith_construct_reply(op->request, NULL, data, rc); ++ { ++ pcmk__action_result_t result = PCMK__UNKNOWN_RESULT; ++ ++ pcmk__set_result(&result, ++ ((rc == pcmk_ok)? 
CRM_EX_OK : CRM_EX_ERROR), ++ stonith__legacy2status(rc), NULL); ++ reply = fenced_construct_reply(op->request, data, &result); ++ } + crm_xml_add(reply, F_STONITH_DELEGATE, op->delegate); + + /* Send fencing OP reply to local client that initiated fencing */ +diff --git a/daemons/fenced/pacemaker-fenced.h b/daemons/fenced/pacemaker-fenced.h +index 0006e02e7d..d5f4bc79fd 100644 +--- a/daemons/fenced/pacemaker-fenced.h ++++ b/daemons/fenced/pacemaker-fenced.h +@@ -228,8 +228,8 @@ stonith_topology_t *find_topology_for_host(const char *host); + void do_local_reply(xmlNode * notify_src, const char *client_id, gboolean sync_reply, + gboolean from_peer); + +-xmlNode *stonith_construct_reply(xmlNode * request, const char *output, xmlNode * data, +- int rc); ++xmlNode *fenced_construct_reply(xmlNode *request, xmlNode *data, ++ pcmk__action_result_t *result); + + void + do_stonith_async_timeout_update(const char *client, const char *call_id, int timeout); +-- +2.27.0 + + +From b32aa252b321ff40c834d153cb23f8b3be471611 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Fri, 19 Nov 2021 15:43:20 -0600 +Subject: [PATCH 08/23] Log: fencer: grab and log full result when processing + peer fencing replies + +fenced_process_fencing_reply() now checks for the full result, instead of only +a legacy return code, in peer replies, and uses it in log messages. 
+--- + daemons/fenced/fenced_remote.c | 63 ++++++++++++++++++++-------------- + 1 file changed, 37 insertions(+), 26 deletions(-) + +diff --git a/daemons/fenced/fenced_remote.c b/daemons/fenced/fenced_remote.c +index baa07d9e78..c6369f0051 100644 +--- a/daemons/fenced/fenced_remote.c ++++ b/daemons/fenced/fenced_remote.c +@@ -2095,21 +2095,21 @@ process_remote_stonith_query(xmlNode * msg) + void + fenced_process_fencing_reply(xmlNode *msg) + { +- int rc = 0; + const char *id = NULL; + const char *device = NULL; + remote_fencing_op_t *op = NULL; + xmlNode *dev = get_xpath_object("//@" F_STONITH_REMOTE_OP_ID, msg, LOG_ERR); ++ pcmk__action_result_t result = PCMK__UNKNOWN_RESULT; + + CRM_CHECK(dev != NULL, return); + + id = crm_element_value(dev, F_STONITH_REMOTE_OP_ID); + CRM_CHECK(id != NULL, return); + +- dev = get_xpath_object("//@" F_STONITH_RC, msg, LOG_ERR); ++ dev = stonith__find_xe_with_result(msg); + CRM_CHECK(dev != NULL, return); + +- crm_element_value_int(dev, F_STONITH_RC, &rc); ++ stonith__xe_get_result(dev, &result); + + device = crm_element_value(dev, F_STONITH_DEVICE); + +@@ -2117,7 +2117,7 @@ fenced_process_fencing_reply(xmlNode *msg) + op = g_hash_table_lookup(stonith_remote_op_list, id); + } + +- if (op == NULL && rc == pcmk_ok) { ++ if ((op == NULL) && pcmk__result_ok(&result)) { + /* Record successful fencing operations */ + const char *client_id = crm_element_value(dev, F_STONITH_CLIENTID); + +@@ -2139,16 +2139,19 @@ fenced_process_fencing_reply(xmlNode *msg) + } + + if (pcmk__str_eq(crm_element_value(msg, F_SUBTYPE), "broadcast", pcmk__str_casei)) { +- crm_debug("Finalizing action '%s' targeting %s on behalf of %s@%s: %s " ++ crm_debug("Finalizing action '%s' targeting %s on behalf of %s@%s: %s%s%s%s " + CRM_XS " id=%.8s", + op->action, op->target, op->client_name, op->originator, +- pcmk_strerror(rc), op->id); +- if (rc == pcmk_ok) { ++ pcmk_exec_status_str(result.execution_status), ++ (result.exit_reason == NULL)? 
"" : " (", ++ (result.exit_reason == NULL)? "" : result.exit_reason, ++ (result.exit_reason == NULL)? "" : ")", op->id); ++ if (pcmk__result_ok(&result)) { + op->state = st_done; + } else { + op->state = st_failed; + } +- remote_op_done(op, msg, rc, FALSE); ++ remote_op_done(op, msg, pcmk_rc2legacy(stonith__result2rc(&result)), FALSE); + return; + } else if (!pcmk__str_eq(op->originator, stonith_our_uname, pcmk__str_casei)) { + /* If this isn't a remote level broadcast, and we are not the +@@ -2162,28 +2165,35 @@ fenced_process_fencing_reply(xmlNode *msg) + if (pcmk_is_set(op->call_options, st_opt_topology)) { + const char *device = crm_element_value(msg, F_STONITH_DEVICE); + +- crm_notice("Action '%s' targeting %s using %s on behalf of %s@%s: %s " +- CRM_XS " rc=%d", ++ crm_notice("Action '%s' targeting %s using %s on behalf of %s@%s: %s%s%s%s", + op->action, op->target, device, op->client_name, +- op->originator, pcmk_strerror(rc), rc); ++ op->originator, ++ pcmk_exec_status_str(result.execution_status), ++ (result.exit_reason == NULL)? "" : " (", ++ (result.exit_reason == NULL)? "" : result.exit_reason, ++ (result.exit_reason == NULL)? "" : ")"); + + /* We own the op, and it is complete. broadcast the result to all nodes + * and notify our local clients. */ + if (op->state == st_done) { +- remote_op_done(op, msg, rc, FALSE); ++ remote_op_done(op, msg, pcmk_rc2legacy(stonith__result2rc(&result)), FALSE); + return; + } + +- if ((op->phase == 2) && (rc != pcmk_ok)) { ++ if ((op->phase == 2) && !pcmk__result_ok(&result)) { + /* A remapped "on" failed, but the node was already turned off + * successfully, so ignore the error and continue. + */ +- crm_warn("Ignoring %s 'on' failure (exit code %d) targeting %s " +- "after successful 'off'", device, rc, op->target); +- rc = pcmk_ok; ++ crm_warn("Ignoring %s 'on' failure (%s%s%s) targeting %s " ++ "after successful 'off'", ++ device, pcmk_exec_status_str(result.execution_status), ++ (result.exit_reason == NULL)? 
"" : ": ", ++ (result.exit_reason == NULL)? "" : result.exit_reason, ++ op->target); ++ pcmk__set_result(&result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); + } + +- if (rc == pcmk_ok) { ++ if (pcmk__result_ok(&result)) { + /* An operation completed successfully. Try another device if + * necessary, otherwise mark the operation as done. */ + advance_topology_device_in_level(op, device, msg); +@@ -2193,29 +2203,30 @@ fenced_process_fencing_reply(xmlNode *msg) + * levels are available, mark this operation as failed and report results. */ + if (advance_topology_level(op, false) != pcmk_rc_ok) { + op->state = st_failed; +- remote_op_done(op, msg, rc, FALSE); ++ remote_op_done(op, msg, pcmk_rc2legacy(stonith__result2rc(&result)), FALSE); + return; + } + } +- } else if (rc == pcmk_ok && op->devices == NULL) { ++ } else if (pcmk__result_ok(&result) && (op->devices == NULL)) { + crm_trace("All done for %s", op->target); +- + op->state = st_done; +- remote_op_done(op, msg, rc, FALSE); ++ remote_op_done(op, msg, pcmk_rc2legacy(stonith__result2rc(&result)), FALSE); + return; +- } else if (rc == -ETIME && op->devices == NULL) { ++ } else if ((result.execution_status == PCMK_EXEC_TIMEOUT) ++ && (op->devices == NULL)) { + /* If the operation timed out don't bother retrying other peers. 
*/ + op->state = st_failed; +- remote_op_done(op, msg, rc, FALSE); ++ remote_op_done(op, msg, pcmk_rc2legacy(stonith__result2rc(&result)), FALSE); + return; + } else { + /* fall-through and attempt other fencing action using another peer */ + } + + /* Retry on failure */ +- crm_trace("Next for %s on behalf of %s@%s (rc was %d)", op->target, op->originator, +- op->client_name, rc); +- call_remote_stonith(op, NULL, rc); ++ crm_trace("Next for %s on behalf of %s@%s (result was: %s)", ++ op->target, op->originator, op->client_name, ++ pcmk_exec_status_str(result.execution_status)); ++ call_remote_stonith(op, NULL, pcmk_rc2legacy(stonith__result2rc(&result))); + } + + gboolean +-- +2.27.0 + + +From afb5706ac606a8ea883aa1597ee63d9891cc2e13 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Fri, 19 Nov 2021 15:56:30 -0600 +Subject: [PATCH 09/23] Refactor: fencer: pass full result of previous failed + action when initiating peer fencing + +Rename call_remote_stonith() to request_peer_fencing() for readability, and +make it take the full result of the previous failed action, rather than just +its legacy return code, as an argument. + +This does cause one change in behavior: if topology is in use, a previous +attempt failed, and no more peers have the appropriate device, then the +legacy return code returned will be -ENODEV rather than -EHOSTUNREACH. +These are treated similarly internally, and hopefully that will not cause +problems for external code. 
+--- + daemons/fenced/fenced_remote.c | 89 +++++++++++++++++++++++++--------- + 1 file changed, 67 insertions(+), 22 deletions(-) + +diff --git a/daemons/fenced/fenced_remote.c b/daemons/fenced/fenced_remote.c +index c6369f0051..31d5ee6e93 100644 +--- a/daemons/fenced/fenced_remote.c ++++ b/daemons/fenced/fenced_remote.c +@@ -76,12 +76,13 @@ typedef struct { + + GHashTable *stonith_remote_op_list = NULL; + +-void call_remote_stonith(remote_fencing_op_t *op, peer_device_info_t *peer, +- int rc); + static void remote_op_done(remote_fencing_op_t * op, xmlNode * data, int rc, int dup); + extern xmlNode *stonith_create_op(int call_id, const char *token, const char *op, xmlNode * data, + int call_options); + ++static void request_peer_fencing(remote_fencing_op_t *op, ++ peer_device_info_t *peer, ++ pcmk__action_result_t *result); + static void report_timeout_period(remote_fencing_op_t * op, int op_timeout); + static int get_op_total_timeout(const remote_fencing_op_t *op, + const peer_device_info_t *chosen_peer); +@@ -609,12 +610,16 @@ static gboolean + remote_op_timeout_one(gpointer userdata) + { + remote_fencing_op_t *op = userdata; ++ pcmk__action_result_t result = PCMK__UNKNOWN_RESULT; + + op->op_timer_one = 0; + + crm_notice("Peer's '%s' action targeting %s for client %s timed out " CRM_XS + " id=%.8s", op->action, op->target, op->client_name, op->id); +- call_remote_stonith(op, NULL, -ETIME); ++ pcmk__set_result(&result, CRM_EX_ERROR, PCMK_EXEC_TIMEOUT, NULL); ++ ++ // Try another device, if appropriate ++ request_peer_fencing(op, NULL, &result); + return FALSE; + } + +@@ -685,9 +690,13 @@ remote_op_query_timeout(gpointer data) + crm_debug("Operation %.8s targeting %s already in progress", + op->id, op->target); + } else if (op->query_results) { ++ // Result won't be used in this case, but we need to pass something ++ pcmk__action_result_t result = PCMK__UNKNOWN_RESULT; ++ ++ // Query succeeded, so attempt the actual fencing + crm_debug("Query %.8s targeting %s 
complete (state=%s)", + op->id, op->target, stonith_op_state_str(op->state)); +- call_remote_stonith(op, NULL, pcmk_ok); ++ request_peer_fencing(op, NULL, &result); + } else { + crm_debug("Query %.8s targeting %s timed out (state=%s)", + op->id, op->target, stonith_op_state_str(op->state)); +@@ -1533,6 +1542,10 @@ static void + advance_topology_device_in_level(remote_fencing_op_t *op, const char *device, + xmlNode *msg) + { ++ pcmk__action_result_t result = PCMK__UNKNOWN_RESULT; ++ ++ pcmk__set_result(&result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); ++ + /* Advance to the next device at this topology level, if any */ + if (op->devices) { + op->devices = op->devices->next; +@@ -1569,7 +1582,7 @@ advance_topology_device_in_level(remote_fencing_op_t *op, const char *device, + op->delay = 0; + } + +- call_remote_stonith(op, NULL, pcmk_ok); ++ request_peer_fencing(op, NULL, &result); + } else { + /* We're done with all devices and phases, so finalize operation */ + crm_trace("Marking complex fencing op targeting %s as complete", +@@ -1598,15 +1611,30 @@ check_watchdog_fencing_and_wait(remote_fencing_op_t * op) + return FALSE; + } + +-void +-call_remote_stonith(remote_fencing_op_t *op, peer_device_info_t *peer, int rc) ++/*! 
++ * \internal ++ * \brief Ask a peer to execute a fencing operation ++ * ++ * \param[in] op Fencing operation to be executed ++ * \param[in] peer If NULL or topology is in use, choose best peer to execute ++ * the fencing, otherwise use this peer ++ * \param[in] result Full result of previous failed attempt, if any (used as ++ * final result only if a previous attempt failed, topology ++ * is not in use, and no devices remain to be attempted) ++ */ ++static void ++request_peer_fencing(remote_fencing_op_t *op, peer_device_info_t *peer, ++ pcmk__action_result_t *result) + { + const char *device = NULL; +- int timeout = op->base_timeout; ++ int timeout; ++ ++ CRM_CHECK(op != NULL, return); + + crm_trace("Action %.8s targeting %s for %s is %s", + op->id, op->target, op->client_name, + stonith_op_state_str(op->state)); ++ timeout = op->base_timeout; + if ((peer == NULL) && !pcmk_is_set(op->call_options, st_opt_topology)) { + peer = stonith_choose_peer(op); + } +@@ -1623,9 +1651,14 @@ call_remote_stonith(remote_fencing_op_t *op, peer_device_info_t *peer, int rc) + } + + if (pcmk_is_set(op->call_options, st_opt_topology) && op->devices) { +- /* Ignore any peer preference, they might not have the device we need */ +- /* When using topology, stonith_choose_peer() removes the device from +- * further consideration, so be sure to calculate timeout beforehand */ ++ /* Ignore the caller's peer preference if topology is in use, because ++ * that peer might not have access to the required device. With ++ * topology, stonith_choose_peer() removes the device from further ++ * consideration, so the timeout must be calculated beforehand. ++ * ++ * @TODO Basing the total timeout on the caller's preferred peer (above) ++ * is less than ideal. 
++ */ + peer = stonith_choose_peer(op); + + device = op->devices->data; +@@ -1722,8 +1755,6 @@ call_remote_stonith(remote_fencing_op_t *op, peer_device_info_t *peer, int rc) + finalize_timed_out_op(op); + + } else if(op->replies >= op->replies_expected || op->replies >= fencing_active_peers()) { +-// int rc = -EHOSTUNREACH; +- + /* if the operation never left the query state, + * but we have all the expected replies, then no devices + * are available to execute the fencing operation. */ +@@ -1735,17 +1766,28 @@ call_remote_stonith(remote_fencing_op_t *op, peer_device_info_t *peer, int rc) + } + } + ++ // This is the only case in which result will be used ++ CRM_CHECK(result != NULL, return); ++ + if (op->state == st_query) { + crm_info("No peers (out of %d) have devices capable of fencing " + "(%s) %s for client %s " CRM_XS " state=%s", + op->replies, op->action, op->target, op->client_name, + stonith_op_state_str(op->state)); + +- rc = -ENODEV; ++ pcmk__reset_result(result); ++ pcmk__set_result(result, CRM_EX_ERROR, PCMK_EXEC_NO_FENCE_DEVICE, ++ NULL); + } else { + if (pcmk_is_set(op->call_options, st_opt_topology)) { +- rc = -EHOSTUNREACH; +- } ++ pcmk__reset_result(result); ++ pcmk__set_result(result, CRM_EX_ERROR, ++ PCMK_EXEC_NO_FENCE_DEVICE, NULL); ++ } ++ /* ... else use result provided by caller -- overwriting it with ++ PCMK_EXEC_NO_FENCE_DEVICE would prevent remote_op_done() from ++ setting the correct delegate if needed. 
++ */ + + crm_info("No peers (out of %d) are capable of fencing (%s) %s " + "for client %s " CRM_XS " state=%s", +@@ -1754,7 +1796,7 @@ call_remote_stonith(remote_fencing_op_t *op, peer_device_info_t *peer, int rc) + } + + op->state = st_failed; +- remote_op_done(op, NULL, rc, FALSE); ++ remote_op_done(op, NULL, pcmk_rc2legacy(stonith__result2rc(result)), FALSE); + + } else { + crm_info("Waiting for additional peers capable of fencing (%s) %s%s%s " +@@ -2004,6 +2046,7 @@ process_remote_stonith_query(xmlNode * msg) + peer_device_info_t *peer = NULL; + uint32_t replies_expected; + xmlNode *dev = get_xpath_object("//@" F_STONITH_REMOTE_OP_ID, msg, LOG_ERR); ++ pcmk__action_result_t result = PCMK__UNKNOWN_RESULT; + + CRM_CHECK(dev != NULL, return -EPROTO); + +@@ -2038,6 +2081,8 @@ process_remote_stonith_query(xmlNode * msg) + peer = add_result(op, host, ndevices, dev); + } + ++ pcmk__set_result(&result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); ++ + if (pcmk_is_set(op->call_options, st_opt_topology)) { + /* If we start the fencing before all the topology results are in, + * it is possible fencing levels will be skipped because of the missing +@@ -2045,12 +2090,12 @@ process_remote_stonith_query(xmlNode * msg) + if (op->state == st_query && all_topology_devices_found(op)) { + /* All the query results are in for the topology, start the fencing ops. 
*/ + crm_trace("All topology devices found"); +- call_remote_stonith(op, peer, pcmk_ok); ++ request_peer_fencing(op, peer, &result); + + } else if (have_all_replies) { + crm_info("All topology query replies have arrived, continuing (%d expected/%d received) ", + replies_expected, op->replies); +- call_remote_stonith(op, NULL, pcmk_ok); ++ request_peer_fencing(op, NULL, &result); + } + + } else if (op->state == st_query) { +@@ -2062,12 +2107,12 @@ process_remote_stonith_query(xmlNode * msg) + /* we have a verified device living on a peer that is not the target */ + crm_trace("Found %d verified device%s", + nverified, pcmk__plural_s(nverified)); +- call_remote_stonith(op, peer, pcmk_ok); ++ request_peer_fencing(op, peer, &result); + + } else if (have_all_replies) { + crm_info("All query replies have arrived, continuing (%d expected/%d received) ", + replies_expected, op->replies); +- call_remote_stonith(op, NULL, pcmk_ok); ++ request_peer_fencing(op, NULL, &result); + + } else { + crm_trace("Waiting for more peer results before launching fencing operation"); +@@ -2226,7 +2271,7 @@ fenced_process_fencing_reply(xmlNode *msg) + crm_trace("Next for %s on behalf of %s@%s (result was: %s)", + op->target, op->originator, op->client_name, + pcmk_exec_status_str(result.execution_status)); +- call_remote_stonith(op, NULL, pcmk_rc2legacy(stonith__result2rc(&result))); ++ request_peer_fencing(op, NULL, &result); + } + + gboolean +-- +2.27.0 + + +From 43e08ba7ee1635e47bfaf2a57636101c675b89ae Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Fri, 19 Nov 2021 16:02:04 -0600 +Subject: [PATCH 10/23] Feature: fencer: set exit reason for timeouts waiting + for peer replies + +--- + daemons/fenced/fenced_remote.c | 4 +++- + 1 file changed, 3 insertions(+), 1 deletion(-) + +diff --git a/daemons/fenced/fenced_remote.c b/daemons/fenced/fenced_remote.c +index 31d5ee6e93..415a7c1b98 100644 +--- a/daemons/fenced/fenced_remote.c ++++ b/daemons/fenced/fenced_remote.c +@@ -616,7 +616,9 @@ 
remote_op_timeout_one(gpointer userdata) + + crm_notice("Peer's '%s' action targeting %s for client %s timed out " CRM_XS + " id=%.8s", op->action, op->target, op->client_name, op->id); +- pcmk__set_result(&result, CRM_EX_ERROR, PCMK_EXEC_TIMEOUT, NULL); ++ pcmk__set_result(&result, CRM_EX_ERROR, PCMK_EXEC_TIMEOUT, ++ "Peer did not send fence result within timeout"); ++ + + // Try another device, if appropriate + request_peer_fencing(op, NULL, &result); +-- +2.27.0 + + +From 34e5baebac78b7235825b31bebc44e3d65ae45cc Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Fri, 19 Nov 2021 16:10:28 -0600 +Subject: [PATCH 11/23] Refactor: fencer: pass full result when handling + duplicate actions + +Rename handle_duplicates() to finalize_op_duplicates() for readability, and +make it take a full result rather than a legacy return code as an argument. +--- + daemons/fenced/fenced_remote.c | 29 +++++++++++++++++++++-------- + 1 file changed, 21 insertions(+), 8 deletions(-) + +diff --git a/daemons/fenced/fenced_remote.c b/daemons/fenced/fenced_remote.c +index 415a7c1b98..850bfb6eb3 100644 +--- a/daemons/fenced/fenced_remote.c ++++ b/daemons/fenced/fenced_remote.c +@@ -439,12 +439,19 @@ handle_local_reply_and_notify(remote_fencing_op_t * op, xmlNode * data, int rc) + free_xml(notify_data); + } + ++/*! 
++ * \internal ++ * \brief Finalize all duplicates of a given fencer operation ++ * ++ * \param[in] op Fencer operation that completed ++ * \param[in] data Top-level XML to add notification to ++ * \param[in] result Full operation result ++ */ + static void +-handle_duplicates(remote_fencing_op_t * op, xmlNode * data, int rc) ++finalize_op_duplicates(remote_fencing_op_t *op, xmlNode *data, ++ pcmk__action_result_t *result) + { +- GList *iter = NULL; +- +- for (iter = op->duplicates; iter != NULL; iter = iter->next) { ++ for (GList *iter = op->duplicates; iter != NULL; iter = iter->next) { + remote_fencing_op_t *other = iter->data; + + if (other->state == st_duplicate) { +@@ -452,8 +459,9 @@ handle_duplicates(remote_fencing_op_t * op, xmlNode * data, int rc) + crm_debug("Performing duplicate notification for %s@%s: %s " + CRM_XS " id=%.8s", + other->client_name, other->originator, +- pcmk_strerror(rc), other->id); +- remote_op_done(other, data, rc, TRUE); ++ pcmk_exec_status_str(result->execution_status), ++ other->id); ++ remote_op_done(other, data, pcmk_rc2legacy(stonith__result2rc(result)), TRUE); + + } else { + // Possible if (for example) it timed out already +@@ -570,8 +578,13 @@ remote_op_done(remote_fencing_op_t * op, xmlNode * data, int rc, int dup) + + handle_local_reply_and_notify(op, data, rc); + +- if (dup == FALSE) { +- handle_duplicates(op, data, rc); ++ if (!dup) { ++ pcmk__action_result_t result = PCMK__UNKNOWN_RESULT; ++ ++ pcmk__set_result(&result, ++ ((rc == pcmk_ok)? CRM_EX_OK : CRM_EX_ERROR), ++ stonith__legacy2status(rc), NULL); ++ finalize_op_duplicates(op, data, &result); + } + + /* Free non-essential parts of the record +-- +2.27.0 + + +From 939bd6f5f0f79b19d0cc4d869f3c8980fda2e461 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Fri, 19 Nov 2021 16:23:20 -0600 +Subject: [PATCH 12/23] Feature: fencer: set exit reasons for fencing timeouts + +finalize_timed_out_op() now takes an exit reason as an argument. 
+It is called for fencing timeouts, peer query reply timeouts, +and all capable nodes failing to fence. + +At this point, the exit reason is not used, but that is planned. +--- + daemons/fenced/fenced_remote.c | 25 +++++++++++++++---------- + 1 file changed, 15 insertions(+), 10 deletions(-) + +diff --git a/daemons/fenced/fenced_remote.c b/daemons/fenced/fenced_remote.c +index 850bfb6eb3..c10a32442e 100644 +--- a/daemons/fenced/fenced_remote.c ++++ b/daemons/fenced/fenced_remote.c +@@ -643,10 +643,12 @@ remote_op_timeout_one(gpointer userdata) + * \brief Finalize a remote fencer operation that timed out + * + * \param[in] op Fencer operation that timed out ++ * \param[in] reason Readable description of what step timed out + */ + static void +-finalize_timed_out_op(remote_fencing_op_t *op) ++finalize_timed_out_op(remote_fencing_op_t *op, const char *reason) + { ++ pcmk__action_result_t result = PCMK__UNKNOWN_RESULT; + + op->op_timer_total = 0; + +@@ -660,13 +662,13 @@ finalize_timed_out_op(remote_fencing_op_t *op) + * devices, and return success. + */ + op->state = st_done; +- remote_op_done(op, NULL, pcmk_ok, FALSE); +- return; ++ pcmk__set_result(&result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); ++ } else { ++ op->state = st_failed; ++ pcmk__set_result(&result, CRM_EX_ERROR, PCMK_EXEC_TIMEOUT, reason); + } +- +- op->state = st_failed; +- +- remote_op_done(op, NULL, -ETIME, FALSE); ++ remote_op_done(op, NULL, pcmk_rc2legacy(stonith__result2rc(&result)), FALSE); ++ pcmk__reset_result(&result); + } + + /*! 
+@@ -687,7 +689,8 @@ remote_op_timeout(gpointer userdata) + CRM_XS " id=%.8s", + op->action, op->target, op->client_name, op->id); + } else { +- finalize_timed_out_op(userdata); ++ finalize_timed_out_op(userdata, "Fencing could not be completed " ++ "within overall timeout"); + } + return G_SOURCE_REMOVE; + } +@@ -719,7 +722,8 @@ remote_op_query_timeout(gpointer data) + g_source_remove(op->op_timer_total); + op->op_timer_total = 0; + } +- finalize_timed_out_op(op); ++ finalize_timed_out_op(op, "No capable peers replied to device query " ++ "within timeout"); + } + + return FALSE; +@@ -1767,7 +1771,8 @@ request_peer_fencing(remote_fencing_op_t *op, peer_device_info_t *peer, + CRM_XS " state=%s", op->action, op->target, op->client_name, + stonith_op_state_str(op->state)); + CRM_CHECK(op->state < st_done, return); +- finalize_timed_out_op(op); ++ finalize_timed_out_op(op, "All nodes failed, or are unable, to " ++ "fence target"); + + } else if(op->replies >= op->replies_expected || op->replies >= fencing_active_peers()) { + /* if the operation never left the query state, +-- +2.27.0 + + +From b80b02799260feb98723a460f2f8e8ad5cdc467f Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Fri, 19 Nov 2021 16:32:04 -0600 +Subject: [PATCH 13/23] Refactor: fencer: pass full result when finalizing peer + fencing actions + +Rename remote_op_done() to finalize_op() for readability, and make it take a +full result as an argument, rather than a legacy return code. + +This does cause one change in behavior: when all topology levels fail, +the legacy return code returned will be -pcmk_err_generic instead of EINVAL. 
+--- + daemons/fenced/fenced_history.c | 2 +- + daemons/fenced/fenced_remote.c | 177 ++++++++++++++++++-------------- + 2 files changed, 103 insertions(+), 76 deletions(-) + +diff --git a/daemons/fenced/fenced_history.c b/daemons/fenced/fenced_history.c +index bc159383c2..9e38ff0a20 100644 +--- a/daemons/fenced/fenced_history.c ++++ b/daemons/fenced/fenced_history.c +@@ -374,7 +374,7 @@ stonith_local_history_diff_and_merge(GHashTable *remote_history, + set_fencing_completed(op); + /* use -EHOSTUNREACH to not introduce a new return-code that might + trigger unexpected results at other places and to prevent +- remote_op_done from setting the delegate if not present ++ finalize_op from setting the delegate if not present + */ + stonith_bcast_result_to_peers(op, -EHOSTUNREACH, FALSE); + } +diff --git a/daemons/fenced/fenced_remote.c b/daemons/fenced/fenced_remote.c +index c10a32442e..aefc5f311c 100644 +--- a/daemons/fenced/fenced_remote.c ++++ b/daemons/fenced/fenced_remote.c +@@ -76,13 +76,14 @@ typedef struct { + + GHashTable *stonith_remote_op_list = NULL; + +-static void remote_op_done(remote_fencing_op_t * op, xmlNode * data, int rc, int dup); + extern xmlNode *stonith_create_op(int call_id, const char *token, const char *op, xmlNode * data, + int call_options); + + static void request_peer_fencing(remote_fencing_op_t *op, + peer_device_info_t *peer, + pcmk__action_result_t *result); ++static void finalize_op(remote_fencing_op_t *op, xmlNode *data, ++ pcmk__action_result_t *result, bool dup); + static void report_timeout_period(remote_fencing_op_t * op, int op_timeout); + static int get_op_total_timeout(const remote_fencing_op_t *op, + const peer_device_info_t *chosen_peer); +@@ -461,7 +462,7 @@ finalize_op_duplicates(remote_fencing_op_t *op, xmlNode *data, + other->client_name, other->originator, + pcmk_exec_status_str(result->execution_status), + other->id); +- remote_op_done(other, data, pcmk_rc2legacy(stonith__result2rc(result)), TRUE); ++ finalize_op(other, 
data, result, true); + + } else { + // Possible if (for example) it timed out already +@@ -487,104 +488,100 @@ delegate_from_xml(xmlNode *xml) + + /*! + * \internal +- * \brief Finalize a remote operation. ++ * \brief Finalize a peer fencing operation + * +- * \description This function has two code paths. ++ * Clean up after a fencing operation completes. This function has two code ++ * paths: the executioner uses it to broadcast the result to CPG peers, and then ++ * each peer (including the executioner) uses it to process that broadcast and ++ * notify its IPC clients of the result. + * +- * Path 1. This node is the owner of the operation and needs +- * to notify the cpg group via a broadcast as to the operation's +- * results. +- * +- * Path 2. The cpg broadcast is received. All nodes notify their local +- * stonith clients the operation results. +- * +- * So, The owner of the operation first notifies the cluster of the result, +- * and once that cpg notify is received back it notifies all the local clients. +- * +- * Nodes that are passive watchers of the operation will receive the +- * broadcast and only need to notify their local clients the operation finished. +- * +- * \param op, The fencing operation to finalize +- * \param data, The xml msg reply (if present) of the last delegated fencing +- * operation. +- * \param dup, Is this operation a duplicate, if so treat it a little differently +- * making sure the broadcast is not sent out. 
++ * \param[in] op Fencer operation that completed ++ * \param[in] data If not NULL, XML reply of last delegated fencing operation ++ * \param[in] result Full operation result ++ * \param[in] dup Whether this operation is a duplicate of another ++ * (in which case, do not broadcast the result) + */ + static void +-remote_op_done(remote_fencing_op_t * op, xmlNode * data, int rc, int dup) ++finalize_op(remote_fencing_op_t *op, xmlNode *data, ++ pcmk__action_result_t *result, bool dup) + { + int level = LOG_ERR; + const char *subt = NULL; + xmlNode *local_data = NULL; + gboolean op_merged = FALSE; + ++ CRM_CHECK((op != NULL) && (result != NULL), return); ++ ++ if (op->notify_sent) { ++ // Most likely, this is a timed-out action that eventually completed ++ crm_notice("Operation '%s'%s%s by %s for %s@%s%s: " ++ "Result arrived too late " CRM_XS " id=%.8s", ++ op->action, (op->target? " targeting " : ""), ++ (op->target? op->target : ""), ++ (op->delegate? op->delegate : "unknown node"), ++ op->client_name, op->originator, ++ (op_merged? " (merged)" : ""), ++ op->id); ++ return; ++ } ++ + set_fencing_completed(op); + clear_remote_op_timers(op); + undo_op_remap(op); + +- if (op->notify_sent == TRUE) { +- crm_err("Already sent notifications for '%s' targeting %s by %s for " +- "client %s@%s: %s " CRM_XS " rc=%d state=%s id=%.8s", +- op->action, op->target, +- (op->delegate? 
op->delegate : "unknown node"), +- op->client_name, op->originator, pcmk_strerror(rc), +- rc, stonith_op_state_str(op->state), op->id); +- goto remote_op_done_cleanup; +- } +- + if (data == NULL) { + data = create_xml_node(NULL, "remote-op"); + local_data = data; + + } else if (op->delegate == NULL) { +- switch (rc) { +- case -ENODEV: +- case -EHOSTUNREACH: ++ switch (result->execution_status) { ++ case PCMK_EXEC_NO_FENCE_DEVICE: + break; ++ case PCMK_EXEC_INVALID: ++ if (result->exit_status == CRM_EX_EXPIRED) { ++ break; ++ } ++ // else fall through + default: + op->delegate = delegate_from_xml(data); + break; + } + } + +- if(dup) { +- op_merged = TRUE; +- } else if (crm_element_value(data, F_STONITH_MERGED)) { +- op_merged = TRUE; +- } ++ if (dup || (crm_element_value(data, F_STONITH_MERGED) != NULL)) { ++ op_merged = true; ++ } + + /* Tell everyone the operation is done, we will continue + * with doing the local notifications once we receive + * the broadcast back. */ + subt = crm_element_value(data, F_SUBTYPE); +- if (dup == FALSE && !pcmk__str_eq(subt, "broadcast", pcmk__str_casei)) { ++ if (!dup && !pcmk__str_eq(subt, "broadcast", pcmk__str_casei)) { + /* Defer notification until the bcast message arrives */ +- stonith_bcast_result_to_peers(op, rc, (op_merged? TRUE: FALSE)); +- goto remote_op_done_cleanup; ++ stonith_bcast_result_to_peers(op, pcmk_rc2legacy(stonith__result2rc(result)), op_merged); ++ free_xml(local_data); ++ return; + } + +- if (rc == pcmk_ok || dup) { +- level = LOG_NOTICE; +- } else if (!pcmk__str_eq(op->originator, stonith_our_uname, pcmk__str_casei)) { ++ if (pcmk__result_ok(result) || dup ++ || !pcmk__str_eq(op->originator, stonith_our_uname, pcmk__str_casei)) { + level = LOG_NOTICE; + } +- +- do_crm_log(level, "Operation '%s'%s%s by %s for %s@%s%s: %s " ++ do_crm_log(level, "Operation '%s'%s%s by %s for %s@%s%s: %s (%s%s%s) " + CRM_XS " id=%.8s", op->action, (op->target? " targeting " : ""), + (op->target? 
op->target : ""), + (op->delegate? op->delegate : "unknown node"), + op->client_name, op->originator, +- (op_merged? " (merged)" : ""), pcmk_strerror(rc), op->id); ++ (op_merged? " (merged)" : ""), crm_exit_str(result->exit_status), ++ pcmk_exec_status_str(result->execution_status), ++ ((result->exit_reason == NULL)? "" : ": "), ++ ((result->exit_reason == NULL)? "" : result->exit_reason), ++ op->id); + +- handle_local_reply_and_notify(op, data, rc); ++ handle_local_reply_and_notify(op, data, pcmk_rc2legacy(stonith__result2rc(result))); + + if (!dup) { +- pcmk__action_result_t result = PCMK__UNKNOWN_RESULT; +- +- pcmk__set_result(&result, +- ((rc == pcmk_ok)? CRM_EX_OK : CRM_EX_ERROR), +- stonith__legacy2status(rc), NULL); +- finalize_op_duplicates(op, data, &result); ++ finalize_op_duplicates(op, data, result); + } + + /* Free non-essential parts of the record +@@ -594,20 +591,27 @@ remote_op_done(remote_fencing_op_t * op, xmlNode * data, int rc, int dup) + g_list_free_full(op->query_results, free_remote_query); + op->query_results = NULL; + } +- + if (op->request) { + free_xml(op->request); + op->request = NULL; + } + +- remote_op_done_cleanup: + free_xml(local_data); + } + ++/*! 
++ * \internal ++ * \brief Finalize a watchdog fencer op after the waiting time expires ++ * ++ * \param[in] userdata Fencer operation that completed ++ * ++ * \return G_SOURCE_REMOVE (which tells glib not to restart timer) ++ */ + static gboolean + remote_op_watchdog_done(gpointer userdata) + { + remote_fencing_op_t *op = userdata; ++ pcmk__action_result_t result = PCMK__UNKNOWN_RESULT; + + op->op_timer_one = 0; + +@@ -615,8 +619,9 @@ remote_op_watchdog_done(gpointer userdata) + CRM_XS " id=%.8s", + op->action, op->target, op->client_name, op->id); + op->state = st_done; +- remote_op_done(op, NULL, pcmk_ok, FALSE); +- return FALSE; ++ pcmk__set_result(&result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); ++ finalize_op(op, NULL, &result, false); ++ return G_SOURCE_REMOVE; + } + + static gboolean +@@ -667,7 +672,7 @@ finalize_timed_out_op(remote_fencing_op_t *op, const char *reason) + op->state = st_failed; + pcmk__set_result(&result, CRM_EX_ERROR, PCMK_EXEC_TIMEOUT, reason); + } +- remote_op_done(op, NULL, pcmk_rc2legacy(stonith__result2rc(&result)), FALSE); ++ finalize_op(op, NULL, &result, false); + pcmk__reset_result(&result); + } + +@@ -1064,9 +1069,13 @@ fenced_handle_manual_confirmation(pcmk__client_t *client, xmlNode *msg) + set_fencing_completed(op); + op->delegate = strdup("a human"); + +- // For the fencer's purposes, the fencing operation is done ++ { ++ // For the fencer's purposes, the fencing operation is done ++ pcmk__action_result_t result = PCMK__UNKNOWN_RESULT; + +- remote_op_done(op, msg, pcmk_ok, FALSE); ++ pcmk__set_result(&result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); ++ finalize_op(op, msg, &result, false); ++ } + + /* For the requester's purposes, the operation is still pending. The + * actual result will be sent asynchronously via the operation's done_cb(). +@@ -1200,6 +1209,16 @@ create_remote_stonith_op(const char *client, xmlNode * request, gboolean peer) + return op; + } + ++/*! 
++ * \internal ++ * \brief Create a peer fencing operation from a request, and initiate it ++ * ++ * \param[in] client IPC client that made request (NULL to get from request) ++ * \param[in] request Request XML ++ * \param[in] manual_ack Whether this is a manual action confirmation ++ * ++ * \return Newly created operation on success, otherwise NULL ++ */ + remote_fencing_op_t * + initiate_remote_stonith_op(pcmk__client_t *client, xmlNode *request, + gboolean manual_ack) +@@ -1234,9 +1253,17 @@ initiate_remote_stonith_op(pcmk__client_t *client, xmlNode *request, + + switch (op->state) { + case st_failed: +- crm_warn("Could not request peer fencing (%s) targeting %s " +- CRM_XS " id=%.8s", op->action, op->target, op->id); +- remote_op_done(op, NULL, -EINVAL, FALSE); ++ // advance_topology_level() exhausted levels ++ { ++ pcmk__action_result_t result = PCMK__UNKNOWN_RESULT; ++ ++ pcmk__set_result(&result, CRM_EX_ERROR, PCMK_EXEC_ERROR, ++ "All topology levels failed"); ++ crm_warn("Could not request peer fencing (%s) targeting %s " ++ CRM_XS " id=%.8s", op->action, op->target, op->id); ++ finalize_op(op, NULL, &result, false); ++ pcmk__reset_result(&result); ++ } + return op; + + case st_duplicate: +@@ -1607,7 +1634,7 @@ advance_topology_device_in_level(remote_fencing_op_t *op, const char *device, + crm_trace("Marking complex fencing op targeting %s as complete", + op->target); + op->state = st_done; +- remote_op_done(op, msg, pcmk_ok, FALSE); ++ finalize_op(op, msg, &result, false); + } + } + +@@ -1805,7 +1832,7 @@ request_peer_fencing(remote_fencing_op_t *op, peer_device_info_t *peer, + PCMK_EXEC_NO_FENCE_DEVICE, NULL); + } + /* ... else use result provided by caller -- overwriting it with +- PCMK_EXEC_NO_FENCE_DEVICE would prevent remote_op_done() from ++ PCMK_EXEC_NO_FENCE_DEVICE would prevent finalize_op() from + setting the correct delegate if needed. 
+ */ + +@@ -1816,7 +1843,7 @@ request_peer_fencing(remote_fencing_op_t *op, peer_device_info_t *peer, + } + + op->state = st_failed; +- remote_op_done(op, NULL, pcmk_rc2legacy(stonith__result2rc(result)), FALSE); ++ finalize_op(op, NULL, result, false); + + } else { + crm_info("Waiting for additional peers capable of fencing (%s) %s%s%s " +@@ -2216,7 +2243,7 @@ fenced_process_fencing_reply(xmlNode *msg) + } else { + op->state = st_failed; + } +- remote_op_done(op, msg, pcmk_rc2legacy(stonith__result2rc(&result)), FALSE); ++ finalize_op(op, msg, &result, false); + return; + } else if (!pcmk__str_eq(op->originator, stonith_our_uname, pcmk__str_casei)) { + /* If this isn't a remote level broadcast, and we are not the +@@ -2241,7 +2268,7 @@ fenced_process_fencing_reply(xmlNode *msg) + /* We own the op, and it is complete. broadcast the result to all nodes + * and notify our local clients. */ + if (op->state == st_done) { +- remote_op_done(op, msg, pcmk_rc2legacy(stonith__result2rc(&result)), FALSE); ++ finalize_op(op, msg, &result, false); + return; + } + +@@ -2268,20 +2295,20 @@ fenced_process_fencing_reply(xmlNode *msg) + * levels are available, mark this operation as failed and report results. */ + if (advance_topology_level(op, false) != pcmk_rc_ok) { + op->state = st_failed; +- remote_op_done(op, msg, pcmk_rc2legacy(stonith__result2rc(&result)), FALSE); ++ finalize_op(op, msg, &result, false); + return; + } + } + } else if (pcmk__result_ok(&result) && (op->devices == NULL)) { + crm_trace("All done for %s", op->target); + op->state = st_done; +- remote_op_done(op, msg, pcmk_rc2legacy(stonith__result2rc(&result)), FALSE); ++ finalize_op(op, msg, &result, false); + return; + } else if ((result.execution_status == PCMK_EXEC_TIMEOUT) + && (op->devices == NULL)) { + /* If the operation timed out don't bother retrying other peers. 
*/ + op->state = st_failed; +- remote_op_done(op, msg, pcmk_rc2legacy(stonith__result2rc(&result)), FALSE); ++ finalize_op(op, msg, &result, false); + return; + } else { + /* fall-through and attempt other fencing action using another peer */ +-- +2.27.0 + + +From 8f19c09f1b961ba9aa510b7dcd1875bbabcddcdc Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Fri, 19 Nov 2021 16:39:23 -0600 +Subject: [PATCH 14/23] Refactor: fencer: pass full result when broadcasting + replies + +Rename stonith_bcast_result_to_peers() to fenced_broadcast_op_result() for +consistency, and make it take the full result as an argument instead of a +legacy return code. The full result is not yet used, but that is planned. +--- + daemons/fenced/fenced_history.c | 18 ++++++++++++------ + daemons/fenced/fenced_remote.c | 15 ++++++++++++--- + daemons/fenced/pacemaker-fenced.h | 9 ++------- + 3 files changed, 26 insertions(+), 16 deletions(-) + +diff --git a/daemons/fenced/fenced_history.c b/daemons/fenced/fenced_history.c +index 9e38ff0a20..1e07a9815a 100644 +--- a/daemons/fenced/fenced_history.c ++++ b/daemons/fenced/fenced_history.c +@@ -359,24 +359,29 @@ stonith_local_history_diff_and_merge(GHashTable *remote_history, + } + + if (remote_history) { ++ pcmk__action_result_t result = PCMK__UNKNOWN_RESULT; ++ + init_stonith_remote_op_hash_table(&stonith_remote_op_list); + + updated |= g_hash_table_size(remote_history); + + g_hash_table_iter_init(&iter, remote_history); + while (g_hash_table_iter_next(&iter, NULL, (void **)&op)) { +- + if (stonith__op_state_pending(op->state) && + pcmk__str_eq(op->originator, stonith_our_uname, pcmk__str_casei)) { ++ + crm_warn("Failing pending operation %.8s originated by us but " + "known only from peer history", op->id); + op->state = st_failed; + set_fencing_completed(op); +- /* use -EHOSTUNREACH to not introduce a new return-code that might +- trigger unexpected results at other places and to prevent +- finalize_op from setting the delegate if not present +- 
*/ +- stonith_bcast_result_to_peers(op, -EHOSTUNREACH, FALSE); ++ ++ /* CRM_EX_EXPIRED + PCMK_EXEC_INVALID prevents finalize_op() ++ * from setting a delegate ++ */ ++ pcmk__set_result(&result, CRM_EX_EXPIRED, PCMK_EXEC_INVALID, ++ "Initiated by earlier fencer " ++ "process and presumed failed"); ++ fenced_broadcast_op_result(op, &result, false); + } + + g_hash_table_iter_steal(&iter); +@@ -391,6 +396,7 @@ stonith_local_history_diff_and_merge(GHashTable *remote_history, + */ + } + ++ pcmk__reset_result(&result); + g_hash_table_destroy(remote_history); /* remove what is left */ + } + +diff --git a/daemons/fenced/fenced_remote.c b/daemons/fenced/fenced_remote.c +index aefc5f311c..a0f026c790 100644 +--- a/daemons/fenced/fenced_remote.c ++++ b/daemons/fenced/fenced_remote.c +@@ -374,12 +374,21 @@ create_op_done_notify(remote_fencing_op_t * op, int rc) + return notify_data; + } + ++/*! ++ * \internal ++ * \brief Broadcast a fence result notification to all CPG peers ++ * ++ * \param[in] op Fencer operation that completed ++ * \param[in] result Full operation result ++ * \param[in] op_merged Whether this operation is a duplicate of another ++ */ + void +-stonith_bcast_result_to_peers(remote_fencing_op_t * op, int rc, gboolean op_merged) ++fenced_broadcast_op_result(remote_fencing_op_t *op, ++ pcmk__action_result_t *result, bool op_merged) + { + static int count = 0; + xmlNode *bcast = create_xml_node(NULL, T_STONITH_REPLY); +- xmlNode *notify_data = create_op_done_notify(op, rc); ++ xmlNode *notify_data = create_op_done_notify(op, pcmk_rc2legacy(stonith__result2rc(result))); + + count++; + crm_trace("Broadcasting result to peers"); +@@ -558,7 +567,7 @@ finalize_op(remote_fencing_op_t *op, xmlNode *data, + subt = crm_element_value(data, F_SUBTYPE); + if (!dup && !pcmk__str_eq(subt, "broadcast", pcmk__str_casei)) { + /* Defer notification until the bcast message arrives */ +- stonith_bcast_result_to_peers(op, pcmk_rc2legacy(stonith__result2rc(result)), op_merged); ++ 
fenced_broadcast_op_result(op, result, op_merged); + free_xml(local_data); + return; + } +diff --git a/daemons/fenced/pacemaker-fenced.h b/daemons/fenced/pacemaker-fenced.h +index d5f4bc79fd..ed47ab046c 100644 +--- a/daemons/fenced/pacemaker-fenced.h ++++ b/daemons/fenced/pacemaker-fenced.h +@@ -153,13 +153,8 @@ typedef struct remote_fencing_op_s { + + } remote_fencing_op_t; + +-/*! +- * \internal +- * \brief Broadcast the result of an operation to the peers. +- * \param op, Operation whose result should be broadcast +- * \param rc, Result of the operation +- */ +-void stonith_bcast_result_to_peers(remote_fencing_op_t * op, int rc, gboolean op_merged); ++void fenced_broadcast_op_result(remote_fencing_op_t *op, ++ pcmk__action_result_t *result, bool op_merged); + + // Fencer-specific client flags + enum st_client_flags { +-- +2.27.0 + + +From 3396e66b4c9cca895c7412b66159fd2342de1911 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Fri, 19 Nov 2021 16:42:46 -0600 +Subject: [PATCH 15/23] Feature: fencer: add full result to local replies + +handle_local_reply_and_notify() now takes the full result as an argument +instead of a legacy return code, and adds it to the reply to the local +requester. It does not add it to notifications yet, but that is planned. +--- + daemons/fenced/fenced_remote.c | 26 ++++++++++++++------------ + 1 file changed, 14 insertions(+), 12 deletions(-) + +diff --git a/daemons/fenced/fenced_remote.c b/daemons/fenced/fenced_remote.c +index a0f026c790..329e06c444 100644 +--- a/daemons/fenced/fenced_remote.c ++++ b/daemons/fenced/fenced_remote.c +@@ -409,8 +409,17 @@ fenced_broadcast_op_result(remote_fencing_op_t *op, + return; + } + ++/*! 
++ * \internal ++ * \brief Reply to a local request originator and notify all subscribed clients ++ * ++ * \param[in] op Fencer operation that completed ++ * \param[in] data Top-level XML to add notification to ++ * \param[in] result Full operation result ++ */ + static void +-handle_local_reply_and_notify(remote_fencing_op_t * op, xmlNode * data, int rc) ++handle_local_reply_and_notify(remote_fencing_op_t *op, xmlNode *data, ++ pcmk__action_result_t *result) + { + xmlNode *notify_data = NULL; + xmlNode *reply = NULL; +@@ -421,26 +430,19 @@ handle_local_reply_and_notify(remote_fencing_op_t * op, xmlNode * data, int rc) + } + + /* Do notification with a clean data object */ +- notify_data = create_op_done_notify(op, rc); ++ notify_data = create_op_done_notify(op, pcmk_rc2legacy(stonith__result2rc(result))); + crm_xml_add_int(data, "state", op->state); + crm_xml_add(data, F_STONITH_TARGET, op->target); + crm_xml_add(data, F_STONITH_OPERATION, op->action); + +- { +- pcmk__action_result_t result = PCMK__UNKNOWN_RESULT; +- +- pcmk__set_result(&result, +- ((rc == pcmk_ok)? CRM_EX_OK : CRM_EX_ERROR), +- stonith__legacy2status(rc), NULL); +- reply = fenced_construct_reply(op->request, data, &result); +- } ++ reply = fenced_construct_reply(op->request, data, result); + crm_xml_add(reply, F_STONITH_DELEGATE, op->delegate); + + /* Send fencing OP reply to local client that initiated fencing */ + do_local_reply(reply, op->client_id, op->call_options & st_opt_sync_call, FALSE); + + /* bcast to all local clients that the fencing operation happend */ +- do_stonith_notify(T_STONITH_NOTIFY_FENCE, rc, notify_data); ++ do_stonith_notify(T_STONITH_NOTIFY_FENCE, pcmk_rc2legacy(stonith__result2rc(result)), notify_data); + do_stonith_notify(T_STONITH_NOTIFY_HISTORY, pcmk_ok, NULL); + + /* mark this op as having notify's already sent */ +@@ -587,7 +589,7 @@ finalize_op(remote_fencing_op_t *op, xmlNode *data, + ((result->exit_reason == NULL)? 
"" : result->exit_reason), + op->id); + +- handle_local_reply_and_notify(op, data, pcmk_rc2legacy(stonith__result2rc(result))); ++ handle_local_reply_and_notify(op, data, result); + + if (!dup) { + finalize_op_duplicates(op, data, result); +-- +2.27.0 + + +From 004583f3ef908cbd9dc6305597cb55d5ad22882c Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Fri, 19 Nov 2021 16:47:13 -0600 +Subject: [PATCH 16/23] Refactor: fencer: pass full result when sending device + notifications + +Rename do_stonith_notify_device() to fenced_send_device_notification() for +consistency, and make it take the full result as an argument rather than a +legacy return code. The full result is not used yet, but that is planned. +--- + daemons/fenced/fenced_commands.c | 4 ++-- + daemons/fenced/pacemaker-fenced.c | 15 +++++++++++++-- + daemons/fenced/pacemaker-fenced.h | 4 +++- + 3 files changed, 18 insertions(+), 5 deletions(-) + +diff --git a/daemons/fenced/fenced_commands.c b/daemons/fenced/fenced_commands.c +index 84f89e8daf..86a761dfab 100644 +--- a/daemons/fenced/fenced_commands.c ++++ b/daemons/fenced/fenced_commands.c +@@ -3190,7 +3190,7 @@ handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, + PCMK_EXEC_INVALID, + "Unprivileged users must register device via CIB"); + } +- do_stonith_notify_device(op, pcmk_rc2legacy(stonith__result2rc(&result)), device_id); ++ fenced_send_device_notification(op, &result, device_id); + + } else if (pcmk__str_eq(op, STONITH_OP_DEVICE_DEL, pcmk__str_none)) { + xmlNode *dev = get_xpath_object("//" F_STONITH_DEVICE, request, LOG_ERR); +@@ -3204,7 +3204,7 @@ handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, + PCMK_EXEC_INVALID, + "Unprivileged users must delete device via CIB"); + } +- do_stonith_notify_device(op, pcmk_rc2legacy(stonith__result2rc(&result)), device_id); ++ fenced_send_device_notification(op, &result, device_id); + + } else if (pcmk__str_eq(op, STONITH_OP_LEVEL_ADD, pcmk__str_none)) { + char *device_id = NULL; 
+diff --git a/daemons/fenced/pacemaker-fenced.c b/daemons/fenced/pacemaker-fenced.c +index 56acc93f31..42e167ce78 100644 +--- a/daemons/fenced/pacemaker-fenced.c ++++ b/daemons/fenced/pacemaker-fenced.c +@@ -394,10 +394,21 @@ do_stonith_notify_config(const char *op, int rc, + free_xml(notify_data); + } + ++/*! ++ * \internal ++ * \brief Send notifications for a device change to subscribed clients ++ * ++ * \param[in] op Notification type (STONITH_OP_DEVICE_ADD or ++ * STONITH_OP_DEVICE_DEL) ++ * \param[in] result Operation result ++ * \param[in] desc ID of device that changed ++ */ + void +-do_stonith_notify_device(const char *op, int rc, const char *desc) ++fenced_send_device_notification(const char *op, ++ const pcmk__action_result_t *result, ++ const char *desc) + { +- do_stonith_notify_config(op, rc, desc, g_hash_table_size(device_list)); ++ do_stonith_notify_config(op, pcmk_rc2legacy(stonith__result2rc(result)), desc, g_hash_table_size(device_list)); + } + + void +diff --git a/daemons/fenced/pacemaker-fenced.h b/daemons/fenced/pacemaker-fenced.h +index ed47ab046c..0b63680171 100644 +--- a/daemons/fenced/pacemaker-fenced.h ++++ b/daemons/fenced/pacemaker-fenced.h +@@ -230,7 +230,9 @@ void + do_stonith_async_timeout_update(const char *client, const char *call_id, int timeout); + + void do_stonith_notify(const char *type, int result, xmlNode *data); +-void do_stonith_notify_device(const char *op, int rc, const char *desc); ++void fenced_send_device_notification(const char *op, ++ const pcmk__action_result_t *result, ++ const char *desc); + void do_stonith_notify_level(const char *op, int rc, const char *desc); + + remote_fencing_op_t *initiate_remote_stonith_op(pcmk__client_t *client, +-- +2.27.0 + + +From ee0777d5ca99d8d2d7805d4a73241ab696c68751 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Fri, 19 Nov 2021 16:51:55 -0600 +Subject: [PATCH 17/23] Refactor: fencer: pass full result when sending + topology notifications + +Rename do_stonith_notify_level() to 
fenced_send_level_notification() for +consistency, and make it take the full result as an argument rather than a +legacy return code. The full result is not used yet, but that is planned. +--- + daemons/fenced/fenced_commands.c | 4 ++-- + daemons/fenced/pacemaker-fenced.c | 21 +++++++++++++++------ + daemons/fenced/pacemaker-fenced.h | 4 +++- + 3 files changed, 20 insertions(+), 9 deletions(-) + +diff --git a/daemons/fenced/fenced_commands.c b/daemons/fenced/fenced_commands.c +index 86a761dfab..2f3dbb035a 100644 +--- a/daemons/fenced/fenced_commands.c ++++ b/daemons/fenced/fenced_commands.c +@@ -3216,7 +3216,7 @@ handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, + PCMK_EXEC_INVALID, + "Unprivileged users must add level via CIB"); + } +- do_stonith_notify_level(op, pcmk_rc2legacy(stonith__result2rc(&result)), device_id); ++ fenced_send_level_notification(op, &result, device_id); + free(device_id); + + } else if (pcmk__str_eq(op, STONITH_OP_LEVEL_DEL, pcmk__str_none)) { +@@ -3229,7 +3229,7 @@ handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, + PCMK_EXEC_INVALID, + "Unprivileged users must delete level via CIB"); + } +- do_stonith_notify_level(op, pcmk_rc2legacy(stonith__result2rc(&result)), device_id); ++ fenced_send_level_notification(op, &result, device_id); + + } else if(pcmk__str_eq(op, CRM_OP_RM_NODE_CACHE, pcmk__str_casei)) { + int node_id = 0; +diff --git a/daemons/fenced/pacemaker-fenced.c b/daemons/fenced/pacemaker-fenced.c +index 42e167ce78..773cf57f6b 100644 +--- a/daemons/fenced/pacemaker-fenced.c ++++ b/daemons/fenced/pacemaker-fenced.c +@@ -411,10 +411,21 @@ fenced_send_device_notification(const char *op, + do_stonith_notify_config(op, pcmk_rc2legacy(stonith__result2rc(result)), desc, g_hash_table_size(device_list)); + } + ++/*! 
++ * \internal ++ * \brief Send notifications for a topology level change to subscribed clients ++ * ++ * \param[in] op Notification type (STONITH_OP_LEVEL_ADD or ++ * STONITH_OP_LEVEL_DEL) ++ * \param[in] result Operation result ++ * \param[in] desc String representation of level ([]) ++ */ + void +-do_stonith_notify_level(const char *op, int rc, const char *desc) ++fenced_send_level_notification(const char *op, ++ const pcmk__action_result_t *result, ++ const char *desc) + { +- do_stonith_notify_config(op, rc, desc, g_hash_table_size(topology)); ++ do_stonith_notify_config(op, pcmk_rc2legacy(stonith__result2rc(result)), desc, g_hash_table_size(topology)); + } + + static void +@@ -429,8 +440,7 @@ topology_remove_helper(const char *node, int level) + crm_xml_add(data, XML_ATTR_STONITH_TARGET, node); + + fenced_unregister_level(data, &desc, &result); +- do_stonith_notify_level(STONITH_OP_LEVEL_DEL, +- pcmk_rc2legacy(stonith__result2rc(&result)), desc); ++ fenced_send_level_notification(STONITH_OP_LEVEL_DEL, &result, desc); + pcmk__reset_result(&result); + free_xml(data); + free(desc); +@@ -480,8 +490,7 @@ handle_topology_change(xmlNode *match, bool remove) + } + + fenced_register_level(match, &desc, &result); +- do_stonith_notify_level(STONITH_OP_LEVEL_ADD, +- pcmk_rc2legacy(stonith__result2rc(&result)), desc); ++ fenced_send_level_notification(STONITH_OP_LEVEL_ADD, &result, desc); + pcmk__reset_result(&result); + free(desc); + } +diff --git a/daemons/fenced/pacemaker-fenced.h b/daemons/fenced/pacemaker-fenced.h +index 0b63680171..8503e813bf 100644 +--- a/daemons/fenced/pacemaker-fenced.h ++++ b/daemons/fenced/pacemaker-fenced.h +@@ -233,7 +233,9 @@ void do_stonith_notify(const char *type, int result, xmlNode *data); + void fenced_send_device_notification(const char *op, + const pcmk__action_result_t *result, + const char *desc); +-void do_stonith_notify_level(const char *op, int rc, const char *desc); ++void fenced_send_level_notification(const char *op, ++ const 
pcmk__action_result_t *result, ++ const char *desc); + + remote_fencing_op_t *initiate_remote_stonith_op(pcmk__client_t *client, + xmlNode *request, +-- +2.27.0 + + +From deec1ea9bcd7e0062755aa8b74358bfd12e4b9f0 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Fri, 19 Nov 2021 16:53:26 -0600 +Subject: [PATCH 18/23] Refactor: fencer: pass full result when sending + configuration notifications + +Rename do_stonith_notify_config() to send_config_notification() for +consistency, and make it take the full result as an argument rather than a +legacy return code. The full result is not used yet, but that is planned. +--- + daemons/fenced/pacemaker-fenced.c | 19 +++++++++++++++---- + 1 file changed, 15 insertions(+), 4 deletions(-) + +diff --git a/daemons/fenced/pacemaker-fenced.c b/daemons/fenced/pacemaker-fenced.c +index 773cf57f6b..d64358e07f 100644 +--- a/daemons/fenced/pacemaker-fenced.c ++++ b/daemons/fenced/pacemaker-fenced.c +@@ -379,8 +379,19 @@ do_stonith_notify(const char *type, int result, xmlNode *data) + crm_trace("Notify complete"); + } + ++/*! 
++ * \internal ++ * \brief Send notifications for a configuration change to subscribed clients ++ * ++ * \param[in] op Notification type (STONITH_OP_DEVICE_ADD, ++ * STONITH_OP_DEVICE_DEL, STONITH_OP_LEVEL_ADD, or ++ * STONITH_OP_LEVEL_DEL) ++ * \param[in] result Operation result ++ * \param[in] desc Description of what changed ++ * \param[in] active Current number of devices or topologies in use ++ */ + static void +-do_stonith_notify_config(const char *op, int rc, ++send_config_notification(const char *op, const pcmk__action_result_t *result, + const char *desc, int active) + { + xmlNode *notify_data = create_xml_node(NULL, op); +@@ -390,7 +401,7 @@ do_stonith_notify_config(const char *op, int rc, + crm_xml_add(notify_data, F_STONITH_DEVICE, desc); + crm_xml_add_int(notify_data, F_STONITH_ACTIVE, active); + +- do_stonith_notify(op, rc, notify_data); ++ do_stonith_notify(op, pcmk_rc2legacy(stonith__result2rc(result)), notify_data); + free_xml(notify_data); + } + +@@ -408,7 +419,7 @@ fenced_send_device_notification(const char *op, + const pcmk__action_result_t *result, + const char *desc) + { +- do_stonith_notify_config(op, pcmk_rc2legacy(stonith__result2rc(result)), desc, g_hash_table_size(device_list)); ++ send_config_notification(op, result, desc, g_hash_table_size(device_list)); + } + + /*! 
+@@ -425,7 +436,7 @@ fenced_send_level_notification(const char *op, + const pcmk__action_result_t *result, + const char *desc) + { +- do_stonith_notify_config(op, pcmk_rc2legacy(stonith__result2rc(result)), desc, g_hash_table_size(topology)); ++ send_config_notification(op, result, desc, g_hash_table_size(topology)); + } + + static void +-- +2.27.0 + + +From 432e4445b630fb158482a5f6de1e0e41697a381f Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Fri, 19 Nov 2021 16:56:12 -0600 +Subject: [PATCH 19/23] Feature: fencer: pass full result when sending + notifications + +Rename do_stonith_notify() to fenced_send_notification() for consistency, and +make it take the full result as an argument rather than a legacy return code, +and add the full result to the notifications. +--- + daemons/fenced/fenced_commands.c | 4 ++-- + daemons/fenced/fenced_history.c | 6 +++--- + daemons/fenced/fenced_remote.c | 6 +++--- + daemons/fenced/pacemaker-fenced.c | 15 ++++++++++++--- + daemons/fenced/pacemaker-fenced.h | 4 +++- + 5 files changed, 23 insertions(+), 12 deletions(-) + +diff --git a/daemons/fenced/fenced_commands.c b/daemons/fenced/fenced_commands.c +index 2f3dbb035a..54ebc12947 100644 +--- a/daemons/fenced/fenced_commands.c ++++ b/daemons/fenced/fenced_commands.c +@@ -2489,8 +2489,8 @@ send_async_reply(async_command_t *cmd, const pcmk__action_result_t *result, + crm_xml_add(notify_data, F_STONITH_REMOTE_OP_ID, cmd->remote_op_id); + crm_xml_add(notify_data, F_STONITH_ORIGIN, cmd->client); + +- do_stonith_notify(T_STONITH_NOTIFY_FENCE, pcmk_rc2legacy(stonith__result2rc(result)), notify_data); +- do_stonith_notify(T_STONITH_NOTIFY_HISTORY, pcmk_ok, NULL); ++ fenced_send_notification(T_STONITH_NOTIFY_FENCE, result, notify_data); ++ fenced_send_notification(T_STONITH_NOTIFY_HISTORY, NULL, NULL); + } + } + +diff --git a/daemons/fenced/fenced_history.c b/daemons/fenced/fenced_history.c +index 1e07a9815a..44310ed77b 100644 +--- a/daemons/fenced/fenced_history.c ++++ 
b/daemons/fenced/fenced_history.c +@@ -100,7 +100,7 @@ stonith_fence_history_cleanup(const char *target, + g_hash_table_foreach_remove(stonith_remote_op_list, + stonith_remove_history_entry, + (gpointer) target); +- do_stonith_notify(T_STONITH_NOTIFY_HISTORY, pcmk_ok, NULL); ++ fenced_send_notification(T_STONITH_NOTIFY_HISTORY, NULL, NULL); + } + } + +@@ -402,7 +402,7 @@ stonith_local_history_diff_and_merge(GHashTable *remote_history, + + if (updated) { + stonith_fence_history_trim(); +- do_stonith_notify(T_STONITH_NOTIFY_HISTORY, pcmk_ok, NULL); ++ fenced_send_notification(T_STONITH_NOTIFY_HISTORY, NULL, NULL); + } + + if (cnt == 0) { +@@ -473,7 +473,7 @@ stonith_fence_history(xmlNode *msg, xmlNode **output, + is done so send a notification for anything + that smells like history-sync + */ +- do_stonith_notify(T_STONITH_NOTIFY_HISTORY_SYNCED, pcmk_ok, NULL); ++ fenced_send_notification(T_STONITH_NOTIFY_HISTORY_SYNCED, NULL, NULL); + if (crm_element_value(msg, F_STONITH_CALLID)) { + /* this is coming from the stonith-API + * +diff --git a/daemons/fenced/fenced_remote.c b/daemons/fenced/fenced_remote.c +index 329e06c444..16c181b4b0 100644 +--- a/daemons/fenced/fenced_remote.c ++++ b/daemons/fenced/fenced_remote.c +@@ -442,8 +442,8 @@ handle_local_reply_and_notify(remote_fencing_op_t *op, xmlNode *data, + do_local_reply(reply, op->client_id, op->call_options & st_opt_sync_call, FALSE); + + /* bcast to all local clients that the fencing operation happend */ +- do_stonith_notify(T_STONITH_NOTIFY_FENCE, pcmk_rc2legacy(stonith__result2rc(result)), notify_data); +- do_stonith_notify(T_STONITH_NOTIFY_HISTORY, pcmk_ok, NULL); ++ fenced_send_notification(T_STONITH_NOTIFY_FENCE, result, notify_data); ++ fenced_send_notification(T_STONITH_NOTIFY_HISTORY, NULL, NULL); + + /* mark this op as having notify's already sent */ + op->notify_sent = TRUE; +@@ -1211,7 +1211,7 @@ create_remote_stonith_op(const char *client, xmlNode * request, gboolean peer) + + if (op->state != 
st_duplicate) { + /* kick history readers */ +- do_stonith_notify(T_STONITH_NOTIFY_HISTORY, pcmk_ok, NULL); ++ fenced_send_notification(T_STONITH_NOTIFY_HISTORY, NULL, NULL); + } + + /* safe to trim as long as that doesn't touch pending ops */ +diff --git a/daemons/fenced/pacemaker-fenced.c b/daemons/fenced/pacemaker-fenced.c +index d64358e07f..6b31b814a3 100644 +--- a/daemons/fenced/pacemaker-fenced.c ++++ b/daemons/fenced/pacemaker-fenced.c +@@ -356,8 +356,17 @@ do_stonith_async_timeout_update(const char *client_id, const char *call_id, int + free_xml(notify_data); + } + ++/*! ++ * \internal ++ * \brief Notify relevant IPC clients of a fencing operation result ++ * ++ * \param[in] type Notification type ++ * \param[in] result Result of fencing operation (assume success if NULL) ++ * \param[in] data If not NULL, add to notification as call data ++ */ + void +-do_stonith_notify(const char *type, int result, xmlNode *data) ++fenced_send_notification(const char *type, const pcmk__action_result_t *result, ++ xmlNode *data) + { + /* TODO: Standardize the contents of data */ + xmlNode *update_msg = create_xml_node(NULL, "notify"); +@@ -367,7 +376,7 @@ do_stonith_notify(const char *type, int result, xmlNode *data) + crm_xml_add(update_msg, F_TYPE, T_STONITH_NOTIFY); + crm_xml_add(update_msg, F_SUBTYPE, type); + crm_xml_add(update_msg, F_STONITH_OPERATION, type); +- crm_xml_add_int(update_msg, F_STONITH_RC, result); ++ stonith__xe_set_result(update_msg, result); + + if (data != NULL) { + add_message_xml(update_msg, F_STONITH_CALLDATA, data); +@@ -401,7 +410,7 @@ send_config_notification(const char *op, const pcmk__action_result_t *result, + crm_xml_add(notify_data, F_STONITH_DEVICE, desc); + crm_xml_add_int(notify_data, F_STONITH_ACTIVE, active); + +- do_stonith_notify(op, pcmk_rc2legacy(stonith__result2rc(result)), notify_data); ++ fenced_send_notification(op, result, notify_data); + free_xml(notify_data); + } + +diff --git a/daemons/fenced/pacemaker-fenced.h 
b/daemons/fenced/pacemaker-fenced.h +index 8503e813bf..502fcc9a29 100644 +--- a/daemons/fenced/pacemaker-fenced.h ++++ b/daemons/fenced/pacemaker-fenced.h +@@ -229,7 +229,9 @@ xmlNode *fenced_construct_reply(xmlNode *request, xmlNode *data, + void + do_stonith_async_timeout_update(const char *client, const char *call_id, int timeout); + +-void do_stonith_notify(const char *type, int result, xmlNode *data); ++void fenced_send_notification(const char *type, ++ const pcmk__action_result_t *result, ++ xmlNode *data); + void fenced_send_device_notification(const char *op, + const pcmk__action_result_t *result, + const char *desc); +-- +2.27.0 + + +From 86deababe506c2bb8259538e5380b6a78dc4b770 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Fri, 19 Nov 2021 16:58:03 -0600 +Subject: [PATCH 20/23] Feature: fencer: pass full result when sending + notifications + +Rename create_op_done_notify() to fencing_result2xml() for readability, +make it take the full result as an argument rather than a legacy return code, +and add the full result to broadcasts and notifications. +--- + daemons/fenced/fenced_remote.c | 20 +++++++++++++++----- + 1 file changed, 15 insertions(+), 5 deletions(-) + +diff --git a/daemons/fenced/fenced_remote.c b/daemons/fenced/fenced_remote.c +index 16c181b4b0..4cf723e6df 100644 +--- a/daemons/fenced/fenced_remote.c ++++ b/daemons/fenced/fenced_remote.c +@@ -356,13 +356,22 @@ undo_op_remap(remote_fencing_op_t *op) + } + } + ++/*! ++ * \internal ++ * \brief Create notification data XML for a fencing operation result ++ * ++ * \param[in] op Fencer operation that completed ++ * \param[in] result Full operation result ++ * ++ * \return Newly created XML to add as notification data ++ * \note The caller is responsible for freeing the result. 
++ */ + static xmlNode * +-create_op_done_notify(remote_fencing_op_t * op, int rc) ++fencing_result2xml(remote_fencing_op_t *op, pcmk__action_result_t *result) + { + xmlNode *notify_data = create_xml_node(NULL, T_STONITH_NOTIFY_FENCE); + + crm_xml_add_int(notify_data, "state", op->state); +- crm_xml_add_int(notify_data, F_STONITH_RC, rc); + crm_xml_add(notify_data, F_STONITH_TARGET, op->target); + crm_xml_add(notify_data, F_STONITH_ACTION, op->action); + crm_xml_add(notify_data, F_STONITH_DELEGATE, op->delegate); +@@ -371,6 +380,7 @@ create_op_done_notify(remote_fencing_op_t * op, int rc) + crm_xml_add(notify_data, F_STONITH_CLIENTID, op->client_id); + crm_xml_add(notify_data, F_STONITH_CLIENTNAME, op->client_name); + ++ stonith__xe_set_result(notify_data, result); + return notify_data; + } + +@@ -388,7 +398,7 @@ fenced_broadcast_op_result(remote_fencing_op_t *op, + { + static int count = 0; + xmlNode *bcast = create_xml_node(NULL, T_STONITH_REPLY); +- xmlNode *notify_data = create_op_done_notify(op, pcmk_rc2legacy(stonith__result2rc(result))); ++ xmlNode *notify_data = fencing_result2xml(op, result); + + count++; + crm_trace("Broadcasting result to peers"); +@@ -430,7 +440,6 @@ handle_local_reply_and_notify(remote_fencing_op_t *op, xmlNode *data, + } + + /* Do notification with a clean data object */ +- notify_data = create_op_done_notify(op, pcmk_rc2legacy(stonith__result2rc(result))); + crm_xml_add_int(data, "state", op->state); + crm_xml_add(data, F_STONITH_TARGET, op->target); + crm_xml_add(data, F_STONITH_OPERATION, op->action); +@@ -442,13 +451,14 @@ handle_local_reply_and_notify(remote_fencing_op_t *op, xmlNode *data, + do_local_reply(reply, op->client_id, op->call_options & st_opt_sync_call, FALSE); + + /* bcast to all local clients that the fencing operation happend */ ++ notify_data = fencing_result2xml(op, result); + fenced_send_notification(T_STONITH_NOTIFY_FENCE, result, notify_data); ++ free_xml(notify_data); + 
fenced_send_notification(T_STONITH_NOTIFY_HISTORY, NULL, NULL); + + /* mark this op as having notify's already sent */ + op->notify_sent = TRUE; + free_xml(reply); +- free_xml(notify_data); + } + + /*! +-- +2.27.0 + + +From 2814cde97520b63ca5f9baf3df37d73507e89d34 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Wed, 15 Dec 2021 17:40:52 -0600 +Subject: [PATCH 21/23] Low: fencer: restore check for invalid topology level + target + +... per review. b7c7676c mistakenly dropped it +--- + daemons/fenced/fenced_commands.c | 12 +++++++++++- + 1 file changed, 11 insertions(+), 1 deletion(-) + +diff --git a/daemons/fenced/fenced_commands.c b/daemons/fenced/fenced_commands.c +index 54ebc12947..1a4a791385 100644 +--- a/daemons/fenced/fenced_commands.c ++++ b/daemons/fenced/fenced_commands.c +@@ -1636,6 +1636,16 @@ fenced_register_level(xmlNode *msg, char **desc, pcmk__action_result_t *result) + *desc = crm_strdup_printf("%s[%d]", target, id); + } + ++ // Ensure a valid target was specified ++ if ((mode < 0) || (mode > 2)) { ++ crm_warn("Ignoring topology level registration without valid target"); ++ free(target); ++ crm_log_xml_warn(level, "Bad level"); ++ pcmk__set_result(result, CRM_EX_INVALID_PARAM, PCMK_EXEC_INVALID, ++ "Invalid topology level target"); ++ return; ++ } ++ + // Ensure level ID is in allowed range + if ((id <= 0) || (id >= ST_LEVEL_MAX)) { + crm_warn("Ignoring topology registration for %s with invalid level %d", +@@ -1643,7 +1653,7 @@ fenced_register_level(xmlNode *msg, char **desc, pcmk__action_result_t *result) + free(target); + crm_log_xml_warn(level, "Bad level"); + pcmk__set_result(result, CRM_EX_INVALID_PARAM, PCMK_EXEC_INVALID, +- "Invalid topology level"); ++ "Invalid topology level number"); + return; + } + +-- +2.27.0 + + +From c82806f9e16abcea00025fd3a290477aef2d8d83 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Tue, 21 Dec 2021 16:23:29 -0600 +Subject: [PATCH 22/23] Low: fencer: free result memory when processing fencing + replies + 
+found in review +--- + daemons/fenced/fenced_remote.c | 24 +++++++++++++++--------- + 1 file changed, 15 insertions(+), 9 deletions(-) + +diff --git a/daemons/fenced/fenced_remote.c b/daemons/fenced/fenced_remote.c +index 4cf723e6df..9fda9ef060 100644 +--- a/daemons/fenced/fenced_remote.c ++++ b/daemons/fenced/fenced_remote.c +@@ -2241,14 +2241,14 @@ fenced_process_fencing_reply(xmlNode *msg) + /* Could be for an event that began before we started */ + /* TODO: Record the op for later querying */ + crm_info("Received peer result of unknown or expired operation %s", id); +- return; ++ goto done; + } + + if (op->devices && device && !pcmk__str_eq(op->devices->data, device, pcmk__str_casei)) { + crm_err("Received outdated reply for device %s (instead of %s) to " + "fence (%s) %s. Operation already timed out at peer level.", + device, (const char *) op->devices->data, op->action, op->target); +- return; ++ goto done; + } + + if (pcmk__str_eq(crm_element_value(msg, F_SUBTYPE), "broadcast", pcmk__str_casei)) { +@@ -2265,14 +2265,15 @@ fenced_process_fencing_reply(xmlNode *msg) + op->state = st_failed; + } + finalize_op(op, msg, &result, false); +- return; ++ goto done; ++ + } else if (!pcmk__str_eq(op->originator, stonith_our_uname, pcmk__str_casei)) { + /* If this isn't a remote level broadcast, and we are not the + * originator of the operation, we should not be receiving this msg. */ + crm_err("Received non-broadcast fencing result for operation %.8s " + "we do not own (device %s targeting %s)", + op->id, device, op->target); +- return; ++ goto done; + } + + if (pcmk_is_set(op->call_options, st_opt_topology)) { +@@ -2290,7 +2291,7 @@ fenced_process_fencing_reply(xmlNode *msg) + * and notify our local clients. 
*/ + if (op->state == st_done) { + finalize_op(op, msg, &result, false); +- return; ++ goto done; + } + + if ((op->phase == 2) && !pcmk__result_ok(&result)) { +@@ -2310,27 +2311,30 @@ fenced_process_fencing_reply(xmlNode *msg) + /* An operation completed successfully. Try another device if + * necessary, otherwise mark the operation as done. */ + advance_topology_device_in_level(op, device, msg); +- return; ++ goto done; + } else { + /* This device failed, time to try another topology level. If no other + * levels are available, mark this operation as failed and report results. */ + if (advance_topology_level(op, false) != pcmk_rc_ok) { + op->state = st_failed; + finalize_op(op, msg, &result, false); +- return; ++ goto done; + } + } ++ + } else if (pcmk__result_ok(&result) && (op->devices == NULL)) { + crm_trace("All done for %s", op->target); + op->state = st_done; + finalize_op(op, msg, &result, false); +- return; ++ goto done; ++ + } else if ((result.execution_status == PCMK_EXEC_TIMEOUT) + && (op->devices == NULL)) { + /* If the operation timed out don't bother retrying other peers. */ + op->state = st_failed; + finalize_op(op, msg, &result, false); +- return; ++ goto done; ++ + } else { + /* fall-through and attempt other fencing action using another peer */ + } +@@ -2340,6 +2344,8 @@ fenced_process_fencing_reply(xmlNode *msg) + op->target, op->originator, op->client_name, + pcmk_exec_status_str(result.execution_status)); + request_peer_fencing(op, NULL, &result); ++done: ++ pcmk__reset_result(&result); + } + + gboolean +-- +2.27.0 + + +From 137bf97fdb39043eebb02a0d3ebbe47ee8c7044c Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Tue, 21 Dec 2021 16:26:22 -0600 +Subject: [PATCH 23/23] Log: fencer: clarify timeout message + +... 
as suggested by review +--- + daemons/fenced/fenced_remote.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/daemons/fenced/fenced_remote.c b/daemons/fenced/fenced_remote.c +index 9fda9ef060..1e237150c5 100644 +--- a/daemons/fenced/fenced_remote.c ++++ b/daemons/fenced/fenced_remote.c +@@ -656,7 +656,7 @@ remote_op_timeout_one(gpointer userdata) + crm_notice("Peer's '%s' action targeting %s for client %s timed out " CRM_XS + " id=%.8s", op->action, op->target, op->client_name, op->id); + pcmk__set_result(&result, CRM_EX_ERROR, PCMK_EXEC_TIMEOUT, +- "Peer did not send fence result within timeout"); ++ "Peer did not return fence result within timeout"); + + + // Try another device, if appropriate +-- +2.27.0 + diff --git a/SOURCES/010-probe-failures.patch b/SOURCES/010-probe-failures.patch new file mode 100644 index 0000000..d90fc3c --- /dev/null +++ b/SOURCES/010-probe-failures.patch @@ -0,0 +1,4157 @@ +From f2e51898735b5e9990464141fc4aea3dd83f5067 Mon Sep 17 00:00:00 2001 +From: Chris Lumens +Date: Thu, 4 Nov 2021 14:36:41 -0400 +Subject: [PATCH 01/21] Refactor: scheduler: Use bool in unpack_rsc_op. + +Previously, we were using bool but TRUE/FALSE. Instead, use the actual +values. 
+--- + lib/pengine/unpack.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/lib/pengine/unpack.c b/lib/pengine/unpack.c +index b1e84110a2..ecc7275e15 100644 +--- a/lib/pengine/unpack.c ++++ b/lib/pengine/unpack.c +@@ -3671,7 +3671,7 @@ unpack_rsc_op(pe_resource_t *rsc, pe_node_t *node, xmlNode *xml_op, + const char *task = NULL; + const char *task_key = NULL; + const char *exit_reason = NULL; +- bool expired = FALSE; ++ bool expired = false; + pe_resource_t *parent = rsc; + enum action_fail_response failure_strategy = action_fail_recover; + +@@ -3727,7 +3727,7 @@ unpack_rsc_op(pe_resource_t *rsc, pe_node_t *node, xmlNode *xml_op, + + if ((status != PCMK_EXEC_NOT_INSTALLED) + && check_operation_expiry(rsc, node, rc, xml_op, data_set)) { +- expired = TRUE; ++ expired = true; + } + + if (!strcmp(task, CRMD_ACTION_STATUS)) { +-- +2.27.0 + + +From 4c961b8e670d336a368c7fd1535c247e40c6b48e Mon Sep 17 00:00:00 2001 +From: Chris Lumens +Date: Thu, 4 Nov 2021 15:07:01 -0400 +Subject: [PATCH 02/21] Refactor: scheduler: Add functions for determining if + an op is a probe. 
+ +--- + include/crm/common/util.h | 3 + + lib/common/operations.c | 21 +++++++ + lib/common/tests/operations/Makefile.am | 6 +- + .../tests/operations/pcmk_is_probe_test.c | 37 +++++++++++++ + .../tests/operations/pcmk_xe_is_probe_test.c | 55 +++++++++++++++++++ + lib/pengine/unpack.c | 12 ++-- + lib/pengine/utils.c | 5 +- + 7 files changed, 127 insertions(+), 12 deletions(-) + create mode 100644 lib/common/tests/operations/pcmk_is_probe_test.c + create mode 100644 lib/common/tests/operations/pcmk_xe_is_probe_test.c + +diff --git a/include/crm/common/util.h b/include/crm/common/util.h +index 2728b64492..fbea6e560c 100644 +--- a/include/crm/common/util.h ++++ b/include/crm/common/util.h +@@ -72,6 +72,9 @@ xmlNode *crm_create_op_xml(xmlNode *parent, const char *prefix, + const char *timeout); + #define CRM_DEFAULT_OP_TIMEOUT_S "20s" + ++bool pcmk_is_probe(const char *task, guint interval); ++bool pcmk_xe_is_probe(xmlNode *xml_op); ++ + int compare_version(const char *version1, const char *version2); + + /* coverity[+kill] */ +diff --git a/lib/common/operations.c b/lib/common/operations.c +index 366c189702..978df79082 100644 +--- a/lib/common/operations.c ++++ b/lib/common/operations.c +@@ -537,3 +537,24 @@ pcmk__is_fencing_action(const char *action) + { + return pcmk__str_any_of(action, "off", "reboot", "poweroff", NULL); + } ++ ++bool ++pcmk_is_probe(const char *task, guint interval) ++{ ++ if (task == NULL) { ++ return false; ++ } ++ ++ return (interval == 0) && pcmk__str_eq(task, CRMD_ACTION_STATUS, pcmk__str_none); ++} ++ ++bool ++pcmk_xe_is_probe(xmlNode *xml_op) ++{ ++ const char *task = crm_element_value(xml_op, XML_LRM_ATTR_TASK); ++ const char *interval_ms_s = crm_element_value(xml_op, XML_LRM_ATTR_INTERVAL_MS); ++ int interval_ms; ++ ++ pcmk__scan_min_int(interval_ms_s, &interval_ms, 0); ++ return pcmk_is_probe(task, interval_ms); ++} +diff --git a/lib/common/tests/operations/Makefile.am b/lib/common/tests/operations/Makefile.am +index 
c8814ff0a8..2e3d0b0679 100644 +--- a/lib/common/tests/operations/Makefile.am ++++ b/lib/common/tests/operations/Makefile.am +@@ -1,5 +1,5 @@ + # +-# Copyright 2020 the Pacemaker project contributors ++# Copyright 2020-2021 the Pacemaker project contributors + # + # The version control history for this file may have further details. + # +@@ -12,6 +12,8 @@ LDADD = $(top_builddir)/lib/common/libcrmcommon.la -lcmocka + include $(top_srcdir)/mk/tap.mk + + # Add "_test" to the end of all test program names to simplify .gitignore. +-check_PROGRAMS = parse_op_key_test ++check_PROGRAMS = parse_op_key_test \ ++ pcmk_is_probe_test \ ++ pcmk_xe_is_probe_test + + TESTS = $(check_PROGRAMS) +diff --git a/lib/common/tests/operations/pcmk_is_probe_test.c b/lib/common/tests/operations/pcmk_is_probe_test.c +new file mode 100644 +index 0000000000..9b449f1a70 +--- /dev/null ++++ b/lib/common/tests/operations/pcmk_is_probe_test.c +@@ -0,0 +1,37 @@ ++/* ++ * Copyright 2021 the Pacemaker project contributors ++ * ++ * The version control history for this file may have further details. ++ * ++ * This source code is licensed under the GNU Lesser General Public License ++ * version 2.1 or later (LGPLv2.1+) WITHOUT ANY WARRANTY. 
++ */ ++ ++#include ++ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++static void ++is_probe_test(void **state) ++{ ++ assert_false(pcmk_is_probe(NULL, 0)); ++ assert_false(pcmk_is_probe("", 0)); ++ assert_false(pcmk_is_probe("blahblah", 0)); ++ assert_false(pcmk_is_probe("monitor", 1)); ++ assert_true(pcmk_is_probe("monitor", 0)); ++} ++ ++int main(int argc, char **argv) ++{ ++ const struct CMUnitTest tests[] = { ++ cmocka_unit_test(is_probe_test), ++ }; ++ ++ cmocka_set_message_output(CM_OUTPUT_TAP); ++ return cmocka_run_group_tests(tests, NULL, NULL); ++} +diff --git a/lib/common/tests/operations/pcmk_xe_is_probe_test.c b/lib/common/tests/operations/pcmk_xe_is_probe_test.c +new file mode 100644 +index 0000000000..0283d1c145 +--- /dev/null ++++ b/lib/common/tests/operations/pcmk_xe_is_probe_test.c +@@ -0,0 +1,55 @@ ++/* ++ * Copyright 2021 the Pacemaker project contributors ++ * ++ * The version control history for this file may have further details. ++ * ++ * This source code is licensed under the GNU Lesser General Public License ++ * version 2.1 or later (LGPLv2.1+) WITHOUT ANY WARRANTY. 
++ */ ++ ++#include ++ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++static void ++op_is_probe_test(void **state) ++{ ++ xmlNode *node = NULL; ++ ++ assert_false(pcmk_xe_is_probe(NULL)); ++ ++ node = string2xml(""); ++ assert_false(pcmk_xe_is_probe(node)); ++ free_xml(node); ++ ++ node = string2xml(""); ++ assert_false(pcmk_xe_is_probe(node)); ++ free_xml(node); ++ ++ node = string2xml(""); ++ assert_false(pcmk_xe_is_probe(node)); ++ free_xml(node); ++ ++ node = string2xml(""); ++ assert_false(pcmk_xe_is_probe(node)); ++ free_xml(node); ++ ++ node = string2xml(""); ++ assert_true(pcmk_xe_is_probe(node)); ++ free_xml(node); ++} ++ ++int main(int argc, char **argv) ++{ ++ const struct CMUnitTest tests[] = { ++ cmocka_unit_test(op_is_probe_test), ++ }; ++ ++ cmocka_set_message_output(CM_OUTPUT_TAP); ++ return cmocka_run_group_tests(tests, NULL, NULL); ++} +diff --git a/lib/pengine/unpack.c b/lib/pengine/unpack.c +index ecc7275e15..7c0c66e696 100644 +--- a/lib/pengine/unpack.c ++++ b/lib/pengine/unpack.c +@@ -83,7 +83,6 @@ is_dangling_guest_node(pe_node_t *node) + return FALSE; + } + +- + /*! 
+ * \brief Schedule a fence action for a node + * +@@ -2984,7 +2983,6 @@ static void + unpack_rsc_op_failure(pe_resource_t * rsc, pe_node_t * node, int rc, xmlNode * xml_op, xmlNode ** last_failure, + enum action_fail_response * on_fail, pe_working_set_t * data_set) + { +- guint interval_ms = 0; + bool is_probe = false; + pe_action_t *action = NULL; + +@@ -2998,10 +2996,7 @@ unpack_rsc_op_failure(pe_resource_t * rsc, pe_node_t * node, int rc, xmlNode * x + + *last_failure = xml_op; + +- crm_element_value_ms(xml_op, XML_LRM_ATTR_INTERVAL_MS, &interval_ms); +- if ((interval_ms == 0) && !strcmp(task, CRMD_ACTION_STATUS)) { +- is_probe = true; +- } ++ is_probe = pcmk_xe_is_probe(xml_op); + + if (exit_reason == NULL) { + exit_reason = ""; +@@ -3163,8 +3158,9 @@ determine_op_status( + } + + crm_element_value_ms(xml_op, XML_LRM_ATTR_INTERVAL_MS, &interval_ms); +- if ((interval_ms == 0) && !strcmp(task, CRMD_ACTION_STATUS)) { +- is_probe = true; ++ is_probe = pcmk_xe_is_probe(xml_op); ++ ++ if (is_probe) { + task = "probe"; + } + +diff --git a/lib/pengine/utils.c b/lib/pengine/utils.c +index c5eda3898e..07753e173a 100644 +--- a/lib/pengine/utils.c ++++ b/lib/pengine/utils.c +@@ -1066,8 +1066,7 @@ unpack_operation(pe_action_t * action, xmlNode * xml_obj, pe_resource_t * contai + { + int timeout_ms = 0; + const char *value = NULL; +- bool is_probe = pcmk__str_eq(action->task, RSC_STATUS, pcmk__str_casei) +- && (interval_ms == 0); ++ bool is_probe = false; + #if ENABLE_VERSIONED_ATTRS + pe_rsc_action_details_t *rsc_details = NULL; + #endif +@@ -1094,6 +1093,8 @@ unpack_operation(pe_action_t * action, xmlNode * xml_obj, pe_resource_t * contai + + CRM_CHECK(action && action->rsc, return); + ++ is_probe = pcmk_is_probe(action->task, interval_ms); ++ + // Cluster-wide + pe__unpack_dataset_nvpairs(data_set->op_defaults, XML_TAG_META_SETS, &rule_data, + action->meta, NULL, FALSE, data_set); +-- +2.27.0 + + +From 09f32df97ab5064a15ba5a1fb3970d5c64ee7b30 Mon Sep 17 00:00:00 2001 
+From: Chris Lumens +Date: Fri, 19 Nov 2021 14:47:22 -0500 +Subject: [PATCH 03/21] Refactor: scheduler: Move setting interval_ms in + determine_op_status. + +This can now happen in the only place it's being used. +--- + lib/pengine/unpack.c | 9 ++++++--- + 1 file changed, 6 insertions(+), 3 deletions(-) + +diff --git a/lib/pengine/unpack.c b/lib/pengine/unpack.c +index 7c0c66e696..b9986d2462 100644 +--- a/lib/pengine/unpack.c ++++ b/lib/pengine/unpack.c +@@ -3142,7 +3142,6 @@ static int + determine_op_status( + pe_resource_t *rsc, int rc, int target_rc, pe_node_t * node, xmlNode * xml_op, enum action_fail_response * on_fail, pe_working_set_t * data_set) + { +- guint interval_ms = 0; + bool is_probe = false; + int result = PCMK_EXEC_DONE; + const char *key = get_op_key(xml_op); +@@ -3157,7 +3156,6 @@ determine_op_status( + exit_reason = ""; + } + +- crm_element_value_ms(xml_op, XML_LRM_ATTR_INTERVAL_MS, &interval_ms); + is_probe = pcmk_xe_is_probe(xml_op); + + if (is_probe) { +@@ -3230,12 +3228,17 @@ determine_op_status( + result = PCMK_EXEC_ERROR_FATAL; + break; + +- case PCMK_OCF_UNIMPLEMENT_FEATURE: ++ case PCMK_OCF_UNIMPLEMENT_FEATURE: { ++ guint interval_ms = 0; ++ crm_element_value_ms(xml_op, XML_LRM_ATTR_INTERVAL_MS, &interval_ms); ++ + if (interval_ms > 0) { + result = PCMK_EXEC_NOT_SUPPORTED; + break; + } + // fall through ++ } ++ + case PCMK_OCF_NOT_INSTALLED: + case PCMK_OCF_INVALID_PARAM: + case PCMK_OCF_INSUFFICIENT_PRIV: +-- +2.27.0 + + +From 6c8f47453afd6c100fddc45187faff17e15f7bfe Mon Sep 17 00:00:00 2001 +From: Chris Lumens +Date: Fri, 19 Nov 2021 14:57:57 -0500 +Subject: [PATCH 04/21] Refactor: scheduler: Add pcmk_xe_mask_failed_probe. + +Given an xmlNodePtr for a resource operation, this function will +determine whether it is a failed probe operation that should not be +displayed in crm_mon (or other places, I suppose) or not. 
+--- + include/crm/common/util.h | 1 + + lib/common/operations.c | 17 ++ + lib/common/tests/operations/Makefile.am | 3 +- + .../pcmk_xe_mask_probe_failure_test.c | 162 ++++++++++++++++++ + 4 files changed, 182 insertions(+), 1 deletion(-) + create mode 100644 lib/common/tests/operations/pcmk_xe_mask_probe_failure_test.c + +diff --git a/include/crm/common/util.h b/include/crm/common/util.h +index fbea6e560c..784069ba1b 100644 +--- a/include/crm/common/util.h ++++ b/include/crm/common/util.h +@@ -74,6 +74,7 @@ xmlNode *crm_create_op_xml(xmlNode *parent, const char *prefix, + + bool pcmk_is_probe(const char *task, guint interval); + bool pcmk_xe_is_probe(xmlNode *xml_op); ++bool pcmk_xe_mask_probe_failure(xmlNode *xml_op); + + int compare_version(const char *version1, const char *version2); + +diff --git a/lib/common/operations.c b/lib/common/operations.c +index 978df79082..54482b8863 100644 +--- a/lib/common/operations.c ++++ b/lib/common/operations.c +@@ -558,3 +558,20 @@ pcmk_xe_is_probe(xmlNode *xml_op) + pcmk__scan_min_int(interval_ms_s, &interval_ms, 0); + return pcmk_is_probe(task, interval_ms); + } ++ ++bool ++pcmk_xe_mask_probe_failure(xmlNode *xml_op) ++{ ++ int status = PCMK_EXEC_UNKNOWN; ++ int rc = PCMK_OCF_OK; ++ ++ if (!pcmk_xe_is_probe(xml_op)) { ++ return false; ++ } ++ ++ crm_element_value_int(xml_op, XML_LRM_ATTR_OPSTATUS, &status); ++ crm_element_value_int(xml_op, XML_LRM_ATTR_RC, &rc); ++ ++ return rc == PCMK_OCF_NOT_INSTALLED || rc == PCMK_OCF_INVALID_PARAM || ++ status == PCMK_EXEC_NOT_INSTALLED; ++} +diff --git a/lib/common/tests/operations/Makefile.am b/lib/common/tests/operations/Makefile.am +index 2e3d0b0679..457c5f7c7a 100644 +--- a/lib/common/tests/operations/Makefile.am ++++ b/lib/common/tests/operations/Makefile.am +@@ -14,6 +14,7 @@ include $(top_srcdir)/mk/tap.mk + # Add "_test" to the end of all test program names to simplify .gitignore. 
+ check_PROGRAMS = parse_op_key_test \ + pcmk_is_probe_test \ +- pcmk_xe_is_probe_test ++ pcmk_xe_is_probe_test \ ++ pcmk_xe_mask_probe_failure_test + + TESTS = $(check_PROGRAMS) +diff --git a/lib/common/tests/operations/pcmk_xe_mask_probe_failure_test.c b/lib/common/tests/operations/pcmk_xe_mask_probe_failure_test.c +new file mode 100644 +index 0000000000..a13f6d98f4 +--- /dev/null ++++ b/lib/common/tests/operations/pcmk_xe_mask_probe_failure_test.c +@@ -0,0 +1,162 @@ ++/* ++ * Copyright 2021 the Pacemaker project contributors ++ * ++ * The version control history for this file may have further details. ++ * ++ * This source code is licensed under the GNU Lesser General Public License ++ * version 2.1 or later (LGPLv2.1+) WITHOUT ANY WARRANTY. ++ */ ++ ++#include ++ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++static void ++op_is_not_probe_test(void **state) { ++ xmlNode *node = NULL; ++ ++ /* Not worth testing this thoroughly since it's just a duplicate of whether ++ * pcmk_op_is_probe works or not. 
++ */ ++ ++ node = string2xml(""); ++ assert_false(pcmk_xe_mask_probe_failure(node)); ++ free_xml(node); ++} ++ ++static void ++op_does_not_have_right_values_test(void **state) { ++ xmlNode *node = NULL; ++ ++ node = string2xml(""); ++ assert_false(pcmk_xe_mask_probe_failure(node)); ++ free_xml(node); ++ ++ node = string2xml(""); ++ assert_false(pcmk_xe_mask_probe_failure(node)); ++ free_xml(node); ++} ++ ++static void ++check_values_test(void **state) { ++ xmlNode *node = NULL; ++ ++ /* PCMK_EXEC_NOT_SUPPORTED */ ++ node = string2xml(""); ++ assert_false(pcmk_xe_mask_probe_failure(node)); ++ free_xml(node); ++ ++ node = string2xml(""); ++ assert_true(pcmk_xe_mask_probe_failure(node)); ++ free_xml(node); ++ ++ /* PCMK_EXEC_DONE */ ++ node = string2xml(""); ++ assert_false(pcmk_xe_mask_probe_failure(node)); ++ free_xml(node); ++ ++ node = string2xml(""); ++ assert_true(pcmk_xe_mask_probe_failure(node)); ++ free_xml(node); ++ ++ node = string2xml(""); ++ assert_true(pcmk_xe_mask_probe_failure(node)); ++ free_xml(node); ++ ++ node = string2xml(""); ++ assert_false(pcmk_xe_mask_probe_failure(node)); ++ free_xml(node); ++ ++ node = string2xml(""); ++ assert_false(pcmk_xe_mask_probe_failure(node)); ++ free_xml(node); ++ ++ /* PCMK_EXEC_NOT_INSTALLED */ ++ node = string2xml(""); ++ assert_true(pcmk_xe_mask_probe_failure(node)); ++ free_xml(node); ++ ++ node = string2xml(""); ++ assert_true(pcmk_xe_mask_probe_failure(node)); ++ free_xml(node); ++ ++ /* PCMK_EXEC_ERROR */ ++ node = string2xml(""); ++ assert_false(pcmk_xe_mask_probe_failure(node)); ++ free_xml(node); ++ ++ node = string2xml(""); ++ assert_true(pcmk_xe_mask_probe_failure(node)); ++ free_xml(node); ++ ++ node = string2xml(""); ++ assert_true(pcmk_xe_mask_probe_failure(node)); ++ free_xml(node); ++ ++ node = string2xml(""); ++ assert_false(pcmk_xe_mask_probe_failure(node)); ++ free_xml(node); ++ ++ node = string2xml(""); ++ assert_false(pcmk_xe_mask_probe_failure(node)); ++ free_xml(node); ++ ++ /* 
PCMK_EXEC_ERROR_HARD */ ++ node = string2xml(""); ++ assert_false(pcmk_xe_mask_probe_failure(node)); ++ free_xml(node); ++ ++ node = string2xml(""); ++ assert_true(pcmk_xe_mask_probe_failure(node)); ++ free_xml(node); ++ ++ node = string2xml(""); ++ assert_true(pcmk_xe_mask_probe_failure(node)); ++ free_xml(node); ++ ++ node = string2xml(""); ++ assert_false(pcmk_xe_mask_probe_failure(node)); ++ free_xml(node); ++ ++ node = string2xml(""); ++ assert_false(pcmk_xe_mask_probe_failure(node)); ++ free_xml(node); ++ ++ /* PCMK_EXEC_ERROR_FATAL */ ++ node = string2xml(""); ++ assert_false(pcmk_xe_mask_probe_failure(node)); ++ free_xml(node); ++ ++ node = string2xml(""); ++ assert_true(pcmk_xe_mask_probe_failure(node)); ++ free_xml(node); ++ ++ node = string2xml(""); ++ assert_true(pcmk_xe_mask_probe_failure(node)); ++ free_xml(node); ++ ++ node = string2xml(""); ++ assert_false(pcmk_xe_mask_probe_failure(node)); ++ free_xml(node); ++ ++ node = string2xml(""); ++ assert_false(pcmk_xe_mask_probe_failure(node)); ++ free_xml(node); ++} ++ ++int main(int argc, char **argv) ++{ ++ const struct CMUnitTest tests[] = { ++ cmocka_unit_test(op_is_not_probe_test), ++ cmocka_unit_test(op_does_not_have_right_values_test), ++ cmocka_unit_test(check_values_test), ++ }; ++ ++ cmocka_set_message_output(CM_OUTPUT_TAP); ++ return cmocka_run_group_tests(tests, NULL, NULL); ++} +-- +2.27.0 + + +From c9ce1aaf93cd20bb01e80102dda0ffffb07e6472 Mon Sep 17 00:00:00 2001 +From: Chris Lumens +Date: Wed, 1 Dec 2021 14:26:31 -0500 +Subject: [PATCH 05/21] Refactor: scheduler: Combine op status and rc remapping + into one function. + +Well, not quite. Doing the remapping is complicated enough to where it +makes sense to have them in separate functions. However, they can both +be called from a single new function that takes the place of the +previous two calls in unpack_rsc_op. 
+--- + lib/pengine/unpack.c | 157 ++++++++++++++++++++----------------------- + 1 file changed, 72 insertions(+), 85 deletions(-) + +diff --git a/lib/pengine/unpack.c b/lib/pengine/unpack.c +index b9986d2462..b659f319fb 100644 +--- a/lib/pengine/unpack.c ++++ b/lib/pengine/unpack.c +@@ -3121,36 +3121,68 @@ unpack_rsc_op_failure(pe_resource_t * rsc, pe_node_t * node, int rc, xmlNode * x + + /*! + * \internal +- * \brief Remap operation status based on action result ++ * \brief Remap informational monitor results and operation status + * +- * Given an action result, determine an appropriate operation status for the +- * purposes of responding to the action (the status provided by the executor is +- * not directly usable since the executor does not know what was expected). ++ * For the monitor results, certain OCF codes are for providing extended information ++ * to the user about services that aren't yet failed but not entirely healthy either. ++ * These must be treated as the "normal" result by Pacemaker. ++ * ++ * For operation status, the action result can be used to determine an appropriate ++ * status for the purposes of responding to the action. The status provided by the ++ * executor is not directly usable since the executor does not know what was expected. 
+ * ++ * \param[in] xml_op Operation history entry XML from CIB status + * \param[in,out] rsc Resource that operation history entry is for +- * \param[in] rc Actual return code of operation +- * \param[in] target_rc Expected return code of operation + * \param[in] node Node where operation was executed +- * \param[in] xml_op Operation history entry XML from CIB status +- * \param[in,out] on_fail What should be done about the result + * \param[in] data_set Current cluster working set ++ * \param[in,out] on_fail What should be done about the result ++ * \param[in] target_rc Expected return code of operation ++ * \param[in,out] rc Actual return code of operation ++ * \param[in,out] status Operation execution status ++ * ++ * \note If the result is remapped and the node is not shutting down or failed, ++ * the operation will be recorded in the data set's list of failed operations ++ * to highlight it for the user. + * +- * \return Operation status based on return code and action info + * \note This may update the resource's current and next role. 
+ */ +-static int +-determine_op_status( +- pe_resource_t *rsc, int rc, int target_rc, pe_node_t * node, xmlNode * xml_op, enum action_fail_response * on_fail, pe_working_set_t * data_set) +-{ ++static void ++remap_operation(xmlNode *xml_op, pe_resource_t *rsc, pe_node_t *node, ++ pe_working_set_t *data_set, enum action_fail_response *on_fail, ++ int target_rc, int *rc, int *status) { + bool is_probe = false; +- int result = PCMK_EXEC_DONE; +- const char *key = get_op_key(xml_op); + const char *task = crm_element_value(xml_op, XML_LRM_ATTR_TASK); ++ const char *key = get_op_key(xml_op); + const char *exit_reason = crm_element_value(xml_op, + XML_LRM_ATTR_EXIT_REASON); + ++ if (pcmk__str_eq(task, CRMD_ACTION_STATUS, pcmk__str_none)) { ++ int remapped_rc = pcmk__effective_rc(*rc); ++ ++ if (*rc != remapped_rc) { ++ crm_trace("Remapping monitor result %d to %d", *rc, remapped_rc); ++ if (!node->details->shutdown || node->details->online) { ++ record_failed_op(xml_op, node, rsc, data_set); ++ } ++ ++ *rc = remapped_rc; ++ } ++ } ++ ++ /* If the executor reported an operation status of anything but done or ++ * error, consider that final. But for done or error, we know better whether ++ * it should be treated as a failure or not, because we know the expected ++ * result. ++ */ ++ if (*status != PCMK_EXEC_DONE && *status != PCMK_EXEC_ERROR) { ++ return; ++ } ++ + CRM_ASSERT(rsc); +- CRM_CHECK(task != NULL, return PCMK_EXEC_ERROR); ++ CRM_CHECK(task != NULL, ++ *status = PCMK_EXEC_ERROR; return); ++ ++ *status = PCMK_EXEC_DONE; + + if (exit_reason == NULL) { + exit_reason = ""; +@@ -3171,23 +3203,23 @@ determine_op_status( + * those versions or processing of saved CIB files from those versions, + * so we do not need to care much about this case. 
+ */ +- result = PCMK_EXEC_ERROR; ++ *status = PCMK_EXEC_ERROR; + crm_warn("Expected result not found for %s on %s (corrupt or obsolete CIB?)", + key, node->details->uname); + +- } else if (target_rc != rc) { +- result = PCMK_EXEC_ERROR; ++ } else if (target_rc != *rc) { ++ *status = PCMK_EXEC_ERROR; + pe_rsc_debug(rsc, "%s on %s: expected %d (%s), got %d (%s%s%s)", + key, node->details->uname, + target_rc, services_ocf_exitcode_str(target_rc), +- rc, services_ocf_exitcode_str(rc), ++ *rc, services_ocf_exitcode_str(*rc), + (*exit_reason? ": " : ""), exit_reason); + } + +- switch (rc) { ++ switch (*rc) { + case PCMK_OCF_OK: + if (is_probe && (target_rc == PCMK_OCF_NOT_RUNNING)) { +- result = PCMK_EXEC_DONE; ++ *status = PCMK_EXEC_DONE; + pe_rsc_info(rsc, "Probe found %s active on %s at %s", + rsc->id, node->details->uname, + last_change_str(xml_op)); +@@ -3195,10 +3227,10 @@ determine_op_status( + break; + + case PCMK_OCF_NOT_RUNNING: +- if (is_probe || (target_rc == rc) ++ if (is_probe || (target_rc == *rc) + || !pcmk_is_set(rsc->flags, pe_rsc_managed)) { + +- result = PCMK_EXEC_DONE; ++ *status = PCMK_EXEC_DONE; + rsc->role = RSC_ROLE_STOPPED; + + /* clear any previous failure actions */ +@@ -3208,8 +3240,8 @@ determine_op_status( + break; + + case PCMK_OCF_RUNNING_PROMOTED: +- if (is_probe && (rc != target_rc)) { +- result = PCMK_EXEC_DONE; ++ if (is_probe && (*rc != target_rc)) { ++ *status = PCMK_EXEC_DONE; + pe_rsc_info(rsc, + "Probe found %s active and promoted on %s at %s", + rsc->id, node->details->uname, +@@ -3221,11 +3253,11 @@ determine_op_status( + case PCMK_OCF_DEGRADED_PROMOTED: + case PCMK_OCF_FAILED_PROMOTED: + rsc->role = RSC_ROLE_PROMOTED; +- result = PCMK_EXEC_ERROR; ++ *status = PCMK_EXEC_ERROR; + break; + + case PCMK_OCF_NOT_CONFIGURED: +- result = PCMK_EXEC_ERROR_FATAL; ++ *status = PCMK_EXEC_ERROR_FATAL; + break; + + case PCMK_OCF_UNIMPLEMENT_FEATURE: { +@@ -3233,7 +3265,7 @@ determine_op_status( + crm_element_value_ms(xml_op, 
XML_LRM_ATTR_INTERVAL_MS, &interval_ms); + + if (interval_ms > 0) { +- result = PCMK_EXEC_NOT_SUPPORTED; ++ *status = PCMK_EXEC_NOT_SUPPORTED; + break; + } + // fall through +@@ -3248,26 +3280,27 @@ determine_op_status( + pe_proc_err("No further recovery can be attempted for %s " + "because %s on %s failed (%s%s%s) at %s " + CRM_XS " rc=%d id=%s", rsc->id, task, +- node->details->uname, services_ocf_exitcode_str(rc), ++ node->details->uname, services_ocf_exitcode_str(*rc), + (*exit_reason? ": " : ""), exit_reason, +- last_change_str(xml_op), rc, ID(xml_op)); ++ last_change_str(xml_op), *rc, ID(xml_op)); + pe__clear_resource_flags(rsc, pe_rsc_managed); + pe__set_resource_flags(rsc, pe_rsc_block); + } +- result = PCMK_EXEC_ERROR_HARD; ++ *status = PCMK_EXEC_ERROR_HARD; + break; + + default: +- if (result == PCMK_EXEC_DONE) { ++ if (*status == PCMK_EXEC_DONE) { + crm_info("Treating unknown exit status %d from %s of %s " + "on %s at %s as failure", +- rc, task, rsc->id, node->details->uname, ++ *rc, task, rsc->id, node->details->uname, + last_change_str(xml_op)); +- result = PCMK_EXEC_ERROR; ++ *status = PCMK_EXEC_ERROR; + } + break; + } +- return result; ++ ++ pe_rsc_trace(rsc, "Remapped %s status to %d", key, *status); + } + + // return TRUE if start or monitor last failure but parameters changed +@@ -3622,41 +3655,6 @@ update_resource_state(pe_resource_t * rsc, pe_node_t * node, xmlNode * xml_op, c + } + } + +-/*! +- * \internal +- * \brief Remap informational monitor results to usual values +- * +- * Certain OCF result codes are for providing extended information to the +- * user about services that aren't yet failed but not entirely healthy either. +- * These must be treated as the "normal" result by Pacemaker. 
+- * +- * \param[in] rc Actual result of a monitor action +- * \param[in] xml_op Operation history XML +- * \param[in] node Node that operation happened on +- * \param[in] rsc Resource that operation happened to +- * \param[in] data_set Cluster working set +- * +- * \return Result code that pacemaker should use +- * +- * \note If the result is remapped, and the node is not shutting down or failed, +- * the operation will be recorded in the data set's list of failed +- * operations, to highlight it for the user. +- */ +-static int +-remap_monitor_rc(int rc, xmlNode *xml_op, const pe_node_t *node, +- const pe_resource_t *rsc, pe_working_set_t *data_set) +-{ +- int remapped_rc = pcmk__effective_rc(rc); +- +- if (rc != remapped_rc) { +- crm_trace("Remapping monitor result %d to %d", rc, remapped_rc); +- if (!node->details->shutdown || node->details->online) { +- record_failed_op(xml_op, node, rsc, data_set); +- } +- } +- return remapped_rc; +-} +- + static void + unpack_rsc_op(pe_resource_t *rsc, pe_node_t *node, xmlNode *xml_op, + xmlNode **last_failure, enum action_fail_response *on_fail, +@@ -3712,7 +3710,7 @@ unpack_rsc_op(pe_resource_t *rsc, pe_node_t *node, xmlNode *xml_op, + node->details->uname, rsc->id); + } + +- /* It should be possible to call remap_monitor_rc() first then call ++ /* It should be possible to call remap_operation() first then call + * check_operation_expiry() only if rc != target_rc, because there should + * never be a fail count without at least one unexpected result in the + * resource history. 
That would be more efficient by avoiding having to call +@@ -3729,9 +3727,8 @@ unpack_rsc_op(pe_resource_t *rsc, pe_node_t *node, xmlNode *xml_op, + expired = true; + } + +- if (!strcmp(task, CRMD_ACTION_STATUS)) { +- rc = remap_monitor_rc(rc, xml_op, node, rsc, data_set); +- } ++ remap_operation(xml_op, rsc, node, data_set, on_fail, target_rc, ++ &rc, &status); + + if (expired && (rc != target_rc)) { + const char *magic = crm_element_value(xml_op, XML_ATTR_TRANSITION_MAGIC); +@@ -3761,16 +3758,6 @@ unpack_rsc_op(pe_resource_t *rsc, pe_node_t *node, xmlNode *xml_op, + } + } + +- /* If the executor reported an operation status of anything but done or +- * error, consider that final. But for done or error, we know better whether +- * it should be treated as a failure or not, because we know the expected +- * result. +- */ +- if(status == PCMK_EXEC_DONE || status == PCMK_EXEC_ERROR) { +- status = determine_op_status(rsc, rc, target_rc, node, xml_op, on_fail, data_set); +- pe_rsc_trace(rsc, "Remapped %s status to %d", task_key, status); +- } +- + switch (status) { + case PCMK_EXEC_CANCELLED: + // Should never happen +-- +2.27.0 + + +From 9fdca1999872b3930cf18b7d807ddb259f23e8a5 Mon Sep 17 00:00:00 2001 +From: Chris Lumens +Date: Fri, 19 Nov 2021 15:08:16 -0500 +Subject: [PATCH 06/21] Test: cts-cli: Add test output for a native resource + with a failed probe op. + +There are no code changes yet to properly handle displaying these +operations, so the results here just reflect the current handling. 
+--- + cts/cli/crm_mon-partial.xml | 16 +++++++++++ + cts/cli/regression.crm_mon.exp | 50 ++++++++++++++++++++++++++-------- + 2 files changed, 55 insertions(+), 11 deletions(-) + +diff --git a/cts/cli/crm_mon-partial.xml b/cts/cli/crm_mon-partial.xml +index e6c6894b6f..b7817e4775 100644 +--- a/cts/cli/crm_mon-partial.xml ++++ b/cts/cli/crm_mon-partial.xml +@@ -60,6 +60,16 @@ + + + ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ + + + +@@ -94,6 +104,9 @@ + + + ++ ++ ++ + + + +@@ -135,6 +148,9 @@ + + + ++ ++ ++ + + + +diff --git a/cts/cli/regression.crm_mon.exp b/cts/cli/regression.crm_mon.exp +index 8714f917a9..d12dce3ae8 100644 +--- a/cts/cli/regression.crm_mon.exp ++++ b/cts/cli/regression.crm_mon.exp +@@ -3470,7 +3470,7 @@ Cluster Summary: + * Last updated: + * Last change: + * 4 nodes configured +- * 13 resource instances configured (1 DISABLED) ++ * 14 resource instances configured (1 DISABLED) + + Node List: + * Online: [ cluster01 cluster02 ] +@@ -3485,6 +3485,9 @@ Active Resources: + * httpd-bundle-1 (192.168.122.132) (ocf:heartbeat:apache): Stopped cluster01 + * Resource Group: partially-active-group (1 member inactive): + * dummy-1 (ocf:pacemaker:Dummy): Started cluster02 ++ ++Failed Resource Actions: ++ * smart-mon probe on cluster02 returned 'not installed' at Tue Nov 9 15:38:55 2021 after 33ms + =#=#=#= End test: Text output of partially active resources - OK (0) =#=#=#= + * Passed: crm_mon - Text output of partially active resources + =#=#=#= Begin test: XML output of partially active resources =#=#=#= +@@ -3495,7 +3498,7 @@ Active Resources: + + + +- ++ + + + +@@ -3548,6 +3551,7 @@ Active Resources: + + + ++ + + + +@@ -3574,6 +3578,9 @@ Active Resources: + + + ++ ++ ++ + + + +@@ -3603,6 +3610,9 @@ Active Resources: + + + ++ ++ ++ + + + =#=#=#= End test: XML output of partially active resources - OK (0) =#=#=#= +@@ -3614,7 +3624,7 @@ Cluster Summary: + * Last updated: + * Last change: + * 4 nodes configured +- * 13 resource instances configured (1 DISABLED) ++ * 14 
resource instances configured (1 DISABLED) + + Node List: + * Online: [ cluster01 cluster02 ] +@@ -3631,6 +3641,10 @@ Full List of Resources: + * Resource Group: partially-active-group: + * dummy-1 (ocf:pacemaker:Dummy): Started cluster02 + * dummy-2 (ocf:pacemaker:Dummy): Stopped (disabled) ++ * smart-mon (ocf:pacemaker:HealthSMART): Stopped ++ ++Failed Resource Actions: ++ * smart-mon probe on cluster02 returned 'not installed' at Tue Nov 9 15:38:55 2021 after 33ms + =#=#=#= End test: Text output of partially active resources, with inactive resources - OK (0) =#=#=#= + * Passed: crm_mon - Text output of partially active resources, with inactive resources + =#=#=#= Begin test: Complete brief text output, with inactive resources =#=#=#= +@@ -3640,13 +3654,14 @@ Cluster Summary: + * Last updated: + * Last change: + * 4 nodes configured +- * 13 resource instances configured (1 DISABLED) ++ * 14 resource instances configured (1 DISABLED) + + Node List: + * Online: [ cluster01 cluster02 ] + * GuestOnline: [ httpd-bundle-0@cluster02 httpd-bundle-1@cluster01 ] + + Full List of Resources: ++ * 0/1 (ocf:pacemaker:HealthSMART): Active + * 1/1 (stonith:fence_xvm): Active cluster01 + * Clone Set: ping-clone [ping]: + * Started: [ cluster01 ] +@@ -3676,6 +3691,8 @@ Operations: + * (3) monitor: interval="30000ms" + * dummy-1: migration-threshold=1000000: + * (2) start ++ * smart-mon: migration-threshold=1000000: ++ * (9) probe + * Node: cluster01: + * Fencing: migration-threshold=1000000: + * (15) start +@@ -3695,6 +3712,9 @@ Operations: + * Node: httpd-bundle-0@cluster02: + * httpd: migration-threshold=1000000: + * (1) start ++ ++Failed Resource Actions: ++ * smart-mon probe on cluster02 returned 'not installed' at Tue Nov 9 15:38:55 2021 after 33ms + =#=#=#= End test: Complete brief text output, with inactive resources - OK (0) =#=#=#= + * Passed: crm_mon - Complete brief text output, with inactive resources + =#=#=#= Begin test: Text output of partially active group =#=#=#= 
+@@ -3704,7 +3724,7 @@ Cluster Summary: + * Last updated: + * Last change: + * 4 nodes configured +- * 13 resource instances configured (1 DISABLED) ++ * 14 resource instances configured (1 DISABLED) + + Node List: + * Online: [ cluster01 cluster02 ] +@@ -3722,7 +3742,7 @@ Cluster Summary: + * Last updated: + * Last change: + * 4 nodes configured +- * 13 resource instances configured (1 DISABLED) ++ * 14 resource instances configured (1 DISABLED) + + Node List: + * Online: [ cluster01 cluster02 ] +@@ -3741,7 +3761,7 @@ Cluster Summary: + * Last updated: + * Last change: + * 4 nodes configured +- * 13 resource instances configured (1 DISABLED) ++ * 14 resource instances configured (1 DISABLED) + + Node List: + * Online: [ cluster01 cluster02 ] +@@ -3759,7 +3779,7 @@ Cluster Summary: + * Last updated: + * Last change: + * 4 nodes configured +- * 13 resource instances configured (1 DISABLED) ++ * 14 resource instances configured (1 DISABLED) + + Node List: + * Online: [ cluster01 cluster02 ] +@@ -3777,7 +3797,7 @@ Cluster Summary: + * Last updated: + * Last change: + * 4 nodes configured +- * 13 resource instances configured (1 DISABLED) ++ * 14 resource instances configured (1 DISABLED) + + Node List: + * Node cluster01: online: +@@ -3806,6 +3826,7 @@ Inactive Resources: + * httpd-bundle-1 (192.168.122.132) (ocf:heartbeat:apache): Stopped cluster01 + * Resource Group: partially-active-group: + * 1/2 (ocf:pacemaker:Dummy): Active cluster02 ++ * smart-mon (ocf:pacemaker:HealthSMART): Stopped + + Node Attributes: + * Node: cluster01: +@@ -3826,6 +3847,8 @@ Operations: + * (3) monitor: interval="30000ms" + * dummy-1: migration-threshold=1000000: + * (2) start ++ * smart-mon: migration-threshold=1000000: ++ * (9) probe + * Node: cluster01: + * Fencing: migration-threshold=1000000: + * (15) start +@@ -3845,6 +3868,9 @@ Operations: + * Node: httpd-bundle-0@cluster02: + * httpd: migration-threshold=1000000: + * (1) start ++ ++Failed Resource Actions: ++ * smart-mon probe on 
cluster02 returned 'not installed' at Tue Nov 9 15:38:55 2021 after 33ms + =#=#=#= End test: Complete brief text output grouped by node, with inactive resources - OK (0) =#=#=#= + * Passed: crm_mon - Complete brief text output grouped by node, with inactive resources + =#=#=#= Begin test: Text output of partially active resources, with inactive resources, filtered by node =#=#=#= +@@ -3854,7 +3880,7 @@ Cluster Summary: + * Last updated: + * Last change: + * 4 nodes configured +- * 13 resource instances configured (1 DISABLED) ++ * 14 resource instances configured (1 DISABLED) + + Node List: + * Online: [ cluster01 ] +@@ -3865,6 +3891,7 @@ Full List of Resources: + * Fencing (stonith:fence_xvm): Started cluster01 + * Container bundle set: httpd-bundle [pcmk:http]: + * httpd-bundle-1 (192.168.122.132) (ocf:heartbeat:apache): Stopped cluster01 ++ * smart-mon (ocf:pacemaker:HealthSMART): Stopped + =#=#=#= End test: Text output of partially active resources, with inactive resources, filtered by node - OK (0) =#=#=#= + * Passed: crm_mon - Text output of partially active resources, with inactive resources, filtered by node + =#=#=#= Begin test: Text output of partially active resources, filtered by node =#=#=#= +@@ -3875,7 +3902,7 @@ Full List of Resources: + + + +- ++ + + + +@@ -3905,6 +3932,7 @@ Full List of Resources: + + + ++ + + + +-- +2.27.0 + + +From 1c54d0bbb74d066d55a56eae28d1a579b8854604 Mon Sep 17 00:00:00 2001 +From: Chris Lumens +Date: Fri, 19 Nov 2021 15:17:52 -0500 +Subject: [PATCH 07/21] Test: cts-cli: Add test output for a cloned resource + with a failed probe op. + +There are no code changes yet to properly handle displaying these +operations, so the results here just reflect the current handling. 
+--- + cts/cli/crm_mon-partial.xml | 3 +++ + cts/cli/regression.crm_mon.exp | 12 ++++++++++++ + 2 files changed, 15 insertions(+) + +diff --git a/cts/cli/crm_mon-partial.xml b/cts/cli/crm_mon-partial.xml +index b7817e4775..1f9dc156aa 100644 +--- a/cts/cli/crm_mon-partial.xml ++++ b/cts/cli/crm_mon-partial.xml +@@ -107,6 +107,9 @@ + + + ++ ++ ++ + + + +diff --git a/cts/cli/regression.crm_mon.exp b/cts/cli/regression.crm_mon.exp +index d12dce3ae8..d093bd8106 100644 +--- a/cts/cli/regression.crm_mon.exp ++++ b/cts/cli/regression.crm_mon.exp +@@ -3488,6 +3488,7 @@ Active Resources: + + Failed Resource Actions: + * smart-mon probe on cluster02 returned 'not installed' at Tue Nov 9 15:38:55 2021 after 33ms ++ * ping probe on cluster02 returned 'not installed' at Thu Nov 18 13:11:42 2021 + =#=#=#= End test: Text output of partially active resources - OK (0) =#=#=#= + * Passed: crm_mon - Text output of partially active resources + =#=#=#= Begin test: XML output of partially active resources =#=#=#= +@@ -3581,6 +3582,9 @@ Failed Resource Actions: + + + ++ ++ ++ + + + +@@ -3612,6 +3616,7 @@ Failed Resource Actions: + + + ++ + + + +@@ -3645,6 +3650,7 @@ Full List of Resources: + + Failed Resource Actions: + * smart-mon probe on cluster02 returned 'not installed' at Tue Nov 9 15:38:55 2021 after 33ms ++ * ping probe on cluster02 returned 'not installed' at Thu Nov 18 13:11:42 2021 + =#=#=#= End test: Text output of partially active resources, with inactive resources - OK (0) =#=#=#= + * Passed: crm_mon - Text output of partially active resources, with inactive resources + =#=#=#= Begin test: Complete brief text output, with inactive resources =#=#=#= +@@ -3693,6 +3699,8 @@ Operations: + * (2) start + * smart-mon: migration-threshold=1000000: + * (9) probe ++ * ping: migration-threshold=1000000: ++ * (6) probe + * Node: cluster01: + * Fencing: migration-threshold=1000000: + * (15) start +@@ -3715,6 +3723,7 @@ Operations: + + Failed Resource Actions: + * smart-mon probe on 
cluster02 returned 'not installed' at Tue Nov 9 15:38:55 2021 after 33ms ++ * ping probe on cluster02 returned 'not installed' at Thu Nov 18 13:11:42 2021 + =#=#=#= End test: Complete brief text output, with inactive resources - OK (0) =#=#=#= + * Passed: crm_mon - Complete brief text output, with inactive resources + =#=#=#= Begin test: Text output of partially active group =#=#=#= +@@ -3849,6 +3858,8 @@ Operations: + * (2) start + * smart-mon: migration-threshold=1000000: + * (9) probe ++ * ping: migration-threshold=1000000: ++ * (6) probe + * Node: cluster01: + * Fencing: migration-threshold=1000000: + * (15) start +@@ -3871,6 +3882,7 @@ Operations: + + Failed Resource Actions: + * smart-mon probe on cluster02 returned 'not installed' at Tue Nov 9 15:38:55 2021 after 33ms ++ * ping probe on cluster02 returned 'not installed' at Thu Nov 18 13:11:42 2021 + =#=#=#= End test: Complete brief text output grouped by node, with inactive resources - OK (0) =#=#=#= + * Passed: crm_mon - Complete brief text output grouped by node, with inactive resources + =#=#=#= Begin test: Text output of partially active resources, with inactive resources, filtered by node =#=#=#= +-- +2.27.0 + + +From 9408f08c07eb531ff84b07bf959f3d681ebf2b78 Mon Sep 17 00:00:00 2001 +From: Chris Lumens +Date: Fri, 19 Nov 2021 15:48:16 -0500 +Subject: [PATCH 08/21] Test: cts-cli: Change the resources in + partially-active-group. + +dummy-2 is now not running because it failed to start due to an +unimplemented feature. I don't know what could possibly be +unimplemented about a dummy resource, but it's not important. + +There is also a new dummy-3 resource that acts exactly the same as +dummy-2. This preserves checking that the inactive member output can +still be displayed. + +There are no code changes yet to properly handle displaying these +operations, so the results here just reflect the current handling. 
+--- + cts/cli/crm_mon-partial.xml | 6 +++- + cts/cli/regression.crm_mon.exp | 62 +++++++++++++++++++++++----------- + 2 files changed, 47 insertions(+), 21 deletions(-) + +diff --git a/cts/cli/crm_mon-partial.xml b/cts/cli/crm_mon-partial.xml +index 1f9dc156aa..1ce80ea58a 100644 +--- a/cts/cli/crm_mon-partial.xml ++++ b/cts/cli/crm_mon-partial.xml +@@ -54,7 +54,8 @@ + + + +- ++ ++ + + + +@@ -104,6 +105,9 @@ + + + ++ ++ ++ + + + +diff --git a/cts/cli/regression.crm_mon.exp b/cts/cli/regression.crm_mon.exp +index d093bd8106..8cf3a1215e 100644 +--- a/cts/cli/regression.crm_mon.exp ++++ b/cts/cli/regression.crm_mon.exp +@@ -3470,7 +3470,7 @@ Cluster Summary: + * Last updated: + * Last change: + * 4 nodes configured +- * 14 resource instances configured (1 DISABLED) ++ * 15 resource instances configured (1 DISABLED) + + Node List: + * Online: [ cluster01 cluster02 ] +@@ -3485,8 +3485,10 @@ Active Resources: + * httpd-bundle-1 (192.168.122.132) (ocf:heartbeat:apache): Stopped cluster01 + * Resource Group: partially-active-group (1 member inactive): + * dummy-1 (ocf:pacemaker:Dummy): Started cluster02 ++ * dummy-2 (ocf:pacemaker:Dummy): FAILED cluster02 + + Failed Resource Actions: ++ * dummy-2 probe on cluster02 returned 'unimplemented feature' at Wed Sep 2 12:17:38 2020 after 33ms + * smart-mon probe on cluster02 returned 'not installed' at Tue Nov 9 15:38:55 2021 after 33ms + * ping probe on cluster02 returned 'not installed' at Thu Nov 18 13:11:42 2021 + =#=#=#= End test: Text output of partially active resources - OK (0) =#=#=#= +@@ -3499,12 +3501,12 @@ Failed Resource Actions: + + + +- ++ + + + + +- ++ + + + +@@ -3546,11 +3548,14 @@ Failed Resource Actions: + + + +- ++ + + + +- ++ ++ ++ ++ + + + +@@ -3579,6 +3584,9 @@ Failed Resource Actions: + + + ++ ++ ++ + + + +@@ -3615,6 +3623,7 @@ Failed Resource Actions: + + + ++ + + + +@@ -3629,7 +3638,7 @@ Cluster Summary: + * Last updated: + * Last change: + * 4 nodes configured +- * 14 resource instances configured (1 
DISABLED) ++ * 15 resource instances configured (1 DISABLED) + + Node List: + * Online: [ cluster01 cluster02 ] +@@ -3645,10 +3654,12 @@ Full List of Resources: + * httpd-bundle-1 (192.168.122.132) (ocf:heartbeat:apache): Stopped cluster01 + * Resource Group: partially-active-group: + * dummy-1 (ocf:pacemaker:Dummy): Started cluster02 +- * dummy-2 (ocf:pacemaker:Dummy): Stopped (disabled) ++ * dummy-2 (ocf:pacemaker:Dummy): FAILED cluster02 ++ * dummy-3 (ocf:pacemaker:Dummy): Stopped (disabled) + * smart-mon (ocf:pacemaker:HealthSMART): Stopped + + Failed Resource Actions: ++ * dummy-2 probe on cluster02 returned 'unimplemented feature' at Wed Sep 2 12:17:38 2020 after 33ms + * smart-mon probe on cluster02 returned 'not installed' at Tue Nov 9 15:38:55 2021 after 33ms + * ping probe on cluster02 returned 'not installed' at Thu Nov 18 13:11:42 2021 + =#=#=#= End test: Text output of partially active resources, with inactive resources - OK (0) =#=#=#= +@@ -3660,7 +3671,7 @@ Cluster Summary: + * Last updated: + * Last change: + * 4 nodes configured +- * 14 resource instances configured (1 DISABLED) ++ * 15 resource instances configured (1 DISABLED) + + Node List: + * Online: [ cluster01 cluster02 ] +@@ -3676,7 +3687,7 @@ Full List of Resources: + * httpd-bundle-0 (192.168.122.131) (ocf:heartbeat:apache): Started cluster02 + * httpd-bundle-1 (192.168.122.132) (ocf:heartbeat:apache): Stopped cluster01 + * Resource Group: partially-active-group: +- * 1/2 (ocf:pacemaker:Dummy): Active cluster02 ++ * 2/3 (ocf:pacemaker:Dummy): Active cluster02 + + Node Attributes: + * Node: cluster01: +@@ -3697,6 +3708,8 @@ Operations: + * (3) monitor: interval="30000ms" + * dummy-1: migration-threshold=1000000: + * (2) start ++ * dummy-2: migration-threshold=1000000: ++ * (2) probe + * smart-mon: migration-threshold=1000000: + * (9) probe + * ping: migration-threshold=1000000: +@@ -3722,6 +3735,7 @@ Operations: + * (1) start + + Failed Resource Actions: ++ * dummy-2 probe on cluster02 
returned 'unimplemented feature' at Wed Sep 2 12:17:38 2020 after 33ms + * smart-mon probe on cluster02 returned 'not installed' at Tue Nov 9 15:38:55 2021 after 33ms + * ping probe on cluster02 returned 'not installed' at Thu Nov 18 13:11:42 2021 + =#=#=#= End test: Complete brief text output, with inactive resources - OK (0) =#=#=#= +@@ -3733,7 +3747,7 @@ Cluster Summary: + * Last updated: + * Last change: + * 4 nodes configured +- * 14 resource instances configured (1 DISABLED) ++ * 15 resource instances configured (1 DISABLED) + + Node List: + * Online: [ cluster01 cluster02 ] +@@ -3742,6 +3756,7 @@ Node List: + Active Resources: + * Resource Group: partially-active-group (1 member inactive): + * dummy-1 (ocf:pacemaker:Dummy): Started cluster02 ++ * dummy-2 (ocf:pacemaker:Dummy): FAILED cluster02 + =#=#=#= End test: Text output of partially active group - OK (0) =#=#=#= + * Passed: crm_mon - Text output of partially active group + =#=#=#= Begin test: Text output of partially active group, with inactive resources =#=#=#= +@@ -3751,7 +3766,7 @@ Cluster Summary: + * Last updated: + * Last change: + * 4 nodes configured +- * 14 resource instances configured (1 DISABLED) ++ * 15 resource instances configured (1 DISABLED) + + Node List: + * Online: [ cluster01 cluster02 ] +@@ -3760,7 +3775,8 @@ Node List: + Full List of Resources: + * Resource Group: partially-active-group: + * dummy-1 (ocf:pacemaker:Dummy): Started cluster02 +- * dummy-2 (ocf:pacemaker:Dummy): Stopped (disabled) ++ * dummy-2 (ocf:pacemaker:Dummy): FAILED cluster02 ++ * dummy-3 (ocf:pacemaker:Dummy): Stopped (disabled) + =#=#=#= End test: Text output of partially active group, with inactive resources - OK (0) =#=#=#= + * Passed: crm_mon - Text output of partially active group, with inactive resources + =#=#=#= Begin test: Text output of active member of partially active group =#=#=#= +@@ -3770,7 +3786,7 @@ Cluster Summary: + * Last updated: + * Last change: + * 4 nodes configured +- * 14 resource 
instances configured (1 DISABLED) ++ * 15 resource instances configured (1 DISABLED) + + Node List: + * Online: [ cluster01 cluster02 ] +@@ -3788,7 +3804,7 @@ Cluster Summary: + * Last updated: + * Last change: + * 4 nodes configured +- * 14 resource instances configured (1 DISABLED) ++ * 15 resource instances configured (1 DISABLED) + + Node List: + * Online: [ cluster01 cluster02 ] +@@ -3796,7 +3812,10 @@ Node List: + + Active Resources: + * Resource Group: partially-active-group (1 member inactive): +- * dummy-2 (ocf:pacemaker:Dummy): Stopped (disabled) ++ * dummy-2 (ocf:pacemaker:Dummy): FAILED cluster02 ++ ++Failed Resource Actions: ++ * dummy-2 probe on cluster02 returned 'unimplemented feature' at Wed Sep 2 12:17:38 2020 after 33ms + =#=#=#= End test: Text output of inactive member of partially active group - OK (0) =#=#=#= + * Passed: crm_mon - Text output of inactive member of partially active group + =#=#=#= Begin test: Complete brief text output grouped by node, with inactive resources =#=#=#= +@@ -3806,7 +3825,7 @@ Cluster Summary: + * Last updated: + * Last change: + * 4 nodes configured +- * 14 resource instances configured (1 DISABLED) ++ * 15 resource instances configured (1 DISABLED) + + Node List: + * Node cluster01: online: +@@ -3820,7 +3839,7 @@ Node List: + * Resources: + * 1 (ocf:heartbeat:IPaddr2): Active + * 1 (ocf:heartbeat:docker): Active +- * 1 (ocf:pacemaker:Dummy): Active ++ * 2 (ocf:pacemaker:Dummy): Active + * 1 (ocf:pacemaker:remote): Active + * GuestNode httpd-bundle-0@cluster02: online: + * Resources: +@@ -3834,7 +3853,7 @@ Inactive Resources: + * httpd-bundle-0 (192.168.122.131) (ocf:heartbeat:apache): Started cluster02 + * httpd-bundle-1 (192.168.122.132) (ocf:heartbeat:apache): Stopped cluster01 + * Resource Group: partially-active-group: +- * 1/2 (ocf:pacemaker:Dummy): Active cluster02 ++ * 2/3 (ocf:pacemaker:Dummy): Active cluster02 + * smart-mon (ocf:pacemaker:HealthSMART): Stopped + + Node Attributes: +@@ -3856,6 +3875,8 @@ 
Operations: + * (3) monitor: interval="30000ms" + * dummy-1: migration-threshold=1000000: + * (2) start ++ * dummy-2: migration-threshold=1000000: ++ * (2) probe + * smart-mon: migration-threshold=1000000: + * (9) probe + * ping: migration-threshold=1000000: +@@ -3881,6 +3902,7 @@ Operations: + * (1) start + + Failed Resource Actions: ++ * dummy-2 probe on cluster02 returned 'unimplemented feature' at Wed Sep 2 12:17:38 2020 after 33ms + * smart-mon probe on cluster02 returned 'not installed' at Tue Nov 9 15:38:55 2021 after 33ms + * ping probe on cluster02 returned 'not installed' at Thu Nov 18 13:11:42 2021 + =#=#=#= End test: Complete brief text output grouped by node, with inactive resources - OK (0) =#=#=#= +@@ -3892,7 +3914,7 @@ Cluster Summary: + * Last updated: + * Last change: + * 4 nodes configured +- * 14 resource instances configured (1 DISABLED) ++ * 15 resource instances configured (1 DISABLED) + + Node List: + * Online: [ cluster01 ] +@@ -3914,7 +3936,7 @@ Full List of Resources: + + + +- ++ + + + +-- +2.27.0 + + +From 85e76b8bdb4de261a9cb4858eeedd49fba0346a1 Mon Sep 17 00:00:00 2001 +From: Chris Lumens +Date: Fri, 19 Nov 2021 15:55:51 -0500 +Subject: [PATCH 09/21] Test: cts-cli: Add a failed probe on a new dummy-4 + resource. + +This is to verify that these resources which are part of a group are +displayed properly. No code changes will be necessary, since groups are +just several other resources all in the same pile. + +There are no code changes yet to properly handle displaying these +operations, so the results here just reflect the current handling. 
+--- + cts/cli/crm_mon-partial.xml | 4 +++ + cts/cli/regression.crm_mon.exp | 51 ++++++++++++++++++++++------------ + 2 files changed, 37 insertions(+), 18 deletions(-) + +diff --git a/cts/cli/crm_mon-partial.xml b/cts/cli/crm_mon-partial.xml +index 1ce80ea58a..d4d4a70848 100644 +--- a/cts/cli/crm_mon-partial.xml ++++ b/cts/cli/crm_mon-partial.xml +@@ -60,6 +60,7 @@ + + + ++ + + + +@@ -108,6 +109,9 @@ + + + ++ ++ ++ + + + +diff --git a/cts/cli/regression.crm_mon.exp b/cts/cli/regression.crm_mon.exp +index 8cf3a1215e..c524b199e3 100644 +--- a/cts/cli/regression.crm_mon.exp ++++ b/cts/cli/regression.crm_mon.exp +@@ -3470,7 +3470,7 @@ Cluster Summary: + * Last updated: + * Last change: + * 4 nodes configured +- * 15 resource instances configured (1 DISABLED) ++ * 16 resource instances configured (1 DISABLED) + + Node List: + * Online: [ cluster01 cluster02 ] +@@ -3483,12 +3483,13 @@ Active Resources: + * Container bundle set: httpd-bundle [pcmk:http]: + * httpd-bundle-0 (192.168.122.131) (ocf:heartbeat:apache): Started cluster02 + * httpd-bundle-1 (192.168.122.132) (ocf:heartbeat:apache): Stopped cluster01 +- * Resource Group: partially-active-group (1 member inactive): ++ * Resource Group: partially-active-group (2 members inactive): + * dummy-1 (ocf:pacemaker:Dummy): Started cluster02 + * dummy-2 (ocf:pacemaker:Dummy): FAILED cluster02 + + Failed Resource Actions: + * dummy-2 probe on cluster02 returned 'unimplemented feature' at Wed Sep 2 12:17:38 2020 after 33ms ++ * dummy-4 probe on cluster02 returned 'not installed' at Wed Sep 2 12:17:38 2020 + * smart-mon probe on cluster02 returned 'not installed' at Tue Nov 9 15:38:55 2021 after 33ms + * ping probe on cluster02 returned 'not installed' at Thu Nov 18 13:11:42 2021 + =#=#=#= End test: Text output of partially active resources - OK (0) =#=#=#= +@@ -3501,7 +3502,7 @@ Failed Resource Actions: + + + +- ++ + + + +@@ -3548,7 +3549,7 @@ Failed Resource Actions: + + + +- ++ + + + +@@ -3556,6 +3557,7 @@ Failed Resource 
Actions: + + + ++ + + + +@@ -3587,6 +3589,9 @@ Failed Resource Actions: + + + ++ ++ ++ + + + +@@ -3624,6 +3629,7 @@ Failed Resource Actions: + + + ++ + + + +@@ -3638,7 +3644,7 @@ Cluster Summary: + * Last updated: + * Last change: + * 4 nodes configured +- * 15 resource instances configured (1 DISABLED) ++ * 16 resource instances configured (1 DISABLED) + + Node List: + * Online: [ cluster01 cluster02 ] +@@ -3656,10 +3662,12 @@ Full List of Resources: + * dummy-1 (ocf:pacemaker:Dummy): Started cluster02 + * dummy-2 (ocf:pacemaker:Dummy): FAILED cluster02 + * dummy-3 (ocf:pacemaker:Dummy): Stopped (disabled) ++ * dummy-4 (ocf:pacemaker:Dummy): Stopped + * smart-mon (ocf:pacemaker:HealthSMART): Stopped + + Failed Resource Actions: + * dummy-2 probe on cluster02 returned 'unimplemented feature' at Wed Sep 2 12:17:38 2020 after 33ms ++ * dummy-4 probe on cluster02 returned 'not installed' at Wed Sep 2 12:17:38 2020 + * smart-mon probe on cluster02 returned 'not installed' at Tue Nov 9 15:38:55 2021 after 33ms + * ping probe on cluster02 returned 'not installed' at Thu Nov 18 13:11:42 2021 + =#=#=#= End test: Text output of partially active resources, with inactive resources - OK (0) =#=#=#= +@@ -3671,7 +3679,7 @@ Cluster Summary: + * Last updated: + * Last change: + * 4 nodes configured +- * 15 resource instances configured (1 DISABLED) ++ * 16 resource instances configured (1 DISABLED) + + Node List: + * Online: [ cluster01 cluster02 ] +@@ -3687,7 +3695,7 @@ Full List of Resources: + * httpd-bundle-0 (192.168.122.131) (ocf:heartbeat:apache): Started cluster02 + * httpd-bundle-1 (192.168.122.132) (ocf:heartbeat:apache): Stopped cluster01 + * Resource Group: partially-active-group: +- * 2/3 (ocf:pacemaker:Dummy): Active cluster02 ++ * 2/4 (ocf:pacemaker:Dummy): Active cluster02 + + Node Attributes: + * Node: cluster01: +@@ -3710,6 +3718,8 @@ Operations: + * (2) start + * dummy-2: migration-threshold=1000000: + * (2) probe ++ * dummy-4: migration-threshold=1000000: ++ * 
(2) probe + * smart-mon: migration-threshold=1000000: + * (9) probe + * ping: migration-threshold=1000000: +@@ -3736,6 +3746,7 @@ Operations: + + Failed Resource Actions: + * dummy-2 probe on cluster02 returned 'unimplemented feature' at Wed Sep 2 12:17:38 2020 after 33ms ++ * dummy-4 probe on cluster02 returned 'not installed' at Wed Sep 2 12:17:38 2020 + * smart-mon probe on cluster02 returned 'not installed' at Tue Nov 9 15:38:55 2021 after 33ms + * ping probe on cluster02 returned 'not installed' at Thu Nov 18 13:11:42 2021 + =#=#=#= End test: Complete brief text output, with inactive resources - OK (0) =#=#=#= +@@ -3747,14 +3758,14 @@ Cluster Summary: + * Last updated: + * Last change: + * 4 nodes configured +- * 15 resource instances configured (1 DISABLED) ++ * 16 resource instances configured (1 DISABLED) + + Node List: + * Online: [ cluster01 cluster02 ] + * GuestOnline: [ httpd-bundle-0@cluster02 httpd-bundle-1@cluster01 ] + + Active Resources: +- * Resource Group: partially-active-group (1 member inactive): ++ * Resource Group: partially-active-group (2 members inactive): + * dummy-1 (ocf:pacemaker:Dummy): Started cluster02 + * dummy-2 (ocf:pacemaker:Dummy): FAILED cluster02 + =#=#=#= End test: Text output of partially active group - OK (0) =#=#=#= +@@ -3766,7 +3777,7 @@ Cluster Summary: + * Last updated: + * Last change: + * 4 nodes configured +- * 15 resource instances configured (1 DISABLED) ++ * 16 resource instances configured (1 DISABLED) + + Node List: + * Online: [ cluster01 cluster02 ] +@@ -3777,6 +3788,7 @@ Full List of Resources: + * dummy-1 (ocf:pacemaker:Dummy): Started cluster02 + * dummy-2 (ocf:pacemaker:Dummy): FAILED cluster02 + * dummy-3 (ocf:pacemaker:Dummy): Stopped (disabled) ++ * dummy-4 (ocf:pacemaker:Dummy): Stopped + =#=#=#= End test: Text output of partially active group, with inactive resources - OK (0) =#=#=#= + * Passed: crm_mon - Text output of partially active group, with inactive resources + =#=#=#= Begin test: Text output 
of active member of partially active group =#=#=#= +@@ -3786,14 +3798,14 @@ Cluster Summary: + * Last updated: + * Last change: + * 4 nodes configured +- * 15 resource instances configured (1 DISABLED) ++ * 16 resource instances configured (1 DISABLED) + + Node List: + * Online: [ cluster01 cluster02 ] + * GuestOnline: [ httpd-bundle-0@cluster02 httpd-bundle-1@cluster01 ] + + Active Resources: +- * Resource Group: partially-active-group (1 member inactive): ++ * Resource Group: partially-active-group (2 members inactive): + * dummy-1 (ocf:pacemaker:Dummy): Started cluster02 + =#=#=#= End test: Text output of active member of partially active group - OK (0) =#=#=#= + * Passed: crm_mon - Text output of active member of partially active group +@@ -3804,14 +3816,14 @@ Cluster Summary: + * Last updated: + * Last change: + * 4 nodes configured +- * 15 resource instances configured (1 DISABLED) ++ * 16 resource instances configured (1 DISABLED) + + Node List: + * Online: [ cluster01 cluster02 ] + * GuestOnline: [ httpd-bundle-0@cluster02 httpd-bundle-1@cluster01 ] + + Active Resources: +- * Resource Group: partially-active-group (1 member inactive): ++ * Resource Group: partially-active-group (2 members inactive): + * dummy-2 (ocf:pacemaker:Dummy): FAILED cluster02 + + Failed Resource Actions: +@@ -3825,7 +3837,7 @@ Cluster Summary: + * Last updated: + * Last change: + * 4 nodes configured +- * 15 resource instances configured (1 DISABLED) ++ * 16 resource instances configured (1 DISABLED) + + Node List: + * Node cluster01: online: +@@ -3853,7 +3865,7 @@ Inactive Resources: + * httpd-bundle-0 (192.168.122.131) (ocf:heartbeat:apache): Started cluster02 + * httpd-bundle-1 (192.168.122.132) (ocf:heartbeat:apache): Stopped cluster01 + * Resource Group: partially-active-group: +- * 2/3 (ocf:pacemaker:Dummy): Active cluster02 ++ * 2/4 (ocf:pacemaker:Dummy): Active cluster02 + * smart-mon (ocf:pacemaker:HealthSMART): Stopped + + Node Attributes: +@@ -3877,6 +3889,8 @@ 
Operations: + * (2) start + * dummy-2: migration-threshold=1000000: + * (2) probe ++ * dummy-4: migration-threshold=1000000: ++ * (2) probe + * smart-mon: migration-threshold=1000000: + * (9) probe + * ping: migration-threshold=1000000: +@@ -3903,6 +3917,7 @@ Operations: + + Failed Resource Actions: + * dummy-2 probe on cluster02 returned 'unimplemented feature' at Wed Sep 2 12:17:38 2020 after 33ms ++ * dummy-4 probe on cluster02 returned 'not installed' at Wed Sep 2 12:17:38 2020 + * smart-mon probe on cluster02 returned 'not installed' at Tue Nov 9 15:38:55 2021 after 33ms + * ping probe on cluster02 returned 'not installed' at Thu Nov 18 13:11:42 2021 + =#=#=#= End test: Complete brief text output grouped by node, with inactive resources - OK (0) =#=#=#= +@@ -3914,7 +3929,7 @@ Cluster Summary: + * Last updated: + * Last change: + * 4 nodes configured +- * 15 resource instances configured (1 DISABLED) ++ * 16 resource instances configured (1 DISABLED) + + Node List: + * Online: [ cluster01 ] +@@ -3936,7 +3951,7 @@ Full List of Resources: + + + +- ++ + + + +-- +2.27.0 + + +From 206d733b6ce8e0ffcad243d282e8baa8c3ff72b4 Mon Sep 17 00:00:00 2001 +From: Chris Lumens +Date: Tue, 23 Nov 2021 14:33:47 -0500 +Subject: [PATCH 10/21] Test: cts-cli: Add test output for a bundle resource + with a failed probe op. + +This just changes the existing failed bundle resource from not starting +to failing with a reason. + +There are no code changes yet to properly handle displaying these +operations, so the results here just reflect the current handling. 
+--- + cts/cli/crm_mon-partial.xml | 9 ++++++++ + cts/cli/regression.crm_mon.exp | 40 +++++++++++++++++++++++++--------- + 2 files changed, 39 insertions(+), 10 deletions(-) + +diff --git a/cts/cli/crm_mon-partial.xml b/cts/cli/crm_mon-partial.xml +index d4d4a70848..5981fc653c 100644 +--- a/cts/cli/crm_mon-partial.xml ++++ b/cts/cli/crm_mon-partial.xml +@@ -178,5 +178,14 @@ + + + ++ ++ ++ ++ ++ ++ ++ ++ ++ + + +diff --git a/cts/cli/regression.crm_mon.exp b/cts/cli/regression.crm_mon.exp +index c524b199e3..b690a26fb6 100644 +--- a/cts/cli/regression.crm_mon.exp ++++ b/cts/cli/regression.crm_mon.exp +@@ -3482,7 +3482,7 @@ Active Resources: + * Fencing (stonith:fence_xvm): Started cluster01 + * Container bundle set: httpd-bundle [pcmk:http]: + * httpd-bundle-0 (192.168.122.131) (ocf:heartbeat:apache): Started cluster02 +- * httpd-bundle-1 (192.168.122.132) (ocf:heartbeat:apache): Stopped cluster01 ++ * httpd-bundle-1 (192.168.122.132) (ocf:heartbeat:apache): FAILED cluster01 + * Resource Group: partially-active-group (2 members inactive): + * dummy-1 (ocf:pacemaker:Dummy): Started cluster02 + * dummy-2 (ocf:pacemaker:Dummy): FAILED cluster02 +@@ -3492,6 +3492,7 @@ Failed Resource Actions: + * dummy-4 probe on cluster02 returned 'not installed' at Wed Sep 2 12:17:38 2020 + * smart-mon probe on cluster02 returned 'not installed' at Tue Nov 9 15:38:55 2021 after 33ms + * ping probe on cluster02 returned 'not installed' at Thu Nov 18 13:11:42 2021 ++ * httpd probe on httpd-bundle-1 returned 'invalid parameter' at Wed May 27 15:43:09 2020 + =#=#=#= End test: Text output of partially active resources - OK (0) =#=#=#= + * Passed: crm_mon - Text output of partially active resources + =#=#=#= Begin test: XML output of partially active resources =#=#=#= +@@ -3509,7 +3510,7 @@ Failed Resource Actions: + + + +- ++ + + + +@@ -3540,7 +3541,9 @@ Failed Resource Actions: + + + +- ++ ++ ++ + + + +@@ -3626,12 +3629,18 @@ Failed Resource Actions: + + + ++ ++ ++ ++ ++ + + + + + + ++ + + 
+ +@@ -3657,7 +3666,7 @@ Full List of Resources: + * Fencing (stonith:fence_xvm): Started cluster01 + * Container bundle set: httpd-bundle [pcmk:http]: + * httpd-bundle-0 (192.168.122.131) (ocf:heartbeat:apache): Started cluster02 +- * httpd-bundle-1 (192.168.122.132) (ocf:heartbeat:apache): Stopped cluster01 ++ * httpd-bundle-1 (192.168.122.132) (ocf:heartbeat:apache): FAILED cluster01 + * Resource Group: partially-active-group: + * dummy-1 (ocf:pacemaker:Dummy): Started cluster02 + * dummy-2 (ocf:pacemaker:Dummy): FAILED cluster02 +@@ -3670,6 +3679,7 @@ Failed Resource Actions: + * dummy-4 probe on cluster02 returned 'not installed' at Wed Sep 2 12:17:38 2020 + * smart-mon probe on cluster02 returned 'not installed' at Tue Nov 9 15:38:55 2021 after 33ms + * ping probe on cluster02 returned 'not installed' at Thu Nov 18 13:11:42 2021 ++ * httpd probe on httpd-bundle-1 returned 'invalid parameter' at Wed May 27 15:43:09 2020 + =#=#=#= End test: Text output of partially active resources, with inactive resources - OK (0) =#=#=#= + * Passed: crm_mon - Text output of partially active resources, with inactive resources + =#=#=#= Begin test: Complete brief text output, with inactive resources =#=#=#= +@@ -3693,7 +3703,7 @@ Full List of Resources: + * Stopped: [ cluster02 ] + * Container bundle set: httpd-bundle [pcmk:http]: + * httpd-bundle-0 (192.168.122.131) (ocf:heartbeat:apache): Started cluster02 +- * httpd-bundle-1 (192.168.122.132) (ocf:heartbeat:apache): Stopped cluster01 ++ * httpd-bundle-1 (192.168.122.132) (ocf:heartbeat:apache): FAILED cluster01 + * Resource Group: partially-active-group: + * 2/4 (ocf:pacemaker:Dummy): Active cluster02 + +@@ -3743,12 +3753,16 @@ Operations: + * Node: httpd-bundle-0@cluster02: + * httpd: migration-threshold=1000000: + * (1) start ++ * Node: httpd-bundle-1@cluster01: ++ * httpd: migration-threshold=1000000: ++ * (1) probe + + Failed Resource Actions: + * dummy-2 probe on cluster02 returned 'unimplemented feature' at Wed Sep 2 
12:17:38 2020 after 33ms + * dummy-4 probe on cluster02 returned 'not installed' at Wed Sep 2 12:17:38 2020 + * smart-mon probe on cluster02 returned 'not installed' at Tue Nov 9 15:38:55 2021 after 33ms + * ping probe on cluster02 returned 'not installed' at Thu Nov 18 13:11:42 2021 ++ * httpd probe on httpd-bundle-1 returned 'invalid parameter' at Wed May 27 15:43:09 2020 + =#=#=#= End test: Complete brief text output, with inactive resources - OK (0) =#=#=#= + * Passed: crm_mon - Complete brief text output, with inactive resources + =#=#=#= Begin test: Text output of partially active group =#=#=#= +@@ -3856,14 +3870,14 @@ Node List: + * GuestNode httpd-bundle-0@cluster02: online: + * Resources: + * 1 (ocf:heartbeat:apache): Active ++ * GuestNode httpd-bundle-1@cluster01: online: ++ * Resources: ++ * 1 (ocf:heartbeat:apache): Active + + Inactive Resources: + * Clone Set: ping-clone [ping]: + * Started: [ cluster01 ] + * Stopped: [ cluster02 ] +- * Container bundle set: httpd-bundle [pcmk:http]: +- * httpd-bundle-0 (192.168.122.131) (ocf:heartbeat:apache): Started cluster02 +- * httpd-bundle-1 (192.168.122.132) (ocf:heartbeat:apache): Stopped cluster01 + * Resource Group: partially-active-group: + * 2/4 (ocf:pacemaker:Dummy): Active cluster02 + * smart-mon (ocf:pacemaker:HealthSMART): Stopped +@@ -3914,12 +3928,16 @@ Operations: + * Node: httpd-bundle-0@cluster02: + * httpd: migration-threshold=1000000: + * (1) start ++ * Node: httpd-bundle-1@cluster01: ++ * httpd: migration-threshold=1000000: ++ * (1) probe + + Failed Resource Actions: + * dummy-2 probe on cluster02 returned 'unimplemented feature' at Wed Sep 2 12:17:38 2020 after 33ms + * dummy-4 probe on cluster02 returned 'not installed' at Wed Sep 2 12:17:38 2020 + * smart-mon probe on cluster02 returned 'not installed' at Tue Nov 9 15:38:55 2021 after 33ms + * ping probe on cluster02 returned 'not installed' at Thu Nov 18 13:11:42 2021 ++ * httpd probe on httpd-bundle-1 returned 'invalid parameter' at Wed 
May 27 15:43:09 2020 + =#=#=#= End test: Complete brief text output grouped by node, with inactive resources - OK (0) =#=#=#= + * Passed: crm_mon - Complete brief text output grouped by node, with inactive resources + =#=#=#= Begin test: Text output of partially active resources, with inactive resources, filtered by node =#=#=#= +@@ -3939,7 +3957,7 @@ Full List of Resources: + * Started: [ cluster01 ] + * Fencing (stonith:fence_xvm): Started cluster01 + * Container bundle set: httpd-bundle [pcmk:http]: +- * httpd-bundle-1 (192.168.122.132) (ocf:heartbeat:apache): Stopped cluster01 ++ * httpd-bundle-1 (192.168.122.132) (ocf:heartbeat:apache): FAILED cluster01 + * smart-mon (ocf:pacemaker:HealthSMART): Stopped + =#=#=#= End test: Text output of partially active resources, with inactive resources, filtered by node - OK (0) =#=#=#= + * Passed: crm_mon - Text output of partially active resources, with inactive resources, filtered by node +@@ -3972,7 +3990,9 @@ Full List of Resources: + + + +- ++ ++ ++ + + + +-- +2.27.0 + + +From 6240a28d36c0349e3b1d7f52c36106580c53bb01 Mon Sep 17 00:00:00 2001 +From: Chris Lumens +Date: Mon, 22 Nov 2021 10:59:10 -0500 +Subject: [PATCH 11/21] Test: cts: Add --show-detail to a couple of the crm_mon + tests. + +This straightens out a couple differences in output between running +tests locally (where --enable-compat-2.0 is not given, which would +automatically add --show-detail) and running tests under mock (where +that option is given). + +Note that this only really matters for failed resource actions, which +were not previously output as part of any crm_mon regression test. It +is only the patches in this series that have introduced those, and thus +this difference. 
+--- + cts/cli/regression.crm_mon.exp | 131 ++++++++++++++++++++------------- + cts/cts-cli.in | 10 +-- + 2 files changed, 83 insertions(+), 58 deletions(-) + +diff --git a/cts/cli/regression.crm_mon.exp b/cts/cli/regression.crm_mon.exp +index b690a26fb6..d7b9d98e2c 100644 +--- a/cts/cli/regression.crm_mon.exp ++++ b/cts/cli/regression.crm_mon.exp +@@ -3466,33 +3466,42 @@ Operations: + =#=#=#= Begin test: Text output of partially active resources =#=#=#= + Cluster Summary: + * Stack: corosync +- * Current DC: cluster02 (version) - partition with quorum ++ * Current DC: cluster02 (2) (version) - partition with quorum + * Last updated: + * Last change: + * 4 nodes configured + * 16 resource instances configured (1 DISABLED) + + Node List: +- * Online: [ cluster01 cluster02 ] ++ * Online: [ cluster01 (1) cluster02 (2) ] + * GuestOnline: [ httpd-bundle-0@cluster02 httpd-bundle-1@cluster01 ] + + Active Resources: + * Clone Set: ping-clone [ping]: +- * Started: [ cluster01 ] ++ * ping (ocf:pacemaker:ping): Started cluster01 ++ * ping (ocf:pacemaker:ping): Stopped + * Fencing (stonith:fence_xvm): Started cluster01 + * Container bundle set: httpd-bundle [pcmk:http]: +- * httpd-bundle-0 (192.168.122.131) (ocf:heartbeat:apache): Started cluster02 +- * httpd-bundle-1 (192.168.122.132) (ocf:heartbeat:apache): FAILED cluster01 ++ * Replica[0] ++ * httpd-bundle-ip-192.168.122.131 (ocf:heartbeat:IPaddr2): Started cluster02 ++ * httpd (ocf:heartbeat:apache): Started httpd-bundle-0 ++ * httpd-bundle-docker-0 (ocf:heartbeat:docker): Started cluster02 ++ * httpd-bundle-0 (ocf:pacemaker:remote): Started cluster02 ++ * Replica[1] ++ * httpd-bundle-ip-192.168.122.132 (ocf:heartbeat:IPaddr2): Started cluster01 ++ * httpd (ocf:heartbeat:apache): FAILED httpd-bundle-1 ++ * httpd-bundle-docker-1 (ocf:heartbeat:docker): Started cluster01 ++ * httpd-bundle-1 (ocf:pacemaker:remote): Started cluster01 + * Resource Group: partially-active-group (2 members inactive): + * dummy-1 
(ocf:pacemaker:Dummy): Started cluster02 + * dummy-2 (ocf:pacemaker:Dummy): FAILED cluster02 + + Failed Resource Actions: +- * dummy-2 probe on cluster02 returned 'unimplemented feature' at Wed Sep 2 12:17:38 2020 after 33ms +- * dummy-4 probe on cluster02 returned 'not installed' at Wed Sep 2 12:17:38 2020 +- * smart-mon probe on cluster02 returned 'not installed' at Tue Nov 9 15:38:55 2021 after 33ms +- * ping probe on cluster02 returned 'not installed' at Thu Nov 18 13:11:42 2021 +- * httpd probe on httpd-bundle-1 returned 'invalid parameter' at Wed May 27 15:43:09 2020 ++ * dummy-2_monitor_0 on cluster02 'unimplemented feature' (3): call=2, status='complete', last-rc-change='Wed Sep 2 12:17:38 2020', queued=0ms, exec=33ms ++ * dummy-4_monitor_0 on cluster02 'not installed' (5): call=2, status='complete', last-rc-change='Wed Sep 2 12:17:38 2020', queued=0ms, exec=0ms ++ * smart-mon_monitor_0 on cluster02 'not installed' (5): call=9, status='complete', last-rc-change='Tue Nov 9 15:38:55 2021', queued=0ms, exec=33ms ++ * ping_monitor_0 on cluster02 'not installed' (5): call=6, status='complete', last-rc-change='Thu Nov 18 13:11:42 2021', queued=0ms, exec=0ms ++ * httpd_monitor_0 on httpd-bundle-1 'invalid parameter' (2): call=1, status='complete', last-rc-change='Wed May 27 15:43:09 2020', queued=0ms, exec=0ms + =#=#=#= End test: Text output of partially active resources - OK (0) =#=#=#= + * Passed: crm_mon - Text output of partially active resources + =#=#=#= Begin test: XML output of partially active resources =#=#=#= +@@ -3649,24 +3658,32 @@ Failed Resource Actions: + =#=#=#= Begin test: Text output of partially active resources, with inactive resources =#=#=#= + Cluster Summary: + * Stack: corosync +- * Current DC: cluster02 (version) - partition with quorum ++ * Current DC: cluster02 (2) (version) - partition with quorum + * Last updated: + * Last change: + * 4 nodes configured + * 16 resource instances configured (1 DISABLED) + + Node List: +- * Online: [ 
cluster01 cluster02 ] ++ * Online: [ cluster01 (1) cluster02 (2) ] + * GuestOnline: [ httpd-bundle-0@cluster02 httpd-bundle-1@cluster01 ] + + Full List of Resources: + * Clone Set: ping-clone [ping]: +- * Started: [ cluster01 ] +- * Stopped: [ cluster02 ] ++ * ping (ocf:pacemaker:ping): Started cluster01 ++ * ping (ocf:pacemaker:ping): Stopped + * Fencing (stonith:fence_xvm): Started cluster01 + * Container bundle set: httpd-bundle [pcmk:http]: +- * httpd-bundle-0 (192.168.122.131) (ocf:heartbeat:apache): Started cluster02 +- * httpd-bundle-1 (192.168.122.132) (ocf:heartbeat:apache): FAILED cluster01 ++ * Replica[0] ++ * httpd-bundle-ip-192.168.122.131 (ocf:heartbeat:IPaddr2): Started cluster02 ++ * httpd (ocf:heartbeat:apache): Started httpd-bundle-0 ++ * httpd-bundle-docker-0 (ocf:heartbeat:docker): Started cluster02 ++ * httpd-bundle-0 (ocf:pacemaker:remote): Started cluster02 ++ * Replica[1] ++ * httpd-bundle-ip-192.168.122.132 (ocf:heartbeat:IPaddr2): Started cluster01 ++ * httpd (ocf:heartbeat:apache): FAILED httpd-bundle-1 ++ * httpd-bundle-docker-1 (ocf:heartbeat:docker): Started cluster01 ++ * httpd-bundle-1 (ocf:pacemaker:remote): Started cluster01 + * Resource Group: partially-active-group: + * dummy-1 (ocf:pacemaker:Dummy): Started cluster02 + * dummy-2 (ocf:pacemaker:Dummy): FAILED cluster02 +@@ -3675,46 +3692,54 @@ Full List of Resources: + * smart-mon (ocf:pacemaker:HealthSMART): Stopped + + Failed Resource Actions: +- * dummy-2 probe on cluster02 returned 'unimplemented feature' at Wed Sep 2 12:17:38 2020 after 33ms +- * dummy-4 probe on cluster02 returned 'not installed' at Wed Sep 2 12:17:38 2020 +- * smart-mon probe on cluster02 returned 'not installed' at Tue Nov 9 15:38:55 2021 after 33ms +- * ping probe on cluster02 returned 'not installed' at Thu Nov 18 13:11:42 2021 +- * httpd probe on httpd-bundle-1 returned 'invalid parameter' at Wed May 27 15:43:09 2020 ++ * dummy-2_monitor_0 on cluster02 'unimplemented feature' (3): call=2, 
status='complete', last-rc-change='Wed Sep 2 12:17:38 2020', queued=0ms, exec=33ms ++ * dummy-4_monitor_0 on cluster02 'not installed' (5): call=2, status='complete', last-rc-change='Wed Sep 2 12:17:38 2020', queued=0ms, exec=0ms ++ * smart-mon_monitor_0 on cluster02 'not installed' (5): call=9, status='complete', last-rc-change='Tue Nov 9 15:38:55 2021', queued=0ms, exec=33ms ++ * ping_monitor_0 on cluster02 'not installed' (5): call=6, status='complete', last-rc-change='Thu Nov 18 13:11:42 2021', queued=0ms, exec=0ms ++ * httpd_monitor_0 on httpd-bundle-1 'invalid parameter' (2): call=1, status='complete', last-rc-change='Wed May 27 15:43:09 2020', queued=0ms, exec=0ms + =#=#=#= End test: Text output of partially active resources, with inactive resources - OK (0) =#=#=#= + * Passed: crm_mon - Text output of partially active resources, with inactive resources + =#=#=#= Begin test: Complete brief text output, with inactive resources =#=#=#= + Cluster Summary: + * Stack: corosync +- * Current DC: cluster02 (version) - partition with quorum ++ * Current DC: cluster02 (2) (version) - partition with quorum + * Last updated: + * Last change: + * 4 nodes configured + * 16 resource instances configured (1 DISABLED) + + Node List: +- * Online: [ cluster01 cluster02 ] ++ * Online: [ cluster01 (1) cluster02 (2) ] + * GuestOnline: [ httpd-bundle-0@cluster02 httpd-bundle-1@cluster01 ] + + Full List of Resources: + * 0/1 (ocf:pacemaker:HealthSMART): Active + * 1/1 (stonith:fence_xvm): Active cluster01 + * Clone Set: ping-clone [ping]: +- * Started: [ cluster01 ] +- * Stopped: [ cluster02 ] ++ * ping (ocf:pacemaker:ping): Started cluster01 ++ * ping (ocf:pacemaker:ping): Stopped + * Container bundle set: httpd-bundle [pcmk:http]: +- * httpd-bundle-0 (192.168.122.131) (ocf:heartbeat:apache): Started cluster02 +- * httpd-bundle-1 (192.168.122.132) (ocf:heartbeat:apache): FAILED cluster01 ++ * Replica[0] ++ * httpd-bundle-ip-192.168.122.131 (ocf:heartbeat:IPaddr2): Started 
cluster02 ++ * httpd (ocf:heartbeat:apache): Started httpd-bundle-0 ++ * httpd-bundle-docker-0 (ocf:heartbeat:docker): Started cluster02 ++ * httpd-bundle-0 (ocf:pacemaker:remote): Started cluster02 ++ * Replica[1] ++ * httpd-bundle-ip-192.168.122.132 (ocf:heartbeat:IPaddr2): Started cluster01 ++ * httpd (ocf:heartbeat:apache): FAILED httpd-bundle-1 ++ * httpd-bundle-docker-1 (ocf:heartbeat:docker): Started cluster01 ++ * httpd-bundle-1 (ocf:pacemaker:remote): Started cluster01 + * Resource Group: partially-active-group: + * 2/4 (ocf:pacemaker:Dummy): Active cluster02 + + Node Attributes: +- * Node: cluster01: ++ * Node: cluster01 (1): + * pingd : 1000 +- * Node: cluster02: ++ * Node: cluster02 (2): + * pingd : 1000 + + Operations: +- * Node: cluster02: ++ * Node: cluster02 (2): + * httpd-bundle-ip-192.168.122.131: migration-threshold=1000000: + * (2) start + * (3) monitor: interval="60000ms" +@@ -3734,7 +3759,7 @@ Operations: + * (9) probe + * ping: migration-threshold=1000000: + * (6) probe +- * Node: cluster01: ++ * Node: cluster01 (1): + * Fencing: migration-threshold=1000000: + * (15) start + * (20) monitor: interval="60000ms" +@@ -3758,11 +3783,11 @@ Operations: + * (1) probe + + Failed Resource Actions: +- * dummy-2 probe on cluster02 returned 'unimplemented feature' at Wed Sep 2 12:17:38 2020 after 33ms +- * dummy-4 probe on cluster02 returned 'not installed' at Wed Sep 2 12:17:38 2020 +- * smart-mon probe on cluster02 returned 'not installed' at Tue Nov 9 15:38:55 2021 after 33ms +- * ping probe on cluster02 returned 'not installed' at Thu Nov 18 13:11:42 2021 +- * httpd probe on httpd-bundle-1 returned 'invalid parameter' at Wed May 27 15:43:09 2020 ++ * dummy-2_monitor_0 on cluster02 'unimplemented feature' (3): call=2, status='complete', last-rc-change='Wed Sep 2 12:17:38 2020', queued=0ms, exec=33ms ++ * dummy-4_monitor_0 on cluster02 'not installed' (5): call=2, status='complete', last-rc-change='Wed Sep 2 12:17:38 2020', queued=0ms, exec=0ms ++ * 
smart-mon_monitor_0 on cluster02 'not installed' (5): call=9, status='complete', last-rc-change='Tue Nov 9 15:38:55 2021', queued=0ms, exec=33ms ++ * ping_monitor_0 on cluster02 'not installed' (5): call=6, status='complete', last-rc-change='Thu Nov 18 13:11:42 2021', queued=0ms, exec=0ms ++ * httpd_monitor_0 on httpd-bundle-1 'invalid parameter' (2): call=1, status='complete', last-rc-change='Wed May 27 15:43:09 2020', queued=0ms, exec=0ms + =#=#=#= End test: Complete brief text output, with inactive resources - OK (0) =#=#=#= + * Passed: crm_mon - Complete brief text output, with inactive resources + =#=#=#= Begin test: Text output of partially active group =#=#=#= +@@ -3826,14 +3851,14 @@ Active Resources: + =#=#=#= Begin test: Text output of inactive member of partially active group =#=#=#= + Cluster Summary: + * Stack: corosync +- * Current DC: cluster02 (version) - partition with quorum ++ * Current DC: cluster02 (2) (version) - partition with quorum + * Last updated: + * Last change: + * 4 nodes configured + * 16 resource instances configured (1 DISABLED) + + Node List: +- * Online: [ cluster01 cluster02 ] ++ * Online: [ cluster01 (1) cluster02 (2) ] + * GuestOnline: [ httpd-bundle-0@cluster02 httpd-bundle-1@cluster01 ] + + Active Resources: +@@ -3841,27 +3866,27 @@ Active Resources: + * dummy-2 (ocf:pacemaker:Dummy): FAILED cluster02 + + Failed Resource Actions: +- * dummy-2 probe on cluster02 returned 'unimplemented feature' at Wed Sep 2 12:17:38 2020 after 33ms ++ * dummy-2_monitor_0 on cluster02 'unimplemented feature' (3): call=2, status='complete', last-rc-change='Wed Sep 2 12:17:38 2020', queued=0ms, exec=33ms + =#=#=#= End test: Text output of inactive member of partially active group - OK (0) =#=#=#= + * Passed: crm_mon - Text output of inactive member of partially active group + =#=#=#= Begin test: Complete brief text output grouped by node, with inactive resources =#=#=#= + Cluster Summary: + * Stack: corosync +- * Current DC: cluster02 (version) 
- partition with quorum ++ * Current DC: cluster02 (2) (version) - partition with quorum + * Last updated: + * Last change: + * 4 nodes configured + * 16 resource instances configured (1 DISABLED) + + Node List: +- * Node cluster01: online: ++ * Node cluster01 (1): online: + * Resources: + * 1 (ocf:heartbeat:IPaddr2): Active + * 1 (ocf:heartbeat:docker): Active + * 1 (ocf:pacemaker:ping): Active + * 1 (ocf:pacemaker:remote): Active + * 1 (stonith:fence_xvm): Active +- * Node cluster02: online: ++ * Node cluster02 (2): online: + * Resources: + * 1 (ocf:heartbeat:IPaddr2): Active + * 1 (ocf:heartbeat:docker): Active +@@ -3876,20 +3901,20 @@ Node List: + + Inactive Resources: + * Clone Set: ping-clone [ping]: +- * Started: [ cluster01 ] +- * Stopped: [ cluster02 ] ++ * ping (ocf:pacemaker:ping): Started cluster01 ++ * ping (ocf:pacemaker:ping): Stopped + * Resource Group: partially-active-group: + * 2/4 (ocf:pacemaker:Dummy): Active cluster02 + * smart-mon (ocf:pacemaker:HealthSMART): Stopped + + Node Attributes: +- * Node: cluster01: ++ * Node: cluster01 (1): + * pingd : 1000 +- * Node: cluster02: ++ * Node: cluster02 (2): + * pingd : 1000 + + Operations: +- * Node: cluster02: ++ * Node: cluster02 (2): + * httpd-bundle-ip-192.168.122.131: migration-threshold=1000000: + * (2) start + * (3) monitor: interval="60000ms" +@@ -3909,7 +3934,7 @@ Operations: + * (9) probe + * ping: migration-threshold=1000000: + * (6) probe +- * Node: cluster01: ++ * Node: cluster01 (1): + * Fencing: migration-threshold=1000000: + * (15) start + * (20) monitor: interval="60000ms" +@@ -3933,11 +3958,11 @@ Operations: + * (1) probe + + Failed Resource Actions: +- * dummy-2 probe on cluster02 returned 'unimplemented feature' at Wed Sep 2 12:17:38 2020 after 33ms +- * dummy-4 probe on cluster02 returned 'not installed' at Wed Sep 2 12:17:38 2020 +- * smart-mon probe on cluster02 returned 'not installed' at Tue Nov 9 15:38:55 2021 after 33ms +- * ping probe on cluster02 returned 'not installed' 
at Thu Nov 18 13:11:42 2021 +- * httpd probe on httpd-bundle-1 returned 'invalid parameter' at Wed May 27 15:43:09 2020 ++ * dummy-2_monitor_0 on cluster02 'unimplemented feature' (3): call=2, status='complete', last-rc-change='Wed Sep 2 12:17:38 2020', queued=0ms, exec=33ms ++ * dummy-4_monitor_0 on cluster02 'not installed' (5): call=2, status='complete', last-rc-change='Wed Sep 2 12:17:38 2020', queued=0ms, exec=0ms ++ * smart-mon_monitor_0 on cluster02 'not installed' (5): call=9, status='complete', last-rc-change='Tue Nov 9 15:38:55 2021', queued=0ms, exec=33ms ++ * ping_monitor_0 on cluster02 'not installed' (5): call=6, status='complete', last-rc-change='Thu Nov 18 13:11:42 2021', queued=0ms, exec=0ms ++ * httpd_monitor_0 on httpd-bundle-1 'invalid parameter' (2): call=1, status='complete', last-rc-change='Wed May 27 15:43:09 2020', queued=0ms, exec=0ms + =#=#=#= End test: Complete brief text output grouped by node, with inactive resources - OK (0) =#=#=#= + * Passed: crm_mon - Complete brief text output grouped by node, with inactive resources + =#=#=#= Begin test: Text output of partially active resources, with inactive resources, filtered by node =#=#=#= +diff --git a/cts/cts-cli.in b/cts/cts-cli.in +index d32bfb7ed1..457816afab 100755 +--- a/cts/cts-cli.in ++++ b/cts/cts-cli.in +@@ -420,7 +420,7 @@ function test_crm_mon() { + export CIB_file="$test_home/cli/crm_mon-partial.xml" + + desc="Text output of partially active resources" +- cmd="crm_mon -1" ++ cmd="crm_mon -1 --show-detail" + test_assert $CRM_EX_OK 0 + + desc="XML output of partially active resources" +@@ -428,13 +428,13 @@ function test_crm_mon() { + test_assert_validate $CRM_EX_OK 0 + + desc="Text output of partially active resources, with inactive resources" +- cmd="crm_mon -1 -r" ++ cmd="crm_mon -1 -r --show-detail" + test_assert $CRM_EX_OK 0 + + # XML already includes inactive resources + + desc="Complete brief text output, with inactive resources" +- cmd="crm_mon -1 -r --include=all 
--brief" ++ cmd="crm_mon -1 -r --include=all --brief --show-detail" + test_assert $CRM_EX_OK 0 + + # XML does not have a brief output option +@@ -452,11 +452,11 @@ function test_crm_mon() { + test_assert $CRM_EX_OK 0 + + desc="Text output of inactive member of partially active group" +- cmd="crm_mon -1 --resource=dummy-2" ++ cmd="crm_mon -1 --resource=dummy-2 --show-detail" + test_assert $CRM_EX_OK 0 + + desc="Complete brief text output grouped by node, with inactive resources" +- cmd="crm_mon -1 -r --include=all --group-by-node --brief" ++ cmd="crm_mon -1 -r --include=all --group-by-node --brief --show-detail" + test_assert $CRM_EX_OK 0 + + desc="Text output of partially active resources, with inactive resources, filtered by node" +-- +2.27.0 + + +From da14053e5957d84ed0647688d37733adc2f988a3 Mon Sep 17 00:00:00 2001 +From: Chris Lumens +Date: Mon, 29 Nov 2021 15:05:42 -0500 +Subject: [PATCH 12/21] Test: scheduler: Add tests for failed probe operations. + +This adds identical sets of tests for primitive resources and cloned +resources. For the moment, the output reflects the current state of the +code. No changes have been made to properly handle these operations +yet. + +Each set has three resources, and each is set up with a slightly +different configuration of probe failures: + +(1) - Maskable probe failure on each node. +(2) - Maskable probe failure on one node, successful "not running" probe + on the other node. The resource should be started on the node + where "not running" was returned. +(3) - Maskable probe failure on one node, non-maskable probe failure on + the other node. The resource should not be running anywhere, and + should be stopped on the node with the non-maskable failure. 
+--- + cts/cts-scheduler.in | 2 + + cts/scheduler/dot/failed-probe-clone.dot | 30 ++++ + cts/scheduler/dot/failed-probe-primitive.dot | 4 + + cts/scheduler/exp/failed-probe-clone.exp | 141 ++++++++++++++++++ + cts/scheduler/exp/failed-probe-primitive.exp | 20 +++ + .../scores/failed-probe-clone.scores | 33 ++++ + .../scores/failed-probe-primitive.scores | 9 ++ + .../summary/failed-probe-clone.summary | 46 ++++++ + .../summary/failed-probe-primitive.summary | 27 ++++ + cts/scheduler/xml/failed-probe-clone.xml | 110 ++++++++++++++ + cts/scheduler/xml/failed-probe-primitive.xml | 71 +++++++++ + 11 files changed, 493 insertions(+) + create mode 100644 cts/scheduler/dot/failed-probe-clone.dot + create mode 100644 cts/scheduler/dot/failed-probe-primitive.dot + create mode 100644 cts/scheduler/exp/failed-probe-clone.exp + create mode 100644 cts/scheduler/exp/failed-probe-primitive.exp + create mode 100644 cts/scheduler/scores/failed-probe-clone.scores + create mode 100644 cts/scheduler/scores/failed-probe-primitive.scores + create mode 100644 cts/scheduler/summary/failed-probe-clone.summary + create mode 100644 cts/scheduler/summary/failed-probe-primitive.summary + create mode 100644 cts/scheduler/xml/failed-probe-clone.xml + create mode 100644 cts/scheduler/xml/failed-probe-primitive.xml + +diff --git a/cts/cts-scheduler.in b/cts/cts-scheduler.in +index 17fd6cefdf..3abcbc6c9d 100644 +--- a/cts/cts-scheduler.in ++++ b/cts/cts-scheduler.in +@@ -113,6 +113,8 @@ TESTS = [ + [ "probe-3", "Probe (pending node)" ], + [ "probe-4", "Probe (pending node + stopped resource)" ], + [ "probe-pending-node", "Probe (pending node + unmanaged resource)" ], ++ [ "failed-probe-primitive", "Maskable vs. unmaskable probe failures on primitive resources" ], ++ [ "failed-probe-clone", "Maskable vs. 
unmaskable probe failures on cloned resources" ], + [ "standby", "Standby" ], + [ "comments", "Comments" ], + ], +diff --git a/cts/scheduler/dot/failed-probe-clone.dot b/cts/scheduler/dot/failed-probe-clone.dot +new file mode 100644 +index 0000000000..90536b46ed +--- /dev/null ++++ b/cts/scheduler/dot/failed-probe-clone.dot +@@ -0,0 +1,30 @@ ++ digraph "g" { ++"ping-1_clear_failcount_0 cluster01" [ style=bold color="green" fontcolor="black"] ++"ping-1_clear_failcount_0 cluster02" [ style=bold color="green" fontcolor="black"] ++"ping-2-clone_running_0" [ style=bold color="green" fontcolor="orange"] ++"ping-2-clone_start_0" -> "ping-2-clone_running_0" [ style = bold] ++"ping-2-clone_start_0" -> "ping-2_start_0 cluster02" [ style = bold] ++"ping-2-clone_start_0" [ style=bold color="green" fontcolor="orange"] ++"ping-2_clear_failcount_0 cluster01" [ style=bold color="green" fontcolor="black"] ++"ping-2_clear_failcount_0 cluster02" [ style=bold color="green" fontcolor="black"] ++"ping-2_monitor_10000 cluster02" [ style=bold color="green" fontcolor="black"] ++"ping-2_start_0 cluster02" -> "ping-2-clone_running_0" [ style = bold] ++"ping-2_start_0 cluster02" -> "ping-2_monitor_10000 cluster02" [ style = bold] ++"ping-2_start_0 cluster02" [ style=bold color="green" fontcolor="black"] ++"ping-3-clone_running_0" [ style=dashed color="red" fontcolor="orange"] ++"ping-3-clone_start_0" -> "ping-3-clone_running_0" [ style = dashed] ++"ping-3-clone_start_0" -> "ping-3_start_0 " [ style = dashed] ++"ping-3-clone_start_0" [ style=dashed color="red" fontcolor="orange"] ++"ping-3-clone_stop_0" -> "ping-3-clone_stopped_0" [ style = bold] ++"ping-3-clone_stop_0" -> "ping-3_stop_0 cluster01" [ style = bold] ++"ping-3-clone_stop_0" [ style=bold color="green" fontcolor="orange"] ++"ping-3-clone_stopped_0" -> "ping-3-clone_start_0" [ style = dashed] ++"ping-3-clone_stopped_0" [ style=bold color="green" fontcolor="orange"] ++"ping-3_clear_failcount_0 cluster01" [ style=bold color="green" 
fontcolor="black"] ++"ping-3_clear_failcount_0 cluster02" [ style=bold color="green" fontcolor="black"] ++"ping-3_start_0 " -> "ping-3-clone_running_0" [ style = dashed] ++"ping-3_start_0 " [ style=dashed color="red" fontcolor="black"] ++"ping-3_stop_0 cluster01" -> "ping-3-clone_stopped_0" [ style = bold] ++"ping-3_stop_0 cluster01" -> "ping-3_start_0 " [ style = dashed] ++"ping-3_stop_0 cluster01" [ style=bold color="green" fontcolor="black"] ++} +diff --git a/cts/scheduler/dot/failed-probe-primitive.dot b/cts/scheduler/dot/failed-probe-primitive.dot +new file mode 100644 +index 0000000000..6e0c83216a +--- /dev/null ++++ b/cts/scheduler/dot/failed-probe-primitive.dot +@@ -0,0 +1,4 @@ ++ digraph "g" { ++"dummy-2_start_0 cluster02" [ style=bold color="green" fontcolor="black"] ++"dummy-3_stop_0 cluster01" [ style=bold color="green" fontcolor="black"] ++} +diff --git a/cts/scheduler/exp/failed-probe-clone.exp b/cts/scheduler/exp/failed-probe-clone.exp +new file mode 100644 +index 0000000000..6be18935bf +--- /dev/null ++++ b/cts/scheduler/exp/failed-probe-clone.exp +@@ -0,0 +1,141 @@ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ +diff --git a/cts/scheduler/exp/failed-probe-primitive.exp b/cts/scheduler/exp/failed-probe-primitive.exp +new file mode 100644 +index 0000000000..d0d8aa44dc +--- /dev/null ++++ b/cts/scheduler/exp/failed-probe-primitive.exp +@@ -0,0 +1,20 @@ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ +diff --git a/cts/scheduler/scores/failed-probe-clone.scores b/cts/scheduler/scores/failed-probe-clone.scores +new file mode 100644 +index 0000000000..7418b7f153 +--- 
/dev/null ++++ b/cts/scheduler/scores/failed-probe-clone.scores +@@ -0,0 +1,33 @@ ++ ++pcmk__clone_allocate: ping-1-clone allocation score on cluster01: -INFINITY ++pcmk__clone_allocate: ping-1-clone allocation score on cluster02: -INFINITY ++pcmk__clone_allocate: ping-1:0 allocation score on cluster01: -INFINITY ++pcmk__clone_allocate: ping-1:0 allocation score on cluster02: -INFINITY ++pcmk__clone_allocate: ping-1:1 allocation score on cluster01: -INFINITY ++pcmk__clone_allocate: ping-1:1 allocation score on cluster02: -INFINITY ++pcmk__clone_allocate: ping-2-clone allocation score on cluster01: -INFINITY ++pcmk__clone_allocate: ping-2-clone allocation score on cluster02: 0 ++pcmk__clone_allocate: ping-2:0 allocation score on cluster01: -INFINITY ++pcmk__clone_allocate: ping-2:0 allocation score on cluster02: 0 ++pcmk__clone_allocate: ping-2:1 allocation score on cluster01: -INFINITY ++pcmk__clone_allocate: ping-2:1 allocation score on cluster02: 0 ++pcmk__clone_allocate: ping-3-clone allocation score on cluster01: -INFINITY ++pcmk__clone_allocate: ping-3-clone allocation score on cluster02: -INFINITY ++pcmk__clone_allocate: ping-3:0 allocation score on cluster01: -INFINITY ++pcmk__clone_allocate: ping-3:0 allocation score on cluster02: -INFINITY ++pcmk__clone_allocate: ping-3:1 allocation score on cluster01: -INFINITY ++pcmk__clone_allocate: ping-3:1 allocation score on cluster02: -INFINITY ++pcmk__native_allocate: Fencing allocation score on cluster01: 0 ++pcmk__native_allocate: Fencing allocation score on cluster02: 0 ++pcmk__native_allocate: ping-1:0 allocation score on cluster01: -INFINITY ++pcmk__native_allocate: ping-1:0 allocation score on cluster02: -INFINITY ++pcmk__native_allocate: ping-1:1 allocation score on cluster01: -INFINITY ++pcmk__native_allocate: ping-1:1 allocation score on cluster02: -INFINITY ++pcmk__native_allocate: ping-2:0 allocation score on cluster01: -INFINITY ++pcmk__native_allocate: ping-2:0 allocation score on cluster02: 0 
++pcmk__native_allocate: ping-2:1 allocation score on cluster01: -INFINITY ++pcmk__native_allocate: ping-2:1 allocation score on cluster02: -INFINITY ++pcmk__native_allocate: ping-3:0 allocation score on cluster01: -INFINITY ++pcmk__native_allocate: ping-3:0 allocation score on cluster02: -INFINITY ++pcmk__native_allocate: ping-3:1 allocation score on cluster01: -INFINITY ++pcmk__native_allocate: ping-3:1 allocation score on cluster02: -INFINITY +diff --git a/cts/scheduler/scores/failed-probe-primitive.scores b/cts/scheduler/scores/failed-probe-primitive.scores +new file mode 100644 +index 0000000000..f313029451 +--- /dev/null ++++ b/cts/scheduler/scores/failed-probe-primitive.scores +@@ -0,0 +1,9 @@ ++ ++pcmk__native_allocate: Fencing allocation score on cluster01: 0 ++pcmk__native_allocate: Fencing allocation score on cluster02: 0 ++pcmk__native_allocate: dummy-1 allocation score on cluster01: -INFINITY ++pcmk__native_allocate: dummy-1 allocation score on cluster02: -INFINITY ++pcmk__native_allocate: dummy-2 allocation score on cluster01: -INFINITY ++pcmk__native_allocate: dummy-2 allocation score on cluster02: 0 ++pcmk__native_allocate: dummy-3 allocation score on cluster01: -INFINITY ++pcmk__native_allocate: dummy-3 allocation score on cluster02: -INFINITY +diff --git a/cts/scheduler/summary/failed-probe-clone.summary b/cts/scheduler/summary/failed-probe-clone.summary +new file mode 100644 +index 0000000000..ca15c302aa +--- /dev/null ++++ b/cts/scheduler/summary/failed-probe-clone.summary +@@ -0,0 +1,46 @@ ++Current cluster status: ++ * Node List: ++ * Online: [ cluster01 cluster02 ] ++ ++ * Full List of Resources: ++ * Fencing (stonith:fence_xvm): Started cluster01 ++ * Clone Set: ping-1-clone [ping-1]: ++ * Stopped: [ cluster01 cluster02 ] ++ * Clone Set: ping-2-clone [ping-2]: ++ * Stopped: [ cluster01 cluster02 ] ++ * Clone Set: ping-3-clone [ping-3]: ++ * ping-3 (ocf:pacemaker:ping): FAILED cluster01 ++ * Stopped: [ cluster02 ] ++ ++Transition Summary: ++ 
* Start ping-2:0 ( cluster02 ) ++ * Stop ping-3:0 ( cluster01 ) due to node availability ++ ++Executing Cluster Transition: ++ * Cluster action: clear_failcount for ping-1 on cluster02 ++ * Cluster action: clear_failcount for ping-1 on cluster01 ++ * Cluster action: clear_failcount for ping-2 on cluster02 ++ * Cluster action: clear_failcount for ping-2 on cluster01 ++ * Pseudo action: ping-2-clone_start_0 ++ * Cluster action: clear_failcount for ping-3 on cluster01 ++ * Cluster action: clear_failcount for ping-3 on cluster02 ++ * Pseudo action: ping-3-clone_stop_0 ++ * Resource action: ping-2 start on cluster02 ++ * Pseudo action: ping-2-clone_running_0 ++ * Resource action: ping-3 stop on cluster01 ++ * Pseudo action: ping-3-clone_stopped_0 ++ * Resource action: ping-2 monitor=10000 on cluster02 ++ ++Revised Cluster Status: ++ * Node List: ++ * Online: [ cluster01 cluster02 ] ++ ++ * Full List of Resources: ++ * Fencing (stonith:fence_xvm): Started cluster01 ++ * Clone Set: ping-1-clone [ping-1]: ++ * Stopped: [ cluster01 cluster02 ] ++ * Clone Set: ping-2-clone [ping-2]: ++ * Started: [ cluster02 ] ++ * Stopped: [ cluster01 ] ++ * Clone Set: ping-3-clone [ping-3]: ++ * Stopped: [ cluster01 cluster02 ] +diff --git a/cts/scheduler/summary/failed-probe-primitive.summary b/cts/scheduler/summary/failed-probe-primitive.summary +new file mode 100644 +index 0000000000..a634e7f00b +--- /dev/null ++++ b/cts/scheduler/summary/failed-probe-primitive.summary +@@ -0,0 +1,27 @@ ++Current cluster status: ++ * Node List: ++ * Online: [ cluster01 cluster02 ] ++ ++ * Full List of Resources: ++ * Fencing (stonith:fence_xvm): Started cluster01 ++ * dummy-1 (ocf:pacemaker:Dummy): Stopped ++ * dummy-2 (ocf:pacemaker:Dummy): Stopped ++ * dummy-3 (ocf:pacemaker:Dummy): FAILED cluster01 ++ ++Transition Summary: ++ * Start dummy-2 ( cluster02 ) ++ * Stop dummy-3 ( cluster01 ) due to node availability ++ ++Executing Cluster Transition: ++ * Resource action: dummy-2 start on cluster02 ++ * 
Resource action: dummy-3 stop on cluster01 ++ ++Revised Cluster Status: ++ * Node List: ++ * Online: [ cluster01 cluster02 ] ++ ++ * Full List of Resources: ++ * Fencing (stonith:fence_xvm): Started cluster01 ++ * dummy-1 (ocf:pacemaker:Dummy): Stopped ++ * dummy-2 (ocf:pacemaker:Dummy): Started cluster02 ++ * dummy-3 (ocf:pacemaker:Dummy): Stopped +diff --git a/cts/scheduler/xml/failed-probe-clone.xml b/cts/scheduler/xml/failed-probe-clone.xml +new file mode 100644 +index 0000000000..f677585bab +--- /dev/null ++++ b/cts/scheduler/xml/failed-probe-clone.xml +@@ -0,0 +1,110 @@ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ +diff --git a/cts/scheduler/xml/failed-probe-primitive.xml b/cts/scheduler/xml/failed-probe-primitive.xml +new file mode 100644 +index 0000000000..0c2f6416f5 +--- /dev/null ++++ b/cts/scheduler/xml/failed-probe-primitive.xml +@@ -0,0 +1,71 @@ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ +-- +2.27.0 + + +From 271d50e7d6b0ee5ef670b571c6d7aae9272b75ad Mon Sep 17 00:00:00 2001 +From: Chris Lumens +Date: Thu, 11 Nov 2021 13:57:05 -0500 +Subject: [PATCH 13/21] Feature: scheduler: Don't output failed resource + probes... + +in the crm_mon "Failed Resource Actions" section. It is expected that +these one-off probes will fail, in which case displaying them in that +section can just come across as confusing to the user. + +And update the crm_mon test output to account for these changes. 
+ +See: rhbz#1506372 +--- + cts/cli/regression.crm_mon.exp | 20 -------------------- + lib/pengine/pe_output.c | 4 ++++ + 2 files changed, 4 insertions(+), 20 deletions(-) + +diff --git a/cts/cli/regression.crm_mon.exp b/cts/cli/regression.crm_mon.exp +index d7b9d98e2c..b1643f8b29 100644 +--- a/cts/cli/regression.crm_mon.exp ++++ b/cts/cli/regression.crm_mon.exp +@@ -3498,10 +3498,6 @@ Active Resources: + + Failed Resource Actions: + * dummy-2_monitor_0 on cluster02 'unimplemented feature' (3): call=2, status='complete', last-rc-change='Wed Sep 2 12:17:38 2020', queued=0ms, exec=33ms +- * dummy-4_monitor_0 on cluster02 'not installed' (5): call=2, status='complete', last-rc-change='Wed Sep 2 12:17:38 2020', queued=0ms, exec=0ms +- * smart-mon_monitor_0 on cluster02 'not installed' (5): call=9, status='complete', last-rc-change='Tue Nov 9 15:38:55 2021', queued=0ms, exec=33ms +- * ping_monitor_0 on cluster02 'not installed' (5): call=6, status='complete', last-rc-change='Thu Nov 18 13:11:42 2021', queued=0ms, exec=0ms +- * httpd_monitor_0 on httpd-bundle-1 'invalid parameter' (2): call=1, status='complete', last-rc-change='Wed May 27 15:43:09 2020', queued=0ms, exec=0ms + =#=#=#= End test: Text output of partially active resources - OK (0) =#=#=#= + * Passed: crm_mon - Text output of partially active resources + =#=#=#= Begin test: XML output of partially active resources =#=#=#= +@@ -3646,10 +3642,6 @@ Failed Resource Actions: + + + +- +- +- +- + + + +@@ -3693,10 +3685,6 @@ Full List of Resources: + + Failed Resource Actions: + * dummy-2_monitor_0 on cluster02 'unimplemented feature' (3): call=2, status='complete', last-rc-change='Wed Sep 2 12:17:38 2020', queued=0ms, exec=33ms +- * dummy-4_monitor_0 on cluster02 'not installed' (5): call=2, status='complete', last-rc-change='Wed Sep 2 12:17:38 2020', queued=0ms, exec=0ms +- * smart-mon_monitor_0 on cluster02 'not installed' (5): call=9, status='complete', last-rc-change='Tue Nov 9 15:38:55 2021', queued=0ms, 
exec=33ms +- * ping_monitor_0 on cluster02 'not installed' (5): call=6, status='complete', last-rc-change='Thu Nov 18 13:11:42 2021', queued=0ms, exec=0ms +- * httpd_monitor_0 on httpd-bundle-1 'invalid parameter' (2): call=1, status='complete', last-rc-change='Wed May 27 15:43:09 2020', queued=0ms, exec=0ms + =#=#=#= End test: Text output of partially active resources, with inactive resources - OK (0) =#=#=#= + * Passed: crm_mon - Text output of partially active resources, with inactive resources + =#=#=#= Begin test: Complete brief text output, with inactive resources =#=#=#= +@@ -3784,10 +3772,6 @@ Operations: + + Failed Resource Actions: + * dummy-2_monitor_0 on cluster02 'unimplemented feature' (3): call=2, status='complete', last-rc-change='Wed Sep 2 12:17:38 2020', queued=0ms, exec=33ms +- * dummy-4_monitor_0 on cluster02 'not installed' (5): call=2, status='complete', last-rc-change='Wed Sep 2 12:17:38 2020', queued=0ms, exec=0ms +- * smart-mon_monitor_0 on cluster02 'not installed' (5): call=9, status='complete', last-rc-change='Tue Nov 9 15:38:55 2021', queued=0ms, exec=33ms +- * ping_monitor_0 on cluster02 'not installed' (5): call=6, status='complete', last-rc-change='Thu Nov 18 13:11:42 2021', queued=0ms, exec=0ms +- * httpd_monitor_0 on httpd-bundle-1 'invalid parameter' (2): call=1, status='complete', last-rc-change='Wed May 27 15:43:09 2020', queued=0ms, exec=0ms + =#=#=#= End test: Complete brief text output, with inactive resources - OK (0) =#=#=#= + * Passed: crm_mon - Complete brief text output, with inactive resources + =#=#=#= Begin test: Text output of partially active group =#=#=#= +@@ -3959,10 +3943,6 @@ Operations: + + Failed Resource Actions: + * dummy-2_monitor_0 on cluster02 'unimplemented feature' (3): call=2, status='complete', last-rc-change='Wed Sep 2 12:17:38 2020', queued=0ms, exec=33ms +- * dummy-4_monitor_0 on cluster02 'not installed' (5): call=2, status='complete', last-rc-change='Wed Sep 2 12:17:38 2020', queued=0ms, exec=0ms 
+- * smart-mon_monitor_0 on cluster02 'not installed' (5): call=9, status='complete', last-rc-change='Tue Nov 9 15:38:55 2021', queued=0ms, exec=33ms +- * ping_monitor_0 on cluster02 'not installed' (5): call=6, status='complete', last-rc-change='Thu Nov 18 13:11:42 2021', queued=0ms, exec=0ms +- * httpd_monitor_0 on httpd-bundle-1 'invalid parameter' (2): call=1, status='complete', last-rc-change='Wed May 27 15:43:09 2020', queued=0ms, exec=0ms + =#=#=#= End test: Complete brief text output grouped by node, with inactive resources - OK (0) =#=#=#= + * Passed: crm_mon - Complete brief text output grouped by node, with inactive resources + =#=#=#= Begin test: Text output of partially active resources, with inactive resources, filtered by node =#=#=#= +diff --git a/lib/pengine/pe_output.c b/lib/pengine/pe_output.c +index 715e001d51..84684598dd 100644 +--- a/lib/pengine/pe_output.c ++++ b/lib/pengine/pe_output.c +@@ -1370,6 +1370,10 @@ failed_action_list(pcmk__output_t *out, va_list args) { + continue; + } + ++ if (pcmk_xe_mask_probe_failure(xml_op)) { ++ continue; ++ } ++ + id = crm_element_value(xml_op, XML_LRM_ATTR_TASK_KEY); + if (parse_op_key(id ? id : ID(xml_op), &rsc, NULL, NULL) == FALSE) { + continue; +-- +2.27.0 + + +From 90f641b9223c64701d494297ce3dd3382365acb8 Mon Sep 17 00:00:00 2001 +From: Chris Lumens +Date: Tue, 9 Nov 2021 10:11:19 -0500 +Subject: [PATCH 14/21] Feature: scheduler: Add a function for finding a failed + probe action... + +for a given resource ID. Optionally, a node ID can also be given to +restrict the failed probe action to one run on the given node. +Otherwise, just the first failed probe action for the resource ID will +be returned. 
+ +See: rhbz#1506372 +--- + include/crm/pengine/internal.h | 2 ++ + lib/pengine/utils.c | 42 ++++++++++++++++++++++++++++++++++ + 2 files changed, 44 insertions(+) + +diff --git a/include/crm/pengine/internal.h b/include/crm/pengine/internal.h +index 8c8fbaca90..58dd2e8727 100644 +--- a/include/crm/pengine/internal.h ++++ b/include/crm/pengine/internal.h +@@ -574,4 +574,6 @@ gboolean pe__clone_is_filtered(pe_resource_t *rsc, GList *only_rsc, gboolean che + gboolean pe__group_is_filtered(pe_resource_t *rsc, GList *only_rsc, gboolean check_parent); + gboolean pe__native_is_filtered(pe_resource_t *rsc, GList *only_rsc, gboolean check_parent); + ++xmlNode *pe__failed_probe_for_rsc(pe_resource_t *rsc, const char *name); ++ + #endif +diff --git a/lib/pengine/utils.c b/lib/pengine/utils.c +index 07753e173a..3151f0120b 100644 +--- a/lib/pengine/utils.c ++++ b/lib/pengine/utils.c +@@ -2569,3 +2569,45 @@ pe__build_rsc_list(pe_working_set_t *data_set, const char *s) { + + return resources; + } ++ ++xmlNode * ++pe__failed_probe_for_rsc(pe_resource_t *rsc, const char *name) ++{ ++ const char *rsc_id = rsc->id; ++ ++ for (xmlNode *xml_op = pcmk__xml_first_child(rsc->cluster->failed); xml_op != NULL; ++ xml_op = pcmk__xml_next(xml_op)) { ++ const char *value = NULL; ++ char *op_id = NULL; ++ ++ /* This resource operation is not a failed probe. */ ++ if (!pcmk_xe_mask_probe_failure(xml_op)) { ++ continue; ++ } ++ ++ /* This resource operation was not run on the given node. Note that if name is ++ * NULL, this will always succeed. ++ */ ++ value = crm_element_value(xml_op, XML_LRM_ATTR_TARGET); ++ if (value == NULL || !pcmk__str_eq(value, name, pcmk__str_casei|pcmk__str_null_matches)) { ++ continue; ++ } ++ ++ /* This resource operation has no operation_key. */ ++ value = crm_element_value(xml_op, XML_LRM_ATTR_TASK_KEY); ++ if (!parse_op_key(value ? 
value : ID(xml_op), &op_id, NULL, NULL)) { ++ continue; ++ } ++ ++ /* This resource operation's ID does not match the rsc_id we are looking for. */ ++ if (!pcmk__str_eq(op_id, rsc_id, pcmk__str_none)) { ++ free(op_id); ++ continue; ++ } ++ ++ free(op_id); ++ return xml_op; ++ } ++ ++ return NULL; ++} +-- +2.27.0 + + +From 2ad9774fe994554243078b131799fed0d1a6dffd Mon Sep 17 00:00:00 2001 +From: Chris Lumens +Date: Tue, 9 Nov 2021 15:43:24 -0500 +Subject: [PATCH 15/21] Feature: scheduler: Display the reason why a native rsc + probe failed. + +If inactive resources are being shown, add an extra blurb of text to any +stopped resources that have a failed probe action indicating why the +probe failed. + +And then add a new primitive resource to crm_mon-partial.xml with a +failed probe operation and update the expected test output. + +See: rhbz#1506372 +--- + cts/cli/regression.crm_mon.exp | 10 +++++----- + cts/scheduler/summary/failed-probe-primitive.summary | 8 ++++---- + cts/scheduler/summary/multiply-active-stonith.summary | 2 +- + lib/pengine/native.c | 11 +++++++++++ + 4 files changed, 21 insertions(+), 10 deletions(-) + +diff --git a/cts/cli/regression.crm_mon.exp b/cts/cli/regression.crm_mon.exp +index b1643f8b29..4333caa11c 100644 +--- a/cts/cli/regression.crm_mon.exp ++++ b/cts/cli/regression.crm_mon.exp +@@ -3680,8 +3680,8 @@ Full List of Resources: + * dummy-1 (ocf:pacemaker:Dummy): Started cluster02 + * dummy-2 (ocf:pacemaker:Dummy): FAILED cluster02 + * dummy-3 (ocf:pacemaker:Dummy): Stopped (disabled) +- * dummy-4 (ocf:pacemaker:Dummy): Stopped +- * smart-mon (ocf:pacemaker:HealthSMART): Stopped ++ * dummy-4 (ocf:pacemaker:Dummy): Stopped (not installed) ++ * smart-mon (ocf:pacemaker:HealthSMART): Stopped (not installed) + + Failed Resource Actions: + * dummy-2_monitor_0 on cluster02 'unimplemented feature' (3): call=2, status='complete', last-rc-change='Wed Sep 2 12:17:38 2020', queued=0ms, exec=33ms +@@ -3811,7 +3811,7 @@ Full List of Resources: + * 
dummy-1 (ocf:pacemaker:Dummy): Started cluster02 + * dummy-2 (ocf:pacemaker:Dummy): FAILED cluster02 + * dummy-3 (ocf:pacemaker:Dummy): Stopped (disabled) +- * dummy-4 (ocf:pacemaker:Dummy): Stopped ++ * dummy-4 (ocf:pacemaker:Dummy): Stopped (not installed) + =#=#=#= End test: Text output of partially active group, with inactive resources - OK (0) =#=#=#= + * Passed: crm_mon - Text output of partially active group, with inactive resources + =#=#=#= Begin test: Text output of active member of partially active group =#=#=#= +@@ -3889,7 +3889,7 @@ Inactive Resources: + * ping (ocf:pacemaker:ping): Stopped + * Resource Group: partially-active-group: + * 2/4 (ocf:pacemaker:Dummy): Active cluster02 +- * smart-mon (ocf:pacemaker:HealthSMART): Stopped ++ * smart-mon (ocf:pacemaker:HealthSMART): Stopped (not installed) + + Node Attributes: + * Node: cluster01 (1): +@@ -3963,7 +3963,7 @@ Full List of Resources: + * Fencing (stonith:fence_xvm): Started cluster01 + * Container bundle set: httpd-bundle [pcmk:http]: + * httpd-bundle-1 (192.168.122.132) (ocf:heartbeat:apache): FAILED cluster01 +- * smart-mon (ocf:pacemaker:HealthSMART): Stopped ++ * smart-mon (ocf:pacemaker:HealthSMART): Stopped (not installed) + =#=#=#= End test: Text output of partially active resources, with inactive resources, filtered by node - OK (0) =#=#=#= + * Passed: crm_mon - Text output of partially active resources, with inactive resources, filtered by node + =#=#=#= Begin test: Text output of partially active resources, filtered by node =#=#=#= +diff --git a/cts/scheduler/summary/failed-probe-primitive.summary b/cts/scheduler/summary/failed-probe-primitive.summary +index a634e7f00b..ea8edae494 100644 +--- a/cts/scheduler/summary/failed-probe-primitive.summary ++++ b/cts/scheduler/summary/failed-probe-primitive.summary +@@ -4,8 +4,8 @@ Current cluster status: + + * Full List of Resources: + * Fencing (stonith:fence_xvm): Started cluster01 +- * dummy-1 (ocf:pacemaker:Dummy): Stopped +- * dummy-2 
(ocf:pacemaker:Dummy): Stopped ++ * dummy-1 (ocf:pacemaker:Dummy): Stopped (not installed) ++ * dummy-2 (ocf:pacemaker:Dummy): Stopped (not installed) + * dummy-3 (ocf:pacemaker:Dummy): FAILED cluster01 + + Transition Summary: +@@ -22,6 +22,6 @@ Revised Cluster Status: + + * Full List of Resources: + * Fencing (stonith:fence_xvm): Started cluster01 +- * dummy-1 (ocf:pacemaker:Dummy): Stopped ++ * dummy-1 (ocf:pacemaker:Dummy): Stopped (not installed) + * dummy-2 (ocf:pacemaker:Dummy): Started cluster02 +- * dummy-3 (ocf:pacemaker:Dummy): Stopped ++ * dummy-3 (ocf:pacemaker:Dummy): Stopped (not installed) +diff --git a/cts/scheduler/summary/multiply-active-stonith.summary b/cts/scheduler/summary/multiply-active-stonith.summary +index 8ce21d68ee..ec37de03b0 100644 +--- a/cts/scheduler/summary/multiply-active-stonith.summary ++++ b/cts/scheduler/summary/multiply-active-stonith.summary +@@ -25,4 +25,4 @@ Revised Cluster Status: + + * Full List of Resources: + * fencer (stonith:fence_ipmilan): Started node3 +- * rsc1 (lsb:rsc1): Stopped ++ * rsc1 (lsb:rsc1): Stopped (not installed) +diff --git a/lib/pengine/native.c b/lib/pengine/native.c +index 36121c527f..a95c90c09a 100644 +--- a/lib/pengine/native.c ++++ b/lib/pengine/native.c +@@ -599,6 +599,17 @@ pcmk__native_output_string(pe_resource_t *rsc, const char *name, pe_node_t *node + g_string_append_printf(outstr, " %s", node->details->uname); + } + ++ // Failed probe operation ++ if (native_displayable_role(rsc) == RSC_ROLE_STOPPED) { ++ xmlNode *probe_op = pe__failed_probe_for_rsc(rsc, node ? 
node->details->uname : NULL); ++ if (probe_op != NULL) { ++ int rc; ++ ++ pcmk__scan_min_int(crm_element_value(probe_op, XML_LRM_ATTR_RC), &rc, 0); ++ g_string_append_printf(outstr, " (%s) ", services_ocf_exitcode_str(rc)); ++ } ++ } ++ + // Flags, as: ( [...]) + if (node && !(node->details->online) && node->details->unclean) { + have_flags = add_output_flag(outstr, "UNCLEAN", have_flags); +-- +2.27.0 + + +From b9ca2e834ee01b35c03f153438ef8828b609fb38 Mon Sep 17 00:00:00 2001 +From: Chris Lumens +Date: Thu, 18 Nov 2021 10:41:42 -0500 +Subject: [PATCH 16/21] Refactor: scheduler: Rearrange pe__clone_default. + +Instead of the single stopped list, maintain a hash table where the keys +are nodes and the values are the status of the node. For now, this is +just "Stopped" or "Stopped (disabled)" but in the future will be +expanded to cover failed probe operations. +--- + lib/pengine/clone.c | 103 +++++++++++++++++++++++++++++++++++--------- + 1 file changed, 82 insertions(+), 21 deletions(-) + +diff --git a/lib/pengine/clone.c b/lib/pengine/clone.c +index 5569c6b6e9..58fb24d24e 100644 +--- a/lib/pengine/clone.c ++++ b/lib/pengine/clone.c +@@ -28,6 +28,55 @@ + #define UNPROMOTED_INSTANCES RSC_ROLE_UNPROMOTED_S + #endif + ++static GList * ++sorted_hash_table_values(GHashTable *table) ++{ ++ GList *retval = NULL; ++ GHashTableIter iter; ++ gpointer key, value; ++ ++ g_hash_table_iter_init(&iter, table); ++ while (g_hash_table_iter_next(&iter, &key, &value)) { ++ if (!g_list_find_custom(retval, value, (GCompareFunc) strcmp)) { ++ retval = g_list_prepend(retval, (char *) value); ++ } ++ } ++ ++ retval = g_list_sort(retval, (GCompareFunc) strcmp); ++ return retval; ++} ++ ++static GList * ++nodes_with_status(GHashTable *table, const char *status) ++{ ++ GList *retval = NULL; ++ GHashTableIter iter; ++ gpointer key, value; ++ ++ g_hash_table_iter_init(&iter, table); ++ while (g_hash_table_iter_next(&iter, &key, &value)) { ++ if (!strcmp((char *) value, status)) { ++ retval = 
g_list_prepend(retval, key); ++ } ++ } ++ ++ retval = g_list_sort(retval, (GCompareFunc) pcmk__numeric_strcasecmp); ++ return retval; ++} ++ ++static char * ++node_list_to_str(GList *list) ++{ ++ char *retval = NULL; ++ size_t len = 0; ++ ++ for (GList *iter = list; iter != NULL; iter = iter->next) { ++ pcmk__add_word(&retval, &len, (char *) iter->data); ++ } ++ ++ return retval; ++} ++ + static void + clone_header(pcmk__output_t *out, int *rc, pe_resource_t *rsc, clone_variant_data_t *clone_data) + { +@@ -710,10 +759,10 @@ pe__clone_default(pcmk__output_t *out, va_list args) + GList *only_node = va_arg(args, GList *); + GList *only_rsc = va_arg(args, GList *); + ++ GHashTable *stopped = pcmk__strkey_table(free, free); ++ + char *list_text = NULL; +- char *stopped_list = NULL; + size_t list_text_len = 0; +- size_t stopped_list_len = 0; + + GList *promoted_list = NULL; + GList *started_list = NULL; +@@ -768,7 +817,7 @@ pe__clone_default(pcmk__output_t *out, va_list args) + // List stopped instances when requested (except orphans) + if (!pcmk_is_set(child_rsc->flags, pe_rsc_orphan) + && pcmk_is_set(show_opts, pcmk_show_inactive_rscs)) { +- pcmk__add_word(&stopped_list, &stopped_list_len, child_rsc->id); ++ g_hash_table_insert(stopped, strdup(child_rsc->id), strdup("Stopped")); + } + + } else if (is_set_recursive(child_rsc, pe_rsc_orphan, TRUE) +@@ -822,7 +871,7 @@ pe__clone_default(pcmk__output_t *out, va_list args) + } + + if (pcmk_is_set(show_opts, pcmk_show_clone_detail)) { +- free(stopped_list); ++ g_hash_table_destroy(stopped); + PCMK__OUTPUT_LIST_FOOTER(out, rc); + return pcmk_rc_ok; + } +@@ -890,23 +939,15 @@ pe__clone_default(pcmk__output_t *out, va_list args) + } + + if (pcmk_is_set(show_opts, pcmk_show_inactive_rscs)) { +- const char *state = "Stopped"; +- enum rsc_role_e role = configured_role(rsc); +- +- if (role == RSC_ROLE_STOPPED) { +- state = "Stopped (disabled)"; +- } +- + if (!pcmk_is_set(rsc->flags, pe_rsc_unique) + && (clone_data->clone_max > 
active_instances)) { + + GList *nIter; + GList *list = g_hash_table_get_values(rsc->allowed_nodes); + +- /* Custom stopped list for non-unique clones */ +- free(stopped_list); +- stopped_list = NULL; +- stopped_list_len = 0; ++ /* Custom stopped table for non-unique clones */ ++ g_hash_table_destroy(stopped); ++ stopped = pcmk__strkey_table(free, free); + + if (list == NULL) { + /* Clusters with symmetrical=false haven't calculated allowed_nodes yet +@@ -922,19 +963,39 @@ pe__clone_default(pcmk__output_t *out, va_list args) + if (pe_find_node(rsc->running_on, node->details->uname) == NULL && + pcmk__str_in_list(node->details->uname, only_node, + pcmk__str_star_matches|pcmk__str_casei)) { +- pcmk__add_word(&stopped_list, &stopped_list_len, +- node->details->uname); ++ const char *state = "Stopped"; ++ ++ if (configured_role(rsc) == RSC_ROLE_STOPPED) { ++ state = "Stopped (disabled)"; ++ } ++ ++ g_hash_table_insert(stopped, strdup(node->details->uname), ++ strdup(state)); + } + } + g_list_free(list); + } + +- if (stopped_list != NULL) { ++ if (g_hash_table_size(stopped) > 0) { ++ GList *list = sorted_hash_table_values(stopped); ++ + clone_header(out, &rc, rsc, clone_data); + +- out->list_item(out, NULL, "%s: [ %s ]", state, stopped_list); +- free(stopped_list); +- stopped_list_len = 0; ++ for (GList *status_iter = list; status_iter != NULL; status_iter = status_iter->next) { ++ const char *status = status_iter->data; ++ GList *nodes = nodes_with_status(stopped, status); ++ char *str = node_list_to_str(nodes); ++ ++ if (str != NULL) { ++ out->list_item(out, NULL, "%s: [ %s ]", status, str); ++ free(str); ++ } ++ ++ g_list_free(nodes); ++ } ++ ++ g_list_free(list); ++ g_hash_table_destroy(stopped); + + /* If there are no instances of this clone (perhaps because there are no + * nodes configured), simply output the clone header by itself. 
This can +-- +2.27.0 + + +From 0228a64cea412936fb8ee91b0f83f9800048d3ba Mon Sep 17 00:00:00 2001 +From: Chris Lumens +Date: Fri, 19 Nov 2021 10:06:18 -0500 +Subject: [PATCH 17/21] Feature: scheduler: Display the reason why a clone rsc + probe failed. + +This is similar to the previous commit that adds reasons for primitive +resources. + +See: rhbz#1506372 +--- + cts/cli/regression.crm_mon.exp | 8 +++---- + .../summary/failed-probe-clone.summary | 14 +++++++------ + include/crm/pengine/internal.h | 2 ++ + lib/pengine/clone.c | 21 +++++++++++++++++-- + lib/pengine/utils.c | 7 +++++++ + 5 files changed, 40 insertions(+), 12 deletions(-) + +diff --git a/cts/cli/regression.crm_mon.exp b/cts/cli/regression.crm_mon.exp +index 4333caa11c..5688500ce5 100644 +--- a/cts/cli/regression.crm_mon.exp ++++ b/cts/cli/regression.crm_mon.exp +@@ -3479,7 +3479,7 @@ Node List: + Active Resources: + * Clone Set: ping-clone [ping]: + * ping (ocf:pacemaker:ping): Started cluster01 +- * ping (ocf:pacemaker:ping): Stopped ++ * ping (ocf:pacemaker:ping): Stopped (not installed) + * Fencing (stonith:fence_xvm): Started cluster01 + * Container bundle set: httpd-bundle [pcmk:http]: + * Replica[0] +@@ -3663,7 +3663,7 @@ Node List: + Full List of Resources: + * Clone Set: ping-clone [ping]: + * ping (ocf:pacemaker:ping): Started cluster01 +- * ping (ocf:pacemaker:ping): Stopped ++ * ping (ocf:pacemaker:ping): Stopped (not installed) + * Fencing (stonith:fence_xvm): Started cluster01 + * Container bundle set: httpd-bundle [pcmk:http]: + * Replica[0] +@@ -3705,7 +3705,7 @@ Full List of Resources: + * 1/1 (stonith:fence_xvm): Active cluster01 + * Clone Set: ping-clone [ping]: + * ping (ocf:pacemaker:ping): Started cluster01 +- * ping (ocf:pacemaker:ping): Stopped ++ * ping (ocf:pacemaker:ping): Stopped (not installed) + * Container bundle set: httpd-bundle [pcmk:http]: + * Replica[0] + * httpd-bundle-ip-192.168.122.131 (ocf:heartbeat:IPaddr2): Started cluster02 +@@ -3886,7 +3886,7 @@ Node List: + 
Inactive Resources: + * Clone Set: ping-clone [ping]: + * ping (ocf:pacemaker:ping): Started cluster01 +- * ping (ocf:pacemaker:ping): Stopped ++ * ping (ocf:pacemaker:ping): Stopped (not installed) + * Resource Group: partially-active-group: + * 2/4 (ocf:pacemaker:Dummy): Active cluster02 + * smart-mon (ocf:pacemaker:HealthSMART): Stopped (not installed) +diff --git a/cts/scheduler/summary/failed-probe-clone.summary b/cts/scheduler/summary/failed-probe-clone.summary +index ca15c302aa..febee14400 100644 +--- a/cts/scheduler/summary/failed-probe-clone.summary ++++ b/cts/scheduler/summary/failed-probe-clone.summary +@@ -5,12 +5,13 @@ Current cluster status: + * Full List of Resources: + * Fencing (stonith:fence_xvm): Started cluster01 + * Clone Set: ping-1-clone [ping-1]: +- * Stopped: [ cluster01 cluster02 ] ++ * Stopped (not installed): [ cluster01 cluster02 ] + * Clone Set: ping-2-clone [ping-2]: +- * Stopped: [ cluster01 cluster02 ] ++ * Stopped: [ cluster02 ] ++ * Stopped (not installed): [ cluster01 ] + * Clone Set: ping-3-clone [ping-3]: + * ping-3 (ocf:pacemaker:ping): FAILED cluster01 +- * Stopped: [ cluster02 ] ++ * Stopped (not installed): [ cluster02 ] + + Transition Summary: + * Start ping-2:0 ( cluster02 ) +@@ -38,9 +39,10 @@ Revised Cluster Status: + * Full List of Resources: + * Fencing (stonith:fence_xvm): Started cluster01 + * Clone Set: ping-1-clone [ping-1]: +- * Stopped: [ cluster01 cluster02 ] ++ * Stopped (not installed): [ cluster01 cluster02 ] + * Clone Set: ping-2-clone [ping-2]: + * Started: [ cluster02 ] +- * Stopped: [ cluster01 ] ++ * Stopped (not installed): [ cluster01 ] + * Clone Set: ping-3-clone [ping-3]: +- * Stopped: [ cluster01 cluster02 ] ++ * Stopped: [ cluster01 ] ++ * Stopped (not installed): [ cluster02 ] +diff --git a/include/crm/pengine/internal.h b/include/crm/pengine/internal.h +index 58dd2e8727..2b20da6e5f 100644 +--- a/include/crm/pengine/internal.h ++++ b/include/crm/pengine/internal.h +@@ -576,4 +576,6 @@ gboolean 
pe__native_is_filtered(pe_resource_t *rsc, GList *only_rsc, gboolean ch + + xmlNode *pe__failed_probe_for_rsc(pe_resource_t *rsc, const char *name); + ++const char *pe__clone_child_id(pe_resource_t *rsc); ++ + #endif +diff --git a/lib/pengine/clone.c b/lib/pengine/clone.c +index 58fb24d24e..ef4bdc0edf 100644 +--- a/lib/pengine/clone.c ++++ b/lib/pengine/clone.c +@@ -963,14 +963,23 @@ pe__clone_default(pcmk__output_t *out, va_list args) + if (pe_find_node(rsc->running_on, node->details->uname) == NULL && + pcmk__str_in_list(node->details->uname, only_node, + pcmk__str_star_matches|pcmk__str_casei)) { ++ xmlNode *probe_op = pe__failed_probe_for_rsc(rsc, node->details->uname); + const char *state = "Stopped"; + + if (configured_role(rsc) == RSC_ROLE_STOPPED) { + state = "Stopped (disabled)"; + } + +- g_hash_table_insert(stopped, strdup(node->details->uname), +- strdup(state)); ++ if (probe_op != NULL) { ++ int rc; ++ ++ pcmk__scan_min_int(crm_element_value(probe_op, XML_LRM_ATTR_RC), &rc, 0); ++ g_hash_table_insert(stopped, strdup(node->details->uname), ++ crm_strdup_printf("Stopped (%s)", services_ocf_exitcode_str(rc))); ++ } else { ++ g_hash_table_insert(stopped, strdup(node->details->uname), ++ strdup(state)); ++ } + } + } + g_list_free(list); +@@ -1113,3 +1122,11 @@ pe__clone_is_filtered(pe_resource_t *rsc, GList *only_rsc, gboolean check_parent + + return !passes; + } ++ ++const char * ++pe__clone_child_id(pe_resource_t *rsc) ++{ ++ clone_variant_data_t *clone_data = NULL; ++ get_clone_variant_data(clone_data, rsc); ++ return ID(clone_data->xml_obj_child); ++} +diff --git a/lib/pengine/utils.c b/lib/pengine/utils.c +index 3151f0120b..6c4f3b6971 100644 +--- a/lib/pengine/utils.c ++++ b/lib/pengine/utils.c +@@ -2573,8 +2573,15 @@ pe__build_rsc_list(pe_working_set_t *data_set, const char *s) { + xmlNode * + pe__failed_probe_for_rsc(pe_resource_t *rsc, const char *name) + { ++ pe_resource_t *parent = uber_parent(rsc); + const char *rsc_id = rsc->id; + ++ if 
(rsc->variant == pe_clone) { ++ rsc_id = pe__clone_child_id(rsc); ++ } else if (parent->variant == pe_clone) { ++ rsc_id = pe__clone_child_id(parent); ++ } ++ + for (xmlNode *xml_op = pcmk__xml_first_child(rsc->cluster->failed); xml_op != NULL; + xml_op = pcmk__xml_next(xml_op)) { + const char *value = NULL; +-- +2.27.0 + + +From cf8b01da93fce87526617fefdcee6eb9f6ecdbd1 Mon Sep 17 00:00:00 2001 +From: Chris Lumens +Date: Wed, 24 Nov 2021 10:57:05 -0500 +Subject: [PATCH 18/21] Test: cts-cli: Update the last-rc-change sed + expression. + +This can now occur in both the XML output (where it's wrapped in double +quotes) and the text output (where it's wrapped in single quotes and +followed by a comma). In addition, a plus or minus can occur in the +time string. + +The "{0,1}" syntax takes the place of a "?" for marking the optional +comma. In FreeBSD sed, "?" doesn't mean anything special. +--- + cts/cli/regression.crm_mon.exp | 12 ++++++------ + cts/cts-cli.in | 2 +- + 2 files changed, 7 insertions(+), 7 deletions(-) + +diff --git a/cts/cli/regression.crm_mon.exp b/cts/cli/regression.crm_mon.exp +index 5688500ce5..957758832d 100644 +--- a/cts/cli/regression.crm_mon.exp ++++ b/cts/cli/regression.crm_mon.exp +@@ -3497,7 +3497,7 @@ Active Resources: + * dummy-2 (ocf:pacemaker:Dummy): FAILED cluster02 + + Failed Resource Actions: +- * dummy-2_monitor_0 on cluster02 'unimplemented feature' (3): call=2, status='complete', last-rc-change='Wed Sep 2 12:17:38 2020', queued=0ms, exec=33ms ++ * dummy-2_monitor_0 on cluster02 'unimplemented feature' (3): call=2, status='complete', queued=0ms, exec=33ms + =#=#=#= End test: Text output of partially active resources - OK (0) =#=#=#= + * Passed: crm_mon - Text output of partially active resources + =#=#=#= Begin test: XML output of partially active resources =#=#=#= +@@ -3641,7 +3641,7 @@ Failed Resource Actions: + + + +- ++ + + + +@@ -3684,7 +3684,7 @@ Full List of Resources: + * smart-mon (ocf:pacemaker:HealthSMART): Stopped (not 
installed) + + Failed Resource Actions: +- * dummy-2_monitor_0 on cluster02 'unimplemented feature' (3): call=2, status='complete', last-rc-change='Wed Sep 2 12:17:38 2020', queued=0ms, exec=33ms ++ * dummy-2_monitor_0 on cluster02 'unimplemented feature' (3): call=2, status='complete', queued=0ms, exec=33ms + =#=#=#= End test: Text output of partially active resources, with inactive resources - OK (0) =#=#=#= + * Passed: crm_mon - Text output of partially active resources, with inactive resources + =#=#=#= Begin test: Complete brief text output, with inactive resources =#=#=#= +@@ -3771,7 +3771,7 @@ Operations: + * (1) probe + + Failed Resource Actions: +- * dummy-2_monitor_0 on cluster02 'unimplemented feature' (3): call=2, status='complete', last-rc-change='Wed Sep 2 12:17:38 2020', queued=0ms, exec=33ms ++ * dummy-2_monitor_0 on cluster02 'unimplemented feature' (3): call=2, status='complete', queued=0ms, exec=33ms + =#=#=#= End test: Complete brief text output, with inactive resources - OK (0) =#=#=#= + * Passed: crm_mon - Complete brief text output, with inactive resources + =#=#=#= Begin test: Text output of partially active group =#=#=#= +@@ -3850,7 +3850,7 @@ Active Resources: + * dummy-2 (ocf:pacemaker:Dummy): FAILED cluster02 + + Failed Resource Actions: +- * dummy-2_monitor_0 on cluster02 'unimplemented feature' (3): call=2, status='complete', last-rc-change='Wed Sep 2 12:17:38 2020', queued=0ms, exec=33ms ++ * dummy-2_monitor_0 on cluster02 'unimplemented feature' (3): call=2, status='complete', queued=0ms, exec=33ms + =#=#=#= End test: Text output of inactive member of partially active group - OK (0) =#=#=#= + * Passed: crm_mon - Text output of inactive member of partially active group + =#=#=#= Begin test: Complete brief text output grouped by node, with inactive resources =#=#=#= +@@ -3942,7 +3942,7 @@ Operations: + * (1) probe + + Failed Resource Actions: +- * dummy-2_monitor_0 on cluster02 'unimplemented feature' (3): call=2, status='complete', 
last-rc-change='Wed Sep 2 12:17:38 2020', queued=0ms, exec=33ms ++ * dummy-2_monitor_0 on cluster02 'unimplemented feature' (3): call=2, status='complete', queued=0ms, exec=33ms + =#=#=#= End test: Complete brief text output grouped by node, with inactive resources - OK (0) =#=#=#= + * Passed: crm_mon - Complete brief text output grouped by node, with inactive resources + =#=#=#= Begin test: Text output of partially active resources, with inactive resources, filtered by node =#=#=#= +diff --git a/cts/cts-cli.in b/cts/cts-cli.in +index 457816afab..72e9a1e912 100755 +--- a/cts/cts-cli.in ++++ b/cts/cts-cli.in +@@ -1870,7 +1870,7 @@ for t in $tests; do + -e 's/.*\(unpack_.*\)@.*\.c:[0-9][0-9]*)/\1/g' \ + -e 's/.*\(update_validation\)@.*\.c:[0-9][0-9]*)/\1/g' \ + -e 's/.*\(apply_upgrade\)@.*\.c:[0-9][0-9]*)/\1/g' \ +- -e 's/ last-rc-change=\"[A-Za-z0-9: ]*\"//'\ ++ -e "s/ last-rc-change=['\"][-+A-Za-z0-9: ]*['\"],\{0,1\}//" \ + -e 's|^/tmp/cts-cli\.validity\.bad.xml\.[^:]*:|validity.bad.xml:|'\ + -e 's/^Entity: line [0-9][0-9]*: //'\ + -e 's/\(validation ([0-9][0-9]* of \)[0-9][0-9]*\().*\)/\1X\2/' \ +-- +2.27.0 + + +From dea61f1b6507fbc978e040c1555384d8d7ffa9f3 Mon Sep 17 00:00:00 2001 +From: Chris Lumens +Date: Wed, 1 Dec 2021 16:23:14 -0500 +Subject: [PATCH 19/21] Fix: include: Bump feature set to 3.12.0. + +This is for the scheduler handling changing regarding maskable probe +failures. + +See: rhbz#1506372. 
+--- + include/crm/crm.h | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/include/crm/crm.h b/include/crm/crm.h +index 04d2324d75..16b35e9c55 100644 +--- a/include/crm/crm.h ++++ b/include/crm/crm.h +@@ -66,7 +66,7 @@ extern "C" { + * >=3.0.13: Fail counts include operation name and interval + * >=3.2.0: DC supports PCMK_EXEC_INVALID and PCMK_EXEC_NOT_CONNECTED + */ +-# define CRM_FEATURE_SET "3.11.0" ++# define CRM_FEATURE_SET "3.12.0" + + /* Pacemaker's CPG protocols use fixed-width binary fields for the sender and + * recipient of a CPG message. This imposes an arbitrary limit on cluster node +-- +2.27.0 + + +From fef2c61ef462c221809dc91467ea1e96d5478c74 Mon Sep 17 00:00:00 2001 +From: Chris Lumens +Date: Mon, 6 Dec 2021 16:42:15 -0500 +Subject: [PATCH 20/21] Feature: scheduler: Handle masked probes in the + scheduler. + +These probe operations get their rc/status codes mapped to not +running/done, but still ensures they end up in the list of failed +operations so tool output continues to display them properly. + +Note that failures on bundled resources do not get masked. + +There are no test case changes for this patch. + +See: rhbz#1506372. +--- + lib/pengine/unpack.c | 42 +++++++++++++++++++++++++++++++++++++----- + 1 file changed, 37 insertions(+), 5 deletions(-) + +diff --git a/lib/pengine/unpack.c b/lib/pengine/unpack.c +index b659f319fb..f3583e97d8 100644 +--- a/lib/pengine/unpack.c ++++ b/lib/pengine/unpack.c +@@ -3169,6 +3169,11 @@ remap_operation(xmlNode *xml_op, pe_resource_t *rsc, pe_node_t *node, + } + } + ++ if (!pe_rsc_is_bundled(rsc) && pcmk_xe_mask_probe_failure(xml_op)) { ++ *status = PCMK_EXEC_DONE; ++ *rc = PCMK_OCF_NOT_RUNNING; ++ } ++ + /* If the executor reported an operation status of anything but done or + * error, consider that final. 
But for done or error, we know better whether + * it should be treated as a failure or not, because we know the expected +@@ -3567,12 +3572,12 @@ update_resource_state(pe_resource_t * rsc, pe_node_t * node, xmlNode * xml_op, c + CRM_ASSERT(rsc); + CRM_ASSERT(xml_op); + +- if (rc == PCMK_OCF_NOT_RUNNING) { +- clear_past_failure = TRUE; +- +- } else if (rc == PCMK_OCF_NOT_INSTALLED) { ++ if (rc == PCMK_OCF_NOT_INSTALLED || (!pe_rsc_is_bundled(rsc) && pcmk_xe_mask_probe_failure(xml_op))) { + rsc->role = RSC_ROLE_STOPPED; + ++ } else if (rc == PCMK_OCF_NOT_RUNNING) { ++ clear_past_failure = TRUE; ++ + } else if (pcmk__str_eq(task, CRMD_ACTION_STATUS, pcmk__str_casei)) { + if (last_failure) { + const char *op_key = get_op_key(xml_op); +@@ -3661,8 +3666,10 @@ unpack_rsc_op(pe_resource_t *rsc, pe_node_t *node, xmlNode *xml_op, + pe_working_set_t *data_set) + { + int rc = 0; ++ int old_rc = 0; + int task_id = 0; + int target_rc = 0; ++ int old_target_rc = 0; + int status = PCMK_EXEC_UNKNOWN; + guint interval_ms = 0; + const char *task = NULL; +@@ -3671,6 +3678,7 @@ unpack_rsc_op(pe_resource_t *rsc, pe_node_t *node, xmlNode *xml_op, + bool expired = false; + pe_resource_t *parent = rsc; + enum action_fail_response failure_strategy = action_fail_recover; ++ bool maskable_probe_failure = false; + + CRM_CHECK(rsc && node && xml_op, return); + +@@ -3727,10 +3735,22 @@ unpack_rsc_op(pe_resource_t *rsc, pe_node_t *node, xmlNode *xml_op, + expired = true; + } + ++ old_rc = rc; ++ old_target_rc = target_rc; ++ + remap_operation(xml_op, rsc, node, data_set, on_fail, target_rc, + &rc, &status); + +- if (expired && (rc != target_rc)) { ++ maskable_probe_failure = !pe_rsc_is_bundled(rsc) && pcmk_xe_mask_probe_failure(xml_op); ++ ++ if (expired && maskable_probe_failure && old_rc != old_target_rc) { ++ if (rsc->role <= RSC_ROLE_STOPPED) { ++ rsc->role = RSC_ROLE_UNKNOWN; ++ } ++ ++ goto done; ++ ++ } else if (expired && (rc != target_rc)) { + const char *magic = 
crm_element_value(xml_op, XML_ATTR_TRANSITION_MAGIC); + + if (interval_ms == 0) { +@@ -3758,6 +3778,18 @@ unpack_rsc_op(pe_resource_t *rsc, pe_node_t *node, xmlNode *xml_op, + } + } + ++ if (maskable_probe_failure) { ++ crm_notice("Treating probe result '%s' for %s on %s as 'not running'", ++ services_ocf_exitcode_str(rc), rsc->id, node->details->uname); ++ update_resource_state(rsc, node, xml_op, task, target_rc, *last_failure, ++ on_fail, data_set); ++ crm_xml_add(xml_op, XML_ATTR_UNAME, node->details->uname); ++ ++ record_failed_op(xml_op, node, rsc, data_set); ++ resource_location(parent, node, -INFINITY, "masked-probe-failure", data_set); ++ goto done; ++ } ++ + switch (status) { + case PCMK_EXEC_CANCELLED: + // Should never happen +-- +2.27.0 + + +From ccff6eb60598f389008b0621447056457da79671 Mon Sep 17 00:00:00 2001 +From: Chris Lumens +Date: Tue, 4 Jan 2022 10:14:48 -0500 +Subject: [PATCH 21/21] Test: scheduler: Add tests for expired, masked probe + failures. + +dummy-1 is a stopped resource with an expired masked probe failure. +This probe should be rescheduled. dummy-2 is a started resource with an +expired masked probe failure. This probe should not be rescheduled. 
+--- + cts/cts-scheduler.in | 1 + + .../dot/expired-failed-probe-primitive.dot | 8 ++ + .../exp/expired-failed-probe-primitive.exp | 45 ++++++++++++ + .../expired-failed-probe-primitive.scores | 7 ++ + .../expired-failed-probe-primitive.summary | 26 +++++++ + .../xml/expired-failed-probe-primitive.xml | 73 +++++++++++++++++++ + 6 files changed, 160 insertions(+) + create mode 100644 cts/scheduler/dot/expired-failed-probe-primitive.dot + create mode 100644 cts/scheduler/exp/expired-failed-probe-primitive.exp + create mode 100644 cts/scheduler/scores/expired-failed-probe-primitive.scores + create mode 100644 cts/scheduler/summary/expired-failed-probe-primitive.summary + create mode 100644 cts/scheduler/xml/expired-failed-probe-primitive.xml + +diff --git a/cts/cts-scheduler.in b/cts/cts-scheduler.in +index 3abcbc6c9d..7bc41a0936 100644 +--- a/cts/cts-scheduler.in ++++ b/cts/cts-scheduler.in +@@ -115,6 +115,7 @@ TESTS = [ + [ "probe-pending-node", "Probe (pending node + unmanaged resource)" ], + [ "failed-probe-primitive", "Maskable vs. unmaskable probe failures on primitive resources" ], + [ "failed-probe-clone", "Maskable vs. 
unmaskable probe failures on cloned resources" ], ++ [ "expired-failed-probe-primitive", "Maskable, expired probe failure on primitive resources" ], + [ "standby", "Standby" ], + [ "comments", "Comments" ], + ], +diff --git a/cts/scheduler/dot/expired-failed-probe-primitive.dot b/cts/scheduler/dot/expired-failed-probe-primitive.dot +new file mode 100644 +index 0000000000..610c2b8047 +--- /dev/null ++++ b/cts/scheduler/dot/expired-failed-probe-primitive.dot +@@ -0,0 +1,8 @@ ++ digraph "g" { ++"dummy-1_monitor_0 cluster01" -> "dummy-1_start_0 cluster02" [ style = bold] ++"dummy-1_monitor_0 cluster01" [ style=bold color="green" fontcolor="black"] ++"dummy-1_monitor_0 cluster02" -> "dummy-1_start_0 cluster02" [ style = bold] ++"dummy-1_monitor_0 cluster02" [ style=bold color="green" fontcolor="black"] ++"dummy-1_start_0 cluster02" [ style=bold color="green" fontcolor="black"] ++"dummy-2_monitor_0 cluster01" [ style=bold color="green" fontcolor="black"] ++} +diff --git a/cts/scheduler/exp/expired-failed-probe-primitive.exp b/cts/scheduler/exp/expired-failed-probe-primitive.exp +new file mode 100644 +index 0000000000..3c2cbfe411 +--- /dev/null ++++ b/cts/scheduler/exp/expired-failed-probe-primitive.exp +@@ -0,0 +1,45 @@ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ +diff --git a/cts/scheduler/scores/expired-failed-probe-primitive.scores b/cts/scheduler/scores/expired-failed-probe-primitive.scores +new file mode 100644 +index 0000000000..51ae5510e6 +--- /dev/null ++++ b/cts/scheduler/scores/expired-failed-probe-primitive.scores +@@ -0,0 +1,7 @@ ++ ++pcmk__native_allocate: Fencing allocation score on cluster01: 0 ++pcmk__native_allocate: Fencing allocation score on cluster02: 0 ++pcmk__native_allocate: dummy-1 allocation score on cluster01: 0 ++pcmk__native_allocate: dummy-1 allocation score on cluster02: 0 ++pcmk__native_allocate: dummy-2 allocation score on cluster01: 0 
++pcmk__native_allocate: dummy-2 allocation score on cluster02: 0 +diff --git a/cts/scheduler/summary/expired-failed-probe-primitive.summary b/cts/scheduler/summary/expired-failed-probe-primitive.summary +new file mode 100644 +index 0000000000..ac0604e84f +--- /dev/null ++++ b/cts/scheduler/summary/expired-failed-probe-primitive.summary +@@ -0,0 +1,26 @@ ++Current cluster status: ++ * Node List: ++ * Online: [ cluster01 cluster02 ] ++ ++ * Full List of Resources: ++ * Fencing (stonith:fence_xvm): Started cluster01 ++ * dummy-1 (ocf:pacemaker:Dummy): Stopped ++ * dummy-2 (ocf:pacemaker:Dummy): Started cluster02 ++ ++Transition Summary: ++ * Start dummy-1 ( cluster02 ) ++ ++Executing Cluster Transition: ++ * Resource action: dummy-1 monitor on cluster02 ++ * Resource action: dummy-1 monitor on cluster01 ++ * Resource action: dummy-2 monitor on cluster01 ++ * Resource action: dummy-1 start on cluster02 ++ ++Revised Cluster Status: ++ * Node List: ++ * Online: [ cluster01 cluster02 ] ++ ++ * Full List of Resources: ++ * Fencing (stonith:fence_xvm): Started cluster01 ++ * dummy-1 (ocf:pacemaker:Dummy): Started cluster02 ++ * dummy-2 (ocf:pacemaker:Dummy): Started cluster02 +diff --git a/cts/scheduler/xml/expired-failed-probe-primitive.xml b/cts/scheduler/xml/expired-failed-probe-primitive.xml +new file mode 100644 +index 0000000000..684aa73f92 +--- /dev/null ++++ b/cts/scheduler/xml/expired-failed-probe-primitive.xml +@@ -0,0 +1,73 @@ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ +-- +2.27.0 + diff --git a/SOURCES/010-probe-pending.patch b/SOURCES/010-probe-pending.patch deleted file mode 100644 index 336c33e..0000000 --- a/SOURCES/010-probe-pending.patch +++ /dev/null @@ -1,715 +0,0 @@ -From b0347f7b8e609420a7055d5fe537cc40ac0d1bb2 Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Fri, 16 Jul 2021 
11:08:05 -0500 -Subject: [PATCH 1/3] Fix: scheduler: don't schedule probes of unmanaged - resources on pending nodes - -Previously, custom_action() would set an action's optional or runnable flag in -the same, exclusive if-else sequence. This means that if an action should be -optional *and* runnable, only one would be set. In particular, this meant that -if a resource is unmanaged *and* its allocated node is pending, any probe would -be set to optional, but not unrunnable, and the controller could wrongly -attempt the probe before the join completed. - -Now, optional is checked separately. ---- - lib/pengine/utils.c | 22 ++++++++++++++-------- - 1 file changed, 14 insertions(+), 8 deletions(-) - -diff --git a/lib/pengine/utils.c b/lib/pengine/utils.c -index 5ef742e..965824b 100644 ---- a/lib/pengine/utils.c -+++ b/lib/pengine/utils.c -@@ -541,6 +541,20 @@ custom_action(pe_resource_t * rsc, char *key, const char *task, - FALSE, data_set); - } - -+ // Make the action optional if its resource is unmanaged -+ if (!pcmk_is_set(action->flags, pe_action_pseudo) -+ && (action->node != NULL) -+ && !pcmk_is_set(action->rsc->flags, pe_rsc_managed) -+ && (g_hash_table_lookup(action->meta, -+ XML_LRM_ATTR_INTERVAL_MS) == NULL)) { -+ pe_rsc_debug(rsc, "%s on %s is optional (%s is unmanaged)", -+ action->uuid, action->node->details->uname, -+ action->rsc->id); -+ pe__set_action_flags(action, pe_action_optional); -+ // We shouldn't clear runnable here because ... 
something -+ } -+ -+ // Make the action runnable or unrunnable as appropriate - if (pcmk_is_set(action->flags, pe_action_pseudo)) { - /* leave untouched */ - -@@ -549,14 +563,6 @@ custom_action(pe_resource_t * rsc, char *key, const char *task, - action->uuid); - pe__clear_action_flags(action, pe_action_runnable); - -- } else if (!pcmk_is_set(rsc->flags, pe_rsc_managed) -- && g_hash_table_lookup(action->meta, -- XML_LRM_ATTR_INTERVAL_MS) == NULL) { -- pe_rsc_debug(rsc, "%s on %s is optional (%s is unmanaged)", -- action->uuid, action->node->details->uname, rsc->id); -- pe__set_action_flags(action, pe_action_optional); -- //pe__clear_action_flags(action, pe_action_runnable); -- - } else if (!pcmk_is_set(action->flags, pe_action_dc) - && !(action->node->details->online) - && (!pe__is_guest_node(action->node) --- -1.8.3.1 - - -From 520303b90eb707f5b7a9afa9b106e4a38b90f0f9 Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Wed, 14 Jul 2021 17:18:44 -0500 -Subject: [PATCH 2/3] Test: scheduler: update existing tests for probe - scheduling change - -This is an improvement. Looking at bundle-probe-order-2 for example, -the bundle's first instance has this status to start: - - * Replica[0] - * galera (ocf::heartbeat:galera): Stopped (unmanaged) - * galera-bundle-docker-0 (ocf::heartbeat:docker): Started centos2 (unmanaged) - * galera-bundle-0 (ocf::pacemaker:remote): Started centos2 (unmanaged) - -After the changes, we now schedule recurring monitors for -galera-bundle-docker-0 and galera-bundle-0 on centos2, and a probe of galera:0 -on galera-bundle-0, all of which are possible. 
---- - cts/scheduler/dot/bundle-probe-order-2.dot | 3 ++ - cts/scheduler/dot/bundle-probe-order-3.dot | 1 + - cts/scheduler/exp/bundle-probe-order-2.exp | 33 ++++++++++++++++++++-- - cts/scheduler/exp/bundle-probe-order-3.exp | 21 ++++++++++---- - cts/scheduler/summary/bundle-probe-order-2.summary | 3 ++ - cts/scheduler/summary/bundle-probe-order-3.summary | 1 + - 6 files changed, 53 insertions(+), 9 deletions(-) - -diff --git a/cts/scheduler/dot/bundle-probe-order-2.dot b/cts/scheduler/dot/bundle-probe-order-2.dot -index 0cce3fd..7706195 100644 ---- a/cts/scheduler/dot/bundle-probe-order-2.dot -+++ b/cts/scheduler/dot/bundle-probe-order-2.dot -@@ -1,6 +1,9 @@ - digraph "g" { -+"galera-bundle-0_monitor_30000 centos2" [ style=bold color="green" fontcolor="black"] -+"galera-bundle-docker-0_monitor_60000 centos2" [ style=bold color="green" fontcolor="black"] - "galera-bundle-docker-1_monitor_0 centos2" [ style=bold color="green" fontcolor="black"] - "galera-bundle-docker-2_monitor_0 centos1" [ style=bold color="green" fontcolor="black"] - "galera-bundle-docker-2_monitor_0 centos2" [ style=bold color="green" fontcolor="black"] - "galera-bundle-docker-2_monitor_0 centos3" [ style=bold color="green" fontcolor="black"] -+"galera:0_monitor_0 galera-bundle-0" [ style=bold color="green" fontcolor="black"] - } -diff --git a/cts/scheduler/dot/bundle-probe-order-3.dot b/cts/scheduler/dot/bundle-probe-order-3.dot -index a4b109f..53a384b 100644 ---- a/cts/scheduler/dot/bundle-probe-order-3.dot -+++ b/cts/scheduler/dot/bundle-probe-order-3.dot -@@ -2,6 +2,7 @@ - "galera-bundle-0_monitor_0 centos1" [ style=bold color="green" fontcolor="black"] - "galera-bundle-0_monitor_0 centos2" [ style=bold color="green" fontcolor="black"] - "galera-bundle-0_monitor_0 centos3" [ style=bold color="green" fontcolor="black"] -+"galera-bundle-docker-0_monitor_60000 centos2" [ style=bold color="green" fontcolor="black"] - "galera-bundle-docker-1_monitor_0 centos2" [ style=bold color="green" 
fontcolor="black"] - "galera-bundle-docker-2_monitor_0 centos1" [ style=bold color="green" fontcolor="black"] - "galera-bundle-docker-2_monitor_0 centos2" [ style=bold color="green" fontcolor="black"] -diff --git a/cts/scheduler/exp/bundle-probe-order-2.exp b/cts/scheduler/exp/bundle-probe-order-2.exp -index d6174e7..5b28050 100644 ---- a/cts/scheduler/exp/bundle-probe-order-2.exp -+++ b/cts/scheduler/exp/bundle-probe-order-2.exp -@@ -1,6 +1,33 @@ - - - -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ - - - -@@ -8,7 +35,7 @@ - - - -- -+ - - - -@@ -17,7 +44,7 @@ - - - -- -+ - - - -@@ -26,7 +53,7 @@ - - - -- -+ - - - -diff --git a/cts/scheduler/exp/bundle-probe-order-3.exp b/cts/scheduler/exp/bundle-probe-order-3.exp -index e1f60e7..69140a4 100644 ---- a/cts/scheduler/exp/bundle-probe-order-3.exp -+++ b/cts/scheduler/exp/bundle-probe-order-3.exp -@@ -1,6 +1,15 @@ - - - -+ -+ -+ -+ -+ -+ -+ -+ -+ - - - -@@ -8,7 +17,7 @@ - - - -- -+ - - - -@@ -17,7 +26,7 @@ - - - -- -+ - - - -@@ -26,7 +35,7 @@ - - - -- -+ - - - -@@ -35,7 +44,7 @@ - - - -- -+ - - - -@@ -44,7 +53,7 @@ - - - -- -+ - - - -@@ -53,7 +62,7 @@ - - - -- -+ - - - -diff --git a/cts/scheduler/summary/bundle-probe-order-2.summary b/cts/scheduler/summary/bundle-probe-order-2.summary -index 681d607..024c472 100644 ---- a/cts/scheduler/summary/bundle-probe-order-2.summary -+++ b/cts/scheduler/summary/bundle-probe-order-2.summary -@@ -13,6 +13,9 @@ Current cluster status: - Transition Summary: - - Executing Cluster Transition: -+ * Resource action: galera:0 monitor on galera-bundle-0 -+ * Resource action: galera-bundle-docker-0 monitor=60000 on centos2 -+ * Resource action: galera-bundle-0 monitor=30000 on centos2 - * Resource action: galera-bundle-docker-1 monitor on centos2 - * Resource action: galera-bundle-docker-2 monitor on centos3 - * Resource action: galera-bundle-docker-2 monitor on centos2 -diff --git a/cts/scheduler/summary/bundle-probe-order-3.summary 
b/cts/scheduler/summary/bundle-probe-order-3.summary -index f089618..331bd87 100644 ---- a/cts/scheduler/summary/bundle-probe-order-3.summary -+++ b/cts/scheduler/summary/bundle-probe-order-3.summary -@@ -12,6 +12,7 @@ Current cluster status: - Transition Summary: - - Executing Cluster Transition: -+ * Resource action: galera-bundle-docker-0 monitor=60000 on centos2 - * Resource action: galera-bundle-0 monitor on centos3 - * Resource action: galera-bundle-0 monitor on centos2 - * Resource action: galera-bundle-0 monitor on centos1 --- -1.8.3.1 - - -From cb9c294a7ef22916866e0e42e51e88c2b1a61c2e Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Wed, 14 Jul 2021 17:23:11 -0500 -Subject: [PATCH 3/3] Test: scheduler: add test for probe of unmanaged resource - on pending node - -No probes should be scheduled in this case ---- - cts/cts-scheduler.in | 1 + - cts/scheduler/dot/probe-pending-node.dot | 2 + - cts/scheduler/exp/probe-pending-node.exp | 1 + - cts/scheduler/scores/probe-pending-node.scores | 61 ++++++ - cts/scheduler/summary/probe-pending-node.summary | 55 +++++ - cts/scheduler/xml/probe-pending-node.xml | 247 +++++++++++++++++++++++ - 6 files changed, 367 insertions(+) - create mode 100644 cts/scheduler/dot/probe-pending-node.dot - create mode 100644 cts/scheduler/exp/probe-pending-node.exp - create mode 100644 cts/scheduler/scores/probe-pending-node.scores - create mode 100644 cts/scheduler/summary/probe-pending-node.summary - create mode 100644 cts/scheduler/xml/probe-pending-node.xml - -diff --git a/cts/cts-scheduler.in b/cts/cts-scheduler.in -index fc9790b..7ba2415 100644 ---- a/cts/cts-scheduler.in -+++ b/cts/cts-scheduler.in -@@ -110,6 +110,7 @@ TESTS = [ - [ "probe-2", "Correctly re-probe cloned groups" ], - [ "probe-3", "Probe (pending node)" ], - [ "probe-4", "Probe (pending node + stopped resource)" ], -+ [ "probe-pending-node", "Probe (pending node + unmanaged resource)" ], - [ "standby", "Standby" ], - [ "comments", "Comments" ], - ], -diff --git 
a/cts/scheduler/dot/probe-pending-node.dot b/cts/scheduler/dot/probe-pending-node.dot -new file mode 100644 -index 0000000..d8f1c9f ---- /dev/null -+++ b/cts/scheduler/dot/probe-pending-node.dot -@@ -0,0 +1,2 @@ -+ digraph "g" { -+} -diff --git a/cts/scheduler/exp/probe-pending-node.exp b/cts/scheduler/exp/probe-pending-node.exp -new file mode 100644 -index 0000000..56e315f ---- /dev/null -+++ b/cts/scheduler/exp/probe-pending-node.exp -@@ -0,0 +1 @@ -+ -diff --git a/cts/scheduler/scores/probe-pending-node.scores b/cts/scheduler/scores/probe-pending-node.scores -new file mode 100644 -index 0000000..020a1a0 ---- /dev/null -+++ b/cts/scheduler/scores/probe-pending-node.scores -@@ -0,0 +1,61 @@ -+ -+pcmk__clone_allocate: fs_UC5_SAPMNT-clone allocation score on gcdoubwap01: 0 -+pcmk__clone_allocate: fs_UC5_SAPMNT-clone allocation score on gcdoubwap02: 0 -+pcmk__clone_allocate: fs_UC5_SAPMNT:0 allocation score on gcdoubwap01: 0 -+pcmk__clone_allocate: fs_UC5_SAPMNT:0 allocation score on gcdoubwap02: 0 -+pcmk__clone_allocate: fs_UC5_SAPMNT:1 allocation score on gcdoubwap01: 0 -+pcmk__clone_allocate: fs_UC5_SAPMNT:1 allocation score on gcdoubwap02: 0 -+pcmk__clone_allocate: fs_UC5_SYS-clone allocation score on gcdoubwap01: 0 -+pcmk__clone_allocate: fs_UC5_SYS-clone allocation score on gcdoubwap02: 0 -+pcmk__clone_allocate: fs_UC5_SYS:0 allocation score on gcdoubwap01: 0 -+pcmk__clone_allocate: fs_UC5_SYS:0 allocation score on gcdoubwap02: 0 -+pcmk__clone_allocate: fs_UC5_SYS:1 allocation score on gcdoubwap01: 0 -+pcmk__clone_allocate: fs_UC5_SYS:1 allocation score on gcdoubwap02: 0 -+pcmk__group_allocate: fs_UC5_ascs allocation score on gcdoubwap01: 0 -+pcmk__group_allocate: fs_UC5_ascs allocation score on gcdoubwap02: 0 -+pcmk__group_allocate: fs_UC5_ers allocation score on gcdoubwap01: 0 -+pcmk__group_allocate: fs_UC5_ers allocation score on gcdoubwap02: 0 -+pcmk__group_allocate: grp_UC5_ascs allocation score on gcdoubwap01: 0 -+pcmk__group_allocate: grp_UC5_ascs 
allocation score on gcdoubwap02: 0 -+pcmk__group_allocate: grp_UC5_ers allocation score on gcdoubwap01: 0 -+pcmk__group_allocate: grp_UC5_ers allocation score on gcdoubwap02: 0 -+pcmk__group_allocate: rsc_sap_UC5_ASCS11 allocation score on gcdoubwap01: 0 -+pcmk__group_allocate: rsc_sap_UC5_ASCS11 allocation score on gcdoubwap02: 0 -+pcmk__group_allocate: rsc_sap_UC5_ERS12 allocation score on gcdoubwap01: 0 -+pcmk__group_allocate: rsc_sap_UC5_ERS12 allocation score on gcdoubwap02: 0 -+pcmk__group_allocate: rsc_vip_gcp_ascs allocation score on gcdoubwap01: INFINITY -+pcmk__group_allocate: rsc_vip_gcp_ascs allocation score on gcdoubwap02: 0 -+pcmk__group_allocate: rsc_vip_gcp_ers allocation score on gcdoubwap01: 0 -+pcmk__group_allocate: rsc_vip_gcp_ers allocation score on gcdoubwap02: 0 -+pcmk__group_allocate: rsc_vip_init_ers allocation score on gcdoubwap01: 0 -+pcmk__group_allocate: rsc_vip_init_ers allocation score on gcdoubwap02: 0 -+pcmk__group_allocate: rsc_vip_int_ascs allocation score on gcdoubwap01: 0 -+pcmk__group_allocate: rsc_vip_int_ascs allocation score on gcdoubwap02: 0 -+pcmk__native_allocate: fs_UC5_SAPMNT:0 allocation score on gcdoubwap01: 0 -+pcmk__native_allocate: fs_UC5_SAPMNT:0 allocation score on gcdoubwap02: -INFINITY -+pcmk__native_allocate: fs_UC5_SAPMNT:1 allocation score on gcdoubwap01: 0 -+pcmk__native_allocate: fs_UC5_SAPMNT:1 allocation score on gcdoubwap02: -INFINITY -+pcmk__native_allocate: fs_UC5_SYS:0 allocation score on gcdoubwap01: 0 -+pcmk__native_allocate: fs_UC5_SYS:0 allocation score on gcdoubwap02: -INFINITY -+pcmk__native_allocate: fs_UC5_SYS:1 allocation score on gcdoubwap01: 0 -+pcmk__native_allocate: fs_UC5_SYS:1 allocation score on gcdoubwap02: -INFINITY -+pcmk__native_allocate: fs_UC5_ascs allocation score on gcdoubwap01: 0 -+pcmk__native_allocate: fs_UC5_ascs allocation score on gcdoubwap02: -INFINITY -+pcmk__native_allocate: fs_UC5_ers allocation score on gcdoubwap01: -INFINITY -+pcmk__native_allocate: fs_UC5_ers 
allocation score on gcdoubwap02: -INFINITY -+pcmk__native_allocate: rsc_sap_UC5_ASCS11 allocation score on gcdoubwap01: -INFINITY -+pcmk__native_allocate: rsc_sap_UC5_ASCS11 allocation score on gcdoubwap02: -INFINITY -+pcmk__native_allocate: rsc_sap_UC5_ERS12 allocation score on gcdoubwap01: -INFINITY -+pcmk__native_allocate: rsc_sap_UC5_ERS12 allocation score on gcdoubwap02: -INFINITY -+pcmk__native_allocate: rsc_vip_gcp_ascs allocation score on gcdoubwap01: -INFINITY -+pcmk__native_allocate: rsc_vip_gcp_ascs allocation score on gcdoubwap02: -INFINITY -+pcmk__native_allocate: rsc_vip_gcp_ers allocation score on gcdoubwap01: -INFINITY -+pcmk__native_allocate: rsc_vip_gcp_ers allocation score on gcdoubwap02: -INFINITY -+pcmk__native_allocate: rsc_vip_init_ers allocation score on gcdoubwap01: 0 -+pcmk__native_allocate: rsc_vip_init_ers allocation score on gcdoubwap02: -INFINITY -+pcmk__native_allocate: rsc_vip_int_ascs allocation score on gcdoubwap01: INFINITY -+pcmk__native_allocate: rsc_vip_int_ascs allocation score on gcdoubwap02: -INFINITY -+pcmk__native_allocate: stonith_gcdoubwap01 allocation score on gcdoubwap01: -INFINITY -+pcmk__native_allocate: stonith_gcdoubwap01 allocation score on gcdoubwap02: 0 -+pcmk__native_allocate: stonith_gcdoubwap02 allocation score on gcdoubwap01: 0 -+pcmk__native_allocate: stonith_gcdoubwap02 allocation score on gcdoubwap02: -INFINITY -diff --git a/cts/scheduler/summary/probe-pending-node.summary b/cts/scheduler/summary/probe-pending-node.summary -new file mode 100644 -index 0000000..208186b ---- /dev/null -+++ b/cts/scheduler/summary/probe-pending-node.summary -@@ -0,0 +1,55 @@ -+Using the original execution date of: 2021-06-11 13:55:24Z -+ -+ *** Resource management is DISABLED *** -+ The cluster will not attempt to start, stop or recover services -+ -+Current cluster status: -+ * Node List: -+ * Node gcdoubwap02: pending -+ * Online: [ gcdoubwap01 ] -+ -+ * Full List of Resources: -+ * stonith_gcdoubwap01 (stonith:fence_gce): 
Stopped (unmanaged) -+ * stonith_gcdoubwap02 (stonith:fence_gce): Stopped (unmanaged) -+ * Clone Set: fs_UC5_SAPMNT-clone [fs_UC5_SAPMNT] (unmanaged): -+ * Stopped: [ gcdoubwap01 gcdoubwap02 ] -+ * Clone Set: fs_UC5_SYS-clone [fs_UC5_SYS] (unmanaged): -+ * Stopped: [ gcdoubwap01 gcdoubwap02 ] -+ * Resource Group: grp_UC5_ascs (unmanaged): -+ * rsc_vip_int_ascs (ocf:heartbeat:IPaddr2): Stopped (unmanaged) -+ * rsc_vip_gcp_ascs (ocf:heartbeat:gcp-vpc-move-vip): Started gcdoubwap01 (unmanaged) -+ * fs_UC5_ascs (ocf:heartbeat:Filesystem): Stopped (unmanaged) -+ * rsc_sap_UC5_ASCS11 (ocf:heartbeat:SAPInstance): Stopped (unmanaged) -+ * Resource Group: grp_UC5_ers (unmanaged): -+ * rsc_vip_init_ers (ocf:heartbeat:IPaddr2): Stopped (unmanaged) -+ * rsc_vip_gcp_ers (ocf:heartbeat:gcp-vpc-move-vip): Stopped (unmanaged) -+ * fs_UC5_ers (ocf:heartbeat:Filesystem): Stopped (unmanaged) -+ * rsc_sap_UC5_ERS12 (ocf:heartbeat:SAPInstance): Stopped (unmanaged) -+ -+Transition Summary: -+ -+Executing Cluster Transition: -+Using the original execution date of: 2021-06-11 13:55:24Z -+ -+Revised Cluster Status: -+ * Node List: -+ * Node gcdoubwap02: pending -+ * Online: [ gcdoubwap01 ] -+ -+ * Full List of Resources: -+ * stonith_gcdoubwap01 (stonith:fence_gce): Stopped (unmanaged) -+ * stonith_gcdoubwap02 (stonith:fence_gce): Stopped (unmanaged) -+ * Clone Set: fs_UC5_SAPMNT-clone [fs_UC5_SAPMNT] (unmanaged): -+ * Stopped: [ gcdoubwap01 gcdoubwap02 ] -+ * Clone Set: fs_UC5_SYS-clone [fs_UC5_SYS] (unmanaged): -+ * Stopped: [ gcdoubwap01 gcdoubwap02 ] -+ * Resource Group: grp_UC5_ascs (unmanaged): -+ * rsc_vip_int_ascs (ocf:heartbeat:IPaddr2): Stopped (unmanaged) -+ * rsc_vip_gcp_ascs (ocf:heartbeat:gcp-vpc-move-vip): Started gcdoubwap01 (unmanaged) -+ * fs_UC5_ascs (ocf:heartbeat:Filesystem): Stopped (unmanaged) -+ * rsc_sap_UC5_ASCS11 (ocf:heartbeat:SAPInstance): Stopped (unmanaged) -+ * Resource Group: grp_UC5_ers (unmanaged): -+ * rsc_vip_init_ers (ocf:heartbeat:IPaddr2): Stopped 
(unmanaged) -+ * rsc_vip_gcp_ers (ocf:heartbeat:gcp-vpc-move-vip): Stopped (unmanaged) -+ * fs_UC5_ers (ocf:heartbeat:Filesystem): Stopped (unmanaged) -+ * rsc_sap_UC5_ERS12 (ocf:heartbeat:SAPInstance): Stopped (unmanaged) -diff --git a/cts/scheduler/xml/probe-pending-node.xml b/cts/scheduler/xml/probe-pending-node.xml -new file mode 100644 -index 0000000..9f55c92 ---- /dev/null -+++ b/cts/scheduler/xml/probe-pending-node.xml -@@ -0,0 +1,247 @@ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ --- -1.8.3.1 - diff --git a/SOURCES/011-crm_attribute-regression.patch b/SOURCES/011-crm_attribute-regression.patch deleted file mode 100644 index 7263313..0000000 --- a/SOURCES/011-crm_attribute-regression.patch +++ /dev/null @@ -1,150 +0,0 @@ -From ea5510dd979bb6d375324cda26925d9e7c4362f5 Mon Sep 17 00:00:00 2001 -From: Chris Lumens -Date: Mon, 19 Jul 2021 10:04:16 -0400 -Subject: [PATCH 1/2] Low: tools: The --get-value option does not require an - arg. - -Regression in 2.1.0 introduced by 15f5c2901. 
---- - tools/crm_attribute.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/tools/crm_attribute.c b/tools/crm_attribute.c -index 2cc8d26..8a5b4e4 100644 ---- a/tools/crm_attribute.c -+++ b/tools/crm_attribute.c -@@ -242,7 +242,7 @@ static GOptionEntry deprecated_entries[] = { - NULL, NULL - }, - -- { "get-value", 0, G_OPTION_FLAG_HIDDEN, G_OPTION_ARG_CALLBACK, value_cb, -+ { "get-value", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_NO_ARG, G_OPTION_ARG_CALLBACK, value_cb, - NULL, NULL - }, - --- -1.8.3.1 - - -From ef054d943afe8e60017f6adc4e25f88a59ac91a4 Mon Sep 17 00:00:00 2001 -From: Chris Lumens -Date: Mon, 19 Jul 2021 11:37:04 -0400 -Subject: [PATCH 2/2] Low: libcrmcommon: Allow negative numbers as cmdline - options. - -The bug here is that negative numbers (for instance, negative scores) -are not supported as command line arguments. Because we break up a -string that starts with a single dash into multiple arguments, "-1000" -becomes "-1", "-0", "-0", and "-0". - -Because we don't have enough information about what is happening on the -command line, the best we can do here is recognize something as a -negative number and pass it on. Any errors will have to be detected at -a later step. - -Also note that we only recognize negative numbers if they start with -1-9. Starting with 0 will be recognized as some sort of string. - -Regression in 2.1.0 caused by a long-standing bug in -pcmk__cmdline_preproc_test. 
---- - lib/common/cmdline.c | 29 ++++++++++++++++++++++ - .../tests/cmdline/pcmk__cmdline_preproc_test.c | 24 +++++++++++++++++- - 2 files changed, 52 insertions(+), 1 deletion(-) - -diff --git a/lib/common/cmdline.c b/lib/common/cmdline.c -index 7c95d02..9c1b810 100644 ---- a/lib/common/cmdline.c -+++ b/lib/common/cmdline.c -@@ -9,6 +9,7 @@ - - #include - -+#include - #include - - #include -@@ -189,6 +190,34 @@ pcmk__cmdline_preproc(char **argv, const char *special) { - /* Skip over leading dash */ - char *ch = argv[i]+1; - -+ /* This looks like the start of a number, which means it is a negative -+ * number. It's probably the argument to the preceeding option, but -+ * we can't know that here. Copy it over and let whatever handles -+ * arguments next figure it out. -+ */ -+ if (*ch != '\0' && *ch >= '1' && *ch <= '9') { -+ bool is_numeric = true; -+ -+ while (*ch != '\0') { -+ if (!isdigit(*ch)) { -+ is_numeric = false; -+ break; -+ } -+ -+ ch++; -+ } -+ -+ if (is_numeric) { -+ g_ptr_array_add(arr, g_strdup_printf("%s", argv[i])); -+ continue; -+ } else { -+ /* This argument wasn't entirely numeric. Reset ch to the -+ * beginning so we can process it one character at a time. -+ */ -+ ch = argv[i]+1; -+ } -+ } -+ - while (*ch != '\0') { - /* This is a special short argument that takes an option. getopt - * allows values to be interspersed with a list of arguments, but -diff --git a/lib/common/tests/cmdline/pcmk__cmdline_preproc_test.c b/lib/common/tests/cmdline/pcmk__cmdline_preproc_test.c -index b8506c6..9a752ef 100644 ---- a/lib/common/tests/cmdline/pcmk__cmdline_preproc_test.c -+++ b/lib/common/tests/cmdline/pcmk__cmdline_preproc_test.c -@@ -1,5 +1,5 @@ - /* -- * Copyright 2020 the Pacemaker project contributors -+ * Copyright 2020-2021 the Pacemaker project contributors - * - * The version control history for this file may have further details. 
- * -@@ -86,6 +86,26 @@ long_arg(void) { - g_strfreev(processed); - } - -+static void -+negative_score(void) { -+ const char *argv[] = { "-v", "-1000", NULL }; -+ const gchar *expected[] = { "-v", "-1000", NULL }; -+ -+ gchar **processed = pcmk__cmdline_preproc((char **) argv, "v"); -+ LISTS_EQ(processed, expected); -+ g_strfreev(processed); -+} -+ -+static void -+negative_score_2(void) { -+ const char *argv[] = { "-1i3", NULL }; -+ const gchar *expected[] = { "-1", "-i", "-3", NULL }; -+ -+ gchar **processed = pcmk__cmdline_preproc((char **) argv, NULL); -+ LISTS_EQ(processed, expected); -+ g_strfreev(processed); -+} -+ - int - main(int argc, char **argv) - { -@@ -98,5 +118,7 @@ main(int argc, char **argv) - g_test_add_func("/common/cmdline/preproc/special_args", special_args); - g_test_add_func("/common/cmdline/preproc/special_arg_at_end", special_arg_at_end); - g_test_add_func("/common/cmdline/preproc/long_arg", long_arg); -+ g_test_add_func("/common/cmdline/preproc/negative_score", negative_score); -+ g_test_add_func("/common/cmdline/preproc/negative_score_2", negative_score_2); - return g_test_run(); - } --- -1.8.3.1 - diff --git a/SOURCES/011-fencing-reasons.patch b/SOURCES/011-fencing-reasons.patch new file mode 100644 index 0000000..4422ca0 --- /dev/null +++ b/SOURCES/011-fencing-reasons.patch @@ -0,0 +1,1450 @@ +From 6db8e3adef0441953ec18dd0339c0a67c5c26bdf Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Tue, 14 Dec 2021 16:25:21 -0600 +Subject: [PATCH 01/17] Doc: Pacemaker Development: update for recent function + renames + +--- + doc/sphinx/Pacemaker_Development/components.rst | 16 ++++++++-------- + 1 file changed, 8 insertions(+), 8 deletions(-) + +diff --git a/doc/sphinx/Pacemaker_Development/components.rst b/doc/sphinx/Pacemaker_Development/components.rst +index a51220cac9..68158484ce 100644 +--- a/doc/sphinx/Pacemaker_Development/components.rst ++++ b/doc/sphinx/Pacemaker_Development/components.rst +@@ -106,7 +106,7 @@ or messaging layer 
callback, which calls: + the number of active peers), and if this is the last expected reply, + calls + +- * ``call_remote_stonith()``, which calculates the timeout and sends ++ * ``request_peer_fencing()``, which calculates the timeout and sends + ``STONITH_OP_FENCE`` request(s) to carry out the fencing. If the target + node has a fencing "topology" (which allows specifications such as + "this node can be fenced either with device A, or devices B and C in +@@ -156,7 +156,7 @@ returns, and calls + * done callback (``st_child_done()``), which calls ``schedule_stonith_command()`` + for a new device if there are further required actions to execute or if the + original action failed, then builds and sends an XML reply to the original +- fencer (via ``stonith_send_async_reply()``), then checks whether any ++ fencer (via ``send_async_reply()``), then checks whether any + pending actions are the same as the one just executed and merges them if so. + + Fencing replies +@@ -169,18 +169,18 @@ messaging layer callback, which calls: + + * ``handle_reply()``, which calls + +- * ``process_remote_stonith_exec()``, which calls either +- ``call_remote_stonith()`` (to retry a failed operation, or try the next +- device in a topology is appropriate, which issues a new ++ * ``fenced_process_fencing_reply()``, which calls either ++ ``request_peer_fencing()`` (to retry a failed operation, or try the next ++ device in a topology is appropriate, which issues a new + ``STONITH_OP_FENCE`` request, proceeding as before) or +- ``remote_op_done()`` (if the operation is definitively failed or ++ ``finalize_op()`` (if the operation is definitively failed or + successful). + +- * remote_op_done() broadcasts the result to all peers. ++ * ``finalize_op()`` broadcasts the result to all peers. + + Finally, all peers receive the broadcast result and call + +-* ``remote_op_done()``, which sends the result to all local clients. ++* ``finalize_op()``, which sends the result to all local clients. + + + .. 
index:: +-- +2.27.0 + + +From 47db9e5fb410b1e911710727d646eb7180a70c90 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Fri, 12 Nov 2021 09:58:16 -0600 +Subject: [PATCH 02/17] Refactor: fencing: add full result to fence action + callback data + +stonith_callback_data_t previously only contained the legacy return code for +the action. Use its new opaque member to store the full result, along with +accessors (available only internally for now). +--- + include/crm/fencing/internal.h | 3 ++ + lib/fencing/st_client.c | 99 ++++++++++++++++++++++++++-------- + 2 files changed, 81 insertions(+), 21 deletions(-) + +diff --git a/include/crm/fencing/internal.h b/include/crm/fencing/internal.h +index f0d294a0b3..eff689e59b 100644 +--- a/include/crm/fencing/internal.h ++++ b/include/crm/fencing/internal.h +@@ -187,6 +187,9 @@ bool stonith__event_state_eq(stonith_history_t *history, void *user_data); + bool stonith__event_state_neq(stonith_history_t *history, void *user_data); + + int stonith__legacy2status(int rc); ++int stonith__exit_status(stonith_callback_data_t *data); ++int stonith__execution_status(stonith_callback_data_t *data); ++const char *stonith__exit_reason(stonith_callback_data_t *data); + + /*! 
+ * \internal +diff --git a/lib/fencing/st_client.c b/lib/fencing/st_client.c +index 2ca094566b..9d93ffd481 100644 +--- a/lib/fencing/st_client.c ++++ b/lib/fencing/st_client.c +@@ -854,20 +854,23 @@ stonith_api_del_callback(stonith_t * stonith, int call_id, bool all_callbacks) + * \param[in] st Fencer API connection + * \param[in] call_id If positive, call ID of completed fence action, otherwise + * legacy return code for early action failure +- * \param[in] rc Legacy return code for action result ++ * \param[in] result Full result for action + * \param[in] userdata User data to pass to callback + * \param[in] callback Fence action callback to invoke + */ + static void +-invoke_fence_action_callback(stonith_t *st, int call_id, int rc, void *userdata, ++invoke_fence_action_callback(stonith_t *st, int call_id, ++ pcmk__action_result_t *result, ++ void *userdata, + void (*callback) (stonith_t *st, + stonith_callback_data_t *data)) + { + stonith_callback_data_t data = { 0, }; + + data.call_id = call_id; +- data.rc = rc; ++ data.rc = pcmk_rc2legacy(stonith__result2rc(result)); + data.userdata = userdata; ++ data.opaque = (void *) result; + + callback(st, &data); + } +@@ -888,7 +891,7 @@ invoke_registered_callbacks(stonith_t *stonith, xmlNode *msg, int call_id) + { + stonith_private_t *private = NULL; + stonith_callback_client_t *cb_info = NULL; +- int rc = pcmk_ok; ++ pcmk__action_result_t result = PCMK__UNKNOWN_RESULT; + + CRM_CHECK(stonith != NULL, return); + CRM_CHECK(stonith->st_private != NULL, return); +@@ -897,20 +900,17 @@ invoke_registered_callbacks(stonith_t *stonith, xmlNode *msg, int call_id) + + if (msg == NULL) { + // Fencer didn't reply in time +- rc = -ETIME; ++ pcmk__set_result(&result, CRM_EX_ERROR, PCMK_EXEC_TIMEOUT, ++ "Timeout waiting for reply from fencer"); + CRM_LOG_ASSERT(call_id > 0); + + } else { + // We have the fencer reply +- +- if (crm_element_value_int(msg, F_STONITH_RC, &rc) != 0) { +- rc = -pcmk_err_generic; +- } +- + if 
((crm_element_value_int(msg, F_STONITH_CALLID, &call_id) != 0) + || (call_id <= 0)) { + crm_log_xml_warn(msg, "Bad fencer reply"); + } ++ stonith__xe_get_result(msg, &result); + } + + if (call_id > 0) { +@@ -919,27 +919,29 @@ invoke_registered_callbacks(stonith_t *stonith, xmlNode *msg, int call_id) + } + + if ((cb_info != NULL) && (cb_info->callback != NULL) +- && (rc == pcmk_ok || !(cb_info->only_success))) { ++ && (pcmk__result_ok(&result) || !(cb_info->only_success))) { + crm_trace("Invoking callback %s for call %d", + crm_str(cb_info->id), call_id); +- invoke_fence_action_callback(stonith, call_id, rc, cb_info->user_data, +- cb_info->callback); ++ invoke_fence_action_callback(stonith, call_id, &result, ++ cb_info->user_data, cb_info->callback); + +- } else if ((private->op_callback == NULL) && (rc != pcmk_ok)) { +- crm_warn("Fencing action without registered callback failed: %s", +- pcmk_strerror(rc)); ++ } else if ((private->op_callback == NULL) && !pcmk__result_ok(&result)) { ++ crm_warn("Fencing action without registered callback failed: %d (%s)", ++ result.exit_status, ++ pcmk_exec_status_str(result.execution_status)); + crm_log_xml_debug(msg, "Failed fence update"); + } + + if (private->op_callback != NULL) { + crm_trace("Invoking global callback for call %d", call_id); +- invoke_fence_action_callback(stonith, call_id, rc, NULL, ++ invoke_fence_action_callback(stonith, call_id, &result, NULL, + private->op_callback); + } + + if (cb_info != NULL) { + stonith_api_del_callback(stonith, call_id, FALSE); + } ++ pcmk__reset_result(&result); + } + + static gboolean +@@ -1252,14 +1254,18 @@ stonith_api_add_callback(stonith_t * stonith, int call_id, int timeout, int opti + CRM_CHECK(stonith->st_private != NULL, return -EINVAL); + private = stonith->st_private; + +- if (call_id == 0) { ++ if (call_id == 0) { // Add global callback + private->op_callback = callback; + +- } else if (call_id < 0) { ++ } else if (call_id < 0) { // Call failed immediately, so call 
callback now + if (!(options & st_opt_report_only_success)) { ++ pcmk__action_result_t result = PCMK__UNKNOWN_RESULT; ++ + crm_trace("Call failed, calling %s: %s", callback_name, pcmk_strerror(call_id)); +- invoke_fence_action_callback(stonith, call_id, call_id, user_data, +- callback); ++ pcmk__set_result(&result, CRM_EX_ERROR, ++ stonith__legacy2status(call_id), NULL); ++ invoke_fence_action_callback(stonith, call_id, &result, ++ user_data, callback); + } else { + crm_warn("Fencer call failed: %s", pcmk_strerror(call_id)); + } +@@ -2293,6 +2299,57 @@ stonith__device_parameter_flags(uint32_t *device_flags, const char *device_name, + freeXpathObject(xpath); + } + ++/*! ++ * \internal ++ * \brief Return the exit status from an async action callback ++ * ++ * \param[in] data Callback data ++ * ++ * \return Exit status from callback data ++ */ ++int ++stonith__exit_status(stonith_callback_data_t *data) ++{ ++ if ((data == NULL) || (data->opaque == NULL)) { ++ return CRM_EX_ERROR; ++ } ++ return ((pcmk__action_result_t *) data->opaque)->exit_status; ++} ++ ++/*! ++ * \internal ++ * \brief Return the execution status from an async action callback ++ * ++ * \param[in] data Callback data ++ * ++ * \return Execution status from callback data ++ */ ++int ++stonith__execution_status(stonith_callback_data_t *data) ++{ ++ if ((data == NULL) || (data->opaque == NULL)) { ++ return PCMK_EXEC_UNKNOWN; ++ } ++ return ((pcmk__action_result_t *) data->opaque)->execution_status; ++} ++ ++/*! 
++ * \internal ++ * \brief Return the exit reason from an async action callback ++ * ++ * \param[in] data Callback data ++ * ++ * \return Exit reason from callback data ++ */ ++const char * ++stonith__exit_reason(stonith_callback_data_t *data) ++{ ++ if ((data == NULL) || (data->opaque == NULL)) { ++ return NULL; ++ } ++ return ((pcmk__action_result_t *) data->opaque)->exit_reason; ++} ++ + // Deprecated functions kept only for backward API compatibility + // LCOV_EXCL_START + +-- +2.27.0 + + +From 1e076370ef4ac7993b5ff21ed1cdfb3c4a494cf0 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Tue, 9 Nov 2021 16:16:03 -0600 +Subject: [PATCH 03/17] Log: controller: improve fencing result messages + +Now that fence callbacks get the full result, we can log a better message. +Also check for error conditions better, improve message wording, and ensure +only a single message is logged per result. +--- + daemons/controld/controld_fencing.c | 83 +++++++++++++++++++---------- + 1 file changed, 56 insertions(+), 27 deletions(-) + +diff --git a/daemons/controld/controld_fencing.c b/daemons/controld/controld_fencing.c +index f5a252c813..f8d2fc13f4 100644 +--- a/daemons/controld/controld_fencing.c ++++ b/daemons/controld/controld_fencing.c +@@ -714,45 +714,64 @@ tengine_stonith_callback(stonith_t *stonith, stonith_callback_data_t *data) + int stonith_id = -1; + int transition_id = -1; + crm_action_t *action = NULL; +- int call_id = data->call_id; +- int rc = data->rc; +- char *userdata = data->userdata; +- +- CRM_CHECK(userdata != NULL, return); +- crm_notice("Stonith operation %d/%s: %s (%d)", call_id, (char *)userdata, +- pcmk_strerror(rc), rc); ++ const char *target = NULL; + +- if (AM_I_DC == FALSE) { ++ if ((data == NULL) || (data->userdata == NULL)) { ++ crm_err("Ignoring fence operation %d result: " ++ "No transition key given (bug?)", ++ ((data == NULL)? 
-1 : data->call_id)); + return; + } + +- /* crm_info("call=%d, optype=%d, node_name=%s, result=%d, node_list=%s, action=%s", */ +- /* op->call_id, op->optype, op->node_name, op->op_result, */ +- /* (char *)op->node_list, op->private_data); */ ++ if (!AM_I_DC) { ++ const char *reason = stonith__exit_reason(data); ++ ++ if (reason == NULL) { ++ reason = pcmk_exec_status_str(stonith__execution_status(data)); ++ } ++ crm_notice("Result of fence operation %d: %d (%s) " CRM_XS " key=%s", ++ data->call_id, stonith__exit_status(data), reason, ++ (const char *) data->userdata); ++ return; ++ } + +- /* filter out old STONITH actions */ +- CRM_CHECK(decode_transition_key(userdata, &uuid, &transition_id, &stonith_id, NULL), ++ CRM_CHECK(decode_transition_key(data->userdata, &uuid, &transition_id, ++ &stonith_id, NULL), + goto bail); + +- if (transition_graph->complete || stonith_id < 0 || !pcmk__str_eq(uuid, te_uuid, pcmk__str_casei) +- || transition_graph->id != transition_id) { +- crm_info("Ignoring STONITH action initiated outside of the current transition"); ++ if (transition_graph->complete || (stonith_id < 0) ++ || !pcmk__str_eq(uuid, te_uuid, pcmk__str_none) ++ || (transition_graph->id != transition_id)) { ++ crm_info("Ignoring fence operation %d result: " ++ "Not from current transition " CRM_XS ++ " complete=%s action=%d uuid=%s (vs %s) transition=%d (vs %d)", ++ data->call_id, pcmk__btoa(transition_graph->complete), ++ stonith_id, uuid, te_uuid, transition_id, transition_graph->id); + goto bail; + } + + action = controld_get_action(stonith_id); + if (action == NULL) { +- crm_err("Stonith action not matched"); ++ crm_err("Ignoring fence operation %d result: " ++ "Action %d not found in transition graph (bug?) 
" ++ CRM_XS " uuid=%s transition=%d", ++ data->call_id, stonith_id, uuid, transition_id); ++ goto bail; ++ } ++ ++ target = crm_element_value(action->xml, XML_LRM_ATTR_TARGET); ++ if (target == NULL) { ++ crm_err("Ignoring fence operation %d result: No target given (bug?)", ++ data->call_id); + goto bail; + } + + stop_te_timer(action->timer); +- if (rc == pcmk_ok) { +- const char *target = crm_element_value(action->xml, XML_LRM_ATTR_TARGET); ++ if (stonith__exit_status(data) == CRM_EX_OK) { + const char *uuid = crm_element_value(action->xml, XML_LRM_ATTR_TARGET_UUID); + const char *op = crm_meta_value(action->params, "stonith_action"); + +- crm_info("Stonith operation %d for %s passed", call_id, target); ++ crm_notice("Fence operation %d for %s passed", data->call_id, target); + if (!(pcmk_is_set(action->flags, pcmk__graph_action_confirmed))) { + te_action_confirmed(action, NULL); + if (pcmk__str_eq("on", op, pcmk__str_casei)) { +@@ -791,20 +810,30 @@ tengine_stonith_callback(stonith_t *stonith, stonith_callback_data_t *data) + st_fail_count_reset(target); + + } else { +- const char *target = crm_element_value(action->xml, XML_LRM_ATTR_TARGET); + enum transition_action abort_action = tg_restart; ++ int status = stonith__execution_status(data); ++ const char *reason = stonith__exit_reason(data); + ++ if (reason == NULL) { ++ if (status == PCMK_EXEC_DONE) { ++ reason = "Agent returned error"; ++ } else { ++ reason = pcmk_exec_status_str(status); ++ } ++ } + crm__set_graph_action_flags(action, pcmk__graph_action_failed); +- crm_notice("Stonith operation %d for %s failed (%s): aborting transition.", +- call_id, target, pcmk_strerror(rc)); + + /* If no fence devices were available, there's no use in immediately + * checking again, so don't start a new transition in that case. 
+ */ +- if (rc == -ENODEV) { +- crm_warn("No devices found in cluster to fence %s, giving up", +- target); ++ if (status == PCMK_EXEC_NO_FENCE_DEVICE) { ++ crm_warn("Fence operation %d for %s failed: %s " ++ "(aborting transition and giving up for now)", ++ data->call_id, target, reason); + abort_action = tg_stop; ++ } else { ++ crm_notice("Fence operation %d for %s failed: %s " ++ "(aborting transition)", data->call_id, target, reason); + } + + /* Increment the fail count now, so abort_for_stonith_failure() can +@@ -818,7 +847,7 @@ tengine_stonith_callback(stonith_t *stonith, stonith_callback_data_t *data) + trigger_graph(); + + bail: +- free(userdata); ++ free(data->userdata); + free(uuid); + return; + } +-- +2.27.0 + + +From 25547e3b7e6eb23efad1c359388d6e8d0df62363 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Mon, 22 Nov 2021 12:37:16 -0600 +Subject: [PATCH 04/17] Refactor: executor: drop action_get_uniform_rc() + function + +action_get_uniform_rc() called stonith2uniform_rc() or services_result2ocf() as +appropriate to the action standard. However, it was called only from a place +that did not process stonith actions, so that place can just call +services_result2ocf() directly. + +This will simplify planned changes. 
+--- + daemons/execd/execd_commands.c | 24 ++++++------------------ + 1 file changed, 6 insertions(+), 18 deletions(-) + +diff --git a/daemons/execd/execd_commands.c b/daemons/execd/execd_commands.c +index 5bb2aab692..5e123e322e 100644 +--- a/daemons/execd/execd_commands.c ++++ b/daemons/execd/execd_commands.c +@@ -780,23 +780,6 @@ stonith2uniform_rc(const char *action, int rc) + return rc; + } + +-static int +-action_get_uniform_rc(svc_action_t *action) +-{ +- lrmd_cmd_t *cmd = action->cb_data; +- +- if (pcmk__str_eq(action->standard, PCMK_RESOURCE_CLASS_STONITH, +- pcmk__str_casei)) { +- return stonith2uniform_rc(cmd->action, action->rc); +- } else { +- enum ocf_exitcode code = services_result2ocf(action->standard, +- cmd->action, action->rc); +- +- // Cast variable instead of function return to keep compilers happy +- return (int) code; +- } +-} +- + struct notify_new_client_data { + xmlNode *notify; + pcmk__client_t *new_client; +@@ -848,6 +831,7 @@ action_complete(svc_action_t * action) + { + lrmd_rsc_t *rsc; + lrmd_cmd_t *cmd = action->cb_data; ++ enum ocf_exitcode code; + + #ifdef PCMK__TIME_USE_CGT + const char *rclass = NULL; +@@ -867,8 +851,12 @@ action_complete(svc_action_t * action) + #endif + + cmd->last_pid = action->pid; +- pcmk__set_result(&(cmd->result), action_get_uniform_rc(action), ++ ++ // Cast variable instead of function return to keep compilers happy ++ code = services_result2ocf(action->standard, cmd->action, action->rc); ++ pcmk__set_result(&(cmd->result), (int) code, + action->status, services__exit_reason(action)); ++ + rsc = cmd->rsc_id ? 
g_hash_table_lookup(rsc_list, cmd->rsc_id) : NULL; + + #ifdef PCMK__TIME_USE_CGT +-- +2.27.0 + + +From b5e31ba2539da4e94c124c3f0c8c72f7039f9a7a Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Mon, 22 Nov 2021 12:39:30 -0600 +Subject: [PATCH 05/17] Feature: executor: use full result from fencer for + fence actions + +Now that fence callbacks get the full result, we can improve the executor +command result for fence actions. stonith_action_complete() now takes a +full result, allowing the executor to use that directly rather than map a +legacy return code. +--- + daemons/execd/execd_commands.c | 140 +++++++++++++++++++-------------- + 1 file changed, 80 insertions(+), 60 deletions(-) + +diff --git a/daemons/execd/execd_commands.c b/daemons/execd/execd_commands.c +index 5e123e322e..e722994012 100644 +--- a/daemons/execd/execd_commands.c ++++ b/daemons/execd/execd_commands.c +@@ -8,6 +8,7 @@ + */ + + #include ++#include + + #include + +@@ -748,38 +749,6 @@ cmd_finalize(lrmd_cmd_t * cmd, lrmd_rsc_t * rsc) + } + } + +-static int +-stonith2uniform_rc(const char *action, int rc) +-{ +- switch (rc) { +- case pcmk_ok: +- rc = PCMK_OCF_OK; +- break; +- +- case -ENODEV: +- /* This should be possible only for probes in practice, but +- * interpret for all actions to be safe. +- */ +- if (pcmk__str_eq(action, "monitor", pcmk__str_casei)) { +- rc = PCMK_OCF_NOT_RUNNING; +- } else if (pcmk__str_eq(action, "stop", pcmk__str_casei)) { +- rc = PCMK_OCF_OK; +- } else { +- rc = PCMK_OCF_NOT_INSTALLED; +- } +- break; +- +- case -EOPNOTSUPP: +- rc = PCMK_OCF_UNIMPLEMENT_FEATURE; +- break; +- +- default: +- rc = PCMK_OCF_UNKNOWN_ERROR; +- break; +- } +- return rc; +-} +- + struct notify_new_client_data { + xmlNode *notify; + pcmk__client_t *new_client; +@@ -988,46 +957,84 @@ action_complete(svc_action_t * action) + cmd_finalize(cmd, rsc); + } + ++/*! 
++ * \internal ++ * \brief Process the result of a fence device action (start, stop, or monitor) ++ * ++ * \param[in] cmd Fence device action that completed ++ * \param[in] exit_status Fencer API exit status for action ++ * \param[in] execution_status Fencer API execution status for action ++ * \param[in] exit_reason Human-friendly detail, if action failed ++ */ + static void +-stonith_action_complete(lrmd_cmd_t * cmd, int rc) ++stonith_action_complete(lrmd_cmd_t *cmd, int exit_status, ++ enum pcmk_exec_status execution_status, ++ const char *exit_reason) + { + // This can be NULL if resource was removed before command completed + lrmd_rsc_t *rsc = g_hash_table_lookup(rsc_list, cmd->rsc_id); + +- cmd->result.exit_status = stonith2uniform_rc(cmd->action, rc); ++ // Simplify fencer exit status to uniform exit status ++ if (exit_status != CRM_EX_OK) { ++ exit_status = PCMK_OCF_UNKNOWN_ERROR; ++ } + +- /* This function may be called with status already set to cancelled, if a +- * pending action was aborted. Otherwise, we need to determine status from +- * the fencer return code. +- */ +- if (cmd->result.execution_status != PCMK_EXEC_CANCELLED) { +- cmd->result.execution_status = stonith__legacy2status(rc); ++ if (cmd->result.execution_status == PCMK_EXEC_CANCELLED) { ++ /* An in-flight fence action was cancelled. The execution status is ++ * already correct, so don't overwrite it. ++ */ ++ execution_status = PCMK_EXEC_CANCELLED; + +- // Simplify status codes from fencer +- switch (cmd->result.execution_status) { ++ } else { ++ /* Some execution status codes have specific meanings for the fencer ++ * that executor clients may not expect, so map them to a simple error ++ * status. 
++ */ ++ switch (execution_status) { + case PCMK_EXEC_NOT_CONNECTED: + case PCMK_EXEC_INVALID: +- case PCMK_EXEC_NO_FENCE_DEVICE: + case PCMK_EXEC_NO_SECRETS: +- cmd->result.execution_status = PCMK_EXEC_ERROR; ++ execution_status = PCMK_EXEC_ERROR; + break; +- default: ++ ++ case PCMK_EXEC_NO_FENCE_DEVICE: ++ /* This should be possible only for probes in practice, but ++ * interpret for all actions to be safe. ++ */ ++ if (pcmk__str_eq(cmd->action, CRMD_ACTION_STATUS, ++ pcmk__str_none)) { ++ exit_status = PCMK_OCF_NOT_RUNNING; ++ ++ } else if (pcmk__str_eq(cmd->action, CRMD_ACTION_STOP, ++ pcmk__str_none)) { ++ exit_status = PCMK_OCF_OK; ++ ++ } else { ++ exit_status = PCMK_OCF_NOT_INSTALLED; ++ } ++ execution_status = PCMK_EXEC_ERROR; + break; +- } + +- // Certain successful actions change the known state of the resource +- if ((rsc != NULL) && pcmk__result_ok(&(cmd->result))) { +- if (pcmk__str_eq(cmd->action, "start", pcmk__str_casei)) { +- rsc->st_probe_rc = pcmk_ok; // maps to PCMK_OCF_OK +- } else if (pcmk__str_eq(cmd->action, "stop", pcmk__str_casei)) { +- rsc->st_probe_rc = -ENODEV; // maps to PCMK_OCF_NOT_RUNNING +- } ++ case PCMK_EXEC_NOT_SUPPORTED: ++ exit_status = PCMK_OCF_UNIMPLEMENT_FEATURE; ++ break; ++ ++ default: ++ break; + } + } + +- // Give the user more detail than an OCF code +- if (rc != -pcmk_err_generic) { +- cmd->result.exit_reason = strdup(pcmk_strerror(rc)); ++ pcmk__set_result(&cmd->result, exit_status, execution_status, exit_reason); ++ ++ // Certain successful actions change the known state of the resource ++ if ((rsc != NULL) && pcmk__result_ok(&(cmd->result))) { ++ ++ if (pcmk__str_eq(cmd->action, "start", pcmk__str_casei)) { ++ rsc->st_probe_rc = pcmk_ok; // maps to PCMK_OCF_OK ++ ++ } else if (pcmk__str_eq(cmd->action, "stop", pcmk__str_casei)) { ++ rsc->st_probe_rc = -ENODEV; // maps to PCMK_OCF_NOT_RUNNING ++ } + } + + /* The recurring timer should not be running at this point in any case, but +@@ -1050,7 +1057,15 @@ 
stonith_action_complete(lrmd_cmd_t * cmd, int rc) + static void + lrmd_stonith_callback(stonith_t * stonith, stonith_callback_data_t * data) + { +- stonith_action_complete(data->userdata, data->rc); ++ if ((data == NULL) || (data->userdata == NULL)) { ++ crm_err("Ignoring fence action result: " ++ "Invalid callback arguments (bug?)"); ++ } else { ++ stonith_action_complete((lrmd_cmd_t *) data->userdata, ++ stonith__exit_status(data), ++ stonith__execution_status(data), ++ stonith__exit_reason(data)); ++ } + } + + void +@@ -1097,7 +1112,9 @@ stonith_connection_failed(void) + crm_err("Connection to fencer failed, finalizing %d pending operations", + g_list_length(cmd_list)); + for (cmd_iter = cmd_list; cmd_iter; cmd_iter = cmd_iter->next) { +- stonith_action_complete(cmd_iter->data, -ENOTCONN); ++ stonith_action_complete((lrmd_cmd_t *) cmd_iter->data, ++ CRM_EX_ERROR, PCMK_EXEC_NOT_CONNECTED, ++ "Lost connection to fencer"); + } + g_list_free(cmd_list); + } +@@ -1210,7 +1227,7 @@ lrmd_rsc_execute_stonith(lrmd_rsc_t * rsc, lrmd_cmd_t * cmd) + + } else if (pcmk__str_eq(cmd->action, "start", pcmk__str_casei)) { + rc = execd_stonith_start(stonith_api, rsc, cmd); +- if (rc == 0) { ++ if (rc == pcmk_ok) { + do_monitor = TRUE; + } + +@@ -1233,7 +1250,10 @@ lrmd_rsc_execute_stonith(lrmd_rsc_t * rsc, lrmd_cmd_t * cmd) + } + } + +- stonith_action_complete(cmd, rc); ++ stonith_action_complete(cmd, ++ ((rc == pcmk_ok)? CRM_EX_OK : CRM_EX_ERROR), ++ stonith__legacy2status(rc), ++ rc == -pcmk_err_generic? NULL : pcmk_strerror(rc)); + } + + static int +-- +2.27.0 + + +From 0cdc8506c2383cf05c2f62ab1ac9438958daf210 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Mon, 22 Nov 2021 16:15:05 -0600 +Subject: [PATCH 06/17] Fix: executor,scheduler: treat "no secrets" fence + results as a hard error + +Previously, the executor mapped the fencer's PCMK_EXEC_NO_SECRETS status to +PCMK_EXEC_ERROR to keep handling of that situation the same as before the new +code was added. 
+ +However, the earlier handling was less than ideal -- a resource action that +failed due to missing secrets would be retried on the same node, and almost +certainly fail again for the same reason. Now, the executor passes along +PCMK_EXEC_NO_SECRETS to clients; the controller will record the result in the +CIB status, and the scheduler will treat it as a hard error (i.e. not retrying +on the same node). + +Backward compatibility isn't a problem because the scheduler treats unknown +status codes the same as PCMK_EXEC_ERROR, so an older DC will continue to +handle it as before. The CRM feature set has been bumped so the handling can't +flip back and forth in a mixed-version cluster. +--- + daemons/execd/execd_commands.c | 1 - + include/crm/crm.h | 4 ++-- + lib/pengine/unpack.c | 3 --- + 3 files changed, 2 insertions(+), 6 deletions(-) + +diff --git a/daemons/execd/execd_commands.c b/daemons/execd/execd_commands.c +index e722994012..4ced6d1d5c 100644 +--- a/daemons/execd/execd_commands.c ++++ b/daemons/execd/execd_commands.c +@@ -993,7 +993,6 @@ stonith_action_complete(lrmd_cmd_t *cmd, int exit_status, + switch (execution_status) { + case PCMK_EXEC_NOT_CONNECTED: + case PCMK_EXEC_INVALID: +- case PCMK_EXEC_NO_SECRETS: + execution_status = PCMK_EXEC_ERROR; + break; + +diff --git a/include/crm/crm.h b/include/crm/crm.h +index 16b35e9c55..56b07cb12a 100644 +--- a/include/crm/crm.h ++++ b/include/crm/crm.h +@@ -1,5 +1,5 @@ + /* +- * Copyright 2004-2021 the Pacemaker project contributors ++ * Copyright 2004-2022 the Pacemaker project contributors + * + * The version control history for this file may have further details. 
+ * +@@ -66,7 +66,7 @@ extern "C" { + * >=3.0.13: Fail counts include operation name and interval + * >=3.2.0: DC supports PCMK_EXEC_INVALID and PCMK_EXEC_NOT_CONNECTED + */ +-# define CRM_FEATURE_SET "3.12.0" ++# define CRM_FEATURE_SET "3.13.0" + + /* Pacemaker's CPG protocols use fixed-width binary fields for the sender and + * recipient of a CPG message. This imposes an arbitrary limit on cluster node +diff --git a/lib/pengine/unpack.c b/lib/pengine/unpack.c +index 3e0384cd2a..8a2d2a6d6d 100644 +--- a/lib/pengine/unpack.c ++++ b/lib/pengine/unpack.c +@@ -3879,9 +3879,6 @@ unpack_rsc_op(pe_resource_t *rsc, pe_node_t *node, xmlNode *xml_op, + case PCMK_EXEC_INVALID: + break; // Not done, do error handling + +- /* These should only be possible in fence action results, not operation +- * history, but have some handling in place as a fail-safe. +- */ + case PCMK_EXEC_NO_FENCE_DEVICE: + case PCMK_EXEC_NO_SECRETS: + status = PCMK_EXEC_ERROR_HARD; +-- +2.27.0 + + +From 75c1bdcf3ffc406e6fa286fd5fcff83e1e65591a Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Wed, 10 Nov 2021 12:05:20 -0600 +Subject: [PATCH 07/17] Low: executor: improve result for fence device probes + +Now that lrmd_rsc_execute_stonith() sets a full result instead of just a legacy +return code, refactor lrmd_rsc_t's st_probe_rc as an execution status (and +rename to fence_probe_result). Set an appropriate exit reason when available. 
+--- + daemons/execd/execd_commands.c | 57 ++++++++++++++++++++++++++------- + daemons/execd/pacemaker-execd.h | 9 +++++- + 2 files changed, 54 insertions(+), 12 deletions(-) + +diff --git a/daemons/execd/execd_commands.c b/daemons/execd/execd_commands.c +index 4ced6d1d5c..6e5505e973 100644 +--- a/daemons/execd/execd_commands.c ++++ b/daemons/execd/execd_commands.c +@@ -285,7 +285,9 @@ build_rsc_from_xml(xmlNode * msg) + rsc->provider = crm_element_value_copy(rsc_xml, F_LRMD_PROVIDER); + rsc->type = crm_element_value_copy(rsc_xml, F_LRMD_TYPE); + rsc->work = mainloop_add_trigger(G_PRIORITY_HIGH, lrmd_rsc_dispatch, rsc); +- rsc->st_probe_rc = -ENODEV; // if stonith, initialize to "not running" ++ ++ // Initialize fence device probes (to return "not running") ++ rsc->fence_probe_result = PCMK_EXEC_NO_FENCE_DEVICE; + return rsc; + } + +@@ -1029,10 +1031,10 @@ stonith_action_complete(lrmd_cmd_t *cmd, int exit_status, + if ((rsc != NULL) && pcmk__result_ok(&(cmd->result))) { + + if (pcmk__str_eq(cmd->action, "start", pcmk__str_casei)) { +- rsc->st_probe_rc = pcmk_ok; // maps to PCMK_OCF_OK ++ rsc->fence_probe_result = PCMK_EXEC_DONE; // "running" + + } else if (pcmk__str_eq(cmd->action, "stop", pcmk__str_casei)) { +- rsc->st_probe_rc = -ENODEV; // maps to PCMK_OCF_NOT_RUNNING ++ rsc->fence_probe_result = PCMK_EXEC_NO_FENCE_DEVICE; // "not running" + } + } + +@@ -1081,14 +1083,13 @@ stonith_connection_failed(void) + if (pcmk__str_eq(rsc->class, PCMK_RESOURCE_CLASS_STONITH, pcmk__str_casei)) { + /* If we registered this fence device, we don't know whether the + * fencer still has the registration or not. Cause future probes to +- * return PCMK_OCF_UNKNOWN_ERROR until the resource is stopped or +- * started successfully. This is especially important if the +- * controller also went away (possibly due to a cluster layer +- * restart) and won't receive our client notification of any +- * monitors finalized below. 
++ * return an error until the resource is stopped or started ++ * successfully. This is especially important if the controller also ++ * went away (possibly due to a cluster layer restart) and won't ++ * receive our client notification of any monitors finalized below. + */ +- if (rsc->st_probe_rc == pcmk_ok) { +- rsc->st_probe_rc = pcmk_err_generic; ++ if (rsc->fence_probe_result == PCMK_EXEC_DONE) { ++ rsc->fence_probe_result = PCMK_EXEC_NOT_CONNECTED; + } + + if (rsc->active) { +@@ -1213,6 +1214,39 @@ execd_stonith_monitor(stonith_t *stonith_api, lrmd_rsc_t *rsc, lrmd_cmd_t *cmd) + return rc; + } + ++/*! ++ * \internal ++ * \brief Finalize the result of a fence device probe ++ * ++ * \param[in] cmd Probe action ++ * \param[in] probe_result Probe result ++ */ ++static void ++finalize_fence_device_probe(lrmd_cmd_t *cmd, enum pcmk_exec_status probe_result) ++{ ++ int exit_status = CRM_EX_ERROR; ++ const char *reason = NULL; ++ ++ switch (probe_result) { ++ case PCMK_EXEC_DONE: // Device is "running" ++ exit_status = CRM_EX_OK; ++ break; ++ ++ case PCMK_EXEC_NO_FENCE_DEVICE: // Device is "not running" ++ break; ++ ++ case PCMK_EXEC_NOT_CONNECTED: // stonith_connection_failed() ++ reason = "Lost connection to fencer"; ++ break; ++ ++ default: // Shouldn't be possible ++ probe_result = PCMK_EXEC_ERROR; ++ reason = "Invalid fence device probe result (bug?)"; ++ break; ++ } ++ stonith_action_complete(cmd, exit_status, probe_result, reason); ++} ++ + static void + lrmd_rsc_execute_stonith(lrmd_rsc_t * rsc, lrmd_cmd_t * cmd) + { +@@ -1237,7 +1271,8 @@ lrmd_rsc_execute_stonith(lrmd_rsc_t * rsc, lrmd_cmd_t * cmd) + if (cmd->interval_ms > 0) { + do_monitor = TRUE; + } else { +- rc = rsc->st_probe_rc; ++ finalize_fence_device_probe(cmd, rsc->fence_probe_result); ++ return; + } + } + +diff --git a/daemons/execd/pacemaker-execd.h b/daemons/execd/pacemaker-execd.h +index 51ef8d22e6..057d889584 100644 +--- a/daemons/execd/pacemaker-execd.h ++++ b/daemons/execd/pacemaker-execd.h 
+@@ -41,7 +41,14 @@ typedef struct lrmd_rsc_s { + * that have been handed off from the pending ops list. */ + GList *recurring_ops; + +- int st_probe_rc; // What value should be returned for a probe if stonith ++ /* If this resource is a fence device, probes are handled internally by the ++ * executor, and this value indicates the result that should currently be ++ * returned for probes. It should be one of: ++ * PCMK_EXEC_DONE (to indicate "running"), ++ * PCMK_EXEC_NO_FENCE_DEVICE ("not running"), or ++ * PCMK_EXEC_NOT_CONNECTED ("unknown because fencer connection was lost"). ++ */ ++ enum pcmk_exec_status fence_probe_result; + + crm_trigger_t *work; + } lrmd_rsc_t; +-- +2.27.0 + + +From 1ab799d945171ab8d91bd0aada64e70a71193e5c Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Wed, 10 Nov 2021 12:14:48 -0600 +Subject: [PATCH 08/17] Low: executor: don't require a fencer connection for + probes + +For fence devices, probe results are based on earlier state determinations, +so handle them before requiring an active fencer connection. The effect may be +negligible, but it would allow probes to proceed while waiting for a +reconnection. 
+--- + daemons/execd/execd_commands.c | 15 ++++++++------- + 1 file changed, 8 insertions(+), 7 deletions(-) + +diff --git a/daemons/execd/execd_commands.c b/daemons/execd/execd_commands.c +index 6e5505e973..5999ba19c9 100644 +--- a/daemons/execd/execd_commands.c ++++ b/daemons/execd/execd_commands.c +@@ -1255,7 +1255,13 @@ lrmd_rsc_execute_stonith(lrmd_rsc_t * rsc, lrmd_cmd_t * cmd) + + stonith_t *stonith_api = get_stonith_connection(); + +- if (!stonith_api) { ++ if (pcmk__str_eq(cmd->action, "monitor", pcmk__str_casei) ++ && (cmd->interval_ms == 0)) { ++ // Probes don't require a fencer connection ++ finalize_fence_device_probe(cmd, rsc->fence_probe_result); ++ return; ++ ++ } else if (stonith_api == NULL) { + rc = -ENOTCONN; + + } else if (pcmk__str_eq(cmd->action, "start", pcmk__str_casei)) { +@@ -1268,12 +1274,7 @@ lrmd_rsc_execute_stonith(lrmd_rsc_t * rsc, lrmd_cmd_t * cmd) + rc = execd_stonith_stop(stonith_api, rsc); + + } else if (pcmk__str_eq(cmd->action, "monitor", pcmk__str_casei)) { +- if (cmd->interval_ms > 0) { +- do_monitor = TRUE; +- } else { +- finalize_fence_device_probe(cmd, rsc->fence_probe_result); +- return; +- } ++ do_monitor = TRUE; + } + + if (do_monitor) { +-- +2.27.0 + + +From adf41fb1637bcc9a6e057be52d61a0b26e4535cc Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Wed, 10 Nov 2021 12:20:34 -0600 +Subject: [PATCH 09/17] Low: executor: return an error for unsupported fence + device actions + +... and set an exit reason. Previously, it would return success for unsupported +actions. It shouldn't be possible, but it would be nice to have an indication +of what is wrong if a bug is introduced. 
+--- + daemons/execd/execd_commands.c | 6 ++++++ + 1 file changed, 6 insertions(+) + +diff --git a/daemons/execd/execd_commands.c b/daemons/execd/execd_commands.c +index 5999ba19c9..772d6446dc 100644 +--- a/daemons/execd/execd_commands.c ++++ b/daemons/execd/execd_commands.c +@@ -1275,6 +1275,12 @@ lrmd_rsc_execute_stonith(lrmd_rsc_t * rsc, lrmd_cmd_t * cmd) + + } else if (pcmk__str_eq(cmd->action, "monitor", pcmk__str_casei)) { + do_monitor = TRUE; ++ ++ } else { ++ stonith_action_complete(cmd, PCMK_OCF_UNIMPLEMENT_FEATURE, ++ PCMK_EXEC_ERROR, ++ "Invalid fence device action (bug?)"); ++ return; + } + + if (do_monitor) { +-- +2.27.0 + + +From af59dfe85bc83f5609d0a3b3b7939271549cb76f Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Wed, 10 Nov 2021 12:24:07 -0600 +Subject: [PATCH 10/17] Low: executor: set exit reason if no fencer connection + +--- + daemons/execd/execd_commands.c | 5 ++++- + 1 file changed, 4 insertions(+), 1 deletion(-) + +diff --git a/daemons/execd/execd_commands.c b/daemons/execd/execd_commands.c +index 772d6446dc..7ae309d94c 100644 +--- a/daemons/execd/execd_commands.c ++++ b/daemons/execd/execd_commands.c +@@ -1262,7 +1262,10 @@ lrmd_rsc_execute_stonith(lrmd_rsc_t * rsc, lrmd_cmd_t * cmd) + return; + + } else if (stonith_api == NULL) { +- rc = -ENOTCONN; ++ stonith_action_complete(cmd, PCMK_OCF_UNKNOWN_ERROR, ++ PCMK_EXEC_NOT_CONNECTED, ++ "No connection to fencer"); ++ return; + + } else if (pcmk__str_eq(cmd->action, "start", pcmk__str_casei)) { + rc = execd_stonith_start(stonith_api, rsc, cmd); +-- +2.27.0 + + +From ad0930b75d5617490c3a0dc3c6b83411b3c4536d Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Wed, 10 Nov 2021 14:42:26 -0600 +Subject: [PATCH 11/17] Test: cts-fence-helper: log full result in fence + callback + +--- + daemons/fenced/cts-fence-helper.c | 7 +++++-- + 1 file changed, 5 insertions(+), 2 deletions(-) + +diff --git a/daemons/fenced/cts-fence-helper.c b/daemons/fenced/cts-fence-helper.c +index 2adb032f24..c2b55d73b9 
100644 +--- a/daemons/fenced/cts-fence-helper.c ++++ b/daemons/fenced/cts-fence-helper.c +@@ -1,5 +1,5 @@ + /* +- * Copyright 2009-2020 the Pacemaker project contributors ++ * Copyright 2009-2021 the Pacemaker project contributors + * + * This source code is licensed under the GNU General Public License version 2 + * or later (GPLv2+) WITHOUT ANY WARRANTY. +@@ -132,7 +132,10 @@ st_callback(stonith_t * st, stonith_event_t * e) + static void + st_global_callback(stonith_t * stonith, stonith_callback_data_t * data) + { +- crm_notice("Call id %d completed with rc %d", data->call_id, data->rc); ++ crm_notice("Call %d exited %d: %s (%s)", ++ data->call_id, stonith__exit_status(data), ++ stonith__execution_status(data), ++ crm_str(stonith__exit_reason(data))); + } + + static void +-- +2.27.0 + + +From 1b50ff4d83b7a96cd70389891b7b6568812f66f6 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Wed, 10 Nov 2021 15:10:14 -0600 +Subject: [PATCH 12/17] Test: cts-fence-helper: track full result instead of + legacy return code + +--- + daemons/fenced/cts-fence-helper.c | 77 +++++++++++++++---------------- + 1 file changed, 37 insertions(+), 40 deletions(-) + +diff --git a/daemons/fenced/cts-fence-helper.c b/daemons/fenced/cts-fence-helper.c +index c2b55d73b9..2739f57804 100644 +--- a/daemons/fenced/cts-fence-helper.c ++++ b/daemons/fenced/cts-fence-helper.c +@@ -34,23 +34,12 @@ + static GMainLoop *mainloop = NULL; + static crm_trigger_t *trig = NULL; + static int mainloop_iter = 0; +-static int callback_rc = 0; ++static pcmk__action_result_t result = PCMK__UNKNOWN_RESULT; ++ + typedef void (*mainloop_test_iteration_cb) (int check_event); + + #define MAINLOOP_DEFAULT_TIMEOUT 2 + +-#define mainloop_test_done(pass) \ +- if (pass) { \ +- crm_info("SUCCESS - %s", __func__); \ +- mainloop_iter++; \ +- mainloop_set_trigger(trig); \ +- } else { \ +- crm_err("FAILURE = %s async_callback %d", __func__, callback_rc); \ +- crm_exit(CRM_EX_ERROR); \ +- } \ +- callback_rc = 0; \ +- +- + enum 
test_modes { + test_standard = 0, // test using a specific developer environment + test_passive, // watch notifications only +@@ -93,6 +82,23 @@ static const int st_opts = st_opt_sync_call; + static int expected_notifications = 0; + static int verbose = 0; + ++static void ++mainloop_test_done(const char *origin, bool pass) ++{ ++ if (pass) { ++ crm_info("SUCCESS - %s", origin); ++ mainloop_iter++; ++ mainloop_set_trigger(trig); ++ result.execution_status = PCMK_EXEC_UNKNOWN; ++ result.exit_status = CRM_EX_OK; ++ } else { ++ crm_err("FAILURE - %s (%d: %s)", origin, result.exit_status, ++ pcmk_exec_status_str(result.execution_status)); ++ crm_exit(CRM_EX_ERROR); ++ } ++} ++ ++ + static void + dispatch_helper(int timeout) + { +@@ -385,7 +391,9 @@ static void + static void + mainloop_callback(stonith_t * stonith, stonith_callback_data_t * data) + { +- callback_rc = data->rc; ++ pcmk__set_result(&result, stonith__exit_status(data), ++ stonith__execution_status(data), ++ stonith__exit_reason(data)); + iterate_mainloop_tests(TRUE); + } + +@@ -404,18 +412,14 @@ test_async_fence_pass(int check_event) + int rc = 0; + + if (check_event) { +- if (callback_rc != 0) { +- mainloop_test_done(FALSE); +- } else { +- mainloop_test_done(TRUE); +- } ++ mainloop_test_done(__func__, (result.exit_status == CRM_EX_OK)); + return; + } + + rc = st->cmds->fence(st, 0, "true_1_node1", "off", MAINLOOP_DEFAULT_TIMEOUT, 0); + if (rc < 0) { + crm_err("fence failed with rc %d", rc); +- mainloop_test_done(FALSE); ++ mainloop_test_done(__func__, false); + } + register_callback_helper(rc); + /* wait for event */ +@@ -431,15 +435,15 @@ test_async_fence_custom_timeout(int check_event) + if (check_event) { + uint32_t diff = (time(NULL) - begin); + +- if (callback_rc != -ETIME) { +- mainloop_test_done(FALSE); ++ if (result.execution_status != PCMK_EXEC_TIMEOUT) { ++ mainloop_test_done(__func__, false); + } else if (diff < CUSTOM_TIMEOUT_ADDITION + MAINLOOP_DEFAULT_TIMEOUT) { + crm_err + ("Custom timeout 
test failed, callback expiration should be updated to %d, actual timeout was %d", + CUSTOM_TIMEOUT_ADDITION + MAINLOOP_DEFAULT_TIMEOUT, diff); +- mainloop_test_done(FALSE); ++ mainloop_test_done(__func__, false); + } else { +- mainloop_test_done(TRUE); ++ mainloop_test_done(__func__, true); + } + return; + } +@@ -448,7 +452,7 @@ test_async_fence_custom_timeout(int check_event) + rc = st->cmds->fence(st, 0, "custom_timeout_node1", "off", MAINLOOP_DEFAULT_TIMEOUT, 0); + if (rc < 0) { + crm_err("fence failed with rc %d", rc); +- mainloop_test_done(FALSE); ++ mainloop_test_done(__func__, false); + } + register_callback_helper(rc); + /* wait for event */ +@@ -460,18 +464,15 @@ test_async_fence_timeout(int check_event) + int rc = 0; + + if (check_event) { +- if (callback_rc != -ENODEV) { +- mainloop_test_done(FALSE); +- } else { +- mainloop_test_done(TRUE); +- } ++ mainloop_test_done(__func__, ++ (result.execution_status == PCMK_EXEC_NO_FENCE_DEVICE)); + return; + } + + rc = st->cmds->fence(st, 0, "false_1_node2", "off", MAINLOOP_DEFAULT_TIMEOUT, 0); + if (rc < 0) { + crm_err("fence failed with rc %d", rc); +- mainloop_test_done(FALSE); ++ mainloop_test_done(__func__, false); + } + register_callback_helper(rc); + /* wait for event */ +@@ -483,18 +484,14 @@ test_async_monitor(int check_event) + int rc = 0; + + if (check_event) { +- if (callback_rc) { +- mainloop_test_done(FALSE); +- } else { +- mainloop_test_done(TRUE); +- } ++ mainloop_test_done(__func__, (result.exit_status == CRM_EX_OK)); + return; + } + + rc = st->cmds->monitor(st, 0, "false_1", MAINLOOP_DEFAULT_TIMEOUT); + if (rc < 0) { + crm_err("monitor failed with rc %d", rc); +- mainloop_test_done(FALSE); ++ mainloop_test_done(__func__, false); + } + + register_callback_helper(rc); +@@ -531,7 +528,7 @@ test_register_async_devices(int check_event) + params); + stonith_key_value_freeall(params, 1, 1); + +- mainloop_test_done(TRUE); ++ mainloop_test_done(__func__, true); + } + + static void +@@ -540,11 +537,11 @@ 
try_mainloop_connect(int check_event) + int rc = stonith_api_connect_retry(st, crm_system_name, 10); + + if (rc == pcmk_ok) { +- mainloop_test_done(TRUE); ++ mainloop_test_done(__func__, true); + return; + } + crm_err("API CONNECTION FAILURE"); +- mainloop_test_done(FALSE); ++ mainloop_test_done(__func__, false); + } + + static void +-- +2.27.0 + + +From 8ff4b384a34828a4a9eebe896324ba8c89e5d66c Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Mon, 10 Jan 2022 10:27:45 -0600 +Subject: [PATCH 13/17] Doc: Pacemaker Development: correct typo + +caught in review +--- + doc/sphinx/Pacemaker_Development/components.rst | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/doc/sphinx/Pacemaker_Development/components.rst b/doc/sphinx/Pacemaker_Development/components.rst +index 68158484ce..c4d10fc9f5 100644 +--- a/doc/sphinx/Pacemaker_Development/components.rst ++++ b/doc/sphinx/Pacemaker_Development/components.rst +@@ -171,7 +171,7 @@ messaging layer callback, which calls: + + * ``fenced_process_fencing_reply()``, which calls either + ``request_peer_fencing()`` (to retry a failed operation, or try the next +- device in a topology is appropriate, which issues a new ++ device in a topology if appropriate, which issues a new + ``STONITH_OP_FENCE`` request, proceeding as before) or + ``finalize_op()`` (if the operation is definitively failed or + successful). 
+-- +2.27.0 + + +From 822ee6fbd8583a2939c636b3bccceffcc338c567 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Mon, 10 Jan 2022 11:05:40 -0600 +Subject: [PATCH 14/17] Doc: Pacemaker Development: add a placeholder for how + fencing history works + +--- + doc/sphinx/Pacemaker_Development/components.rst | 15 +++++++++++++++ + 1 file changed, 15 insertions(+) + +diff --git a/doc/sphinx/Pacemaker_Development/components.rst b/doc/sphinx/Pacemaker_Development/components.rst +index c4d10fc9f5..760da77c9b 100644 +--- a/doc/sphinx/Pacemaker_Development/components.rst ++++ b/doc/sphinx/Pacemaker_Development/components.rst +@@ -183,6 +183,21 @@ Finally, all peers receive the broadcast result and call + * ``finalize_op()``, which sends the result to all local clients. + + ++.. index:: ++ single: fence history ++ ++Fencing History ++_______________ ++ ++The fencer keeps a running history of all fencing operations. The bulk of the ++relevant code is in `fenced_history.c` and ensures the history is synchronized ++across all nodes even if a node leaves and rejoins the cluster. ++ ++In libstonithd, this information is represented by `stonith_history_t` and is ++queryable by the `stonith_api_operations_t:history()` method. `crm_mon` and ++`stonith_admin` use this API to display the history. ++ ++ + .. 
index:: + single: scheduler + single: pacemaker-schedulerd +-- +2.27.0 + + +From d9b4060f2dadb40d5ee7535e0b2890a83d216c1e Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Mon, 10 Jan 2022 11:25:31 -0600 +Subject: [PATCH 15/17] Log: fencing: add exit reason for results without a + callback + +--- + lib/fencing/st_client.c | 6 ++++-- + 1 file changed, 4 insertions(+), 2 deletions(-) + +diff --git a/lib/fencing/st_client.c b/lib/fencing/st_client.c +index 9d93ffd481..4823751267 100644 +--- a/lib/fencing/st_client.c ++++ b/lib/fencing/st_client.c +@@ -926,9 +926,11 @@ invoke_registered_callbacks(stonith_t *stonith, xmlNode *msg, int call_id) + cb_info->user_data, cb_info->callback); + + } else if ((private->op_callback == NULL) && !pcmk__result_ok(&result)) { +- crm_warn("Fencing action without registered callback failed: %d (%s)", ++ crm_warn("Fencing action without registered callback failed: %d (%s%s%s)", + result.exit_status, +- pcmk_exec_status_str(result.execution_status)); ++ pcmk_exec_status_str(result.execution_status), ++ ((result.exit_reason == NULL)? "" : ": "), ++ ((result.exit_reason == NULL)? "" : result.exit_reason)); + crm_log_xml_debug(msg, "Failed fence update"); + } + +-- +2.27.0 + + +From 9956b3ad2f1c6fba305252616ad0b35a38ab96da Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Tue, 11 Jan 2022 09:28:27 -0600 +Subject: [PATCH 16/17] Refactor: executor: keep formatting consistent + +... even if the line runs a little long +--- + daemons/execd/execd_commands.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/daemons/execd/execd_commands.c b/daemons/execd/execd_commands.c +index 7ae309d94c..bc3b392b2c 100644 +--- a/daemons/execd/execd_commands.c ++++ b/daemons/execd/execd_commands.c +@@ -1,5 +1,5 @@ + /* +- * Copyright 2012-2021 the Pacemaker project contributors ++ * Copyright 2012-2022 the Pacemaker project contributors + * + * The version control history for this file may have further details. 
+ * +@@ -1297,7 +1297,7 @@ lrmd_rsc_execute_stonith(lrmd_rsc_t * rsc, lrmd_cmd_t * cmd) + stonith_action_complete(cmd, + ((rc == pcmk_ok)? CRM_EX_OK : CRM_EX_ERROR), + stonith__legacy2status(rc), +- rc == -pcmk_err_generic? NULL : pcmk_strerror(rc)); ++ ((rc == -pcmk_err_generic)? NULL : pcmk_strerror(rc))); + } + + static int +-- +2.27.0 + + +From 69d8ecb17568d6c3ecad0e5735756f58a4bce5a1 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Tue, 11 Jan 2022 09:29:03 -0600 +Subject: [PATCH 17/17] Test: cts-fence-helper: use more intuitive execution + status for completed tests + +It doesn't matter since the value is only checked against a couple of specific +failure values, but this is less confusing. +--- + daemons/fenced/cts-fence-helper.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/daemons/fenced/cts-fence-helper.c b/daemons/fenced/cts-fence-helper.c +index 2739f57804..e222a59f9f 100644 +--- a/daemons/fenced/cts-fence-helper.c ++++ b/daemons/fenced/cts-fence-helper.c +@@ -1,5 +1,5 @@ + /* +- * Copyright 2009-2021 the Pacemaker project contributors ++ * Copyright 2009-2022 the Pacemaker project contributors + * + * This source code is licensed under the GNU General Public License version 2 + * or later (GPLv2+) WITHOUT ANY WARRANTY. 
+@@ -89,7 +89,7 @@ mainloop_test_done(const char *origin, bool pass) + crm_info("SUCCESS - %s", origin); + mainloop_iter++; + mainloop_set_trigger(trig); +- result.execution_status = PCMK_EXEC_UNKNOWN; ++ result.execution_status = PCMK_EXEC_DONE; + result.exit_status = CRM_EX_OK; + } else { + crm_err("FAILURE - %s (%d: %s)", origin, result.exit_status, +-- +2.27.0 + diff --git a/SOURCES/012-notify-crash.patch b/SOURCES/012-notify-crash.patch new file mode 100644 index 0000000..c18e4f5 --- /dev/null +++ b/SOURCES/012-notify-crash.patch @@ -0,0 +1,65 @@ +From ed8b2c86ab77aaa3d7fd688c049ad5e1b922a9c6 Mon Sep 17 00:00:00 2001 +From: Reid Wahl +Date: Thu, 13 Jan 2022 02:56:55 -0800 +Subject: [PATCH] Fix: liblrmd: Avoid double-free during notify operation + +This commit fixes a regression introduced by 31c7fa8a, causing a +double-free in notify operations. lrmd_dispatch_internal() assigns the +exit_reason string directly from an XML node to a new lrmd_event_data_t +object (without duplicating), and this string gets freed twice. + +Free #1: pcmk__create_history_xml() (reached via callback) calls +lrmd__set_result(), which frees event.exit_reason and sets it to NULL. +Free #2: lrmd_ipc_dispatch() frees the XML node, which contains a +pointer to the exit_reason string just freed, after +lrmd_dispatch_internal() returns. + +Prior to 31c7fa8a, pcmk__create_history_xml reset event.rc and +event.op_status but **not** event.exit_reason. + +In this commit we simply make a copy of event.exit_reason in +lrmd_dispatch_internal() before the callback. This way we don't have to +worry about whatever happens in the callback, and we can continue to +unset the exit_reason alongside the rc and op_status. The added overhead +should be minimal. + +This commit also makes a copy of output. That's not strictly necessary +but adds some futureproofing and allows us to call lrmd__reset_result() +at the end of lrmd_dispatch_internal(). 
+ +Resolves: RHBZ#2039675 + +Signed-off-by: Reid Wahl +--- + lib/lrmd/lrmd_client.c | 8 +++++--- + 1 file changed, 5 insertions(+), 3 deletions(-) + +diff --git a/lib/lrmd/lrmd_client.c b/lib/lrmd/lrmd_client.c +index ee31bb5ae9..5131a648b7 100644 +--- a/lib/lrmd/lrmd_client.c ++++ b/lib/lrmd/lrmd_client.c +@@ -305,9 +305,10 @@ lrmd_dispatch_internal(lrmd_t * lrmd, xmlNode * msg) + event.user_data = crm_element_value(msg, F_LRMD_RSC_USERDATA_STR); + event.type = lrmd_event_exec_complete; + +- // No need to duplicate the memory, so don't use setter functions +- event.output = crm_element_value(msg, F_LRMD_RSC_OUTPUT); +- event.exit_reason = crm_element_value(msg, F_LRMD_RSC_EXIT_REASON); ++ /* output and exit_reason may be freed by a callback */ ++ event.output = crm_element_value_copy(msg, F_LRMD_RSC_OUTPUT); ++ lrmd__set_result(&event, event.rc, event.op_status, ++ crm_element_value(msg, F_LRMD_RSC_EXIT_REASON)); + + event.params = xml2list(msg); + } else if (pcmk__str_eq(type, LRMD_OP_NEW_CLIENT, pcmk__str_none)) { +@@ -324,6 +325,7 @@ lrmd_dispatch_internal(lrmd_t * lrmd, xmlNode * msg) + if (event.params) { + g_hash_table_destroy(event.params); + } ++ lrmd__reset_result(&event); + } + + // \return Always 0, to indicate that IPC mainloop source should be kept +-- +2.27.0 + diff --git a/SOURCES/012-string-arguments.patch b/SOURCES/012-string-arguments.patch deleted file mode 100644 index 6419117..0000000 --- a/SOURCES/012-string-arguments.patch +++ /dev/null @@ -1,221 +0,0 @@ -From 2eee93e8f9ea2daa81769bc69843d63ced1a7112 Mon Sep 17 00:00:00 2001 -From: Chris Lumens -Date: Tue, 20 Jul 2021 16:39:07 -0400 -Subject: [PATCH 1/2] Low: tools: Audit command line options. - -This just goes through and makes sure the command line options that take -arguments are in the special parameter to pcmk__cmdline_preproc, and -that options that do not take arguments are not. 
---- - tools/crm_attribute.c | 2 +- - tools/crm_error.c | 2 +- - tools/crm_resource.c | 2 +- - tools/crm_rule.c | 2 +- - tools/crm_simulate.c | 2 +- - tools/crmadmin.c | 2 +- - tools/stonith_admin.c | 2 +- - 7 files changed, 7 insertions(+), 7 deletions(-) - -diff --git a/tools/crm_attribute.c b/tools/crm_attribute.c -index 8a5b4e4..6bd4e2a 100644 ---- a/tools/crm_attribute.c -+++ b/tools/crm_attribute.c -@@ -312,7 +312,7 @@ main(int argc, char **argv) - - GOptionGroup *output_group = NULL; - pcmk__common_args_t *args = pcmk__new_common_args(SUMMARY); -- gchar **processed_args = pcmk__cmdline_preproc(argv, "DGNPdilnpstv"); -+ gchar **processed_args = pcmk__cmdline_preproc(argv, "NPUdilnpstv"); - GOptionContext *context = build_arg_context(args, &output_group); - - if (!g_option_context_parse_strv(context, &processed_args, &error)) { -diff --git a/tools/crm_error.c b/tools/crm_error.c -index b4328ce..923f393 100644 ---- a/tools/crm_error.c -+++ b/tools/crm_error.c -@@ -79,7 +79,7 @@ main(int argc, char **argv) - - GOptionGroup *output_group = NULL; - pcmk__common_args_t *args = pcmk__new_common_args(SUMMARY); -- gchar **processed_args = pcmk__cmdline_preproc(argv, "lrnX"); -+ gchar **processed_args = pcmk__cmdline_preproc(argv, NULL); - GOptionContext *context = build_arg_context(args, &output_group); - - if (!g_option_context_parse_strv(context, &processed_args, &error)) { -diff --git a/tools/crm_resource.c b/tools/crm_resource.c -index fa7902c..d8e140f 100644 ---- a/tools/crm_resource.c -+++ b/tools/crm_resource.c -@@ -1530,7 +1530,7 @@ main(int argc, char **argv) - */ - - args = pcmk__new_common_args(SUMMARY); -- processed_args = pcmk__cmdline_preproc(argv, "GINSTdginpstuv"); -+ processed_args = pcmk__cmdline_preproc(argv, "GHINSTdginpstuvx"); - context = build_arg_context(args, &output_group); - - pcmk__register_formats(output_group, formats); -diff --git a/tools/crm_rule.c b/tools/crm_rule.c -index 8b19bcd..30c5155 100644 ---- a/tools/crm_rule.c -+++ 
b/tools/crm_rule.c -@@ -239,7 +239,7 @@ main(int argc, char **argv) - - pcmk__common_args_t *args = pcmk__new_common_args(SUMMARY); - GOptionContext *context = build_arg_context(args); -- gchar **processed_args = pcmk__cmdline_preproc(argv, "nopNO"); -+ gchar **processed_args = pcmk__cmdline_preproc(argv, "drX"); - - if (!g_option_context_parse_strv(context, &processed_args, &error)) { - exit_code = CRM_EX_USAGE; -diff --git a/tools/crm_simulate.c b/tools/crm_simulate.c -index 0406bff..c83b1b1 100644 ---- a/tools/crm_simulate.c -+++ b/tools/crm_simulate.c -@@ -865,7 +865,7 @@ main(int argc, char **argv) - - GOptionGroup *output_group = NULL; - pcmk__common_args_t *args = pcmk__new_common_args(SUMMARY); -- gchar **processed_args = pcmk__cmdline_preproc(argv, "bdefgiqrtuwxDFGINO"); -+ gchar **processed_args = pcmk__cmdline_preproc(argv, "bdefgiqrtuwxDFGINOP"); - GOptionContext *context = build_arg_context(args, &output_group); - - /* This must come before g_option_context_parse_strv. */ -diff --git a/tools/crmadmin.c b/tools/crmadmin.c -index 5cbde1b..b98f282 100644 ---- a/tools/crmadmin.c -+++ b/tools/crmadmin.c -@@ -188,7 +188,7 @@ main(int argc, char **argv) - - GOptionGroup *output_group = NULL; - pcmk__common_args_t *args = pcmk__new_common_args(SUMMARY); -- gchar **processed_args = pcmk__cmdline_preproc(argv, "itBDEHKNPS"); -+ gchar **processed_args = pcmk__cmdline_preproc(argv, "itKNS"); - GOptionContext *context = build_arg_context(args, &output_group); - - pcmk__register_formats(output_group, formats); -diff --git a/tools/stonith_admin.c b/tools/stonith_admin.c -index 6773cea..2d48326 100644 ---- a/tools/stonith_admin.c -+++ b/tools/stonith_admin.c -@@ -349,7 +349,7 @@ main(int argc, char **argv) - - GOptionGroup *output_group = NULL; - pcmk__common_args_t *args = pcmk__new_common_args(SUMMARY); -- gchar **processed_args = pcmk__cmdline_preproc(argv, "adehilorstvBCDFHQRTU"); -+ gchar **processed_args = pcmk__cmdline_preproc(argv, "adehilorstvyBCDFHQRTU"); - 
GOptionContext *context = build_arg_context(args, &output_group); - - pcmk__register_formats(output_group, formats); --- -1.8.3.1 - - -From 8301678ad1162450814d2fea5288aefe47a67a74 Mon Sep 17 00:00:00 2001 -From: Chris Lumens -Date: Tue, 20 Jul 2021 16:40:58 -0400 -Subject: [PATCH 2/2] Low: libcrmcommon: Also allow string arguments that start - with a dash. - -There's various places where an option to a command line argument could -itself be a valid command line argument. For instance: - - crm_attribute -n crm_mon_options -v "-1i3" - -The previous patching to pcmk__cmdline_preproc did not take this into -account. With this patch, options that are last in a string (or by -themselves) and take an argument will have the next command line option -grabbed and copied straight through without processing. - -Regression in 2.1.0 caused by a long-standing bug in pcmk__cmdline_preproc. ---- - lib/common/cmdline.c | 8 ++++++ - .../tests/cmdline/pcmk__cmdline_preproc_test.c | 33 ++++++++++++++++++++++ - 2 files changed, 41 insertions(+) - -diff --git a/lib/common/cmdline.c b/lib/common/cmdline.c -index 9c1b810..1ca6147 100644 ---- a/lib/common/cmdline.c -+++ b/lib/common/cmdline.c -@@ -146,6 +146,7 @@ gchar ** - pcmk__cmdline_preproc(char **argv, const char *special) { - GPtrArray *arr = NULL; - bool saw_dash_dash = false; -+ bool copy_option = false; - - if (argv == NULL) { - return NULL; -@@ -175,6 +176,12 @@ pcmk__cmdline_preproc(char **argv, const char *special) { - continue; - } - -+ if (copy_option == true) { -+ g_ptr_array_add(arr, g_strdup(argv[i])); -+ copy_option = false; -+ continue; -+ } -+ - /* This is just a dash by itself. That could indicate stdin/stdout, or - * it could be user error. Copy it over and let glib figure it out. 
- */ -@@ -239,6 +246,7 @@ pcmk__cmdline_preproc(char **argv, const char *special) { - */ - } else { - g_ptr_array_add(arr, g_strdup_printf("-%c", *ch)); -+ copy_option = true; - ch++; - } - -diff --git a/lib/common/tests/cmdline/pcmk__cmdline_preproc_test.c b/lib/common/tests/cmdline/pcmk__cmdline_preproc_test.c -index 9a752ef..edc5640 100644 ---- a/lib/common/tests/cmdline/pcmk__cmdline_preproc_test.c -+++ b/lib/common/tests/cmdline/pcmk__cmdline_preproc_test.c -@@ -106,6 +106,36 @@ negative_score_2(void) { - g_strfreev(processed); - } - -+static void -+string_arg_with_dash(void) { -+ const char *argv[] = { "-n", "crm_mon_options", "-v", "--opt1 --opt2", NULL }; -+ const gchar *expected[] = { "-n", "crm_mon_options", "-v", "--opt1 --opt2", NULL }; -+ -+ gchar **processed = pcmk__cmdline_preproc((char **) argv, "v"); -+ LISTS_EQ(processed, expected); -+ g_strfreev(processed); -+} -+ -+static void -+string_arg_with_dash_2(void) { -+ const char *argv[] = { "-n", "crm_mon_options", "-v", "-1i3", NULL }; -+ const gchar *expected[] = { "-n", "crm_mon_options", "-v", "-1i3", NULL }; -+ -+ gchar **processed = pcmk__cmdline_preproc((char **) argv, "v"); -+ LISTS_EQ(processed, expected); -+ g_strfreev(processed); -+} -+ -+static void -+string_arg_with_dash_3(void) { -+ const char *argv[] = { "-abc", "-1i3", NULL }; -+ const gchar *expected[] = { "-a", "-b", "-c", "-1i3", NULL }; -+ -+ gchar **processed = pcmk__cmdline_preproc((char **) argv, "c"); -+ LISTS_EQ(processed, expected); -+ g_strfreev(processed); -+} -+ - int - main(int argc, char **argv) - { -@@ -120,5 +150,8 @@ main(int argc, char **argv) - g_test_add_func("/common/cmdline/preproc/long_arg", long_arg); - g_test_add_func("/common/cmdline/preproc/negative_score", negative_score); - g_test_add_func("/common/cmdline/preproc/negative_score_2", negative_score_2); -+ g_test_add_func("/common/cmdline/preproc/string_arg_with_dash", string_arg_with_dash); -+ 
g_test_add_func("/common/cmdline/preproc/string_arg_with_dash_2", string_arg_with_dash_2); -+ g_test_add_func("/common/cmdline/preproc/string_arg_with_dash_3", string_arg_with_dash_3); - return g_test_run(); - } --- -1.8.3.1 - diff --git a/SOURCES/013-leaks.patch b/SOURCES/013-leaks.patch deleted file mode 100644 index daa42b8..0000000 --- a/SOURCES/013-leaks.patch +++ /dev/null @@ -1,241 +0,0 @@ -From bee54eba4d9c28d3a7907a3e13a5deeee6bc0916 Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Tue, 27 Jul 2021 11:01:04 -0500 -Subject: [PATCH 1/2] Low: tools: avoid (insignificant) memory leaks - -detected by valgrind ---- - lib/pacemaker/pcmk_cluster_queries.c | 2 ++ - tools/crm_diff.c | 2 +- - tools/crm_resource.c | 33 ++++++++++++++++++++------------- - tools/crm_resource_ban.c | 2 +- - 4 files changed, 24 insertions(+), 15 deletions(-) - -diff --git a/lib/pacemaker/pcmk_cluster_queries.c b/lib/pacemaker/pcmk_cluster_queries.c -index c68cf9d..46e5538 100644 ---- a/lib/pacemaker/pcmk_cluster_queries.c -+++ b/lib/pacemaker/pcmk_cluster_queries.c -@@ -440,6 +440,7 @@ pcmk__list_nodes(pcmk__output_t *out, char *node_types, gboolean BASH_EXPORT) - } - rc = the_cib->cmds->signon(the_cib, crm_system_name, cib_command); - if (rc != pcmk_ok) { -+ cib_delete(the_cib); - return pcmk_legacy2rc(rc); - } - -@@ -488,6 +489,7 @@ pcmk__list_nodes(pcmk__output_t *out, char *node_types, gboolean BASH_EXPORT) - free_xml(xml_node); - } - the_cib->cmds->signoff(the_cib); -+ cib_delete(the_cib); - return pcmk_legacy2rc(rc); - } - -diff --git a/tools/crm_diff.c b/tools/crm_diff.c -index b37f0ea..9890c10 100644 ---- a/tools/crm_diff.c -+++ b/tools/crm_diff.c -@@ -383,5 +383,5 @@ done: - free_xml(object_2); - - pcmk__output_and_clear_error(error, NULL); -- return exit_code; -+ crm_exit(exit_code); - } -diff --git a/tools/crm_resource.c b/tools/crm_resource.c -index d8e140f..8ca90cb 100644 ---- a/tools/crm_resource.c -+++ b/tools/crm_resource.c -@@ -1081,6 +1081,8 @@ 
clear_constraints(pcmk__output_t *out, xmlNodePtr *cib_xml_copy) - g_set_error(&error, PCMK__RC_ERROR, rc, - "Could not get modified CIB: %s\n", pcmk_strerror(rc)); - g_list_free(before); -+ free_xml(*cib_xml_copy); -+ *cib_xml_copy = NULL; - return rc; - } - -@@ -1232,29 +1234,34 @@ populate_working_set(xmlNodePtr *cib_xml_copy) - - if (options.xml_file != NULL) { - *cib_xml_copy = filename2xml(options.xml_file); -+ if (*cib_xml_copy == NULL) { -+ rc = pcmk_rc_cib_corrupt; -+ } - } else { - rc = cib_conn->cmds->query(cib_conn, NULL, cib_xml_copy, cib_scope_local | cib_sync_call); - rc = pcmk_legacy2rc(rc); - } - -- if(rc != pcmk_rc_ok) { -- return rc; -+ if (rc == pcmk_rc_ok) { -+ data_set = pe_new_working_set(); -+ if (data_set == NULL) { -+ rc = ENOMEM; -+ } else { -+ pe__set_working_set_flags(data_set, -+ pe_flag_no_counts|pe_flag_no_compat); -+ data_set->priv = out; -+ rc = update_working_set_xml(data_set, cib_xml_copy); -+ } - } - -- /* Populate the working set instance */ -- data_set = pe_new_working_set(); -- if (data_set == NULL) { -- rc = ENOMEM; -+ if (rc != pcmk_rc_ok) { -+ free_xml(*cib_xml_copy); -+ *cib_xml_copy = NULL; - return rc; - } - -- pe__set_working_set_flags(data_set, pe_flag_no_counts|pe_flag_no_compat); -- data_set->priv = out; -- rc = update_working_set_xml(data_set, cib_xml_copy); -- if (rc == pcmk_rc_ok) { -- cluster_status(data_set); -- } -- return rc; -+ cluster_status(data_set); -+ return pcmk_rc_ok; - } - - static int -diff --git a/tools/crm_resource_ban.c b/tools/crm_resource_ban.c -index a297d49..2c4f48d 100644 ---- a/tools/crm_resource_ban.c -+++ b/tools/crm_resource_ban.c -@@ -292,7 +292,7 @@ resource_clear_node_in_location(const char *rsc_id, const char *host, cib_t * ci - rc = pcmk_legacy2rc(rc); - } - -- free(fragment); -+ free_xml(fragment); - return rc; - } - --- -1.8.3.1 - - -From a30ff4a87f291a0c9e03c4efb9c9046d2ac594f1 Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Tue, 27 Jul 2021 11:26:59 -0500 -Subject: [PATCH 
2/2] Fix: tools: avoid memory leaks in crm_mon - -could be significant in an interactive session - -regressions introduced in 2.0.4 and 2.0.5 ---- - lib/pengine/bundle.c | 3 ++- - lib/pengine/clone.c | 5 ++--- - lib/pengine/pe_output.c | 3 +++ - 3 files changed, 7 insertions(+), 4 deletions(-) - -diff --git a/lib/pengine/bundle.c b/lib/pengine/bundle.c -index 6ba786a..7e1d428 100644 ---- a/lib/pengine/bundle.c -+++ b/lib/pengine/bundle.c -@@ -1497,7 +1497,7 @@ pe__bundle_xml(pcmk__output_t *out, va_list args) - for (GList *gIter = bundle_data->replicas; gIter != NULL; - gIter = gIter->next) { - pe__bundle_replica_t *replica = gIter->data; -- char *id = pcmk__itoa(replica->offset); -+ char *id = NULL; - gboolean print_ip, print_child, print_ctnr, print_remote; - - CRM_ASSERT(replica); -@@ -1531,6 +1531,7 @@ pe__bundle_xml(pcmk__output_t *out, va_list args) - CRM_ASSERT(rc == pcmk_rc_ok); - } - -+ id = pcmk__itoa(replica->offset); - rc = pe__name_and_nvpairs_xml(out, true, "replica", 1, "id", id); - free(id); - CRM_ASSERT(rc == pcmk_rc_ok); -diff --git a/lib/pengine/clone.c b/lib/pengine/clone.c -index 6323692..ab91fd1 100644 ---- a/lib/pengine/clone.c -+++ b/lib/pengine/clone.c -@@ -807,10 +807,10 @@ pe__clone_html(pcmk__output_t *out, va_list args) - pcmk__add_word(&list_text, &list_text_len, host->details->uname); - active_instances++; - } -+ g_list_free(promoted_list); - - if (list_text != NULL) { - out->list_item(out, NULL, PROMOTED_INSTANCES ": [ %s ]", list_text); -- g_list_free(promoted_list); - free(list_text); - list_text = NULL; - list_text_len = 0; -@@ -828,6 +828,7 @@ pe__clone_html(pcmk__output_t *out, va_list args) - pcmk__add_word(&list_text, &list_text_len, host->details->uname); - active_instances++; - } -+ g_list_free(started_list); - - if (list_text != NULL) { - if (pcmk_is_set(rsc->flags, pe_rsc_promotable)) { -@@ -847,7 +848,6 @@ pe__clone_html(pcmk__output_t *out, va_list args) - out->list_item(out, NULL, "Started: [ %s ]", list_text); - } - -- 
g_list_free(started_list); - free(list_text); - list_text = NULL; - list_text_len = 0; -@@ -1048,10 +1048,10 @@ pe__clone_text(pcmk__output_t *out, va_list args) - pcmk__add_word(&list_text, &list_text_len, host->details->uname); - active_instances++; - } -+ g_list_free(promoted_list); - - if (list_text != NULL) { - out->list_item(out, PROMOTED_INSTANCES, "[ %s ]", list_text); -- g_list_free(promoted_list); - free(list_text); - list_text = NULL; - list_text_len = 0; -@@ -1069,6 +1069,7 @@ pe__clone_text(pcmk__output_t *out, va_list args) - pcmk__add_word(&list_text, &list_text_len, host->details->uname); - active_instances++; - } -+ g_list_free(started_list); - - if (list_text != NULL) { - if (pcmk_is_set(rsc->flags, pe_rsc_promotable)) { -@@ -1084,7 +1085,6 @@ pe__clone_text(pcmk__output_t *out, va_list args) - out->list_item(out, "Started", "[ %s ]", list_text); - } - -- g_list_free(started_list); - free(list_text); - list_text = NULL; - } -diff --git a/lib/pengine/pe_output.c b/lib/pengine/pe_output.c -index b8997c4..20bd1a9 100644 ---- a/lib/pengine/pe_output.c -+++ b/lib/pengine/pe_output.c -@@ -1410,6 +1410,8 @@ node_text(pcmk__output_t *out, va_list args) { - - out->end_list(out); - out->end_list(out); -+ -+ g_list_free(rscs); - } - - } else { -@@ -1739,6 +1741,7 @@ node_attribute_list(pcmk__output_t *out, va_list args) { - } - - if (!pcmk__str_in_list(only_node, node->details->uname)) { -+ g_list_free(attr_list); - continue; - } - --- -1.8.3.1 - diff --git a/SOURCES/013-probe-failures.patch b/SOURCES/013-probe-failures.patch new file mode 100644 index 0000000..c13867e --- /dev/null +++ b/SOURCES/013-probe-failures.patch @@ -0,0 +1,26 @@ +From 186d5a02fba919c455fd6eeb050b4be107f82159 Mon Sep 17 00:00:00 2001 +From: Chris Lumens +Date: Thu, 13 Jan 2022 17:02:47 -0500 +Subject: [PATCH] Low: scheduler: Use the old RC code to log maskable probe + failures. 
+ +--- + lib/pengine/unpack.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/lib/pengine/unpack.c b/lib/pengine/unpack.c +index 8a2d2a6d6d..b01f86257a 100644 +--- a/lib/pengine/unpack.c ++++ b/lib/pengine/unpack.c +@@ -3780,7 +3780,7 @@ unpack_rsc_op(pe_resource_t *rsc, pe_node_t *node, xmlNode *xml_op, + + if (maskable_probe_failure) { + crm_notice("Treating probe result '%s' for %s on %s as 'not running'", +- services_ocf_exitcode_str(rc), rsc->id, node->details->uname); ++ services_ocf_exitcode_str(old_rc), rsc->id, node->details->uname); + update_resource_state(rsc, node, xml_op, task, target_rc, *last_failure, + on_fail, data_set); + crm_xml_add(xml_op, XML_ATTR_UNAME, node->details->uname); +-- +2.27.0 + diff --git a/SOURCES/014-pcmk_delay_base.patch b/SOURCES/014-pcmk_delay_base.patch new file mode 100644 index 0000000..8aba265 --- /dev/null +++ b/SOURCES/014-pcmk_delay_base.patch @@ -0,0 +1,43 @@ +From 9d812b0401d4cedef53a3cc3653ec782a5c49e37 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Thu, 13 Jan 2022 10:42:02 -0600 +Subject: [PATCH] Doc: fencer: improve pcmk_delay_base meta-data + +Update its type, since its value can now be a node map as well as a string, +and add more detail to its description. 
+--- + daemons/fenced/pacemaker-fenced.c | 18 +++++++++++------- + 1 file changed, 11 insertions(+), 7 deletions(-) + +diff --git a/daemons/fenced/pacemaker-fenced.c b/daemons/fenced/pacemaker-fenced.c +index 1b954be5a4..12f331496c 100644 +--- a/daemons/fenced/pacemaker-fenced.c ++++ b/daemons/fenced/pacemaker-fenced.c +@@ -1548,13 +1548,17 @@ main(int argc, char **argv) + PCMK_STONITH_DELAY_BASE); + printf(" Enable a base delay for " + "fencing actions and specify base delay value.\n"); +- printf(" This prevents double fencing when " +- "different delays are configured on the nodes.\nUse this to " +- "enable a static delay for fencing actions.\nThe overall delay " +- "is derived from a random delay value adding this static delay " +- "so that the sum is kept below the maximum delay.\nSet to eg. " +- "node1:1s;node2:5 to set different value per node.\n"); +- printf(" \n"); ++ printf(" This enables a static delay for " ++ "fencing actions, which can help avoid \"death matches\" where " ++ "two nodes try to fence each other at the same time. If " ++ PCMK_STONITH_DELAY_MAX " is also used, a random delay will be " ++ "added such that the total delay is kept below that value.\n" ++ "This can be set to a single time value to apply to any node " ++ "targeted by this device (useful if a separate device is " ++ "configured for each target), or to a node map (for example, " ++ "\"node1:1s;node2:5\") to set a different value per target.\n" ++ " \n"); ++ printf(" \n"); + printf(" \n"); + + printf(" \n", +-- +2.27.0 + diff --git a/SOURCES/014-str-list.patch b/SOURCES/014-str-list.patch deleted file mode 100644 index e6993ab..0000000 --- a/SOURCES/014-str-list.patch +++ /dev/null @@ -1,465 +0,0 @@ -From 45813df3eb4c8ad8b1744fa5dd56af86ad0fb3dd Mon Sep 17 00:00:00 2001 -From: Chris Lumens -Date: Thu, 17 Jun 2021 16:07:55 -0400 -Subject: [PATCH] Refactor: libs: pcmk__str_in_list should support pcmk__str_* - flags. 
- ---- - include/crm/common/strings_internal.h | 2 +- - lib/common/strings.c | 34 +++++++++++++++++++++++---- - lib/fencing/st_output.c | 10 ++++---- - lib/pengine/bundle.c | 8 +++---- - lib/pengine/clone.c | 28 +++++++++++----------- - lib/pengine/group.c | 18 +++++++------- - lib/pengine/native.c | 4 ++-- - lib/pengine/pe_output.c | 22 ++++++++--------- - lib/pengine/utils.c | 6 ++--- - 9 files changed, 79 insertions(+), 53 deletions(-) - -diff --git a/include/crm/common/strings_internal.h b/include/crm/common/strings_internal.h -index 94982cb4e..687079814 100644 ---- a/include/crm/common/strings_internal.h -+++ b/include/crm/common/strings_internal.h -@@ -117,7 +117,7 @@ pcmk__intkey_table_remove(GHashTable *hash_table, int key) - return g_hash_table_remove(hash_table, GINT_TO_POINTER(key)); - } - --gboolean pcmk__str_in_list(GList *lst, const gchar *s); -+gboolean pcmk__str_in_list(GList *lst, const gchar *s, uint32_t flags); - - bool pcmk__strcase_any_of(const char *s, ...) G_GNUC_NULL_TERMINATED; - bool pcmk__str_any_of(const char *s, ...) G_GNUC_NULL_TERMINATED; -diff --git a/lib/common/strings.c b/lib/common/strings.c -index 3264db5b6..e1e98803b 100644 ---- a/lib/common/strings.c -+++ b/lib/common/strings.c -@@ -872,14 +872,30 @@ pcmk__parse_ll_range(const char *srcstring, long long *start, long long *end) - * Search \p lst for \p s, taking case into account. As a special case, - * if "*" is the only element of \p lst, the search is successful. - * -- * \param[in] lst List to search -- * \param[in] s String to search for -+ * Behavior can be changed with various flags: -+ * -+ * - pcmk__str_casei - By default, comparisons are done taking case into -+ * account. This flag makes comparisons case-insensitive. -+ * - pcmk__str_null_matches - If the input string is NULL, return TRUE. -+ * -+ * \note The special "*" matching rule takes precedence over flags. 
In -+ * particular, "*" will match a NULL input string even without -+ * pcmk__str_null_matches being specified. -+ * -+ * \note No matter what input string or flags are provided, an empty -+ * list will always return FALSE. -+ * -+ * \param[in] lst List to search -+ * \param[in] s String to search for -+ * \param[in] flags A bitfield of pcmk__str_flags to modify operation - * - * \return \c TRUE if \p s is in \p lst, or \c FALSE otherwise - */ - gboolean --pcmk__str_in_list(GList *lst, const gchar *s) -+pcmk__str_in_list(GList *lst, const gchar *s, uint32_t flags) - { -+ GCompareFunc fn; -+ - if (lst == NULL) { - return FALSE; - } -@@ -888,7 +904,17 @@ pcmk__str_in_list(GList *lst, const gchar *s) - return TRUE; - } - -- return g_list_find_custom(lst, s, (GCompareFunc) strcmp) != NULL; -+ if (s == NULL) { -+ return pcmk_is_set(flags, pcmk__str_null_matches); -+ } -+ -+ if (pcmk_is_set(flags, pcmk__str_casei)) { -+ fn = (GCompareFunc) strcasecmp; -+ } else { -+ fn = (GCompareFunc) strcmp; -+ } -+ -+ return g_list_find_custom(lst, s, fn) != NULL; - } - - static bool -diff --git a/lib/fencing/st_output.c b/lib/fencing/st_output.c -index 568ae46a8..e1ae8ac87 100644 ---- a/lib/fencing/st_output.c -+++ b/lib/fencing/st_output.c -@@ -47,7 +47,7 @@ stonith__failed_history(pcmk__output_t *out, va_list args) { - continue; - } - -- if (!pcmk__str_in_list(only_node, hp->target)) { -+ if (!pcmk__str_in_list(only_node, hp->target, pcmk__str_none)) { - continue; - } - -@@ -72,7 +72,7 @@ stonith__history(pcmk__output_t *out, va_list args) { - int rc = pcmk_rc_no_output; - - for (stonith_history_t *hp = history; hp; hp = hp->next) { -- if (!pcmk__str_in_list(only_node, hp->target)) { -+ if (!pcmk__str_in_list(only_node, hp->target, pcmk__str_none)) { - continue; - } - -@@ -101,7 +101,7 @@ stonith__full_history(pcmk__output_t *out, va_list args) { - int rc = pcmk_rc_no_output; - - for (stonith_history_t *hp = history; hp; hp = hp->next) { -- if (!pcmk__str_in_list(only_node, 
hp->target)) { -+ if (!pcmk__str_in_list(only_node, hp->target, pcmk__str_none)) { - continue; - } - -@@ -129,7 +129,7 @@ full_history_xml(pcmk__output_t *out, va_list args) { - - if (history_rc == 0) { - for (stonith_history_t *hp = history; hp; hp = hp->next) { -- if (!pcmk__str_in_list(only_node, hp->target)) { -+ if (!pcmk__str_in_list(only_node, hp->target, pcmk__str_none)) { - continue; - } - -@@ -218,7 +218,7 @@ stonith__pending_actions(pcmk__output_t *out, va_list args) { - int rc = pcmk_rc_no_output; - - for (stonith_history_t *hp = history; hp; hp = hp->next) { -- if (!pcmk__str_in_list(only_node, hp->target)) { -+ if (!pcmk__str_in_list(only_node, hp->target, pcmk__str_none)) { - continue; - } - -diff --git a/lib/pengine/bundle.c b/lib/pengine/bundle.c -index 9237392e4..6ba786ae6 100644 ---- a/lib/pengine/bundle.c -+++ b/lib/pengine/bundle.c -@@ -1492,7 +1492,7 @@ pe__bundle_xml(pcmk__output_t *out, va_list args) - return rc; - } - -- print_everything = pcmk__str_in_list(only_rsc, rsc->id); -+ print_everything = pcmk__str_in_list(only_rsc, rsc->id, pcmk__str_none); - - for (GList *gIter = bundle_data->replicas; gIter != NULL; - gIter = gIter->next) { -@@ -1614,7 +1614,7 @@ pe__bundle_html(pcmk__output_t *out, va_list args) - return rc; - } - -- print_everything = pcmk__str_in_list(only_rsc, rsc->id); -+ print_everything = pcmk__str_in_list(only_rsc, rsc->id, pcmk__str_none); - - for (GList *gIter = bundle_data->replicas; gIter != NULL; - gIter = gIter->next) { -@@ -1742,7 +1742,7 @@ pe__bundle_text(pcmk__output_t *out, va_list args) - return rc; - } - -- print_everything = pcmk__str_in_list(only_rsc, rsc->id); -+ print_everything = pcmk__str_in_list(only_rsc, rsc->id, pcmk__str_none); - - for (GList *gIter = bundle_data->replicas; gIter != NULL; - gIter = gIter->next) { -@@ -2044,7 +2044,7 @@ pe__bundle_is_filtered(pe_resource_t *rsc, GList *only_rsc, gboolean check_paren - gboolean passes = FALSE; - pe__bundle_variant_data_t *bundle_data = NULL; - -- if 
(pcmk__str_in_list(only_rsc, rsc_printable_id(rsc))) { -+ if (pcmk__str_in_list(only_rsc, rsc_printable_id(rsc), pcmk__str_none)) { - passes = TRUE; - } else { - get_bundle_variant_data(bundle_data, rsc); -diff --git a/lib/pengine/clone.c b/lib/pengine/clone.c -index 5662338f3..5a6bfa61f 100644 ---- a/lib/pengine/clone.c -+++ b/lib/pengine/clone.c -@@ -624,8 +624,8 @@ pe__clone_xml(pcmk__output_t *out, va_list args) - return rc; - } - -- print_everything = pcmk__str_in_list(only_rsc, rsc_printable_id(rsc)) || -- (strstr(rsc->id, ":") != NULL && pcmk__str_in_list(only_rsc, rsc->id)); -+ print_everything = pcmk__str_in_list(only_rsc, rsc_printable_id(rsc), pcmk__str_none) || -+ (strstr(rsc->id, ":") != NULL && pcmk__str_in_list(only_rsc, rsc->id, pcmk__str_none)); - - for (; gIter != NULL; gIter = gIter->next) { - pe_resource_t *child_rsc = (pe_resource_t *) gIter->data; -@@ -693,8 +693,8 @@ pe__clone_html(pcmk__output_t *out, va_list args) - return rc; - } - -- print_everything = pcmk__str_in_list(only_rsc, rsc_printable_id(rsc)) || -- (strstr(rsc->id, ":") != NULL && pcmk__str_in_list(only_rsc, rsc->id)); -+ print_everything = pcmk__str_in_list(only_rsc, rsc_printable_id(rsc), pcmk__str_none) || -+ (strstr(rsc->id, ":") != NULL && pcmk__str_in_list(only_rsc, rsc->id, pcmk__str_none)); - - out->begin_list(out, NULL, NULL, "Clone Set: %s [%s]%s%s%s%s", - rsc->id, ID(clone_data->xml_obj_child), -@@ -801,7 +801,7 @@ pe__clone_html(pcmk__output_t *out, va_list args) - for (gIter = promoted_list; gIter; gIter = gIter->next) { - pe_node_t *host = gIter->data; - -- if (!pcmk__str_in_list(only_node, host->details->uname)) { -+ if (!pcmk__str_in_list(only_node, host->details->uname, pcmk__str_none)) { - continue; - } - -@@ -822,7 +822,7 @@ pe__clone_html(pcmk__output_t *out, va_list args) - for (gIter = started_list; gIter; gIter = gIter->next) { - pe_node_t *host = gIter->data; - -- if (!pcmk__str_in_list(only_node, host->details->uname)) { -+ if 
(!pcmk__str_in_list(only_node, host->details->uname, pcmk__str_none)) { - continue; - } - -@@ -884,7 +884,7 @@ pe__clone_html(pcmk__output_t *out, va_list args) - pe_node_t *node = (pe_node_t *)nIter->data; - - if (pe_find_node(rsc->running_on, node->details->uname) == NULL && -- pcmk__str_in_list(only_node, node->details->uname)) { -+ pcmk__str_in_list(only_node, node->details->uname, pcmk__str_none)) { - pcmk__add_word(&stopped_list, &stopped_list_len, - node->details->uname); - } -@@ -933,8 +933,8 @@ pe__clone_text(pcmk__output_t *out, va_list args) - return rc; - } - -- print_everything = pcmk__str_in_list(only_rsc, rsc_printable_id(rsc)) || -- (strstr(rsc->id, ":") != NULL && pcmk__str_in_list(only_rsc, rsc->id)); -+ print_everything = pcmk__str_in_list(only_rsc, rsc_printable_id(rsc), pcmk__str_none) || -+ (strstr(rsc->id, ":") != NULL && pcmk__str_in_list(only_rsc, rsc->id, pcmk__str_none)); - - out->begin_list(out, NULL, NULL, "Clone Set: %s [%s]%s%s%s%s", - rsc->id, ID(clone_data->xml_obj_child), -@@ -1041,7 +1041,7 @@ pe__clone_text(pcmk__output_t *out, va_list args) - for (gIter = promoted_list; gIter; gIter = gIter->next) { - pe_node_t *host = gIter->data; - -- if (!pcmk__str_in_list(only_node, host->details->uname)) { -+ if (!pcmk__str_in_list(only_node, host->details->uname, pcmk__str_none)) { - continue; - } - -@@ -1062,7 +1062,7 @@ pe__clone_text(pcmk__output_t *out, va_list args) - for (gIter = started_list; gIter; gIter = gIter->next) { - pe_node_t *host = gIter->data; - -- if (!pcmk__str_in_list(only_node, host->details->uname)) { -+ if (!pcmk__str_in_list(only_node, host->details->uname, pcmk__str_none)) { - continue; - } - -@@ -1120,7 +1120,7 @@ pe__clone_text(pcmk__output_t *out, va_list args) - pe_node_t *node = (pe_node_t *)nIter->data; - - if (pe_find_node(rsc->running_on, node->details->uname) == NULL && -- pcmk__str_in_list(only_node, node->details->uname)) { -+ pcmk__str_in_list(only_node, node->details->uname, pcmk__str_none)) { - 
pcmk__add_word(&stopped_list, &stopped_list_len, - node->details->uname); - } -@@ -1220,11 +1220,11 @@ pe__clone_is_filtered(pe_resource_t *rsc, GList *only_rsc, gboolean check_parent - gboolean passes = FALSE; - clone_variant_data_t *clone_data = NULL; - -- if (pcmk__str_in_list(only_rsc, rsc_printable_id(rsc))) { -+ if (pcmk__str_in_list(only_rsc, rsc_printable_id(rsc), pcmk__str_none)) { - passes = TRUE; - } else { - get_clone_variant_data(clone_data, rsc); -- passes = pcmk__str_in_list(only_rsc, ID(clone_data->xml_obj_child)); -+ passes = pcmk__str_in_list(only_rsc, ID(clone_data->xml_obj_child), pcmk__str_none); - - if (!passes) { - for (GList *gIter = rsc->children; gIter != NULL; gIter = gIter->next) { -diff --git a/lib/pengine/group.c b/lib/pengine/group.c -index 23a72cff7..5f9aa83ce 100644 ---- a/lib/pengine/group.c -+++ b/lib/pengine/group.c -@@ -201,8 +201,8 @@ pe__group_xml(pcmk__output_t *out, va_list args) - return rc; - } - -- print_everything = pcmk__str_in_list(only_rsc, rsc_printable_id(rsc)) || -- (strstr(rsc->id, ":") != NULL && pcmk__str_in_list(only_rsc, rsc->id)); -+ print_everything = pcmk__str_in_list(only_rsc, rsc_printable_id(rsc), pcmk__str_none) || -+ (strstr(rsc->id, ":") != NULL && pcmk__str_in_list(only_rsc, rsc->id, pcmk__str_none)); - - for (; gIter != NULL; gIter = gIter->next) { - pe_resource_t *child_rsc = (pe_resource_t *) gIter->data; -@@ -248,8 +248,8 @@ pe__group_html(pcmk__output_t *out, va_list args) - return rc; - } - -- print_everything = pcmk__str_in_list(only_rsc, rsc_printable_id(rsc)) || -- (strstr(rsc->id, ":") != NULL && pcmk__str_in_list(only_rsc, rsc->id)); -+ print_everything = pcmk__str_in_list(only_rsc, rsc_printable_id(rsc), pcmk__str_none) || -+ (strstr(rsc->id, ":") != NULL && pcmk__str_in_list(only_rsc, rsc->id, pcmk__str_none)); - - if (options & pe_print_brief) { - GList *rscs = pe__filter_rsc_list(rsc->children, only_rsc); -@@ -303,8 +303,8 @@ pe__group_text(pcmk__output_t *out, va_list args) - return 
rc; - } - -- print_everything = pcmk__str_in_list(only_rsc, rsc_printable_id(rsc)) || -- (strstr(rsc->id, ":") != NULL && pcmk__str_in_list(only_rsc, rsc->id)); -+ print_everything = pcmk__str_in_list(only_rsc, rsc_printable_id(rsc), pcmk__str_none) || -+ (strstr(rsc->id, ":") != NULL && pcmk__str_in_list(only_rsc, rsc->id, pcmk__str_none)); - - if (options & pe_print_brief) { - GList *rscs = pe__filter_rsc_list(rsc->children, only_rsc); -@@ -387,11 +387,11 @@ pe__group_is_filtered(pe_resource_t *rsc, GList *only_rsc, gboolean check_parent - { - gboolean passes = FALSE; - -- if (check_parent && pcmk__str_in_list(only_rsc, rsc_printable_id(uber_parent(rsc)))) { -+ if (check_parent && pcmk__str_in_list(only_rsc, rsc_printable_id(uber_parent(rsc)), pcmk__str_none)) { - passes = TRUE; -- } else if (pcmk__str_in_list(only_rsc, rsc_printable_id(rsc))) { -+ } else if (pcmk__str_in_list(only_rsc, rsc_printable_id(rsc), pcmk__str_none)) { - passes = TRUE; -- } else if (strstr(rsc->id, ":") != NULL && pcmk__str_in_list(only_rsc, rsc->id)) { -+ } else if (strstr(rsc->id, ":") != NULL && pcmk__str_in_list(only_rsc, rsc->id, pcmk__str_none)) { - passes = TRUE; - } else { - for (GList *gIter = rsc->children; gIter != NULL; gIter = gIter->next) { -diff --git a/lib/pengine/native.c b/lib/pengine/native.c -index c2333d0d2..56054fc4a 100644 ---- a/lib/pengine/native.c -+++ b/lib/pengine/native.c -@@ -1338,8 +1338,8 @@ pe__rscs_brief_output(pcmk__output_t *out, GList *rsc_list, unsigned int show_op - gboolean - pe__native_is_filtered(pe_resource_t *rsc, GList *only_rsc, gboolean check_parent) - { -- if (pcmk__str_in_list(only_rsc, rsc_printable_id(rsc)) || -- pcmk__str_in_list(only_rsc, rsc->id)) { -+ if (pcmk__str_in_list(only_rsc, rsc_printable_id(rsc), pcmk__str_none) || -+ pcmk__str_in_list(only_rsc, rsc->id, pcmk__str_none)) { - return FALSE; - } else if (check_parent) { - pe_resource_t *up = uber_parent(rsc); -diff --git a/lib/pengine/pe_output.c b/lib/pengine/pe_output.c 
-index 727475735..a6dc4ade8 100644 ---- a/lib/pengine/pe_output.c -+++ b/lib/pengine/pe_output.c -@@ -670,8 +670,8 @@ ban_list(pcmk__output_t *out, va_list args) { - continue; - } - -- if (!pcmk__str_in_list(only_rsc, rsc_printable_id(location->rsc_lh)) && -- !pcmk__str_in_list(only_rsc, rsc_printable_id(uber_parent(location->rsc_lh)))) { -+ if (!pcmk__str_in_list(only_rsc, rsc_printable_id(location->rsc_lh), pcmk__str_none) && -+ !pcmk__str_in_list(only_rsc, rsc_printable_id(uber_parent(location->rsc_lh)), pcmk__str_none)) { - continue; - } - -@@ -1254,7 +1254,7 @@ failed_action_list(pcmk__output_t *out, va_list args) { - xml_op = pcmk__xml_next(xml_op)) { - char *rsc = NULL; - -- if (!pcmk__str_in_list(only_node, crm_element_value(xml_op, XML_ATTR_UNAME))) { -+ if (!pcmk__str_in_list(only_node, crm_element_value(xml_op, XML_ATTR_UNAME), pcmk__str_none)) { - continue; - } - -@@ -1263,7 +1263,7 @@ failed_action_list(pcmk__output_t *out, va_list args) { - continue; - } - -- if (!pcmk__str_in_list(only_rsc, rsc)) { -+ if (!pcmk__str_in_list(only_rsc, rsc, pcmk__str_none)) { - free(rsc); - continue; - } -@@ -1738,7 +1738,7 @@ node_attribute_list(pcmk__output_t *out, va_list args) { - continue; - } - -- if (!pcmk__str_in_list(only_node, node->details->uname)) { -+ if (!pcmk__str_in_list(only_node, node->details->uname, pcmk__str_none)) { - g_list_free(attr_list); - continue; - } -@@ -1835,8 +1835,8 @@ node_history_list(pcmk__output_t *out, va_list args) { - * For other resource types, is_filtered is okay. 
- */ - if (uber_parent(rsc)->variant == pe_group) { -- if (!pcmk__str_in_list(only_rsc, rsc_printable_id(rsc)) && -- !pcmk__str_in_list(only_rsc, rsc_printable_id(uber_parent(rsc)))) { -+ if (!pcmk__str_in_list(only_rsc, rsc_printable_id(rsc), pcmk__str_none) && -+ !pcmk__str_in_list(only_rsc, rsc_printable_id(uber_parent(rsc)), pcmk__str_none)) { - continue; - } - } else { -@@ -1899,7 +1899,7 @@ node_list_html(pcmk__output_t *out, va_list args) { - for (GList *gIter = nodes; gIter != NULL; gIter = gIter->next) { - pe_node_t *node = (pe_node_t *) gIter->data; - -- if (!pcmk__str_in_list(only_node, node->details->uname)) { -+ if (!pcmk__str_in_list(only_node, node->details->uname, pcmk__str_none)) { - continue; - } - -@@ -1940,7 +1940,7 @@ pe__node_list_text(pcmk__output_t *out, va_list args) { - const char *node_mode = NULL; - char *node_name = pe__node_display_name(node, print_clone_detail); - -- if (!pcmk__str_in_list(only_node, node->details->uname)) { -+ if (!pcmk__str_in_list(only_node, node->details->uname, pcmk__str_none)) { - free(node_name); - continue; - } -@@ -2059,7 +2059,7 @@ node_list_xml(pcmk__output_t *out, va_list args) { - for (GList *gIter = nodes; gIter != NULL; gIter = gIter->next) { - pe_node_t *node = (pe_node_t *) gIter->data; - -- if (!pcmk__str_in_list(only_node, node->details->uname)) { -+ if (!pcmk__str_in_list(only_node, node->details->uname, pcmk__str_none)) { - continue; - } - -@@ -2097,7 +2097,7 @@ node_summary(pcmk__output_t *out, va_list args) { - continue; - } - -- if (!pcmk__str_in_list(only_node, node->details->uname)) { -+ if (!pcmk__str_in_list(only_node, node->details->uname, pcmk__str_none)) { - continue; - } - -diff --git a/lib/pengine/utils.c b/lib/pengine/utils.c -index 450d8348c..d1be9e4ca 100644 ---- a/lib/pengine/utils.c -+++ b/lib/pengine/utils.c -@@ -2394,7 +2394,7 @@ pe__rsc_running_on_any_node_in_list(pe_resource_t *rsc, GList *node_list) - { - for (GList *ele = rsc->running_on; ele; ele = ele->next) { - pe_node_t 
*node = (pe_node_t *) ele->data; -- if (pcmk__str_in_list(node_list, node->details->uname)) { -+ if (pcmk__str_in_list(node_list, node->details->uname, pcmk__str_none)) { - return true; - } - } -@@ -2419,8 +2419,8 @@ pe__filter_rsc_list(GList *rscs, GList *filter) - /* I think the second condition is safe here for all callers of this - * function. If not, it needs to move into pe__node_text. - */ -- if (pcmk__str_in_list(filter, rsc_printable_id(rsc)) || -- (rsc->parent && pcmk__str_in_list(filter, rsc_printable_id(rsc->parent)))) { -+ if (pcmk__str_in_list(filter, rsc_printable_id(rsc), pcmk__str_none) || -+ (rsc->parent && pcmk__str_in_list(filter, rsc_printable_id(rsc->parent), pcmk__str_none))) { - retval = g_list_prepend(retval, rsc); - } - } --- -2.27.0 - diff --git a/SOURCES/015-fencing-reasons.patch b/SOURCES/015-fencing-reasons.patch new file mode 100644 index 0000000..c53b6c9 --- /dev/null +++ b/SOURCES/015-fencing-reasons.patch @@ -0,0 +1,1093 @@ +From 87365f49b1bee0baa536783865fbd835a9cacc97 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Thu, 2 Dec 2021 16:12:24 -0600 +Subject: [PATCH 01/11] Refactor: libstonithd: functionize getting notification + data XML + +Also, only get the data when needed. +--- + lib/fencing/st_client.c | 32 +++++++++++++++++++++++--------- + 1 file changed, 23 insertions(+), 9 deletions(-) + +diff --git a/lib/fencing/st_client.c b/lib/fencing/st_client.c +index 4823751267..72a0a49408 100644 +--- a/lib/fencing/st_client.c ++++ b/lib/fencing/st_client.c +@@ -1312,6 +1312,23 @@ stonith_dump_pending_callbacks(stonith_t * stonith) + return g_hash_table_foreach(private->stonith_op_callback_table, stonith_dump_pending_op, NULL); + } + ++/*! 
++ * \internal ++ * \brief Get the data section of a fencer notification ++ * ++ * \param[in] msg Notification XML ++ * \param[in] ntype Notification type ++ */ ++static xmlNode * ++get_event_data_xml(xmlNode *msg, const char *ntype) ++{ ++ char *data_addr = crm_strdup_printf("//%s", ntype); ++ xmlNode *data = get_xpath_object(data_addr, msg, LOG_DEBUG); ++ ++ free(data_addr); ++ return data; ++} ++ + /* + + +@@ -1336,17 +1353,18 @@ xml_to_event(xmlNode * msg) + { + stonith_event_t *event = calloc(1, sizeof(stonith_event_t)); + const char *ntype = crm_element_value(msg, F_SUBTYPE); +- char *data_addr = crm_strdup_printf("//%s", ntype); +- xmlNode *data = get_xpath_object(data_addr, msg, LOG_DEBUG); + + crm_log_xml_trace(msg, "stonith_notify"); + + crm_element_value_int(msg, F_STONITH_RC, &(event->result)); + + if (pcmk__str_eq(ntype, T_STONITH_NOTIFY_FENCE, pcmk__str_casei)) { +- event->operation = crm_element_value_copy(msg, F_STONITH_OPERATION); ++ xmlNode *data = get_event_data_xml(msg, ntype); + +- if (data) { ++ if (data == NULL) { ++ crm_err("No data for %s event", ntype); ++ crm_log_xml_notice(msg, "BadEvent"); ++ } else { + event->origin = crm_element_value_copy(data, F_STONITH_ORIGIN); + event->action = crm_element_value_copy(data, F_STONITH_ACTION); + event->target = crm_element_value_copy(data, F_STONITH_TARGET); +@@ -1354,14 +1372,10 @@ xml_to_event(xmlNode * msg) + event->id = crm_element_value_copy(data, F_STONITH_REMOTE_OP_ID); + event->client_origin = crm_element_value_copy(data, F_STONITH_CLIENTNAME); + event->device = crm_element_value_copy(data, F_STONITH_DEVICE); +- +- } else { +- crm_err("No data for %s event", ntype); +- crm_log_xml_notice(msg, "BadEvent"); + } ++ event->operation = crm_element_value_copy(msg, F_STONITH_OPERATION); + } + +- free(data_addr); + return event; + } + +-- +2.27.0 + + +From 448f86a029d5d7e3c255d813929003a8cc2cffba Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Fri, 19 Nov 2021 17:01:23 -0600 +Subject: [PATCH 
02/11] Refactor: fencing: parse full result from fencer + notifications + +stonith_event_t previously contained only the legacy return code for the +notification event. Use its new opaque member to store the full result, along +with accessors (available only internally for now). Nothing uses them yet. +--- + include/crm/fencing/internal.h | 5 +++ + lib/fencing/st_client.c | 68 ++++++++++++++++++++++++++++++++-- + 2 files changed, 70 insertions(+), 3 deletions(-) + +diff --git a/include/crm/fencing/internal.h b/include/crm/fencing/internal.h +index eff689e59b..acc16d05e9 100644 +--- a/include/crm/fencing/internal.h ++++ b/include/crm/fencing/internal.h +@@ -187,10 +187,15 @@ bool stonith__event_state_eq(stonith_history_t *history, void *user_data); + bool stonith__event_state_neq(stonith_history_t *history, void *user_data); + + int stonith__legacy2status(int rc); ++ + int stonith__exit_status(stonith_callback_data_t *data); + int stonith__execution_status(stonith_callback_data_t *data); + const char *stonith__exit_reason(stonith_callback_data_t *data); + ++int stonith__event_exit_status(stonith_event_t *event); ++int stonith__event_execution_status(stonith_event_t *event); ++const char *stonith__event_exit_reason(stonith_event_t *event); ++ + /*! + * \internal + * \brief Is a fencing operation in pending state? 
+diff --git a/lib/fencing/st_client.c b/lib/fencing/st_client.c +index 72a0a49408..f58b3a6745 100644 +--- a/lib/fencing/st_client.c ++++ b/lib/fencing/st_client.c +@@ -1349,15 +1349,23 @@ get_event_data_xml(xmlNode *msg, const char *ntype) + + */ + static stonith_event_t * +-xml_to_event(xmlNode * msg) ++xml_to_event(xmlNode *msg, pcmk__action_result_t *result) + { + stonith_event_t *event = calloc(1, sizeof(stonith_event_t)); + const char *ntype = crm_element_value(msg, F_SUBTYPE); + ++ CRM_ASSERT((event != NULL) && (result != NULL)); ++ + crm_log_xml_trace(msg, "stonith_notify"); + +- crm_element_value_int(msg, F_STONITH_RC, &(event->result)); ++ // All notification types have the operation result ++ event->opaque = result; ++ stonith__xe_get_result(msg, result); ++ ++ // @COMPAT The API originally provided the result as a legacy return code ++ event->result = pcmk_rc2legacy(stonith__result2rc(result)); + ++ // Fence notifications have additional information + if (pcmk__str_eq(ntype, T_STONITH_NOTIFY_FENCE, pcmk__str_casei)) { + xmlNode *data = get_event_data_xml(msg, ntype); + +@@ -1392,6 +1400,7 @@ event_free(stonith_event_t * event) + free(event->executioner); + free(event->device); + free(event->client_origin); ++ pcmk__reset_result((pcmk__action_result_t *) (event->opaque)); + free(event); + } + +@@ -1402,6 +1411,7 @@ stonith_send_notification(gpointer data, gpointer user_data) + stonith_notify_client_t *entry = data; + stonith_event_t *st_event = NULL; + const char *event = NULL; ++ pcmk__action_result_t result = PCMK__UNKNOWN_RESULT; + + if (blob->xml == NULL) { + crm_warn("Skipping callback - NULL message"); +@@ -1427,7 +1437,7 @@ stonith_send_notification(gpointer data, gpointer user_data) + return; + } + +- st_event = xml_to_event(blob->xml); ++ st_event = xml_to_event(blob->xml, &result); + + crm_trace("Invoking callback for %p/%s event...", entry, event); + entry->notify(blob->stonith, st_event); +@@ -2366,6 +2376,58 @@ 
stonith__exit_reason(stonith_callback_data_t *data) + return ((pcmk__action_result_t *) data->opaque)->exit_reason; + } + ++/*! ++ * \internal ++ * \brief Return the exit status from an event notification ++ * ++ * \param[in] event Event ++ * ++ * \return Exit status from event ++ */ ++int ++stonith__event_exit_status(stonith_event_t *event) ++{ ++ if ((event == NULL) || (event->opaque == NULL)) { ++ return CRM_EX_ERROR; ++ } ++ return ((pcmk__action_result_t *) event->opaque)->exit_status; ++} ++ ++/*! ++ * \internal ++ * \brief Return the execution status from an event notification ++ * ++ * \param[in] event Event ++ * ++ * \return Execution status from event ++ */ ++int ++stonith__event_execution_status(stonith_event_t *event) ++{ ++ if ((event == NULL) || (event->opaque == NULL)) { ++ return PCMK_EXEC_UNKNOWN; ++ } ++ return ((pcmk__action_result_t *) event->opaque)->execution_status; ++} ++ ++/*! ++ * \internal ++ * \brief Return the exit reason from an event notification ++ * ++ * \param[in] event Event ++ * ++ * \return Exit reason from event ++ */ ++const char * ++stonith__event_exit_reason(stonith_event_t *event) ++{ ++ if ((event == NULL) || (event->opaque == NULL)) { ++ return NULL; ++ } ++ return ((pcmk__action_result_t *) event->opaque)->exit_reason; ++} ++ ++ + // Deprecated functions kept only for backward API compatibility + // LCOV_EXCL_START + +-- +2.27.0 + + +From 8dab65e65fe760052d1151749a7bfb2203445813 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Fri, 19 Nov 2021 17:02:28 -0600 +Subject: [PATCH 03/11] Refactor: fencing: parse full result from synchronous + fencer replies + +stonith_send_command() now parses the full result from synchronous fencer +replies, and maps that to a legacy return code, rather than parse the legacy +return code directly. + +The full result is not used yet, and won't be until we can break backward API +compatibility, since the API functions that call stonith_send_command() +currently return a legacy code. 
+--- + lib/fencing/st_client.c | 8 +++++--- + 1 file changed, 5 insertions(+), 3 deletions(-) + +diff --git a/lib/fencing/st_client.c b/lib/fencing/st_client.c +index f58b3a6745..5fec7529e3 100644 +--- a/lib/fencing/st_client.c ++++ b/lib/fencing/st_client.c +@@ -1537,11 +1537,13 @@ stonith_send_command(stonith_t * stonith, const char *op, xmlNode * data, xmlNod + crm_element_value_int(op_reply, F_STONITH_CALLID, &reply_id); + + if (reply_id == stonith->call_id) { ++ pcmk__action_result_t result = PCMK__UNKNOWN_RESULT; ++ + crm_trace("Synchronous reply %d received", reply_id); + +- if (crm_element_value_int(op_reply, F_STONITH_RC, &rc) != 0) { +- rc = -ENOMSG; +- } ++ stonith__xe_get_result(op_reply, &result); ++ rc = pcmk_rc2legacy(stonith__result2rc(&result)); ++ pcmk__reset_result(&result); + + if ((call_options & st_opt_discard_reply) || output_data == NULL) { + crm_trace("Discarding reply"); +-- +2.27.0 + + +From 1beb319d8c62ab93b4c08b26a4e03151906c6189 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Mon, 6 Dec 2021 17:13:44 -0600 +Subject: [PATCH 04/11] Log: fencing: improve cts-fence-helper result logs + +Use the full result from the fencing event +--- + daemons/fenced/cts-fence-helper.c | 12 ++++++++---- + 1 file changed, 8 insertions(+), 4 deletions(-) + +diff --git a/daemons/fenced/cts-fence-helper.c b/daemons/fenced/cts-fence-helper.c +index e222a59f9f..858cddc9de 100644 +--- a/daemons/fenced/cts-fence-helper.c ++++ b/daemons/fenced/cts-fence-helper.c +@@ -125,10 +125,14 @@ st_callback(stonith_t * st, stonith_event_t * e) + crm_exit(CRM_EX_DISCONNECT); + } + +- crm_notice("Operation %s requested by %s %s for peer %s. %s reported: %s (ref=%s)", +- e->operation, e->origin, e->result == pcmk_ok ? "completed" : "failed", +- e->target, e->executioner ? e->executioner : "", +- pcmk_strerror(e->result), e->id); ++ crm_notice("Operation '%s' targeting %s by %s for %s: %s (exit=%d, ref=%s)", ++ ((e->operation == NULL)? 
"unknown" : e->operation), ++ ((e->target == NULL)? "no node" : e->target), ++ ((e->executioner == NULL)? "any node" : e->executioner), ++ ((e->origin == NULL)? "unknown client" : e->origin), ++ pcmk_exec_status_str(stonith__event_execution_status(e)), ++ stonith__event_exit_status(e), ++ ((e->id == NULL)? "none" : e->id)); + + if (expected_notifications) { + expected_notifications--; +-- +2.27.0 + + +From b26f701833ade5d7441fba317832d6e827bd16d0 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Tue, 14 Dec 2021 16:52:09 -0600 +Subject: [PATCH 05/11] Test: cts-fence-helper: update expected return code + +Before recent changes, libstonithd obtained the fence API's legacy result code +directly from the fencer's XML reply, meaning that the legacy code was the +result of the fencer's mapping of the full result (including the action stderr). + +After those changes, libstonithd now ignores the legacy code in the fencer's +reply, and instead maps the legacy code itself from the full result in the +fencer's reply. + +However, the fencer's reply does not have the action stderr, so failures that +mapped to -pcmk_err_generic on the server side now map to -ENODATA on the +client side. Update cts-fence-helper's expected return code to match (neither +code is particularly useful, so there wouldn't be much benefit from having the +fencer pass the action stderr with replies, which would be considerable +additional work). 
+--- + daemons/fenced/cts-fence-helper.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/daemons/fenced/cts-fence-helper.c b/daemons/fenced/cts-fence-helper.c +index 858cddc9de..e3113452ef 100644 +--- a/daemons/fenced/cts-fence-helper.c ++++ b/daemons/fenced/cts-fence-helper.c +@@ -207,10 +207,10 @@ run_fence_failure_test(void) + "Register device1 for failure test", 1, 0); + + single_test(st->cmds->fence(st, st_opts, "false_1_node2", "off", 3, 0), +- "Fence failure results off", 1, -pcmk_err_generic); ++ "Fence failure results off", 1, -ENODATA); + + single_test(st->cmds->fence(st, st_opts, "false_1_node2", "reboot", 3, 0), +- "Fence failure results reboot", 1, -pcmk_err_generic); ++ "Fence failure results reboot", 1, -ENODATA); + + single_test(st->cmds->remove_device(st, st_opts, "test-id1"), + "Remove device1 for failure test", 1, 0); +-- +2.27.0 + + +From 123429de229c2148e320c76530b95e6ba458b9f6 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Tue, 7 Dec 2021 10:28:48 -0600 +Subject: [PATCH 06/11] Low: controller: compare fencing targets + case-insensitively + +... since they are node names +--- + daemons/controld/controld_fencing.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/daemons/controld/controld_fencing.c b/daemons/controld/controld_fencing.c +index f8d2fc13f4..70e141dc28 100644 +--- a/daemons/controld/controld_fencing.c ++++ b/daemons/controld/controld_fencing.c +@@ -466,7 +466,7 @@ tengine_stonith_notify(stonith_t *st, stonith_event_t *st_event) + return; + + } else if ((st_event->result == pcmk_ok) +- && pcmk__str_eq(st_event->target, fsa_our_uname, pcmk__str_none)) { ++ && pcmk__str_eq(st_event->target, fsa_our_uname, pcmk__str_casei)) { + + /* We were notified of our own fencing. 
Most likely, either fencing was + * misconfigured, or fabric fencing that doesn't cut cluster +-- +2.27.0 + + +From 3a067b8e58b3aefb49b2af1c35d0ad28b2de8784 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Tue, 7 Dec 2021 10:37:56 -0600 +Subject: [PATCH 07/11] Refactor: controller: best practices for handling + fencing notifications + +Rename tengine_stonith_notify() to handle_fence_notification(), rename its +st_event argument to event, add a doxygen block, and use some new variables and +reformatting to make it easier to follow (and change later). +--- + daemons/controld/controld_fencing.c | 131 ++++++++++++++++------------ + 1 file changed, 75 insertions(+), 56 deletions(-) + +diff --git a/daemons/controld/controld_fencing.c b/daemons/controld/controld_fencing.c +index 70e141dc28..00626444da 100644 +--- a/daemons/controld/controld_fencing.c ++++ b/daemons/controld/controld_fencing.c +@@ -435,39 +435,59 @@ tengine_stonith_connection_destroy(stonith_t *st, stonith_event_t *e) + } + } + ++/*! 
++ * \internal ++ * \brief Handle an event notification from the fencing API ++ * ++ * \param[in] st Fencing API connection ++ * \param[in] event Fencing API event notification ++ */ + static void +-tengine_stonith_notify(stonith_t *st, stonith_event_t *st_event) ++handle_fence_notification(stonith_t *st, stonith_event_t *event) + { ++ bool succeeded = true; ++ const char *executioner = "the cluster"; ++ const char *client = "a client"; ++ + if (te_client_id == NULL) { + te_client_id = crm_strdup_printf("%s.%lu", crm_system_name, + (unsigned long) getpid()); + } + +- if (st_event == NULL) { ++ if (event == NULL) { + crm_err("Notify data not found"); + return; + } + +- crmd_alert_fencing_op(st_event); ++ if (event->executioner != NULL) { ++ executioner = event->executioner; ++ } ++ if (event->client_origin != NULL) { ++ client = event->client_origin; ++ } + +- if ((st_event->result == pcmk_ok) && pcmk__str_eq("on", st_event->action, pcmk__str_casei)) { +- crm_notice("%s was successfully unfenced by %s (at the request of %s)", +- st_event->target, +- st_event->executioner? st_event->executioner : "", +- st_event->origin); +- /* TODO: Hook up st_event->device */ +- return; ++ if (event->result != pcmk_ok) { ++ succeeded = false; ++ } + +- } else if (pcmk__str_eq("on", st_event->action, pcmk__str_casei)) { +- crm_err("Unfencing of %s by %s failed: %s (%d)", +- st_event->target, +- st_event->executioner? 
st_event->executioner : "", +- pcmk_strerror(st_event->result), st_event->result); +- return; ++ crmd_alert_fencing_op(event); + +- } else if ((st_event->result == pcmk_ok) +- && pcmk__str_eq(st_event->target, fsa_our_uname, pcmk__str_casei)) { ++ if (pcmk__str_eq("on", event->action, pcmk__str_none)) { ++ // Unfencing doesn't need special handling, just a log message ++ if (succeeded) { ++ crm_notice("%s was successfully unfenced by %s (at the request of %s)", ++ event->target, executioner, event->origin); ++ /* TODO: Hook up event->device */ ++ } else { ++ crm_err("Unfencing of %s by %s failed: %s (%d)", ++ event->target, executioner, ++ pcmk_strerror(st_event->result), st_event->result); ++ } ++ return; ++ } + ++ if (succeeded ++ && pcmk__str_eq(event->target, fsa_our_uname, pcmk__str_casei)) { + /* We were notified of our own fencing. Most likely, either fencing was + * misconfigured, or fabric fencing that doesn't cut cluster + * communication is in use. +@@ -478,44 +498,41 @@ tengine_stonith_notify(stonith_t *st, stonith_event_t *st_event) + * our subsequent election votes as "not part of our cluster". + */ + crm_crit("We were allegedly just fenced by %s for %s!", +- st_event->executioner? st_event->executioner : "the cluster", +- st_event->origin); /* Dumps blackbox if enabled */ ++ executioner, event->origin); // Dumps blackbox if enabled + if (fence_reaction_panic) { + pcmk__panic(__func__); + } else { + crm_exit(CRM_EX_FATAL); + } +- return; ++ return; // Should never get here + } + +- /* Update the count of stonith failures for this target, in case we become ++ /* Update the count of fencing failures for this target, in case we become + * DC later. The current DC has already updated its fail count in + * tengine_stonith_callback(). 
+ */ +- if (!AM_I_DC && pcmk__str_eq(st_event->operation, T_STONITH_NOTIFY_FENCE, pcmk__str_casei)) { +- if (st_event->result == pcmk_ok) { +- st_fail_count_reset(st_event->target); ++ if (!AM_I_DC ++ && pcmk__str_eq(event->operation, T_STONITH_NOTIFY_FENCE, ++ pcmk__str_casei)) { ++ ++ if (succeeded) { ++ st_fail_count_reset(event->target); + } else { +- st_fail_count_increment(st_event->target); ++ st_fail_count_increment(event->target); + } + } + + crm_notice("Peer %s was%s terminated (%s) by %s on behalf of %s: %s " + CRM_XS " initiator=%s ref=%s", +- st_event->target, st_event->result == pcmk_ok ? "" : " not", +- st_event->action, +- st_event->executioner ? st_event->executioner : "", +- (st_event->client_origin? st_event->client_origin : ""), +- pcmk_strerror(st_event->result), +- st_event->origin, st_event->id); +- +- if (st_event->result == pcmk_ok) { +- crm_node_t *peer = pcmk__search_known_node_cache(0, st_event->target, ++ event->target, (succeeded? "" : " not"), ++ event->action, executioner, client, ++ pcmk_strerror(event->result), ++ event->origin, event->id); ++ ++ if (succeeded) { ++ crm_node_t *peer = pcmk__search_known_node_cache(0, event->target, + CRM_GET_PEER_ANY); + const char *uuid = NULL; +- gboolean we_are_executioner = pcmk__str_eq(st_event->executioner, +- fsa_our_uname, +- pcmk__str_casei); + + if (peer == NULL) { + return; +@@ -523,10 +540,9 @@ tengine_stonith_notify(stonith_t *st, stonith_event_t *st_event) + + uuid = crm_peer_uuid(peer); + +- crm_trace("target=%s dc=%s", st_event->target, fsa_our_dc); +- if(AM_I_DC) { ++ if (AM_I_DC) { + /* The DC always sends updates */ +- send_stonith_update(NULL, st_event->target, uuid); ++ send_stonith_update(NULL, event->target, uuid); + + /* @TODO Ideally, at this point, we'd check whether the fenced node + * hosted any guest nodes, and call remote_node_down() for them. 
+@@ -536,31 +552,33 @@ tengine_stonith_notify(stonith_t *st, stonith_event_t *st_event) + * on the scheduler creating fence pseudo-events for the guests. + */ + +- if (st_event->client_origin +- && !pcmk__str_eq(st_event->client_origin, te_client_id, pcmk__str_casei)) { +- +- /* Abort the current transition graph if it wasn't us +- * that invoked stonith to fence someone ++ if (!pcmk__str_eq(client, te_client_id, pcmk__str_casei)) { ++ /* Abort the current transition if it wasn't the cluster that ++ * initiated fencing. + */ +- crm_info("External fencing operation from %s fenced %s", st_event->client_origin, st_event->target); +- abort_transition(INFINITY, tg_restart, "External Fencing Operation", NULL); ++ crm_info("External fencing operation from %s fenced %s", ++ client, event->target); ++ abort_transition(INFINITY, tg_restart, ++ "External Fencing Operation", NULL); + } + + /* Assume it was our leader if we don't currently have one */ +- } else if (pcmk__str_eq(fsa_our_dc, st_event->target, pcmk__str_null_matches | pcmk__str_casei) ++ } else if (pcmk__str_eq(fsa_our_dc, event->target, ++ pcmk__str_null_matches|pcmk__str_casei) + && !pcmk_is_set(peer->flags, crm_remote_node)) { + + crm_notice("Fencing target %s %s our leader", +- st_event->target, (fsa_our_dc? "was" : "may have been")); ++ event->target, (fsa_our_dc? 
"was" : "may have been")); + + /* Given the CIB resyncing that occurs around elections, + * have one node update the CIB now and, if the new DC is different, + * have them do so too after the election + */ +- if (we_are_executioner) { +- send_stonith_update(NULL, st_event->target, uuid); ++ if (pcmk__str_eq(event->executioner, fsa_our_uname, ++ pcmk__str_casei)) { ++ send_stonith_update(NULL, event->target, uuid); + } +- add_stonith_cleanup(st_event->target); ++ add_stonith_cleanup(event->target); + } + + /* If the target is a remote node, and we host its connection, +@@ -569,7 +587,7 @@ tengine_stonith_notify(stonith_t *st, stonith_event_t *st_event) + * so the failure might not otherwise be detected until the next poke. + */ + if (pcmk_is_set(peer->flags, crm_remote_node)) { +- remote_ra_fail(st_event->target); ++ remote_ra_fail(event->target); + } + + crmd_peer_down(peer, TRUE); +@@ -632,7 +650,7 @@ te_connect_stonith(gpointer user_data) + tengine_stonith_connection_destroy); + stonith_api->cmds->register_notification(stonith_api, + T_STONITH_NOTIFY_FENCE, +- tengine_stonith_notify); ++ handle_fence_notification); + stonith_api->cmds->register_notification(stonith_api, + T_STONITH_NOTIFY_HISTORY_SYNCED, + tengine_stonith_history_synced); +@@ -837,7 +855,8 @@ tengine_stonith_callback(stonith_t *stonith, stonith_callback_data_t *data) + } + + /* Increment the fail count now, so abort_for_stonith_failure() can +- * check it. Non-DC nodes will increment it in tengine_stonith_notify(). ++ * check it. Non-DC nodes will increment it in ++ * handle_fence_notification(). 
+ */ + st_fail_count_increment(target); + abort_for_stonith_failure(abort_action, target, NULL); +-- +2.27.0 + + +From 5ec9dcbbe1ee7f6252968f87d7df5a5ea17244fb Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Tue, 7 Dec 2021 10:40:21 -0600 +Subject: [PATCH 08/11] Log: controller: improve messages when handling fencing + notifications + +Now that the fencing API provides a full result including exit reasons with +fencing event notifications, make the controller logs more useful and +consistent. +--- + daemons/controld/controld_fencing.c | 34 ++++++++++++++++++++--------- + 1 file changed, 24 insertions(+), 10 deletions(-) + +diff --git a/daemons/controld/controld_fencing.c b/daemons/controld/controld_fencing.c +index 00626444da..0aa9ef083c 100644 +--- a/daemons/controld/controld_fencing.c ++++ b/daemons/controld/controld_fencing.c +@@ -448,6 +448,8 @@ handle_fence_notification(stonith_t *st, stonith_event_t *event) + bool succeeded = true; + const char *executioner = "the cluster"; + const char *client = "a client"; ++ const char *reason = NULL; ++ int exec_status; + + if (te_client_id == NULL) { + te_client_id = crm_strdup_printf("%s.%lu", crm_system_name, +@@ -466,22 +468,31 @@ handle_fence_notification(stonith_t *st, stonith_event_t *event) + client = event->client_origin; + } + +- if (event->result != pcmk_ok) { ++ exec_status = stonith__event_execution_status(event); ++ if ((stonith__event_exit_status(event) != CRM_EX_OK) ++ || (exec_status != PCMK_EXEC_DONE)) { + succeeded = false; ++ if (exec_status == PCMK_EXEC_DONE) { ++ exec_status = PCMK_EXEC_ERROR; ++ } + } ++ reason = stonith__event_exit_reason(event); + + crmd_alert_fencing_op(event); + + if (pcmk__str_eq("on", event->action, pcmk__str_none)) { + // Unfencing doesn't need special handling, just a log message + if (succeeded) { +- crm_notice("%s was successfully unfenced by %s (at the request of %s)", +- event->target, executioner, event->origin); ++ crm_notice("%s was unfenced by %s at the request of 
%s@%s", ++ event->target, executioner, client, event->origin); + /* TODO: Hook up event->device */ + } else { +- crm_err("Unfencing of %s by %s failed: %s (%d)", ++ crm_err("Unfencing of %s by %s failed (%s%s%s) with exit status %d", + event->target, executioner, +- pcmk_strerror(st_event->result), st_event->result); ++ pcmk_exec_status_str(exec_status), ++ ((reason == NULL)? "" : ": "), ++ ((reason == NULL)? "" : reason), ++ stonith__event_exit_status(event)); + } + return; + } +@@ -522,12 +533,15 @@ handle_fence_notification(stonith_t *st, stonith_event_t *event) + } + } + +- crm_notice("Peer %s was%s terminated (%s) by %s on behalf of %s: %s " +- CRM_XS " initiator=%s ref=%s", ++ crm_notice("Peer %s was%s terminated (%s) by %s on behalf of %s@%s: " ++ "%s%s%s%s " CRM_XS " event=%s", + event->target, (succeeded? "" : " not"), +- event->action, executioner, client, +- pcmk_strerror(event->result), +- event->origin, event->id); ++ event->action, executioner, client, event->origin, ++ (succeeded? "OK" : pcmk_exec_status_str(exec_status)), ++ ((reason == NULL)? "" : " ("), ++ ((reason == NULL)? "" : reason), ++ ((reason == NULL)? 
"" : ")"), ++ event->id); + + if (succeeded) { + crm_node_t *peer = pcmk__search_known_node_cache(0, event->target, +-- +2.27.0 + + +From fb484933ce7c8f3325300a9e01a114db1bbb5b70 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Tue, 7 Dec 2021 11:33:15 -0600 +Subject: [PATCH 09/11] Refactor: controller: move alert functions into own + source file + +--- + daemons/controld/Makefile.am | 1 + + daemons/controld/controld_alerts.c | 92 +++++++++++++++++++++++++ + daemons/controld/controld_execd_state.c | 75 -------------------- + 3 files changed, 93 insertions(+), 75 deletions(-) + create mode 100644 daemons/controld/controld_alerts.c + +diff --git a/daemons/controld/Makefile.am b/daemons/controld/Makefile.am +index db45bcba4a..0a29925c0b 100644 +--- a/daemons/controld/Makefile.am ++++ b/daemons/controld/Makefile.am +@@ -43,6 +43,7 @@ pacemaker_controld_LDADD = $(top_builddir)/lib/fencing/libstonithd.la \ + $(CLUSTERLIBS) + + pacemaker_controld_SOURCES = pacemaker-controld.c \ ++ controld_alerts.c \ + controld_attrd.c \ + controld_callbacks.c \ + controld_based.c \ +diff --git a/daemons/controld/controld_alerts.c b/daemons/controld/controld_alerts.c +new file mode 100644 +index 0000000000..bd92795cf0 +--- /dev/null ++++ b/daemons/controld/controld_alerts.c +@@ -0,0 +1,92 @@ ++/* ++ * Copyright 2012-2021 the Pacemaker project contributors ++ * ++ * The version control history for this file may have further details. ++ * ++ * This source code is licensed under the GNU General Public License version 2 ++ * or later (GPLv2+) WITHOUT ANY WARRANTY. 
++ */ ++ ++#include ++ ++#include ++#include ++ ++#include ++#include ++#include ++#include ++#include ++ ++#include ++ ++static GList *crmd_alert_list = NULL; ++ ++void ++crmd_unpack_alerts(xmlNode *alerts) ++{ ++ pe_free_alert_list(crmd_alert_list); ++ crmd_alert_list = pe_unpack_alerts(alerts); ++} ++ ++void ++crmd_alert_node_event(crm_node_t *node) ++{ ++ lrm_state_t *lrm_state; ++ ++ if (crmd_alert_list == NULL) { ++ return; ++ } ++ ++ lrm_state = lrm_state_find(fsa_our_uname); ++ if (lrm_state == NULL) { ++ return; ++ } ++ ++ lrmd_send_node_alert((lrmd_t *) lrm_state->conn, crmd_alert_list, ++ node->uname, node->id, node->state); ++} ++ ++void ++crmd_alert_fencing_op(stonith_event_t * e) ++{ ++ char *desc; ++ lrm_state_t *lrm_state; ++ ++ if (crmd_alert_list == NULL) { ++ return; ++ } ++ ++ lrm_state = lrm_state_find(fsa_our_uname); ++ if (lrm_state == NULL) { ++ return; ++ } ++ ++ desc = crm_strdup_printf("Operation %s of %s by %s for %s@%s: %s (ref=%s)", ++ e->action, e->target, ++ (e->executioner? 
e->executioner : ""), ++ e->client_origin, e->origin, ++ pcmk_strerror(e->result), e->id); ++ ++ lrmd_send_fencing_alert((lrmd_t *) lrm_state->conn, crmd_alert_list, ++ e->target, e->operation, desc, e->result); ++ free(desc); ++} ++ ++void ++crmd_alert_resource_op(const char *node, lrmd_event_data_t * op) ++{ ++ lrm_state_t *lrm_state; ++ ++ if (crmd_alert_list == NULL) { ++ return; ++ } ++ ++ lrm_state = lrm_state_find(fsa_our_uname); ++ if (lrm_state == NULL) { ++ return; ++ } ++ ++ lrmd_send_resource_alert((lrmd_t *) lrm_state->conn, crmd_alert_list, node, ++ op); ++} +diff --git a/daemons/controld/controld_execd_state.c b/daemons/controld/controld_execd_state.c +index 67c376a426..5dce6c6d59 100644 +--- a/daemons/controld/controld_execd_state.c ++++ b/daemons/controld/controld_execd_state.c +@@ -777,78 +777,3 @@ lrm_state_unregister_rsc(lrm_state_t * lrm_state, + */ + return ((lrmd_t *) lrm_state->conn)->cmds->unregister_rsc(lrm_state->conn, rsc_id, options); + } +- +-/* +- * Functions for sending alerts via local executor connection +- */ +- +-static GList *crmd_alert_list = NULL; +- +-void +-crmd_unpack_alerts(xmlNode *alerts) +-{ +- pe_free_alert_list(crmd_alert_list); +- crmd_alert_list = pe_unpack_alerts(alerts); +-} +- +-void +-crmd_alert_node_event(crm_node_t *node) +-{ +- lrm_state_t *lrm_state; +- +- if (crmd_alert_list == NULL) { +- return; +- } +- +- lrm_state = lrm_state_find(fsa_our_uname); +- if (lrm_state == NULL) { +- return; +- } +- +- lrmd_send_node_alert((lrmd_t *) lrm_state->conn, crmd_alert_list, +- node->uname, node->id, node->state); +-} +- +-void +-crmd_alert_fencing_op(stonith_event_t * e) +-{ +- char *desc; +- lrm_state_t *lrm_state; +- +- if (crmd_alert_list == NULL) { +- return; +- } +- +- lrm_state = lrm_state_find(fsa_our_uname); +- if (lrm_state == NULL) { +- return; +- } +- +- desc = crm_strdup_printf("Operation %s of %s by %s for %s@%s: %s (ref=%s)", +- e->action, e->target, +- (e->executioner? 
e->executioner : ""), +- e->client_origin, e->origin, +- pcmk_strerror(e->result), e->id); +- +- lrmd_send_fencing_alert((lrmd_t *) lrm_state->conn, crmd_alert_list, +- e->target, e->operation, desc, e->result); +- free(desc); +-} +- +-void +-crmd_alert_resource_op(const char *node, lrmd_event_data_t * op) +-{ +- lrm_state_t *lrm_state; +- +- if (crmd_alert_list == NULL) { +- return; +- } +- +- lrm_state = lrm_state_find(fsa_our_uname); +- if (lrm_state == NULL) { +- return; +- } +- +- lrmd_send_resource_alert((lrmd_t *) lrm_state->conn, crmd_alert_list, node, +- op); +-} +-- +2.27.0 + + +From 3d0b57406bcde6682623e9d62c8ee95878345eb1 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Tue, 7 Dec 2021 11:25:41 -0600 +Subject: [PATCH 10/11] Feature: controller,tools: improve description for + fencing alerts/traps + +This functionizes creating a description for fencing events, so it can be used +by both the controller for alerts and crm_mon for traps, for consistency. + +Now that we have the full result including exit reason, we can improve the +description, but the format is kept similar to before to minimize the change. + +The alert/trap also includes the legacy return code for the event, but we can't +change that now because lrmd_send_fencing_alert() and the alert/trap +environment variables are public API. 
+--- + daemons/controld/controld_alerts.c | 8 ++----- + include/crm/fencing/internal.h | 1 + + lib/fencing/st_client.c | 38 ++++++++++++++++++++++++++++++ + tools/crm_mon.c | 5 ++-- + 4 files changed, 43 insertions(+), 9 deletions(-) + +diff --git a/daemons/controld/controld_alerts.c b/daemons/controld/controld_alerts.c +index bd92795cf0..2e0a67dba2 100644 +--- a/daemons/controld/controld_alerts.c ++++ b/daemons/controld/controld_alerts.c +@@ -12,6 +12,7 @@ + #include + #include + ++#include + #include + #include + #include +@@ -62,12 +63,7 @@ crmd_alert_fencing_op(stonith_event_t * e) + return; + } + +- desc = crm_strdup_printf("Operation %s of %s by %s for %s@%s: %s (ref=%s)", +- e->action, e->target, +- (e->executioner? e->executioner : ""), +- e->client_origin, e->origin, +- pcmk_strerror(e->result), e->id); +- ++ desc = stonith__event_description(e); + lrmd_send_fencing_alert((lrmd_t *) lrm_state->conn, crmd_alert_list, + e->target, e->operation, desc, e->result); + free(desc); +diff --git a/include/crm/fencing/internal.h b/include/crm/fencing/internal.h +index acc16d05e9..d2b49f831a 100644 +--- a/include/crm/fencing/internal.h ++++ b/include/crm/fencing/internal.h +@@ -195,6 +195,7 @@ const char *stonith__exit_reason(stonith_callback_data_t *data); + int stonith__event_exit_status(stonith_event_t *event); + int stonith__event_execution_status(stonith_event_t *event); + const char *stonith__event_exit_reason(stonith_event_t *event); ++char *stonith__event_description(stonith_event_t *event); + + /*! + * \internal +diff --git a/lib/fencing/st_client.c b/lib/fencing/st_client.c +index 5fec7529e3..b1de912b2a 100644 +--- a/lib/fencing/st_client.c ++++ b/lib/fencing/st_client.c +@@ -2429,6 +2429,44 @@ stonith__event_exit_reason(stonith_event_t *event) + return ((pcmk__action_result_t *) event->opaque)->exit_reason; + } + ++/*! 
++ * \internal ++ * \brief Return a human-friendly description of a fencing event ++ * ++ * \param[in] event Event to describe ++ * ++ * \return Newly allocated string with description of \p event ++ * \note The caller is responsible for freeing the return value. ++ * This function asserts on memory errors and never returns NULL. ++ * \note This currently is useful only for events of type ++ * T_STONITH_NOTIFY_FENCE. ++ */ ++char * ++stonith__event_description(stonith_event_t *event) ++{ ++ const char *reason; ++ const char *status; ++ ++ if (stonith__event_execution_status(event) != PCMK_EXEC_DONE) { ++ status = pcmk_exec_status_str(stonith__event_execution_status(event)); ++ } else if (stonith__event_exit_status(event) != CRM_EX_OK) { ++ status = pcmk_exec_status_str(PCMK_EXEC_ERROR); ++ } else { ++ status = crm_exit_str(CRM_EX_OK); ++ } ++ reason = stonith__event_exit_reason(event); ++ ++ return crm_strdup_printf("Operation %s of %s by %s for %s@%s: %s%s%s%s (ref=%s)", ++ event->action, event->target, ++ (event->executioner? event->executioner : "the cluster"), ++ (event->client_origin? event->client_origin : "a client"), ++ event->origin, status, ++ ((reason == NULL)? "" : " ("), ++ ((reason == NULL)? "" : reason), ++ ((reason == NULL)? 
"" : ")"), ++ event->id); ++} ++ + + // Deprecated functions kept only for backward API compatibility + // LCOV_EXCL_START +diff --git a/tools/crm_mon.c b/tools/crm_mon.c +index a6c459aaf7..e7b4fe2847 100644 +--- a/tools/crm_mon.c ++++ b/tools/crm_mon.c +@@ -2237,9 +2237,8 @@ mon_st_callback_event(stonith_t * st, stonith_event_t * e) + /* disconnect cib as well and have everything reconnect */ + mon_cib_connection_destroy(NULL); + } else if (options.external_agent) { +- char *desc = crm_strdup_printf("Operation %s requested by %s for peer %s: %s (ref=%s)", +- e->operation, e->origin, e->target, pcmk_strerror(e->result), +- e->id); ++ char *desc = stonith__event_description(e); ++ + send_custom_trap(e->target, NULL, e->operation, pcmk_ok, e->result, 0, desc); + free(desc); + } +-- +2.27.0 + + +From 2fe03c2165680c717a1f6106c5150be7d117f1a5 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Fri, 14 Jan 2022 10:45:03 -0600 +Subject: [PATCH 11/11] Low: controller: compare case-sensitively where + appropriate + +--- + daemons/controld/controld_fencing.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/daemons/controld/controld_fencing.c b/daemons/controld/controld_fencing.c +index 0aa9ef083c..15954b2358 100644 +--- a/daemons/controld/controld_fencing.c ++++ b/daemons/controld/controld_fencing.c +@@ -1,5 +1,5 @@ + /* +- * Copyright 2004-2021 the Pacemaker project contributors ++ * Copyright 2004-2022 the Pacemaker project contributors + * + * The version control history for this file may have further details. 
+ * +@@ -524,7 +524,7 @@ handle_fence_notification(stonith_t *st, stonith_event_t *event) + */ + if (!AM_I_DC + && pcmk__str_eq(event->operation, T_STONITH_NOTIFY_FENCE, +- pcmk__str_casei)) { ++ pcmk__str_none)) { + + if (succeeded) { + st_fail_count_reset(event->target); +-- +2.27.0 + diff --git a/SOURCES/015-sbd.patch b/SOURCES/015-sbd.patch deleted file mode 100644 index 9f47c35..0000000 --- a/SOURCES/015-sbd.patch +++ /dev/null @@ -1,1312 +0,0 @@ -From b49f49576ef9d801a48ce7a01a78c72e65be7880 Mon Sep 17 00:00:00 2001 -From: Klaus Wenninger -Date: Fri, 30 Jul 2021 18:07:25 +0200 -Subject: [PATCH 1/3] Fix, Refactor: fenced: add return value to - get_agent_metadata - -Used to distinguish between empty metadata per design, -case of failed getting metadata that might succeed on a -retry and fatal failure. -Fixes as well regression that leads to endless retries getting -metadata for #watchdog - not superserious as it happens with -delays in between but still undesirable. ---- - daemons/fenced/fenced_commands.c | 92 +++++++++++++++++++------------- - 1 file changed, 55 insertions(+), 37 deletions(-) - -diff --git a/daemons/fenced/fenced_commands.c b/daemons/fenced/fenced_commands.c -index a778801b1..cd9968f1a 100644 ---- a/daemons/fenced/fenced_commands.c -+++ b/daemons/fenced/fenced_commands.c -@@ -69,7 +69,7 @@ static void stonith_send_reply(xmlNode * reply, int call_options, const char *re - static void search_devices_record_result(struct device_search_s *search, const char *device, - gboolean can_fence); - --static xmlNode * get_agent_metadata(const char *agent); -+static int get_agent_metadata(const char *agent, xmlNode **metadata); - static void read_action_metadata(stonith_device_t *device); - - typedef struct async_command_s { -@@ -323,19 +323,26 @@ fork_cb(GPid pid, gpointer user_data) - static int - get_agent_metadata_cb(gpointer data) { - stonith_device_t *device = data; -+ guint period_ms; - -- device->agent_metadata = get_agent_metadata(device->agent); 
-- if (device->agent_metadata) { -- read_action_metadata(device); -- stonith__device_parameter_flags(&(device->flags), device->id, -+ switch (get_agent_metadata(device->agent, &device->agent_metadata)) { -+ case pcmk_rc_ok: -+ if (device->agent_metadata) { -+ read_action_metadata(device); -+ stonith__device_parameter_flags(&(device->flags), device->id, - device->agent_metadata); -- return G_SOURCE_REMOVE; -- } else { -- guint period_ms = pcmk__mainloop_timer_get_period(device->timer); -- if (period_ms < 160 * 1000) { -- mainloop_timer_set_period(device->timer, 2 * period_ms); -- } -- return G_SOURCE_CONTINUE; -+ } -+ return G_SOURCE_REMOVE; -+ -+ case EAGAIN: -+ period_ms = pcmk__mainloop_timer_get_period(device->timer); -+ if (period_ms < 160 * 1000) { -+ mainloop_timer_set_period(device->timer, 2 * period_ms); -+ } -+ return G_SOURCE_CONTINUE; -+ -+ default: -+ return G_SOURCE_REMOVE; - } - } - -@@ -700,38 +707,41 @@ init_metadata_cache(void) { - } - } - --static xmlNode * --get_agent_metadata(const char *agent) -+int -+get_agent_metadata(const char *agent, xmlNode ** metadata) - { -- xmlNode *xml = NULL; - char *buffer = NULL; - -+ if (metadata == NULL) { -+ return EINVAL; -+ } -+ *metadata = NULL; -+ if (pcmk__str_eq(agent, STONITH_WATCHDOG_AGENT, pcmk__str_none)) { -+ return pcmk_rc_ok; -+ } - init_metadata_cache(); - buffer = g_hash_table_lookup(metadata_cache, agent); -- if(pcmk__str_eq(agent, STONITH_WATCHDOG_AGENT, pcmk__str_casei)) { -- return NULL; -- -- } else if(buffer == NULL) { -+ if (buffer == NULL) { - stonith_t *st = stonith_api_new(); - int rc; - - if (st == NULL) { - crm_warn("Could not get agent meta-data: " - "API memory allocation failed"); -- return NULL; -+ return EAGAIN; - } -- rc = st->cmds->metadata(st, st_opt_sync_call, agent, NULL, &buffer, 10); -+ rc = st->cmds->metadata(st, st_opt_sync_call, agent, -+ NULL, &buffer, 10); - stonith_api_delete(st); - if (rc || !buffer) { - crm_err("Could not retrieve metadata for fencing agent %s", 
agent); -- return NULL; -+ return EAGAIN; - } - g_hash_table_replace(metadata_cache, strdup(agent), buffer); - } - -- xml = string2xml(buffer); -- -- return xml; -+ *metadata = string2xml(buffer); -+ return pcmk_rc_ok; - } - - static gboolean -@@ -962,19 +972,27 @@ build_device_from_xml(xmlNode * msg) - g_list_free_full(device->targets, free); - device->targets = NULL; - } -- device->agent_metadata = get_agent_metadata(device->agent); -- if (device->agent_metadata) { -- read_action_metadata(device); -- stonith__device_parameter_flags(&(device->flags), device->id, -- device->agent_metadata); -- } else { -- if (device->timer == NULL) { -- device->timer = mainloop_timer_add("get_agent_metadata", 10 * 1000, -+ switch (get_agent_metadata(device->agent, &device->agent_metadata)) { -+ case pcmk_rc_ok: -+ if (device->agent_metadata) { -+ read_action_metadata(device); -+ stonith__device_parameter_flags(&(device->flags), device->id, -+ device->agent_metadata); -+ } -+ break; -+ -+ case EAGAIN: -+ if (device->timer == NULL) { -+ device->timer = mainloop_timer_add("get_agent_metadata", 10 * 1000, - TRUE, get_agent_metadata_cb, device); -- } -- if (!mainloop_timer_running(device->timer)) { -- mainloop_timer_start(device->timer); -- } -+ } -+ if (!mainloop_timer_running(device->timer)) { -+ mainloop_timer_start(device->timer); -+ } -+ break; -+ -+ default: -+ break; - } - - value = g_hash_table_lookup(device->params, "nodeid"); --- -2.27.0 - - -From 5dd1e4459335764e0adf5fa78d81c875ae2332e9 Mon Sep 17 00:00:00 2001 -From: Klaus Wenninger -Date: Fri, 30 Jul 2021 18:15:10 +0200 -Subject: [PATCH 2/3] feature: watchdog-fencing: allow restriction to certain - nodes - -Bump CRM_FEATURE_SET to 3.11.0 to encourage cluster being -fully upgraded to a version that supports the feature -before explicitly adding a watchdog-fence-device. 
---- - configure.ac | 1 + - daemons/controld/controld_control.c | 2 +- - daemons/controld/controld_fencing.c | 14 ++ - daemons/controld/controld_fencing.h | 1 + - daemons/fenced/Makefile.am | 2 +- - daemons/fenced/fence_watchdog.in | 283 ++++++++++++++++++++++++++++ - daemons/fenced/fenced_commands.c | 141 +++++++++++--- - daemons/fenced/fenced_remote.c | 71 ++++--- - daemons/fenced/pacemaker-fenced.c | 131 +++++++++---- - daemons/fenced/pacemaker-fenced.h | 5 +- - include/crm/crm.h | 2 +- - include/crm/fencing/internal.h | 8 +- - lib/fencing/st_client.c | 61 ++++++ - lib/lrmd/lrmd_client.c | 6 +- - rpm/pacemaker.spec.in | 3 + - 16 files changed, 635 insertions(+), 97 deletions(-) - create mode 100755 daemons/fenced/fence_watchdog.in - -diff --git a/configure.ac b/configure.ac -index 436100c81..013562e46 100644 ---- a/configure.ac -+++ b/configure.ac -@@ -1972,6 +1972,7 @@ CONFIG_FILES_EXEC([cts/cts-cli], - [cts/support/fence_dummy], - [cts/support/pacemaker-cts-dummyd], - [daemons/fenced/fence_legacy], -+ [daemons/fenced/fence_watchdog], - [doc/abi-check], - [extra/resources/ClusterMon], - [extra/resources/HealthSMART], -diff --git a/daemons/controld/controld_control.c b/daemons/controld/controld_control.c -index 45a70bb92..b5da6a46c 100644 ---- a/daemons/controld/controld_control.c -+++ b/daemons/controld/controld_control.c -@@ -615,7 +615,7 @@ static pcmk__cluster_option_t crmd_opts[] = { - }, - { - "stonith-watchdog-timeout", NULL, "time", NULL, -- "0", pcmk__valid_sbd_timeout, -+ "0", controld_verify_stonith_watchdog_timeout, - "How long to wait before we can assume nodes are safely down " - "when watchdog-based self-fencing via SBD is in use", - "If nonzero, along with `have-watchdog=true` automatically set by the " -diff --git a/daemons/controld/controld_fencing.c b/daemons/controld/controld_fencing.c -index 0fba6613b..6c2a6c550 100644 ---- a/daemons/controld/controld_fencing.c -+++ b/daemons/controld/controld_fencing.c -@@ -11,6 +11,7 @@ - #include - 
#include - #include -+#include - #include - - #include -@@ -886,6 +887,19 @@ te_fence_node(crm_graph_t *graph, crm_action_t *action) - return TRUE; - } - -+bool -+controld_verify_stonith_watchdog_timeout(const char *value) -+{ -+ gboolean rv = TRUE; -+ -+ if (stonith_api && (stonith_api->state != stonith_disconnected) && -+ stonith__watchdog_fencing_enabled_for_node_api(stonith_api, -+ fsa_our_uname)) { -+ rv = pcmk__valid_sbd_timeout(value); -+ } -+ return rv; -+} -+ - /* end stonith API client functions */ - - -diff --git a/daemons/controld/controld_fencing.h b/daemons/controld/controld_fencing.h -index d0ecc8234..ef68a0c83 100644 ---- a/daemons/controld/controld_fencing.h -+++ b/daemons/controld/controld_fencing.h -@@ -24,6 +24,7 @@ void update_stonith_max_attempts(const char* value); - void controld_trigger_fencer_connect(void); - void controld_disconnect_fencer(bool destroy); - gboolean te_fence_node(crm_graph_t *graph, crm_action_t *action); -+bool controld_verify_stonith_watchdog_timeout(const char *value); - - // stonith cleanup list - void add_stonith_cleanup(const char *target); -diff --git a/daemons/fenced/Makefile.am b/daemons/fenced/Makefile.am -index 43413e11d..2923d7c9b 100644 ---- a/daemons/fenced/Makefile.am -+++ b/daemons/fenced/Makefile.am -@@ -15,7 +15,7 @@ halibdir = $(CRM_DAEMON_DIR) - - halib_PROGRAMS = pacemaker-fenced cts-fence-helper - --sbin_SCRIPTS = fence_legacy -+sbin_SCRIPTS = fence_legacy fence_watchdog - - noinst_HEADERS = pacemaker-fenced.h - -diff --git a/daemons/fenced/fence_watchdog.in b/daemons/fenced/fence_watchdog.in -new file mode 100755 -index 000000000..c83304f1d ---- /dev/null -+++ b/daemons/fenced/fence_watchdog.in -@@ -0,0 +1,283 @@ -+#!@PYTHON@ -+"""Dummy watchdog fence agent for providing meta-data for the pacemaker internal agent -+""" -+ -+__copyright__ = "Copyright 2012-2021 the Pacemaker project contributors" -+__license__ = "GNU General Public License version 2 or later (GPLv2+) WITHOUT ANY WARRANTY" -+ -+import 
io -+import os -+import re -+import sys -+import atexit -+import getopt -+ -+SHORT_DESC = "Dummy watchdog fence agent" -+LONG_DESC = """fence_watchdog just provides -+meta-data - actual fencing is done by the pacemaker internal watchdog agent.""" -+ -+ALL_OPT = { -+ "version" : { -+ "getopt" : "V", -+ "longopt" : "version", -+ "help" : "-V, --version Display version information and exit", -+ "required" : "0", -+ "shortdesc" : "Display version information and exit", -+ "order" : 53 -+ }, -+ "help" : { -+ "getopt" : "h", -+ "longopt" : "help", -+ "help" : "-h, --help Display this help and exit", -+ "required" : "0", -+ "shortdesc" : "Display help and exit", -+ "order" : 54 -+ }, -+ "action" : { -+ "getopt" : "o:", -+ "longopt" : "action", -+ "help" : "-o, --action=[action] Action: metadata", -+ "required" : "1", -+ "shortdesc" : "Fencing Action", -+ "default" : "metadata", -+ "order" : 1 -+ }, -+ "nodename" : { -+ "getopt" : "N:", -+ "longopt" : "nodename", -+ "help" : "-N, --nodename Node name of fence victim (ignored)", -+ "required" : "0", -+ "shortdesc" : "Ignored", -+ "order" : 2 -+ }, -+ "plug" : { -+ "getopt" : "n:", -+ "longopt" : "plug", -+ "help" : "-n, --plug=[id] Physical plug number on device (ignored)", -+ "required" : "1", -+ "shortdesc" : "Ignored", -+ "order" : 4 -+ } -+} -+ -+ -+def agent(): -+ """ Return name this file was run as. """ -+ -+ return os.path.basename(sys.argv[0]) -+ -+ -+def fail_usage(message): -+ """ Print a usage message and exit. """ -+ -+ sys.exit("%s\nPlease use '-h' for usage" % message) -+ -+ -+def show_docs(options): -+ """ Handle informational options (display info and exit). 
""" -+ -+ device_opt = options["device_opt"] -+ -+ if "-h" in options: -+ usage(device_opt) -+ sys.exit(0) -+ -+ if "-o" in options and options["-o"].lower() == "metadata": -+ metadata(device_opt, options) -+ sys.exit(0) -+ -+ if "-V" in options: -+ print(AGENT_VERSION) -+ sys.exit(0) -+ -+ -+def sorted_options(avail_opt): -+ """ Return a list of all options, in their internally specified order. """ -+ -+ sorted_list = [(key, ALL_OPT[key]) for key in avail_opt] -+ sorted_list.sort(key=lambda x: x[1]["order"]) -+ return sorted_list -+ -+ -+def usage(avail_opt): -+ """ Print a usage message. """ -+ print(LONG_DESC) -+ print() -+ print("Usage:") -+ print("\t" + agent() + " [options]") -+ print("Options:") -+ -+ for dummy, value in sorted_options(avail_opt): -+ if len(value["help"]) != 0: -+ print(" " + value["help"]) -+ -+ -+def metadata(avail_opt, options): -+ """ Print agent metadata. """ -+ -+ print(""" -+ -+%s -+""" % (agent(), SHORT_DESC, LONG_DESC)) -+ -+ for option, dummy in sorted_options(avail_opt): -+ if "shortdesc" in ALL_OPT[option]: -+ print(' ') -+ -+ default = "" -+ default_name_arg = "-" + ALL_OPT[option]["getopt"][:-1] -+ default_name_no_arg = "-" + ALL_OPT[option]["getopt"] -+ -+ if "default" in ALL_OPT[option]: -+ default = 'default="%s"' % str(ALL_OPT[option]["default"]) -+ elif default_name_arg in options: -+ if options[default_name_arg]: -+ try: -+ default = 'default="%s"' % options[default_name_arg] -+ except TypeError: -+ ## @todo/@note: Currently there is no clean way how to handle lists -+ ## we can create a string from it but we can't set it on command line -+ default = 'default="%s"' % str(options[default_name_arg]) -+ elif default_name_no_arg in options: -+ default = 'default="true"' -+ -+ mixed = ALL_OPT[option]["help"] -+ ## split it between option and help text -+ res = re.compile(r"^(.*--\S+)\s+", re.IGNORECASE | re.S).search(mixed) -+ if None != res: -+ mixed = res.group(1) -+ mixed = mixed.replace("<", "<").replace(">", ">") -+ 
print(' ') -+ -+ if ALL_OPT[option]["getopt"].count(":") > 0: -+ print(' ') -+ else: -+ print(' ') -+ -+ print(' ' + ALL_OPT[option]["shortdesc"] + '') -+ print(' ') -+ -+ print(' \n ') -+ print(' ') -+ print(' ') -+ print(' ') -+ print(' ') -+ print(' ') -+ print(' ') -+ print(' ') -+ print('') -+ -+ -+def option_longopt(option): -+ """ Return the getopt-compatible long-option name of the given option. """ -+ -+ if ALL_OPT[option]["getopt"].endswith(":"): -+ return ALL_OPT[option]["longopt"] + "=" -+ else: -+ return ALL_OPT[option]["longopt"] -+ -+ -+def opts_from_command_line(argv, avail_opt): -+ """ Read options from command-line arguments. """ -+ -+ # Prepare list of options for getopt -+ getopt_string = "" -+ longopt_list = [] -+ for k in avail_opt: -+ if k in ALL_OPT: -+ getopt_string += ALL_OPT[k]["getopt"] -+ else: -+ fail_usage("Parse error: unknown option '" + k + "'") -+ -+ if k in ALL_OPT and "longopt" in ALL_OPT[k]: -+ longopt_list.append(option_longopt(k)) -+ -+ try: -+ opt, dummy = getopt.gnu_getopt(argv, getopt_string, longopt_list) -+ except getopt.GetoptError as error: -+ fail_usage("Parse error: " + error.msg) -+ -+ # Transform longopt to short one which are used in fencing agents -+ old_opt = opt -+ opt = {} -+ for old_option in dict(old_opt).keys(): -+ if old_option.startswith("--"): -+ for option in ALL_OPT.keys(): -+ if "longopt" in ALL_OPT[option] and "--" + ALL_OPT[option]["longopt"] == old_option: -+ opt["-" + ALL_OPT[option]["getopt"].rstrip(":")] = dict(old_opt)[old_option] -+ else: -+ opt[old_option] = dict(old_opt)[old_option] -+ -+ return opt -+ -+ -+def opts_from_stdin(avail_opt): -+ """ Read options from standard input. 
""" -+ -+ opt = {} -+ name = "" -+ for line in sys.stdin.readlines(): -+ line = line.strip() -+ if line.startswith("#") or (len(line) == 0): -+ continue -+ -+ (name, value) = (line + "=").split("=", 1) -+ value = value[:-1] -+ -+ if name not in avail_opt: -+ print("Parse error: Ignoring unknown option '%s'" % line, -+ file=sys.stderr) -+ continue -+ -+ if ALL_OPT[name]["getopt"].endswith(":"): -+ opt["-"+ALL_OPT[name]["getopt"].rstrip(":")] = value -+ elif value.lower() in ["1", "yes", "on", "true"]: -+ opt["-"+ALL_OPT[name]["getopt"]] = "1" -+ -+ return opt -+ -+ -+def process_input(avail_opt): -+ """ Set standard environment variables, and parse all options. """ -+ -+ # Set standard environment -+ os.putenv("LANG", "C") -+ os.putenv("LC_ALL", "C") -+ -+ # Read options from command line or standard input -+ if len(sys.argv) > 1: -+ return opts_from_command_line(sys.argv[1:], avail_opt) -+ else: -+ return opts_from_stdin(avail_opt) -+ -+ -+def atexit_handler(): -+ """ Close stdout on exit. """ -+ -+ try: -+ sys.stdout.close() -+ os.close(1) -+ except IOError: -+ sys.exit("%s failed to close standard output" % agent()) -+ -+ -+def main(): -+ """ Make it so! 
""" -+ -+ device_opt = ALL_OPT.keys() -+ -+ ## Defaults for fence agent -+ atexit.register(atexit_handler) -+ options = process_input(device_opt) -+ options["device_opt"] = device_opt -+ show_docs(options) -+ -+ print("Watchdog fencing may be initiated only by the cluster, not this agent.", -+ file=sys.stderr) -+ -+ sys.exit(1) -+ -+ -+if __name__ == "__main__": -+ main() -diff --git a/daemons/fenced/fenced_commands.c b/daemons/fenced/fenced_commands.c -index cd9968f1a..9470ea2c1 100644 ---- a/daemons/fenced/fenced_commands.c -+++ b/daemons/fenced/fenced_commands.c -@@ -397,15 +397,13 @@ stonith_device_execute(stonith_device_t * device) - return TRUE; - } - -- if(pcmk__str_eq(device->agent, STONITH_WATCHDOG_AGENT, pcmk__str_casei)) { -- if(pcmk__str_eq(cmd->action, "reboot", pcmk__str_casei)) { -- pcmk__panic(__func__); -- goto done; -- -- } else if(pcmk__str_eq(cmd->action, "off", pcmk__str_casei)) { -- pcmk__panic(__func__); -- goto done; -- -+ if (pcmk__str_any_of(device->agent, STONITH_WATCHDOG_AGENT, -+ STONITH_WATCHDOG_AGENT_INTERNAL, NULL)) { -+ if (pcmk__strcase_any_of(cmd->action, "reboot", "off", NULL)) { -+ if (node_does_watchdog_fencing(stonith_our_uname)) { -+ pcmk__panic(__func__); -+ goto done; -+ } - } else { - crm_info("Faking success for %s watchdog operation", cmd->action); - cmd->done_cb(0, 0, NULL, cmd); -@@ -716,7 +714,7 @@ get_agent_metadata(const char *agent, xmlNode ** metadata) - return EINVAL; - } - *metadata = NULL; -- if (pcmk__str_eq(agent, STONITH_WATCHDOG_AGENT, pcmk__str_none)) { -+ if (pcmk__str_eq(agent, STONITH_WATCHDOG_AGENT_INTERNAL, pcmk__str_none)) { - return pcmk_rc_ok; - } - init_metadata_cache(); -@@ -1050,24 +1048,6 @@ schedule_internal_command(const char *origin, - schedule_stonith_command(cmd, device); - } - --gboolean --string_in_list(GList *list, const char *item) --{ -- int lpc = 0; -- int max = g_list_length(list); -- -- for (lpc = 0; lpc < max; lpc++) { -- const char *value = g_list_nth_data(list, lpc); -- -- if 
(pcmk__str_eq(item, value, pcmk__str_casei)) { -- return TRUE; -- } else { -- crm_trace("%d: '%s' != '%s'", lpc, item, value); -- } -- } -- return FALSE; --} -- - static void - status_search_cb(GPid pid, int rc, const char *output, gpointer user_data) - { -@@ -1144,7 +1124,7 @@ dynamic_list_search_cb(GPid pid, int rc, const char *output, gpointer user_data) - if (!alias) { - alias = search->host; - } -- if (string_in_list(dev->targets, alias)) { -+ if (pcmk__str_in_list(dev->targets, alias, pcmk__str_casei)) { - can_fence = TRUE; - } - } -@@ -1215,9 +1195,62 @@ stonith_device_register(xmlNode * msg, const char **desc, gboolean from_cib) - stonith_device_t *dup = NULL; - stonith_device_t *device = build_device_from_xml(msg); - guint ndevices = 0; -+ int rv = pcmk_ok; - - CRM_CHECK(device != NULL, return -ENOMEM); - -+ /* do we have a watchdog-device? */ -+ if (pcmk__str_eq(device->id, STONITH_WATCHDOG_ID, pcmk__str_none) || -+ pcmk__str_any_of(device->agent, STONITH_WATCHDOG_AGENT, -+ STONITH_WATCHDOG_AGENT_INTERNAL, NULL)) do { -+ if (stonith_watchdog_timeout_ms <= 0) { -+ crm_err("Ignoring watchdog fence device without " -+ "stonith-watchdog-timeout set."); -+ rv = -ENODEV; -+ /* fall through to cleanup & return */ -+ } else if (!pcmk__str_any_of(device->agent, STONITH_WATCHDOG_AGENT, -+ STONITH_WATCHDOG_AGENT_INTERNAL, NULL)) { -+ crm_err("Ignoring watchdog fence device with unknown " -+ "agent '%s' unequal '" STONITH_WATCHDOG_AGENT "'.", -+ device->agent?device->agent:""); -+ rv = -ENODEV; -+ /* fall through to cleanup & return */ -+ } else if (!pcmk__str_eq(device->id, STONITH_WATCHDOG_ID, -+ pcmk__str_none)) { -+ crm_err("Ignoring watchdog fence device " -+ "named %s !='"STONITH_WATCHDOG_ID"'.", -+ device->id?device->id:""); -+ rv = -ENODEV; -+ /* fall through to cleanup & return */ -+ } else { -+ if (pcmk__str_eq(device->agent, STONITH_WATCHDOG_AGENT, -+ pcmk__str_none)) { -+ /* this either has an empty list or the targets -+ configured for watchdog-fencing 
-+ */ -+ g_list_free_full(stonith_watchdog_targets, free); -+ stonith_watchdog_targets = device->targets; -+ device->targets = NULL; -+ } -+ if (node_does_watchdog_fencing(stonith_our_uname)) { -+ g_list_free_full(device->targets, free); -+ device->targets = stonith__parse_targets(stonith_our_uname); -+ g_hash_table_replace(device->params, -+ strdup(PCMK_STONITH_HOST_LIST), -+ strdup(stonith_our_uname)); -+ /* proceed as with any other stonith-device */ -+ break; -+ } -+ -+ crm_debug("Skip registration of watchdog fence device on node not in host-list."); -+ /* cleanup and fall through to more cleanup and return */ -+ device->targets = NULL; -+ stonith_device_remove(device->id, from_cib); -+ } -+ free_device(device); -+ return rv; -+ } while (0); -+ - dup = device_has_duplicate(device); - if (dup) { - ndevices = g_hash_table_size(device_list); -@@ -1598,6 +1631,39 @@ stonith_level_remove(xmlNode *msg, char **desc) - * (CIB registration is not sufficient), because monitor should not be - * possible unless the device is "started" (API registered). 
- */ -+ -+static char * -+list_to_string(GList *list, const char *delim, gboolean terminate_with_delim) -+{ -+ int max = g_list_length(list); -+ size_t delim_len = delim?strlen(delim):0; -+ size_t alloc_size = 1 + (max?((max-1+(terminate_with_delim?1:0))*delim_len):0); -+ char *rv; -+ GList *gIter; -+ -+ for (gIter = list; gIter != NULL; gIter = gIter->next) { -+ const char *value = (const char *) gIter->data; -+ -+ alloc_size += strlen(value); -+ } -+ rv = calloc(alloc_size, sizeof(char)); -+ if (rv) { -+ char *pos = rv; -+ const char *lead_delim = ""; -+ -+ for (gIter = list; gIter != NULL; gIter = gIter->next) { -+ const char *value = (const char *) gIter->data; -+ -+ pos = &pos[sprintf(pos, "%s%s", lead_delim, value)]; -+ lead_delim = delim; -+ } -+ if (max && terminate_with_delim) { -+ sprintf(pos, "%s", delim); -+ } -+ } -+ return rv; -+} -+ - static int - stonith_device_action(xmlNode * msg, char **output) - { -@@ -1615,6 +1681,19 @@ stonith_device_action(xmlNode * msg, char **output) - return -EPROTO; - } - -+ if (pcmk__str_eq(id, STONITH_WATCHDOG_ID, pcmk__str_none)) { -+ if (stonith_watchdog_timeout_ms <= 0) { -+ return -ENODEV; -+ } else { -+ if (pcmk__str_eq(action, "list", pcmk__str_casei)) { -+ *output = list_to_string(stonith_watchdog_targets, "\n", TRUE); -+ return pcmk_ok; -+ } else if (pcmk__str_eq(action, "monitor", pcmk__str_casei)) { -+ return pcmk_ok; -+ } -+ } -+ } -+ - device = g_hash_table_lookup(device_list, id); - if ((device == NULL) - || (!device->api_registered && !strcmp(action, "monitor"))) { -@@ -1742,7 +1821,7 @@ can_fence_host_with_device(stonith_device_t * dev, struct device_search_s *searc - * Only use if all hosts on which the device can be active can always fence all listed hosts - */ - -- if (string_in_list(dev->targets, host)) { -+ if (pcmk__str_in_list(dev->targets, host, pcmk__str_casei)) { - can = TRUE; - } else if (g_hash_table_lookup(dev->params, PCMK_STONITH_HOST_MAP) - && g_hash_table_lookup(dev->aliases, host)) { -@@ 
-1763,7 +1842,7 @@ can_fence_host_with_device(stonith_device_t * dev, struct device_search_s *searc - return; - } - -- if (string_in_list(dev->targets, alias)) { -+ if (pcmk__str_in_list(dev->targets, alias, pcmk__str_casei)) { - can = TRUE; - } - -diff --git a/daemons/fenced/fenced_remote.c b/daemons/fenced/fenced_remote.c -index cf91acaed..224f2baba 100644 ---- a/daemons/fenced/fenced_remote.c -+++ b/daemons/fenced/fenced_remote.c -@@ -1522,6 +1522,25 @@ advance_topology_device_in_level(remote_fencing_op_t *op, const char *device, - } - } - -+static gboolean -+check_watchdog_fencing_and_wait(remote_fencing_op_t * op) -+{ -+ if (node_does_watchdog_fencing(op->target)) { -+ -+ crm_notice("Waiting %lds for %s to self-fence (%s) for " -+ "client %s " CRM_XS " id=%.8s", -+ (stonith_watchdog_timeout_ms / 1000), -+ op->target, op->action, op->client_name, op->id); -+ op->op_timer_one = g_timeout_add(stonith_watchdog_timeout_ms, -+ remote_op_watchdog_done, op); -+ return TRUE; -+ } else { -+ crm_debug("Skipping fallback to watchdog-fencing as %s is " -+ "not in host-list", op->target); -+ } -+ return FALSE; -+} -+ - void - call_remote_stonith(remote_fencing_op_t * op, st_query_result_t * peer, int rc) - { -@@ -1592,26 +1611,33 @@ call_remote_stonith(remote_fencing_op_t * op, st_query_result_t * peer, int rc) - g_source_remove(op->op_timer_one); - } - -- if(stonith_watchdog_timeout_ms > 0 && device && pcmk__str_eq(device, "watchdog", pcmk__str_casei)) { -- crm_notice("Waiting %lds for %s to self-fence (%s) for client %s " -- CRM_XS " id=%.8s", (stonith_watchdog_timeout_ms / 1000), -- op->target, op->action, op->client_name, op->id); -- op->op_timer_one = g_timeout_add(stonith_watchdog_timeout_ms, remote_op_watchdog_done, op); -- -- /* TODO check devices to verify watchdog will be in use */ -- } else if(stonith_watchdog_timeout_ms > 0 -- && pcmk__str_eq(peer->host, op->target, pcmk__str_casei) -- && !pcmk__str_eq(op->action, "on", pcmk__str_casei)) { -- crm_notice("Waiting 
%lds for %s to self-fence (%s) for client %s " -- CRM_XS " id=%.8s", (stonith_watchdog_timeout_ms / 1000), -- op->target, op->action, op->client_name, op->id); -- op->op_timer_one = g_timeout_add(stonith_watchdog_timeout_ms, remote_op_watchdog_done, op); -- -- } else { -+ if (!(stonith_watchdog_timeout_ms > 0 && ( -+ (pcmk__str_eq(device, STONITH_WATCHDOG_ID, -+ pcmk__str_none)) || -+ (pcmk__str_eq(peer->host, op->target, pcmk__str_casei) -+ && !pcmk__str_eq(op->action, "on", pcmk__str_casei))) && -+ check_watchdog_fencing_and_wait(op))) { -+ -+ /* Some thoughts about self-fencing cases reaching this point: -+ - Actually check in check_watchdog_fencing_and_wait -+ shouldn't fail if STONITH_WATCHDOG_ID is -+ chosen as fencing-device and it being present implies -+ watchdog-fencing is enabled anyway -+ - If watchdog-fencing is disabled either in general or for -+ a specific target - detected in check_watchdog_fencing_and_wait - -+ for some other kind of self-fencing we can't expect -+ a success answer but timeout is fine if the node doesn't -+ come back in between -+ - Delicate might be the case where we have watchdog-fencing -+ enabled for a node but the watchdog-fencing-device isn't -+ explicitly chosen for suicide. Local pe-execution in sbd -+ may detect the node as unclean and lead to timely suicide. -+ Otherwise the selection of stonith-watchdog-timeout at -+ least is questionable. -+ */ - op->op_timer_one = g_timeout_add((1000 * timeout_one), remote_op_timeout_one, op); - } - -- - send_cluster_message(crm_get_peer(0, peer->host), crm_msg_stonith_ng, remote_op, FALSE); - peer->tried = TRUE; - free_xml(remote_op); -@@ -1645,12 +1671,11 @@ call_remote_stonith(remote_fencing_op_t * op, st_query_result_t * peer, int rc) - * but we have all the expected replies, then no devices - * are available to execute the fencing operation. 
*/ - -- if(stonith_watchdog_timeout_ms && pcmk__str_eq(device, "watchdog", pcmk__str_null_matches | pcmk__str_casei)) { -- crm_notice("Waiting %lds for %s to self-fence (%s) for client %s " -- CRM_XS " id=%.8s", (stonith_watchdog_timeout_ms / 1000), -- op->target, op->action, op->client_name, op->id); -- op->op_timer_one = g_timeout_add(stonith_watchdog_timeout_ms, remote_op_watchdog_done, op); -- return; -+ if(stonith_watchdog_timeout_ms > 0 && pcmk__str_eq(device, -+ STONITH_WATCHDOG_ID, pcmk__str_null_matches)) { -+ if (check_watchdog_fencing_and_wait(op)) { -+ return; -+ } - } - - if (op->state == st_query) { -diff --git a/daemons/fenced/pacemaker-fenced.c b/daemons/fenced/pacemaker-fenced.c -index 39738d8be..7f8b427d9 100644 ---- a/daemons/fenced/pacemaker-fenced.c -+++ b/daemons/fenced/pacemaker-fenced.c -@@ -42,6 +42,7 @@ - - char *stonith_our_uname = NULL; - long stonith_watchdog_timeout_ms = 0; -+GList *stonith_watchdog_targets = NULL; - - static GMainLoop *mainloop = NULL; - -@@ -578,7 +579,44 @@ our_node_allowed_for(pe_resource_t *rsc) - } - - static void --watchdog_device_update(xmlNode *cib) -+watchdog_device_update(void) -+{ -+ if (stonith_watchdog_timeout_ms > 0) { -+ if (!g_hash_table_lookup(device_list, STONITH_WATCHDOG_ID) && -+ !stonith_watchdog_targets) { -+ /* getting here watchdog-fencing enabled, no device there yet -+ and reason isn't stonith_watchdog_targets preventing that -+ */ -+ int rc; -+ xmlNode *xml; -+ -+ xml = create_device_registration_xml( -+ STONITH_WATCHDOG_ID, -+ st_namespace_internal, -+ STONITH_WATCHDOG_AGENT, -+ NULL, /* stonith_device_register will add our -+ own name as PCMK_STONITH_HOST_LIST param -+ so we can skip that here -+ */ -+ NULL); -+ rc = stonith_device_register(xml, NULL, TRUE); -+ free_xml(xml); -+ if (rc != pcmk_ok) { -+ crm_crit("Cannot register watchdog pseudo fence agent"); -+ crm_exit(CRM_EX_FATAL); -+ } -+ } -+ -+ } else { -+ /* be silent if no device - todo parameter to stonith_device_remove */ -+ if 
(g_hash_table_lookup(device_list, STONITH_WATCHDOG_ID)) { -+ stonith_device_remove(STONITH_WATCHDOG_ID, TRUE); -+ } -+ } -+} -+ -+static void -+update_stonith_watchdog_timeout_ms(xmlNode *cib) - { - xmlNode *stonith_enabled_xml = NULL; - const char *stonith_enabled_s = NULL; -@@ -608,33 +646,7 @@ watchdog_device_update(xmlNode *cib) - } - } - -- if (timeout_ms != stonith_watchdog_timeout_ms) { -- crm_notice("New watchdog timeout %lds (was %lds)", timeout_ms/1000, stonith_watchdog_timeout_ms/1000); -- stonith_watchdog_timeout_ms = timeout_ms; -- -- if (stonith_watchdog_timeout_ms > 0) { -- int rc; -- xmlNode *xml; -- stonith_key_value_t *params = NULL; -- -- params = stonith_key_value_add(params, PCMK_STONITH_HOST_LIST, -- stonith_our_uname); -- -- xml = create_device_registration_xml("watchdog", st_namespace_internal, -- STONITH_WATCHDOG_AGENT, params, -- NULL); -- stonith_key_value_freeall(params, 1, 1); -- rc = stonith_device_register(xml, NULL, FALSE); -- free_xml(xml); -- if (rc != pcmk_ok) { -- crm_crit("Cannot register watchdog pseudo fence agent"); -- crm_exit(CRM_EX_FATAL); -- } -- -- } else { -- stonith_device_remove("watchdog", FALSE); -- } -- } -+ stonith_watchdog_timeout_ms = timeout_ms; - } - - /*! 
-@@ -677,6 +689,16 @@ static void cib_device_update(pe_resource_t *rsc, pe_working_set_t *data_set) - return; - } - -+ /* if watchdog-fencing is disabled handle any watchdog-fence -+ resource as if it was disabled -+ */ -+ if ((stonith_watchdog_timeout_ms <= 0) && -+ pcmk__str_eq(rsc->id, STONITH_WATCHDOG_ID, pcmk__str_none)) { -+ crm_info("Watchdog-fencing disabled thus handling " -+ "device %s as disabled", rsc->id); -+ return; -+ } -+ - /* Check whether our node is allowed for this resource (and its parent if in a group) */ - node = our_node_allowed_for(rsc); - if (rsc->parent && (rsc->parent->variant == pe_group)) { -@@ -772,6 +794,12 @@ cib_devices_update(void) - } - } - -+ /* have list repopulated if cib has a watchdog-fencing-resource -+ TODO: keep a cached list for queries happening while we are refreshing -+ */ -+ g_list_free_full(stonith_watchdog_targets, free); -+ stonith_watchdog_targets = NULL; -+ - for (gIter = fenced_data_set->resources; gIter != NULL; gIter = gIter->next) { - cib_device_update(gIter->data, fenced_data_set); - } -@@ -825,6 +853,8 @@ update_cib_stonith_devices_v2(const char *event, xmlNode * msg) - if (search != NULL) { - *search = 0; - stonith_device_remove(rsc_id, TRUE); -+ /* watchdog_device_update called afterwards -+ to fall back to implicit definition if needed */ - } else { - crm_warn("Ignoring malformed CIB update (resource deletion)"); - } -@@ -968,6 +998,24 @@ node_has_attr(const char *node, const char *name, const char *value) - return (match != NULL); - } - -+/*! 
-+ * \internal -+ * \brief Check whether a node does watchdog-fencing -+ * -+ * \param[in] node Name of node to check -+ * -+ * \return TRUE if node found in stonith_watchdog_targets -+ * or stonith_watchdog_targets is empty indicating -+ * all nodes are doing watchdog-fencing -+ */ -+gboolean -+node_does_watchdog_fencing(const char *node) -+{ -+ return ((stonith_watchdog_targets == NULL) || -+ pcmk__str_in_list(stonith_watchdog_targets, node, pcmk__str_casei)); -+} -+ -+ - static void - update_fencing_topology(const char *event, xmlNode * msg) - { -@@ -1073,6 +1121,8 @@ update_cib_cache_cb(const char *event, xmlNode * msg) - xmlNode *stonith_enabled_xml = NULL; - const char *stonith_enabled_s = NULL; - static gboolean stonith_enabled_saved = TRUE; -+ long timeout_ms_saved = stonith_watchdog_timeout_ms; -+ gboolean need_full_refresh = FALSE; - - if(!have_cib_devices) { - crm_trace("Skipping updates until we get a full dump"); -@@ -1127,6 +1177,7 @@ update_cib_cache_cb(const char *event, xmlNode * msg) - } - - pcmk__refresh_node_caches_from_cib(local_cib); -+ update_stonith_watchdog_timeout_ms(local_cib); - - stonith_enabled_xml = get_xpath_object("//nvpair[@name='stonith-enabled']", - local_cib, LOG_NEVER); -@@ -1134,23 +1185,30 @@ update_cib_cache_cb(const char *event, xmlNode * msg) - stonith_enabled_s = crm_element_value(stonith_enabled_xml, XML_NVPAIR_ATTR_VALUE); - } - -- watchdog_device_update(local_cib); -- - if (stonith_enabled_s && crm_is_true(stonith_enabled_s) == FALSE) { - crm_trace("Ignoring CIB updates while fencing is disabled"); - stonith_enabled_saved = FALSE; -- return; - - } else if (stonith_enabled_saved == FALSE) { - crm_info("Updating fencing device and topology lists " - "now that fencing is enabled"); - stonith_enabled_saved = TRUE; -- fencing_topology_init(); -- cib_devices_update(); -+ need_full_refresh = TRUE; - - } else { -- update_fencing_topology(event, msg); -- update_cib_stonith_devices(event, msg); -+ if (timeout_ms_saved != 
stonith_watchdog_timeout_ms) { -+ need_full_refresh = TRUE; -+ } else { -+ update_fencing_topology(event, msg); -+ update_cib_stonith_devices(event, msg); -+ watchdog_device_update(); -+ } -+ } -+ -+ if (need_full_refresh) { -+ fencing_topology_init(); -+ cib_devices_update(); -+ watchdog_device_update(); - } - } - -@@ -1162,10 +1220,11 @@ init_cib_cache_cb(xmlNode * msg, int call_id, int rc, xmlNode * output, void *us - local_cib = copy_xml(output); - - pcmk__refresh_node_caches_from_cib(local_cib); -+ update_stonith_watchdog_timeout_ms(local_cib); - - fencing_topology_init(); -- watchdog_device_update(local_cib); - cib_devices_update(); -+ watchdog_device_update(); - } - - static void -diff --git a/daemons/fenced/pacemaker-fenced.h b/daemons/fenced/pacemaker-fenced.h -index d330fda4d..14e085e98 100644 ---- a/daemons/fenced/pacemaker-fenced.h -+++ b/daemons/fenced/pacemaker-fenced.h -@@ -260,14 +260,15 @@ bool fencing_peer_active(crm_node_t *peer); - - int stonith_manual_ack(xmlNode * msg, remote_fencing_op_t * op); - --gboolean string_in_list(GList *list, const char *item); -- - gboolean node_has_attr(const char *node, const char *name, const char *value); - -+gboolean node_does_watchdog_fencing(const char *node); -+ - extern char *stonith_our_uname; - extern gboolean stand_alone; - extern GHashTable *device_list; - extern GHashTable *topology; - extern long stonith_watchdog_timeout_ms; -+extern GList *stonith_watchdog_targets; - - extern GHashTable *stonith_remote_op_list; -diff --git a/include/crm/crm.h b/include/crm/crm.h -index ee52c3630..7861c160e 100644 ---- a/include/crm/crm.h -+++ b/include/crm/crm.h -@@ -66,7 +66,7 @@ extern "C" { - * >=3.0.13: Fail counts include operation name and interval - * >=3.2.0: DC supports PCMK_LRM_OP_INVALID and PCMK_LRM_OP_NOT_CONNECTED - */ --# define CRM_FEATURE_SET "3.10.2" -+# define CRM_FEATURE_SET "3.11.0" - - /* Pacemaker's CPG protocols use fixed-width binary fields for the sender and - * recipient of a CPG message. 
This imposes an arbitrary limit on cluster node -diff --git a/include/crm/fencing/internal.h b/include/crm/fencing/internal.h -index 8bcb544d8..f222edba3 100644 ---- a/include/crm/fencing/internal.h -+++ b/include/crm/fencing/internal.h -@@ -164,7 +164,10 @@ void stonith__device_parameter_flags(uint32_t *device_flags, - # define STONITH_OP_LEVEL_ADD "st_level_add" - # define STONITH_OP_LEVEL_DEL "st_level_remove" - --# define STONITH_WATCHDOG_AGENT "#watchdog" -+# define STONITH_WATCHDOG_AGENT "fence_watchdog" -+/* Don't change 2 below as it would break rolling upgrade */ -+# define STONITH_WATCHDOG_AGENT_INTERNAL "#watchdog" -+# define STONITH_WATCHDOG_ID "watchdog" - - # ifdef HAVE_STONITH_STONITH_H - // utilities from st_lha.c -@@ -211,4 +214,7 @@ stonith__op_state_pending(enum op_state state) - return state != st_failed && state != st_done; - } - -+gboolean stonith__watchdog_fencing_enabled_for_node(const char *node); -+gboolean stonith__watchdog_fencing_enabled_for_node_api(stonith_t *st, const char *node); -+ - #endif -diff --git a/lib/fencing/st_client.c b/lib/fencing/st_client.c -index e285f51e2..0ff98157b 100644 ---- a/lib/fencing/st_client.c -+++ b/lib/fencing/st_client.c -@@ -195,6 +195,67 @@ stonith_get_namespace(const char *agent, const char *namespace_s) - return st_namespace_invalid; - } - -+gboolean -+stonith__watchdog_fencing_enabled_for_node_api(stonith_t *st, const char *node) -+{ -+ gboolean rv = FALSE; -+ stonith_t *stonith_api = st?st:stonith_api_new(); -+ char *list = NULL; -+ -+ if(stonith_api) { -+ if (stonith_api->state == stonith_disconnected) { -+ int rc = stonith_api->cmds->connect(stonith_api, "stonith-api", NULL); -+ -+ if (rc != pcmk_ok) { -+ crm_err("Failed connecting to Stonith-API for watchdog-fencing-query."); -+ } -+ } -+ -+ if (stonith_api->state != stonith_disconnected) { -+ /* caveat!!! 
-+ * this might fail when when stonithd is just updating the device-list -+ * probably something we should fix as well for other api-calls */ -+ int rc = stonith_api->cmds->list(stonith_api, st_opt_sync_call, STONITH_WATCHDOG_ID, &list, 0); -+ if ((rc != pcmk_ok) || (list == NULL)) { -+ /* due to the race described above it can happen that -+ * we drop in here - so as not to make remote nodes -+ * panic on that answer -+ */ -+ crm_warn("watchdog-fencing-query failed"); -+ } else if (list[0] == '\0') { -+ crm_warn("watchdog-fencing-query returned an empty list - any node"); -+ rv = TRUE; -+ } else { -+ GList *targets = stonith__parse_targets(list); -+ rv = pcmk__str_in_list(targets, node, pcmk__str_casei); -+ g_list_free_full(targets, free); -+ } -+ free(list); -+ if (!st) { -+ /* if we're provided the api we still might have done the -+ * connection - but let's assume the caller won't bother -+ */ -+ stonith_api->cmds->disconnect(stonith_api); -+ } -+ } -+ -+ if (!st) { -+ stonith_api_delete(stonith_api); -+ } -+ } else { -+ crm_err("Stonith-API for watchdog-fencing-query couldn't be created."); -+ } -+ crm_trace("Pacemaker assumes node %s %sto do watchdog-fencing.", -+ node, rv?"":"not "); -+ return rv; -+} -+ -+gboolean -+stonith__watchdog_fencing_enabled_for_node(const char *node) -+{ -+ return stonith__watchdog_fencing_enabled_for_node_api(NULL, node); -+} -+ - static void - log_action(stonith_action_t *action, pid_t pid) - { -diff --git a/lib/lrmd/lrmd_client.c b/lib/lrmd/lrmd_client.c -index 87d050ed1..bf4bceb42 100644 ---- a/lib/lrmd/lrmd_client.c -+++ b/lib/lrmd/lrmd_client.c -@@ -34,6 +34,7 @@ - #include - - #include -+#include - - #ifdef HAVE_GNUTLS_GNUTLS_H - # undef KEYFILE -@@ -934,7 +935,10 @@ lrmd__validate_remote_settings(lrmd_t *lrmd, GHashTable *hash) - crm_xml_add(data, F_LRMD_ORIGIN, __func__); - - value = g_hash_table_lookup(hash, "stonith-watchdog-timeout"); -- crm_xml_add(data, F_LRMD_WATCHDOG, value); -+ if ((value) && -+ 
(stonith__watchdog_fencing_enabled_for_node(native->remote_nodename))) { -+ crm_xml_add(data, F_LRMD_WATCHDOG, value); -+ } - - rc = lrmd_send_command(lrmd, LRMD_OP_CHECK, data, NULL, 0, 0, - (native->type == pcmk__client_ipc)); -diff --git a/rpm/pacemaker.spec.in b/rpm/pacemaker.spec.in -index 79e78ede9..f58357a77 100644 ---- a/rpm/pacemaker.spec.in -+++ b/rpm/pacemaker.spec.in -@@ -744,6 +744,7 @@ exit 0 - %doc %{_mandir}/man8/crm_attribute.* - %doc %{_mandir}/man8/crm_master.* - %doc %{_mandir}/man8/fence_legacy.* -+%doc %{_mandir}/man8/fence_watchdog.* - %doc %{_mandir}/man8/pacemakerd.* - - %doc %{_datadir}/pacemaker/alerts -@@ -796,6 +797,7 @@ exit 0 - %{_sbindir}/crm_simulate - %{_sbindir}/crm_report - %{_sbindir}/crm_ticket -+%{_sbindir}/fence_watchdog - %{_sbindir}/stonith_admin - # "dirname" is owned by -schemas, which is a prerequisite - %{_datadir}/pacemaker/report.collector -@@ -822,6 +824,7 @@ exit 0 - %exclude %{_mandir}/man8/crm_attribute.* - %exclude %{_mandir}/man8/crm_master.* - %exclude %{_mandir}/man8/fence_legacy.* -+%exclude %{_mandir}/man8/fence_watchdog.* - %exclude %{_mandir}/man8/pacemakerd.* - %exclude %{_mandir}/man8/pacemaker-remoted.* - --- -2.27.0 - - -From 53dd360f096e5f005e3221e8d44d82d3654b5172 Mon Sep 17 00:00:00 2001 -From: Klaus Wenninger -Date: Wed, 4 Aug 2021 15:57:23 +0200 -Subject: [PATCH 3/3] Fix: watchdog-fencing: Silence warning without node - restriction - ---- - lib/fencing/st_client.c | 1 - - 1 file changed, 1 deletion(-) - -diff --git a/lib/fencing/st_client.c b/lib/fencing/st_client.c -index 0ff98157b..14fa7b2a6 100644 ---- a/lib/fencing/st_client.c -+++ b/lib/fencing/st_client.c -@@ -223,7 +223,6 @@ stonith__watchdog_fencing_enabled_for_node_api(stonith_t *st, const char *node) - */ - crm_warn("watchdog-fencing-query failed"); - } else if (list[0] == '\0') { -- crm_warn("watchdog-fencing-query returned an empty list - any node"); - rv = TRUE; - } else { - GList *targets = stonith__parse_targets(list); --- -2.27.0 - 
diff --git a/SOURCES/016-cts.patch b/SOURCES/016-cts.patch deleted file mode 100644 index 195afc3..0000000 --- a/SOURCES/016-cts.patch +++ /dev/null @@ -1,59 +0,0 @@ -From b37391fef92548f31822f9df2a9b5fa2a61b4514 Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Wed, 23 Jun 2021 15:17:54 -0500 -Subject: [PATCH] Fix: CTS: handle longer Corosync token timeouts - -Previously, startall() would call cluster_stable() immediately after detecting -the "controller successfully started" message. If the Corosync token timeout is -small enough, this will be fine. However with a token timeout of more than -about 1 second, the controllers will not have formed a membership by this -point, causing cluster_stable() to think there are multiple partitions, and -wait for a DC to be elected in each one, when really they will unite into a -single partition in a short time, and only elect a single DC. - -Now, startall() waits until seeing that each node is a cluster member before -calling cluster_stable(). ---- - cts/lab/CTS.py.in | 3 ++- - cts/lab/patterns.py | 2 ++ - 2 files changed, 4 insertions(+), 1 deletion(-) - -diff --git a/cts/lab/CTS.py.in b/cts/lab/CTS.py.in -index abcb9d285..d9924437b 100644 ---- a/cts/lab/CTS.py.in -+++ b/cts/lab/CTS.py.in -@@ -628,9 +628,10 @@ class ClusterManager(UserDict): - watchpats = [ ] - watchpats.append(self.templates["Pat:DC_IDLE"]) - for node in nodelist: -- watchpats.append(self.templates["Pat:Local_started"] % node) - watchpats.append(self.templates["Pat:InfraUp"] % node) - watchpats.append(self.templates["Pat:PacemakerUp"] % node) -+ watchpats.append(self.templates["Pat:Local_started"] % node) -+ watchpats.append(self.templates["Pat:They_up"] % (nodelist[0], node)) - - # Start all the nodes - at about the same time... 
- watch = LogWatcher(self.Env["LogFileName"], watchpats, "fast-start", self.Env["DeadTime"]+10, hosts=self.Env["nodes"], kind=self.Env["LogWatcher"]) -diff --git a/cts/lab/patterns.py b/cts/lab/patterns.py -index e21a016ff..400fd3dc8 100644 ---- a/cts/lab/patterns.py -+++ b/cts/lab/patterns.py -@@ -61,6 +61,7 @@ class BasePatterns(object): - "Pat:We_stopped" : "%s\W.*OVERRIDE THIS PATTERN", - "Pat:They_stopped" : "%s\W.*LOST:.* %s ", - "Pat:They_dead" : "node %s.*: is dead", -+ "Pat:They_up" : "%s %s\W.*OVERRIDE THIS PATTERN", - "Pat:TransitionComplete" : "Transition status: Complete: complete", - - "Pat:Fencing_start" : r"Requesting peer fencing .* targeting %s", -@@ -130,6 +131,7 @@ class crm_corosync(BasePatterns): - "Pat:We_stopped" : "%s\W.*Unloading all Corosync service engines", - "Pat:They_stopped" : "%s\W.*pacemaker-controld.*Node %s(\[|\s).*state is now lost", - "Pat:They_dead" : "pacemaker-controld.*Node %s(\[|\s).*state is now lost", -+ "Pat:They_up" : "\W%s\W.*pacemaker-controld.*Node %s state is now member", - - "Pat:ChildExit" : r"\[[0-9]+\] exited with status [0-9]+ \(", - # "with signal 9" == pcmk_child_exit(), "$" == check_active_before_startup_processes() --- -2.27.0 - diff --git a/SOURCES/016-fencing-crash.patch b/SOURCES/016-fencing-crash.patch new file mode 100644 index 0000000..c514c64 --- /dev/null +++ b/SOURCES/016-fencing-crash.patch @@ -0,0 +1,56 @@ +From e330568504ec379ea42460d21a2e20b1652d9445 Mon Sep 17 00:00:00 2001 +From: Reid Wahl +Date: Fri, 14 Jan 2022 01:35:35 -0800 +Subject: [PATCH] Fix: fencing: Don't set stonith action to pending if fork + fails + +Currently, we set a stonith action to pending if +services_action_async_fork_notify() returns true. However, "true" means +that the svc_action should not be freed. This might be because the +svc_action forked successfully and is pending, or it might be because +the svc_action has already been freed. 
+ +In the case of stonith actions, if we fail to fork, the stonith_action_t +object stored in svc_action->cb_data gets freed by the done callback, +and services_action_async_fork_notify() returns true. If we try to set +the action to pending, it causes a segfault. + +This commit moves the "set to pending" step to the +stonith_action_async_forked() callback. We avoid the segfault and only +set it to pending if it's actually pending. + +A slight difference in ordering was required to achieve this. Now, the +action gets set to pending immediately before being added to the +mainloop, instead of immediately after. + +Signed-off-by: Reid Wahl +--- + lib/fencing/st_actions.c | 5 +++-- + 1 file changed, 3 insertions(+), 2 deletions(-) + +diff --git a/lib/fencing/st_actions.c b/lib/fencing/st_actions.c +index e4e43225cd..306001af69 100644 +--- a/lib/fencing/st_actions.c ++++ b/lib/fencing/st_actions.c +@@ -550,6 +550,9 @@ stonith_action_async_forked(svc_action_t *svc_action) + (action->fork_cb) (svc_action->pid, action->userdata); + } + ++ pcmk__set_result(&(action->result), PCMK_OCF_UNKNOWN, PCMK_EXEC_PENDING, ++ NULL); ++ + crm_trace("Child process %d performing action '%s' successfully forked", + action->pid, action->action); + } +@@ -619,8 +622,6 @@ internal_stonith_action_execute(stonith_action_t * action) + if (services_action_async_fork_notify(svc_action, + &stonith_action_async_done, + &stonith_action_async_forked)) { +- pcmk__set_result(&(action->result), PCMK_OCF_UNKNOWN, +- PCMK_EXEC_PENDING, NULL); + return pcmk_ok; + } + +-- +2.27.0 + diff --git a/SOURCES/017-fencing-reasons.patch b/SOURCES/017-fencing-reasons.patch new file mode 100644 index 0000000..1e100ec --- /dev/null +++ b/SOURCES/017-fencing-reasons.patch @@ -0,0 +1,875 @@ +From 523f62eb235836a01ea039c23ada261a494f7b32 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Wed, 10 Nov 2021 15:22:47 -0600 +Subject: [PATCH 01/11] Feature: libpacemaker: improve result for high-level + fencing API + +Previously, 
pcmk__fencing_action()'s helpers for asynchronous fencing actions +initialized the result to a generic error, and then overrode that only on +success. + +Now, set a detailed result for early failures, and use the full result when +available from the fencing API. + +A standard return code is still returned to callers at this point. +--- + lib/pacemaker/pcmk_fence.c | 31 ++++++++++++++++++------------- + 1 file changed, 18 insertions(+), 13 deletions(-) + +diff --git a/lib/pacemaker/pcmk_fence.c b/lib/pacemaker/pcmk_fence.c +index 7d6acd0de6..125e1b268b 100644 +--- a/lib/pacemaker/pcmk_fence.c ++++ b/lib/pacemaker/pcmk_fence.c +@@ -32,8 +32,8 @@ static struct { + unsigned int timeout; + unsigned int tolerance; + int delay; +- int rc; +-} async_fence_data; ++ pcmk__action_result_t result; ++} async_fence_data = { NULL, }; + + static int + handle_level(stonith_t *st, char *target, int fence_level, +@@ -76,14 +76,13 @@ handle_level(stonith_t *st, char *target, int fence_level, + static void + notify_callback(stonith_t * st, stonith_event_t * e) + { +- if (e->result != pcmk_ok) { +- return; +- } ++ if (pcmk__str_eq(async_fence_data.target, e->target, pcmk__str_casei) ++ && pcmk__str_eq(async_fence_data.action, e->action, pcmk__str_casei)) { + +- if (pcmk__str_eq(async_fence_data.target, e->target, pcmk__str_casei) && +- pcmk__str_eq(async_fence_data.action, e->action, pcmk__str_casei)) { +- +- async_fence_data.rc = e->result; ++ pcmk__set_result(&async_fence_data.result, ++ stonith__event_exit_status(e), ++ stonith__event_execution_status(e), ++ stonith__event_exit_reason(e)); + g_main_loop_quit(mainloop); + } + } +@@ -91,8 +90,9 @@ notify_callback(stonith_t * st, stonith_event_t * e) + static void + fence_callback(stonith_t * stonith, stonith_callback_data_t * data) + { +- async_fence_data.rc = data->rc; +- ++ pcmk__set_result(&async_fence_data.result, stonith__exit_status(data), ++ stonith__execution_status(data), ++ stonith__exit_reason(data)); + 
g_main_loop_quit(mainloop); + } + +@@ -106,6 +106,8 @@ async_fence_helper(gpointer user_data) + if (rc != pcmk_ok) { + fprintf(stderr, "Could not connect to fencer: %s\n", pcmk_strerror(rc)); + g_main_loop_quit(mainloop); ++ pcmk__set_result(&async_fence_data.result, CRM_EX_ERROR, ++ PCMK_EXEC_NOT_CONNECTED, NULL); + return TRUE; + } + +@@ -121,6 +123,8 @@ async_fence_helper(gpointer user_data) + + if (call_id < 0) { + g_main_loop_quit(mainloop); ++ pcmk__set_result(&async_fence_data.result, CRM_EX_ERROR, ++ PCMK_EXEC_ERROR, pcmk_strerror(call_id)); + return TRUE; + } + +@@ -146,7 +150,8 @@ pcmk__fence_action(stonith_t *st, const char *target, const char *action, + async_fence_data.timeout = timeout; + async_fence_data.tolerance = tolerance; + async_fence_data.delay = delay; +- async_fence_data.rc = pcmk_err_generic; ++ pcmk__set_result(&async_fence_data.result, CRM_EX_ERROR, PCMK_EXEC_UNKNOWN, ++ NULL); + + trig = mainloop_add_trigger(G_PRIORITY_HIGH, async_fence_helper, NULL); + mainloop_set_trigger(trig); +@@ -156,7 +161,7 @@ pcmk__fence_action(stonith_t *st, const char *target, const char *action, + + free(async_fence_data.name); + +- return pcmk_legacy2rc(async_fence_data.rc); ++ return stonith__result2rc(&async_fence_data.result); + } + + #ifdef BUILD_PUBLIC_LIBPACEMAKER +-- +2.27.0 + + +From 008868fae5d1b0d6d8dc61f7acfb3856801ddd52 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Fri, 10 Dec 2021 15:36:10 -0600 +Subject: [PATCH 02/11] Refactor: libpacemaker: add exit reason to high-level + fencing API + +Nothing uses it as of this commit +--- + include/pacemaker.h | 5 ++++- + include/pcmki/pcmki_fence.h | 5 ++++- + lib/pacemaker/pcmk_fence.c | 10 +++++++--- + tools/stonith_admin.c | 6 +++--- + 4 files changed, 18 insertions(+), 8 deletions(-) + +diff --git a/include/pacemaker.h b/include/pacemaker.h +index a8523c969e..0daa4c5945 100644 +--- a/include/pacemaker.h ++++ b/include/pacemaker.h +@@ -189,12 +189,15 @@ int pcmk_list_nodes(xmlNodePtr *xml, char 
*node_types); + * again. + * \param[in] delay Apply a fencing delay. Value -1 means disable also any + * static/random fencing delays from pcmk_delay_base/max. ++ * \param[out] reason If not NULL, where to put descriptive failure reason + * + * \return Standard Pacemaker return code ++ * \note If \p reason is not NULL, the caller is responsible for freeing its ++ * returned value. + */ + int pcmk_fence_action(stonith_t *st, const char *target, const char *action, + const char *name, unsigned int timeout, unsigned int tolerance, +- int delay); ++ int delay, char **reason); + + /*! + * \brief List the fencing operations that have occurred for a specific node. +diff --git a/include/pcmki/pcmki_fence.h b/include/pcmki/pcmki_fence.h +index d4cef68f5c..c3da0361d7 100644 +--- a/include/pcmki/pcmki_fence.h ++++ b/include/pcmki/pcmki_fence.h +@@ -28,12 +28,15 @@ + * again. + * \param[in] delay Apply a fencing delay. Value -1 means disable also any + * static/random fencing delays from pcmk_delay_base/max ++ * \param[out] reason If not NULL, where to put descriptive failure reason + * + * \return Standard Pacemaker return code ++ * \note If \p reason is not NULL, the caller is responsible for freeing its ++ * returned value. + */ + int pcmk__fence_action(stonith_t *st, const char *target, const char *action, + const char *name, unsigned int timeout, unsigned int tolerance, +- int delay); ++ int delay, char **reason); + + /*! + * \brief List the fencing operations that have occurred for a specific node. 
+diff --git a/lib/pacemaker/pcmk_fence.c b/lib/pacemaker/pcmk_fence.c +index 125e1b268b..dbf084fb6b 100644 +--- a/lib/pacemaker/pcmk_fence.c ++++ b/lib/pacemaker/pcmk_fence.c +@@ -139,7 +139,7 @@ async_fence_helper(gpointer user_data) + int + pcmk__fence_action(stonith_t *st, const char *target, const char *action, + const char *name, unsigned int timeout, unsigned int tolerance, +- int delay) ++ int delay, char **reason) + { + crm_trigger_t *trig; + +@@ -161,6 +161,9 @@ pcmk__fence_action(stonith_t *st, const char *target, const char *action, + + free(async_fence_data.name); + ++ if ((reason != NULL) && (async_fence_data.result.exit_reason != NULL)) { ++ *reason = strdup(async_fence_data.result.exit_reason); ++ } + return stonith__result2rc(&async_fence_data.result); + } + +@@ -168,9 +171,10 @@ pcmk__fence_action(stonith_t *st, const char *target, const char *action, + int + pcmk_fence_action(stonith_t *st, const char *target, const char *action, + const char *name, unsigned int timeout, unsigned int tolerance, +- int delay) ++ int delay, char **reason) + { +- return pcmk__fence_action(st, target, action, name, timeout, tolerance, delay); ++ return pcmk__fence_action(st, target, action, name, timeout, tolerance, ++ delay, reason); + } + #endif + +diff --git a/tools/stonith_admin.c b/tools/stonith_admin.c +index 2d48326e1b..fdc7c46d49 100644 +--- a/tools/stonith_admin.c ++++ b/tools/stonith_admin.c +@@ -571,17 +571,17 @@ main(int argc, char **argv) + + case 'B': + rc = pcmk__fence_action(st, target, "reboot", name, options.timeout*1000, +- options.tolerance*1000, options.delay); ++ options.tolerance*1000, options.delay, NULL); + break; + + case 'F': + rc = pcmk__fence_action(st, target, "off", name, options.timeout*1000, +- options.tolerance*1000, options.delay); ++ options.tolerance*1000, options.delay, NULL); + break; + + case 'U': + rc = pcmk__fence_action(st, target, "on", name, options.timeout*1000, +- options.tolerance*1000, options.delay); ++ 
options.tolerance*1000, options.delay, NULL); + break; + + case 'h': +-- +2.27.0 + + +From 7570510f9985ba75ef73fb824f28109e135ace0a Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Fri, 10 Dec 2021 15:40:48 -0600 +Subject: [PATCH 03/11] Refactor: libpacemaker: rename high-level fencing API + +Rename pcmk_fence_action() to pcmk_request_fencing(), and its internal +equivalent pcmk__fence_action() to pcmk__request_fencing(). The change is +backward-compatible because pcmk_fence_action() has not been exposed publicly +yet. + +"Fence action" can be easily confused with libcrmservice actions, liblrmd +actions, libstonithd actions, scheduler actions, and so forth. + +Also, the new name makes it clearer that the caller is requesting that the +cluster perform fencing, and not directly performing fencing. +--- + include/pacemaker.h | 20 ++++++++++---------- + include/pcmki/pcmki_fence.h | 16 ++++++++-------- + lib/pacemaker/pcmk_fence.c | 16 ++++++++-------- + tools/stonith_admin.c | 18 ++++++++++++------ + 4 files changed, 38 insertions(+), 32 deletions(-) + +diff --git a/include/pacemaker.h b/include/pacemaker.h +index 0daa4c5945..e581f975a9 100644 +--- a/include/pacemaker.h ++++ b/include/pacemaker.h +@@ -177,27 +177,27 @@ int pcmk_list_nodes(xmlNodePtr *xml, char *node_types); + #ifdef BUILD_PUBLIC_LIBPACEMAKER + + /*! +- * \brief Perform a STONITH action. ++ * \brief Ask the cluster to perform fencing + * +- * \param[in] st A connection to the STONITH API. +- * \param[in] target The node receiving the action. +- * \param[in] action The action to perform. ++ * \param[in] st A connection to the fencer API ++ * \param[in] target The node that should be fenced ++ * \param[in] action The fencing action (on, off, reboot) to perform + * \param[in] name Who requested the fence action? +- * \param[in] timeout How long to wait for the operation to complete (in ms). 
++ * \param[in] timeout How long to wait for the operation to complete (in ms) + * \param[in] tolerance If a successful action for \p target happened within + * this many ms, return 0 without performing the action +- * again. ++ * again + * \param[in] delay Apply a fencing delay. Value -1 means disable also any +- * static/random fencing delays from pcmk_delay_base/max. ++ * static/random fencing delays from pcmk_delay_base/max + * \param[out] reason If not NULL, where to put descriptive failure reason + * + * \return Standard Pacemaker return code + * \note If \p reason is not NULL, the caller is responsible for freeing its + * returned value. + */ +-int pcmk_fence_action(stonith_t *st, const char *target, const char *action, +- const char *name, unsigned int timeout, unsigned int tolerance, +- int delay, char **reason); ++int pcmk_request_fencing(stonith_t *st, const char *target, const char *action, ++ const char *name, unsigned int timeout, ++ unsigned int tolerance, int delay, char **reason); + + /*! + * \brief List the fencing operations that have occurred for a specific node. +diff --git a/include/pcmki/pcmki_fence.h b/include/pcmki/pcmki_fence.h +index c3da0361d7..e3a7e27264 100644 +--- a/include/pcmki/pcmki_fence.h ++++ b/include/pcmki/pcmki_fence.h +@@ -13,14 +13,14 @@ + # include + + /*! +- * \brief Perform a STONITH action. ++ * \brief Ask the cluster to perform fencing + * +- * \note This is the internal version of pcmk_fence_action(). External users ++ * \note This is the internal version of pcmk_request_fencing(). External users + * of the pacemaker API should use that function instead. + * +- * \param[in] st A connection to the STONITH API. +- * \param[in] target The node receiving the action. +- * \param[in] action The action to perform. 
++ * \param[in] st A connection to the fencer API ++ * \param[in] target The node that should be fenced ++ * \param[in] action The fencing action (on, off, reboot) to perform + * \param[in] name Who requested the fence action? + * \param[in] timeout How long to wait for the operation to complete (in ms). + * \param[in] tolerance If a successful action for \p target happened within +@@ -34,9 +34,9 @@ + * \note If \p reason is not NULL, the caller is responsible for freeing its + * returned value. + */ +-int pcmk__fence_action(stonith_t *st, const char *target, const char *action, +- const char *name, unsigned int timeout, unsigned int tolerance, +- int delay, char **reason); ++int pcmk__request_fencing(stonith_t *st, const char *target, const char *action, ++ const char *name, unsigned int timeout, ++ unsigned int tolerance, int delay, char **reason); + + /*! + * \brief List the fencing operations that have occurred for a specific node. +diff --git a/lib/pacemaker/pcmk_fence.c b/lib/pacemaker/pcmk_fence.c +index dbf084fb6b..1b7feb54b2 100644 +--- a/lib/pacemaker/pcmk_fence.c ++++ b/lib/pacemaker/pcmk_fence.c +@@ -137,9 +137,9 @@ async_fence_helper(gpointer user_data) + } + + int +-pcmk__fence_action(stonith_t *st, const char *target, const char *action, +- const char *name, unsigned int timeout, unsigned int tolerance, +- int delay, char **reason) ++pcmk__request_fencing(stonith_t *st, const char *target, const char *action, ++ const char *name, unsigned int timeout, ++ unsigned int tolerance, int delay, char **reason) + { + crm_trigger_t *trig; + +@@ -169,12 +169,12 @@ pcmk__fence_action(stonith_t *st, const char *target, const char *action, + + #ifdef BUILD_PUBLIC_LIBPACEMAKER + int +-pcmk_fence_action(stonith_t *st, const char *target, const char *action, +- const char *name, unsigned int timeout, unsigned int tolerance, +- int delay, char **reason) ++pcmk_request_fencing(stonith_t *st, const char *target, const char *action, ++ const char *name, unsigned int 
timeout, ++ unsigned int tolerance, int delay, char **reason) + { +- return pcmk__fence_action(st, target, action, name, timeout, tolerance, +- delay, reason); ++ return pcmk__request_fencing(st, target, action, name, timeout, tolerance, ++ delay, reason); + } + #endif + +diff --git a/tools/stonith_admin.c b/tools/stonith_admin.c +index fdc7c46d49..56948b3875 100644 +--- a/tools/stonith_admin.c ++++ b/tools/stonith_admin.c +@@ -570,18 +570,24 @@ main(int argc, char **argv) + break; + + case 'B': +- rc = pcmk__fence_action(st, target, "reboot", name, options.timeout*1000, +- options.tolerance*1000, options.delay, NULL); ++ rc = pcmk__request_fencing(st, target, "reboot", name, ++ options.timeout * 1000, ++ options.tolerance * 1000, ++ options.delay, NULL); + break; + + case 'F': +- rc = pcmk__fence_action(st, target, "off", name, options.timeout*1000, +- options.tolerance*1000, options.delay, NULL); ++ rc = pcmk__request_fencing(st, target, "off", name, ++ options.timeout * 1000, ++ options.tolerance * 1000, ++ options.delay, NULL); + break; + + case 'U': +- rc = pcmk__fence_action(st, target, "on", name, options.timeout*1000, +- options.tolerance*1000, options.delay, NULL); ++ rc = pcmk__request_fencing(st, target, "on", name, ++ options.timeout * 1000, ++ options.tolerance * 1000, ++ options.delay, NULL); + break; + + case 'h': +-- +2.27.0 + + +From 247eb303df934944c0b72b162bb661cee6e0ed8b Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Fri, 10 Dec 2021 15:52:37 -0600 +Subject: [PATCH 04/11] Refactor: tools: drop unnecessary string duplication in + stonith_admin + +--- + tools/stonith_admin.c | 11 ++++------- + 1 file changed, 4 insertions(+), 7 deletions(-) + +diff --git a/tools/stonith_admin.c b/tools/stonith_admin.c +index 56948b3875..c11e302e76 100644 +--- a/tools/stonith_admin.c ++++ b/tools/stonith_admin.c +@@ -360,8 +360,6 @@ main(int argc, char **argv) + + pcmk__cli_init_logging("stonith_admin", args->verbosity); + +- name = strdup(crm_system_name); +- 
+ rc = pcmk__output_new(&out, args->output_ty, args->output_dest, argv); + if (rc != pcmk_rc_ok) { + exit_code = CRM_EX_ERROR; +@@ -496,7 +494,7 @@ main(int argc, char **argv) + if (st == NULL) { + rc = -ENOMEM; + } else if (!no_connect) { +- rc = st->cmds->connect(st, name, NULL); ++ rc = st->cmds->connect(st, crm_system_name, NULL); + } + if (rc < 0) { + out->err(out, "Could not connect to fencer: %s", pcmk_strerror(rc)); +@@ -570,21 +568,21 @@ main(int argc, char **argv) + break; + + case 'B': +- rc = pcmk__request_fencing(st, target, "reboot", name, ++ rc = pcmk__request_fencing(st, target, "reboot", crm_system_name, + options.timeout * 1000, + options.tolerance * 1000, + options.delay, NULL); + break; + + case 'F': +- rc = pcmk__request_fencing(st, target, "off", name, ++ rc = pcmk__request_fencing(st, target, "off", crm_system_name, + options.timeout * 1000, + options.tolerance * 1000, + options.delay, NULL); + break; + + case 'U': +- rc = pcmk__request_fencing(st, target, "on", name, ++ rc = pcmk__request_fencing(st, target, "on", crm_system_name, + options.timeout * 1000, + options.tolerance * 1000, + options.delay, NULL); +@@ -619,7 +617,6 @@ main(int argc, char **argv) + out->finish(out, exit_code, true, NULL); + pcmk__output_free(out); + } +- free(name); + stonith_key_value_freeall(options.params, 1, 1); + + if (st != NULL) { +-- +2.27.0 + + +From a7888bf6868d8d9d9c77f65ae9983cf748bb0548 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Fri, 10 Dec 2021 15:56:34 -0600 +Subject: [PATCH 05/11] Refactor: tools: functionize requesting fencing in + stonith_admin + +... 
to reduce code duplication and improve readability +--- + tools/stonith_admin.c | 27 +++++++++++++++------------ + 1 file changed, 15 insertions(+), 12 deletions(-) + +diff --git a/tools/stonith_admin.c b/tools/stonith_admin.c +index c11e302e76..f738a9c888 100644 +--- a/tools/stonith_admin.c ++++ b/tools/stonith_admin.c +@@ -331,6 +331,18 @@ build_arg_context(pcmk__common_args_t *args, GOptionGroup **group) { + return context; + } + ++// \return Standard Pacemaker return code ++static int ++request_fencing(stonith_t *st, const char *target, const char *command) ++{ ++ int rc = pcmk__request_fencing(st, target, command, crm_system_name, ++ options.timeout * 1000, ++ options.tolerance * 1000, ++ options.delay, NULL); ++ ++ return rc; ++} ++ + int + main(int argc, char **argv) + { +@@ -568,24 +580,15 @@ main(int argc, char **argv) + break; + + case 'B': +- rc = pcmk__request_fencing(st, target, "reboot", crm_system_name, +- options.timeout * 1000, +- options.tolerance * 1000, +- options.delay, NULL); ++ rc = request_fencing(st, target, "reboot"); + break; + + case 'F': +- rc = pcmk__request_fencing(st, target, "off", crm_system_name, +- options.timeout * 1000, +- options.tolerance * 1000, +- options.delay, NULL); ++ rc = request_fencing(st, target, "off"); + break; + + case 'U': +- rc = pcmk__request_fencing(st, target, "on", crm_system_name, +- options.timeout * 1000, +- options.tolerance * 1000, +- options.delay, NULL); ++ rc = request_fencing(st, target, "on"); + break; + + case 'h': +-- +2.27.0 + + +From 2da32df780983ec1197e857eed5eeb5bf1101889 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Fri, 10 Dec 2021 16:05:19 -0600 +Subject: [PATCH 06/11] Feature: tools: display failure reasons for + stonith_admin fencing commands + +Previously, stonith_admin's --fence/--unfence/--reboot options did not output +any error message on failure. Now, they do, including the exit reason, if +available. 
+--- + tools/stonith_admin.c | 30 +++++++++++++++++++++++++----- + 1 file changed, 25 insertions(+), 5 deletions(-) + +diff --git a/tools/stonith_admin.c b/tools/stonith_admin.c +index f738a9c888..5590faf11e 100644 +--- a/tools/stonith_admin.c ++++ b/tools/stonith_admin.c +@@ -333,13 +333,33 @@ build_arg_context(pcmk__common_args_t *args, GOptionGroup **group) { + + // \return Standard Pacemaker return code + static int +-request_fencing(stonith_t *st, const char *target, const char *command) ++request_fencing(stonith_t *st, const char *target, const char *command, ++ GError **error) + { ++ char *reason = NULL; + int rc = pcmk__request_fencing(st, target, command, crm_system_name, + options.timeout * 1000, + options.tolerance * 1000, +- options.delay, NULL); ++ options.delay, &reason); + ++ if (rc != pcmk_rc_ok) { ++ const char *rc_str = pcmk_rc_str(rc); ++ ++ // If reason is identical to return code string, don't display it twice ++ if (pcmk__str_eq(rc_str, reason, pcmk__str_none)) { ++ free(reason); ++ reason = NULL; ++ } ++ ++ g_set_error(error, PCMK__RC_ERROR, rc, ++ "Couldn't %sfence %s: %s%s%s%s", ++ ((strcmp(command, "on") == 0)? "un" : ""), ++ target, pcmk_rc_str(rc), ++ ((reason == NULL)? "" : " ("), ++ ((reason == NULL)? "" : reason), ++ ((reason == NULL)? 
"" : ")")); ++ } ++ free(reason); + return rc; + } + +@@ -580,15 +600,15 @@ main(int argc, char **argv) + break; + + case 'B': +- rc = request_fencing(st, target, "reboot"); ++ rc = request_fencing(st, target, "reboot", &error); + break; + + case 'F': +- rc = request_fencing(st, target, "off"); ++ rc = request_fencing(st, target, "off", &error); + break; + + case 'U': +- rc = request_fencing(st, target, "on"); ++ rc = request_fencing(st, target, "on", &error); + break; + + case 'h': +-- +2.27.0 + + +From 2d99eba4c326d3b13dbbe446971ea5febd5d05be Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Fri, 10 Dec 2021 16:08:49 -0600 +Subject: [PATCH 07/11] Feature: libpacemaker: return exit reason for fencer + connection failures + +... instead of outputting to stderr directly, so that the caller (i.e. +stonith_admin) can output the error in the correct output format. +--- + lib/pacemaker/pcmk_fence.c | 3 +-- + 1 file changed, 1 insertion(+), 2 deletions(-) + +diff --git a/lib/pacemaker/pcmk_fence.c b/lib/pacemaker/pcmk_fence.c +index 1b7feb54b2..d17b07cda2 100644 +--- a/lib/pacemaker/pcmk_fence.c ++++ b/lib/pacemaker/pcmk_fence.c +@@ -104,10 +104,9 @@ async_fence_helper(gpointer user_data) + int rc = stonith_api_connect_retry(st, async_fence_data.name, 10); + + if (rc != pcmk_ok) { +- fprintf(stderr, "Could not connect to fencer: %s\n", pcmk_strerror(rc)); + g_main_loop_quit(mainloop); + pcmk__set_result(&async_fence_data.result, CRM_EX_ERROR, +- PCMK_EXEC_NOT_CONNECTED, NULL); ++ PCMK_EXEC_NOT_CONNECTED, pcmk_strerror(rc)); + return TRUE; + } + +-- +2.27.0 + + +From 4480ef0602f47450bdddfbde360a6a8327710927 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Mon, 17 Jan 2022 09:39:39 -0600 +Subject: [PATCH 08/11] Low: libpacemaker: compare fence action names + case-sensitively + +--- + lib/pacemaker/pcmk_fence.c | 6 +++--- + 1 file changed, 3 insertions(+), 3 deletions(-) + +diff --git a/lib/pacemaker/pcmk_fence.c b/lib/pacemaker/pcmk_fence.c +index d17b07cda2..2a8f50a555 
100644 +--- a/lib/pacemaker/pcmk_fence.c ++++ b/lib/pacemaker/pcmk_fence.c +@@ -1,5 +1,5 @@ + /* +- * Copyright 2009-2021 the Pacemaker project contributors ++ * Copyright 2009-2022 the Pacemaker project contributors + * + * The version control history for this file may have further details. + * +@@ -77,7 +77,7 @@ static void + notify_callback(stonith_t * st, stonith_event_t * e) + { + if (pcmk__str_eq(async_fence_data.target, e->target, pcmk__str_casei) +- && pcmk__str_eq(async_fence_data.action, e->action, pcmk__str_casei)) { ++ && pcmk__str_eq(async_fence_data.action, e->action, pcmk__str_none)) { + + pcmk__set_result(&async_fence_data.result, + stonith__event_exit_status(e), +@@ -549,7 +549,7 @@ pcmk__reduce_fence_history(stonith_history_t *history) + if ((hp->state == st_done) || (hp->state == st_failed)) { + /* action not in progress */ + if (pcmk__str_eq(hp->target, np->target, pcmk__str_casei) && +- pcmk__str_eq(hp->action, np->action, pcmk__str_casei) && ++ pcmk__str_eq(hp->action, np->action, pcmk__str_none) && + (hp->state == np->state) && + ((hp->state == st_done) || + pcmk__str_eq(hp->delegate, np->delegate, pcmk__str_casei))) { +-- +2.27.0 + + +From fe4c65a3b9e715c2b535709f989f2369d3637b78 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Mon, 17 Jan 2022 09:45:24 -0600 +Subject: [PATCH 09/11] Refactor: libpacemaker: avoid unnecessary string + duplication + +... 
and don't leave any dynamic memory hanging around +--- + lib/pacemaker/pcmk_fence.c | 11 ++++++++--- + 1 file changed, 8 insertions(+), 3 deletions(-) + +diff --git a/lib/pacemaker/pcmk_fence.c b/lib/pacemaker/pcmk_fence.c +index 2a8f50a555..260fa5ab8e 100644 +--- a/lib/pacemaker/pcmk_fence.c ++++ b/lib/pacemaker/pcmk_fence.c +@@ -141,6 +141,7 @@ pcmk__request_fencing(stonith_t *st, const char *target, const char *action, + unsigned int tolerance, int delay, char **reason) + { + crm_trigger_t *trig; ++ int rc = pcmk_rc_ok; + + async_fence_data.st = st; + async_fence_data.name = strdup(name); +@@ -160,10 +161,14 @@ pcmk__request_fencing(stonith_t *st, const char *target, const char *action, + + free(async_fence_data.name); + +- if ((reason != NULL) && (async_fence_data.result.exit_reason != NULL)) { +- *reason = strdup(async_fence_data.result.exit_reason); ++ if (reason != NULL) { ++ // Give the caller ownership of the exit reason ++ *reason = async_fence_data.result.exit_reason; ++ async_fence_data.result.exit_reason = NULL; + } +- return stonith__result2rc(&async_fence_data.result); ++ rc = stonith__result2rc(&async_fence_data.result); ++ pcmk__reset_result(&async_fence_data.result); ++ return rc; + } + + #ifdef BUILD_PUBLIC_LIBPACEMAKER +-- +2.27.0 + + +From 7b7af07796f05a1adabdac655582be2e17106f81 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Mon, 17 Jan 2022 10:07:10 -0600 +Subject: [PATCH 10/11] Doc: libpacemaker: improve pcmk__request_fencing() + doxygen block + +--- + include/pacemaker.h | 6 ++++-- + include/pcmki/pcmki_fence.h | 15 +++++++++------ + 2 files changed, 13 insertions(+), 8 deletions(-) + +diff --git a/include/pacemaker.h b/include/pacemaker.h +index e581f975a9..266a844892 100644 +--- a/include/pacemaker.h ++++ b/include/pacemaker.h +@@ -187,8 +187,10 @@ int pcmk_list_nodes(xmlNodePtr *xml, char *node_types); + * \param[in] tolerance If a successful action for \p target happened within + * this many ms, return 0 without performing the 
action + * again +- * \param[in] delay Apply a fencing delay. Value -1 means disable also any +- * static/random fencing delays from pcmk_delay_base/max ++ * \param[in] delay Apply this delay (in milliseconds) before initiating the ++ * fencing action (a value of -1 applies no delay and also ++ * disables any fencing delay from pcmk_delay_base and ++ * pcmk_delay_max) + * \param[out] reason If not NULL, where to put descriptive failure reason + * + * \return Standard Pacemaker return code +diff --git a/include/pcmki/pcmki_fence.h b/include/pcmki/pcmki_fence.h +index e3a7e27264..4a2fe3c481 100644 +--- a/include/pcmki/pcmki_fence.h ++++ b/include/pcmki/pcmki_fence.h +@@ -1,5 +1,5 @@ + /* +- * Copyright 2019-2021 the Pacemaker project contributors ++ * Copyright 2019-2022 the Pacemaker project contributors + * + * The version control history for this file may have further details. + * +@@ -22,17 +22,20 @@ + * \param[in] target The node that should be fenced + * \param[in] action The fencing action (on, off, reboot) to perform + * \param[in] name Who requested the fence action? +- * \param[in] timeout How long to wait for the operation to complete (in ms). ++ * \param[in] timeout How long to wait for the operation to complete (in ms) + * \param[in] tolerance If a successful action for \p target happened within +- * this many ms, return 0 without performing the action +- * again. +- * \param[in] delay Apply a fencing delay. 
Value -1 means disable also any +- * static/random fencing delays from pcmk_delay_base/max ++ * this many milliseconds, return success without ++ * performing the action again ++ * \param[in] delay Apply this delay (in milliseconds) before initiating the ++ * fencing action (a value of -1 applies no delay and also ++ * disables any fencing delay from pcmk_delay_base and ++ * pcmk_delay_max) + * \param[out] reason If not NULL, where to put descriptive failure reason + * + * \return Standard Pacemaker return code + * \note If \p reason is not NULL, the caller is responsible for freeing its + * returned value. ++ * \todo delay is eventually used with g_timeout_add() and should be guint + */ + int pcmk__request_fencing(stonith_t *st, const char *target, const char *action, + const char *name, unsigned int timeout, +-- +2.27.0 + + +From 61fb7271712e1246eb6d9472dc1afc7cd10e0a79 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Mon, 17 Jan 2022 10:18:02 -0600 +Subject: [PATCH 11/11] Fix: tools: get stonith_admin -T option working again + +Regression introduced in 2.0.3 by 3910b6fec + +This reverts commit 247eb303df934944c0b72b162bb661cee6e0ed8b +("Refactor: tools: drop unnecessary string duplication in stonith_admin") +and fixes a regression introduced when stonith_admin was converted to use +GOption. + +The -T option is intended to override the client name passed to the fencer API, +but the client name was set to the default (crm_system_name) after option +processing had already been done, so any value for -T was overwritten by the +default, and its memory was leaked. + +This commit sets the default only if -T was not used. 
+--- + tools/stonith_admin.c | 15 ++++++++++----- + 1 file changed, 10 insertions(+), 5 deletions(-) + +diff --git a/tools/stonith_admin.c b/tools/stonith_admin.c +index 5590faf11e..54774b6fee 100644 +--- a/tools/stonith_admin.c ++++ b/tools/stonith_admin.c +@@ -337,10 +337,10 @@ request_fencing(stonith_t *st, const char *target, const char *command, + GError **error) + { + char *reason = NULL; +- int rc = pcmk__request_fencing(st, target, command, crm_system_name, +- options.timeout * 1000, +- options.tolerance * 1000, +- options.delay, &reason); ++ int rc = pcmk__request_fencing(st, target, command, name, ++ options.timeout * 1000, ++ options.tolerance * 1000, ++ options.delay, &reason); + + if (rc != pcmk_rc_ok) { + const char *rc_str = pcmk_rc_str(rc); +@@ -392,6 +392,10 @@ main(int argc, char **argv) + + pcmk__cli_init_logging("stonith_admin", args->verbosity); + ++ if (name == NULL) { ++ name = strdup(crm_system_name); ++ } ++ + rc = pcmk__output_new(&out, args->output_ty, args->output_dest, argv); + if (rc != pcmk_rc_ok) { + exit_code = CRM_EX_ERROR; +@@ -526,7 +530,7 @@ main(int argc, char **argv) + if (st == NULL) { + rc = -ENOMEM; + } else if (!no_connect) { +- rc = st->cmds->connect(st, crm_system_name, NULL); ++ rc = st->cmds->connect(st, name, NULL); + } + if (rc < 0) { + out->err(out, "Could not connect to fencer: %s", pcmk_strerror(rc)); +@@ -640,6 +644,7 @@ main(int argc, char **argv) + out->finish(out, exit_code, true, NULL); + pcmk__output_free(out); + } ++ free(name); + stonith_key_value_freeall(options.params, 1, 1); + + if (st != NULL) { +-- +2.27.0 + diff --git a/SOURCES/017-watchdog-fixes.patch b/SOURCES/017-watchdog-fixes.patch deleted file mode 100644 index d3df876..0000000 --- a/SOURCES/017-watchdog-fixes.patch +++ /dev/null @@ -1,58 +0,0 @@ -From 61eb9c240004d1dbd0b5973e2fecda3686bb4c53 Mon Sep 17 00:00:00 2001 -From: Klaus Wenninger -Date: Tue, 10 Aug 2021 09:06:55 +0200 -Subject: [PATCH 1/2] Build: rpm: package fence_watchdog in 
base-package - ---- - rpm/pacemaker.spec.in | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/rpm/pacemaker.spec.in b/rpm/pacemaker.spec.in -index f58357a77..0c569b9ca 100644 ---- a/rpm/pacemaker.spec.in -+++ b/rpm/pacemaker.spec.in -@@ -734,6 +734,7 @@ exit 0 - %{_sbindir}/crm_attribute - %{_sbindir}/crm_master - %{_sbindir}/fence_legacy -+%{_sbindir}/fence_watchdog - - %doc %{_mandir}/man7/pacemaker-controld.* - %doc %{_mandir}/man7/pacemaker-schedulerd.* -@@ -797,7 +798,6 @@ exit 0 - %{_sbindir}/crm_simulate - %{_sbindir}/crm_report - %{_sbindir}/crm_ticket --%{_sbindir}/fence_watchdog - %{_sbindir}/stonith_admin - # "dirname" is owned by -schemas, which is a prerequisite - %{_datadir}/pacemaker/report.collector --- -2.27.0 - - -From 88e75d5b98df197fa731e7642434951a24a67095 Mon Sep 17 00:00:00 2001 -From: Klaus Wenninger -Date: Tue, 10 Aug 2021 09:10:23 +0200 -Subject: [PATCH 2/2] Fix: fence_watchdog: fix version output needed for - help2man - ---- - daemons/fenced/fence_watchdog.in | 1 + - 1 file changed, 1 insertion(+) - -diff --git a/daemons/fenced/fence_watchdog.in b/daemons/fenced/fence_watchdog.in -index c83304f1d..700065e0e 100755 ---- a/daemons/fenced/fence_watchdog.in -+++ b/daemons/fenced/fence_watchdog.in -@@ -12,6 +12,7 @@ import sys - import atexit - import getopt - -+AGENT_VERSION = "1.0.0" - SHORT_DESC = "Dummy watchdog fence agent" - LONG_DESC = """fence_watchdog just provides - meta-data - actual fencing is done by the pacemaker internal watchdog agent.""" --- -2.27.0 - diff --git a/SOURCES/018-controller.patch b/SOURCES/018-controller.patch deleted file mode 100644 index a2094e3..0000000 --- a/SOURCES/018-controller.patch +++ /dev/null @@ -1,122 +0,0 @@ -From ee7eba6a7a05bdf0a12d60ebabb334d8ee021101 Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Mon, 9 Aug 2021 14:48:57 -0500 -Subject: [PATCH] Fix: controller: ensure lost node's transient attributes are - cleared without DC - -Previously, peer_update_callback() cleared 
a lost node's transient attributes -if either the local node is DC, or there is no DC. - -However, that left the possibility of the DC being lost at the same time as -another node -- the local node would still have fsa_our_dc set while processing -the leave notifications, so no node would clear the attributes for the non-DC -node. - -Now, the controller has its own CPG configuration change callback, which sets a -global boolean before calling the usual one, so that peer_update_callback() can -know when the DC has been lost. ---- - daemons/controld/controld_callbacks.c | 4 +- - daemons/controld/controld_corosync.c | 57 ++++++++++++++++++++++++++- - 2 files changed, 59 insertions(+), 2 deletions(-) - -diff --git a/daemons/controld/controld_callbacks.c b/daemons/controld/controld_callbacks.c -index af24856ae..e564b3dcd 100644 ---- a/daemons/controld/controld_callbacks.c -+++ b/daemons/controld/controld_callbacks.c -@@ -99,6 +99,8 @@ node_alive(const crm_node_t *node) - - #define state_text(state) ((state)? (const char *)(state) : "in unknown state") - -+bool controld_dc_left = false; -+ - void - peer_update_callback(enum crm_status_type type, crm_node_t * node, const void *data) - { -@@ -217,7 +219,7 @@ peer_update_callback(enum crm_status_type type, crm_node_t * node, const void *d - cib_scope_local); - } - -- } else if (AM_I_DC || (fsa_our_dc == NULL)) { -+ } else if (AM_I_DC || controld_dc_left || (fsa_our_dc == NULL)) { - /* This only needs to be done once, so normally the DC should do - * it. However if there is no DC, every node must do it, since - * there is no other way to ensure some one node does it. -diff --git a/daemons/controld/controld_corosync.c b/daemons/controld/controld_corosync.c -index db99630fb..c5ab6580a 100644 ---- a/daemons/controld/controld_corosync.c -+++ b/daemons/controld/controld_corosync.c -@@ -87,6 +87,61 @@ crmd_cs_destroy(gpointer user_data) - } - } - -+extern bool controld_dc_left; -+ -+/*! 
-+ * \brief Handle a Corosync notification of a CPG configuration change -+ * -+ * \param[in] handle CPG connection -+ * \param[in] cpg_name CPG group name -+ * \param[in] member_list List of current CPG members -+ * \param[in] member_list_entries Number of entries in \p member_list -+ * \param[in] left_list List of CPG members that left -+ * \param[in] left_list_entries Number of entries in \p left_list -+ * \param[in] joined_list List of CPG members that joined -+ * \param[in] joined_list_entries Number of entries in \p joined_list -+ */ -+static void -+cpg_membership_callback(cpg_handle_t handle, const struct cpg_name *cpg_name, -+ const struct cpg_address *member_list, -+ size_t member_list_entries, -+ const struct cpg_address *left_list, -+ size_t left_list_entries, -+ const struct cpg_address *joined_list, -+ size_t joined_list_entries) -+{ -+ /* When nodes leave CPG, the DC clears their transient node attributes. -+ * -+ * However if there is no DC, or the DC is among the nodes that left, each -+ * remaining node needs to do the clearing, to ensure it gets done. -+ * Otherwise, the attributes would persist when the nodes rejoin, which -+ * could have serious consequences for unfencing, agents that use attributes -+ * for internal logic, etc. -+ * -+ * Here, we set a global boolean if the DC is among the nodes that left, for -+ * use by the peer callback. 
-+ */ -+ if (fsa_our_dc != NULL) { -+ crm_node_t *peer = pcmk__search_cluster_node_cache(0, fsa_our_dc); -+ -+ if (peer != NULL) { -+ for (int i = 0; i < left_list_entries; ++i) { -+ if (left_list[i].nodeid == peer->id) { -+ controld_dc_left = true; -+ break; -+ } -+ } -+ } -+ } -+ -+ // Process the change normally, which will call the peer callback as needed -+ pcmk_cpg_membership(handle, cpg_name, member_list, member_list_entries, -+ left_list, left_list_entries, -+ joined_list, joined_list_entries); -+ -+ controld_dc_left = false; -+} -+ - extern gboolean crm_connect_corosync(crm_cluster_t * cluster); - - gboolean -@@ -95,7 +150,7 @@ crm_connect_corosync(crm_cluster_t * cluster) - if (is_corosync_cluster()) { - crm_set_status_callback(&peer_update_callback); - cluster->cpg.cpg_deliver_fn = crmd_cs_dispatch; -- cluster->cpg.cpg_confchg_fn = pcmk_cpg_membership; -+ cluster->cpg.cpg_confchg_fn = cpg_membership_callback; - cluster->destroy = crmd_cs_destroy; - - if (crm_cluster_connect(cluster)) { --- -2.27.0 - diff --git a/SOURCES/018-failure-messages.patch b/SOURCES/018-failure-messages.patch new file mode 100644 index 0000000..3a2f249 --- /dev/null +++ b/SOURCES/018-failure-messages.patch @@ -0,0 +1,796 @@ +From 08c3420f2c857e7b27cd960f355d787af534da7d Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Tue, 18 Jan 2022 16:04:49 -0600 +Subject: [PATCH 01/12] Log: libcrmcommon: improve description for "not + connected" status + +PCMK_EXEC_NOT_CONNECTED was originally added to represent "No executor +connection", but it can also now mean no fencer connection, so change it to +"Internal communication failure" which is probably less mysterious to end users +anyway (especially since it should be accompanied by a more descriptive exit +reason). 
+--- + include/crm/common/results.h | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/include/crm/common/results.h b/include/crm/common/results.h +index 873faf5c43..3d322a7ce6 100644 +--- a/include/crm/common/results.h ++++ b/include/crm/common/results.h +@@ -349,7 +349,7 @@ pcmk_exec_status_str(enum pcmk_exec_status status) + case PCMK_EXEC_ERROR_HARD: return "Hard error"; + case PCMK_EXEC_ERROR_FATAL: return "Fatal error"; + case PCMK_EXEC_NOT_INSTALLED: return "Not installed"; +- case PCMK_EXEC_NOT_CONNECTED: return "No executor connection"; ++ case PCMK_EXEC_NOT_CONNECTED: return "Internal communication failure"; + case PCMK_EXEC_INVALID: return "Cannot execute now"; + case PCMK_EXEC_NO_FENCE_DEVICE: return "No fence device"; + case PCMK_EXEC_NO_SECRETS: return "CIB secrets unavailable"; +-- +2.27.0 + + +From 7c345cf8cf0cb054f5634206880df035bfef7311 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Mon, 20 Dec 2021 15:12:36 -0600 +Subject: [PATCH 02/12] Refactor: libcrmcommon: drop unnecessary system error + redefinitions + +portability.h defines some system error codes that might not be present on +non-Linux systems. + +This was a bad idea, since there's no way to ensure the defined values don't +conflict with existing system codes. However, we use a number of them, so it's +probably best to keep them, at least until we can make a backward compatibility +break. + +However, we don't use EUNATCH, ENOSR, or ENOSTR, so we can delete those. 
+--- + include/portability.h | 12 ------------ + lib/common/results.c | 9 ++++++--- + 2 files changed, 6 insertions(+), 15 deletions(-) + +diff --git a/include/portability.h b/include/portability.h +index 9a60c583a7..ee065a376d 100644 +--- a/include/portability.h ++++ b/include/portability.h +@@ -131,10 +131,6 @@ typedef union + # define EREMOTEIO 193 + # endif + +-# ifndef EUNATCH +-# define EUNATCH 194 +-# endif +- + # ifndef ENOKEY + # define ENOKEY 195 + # endif +@@ -147,14 +143,6 @@ typedef union + # define ETIME 197 + # endif + +-# ifndef ENOSR +-# define ENOSR 198 +-# endif +- +-# ifndef ENOSTR +-# define ENOSTR 199 +-# endif +- + # ifndef EKEYREJECTED + # define EKEYREJECTED 200 + # endif +diff --git a/lib/common/results.c b/lib/common/results.c +index 6d120694cd..96cd4e5659 100644 +--- a/lib/common/results.c ++++ b/lib/common/results.c +@@ -118,9 +118,6 @@ pcmk_strerror(int rc) + case EREMOTEIO: + return "Remote I/O error"; + /* coverity[dead_error_condition] False positive on non-Linux */ +- case EUNATCH: +- return "Protocol driver not attached"; +- /* coverity[dead_error_condition] False positive on non-Linux */ + case ENOKEY: + return "Required key not available"; + } +@@ -342,8 +339,12 @@ pcmk_rc_name(int rc) + case ENOMSG: return "ENOMSG"; + case ENOPROTOOPT: return "ENOPROTOOPT"; + case ENOSPC: return "ENOSPC"; ++#ifdef ENOSR + case ENOSR: return "ENOSR"; ++#endif ++#ifdef ENOSTR + case ENOSTR: return "ENOSTR"; ++#endif + case ENOSYS: return "ENOSYS"; + case ENOTBLK: return "ENOTBLK"; + case ENOTCONN: return "ENOTCONN"; +@@ -376,7 +377,9 @@ pcmk_rc_name(int rc) + case ETIME: return "ETIME"; + case ETIMEDOUT: return "ETIMEDOUT"; + case ETXTBSY: return "ETXTBSY"; ++#ifdef EUNATCH + case EUNATCH: return "EUNATCH"; ++#endif + case EUSERS: return "EUSERS"; + /* case EWOULDBLOCK: return "EWOULDBLOCK"; */ + case EXDEV: return "EXDEV"; +-- +2.27.0 + + +From eac8d1ca51eac3f437e18584f7e013d976ecee2c Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Mon, 20 
Dec 2021 15:33:12 -0600 +Subject: [PATCH 03/12] Log: libcrmcommon: improve handling of portability.h + error codes + +portability.h defines some system error codes that might not be present on +non-Linux systems. + +Define a constant for each one (for example, PCMK__ECOMM for ECOMM) when +the system doesn't have the value, so we can detect that when relevant. + +Also, make sure pcmk_rc_name() and pcmk_rc_str() handle all of these values. +--- + include/portability.h | 8 ++++++++ + lib/common/results.c | 32 ++++++++++++++++++++++++++++++-- + 2 files changed, 38 insertions(+), 2 deletions(-) + +diff --git a/include/portability.h b/include/portability.h +index ee065a376d..5d5fbf21cb 100644 +--- a/include/portability.h ++++ b/include/portability.h +@@ -116,34 +116,42 @@ typedef union + # include + + # ifndef ENOTUNIQ ++# define PCMK__ENOTUNIQ + # define ENOTUNIQ 190 + # endif + + # ifndef ECOMM ++# define PCMK__ECOMM + # define ECOMM 191 + # endif + + # ifndef ELIBACC ++# define PCMK__ELIBACC + # define ELIBACC 192 + # endif + + # ifndef EREMOTEIO ++# define PCMK__EREMOTIO + # define EREMOTEIO 193 + # endif + + # ifndef ENOKEY ++# define PCMK__ENOKEY + # define ENOKEY 195 + # endif + + # ifndef ENODATA ++# define PCMK__ENODATA + # define ENODATA 196 + # endif + + # ifndef ETIME ++# define PCMK__ETIME + # define ETIME 197 + # endif + + # ifndef EKEYREJECTED ++# define PCMK__EKEYREJECTED + # define EKEYREJECTED 200 + # endif + +diff --git a/lib/common/results.c b/lib/common/results.c +index 96cd4e5659..bcf289d0d6 100644 +--- a/lib/common/results.c ++++ b/lib/common/results.c +@@ -395,9 +395,9 @@ pcmk_rc_name(int rc) + #ifdef EISNAM // Not available on OS X, Illumos, Solaris + case EISNAM: return "EISNAM"; + case EKEYEXPIRED: return "EKEYEXPIRED"; +- case EKEYREJECTED: return "EKEYREJECTED"; + case EKEYREVOKED: return "EKEYREVOKED"; + #endif ++ case EKEYREJECTED: return "EKEYREJECTED"; + case EL2HLT: return "EL2HLT"; + case EL2NSYNC: return "EL2NSYNC"; + case EL3HLT: 
return "EL3HLT"; +@@ -443,7 +443,35 @@ pcmk_rc_str(int rc) + if (rc < 0) { + return "Unknown error"; + } +- return strerror(rc); ++ ++ // Handle values that could be defined by system or by portability.h ++ switch (rc) { ++#ifdef PCMK__ENOTUNIQ ++ case ENOTUNIQ: return "Name not unique on network"; ++#endif ++#ifdef PCMK__ECOMM ++ case ECOMM: return "Communication error on send"; ++#endif ++#ifdef PCMK__ELIBACC ++ case ELIBACC: return "Can not access a needed shared library"; ++#endif ++#ifdef PCMK__EREMOTEIO ++ case EREMOTEIO: return "Remote I/O error"; ++#endif ++#ifdef PCMK__ENOKEY ++ case ENOKEY: return "Required key not available"; ++#endif ++#ifdef PCMK__ENODATA ++ case ENODATA: return "No data available"; ++#endif ++#ifdef PCMK__ETIME ++ case ETIME: return "Timer expired"; ++#endif ++#ifdef PCMK__EKEYREJECTED ++ case EKEYREJECTED: return "Key was rejected by service"; ++#endif ++ default: return strerror(rc); ++ } + } + + // This returns negative values for errors +-- +2.27.0 + + +From 32a38ac6374f85c43e7f4051f5e519822cc481e6 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Mon, 20 Dec 2021 15:39:19 -0600 +Subject: [PATCH 04/12] Log: libcrmcommon: redefine pcmk_strerror() in terms of + pcmk_rc_str() + +... to reduce code duplication. This causes minor differences in the string for +a few values. +--- + lib/common/results.c | 67 +------------------------------------------- + 1 file changed, 1 insertion(+), 66 deletions(-) + +diff --git a/lib/common/results.c b/lib/common/results.c +index bcf289d0d6..b2c6e8d553 100644 +--- a/lib/common/results.c ++++ b/lib/common/results.c +@@ -57,72 +57,7 @@ pcmk_errorname(int rc) + const char * + pcmk_strerror(int rc) + { +- if (rc == 0) { +- return "OK"; +- } +- +- rc = abs(rc); +- +- // Of course rc > 0 ... 
unless someone passed INT_MIN as rc +- if ((rc > 0) && (rc < PCMK_ERROR_OFFSET)) { +- return strerror(rc); +- } +- +- switch (rc) { +- case pcmk_err_generic: +- return "Generic Pacemaker error"; +- case pcmk_err_no_quorum: +- return "Operation requires quorum"; +- case pcmk_err_schema_validation: +- return "Update does not conform to the configured schema"; +- case pcmk_err_transform_failed: +- return "Schema transform failed"; +- case pcmk_err_old_data: +- return "Update was older than existing configuration"; +- case pcmk_err_diff_failed: +- return "Application of an update diff failed"; +- case pcmk_err_diff_resync: +- return "Application of an update diff failed, requesting a full refresh"; +- case pcmk_err_cib_modified: +- return "The on-disk configuration was manually modified"; +- case pcmk_err_cib_backup: +- return "Could not archive the previous configuration"; +- case pcmk_err_cib_save: +- return "Could not save the new configuration to disk"; +- case pcmk_err_cib_corrupt: +- return "Could not parse on-disk configuration"; +- case pcmk_err_multiple: +- return "Resource active on multiple nodes"; +- case pcmk_err_node_unknown: +- return "Node not found"; +- case pcmk_err_already: +- return "Situation already as requested"; +- case pcmk_err_bad_nvpair: +- return "Bad name/value pair given"; +- case pcmk_err_schema_unchanged: +- return "Schema is already the latest available"; +- case pcmk_err_unknown_format: +- return "Unknown output format"; +- +- /* The following cases will only be hit on systems for which they are non-standard */ +- /* coverity[dead_error_condition] False positive on non-Linux */ +- case ENOTUNIQ: +- return "Name not unique on network"; +- /* coverity[dead_error_condition] False positive on non-Linux */ +- case ECOMM: +- return "Communication error on send"; +- /* coverity[dead_error_condition] False positive on non-Linux */ +- case ELIBACC: +- return "Can not access a needed shared library"; +- /* coverity[dead_error_condition] False 
positive on non-Linux */ +- case EREMOTEIO: +- return "Remote I/O error"; +- /* coverity[dead_error_condition] False positive on non-Linux */ +- case ENOKEY: +- return "Required key not available"; +- } +- crm_err("Unknown error code: %d", rc); +- return "Unknown error"; ++ return pcmk_rc_str(pcmk_legacy2rc(rc)); + } + + // Standard Pacemaker API return codes +-- +2.27.0 + + +From 7c331d7e2275ffebbfd5e2f6432a6137a66ee5db Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Mon, 20 Dec 2021 15:41:24 -0600 +Subject: [PATCH 05/12] Log: libcrmcommon: don't say "Unknown error" + +... which is unhelpful and annoying to users +--- + lib/common/results.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/lib/common/results.c b/lib/common/results.c +index b2c6e8d553..5ffac76549 100644 +--- a/lib/common/results.c ++++ b/lib/common/results.c +@@ -376,7 +376,7 @@ pcmk_rc_str(int rc) + return pcmk__rcs[pcmk_rc_error - rc].desc; + } + if (rc < 0) { +- return "Unknown error"; ++ return "Error"; + } + + // Handle values that could be defined by system or by portability.h +@@ -768,7 +768,7 @@ bz2_strerror(int rc) + case BZ_OUTBUFF_FULL: + return "output data will not fit into the buffer provided"; + } +- return "Unknown error"; ++ return "Data compression error"; + } + + crm_exit_t +-- +2.27.0 + + +From 26883b4edda7d81bfcb79bd7b33bb3210beff110 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Mon, 20 Dec 2021 16:01:39 -0600 +Subject: [PATCH 06/12] Log: fencing: don't warn if cluster has no watchdog + device + +--- + lib/fencing/st_client.c | 7 ++++++- + 1 file changed, 6 insertions(+), 1 deletion(-) + +diff --git a/lib/fencing/st_client.c b/lib/fencing/st_client.c +index b1de912b2a..a0f3119f3b 100644 +--- a/lib/fencing/st_client.c ++++ b/lib/fencing/st_client.c +@@ -187,7 +187,12 @@ stonith__watchdog_fencing_enabled_for_node_api(stonith_t *st, const char *node) + * we drop in here - so as not to make remote nodes + * panic on that answer + */ +- 
crm_warn("watchdog-fencing-query failed"); ++ if (rc == -ENODEV) { ++ crm_notice("Cluster does not have watchdog fencing device"); ++ } else { ++ crm_warn("Could not check for watchdog fencing device: %s", ++ pcmk_strerror(rc)); ++ } + } else if (list[0] == '\0') { + rv = TRUE; + } else { +-- +2.27.0 + + +From 72b3c42232deaca64ffba9582598c59331203761 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Mon, 20 Dec 2021 16:22:49 -0600 +Subject: [PATCH 07/12] Test: libcrmcommon: update pcmk_rc_str() unit test for + recent change + +--- + lib/common/tests/results/pcmk__results_test.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/lib/common/tests/results/pcmk__results_test.c b/lib/common/tests/results/pcmk__results_test.c +index 57a520c501..e08d4b6261 100644 +--- a/lib/common/tests/results/pcmk__results_test.c ++++ b/lib/common/tests/results/pcmk__results_test.c +@@ -30,7 +30,7 @@ static void + test_for_pcmk_rc_str(void **state) { + assert_string_equal(pcmk_rc_str(pcmk_rc_error-1), "Unknown output format"); + assert_string_equal(pcmk_rc_str(pcmk_rc_ok), "OK"); +- assert_string_equal(pcmk_rc_str(-1), "Unknown error"); ++ assert_string_equal(pcmk_rc_str(-1), "Error"); + } + + static void +-- +2.27.0 + + +From c1ad3d6640f695321a83183c95fae2f105adc429 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Tue, 21 Dec 2021 10:20:38 -0600 +Subject: [PATCH 08/12] Test: cts-lab: update expected patterns for recent + changes + +--- + cts/lab/CTStests.py | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/cts/lab/CTStests.py b/cts/lab/CTStests.py +index 62c832eb45..f4be998cfb 100644 +--- a/cts/lab/CTStests.py ++++ b/cts/lab/CTStests.py +@@ -3055,7 +3055,7 @@ class RemoteStonithd(RemoteDriver): + r"pacemaker-controld.*:\s+error.*: Operation remote-.*_monitor", + r"pacemaker-controld.*:\s+error.*: Result of monitor operation for remote-.*", + r"schedulerd.*:\s+Recover remote-.*\s*\(.*\)", +- r"error: Result of monitor operation for .* on 
remote-.*: No executor connection", ++ r"error: Result of monitor operation for .* on remote-.*: Internal communication failure", + ] + + ignore_pats.extend(RemoteDriver.errorstoignore(self)) +-- +2.27.0 + + +From f272e2f526633c707e894b39c7c7bce3c14de898 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Tue, 21 Dec 2021 15:40:49 -0600 +Subject: [PATCH 09/12] Log: controller,libpacemaker: make history XML creation + less chatty + +Other messages with the same info will already be logged at higher severity +--- + daemons/controld/controld_execd.c | 3 +-- + daemons/controld/controld_te_actions.c | 7 ++----- + include/pcmki/pcmki_sched_utils.h | 3 +-- + lib/pacemaker/pcmk_injections.c | 3 +-- + lib/pacemaker/pcmk_sched_actions.c | 12 +++++------- + 5 files changed, 10 insertions(+), 18 deletions(-) + +diff --git a/daemons/controld/controld_execd.c b/daemons/controld/controld_execd.c +index 15784e7687..52157fa5d4 100644 +--- a/daemons/controld/controld_execd.c ++++ b/daemons/controld/controld_execd.c +@@ -693,9 +693,8 @@ build_operation_update(xmlNode * parent, lrmd_rsc_info_t * rsc, lrmd_event_data_ + caller_version = CRM_FEATURE_SET; + } + +- crm_trace("Building %s operation update with originator version: %s", op->rsc_id, caller_version); + xml_op = pcmk__create_history_xml(parent, op, caller_version, target_rc, +- fsa_our_uname, src, LOG_DEBUG); ++ fsa_our_uname, src); + if (xml_op == NULL) { + return TRUE; + } +diff --git a/daemons/controld/controld_te_actions.c b/daemons/controld/controld_te_actions.c +index 63b7c72359..b0bcb8b2e4 100644 +--- a/daemons/controld/controld_te_actions.c ++++ b/daemons/controld/controld_te_actions.c +@@ -181,7 +181,6 @@ controld_record_action_timeout(crm_action_t *action) + lrmd_event_data_t *op = NULL; + xmlNode *state = NULL; + xmlNode *rsc = NULL; +- xmlNode *xml_op = NULL; + xmlNode *action_rsc = NULL; + + int rc = pcmk_ok; +@@ -245,12 +244,10 @@ controld_record_action_timeout(crm_action_t *action) + op->user_data = 
pcmk__transition_key(transition_graph->id, action->id, + target_rc, te_uuid); + +- xml_op = pcmk__create_history_xml(rsc, op, CRM_FEATURE_SET, target_rc, +- target, __func__, LOG_INFO); ++ pcmk__create_history_xml(rsc, op, CRM_FEATURE_SET, target_rc, target, ++ __func__); + lrmd_free_event(op); + +- crm_log_xml_trace(xml_op, "Action timeout"); +- + rc = fsa_cib_conn->cmds->update(fsa_cib_conn, XML_CIB_TAG_STATUS, state, call_options); + fsa_register_cib_callback(rc, FALSE, NULL, cib_action_updated); + free_xml(state); +diff --git a/include/pcmki/pcmki_sched_utils.h b/include/pcmki/pcmki_sched_utils.h +index 68d60fc7db..144424a609 100644 +--- a/include/pcmki/pcmki_sched_utils.h ++++ b/include/pcmki/pcmki_sched_utils.h +@@ -52,8 +52,7 @@ extern void process_utilization(pe_resource_t * rsc, pe_node_t ** prefer, pe_wor + + xmlNode *pcmk__create_history_xml(xmlNode *parent, lrmd_event_data_t *event, + const char *caller_version, int target_rc, +- const char *node, const char *origin, +- int level); ++ const char *node, const char *origin); + + # define LOAD_STOPPED "load_stopped" + +diff --git a/lib/pacemaker/pcmk_sched_transition.c b/lib/pacemaker/pcmk_sched_transition.c +index 678c3f5dd2..1aa90a5a0b 100644 +--- a/lib/pacemaker/pcmk_sched_transition.c ++++ b/lib/pacemaker/pcmk_sched_transition.c +@@ -201,8 +201,7 @@ inject_op(xmlNode * cib_resource, lrmd_event_data_t * op, int target_rc) + inject_op(xmlNode * cib_resource, lrmd_event_data_t * op, int target_rc) + { + return pcmk__create_history_xml(cib_resource, op, CRM_FEATURE_SET, +- target_rc, NULL, crm_system_name, +- LOG_TRACE); ++ target_rc, NULL, crm_system_name); + } + + static xmlNode * +diff --git a/lib/pacemaker/pcmk_sched_actions.c b/lib/pacemaker/pcmk_sched_actions.c +index f8200b0efc..4f63d3374d 100644 +--- a/lib/pacemaker/pcmk_sched_utils.c ++++ b/lib/pacemaker/pcmk_sched_utils.c +@@ -892,14 +892,13 @@ add_op_digest_to_xml(lrmd_event_data_t *op, xmlNode *update) + * \param[in] target_rc Expected result 
of operation + * \param[in] node Name of node on which operation was performed + * \param[in] origin Arbitrary description of update source +- * \param[in] level A log message will be logged at this level + * + * \return Newly created XML node for history update + */ + xmlNode * + pcmk__create_history_xml(xmlNode *parent, lrmd_event_data_t *op, + const char *caller_version, int target_rc, +- const char *node, const char *origin, int level) ++ const char *node, const char *origin) + { + char *key = NULL; + char *magic = NULL; +@@ -912,11 +911,10 @@ pcmk__create_history_xml(xmlNode *parent, lrmd_event_data_t *op, + const char *task = NULL; + + CRM_CHECK(op != NULL, return NULL); +- do_crm_log(level, "%s: Updating resource %s after %s op %s (interval=%u)", +- origin, op->rsc_id, op->op_type, +- pcmk_exec_status_str(op->op_status), op->interval_ms); +- +- crm_trace("DC version: %s", caller_version); ++ crm_trace("Creating history XML for %s-interval %s action for %s on %s " ++ "(DC version: %s, origin: %s)", ++ pcmk__readable_interval(op->interval_ms), op->op_type, op->rsc_id, ++ ((node == NULL)? "no node" : node), caller_version, origin); + + task = op->op_type; + +-- +2.27.0 + + +From 06b1da9e5345e0d1571042c11646fd7157961279 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Tue, 21 Dec 2021 17:09:44 -0600 +Subject: [PATCH 10/12] Feature: controller: improve exit reason for internal + timeouts + +Functionize the part of controld_record_action_timeout() that creates a fake +executor event, into a new function synthesize_timeout_event(), and have it set +a more detailed exit reason describing what timed out. 
+--- + daemons/controld/controld_te_actions.c | 61 ++++++++++++++++++++------ + 1 file changed, 48 insertions(+), 13 deletions(-) + +diff --git a/daemons/controld/controld_te_actions.c b/daemons/controld/controld_te_actions.c +index b0bcb8b2e4..de2fbb82bf 100644 +--- a/daemons/controld/controld_te_actions.c ++++ b/daemons/controld/controld_te_actions.c +@@ -175,6 +175,53 @@ te_crm_command(crm_graph_t * graph, crm_action_t * action) + return TRUE; + } + ++/*! ++ * \internal ++ * \brief Synthesize an executor event for a resource action timeout ++ * ++ * \param[in] action Resource action that timed out ++ * \param[in] target_rc Expected result of action that timed out ++ * ++ * Synthesize an executor event for a resource action timeout. (If the executor ++ * gets a timeout while waiting for a resource action to complete, that will be ++ * reported via the usual callback. This timeout means we didn't hear from the ++ * executor itself or the controller that relayed the action to the executor.) ++ * ++ * \return Newly created executor event for result of \p action ++ * \note The caller is responsible for freeing the return value using ++ * lrmd_free_event(). 
++ */ ++static lrmd_event_data_t * ++synthesize_timeout_event(crm_action_t *action, int target_rc) ++{ ++ lrmd_event_data_t *op = NULL; ++ const char *target = crm_element_value(action->xml, XML_LRM_ATTR_TARGET); ++ const char *reason = NULL; ++ char *dynamic_reason = NULL; ++ ++ if (pcmk__str_eq(target, get_local_node_name(), pcmk__str_casei)) { ++ reason = "Local executor did not return result in time"; ++ } else { ++ const char *router_node = NULL; ++ ++ router_node = crm_element_value(action->xml, XML_LRM_ATTR_ROUTER_NODE); ++ if (router_node == NULL) { ++ router_node = target; ++ } ++ dynamic_reason = crm_strdup_printf("Controller on %s did not return " ++ "result in time", router_node); ++ reason = dynamic_reason; ++ } ++ ++ op = pcmk__event_from_graph_action(NULL, action, PCMK_EXEC_TIMEOUT, ++ PCMK_OCF_UNKNOWN_ERROR, reason); ++ op->call_id = -1; ++ op->user_data = pcmk__transition_key(transition_graph->id, action->id, ++ target_rc, te_uuid); ++ free(dynamic_reason); ++ return op; ++} ++ + void + controld_record_action_timeout(crm_action_t *action) + { +@@ -231,19 +278,7 @@ controld_record_action_timeout(crm_action_t *action) + crm_copy_xml_element(action_rsc, rsc, XML_AGENT_ATTR_CLASS); + crm_copy_xml_element(action_rsc, rsc, XML_AGENT_ATTR_PROVIDER); + +- /* If the executor gets a timeout while waiting for the action to complete, +- * that will be reported via the usual callback. This timeout means that we +- * didn't hear from the executor or the controller that relayed the action +- * to the executor. 
+- */ +- op = pcmk__event_from_graph_action(NULL, action, PCMK_EXEC_TIMEOUT, +- PCMK_OCF_UNKNOWN_ERROR, +- "Cluster communication timeout " +- "(no response from executor)"); +- op->call_id = -1; +- op->user_data = pcmk__transition_key(transition_graph->id, action->id, +- target_rc, te_uuid); +- ++ op = synthesize_timeout_event(action, target_rc); + pcmk__create_history_xml(rsc, op, CRM_FEATURE_SET, target_rc, target, + __func__); + lrmd_free_event(op); +-- +2.27.0 + + +From be620d206faefab967d4c8567d6554d10c9e72ba Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Wed, 22 Dec 2021 16:35:06 -0600 +Subject: [PATCH 11/12] Feature: fencing: improve exit reason for fencing + timeouts + +Troubleshooting timeouts is one of the more difficult aspects of cluster +maintenance. We want to give as much of a hint as possible, but for fencing in +particular it is difficult because an operation might involve multiple retries +of multiple devices. + +Barring another major project to track exactly which devices, retries, etc., +were used in a given operation, these changes in wording are probably the best +we can do. +--- + daemons/fenced/fenced_remote.c | 8 +++++--- + lib/fencing/st_client.c | 2 +- + 2 files changed, 6 insertions(+), 4 deletions(-) + +diff --git a/daemons/fenced/fenced_remote.c b/daemons/fenced/fenced_remote.c +index 1e237150c5..6eebb7381e 100644 +--- a/daemons/fenced/fenced_remote.c ++++ b/daemons/fenced/fenced_remote.c +@@ -1,5 +1,5 @@ + /* +- * Copyright 2009-2021 the Pacemaker project contributors ++ * Copyright 2009-2022 the Pacemaker project contributors + * + * The version control history for this file may have further details. 
+ * +@@ -715,8 +715,10 @@ remote_op_timeout(gpointer userdata) + CRM_XS " id=%.8s", + op->action, op->target, op->client_name, op->id); + } else { +- finalize_timed_out_op(userdata, "Fencing could not be completed " +- "within overall timeout"); ++ finalize_timed_out_op(userdata, "Fencing did not complete within a " ++ "total timeout based on the " ++ "configured timeout and retries for " ++ "any devices attempted"); + } + return G_SOURCE_REMOVE; + } +diff --git a/lib/fencing/st_client.c b/lib/fencing/st_client.c +index a0f3119f3b..718739b321 100644 +--- a/lib/fencing/st_client.c ++++ b/lib/fencing/st_client.c +@@ -906,7 +906,7 @@ invoke_registered_callbacks(stonith_t *stonith, xmlNode *msg, int call_id) + if (msg == NULL) { + // Fencer didn't reply in time + pcmk__set_result(&result, CRM_EX_ERROR, PCMK_EXEC_TIMEOUT, +- "Timeout waiting for reply from fencer"); ++ "Fencer accepted request but did not reply in time"); + CRM_LOG_ASSERT(call_id > 0); + + } else { +-- +2.27.0 + + +From 0fe8ede2f8e838e335fe42846bdf147111ce9955 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Wed, 22 Dec 2021 17:09:09 -0600 +Subject: [PATCH 12/12] Feature: libcrmservice: improve exit reason for + timeouts + +The services library doesn't have enough information about an action to say +(for example) what configuration parameters might be relevant, but we can at +least distinguish what kind of agent timed out. 
+--- + lib/services/services_linux.c | 12 +++++++++++- + lib/services/systemd.c | 2 +- + 2 files changed, 12 insertions(+), 2 deletions(-) + +diff --git a/lib/services/services_linux.c b/lib/services/services_linux.c +index f15eee860e..d6aafcfe46 100644 +--- a/lib/services/services_linux.c ++++ b/lib/services/services_linux.c +@@ -677,9 +677,19 @@ async_action_complete(mainloop_child_t *p, pid_t pid, int core, int signo, + parse_exit_reason_from_stderr(op); + + } else if (mainloop_child_timeout(p)) { ++ const char *reason = NULL; ++ ++ if (op->rsc != NULL) { ++ reason = "Resource agent did not complete in time"; ++ } else if (pcmk__str_eq(op->standard, PCMK_RESOURCE_CLASS_STONITH, ++ pcmk__str_none)) { ++ reason = "Fence agent did not complete in time"; ++ } else { ++ reason = "Process did not complete in time"; ++ } + crm_info("%s[%d] timed out after %dms", op->id, op->pid, op->timeout); + services__set_result(op, services__generic_error(op), PCMK_EXEC_TIMEOUT, +- "Process did not exit within specified timeout"); ++ reason); + + } else if (op->cancel) { + /* If an in-flight recurring operation was killed because it was +diff --git a/lib/services/systemd.c b/lib/services/systemd.c +index 27a3b376db..d87b287424 100644 +--- a/lib/services/systemd.c ++++ b/lib/services/systemd.c +@@ -995,7 +995,7 @@ systemd_timeout_callback(gpointer p) + crm_info("%s action for systemd unit %s named '%s' timed out", + op->action, op->agent, op->rsc); + services__set_result(op, PCMK_OCF_UNKNOWN_ERROR, PCMK_EXEC_TIMEOUT, +- "Systemd action did not complete within specified timeout"); ++ "Systemd unit action did not complete in time"); + services__finalize_async_op(op); + return FALSE; + } +-- +2.27.0 + diff --git a/SOURCES/019-corosync-tracking.patch b/SOURCES/019-corosync-tracking.patch new file mode 100644 index 0000000..ac3ca96 --- /dev/null +++ b/SOURCES/019-corosync-tracking.patch @@ -0,0 +1,29 @@ +From e8bf0161b872267f1bb7143a9866fdc15ec218f2 Mon Sep 17 00:00:00 2001 +From: Jan 
Friesse +Date: Tue, 18 Jan 2022 16:35:24 +0100 +Subject: [PATCH] Fix: corosync: Repeat corosync_cfg_trackstart + +corosync_cfg_trackstart can fail with CS_ERR_TRY_AGAIN failure so +(similarly as for corosync_cfg_local_get, ...) handle failure with +using cs_repeat macro. +--- + daemons/pacemakerd/pcmkd_corosync.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/daemons/pacemakerd/pcmkd_corosync.c b/daemons/pacemakerd/pcmkd_corosync.c +index 7990bc43c5..cd7a40321d 100644 +--- a/daemons/pacemakerd/pcmkd_corosync.c ++++ b/daemons/pacemakerd/pcmkd_corosync.c +@@ -186,7 +186,8 @@ cluster_connect_cfg(void) + crm_debug("Corosync reports local node ID is %lu", (unsigned long) nodeid); + + #ifdef HAVE_COROSYNC_CFG_TRACKSTART +- rc = corosync_cfg_trackstart(cfg_handle, 0); ++ retries = 0; ++ cs_repeat(retries, 30, rc = corosync_cfg_trackstart(cfg_handle, 0)); + if (rc != CS_OK) { + crm_crit("Could not enable Corosync CFG shutdown tracker: %s " CRM_XS " rc=%d", + cs_strerror(rc), rc); +-- +2.27.0 + diff --git a/SOURCES/019-crm_resource.patch b/SOURCES/019-crm_resource.patch deleted file mode 100644 index 237dde2..0000000 --- a/SOURCES/019-crm_resource.patch +++ /dev/null @@ -1,114 +0,0 @@ -From b4e426a016a4d7c9ade39e60a83644fc537bce26 Mon Sep 17 00:00:00 2001 -From: Oyvind Albrigtsen -Date: Wed, 11 Aug 2021 12:10:32 +0200 -Subject: [PATCH 1/2] Fix: crm_resource: translate LSB rc to exit code and fix - resources_find_service_class() call - ---- - tools/crm_resource_runtime.c | 16 ++++++++++++---- - 1 file changed, 12 insertions(+), 4 deletions(-) - -diff --git a/tools/crm_resource_runtime.c b/tools/crm_resource_runtime.c -index ce037c514..e9d8aa687 100644 ---- a/tools/crm_resource_runtime.c -+++ b/tools/crm_resource_runtime.c -@@ -1718,10 +1718,10 @@ cli_resource_execute_from_params(pcmk__output_t *out, const char *rsc_name, - crm_exit(CRM_EX_UNIMPLEMENT_FEATURE); - } else if (pcmk__str_eq(rsc_class, PCMK_RESOURCE_CLASS_SERVICE, - pcmk__str_casei) && 
!pcmk__str_eq( -- resources_find_service_class(rsc_name), PCMK_RESOURCE_CLASS_LSB, -+ resources_find_service_class(rsc_type), PCMK_RESOURCE_CLASS_LSB, - pcmk__str_casei)) { - out->err(out, "Sorry, the %s option doesn't support %s resources", -- rsc_action, resources_find_service_class(rsc_name)); -+ rsc_action, resources_find_service_class(rsc_type)); - crm_exit(CRM_EX_UNIMPLEMENT_FEATURE); - } - -@@ -1798,9 +1798,17 @@ cli_resource_execute_from_params(pcmk__output_t *out, const char *rsc_name, - if (services_action_sync(op)) { - exit_code = op->rc; - -+ /* Lookup exit code based on rc for LSB resources */ -+ if (( pcmk__str_eq(rsc_class, PCMK_RESOURCE_CLASS_LSB, pcmk__str_casei) || -+ (pcmk__str_eq(rsc_class, PCMK_RESOURCE_CLASS_SERVICE, pcmk__str_casei) && -+ pcmk__str_eq(resources_find_service_class(rsc_type), PCMK_RESOURCE_CLASS_LSB, pcmk__str_casei)) ) && -+ pcmk__str_eq(rsc_action, "force-check", pcmk__str_casei)) { -+ exit_code = services_get_ocf_exitcode(action, exit_code); -+ } -+ - out->message(out, "resource-agent-action", resource_verbose, rsc_class, -- rsc_prov, rsc_type, rsc_name, rsc_action, override_hash, op->rc, -- op->status, op->stdout_data, op->stderr_data); -+ rsc_prov, rsc_type, rsc_name, rsc_action, override_hash, -+ exit_code, op->status, op->stdout_data, op->stderr_data); - } else { - exit_code = op->rc == 0 ? 
CRM_EX_ERROR : op->rc; - } --- -2.27.0 - - -From 9a6beb74adfb4710fb3a4e588bef79a562c101f3 Mon Sep 17 00:00:00 2001 -From: Oyvind Albrigtsen -Date: Thu, 12 Aug 2021 18:54:30 +0200 -Subject: [PATCH 2/2] Refactor: crm_resource: simplify rsc_class logic by - getting actual class early if it's of class "service" - ---- - tools/crm_resource_runtime.c | 23 +++++++++-------------- - 1 file changed, 9 insertions(+), 14 deletions(-) - -diff --git a/tools/crm_resource_runtime.c b/tools/crm_resource_runtime.c -index e9d8aa687..13b78b6b9 100644 ---- a/tools/crm_resource_runtime.c -+++ b/tools/crm_resource_runtime.c -@@ -1702,26 +1702,23 @@ cli_resource_execute_from_params(pcmk__output_t *out, const char *rsc_name, - int timeout_ms, int resource_verbose, gboolean force, - int check_level) - { -+ const char *class = NULL; - const char *action = NULL; - GHashTable *params_copy = NULL; - crm_exit_t exit_code = CRM_EX_OK; - svc_action_t *op = NULL; - -- if (pcmk__str_eq(rsc_class, PCMK_RESOURCE_CLASS_STONITH, pcmk__str_casei)) { -+ class = !pcmk__str_eq(rsc_class, PCMK_RESOURCE_CLASS_SERVICE, pcmk__str_casei) ? 
-+ rsc_class : resources_find_service_class(rsc_type); -+ -+ if (pcmk__str_eq(class, PCMK_RESOURCE_CLASS_STONITH, pcmk__str_casei)) { - out->err(out, "Sorry, the %s option doesn't support %s resources yet", -- rsc_action, rsc_class); -+ rsc_action, class); - crm_exit(CRM_EX_UNIMPLEMENT_FEATURE); -- } else if (pcmk__strcase_any_of(rsc_class, PCMK_RESOURCE_CLASS_SYSTEMD, -+ } else if (pcmk__strcase_any_of(class, PCMK_RESOURCE_CLASS_SYSTEMD, - PCMK_RESOURCE_CLASS_UPSTART, PCMK_RESOURCE_CLASS_NAGIOS, NULL)) { - out->err(out, "Sorry, the %s option doesn't support %s resources", -- rsc_action, rsc_class); -- crm_exit(CRM_EX_UNIMPLEMENT_FEATURE); -- } else if (pcmk__str_eq(rsc_class, PCMK_RESOURCE_CLASS_SERVICE, -- pcmk__str_casei) && !pcmk__str_eq( -- resources_find_service_class(rsc_type), PCMK_RESOURCE_CLASS_LSB, -- pcmk__str_casei)) { -- out->err(out, "Sorry, the %s option doesn't support %s resources", -- rsc_action, resources_find_service_class(rsc_type)); -+ rsc_action, class); - crm_exit(CRM_EX_UNIMPLEMENT_FEATURE); - } - -@@ -1799,9 +1796,7 @@ cli_resource_execute_from_params(pcmk__output_t *out, const char *rsc_name, - exit_code = op->rc; - - /* Lookup exit code based on rc for LSB resources */ -- if (( pcmk__str_eq(rsc_class, PCMK_RESOURCE_CLASS_LSB, pcmk__str_casei) || -- (pcmk__str_eq(rsc_class, PCMK_RESOURCE_CLASS_SERVICE, pcmk__str_casei) && -- pcmk__str_eq(resources_find_service_class(rsc_type), PCMK_RESOURCE_CLASS_LSB, pcmk__str_casei)) ) && -+ if (pcmk__str_eq(class, PCMK_RESOURCE_CLASS_LSB, pcmk__str_casei) && - pcmk__str_eq(rsc_action, "force-check", pcmk__str_casei)) { - exit_code = services_get_ocf_exitcode(action, exit_code); - } --- -2.27.0 - diff --git a/SOURCES/020-fence_watchdog.patch b/SOURCES/020-fence_watchdog.patch deleted file mode 100644 index 76abe27..0000000 --- a/SOURCES/020-fence_watchdog.patch +++ /dev/null @@ -1,25 +0,0 @@ -From 46dd1118cae948649e000b2159e8e92623520ad9 Mon Sep 17 00:00:00 2001 -From: Klaus Wenninger -Date: Thu, 19 
Aug 2021 09:28:54 +0200 -Subject: [PATCH] Fix: fence_watchdog: fix malformed xml in metadata - ---- - daemons/fenced/fence_watchdog.in | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/daemons/fenced/fence_watchdog.in b/daemons/fenced/fence_watchdog.in -index 700065e0e..eefa7395e 100755 ---- a/daemons/fenced/fence_watchdog.in -+++ b/daemons/fenced/fence_watchdog.in -@@ -124,7 +124,7 @@ def metadata(avail_opt, options): - for option, dummy in sorted_options(avail_opt): - if "shortdesc" in ALL_OPT[option]: - print(' ') -+ '" required="' + ALL_OPT[option]["required"] + '">') - - default = "" - default_name_arg = "-" + ALL_OPT[option]["getopt"][:-1] --- -2.27.0 - diff --git a/SOURCES/020-systemd-unit.patch b/SOURCES/020-systemd-unit.patch new file mode 100644 index 0000000..a425ae3 --- /dev/null +++ b/SOURCES/020-systemd-unit.patch @@ -0,0 +1,41 @@ +From e316840a7e1d2a72e3089ee194334244c959905a Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Wed, 19 Jan 2022 09:53:53 -0600 +Subject: [PATCH] Fix: pacemakerd: tweak systemd unit respawn settings + +If pacemaker exits immediately after starting, wait 1 second before trying to +respawn, since the default of 100ms is a bit aggressive for a Pacemaker +cluster. + +Also, allow 5 attempts in 25 seconds before giving up. +--- + daemons/pacemakerd/pacemaker.service.in | 6 ++++++ + 1 file changed, 6 insertions(+) + +diff --git a/daemons/pacemakerd/pacemaker.service.in b/daemons/pacemakerd/pacemaker.service.in +index 0363a2259c..3fd53d9ffb 100644 +--- a/daemons/pacemakerd/pacemaker.service.in ++++ b/daemons/pacemakerd/pacemaker.service.in +@@ -31,6 +31,9 @@ After=rsyslog.service + After=corosync.service + Requires=corosync.service + ++# If Pacemaker respawns repeatedly, give up after this many tries in this time ++StartLimitBurst=5 ++StartLimitIntervalSec=25s + + [Install] + WantedBy=multi-user.target +@@ -57,6 +60,9 @@ TasksMax=infinity + # resource. 
Sending -KILL will just get the node fenced + SendSIGKILL=no + ++# Systemd's default of respawning a failed service after 100ms is too aggressive ++RestartSec=1s ++ + # If we ever hit the StartLimitInterval/StartLimitBurst limit, and the + # admin wants to stop the cluster while pacemakerd is not running, it + # might be a good idea to enable the ExecStopPost directive below. +-- +2.27.0 + diff --git a/SOURCES/021-failure-messages.patch b/SOURCES/021-failure-messages.patch new file mode 100644 index 0000000..fab1013 --- /dev/null +++ b/SOURCES/021-failure-messages.patch @@ -0,0 +1,1338 @@ +From 9ee3d6c9b0aba6aae022cc152a3b3472fe388fa3 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Thu, 6 Jan 2022 16:44:32 -0600 +Subject: [PATCH 01/15] Refactor: fencer: add exit reason to fencing operation + object + +In order to pass a fencing action's exit reason with the action history, +we need the exit reason in remote_fencing_op_t. Nothing sets or uses it as of +this commit. +--- + daemons/fenced/fenced_remote.c | 2 ++ + daemons/fenced/pacemaker-fenced.h | 4 +++- + 2 files changed, 5 insertions(+), 1 deletion(-) + +diff --git a/daemons/fenced/fenced_remote.c b/daemons/fenced/fenced_remote.c +index 6eebb7381e..0fa9706140 100644 +--- a/daemons/fenced/fenced_remote.c ++++ b/daemons/fenced/fenced_remote.c +@@ -260,6 +260,8 @@ free_remote_op(gpointer data) + } + g_list_free_full(op->automatic_list, free); + g_list_free(op->duplicates); ++ ++ pcmk__reset_result(&op->result); + free(op); + } + +diff --git a/daemons/fenced/pacemaker-fenced.h b/daemons/fenced/pacemaker-fenced.h +index 502fcc9a29..1a5c933ea7 100644 +--- a/daemons/fenced/pacemaker-fenced.h ++++ b/daemons/fenced/pacemaker-fenced.h +@@ -1,5 +1,5 @@ + /* +- * Copyright 2009-2021 the Pacemaker project contributors ++ * Copyright 2009-2022 the Pacemaker project contributors + * + * This source code is licensed under the GNU General Public License version 2 + * or later (GPLv2+) WITHOUT ANY WARRANTY. 
+@@ -151,6 +151,8 @@ typedef struct remote_fencing_op_s { + /*! The point at which the remote operation completed(nsec) */ + long long completed_nsec; + ++ /*! The (potentially intermediate) result of the operation */ ++ pcmk__action_result_t result; + } remote_fencing_op_t; + + void fenced_broadcast_op_result(remote_fencing_op_t *op, +-- +2.27.0 + + +From 97a2c318866adc5ef5e426c5c3b753df1fa3ab66 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Thu, 6 Jan 2022 17:08:42 -0600 +Subject: [PATCH 02/15] Refactor: fencer: track full result in + remote_fencing_op_t + +Now that remote_fencing_op_t has a place for the full result, +set it before calling finalize_op(), instead of passing a separate result +object to finalize_op(). + +As a bonus, this simplifies the memory management, reducing the chance of +mistakes. +--- + daemons/fenced/fenced_remote.c | 161 ++++++++++++++++----------------- + 1 file changed, 77 insertions(+), 84 deletions(-) + +diff --git a/daemons/fenced/fenced_remote.c b/daemons/fenced/fenced_remote.c +index 0fa9706140..30edbff890 100644 +--- a/daemons/fenced/fenced_remote.c ++++ b/daemons/fenced/fenced_remote.c +@@ -82,8 +82,7 @@ extern xmlNode *stonith_create_op(int call_id, const char *token, const char *op + static void request_peer_fencing(remote_fencing_op_t *op, + peer_device_info_t *peer, + pcmk__action_result_t *result); +-static void finalize_op(remote_fencing_op_t *op, xmlNode *data, +- pcmk__action_result_t *result, bool dup); ++static void finalize_op(remote_fencing_op_t *op, xmlNode *data, bool dup); + static void report_timeout_period(remote_fencing_op_t * op, int op_timeout); + static int get_op_total_timeout(const remote_fencing_op_t *op, + const peer_device_info_t *chosen_peer); +@@ -485,7 +484,9 @@ finalize_op_duplicates(remote_fencing_op_t *op, xmlNode *data, + other->client_name, other->originator, + pcmk_exec_status_str(result->execution_status), + other->id); +- finalize_op(other, data, result, true); ++ 
pcmk__set_result(&other->result, result->exit_status, ++ result->execution_status, result->exit_reason); ++ finalize_op(other, data, true); + + } else { + // Possible if (for example) it timed out already +@@ -520,20 +521,20 @@ delegate_from_xml(xmlNode *xml) + * + * \param[in] op Fencer operation that completed + * \param[in] data If not NULL, XML reply of last delegated fencing operation +- * \param[in] result Full operation result + * \param[in] dup Whether this operation is a duplicate of another + * (in which case, do not broadcast the result) ++ * ++ * \note The operation result should be set before calling this function. + */ + static void +-finalize_op(remote_fencing_op_t *op, xmlNode *data, +- pcmk__action_result_t *result, bool dup) ++finalize_op(remote_fencing_op_t *op, xmlNode *data, bool dup) + { + int level = LOG_ERR; + const char *subt = NULL; + xmlNode *local_data = NULL; + gboolean op_merged = FALSE; + +- CRM_CHECK((op != NULL) && (result != NULL), return); ++ CRM_CHECK((op != NULL), return); + + if (op->notify_sent) { + // Most likely, this is a timed-out action that eventually completed +@@ -557,11 +558,11 @@ finalize_op(remote_fencing_op_t *op, xmlNode *data, + local_data = data; + + } else if (op->delegate == NULL) { +- switch (result->execution_status) { ++ switch (op->result.execution_status) { + case PCMK_EXEC_NO_FENCE_DEVICE: + break; + case PCMK_EXEC_INVALID: +- if (result->exit_status == CRM_EX_EXPIRED) { ++ if (op->result.exit_status == CRM_EX_EXPIRED) { + break; + } + // else fall through +@@ -581,12 +582,12 @@ finalize_op(remote_fencing_op_t *op, xmlNode *data, + subt = crm_element_value(data, F_SUBTYPE); + if (!dup && !pcmk__str_eq(subt, "broadcast", pcmk__str_casei)) { + /* Defer notification until the bcast message arrives */ +- fenced_broadcast_op_result(op, result, op_merged); ++ fenced_broadcast_op_result(op, &op->result, op_merged); + free_xml(local_data); + return; + } + +- if (pcmk__result_ok(result) || dup ++ if 
(pcmk__result_ok(&op->result) || dup + || !pcmk__str_eq(op->originator, stonith_our_uname, pcmk__str_casei)) { + level = LOG_NOTICE; + } +@@ -595,16 +596,17 @@ finalize_op(remote_fencing_op_t *op, xmlNode *data, + (op->target? op->target : ""), + (op->delegate? op->delegate : "unknown node"), + op->client_name, op->originator, +- (op_merged? " (merged)" : ""), crm_exit_str(result->exit_status), +- pcmk_exec_status_str(result->execution_status), +- ((result->exit_reason == NULL)? "" : ": "), +- ((result->exit_reason == NULL)? "" : result->exit_reason), ++ (op_merged? " (merged)" : ""), ++ crm_exit_str(op->result.exit_status), ++ pcmk_exec_status_str(op->result.execution_status), ++ ((op->result.exit_reason == NULL)? "" : ": "), ++ ((op->result.exit_reason == NULL)? "" : op->result.exit_reason), + op->id); + +- handle_local_reply_and_notify(op, data, result); ++ handle_local_reply_and_notify(op, data, &op->result); + + if (!dup) { +- finalize_op_duplicates(op, data, result); ++ finalize_op_duplicates(op, data, &op->result); + } + + /* Free non-essential parts of the record +@@ -634,7 +636,6 @@ static gboolean + remote_op_watchdog_done(gpointer userdata) + { + remote_fencing_op_t *op = userdata; +- pcmk__action_result_t result = PCMK__UNKNOWN_RESULT; + + op->op_timer_one = 0; + +@@ -642,8 +643,8 @@ remote_op_watchdog_done(gpointer userdata) + CRM_XS " id=%.8s", + op->action, op->target, op->client_name, op->id); + op->state = st_done; +- pcmk__set_result(&result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); +- finalize_op(op, NULL, &result, false); ++ pcmk__set_result(&op->result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); ++ finalize_op(op, NULL, false); + return G_SOURCE_REMOVE; + } + +@@ -676,8 +677,6 @@ remote_op_timeout_one(gpointer userdata) + static void + finalize_timed_out_op(remote_fencing_op_t *op, const char *reason) + { +- pcmk__action_result_t result = PCMK__UNKNOWN_RESULT; +- + op->op_timer_total = 0; + + crm_debug("Action '%s' targeting %s for client %s timed out " +@@ 
-690,13 +689,12 @@ finalize_timed_out_op(remote_fencing_op_t *op, const char *reason) + * devices, and return success. + */ + op->state = st_done; +- pcmk__set_result(&result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); ++ pcmk__set_result(&op->result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); + } else { + op->state = st_failed; +- pcmk__set_result(&result, CRM_EX_ERROR, PCMK_EXEC_TIMEOUT, reason); ++ pcmk__set_result(&op->result, CRM_EX_ERROR, PCMK_EXEC_TIMEOUT, reason); + } +- finalize_op(op, NULL, &result, false); +- pcmk__reset_result(&result); ++ finalize_op(op, NULL, false); + } + + /*! +@@ -1094,13 +1092,9 @@ fenced_handle_manual_confirmation(pcmk__client_t *client, xmlNode *msg) + set_fencing_completed(op); + op->delegate = strdup("a human"); + +- { +- // For the fencer's purposes, the fencing operation is done +- pcmk__action_result_t result = PCMK__UNKNOWN_RESULT; +- +- pcmk__set_result(&result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); +- finalize_op(op, msg, &result, false); +- } ++ // For the fencer's purposes, the fencing operation is done ++ pcmk__set_result(&op->result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); ++ finalize_op(op, msg, false); + + /* For the requester's purposes, the operation is still pending. The + * actual result will be sent asynchronously via the operation's done_cb(). 
+@@ -1279,16 +1273,11 @@ initiate_remote_stonith_op(pcmk__client_t *client, xmlNode *request, + switch (op->state) { + case st_failed: + // advance_topology_level() exhausted levels +- { +- pcmk__action_result_t result = PCMK__UNKNOWN_RESULT; +- +- pcmk__set_result(&result, CRM_EX_ERROR, PCMK_EXEC_ERROR, +- "All topology levels failed"); +- crm_warn("Could not request peer fencing (%s) targeting %s " +- CRM_XS " id=%.8s", op->action, op->target, op->id); +- finalize_op(op, NULL, &result, false); +- pcmk__reset_result(&result); +- } ++ pcmk__set_result(&op->result, CRM_EX_ERROR, PCMK_EXEC_ERROR, ++ "All topology levels failed"); ++ crm_warn("Could not request peer fencing (%s) targeting %s " ++ CRM_XS " id=%.8s", op->action, op->target, op->id); ++ finalize_op(op, NULL, false); + return op; + + case st_duplicate: +@@ -1613,10 +1602,6 @@ static void + advance_topology_device_in_level(remote_fencing_op_t *op, const char *device, + xmlNode *msg) + { +- pcmk__action_result_t result = PCMK__UNKNOWN_RESULT; +- +- pcmk__set_result(&result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); +- + /* Advance to the next device at this topology level, if any */ + if (op->devices) { + op->devices = op->devices->next; +@@ -1644,6 +1629,10 @@ advance_topology_device_in_level(remote_fencing_op_t *op, const char *device, + } + + if (op->devices) { ++ pcmk__action_result_t result = PCMK__UNKNOWN_RESULT; ++ ++ pcmk__set_result(&result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); ++ + /* Necessary devices remain, so execute the next one */ + crm_trace("Next targeting %s on behalf of %s@%s", + op->target, op->client_name, op->originator); +@@ -1659,7 +1648,8 @@ advance_topology_device_in_level(remote_fencing_op_t *op, const char *device, + crm_trace("Marking complex fencing op targeting %s as complete", + op->target); + op->state = st_done; +- finalize_op(op, msg, &result, false); ++ pcmk__set_result(&op->result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); ++ finalize_op(op, msg, false); + } + } + +@@ -1868,7 +1858,9 @@ 
request_peer_fencing(remote_fencing_op_t *op, peer_device_info_t *peer, + } + + op->state = st_failed; +- finalize_op(op, NULL, result, false); ++ pcmk__set_result(&op->result, result->exit_status, ++ result->execution_status, result->exit_reason); ++ finalize_op(op, NULL, false); + + } else { + crm_info("Waiting for additional peers capable of fencing (%s) %s%s%s " +@@ -2245,31 +2237,34 @@ fenced_process_fencing_reply(xmlNode *msg) + /* Could be for an event that began before we started */ + /* TODO: Record the op for later querying */ + crm_info("Received peer result of unknown or expired operation %s", id); +- goto done; ++ pcmk__reset_result(&result); ++ return; + } + ++ op->result = result; // The operation takes ownership of the result ++ + if (op->devices && device && !pcmk__str_eq(op->devices->data, device, pcmk__str_casei)) { + crm_err("Received outdated reply for device %s (instead of %s) to " + "fence (%s) %s. Operation already timed out at peer level.", + device, (const char *) op->devices->data, op->action, op->target); +- goto done; ++ return; + } + + if (pcmk__str_eq(crm_element_value(msg, F_SUBTYPE), "broadcast", pcmk__str_casei)) { + crm_debug("Finalizing action '%s' targeting %s on behalf of %s@%s: %s%s%s%s " + CRM_XS " id=%.8s", + op->action, op->target, op->client_name, op->originator, +- pcmk_exec_status_str(result.execution_status), +- (result.exit_reason == NULL)? "" : " (", +- (result.exit_reason == NULL)? "" : result.exit_reason, +- (result.exit_reason == NULL)? "" : ")", op->id); +- if (pcmk__result_ok(&result)) { ++ pcmk_exec_status_str(op->result.execution_status), ++ (op->result.exit_reason == NULL)? "" : " (", ++ (op->result.exit_reason == NULL)? "" : op->result.exit_reason, ++ (op->result.exit_reason == NULL)? 
"" : ")", op->id); ++ if (pcmk__result_ok(&op->result)) { + op->state = st_done; + } else { + op->state = st_failed; + } +- finalize_op(op, msg, &result, false); +- goto done; ++ finalize_op(op, msg, false); ++ return; + + } else if (!pcmk__str_eq(op->originator, stonith_our_uname, pcmk__str_casei)) { + /* If this isn't a remote level broadcast, and we are not the +@@ -2277,7 +2272,7 @@ fenced_process_fencing_reply(xmlNode *msg) + crm_err("Received non-broadcast fencing result for operation %.8s " + "we do not own (device %s targeting %s)", + op->id, device, op->target); +- goto done; ++ return; + } + + if (pcmk_is_set(op->call_options, st_opt_topology)) { +@@ -2286,58 +2281,58 @@ fenced_process_fencing_reply(xmlNode *msg) + crm_notice("Action '%s' targeting %s using %s on behalf of %s@%s: %s%s%s%s", + op->action, op->target, device, op->client_name, + op->originator, +- pcmk_exec_status_str(result.execution_status), +- (result.exit_reason == NULL)? "" : " (", +- (result.exit_reason == NULL)? "" : result.exit_reason, +- (result.exit_reason == NULL)? "" : ")"); ++ pcmk_exec_status_str(op->result.execution_status), ++ (op->result.exit_reason == NULL)? "" : " (", ++ (op->result.exit_reason == NULL)? "" : op->result.exit_reason, ++ (op->result.exit_reason == NULL)? "" : ")"); + + /* We own the op, and it is complete. broadcast the result to all nodes + * and notify our local clients. */ + if (op->state == st_done) { +- finalize_op(op, msg, &result, false); +- goto done; ++ finalize_op(op, msg, false); ++ return; + } + +- if ((op->phase == 2) && !pcmk__result_ok(&result)) { ++ if ((op->phase == 2) && !pcmk__result_ok(&op->result)) { + /* A remapped "on" failed, but the node was already turned off + * successfully, so ignore the error and continue. + */ + crm_warn("Ignoring %s 'on' failure (%s%s%s) targeting %s " + "after successful 'off'", +- device, pcmk_exec_status_str(result.execution_status), +- (result.exit_reason == NULL)? 
"" : ": ", +- (result.exit_reason == NULL)? "" : result.exit_reason, ++ device, pcmk_exec_status_str(op->result.execution_status), ++ (op->result.exit_reason == NULL)? "" : ": ", ++ (op->result.exit_reason == NULL)? "" : op->result.exit_reason, + op->target); +- pcmk__set_result(&result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); ++ pcmk__set_result(&op->result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); + } + +- if (pcmk__result_ok(&result)) { ++ if (pcmk__result_ok(&op->result)) { + /* An operation completed successfully. Try another device if + * necessary, otherwise mark the operation as done. */ + advance_topology_device_in_level(op, device, msg); +- goto done; ++ return; + } else { + /* This device failed, time to try another topology level. If no other + * levels are available, mark this operation as failed and report results. */ + if (advance_topology_level(op, false) != pcmk_rc_ok) { + op->state = st_failed; +- finalize_op(op, msg, &result, false); +- goto done; ++ finalize_op(op, msg, false); ++ return; + } + } + +- } else if (pcmk__result_ok(&result) && (op->devices == NULL)) { ++ } else if (pcmk__result_ok(&op->result) && (op->devices == NULL)) { + crm_trace("All done for %s", op->target); + op->state = st_done; +- finalize_op(op, msg, &result, false); +- goto done; ++ finalize_op(op, msg, false); ++ return; + +- } else if ((result.execution_status == PCMK_EXEC_TIMEOUT) ++ } else if ((op->result.execution_status == PCMK_EXEC_TIMEOUT) + && (op->devices == NULL)) { + /* If the operation timed out don't bother retrying other peers. 
*/ + op->state = st_failed; +- finalize_op(op, msg, &result, false); +- goto done; ++ finalize_op(op, msg, false); ++ return; + + } else { + /* fall-through and attempt other fencing action using another peer */ +@@ -2346,10 +2341,8 @@ fenced_process_fencing_reply(xmlNode *msg) + /* Retry on failure */ + crm_trace("Next for %s on behalf of %s@%s (result was: %s)", + op->target, op->originator, op->client_name, +- pcmk_exec_status_str(result.execution_status)); +- request_peer_fencing(op, NULL, &result); +-done: +- pcmk__reset_result(&result); ++ pcmk_exec_status_str(op->result.execution_status)); ++ request_peer_fencing(op, NULL, &op->result); + } + + gboolean +-- +2.27.0 + + +From c59d062154f7c9e15e90929a20ea244d7efd7247 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Thu, 6 Jan 2022 17:11:12 -0600 +Subject: [PATCH 03/15] Refactor: fencer: drop redundant argument from + finalize_op_duplicates() + +... now that the result is in the op +--- + daemons/fenced/fenced_remote.c | 13 ++++++------- + 1 file changed, 6 insertions(+), 7 deletions(-) + +diff --git a/daemons/fenced/fenced_remote.c b/daemons/fenced/fenced_remote.c +index 30edbff890..8b496e1042 100644 +--- a/daemons/fenced/fenced_remote.c ++++ b/daemons/fenced/fenced_remote.c +@@ -468,11 +468,9 @@ handle_local_reply_and_notify(remote_fencing_op_t *op, xmlNode *data, + * + * \param[in] op Fencer operation that completed + * \param[in] data Top-level XML to add notification to +- * \param[in] result Full operation result + */ + static void +-finalize_op_duplicates(remote_fencing_op_t *op, xmlNode *data, +- pcmk__action_result_t *result) ++finalize_op_duplicates(remote_fencing_op_t *op, xmlNode *data) + { + for (GList *iter = op->duplicates; iter != NULL; iter = iter->next) { + remote_fencing_op_t *other = iter->data; +@@ -482,10 +480,11 @@ finalize_op_duplicates(remote_fencing_op_t *op, xmlNode *data, + crm_debug("Performing duplicate notification for %s@%s: %s " + CRM_XS " id=%.8s", + other->client_name, 
other->originator, +- pcmk_exec_status_str(result->execution_status), ++ pcmk_exec_status_str(op->result.execution_status), + other->id); +- pcmk__set_result(&other->result, result->exit_status, +- result->execution_status, result->exit_reason); ++ pcmk__set_result(&other->result, op->result.exit_status, ++ op->result.execution_status, ++ op->result.exit_reason); + finalize_op(other, data, true); + + } else { +@@ -606,7 +605,7 @@ finalize_op(remote_fencing_op_t *op, xmlNode *data, bool dup) + handle_local_reply_and_notify(op, data, &op->result); + + if (!dup) { +- finalize_op_duplicates(op, data, &op->result); ++ finalize_op_duplicates(op, data); + } + + /* Free non-essential parts of the record +-- +2.27.0 + + +From 6c49675855323a52a534afa112a0861ba2e3b1ad Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Thu, 6 Jan 2022 17:15:17 -0600 +Subject: [PATCH 04/15] Refactor: fencer: drop redundant argument from + fenced_broadcast_op_result() + +... now that the op includes the result +--- + daemons/fenced/fenced_history.c | 9 +++------ + daemons/fenced/fenced_remote.c | 8 +++----- + daemons/fenced/pacemaker-fenced.h | 3 +-- + 3 files changed, 7 insertions(+), 13 deletions(-) + +diff --git a/daemons/fenced/fenced_history.c b/daemons/fenced/fenced_history.c +index 0157deadb3..5cacf36ca8 100644 +--- a/daemons/fenced/fenced_history.c ++++ b/daemons/fenced/fenced_history.c +@@ -1,5 +1,5 @@ + /* +- * Copyright 2009-2021 the Pacemaker project contributors ++ * Copyright 2009-2022 the Pacemaker project contributors + * + * The version control history for this file may have further details. 
+ * +@@ -359,8 +359,6 @@ stonith_local_history_diff_and_merge(GHashTable *remote_history, + } + + if (remote_history) { +- pcmk__action_result_t result = PCMK__UNKNOWN_RESULT; +- + init_stonith_remote_op_hash_table(&stonith_remote_op_list); + + updated |= g_hash_table_size(remote_history); +@@ -378,10 +376,10 @@ stonith_local_history_diff_and_merge(GHashTable *remote_history, + /* CRM_EX_EXPIRED + PCMK_EXEC_INVALID prevents finalize_op() + * from setting a delegate + */ +- pcmk__set_result(&result, CRM_EX_EXPIRED, PCMK_EXEC_INVALID, ++ pcmk__set_result(&op->result, CRM_EX_EXPIRED, PCMK_EXEC_INVALID, + "Initiated by earlier fencer " + "process and presumed failed"); +- fenced_broadcast_op_result(op, &result, false); ++ fenced_broadcast_op_result(op, false); + } + + g_hash_table_iter_steal(&iter); +@@ -396,7 +394,6 @@ stonith_local_history_diff_and_merge(GHashTable *remote_history, + */ + } + +- pcmk__reset_result(&result); + g_hash_table_destroy(remote_history); /* remove what is left */ + } + +diff --git a/daemons/fenced/fenced_remote.c b/daemons/fenced/fenced_remote.c +index 8b496e1042..fb5a5e980e 100644 +--- a/daemons/fenced/fenced_remote.c ++++ b/daemons/fenced/fenced_remote.c +@@ -390,16 +390,14 @@ fencing_result2xml(remote_fencing_op_t *op, pcmk__action_result_t *result) + * \brief Broadcast a fence result notification to all CPG peers + * + * \param[in] op Fencer operation that completed +- * \param[in] result Full operation result + * \param[in] op_merged Whether this operation is a duplicate of another + */ + void +-fenced_broadcast_op_result(remote_fencing_op_t *op, +- pcmk__action_result_t *result, bool op_merged) ++fenced_broadcast_op_result(remote_fencing_op_t *op, bool op_merged) + { + static int count = 0; + xmlNode *bcast = create_xml_node(NULL, T_STONITH_REPLY); +- xmlNode *notify_data = fencing_result2xml(op, result); ++ xmlNode *notify_data = fencing_result2xml(op, &op->result); + + count++; + crm_trace("Broadcasting result to peers"); +@@ -581,7 
+579,7 @@ finalize_op(remote_fencing_op_t *op, xmlNode *data, bool dup) + subt = crm_element_value(data, F_SUBTYPE); + if (!dup && !pcmk__str_eq(subt, "broadcast", pcmk__str_casei)) { + /* Defer notification until the bcast message arrives */ +- fenced_broadcast_op_result(op, &op->result, op_merged); ++ fenced_broadcast_op_result(op, op_merged); + free_xml(local_data); + return; + } +diff --git a/daemons/fenced/pacemaker-fenced.h b/daemons/fenced/pacemaker-fenced.h +index 1a5c933ea7..6213407da3 100644 +--- a/daemons/fenced/pacemaker-fenced.h ++++ b/daemons/fenced/pacemaker-fenced.h +@@ -155,8 +155,7 @@ typedef struct remote_fencing_op_s { + pcmk__action_result_t result; + } remote_fencing_op_t; + +-void fenced_broadcast_op_result(remote_fencing_op_t *op, +- pcmk__action_result_t *result, bool op_merged); ++void fenced_broadcast_op_result(remote_fencing_op_t *op, bool op_merged); + + // Fencer-specific client flags + enum st_client_flags { +-- +2.27.0 + + +From 73994fc740b8833457b130368db479502d49f285 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Thu, 6 Jan 2022 17:17:33 -0600 +Subject: [PATCH 05/15] Refactor: fencer: drop redundant argument from + handle_local_reply_and_notify() + +... 
now that the op includes the result +--- + daemons/fenced/fenced_remote.c | 12 +++++------- + 1 file changed, 5 insertions(+), 7 deletions(-) + +diff --git a/daemons/fenced/fenced_remote.c b/daemons/fenced/fenced_remote.c +index fb5a5e980e..2621cb2f19 100644 +--- a/daemons/fenced/fenced_remote.c ++++ b/daemons/fenced/fenced_remote.c +@@ -424,11 +424,9 @@ fenced_broadcast_op_result(remote_fencing_op_t *op, bool op_merged) + * + * \param[in] op Fencer operation that completed + * \param[in] data Top-level XML to add notification to +- * \param[in] result Full operation result + */ + static void +-handle_local_reply_and_notify(remote_fencing_op_t *op, xmlNode *data, +- pcmk__action_result_t *result) ++handle_local_reply_and_notify(remote_fencing_op_t *op, xmlNode *data) + { + xmlNode *notify_data = NULL; + xmlNode *reply = NULL; +@@ -443,15 +441,15 @@ handle_local_reply_and_notify(remote_fencing_op_t *op, xmlNode *data, + crm_xml_add(data, F_STONITH_TARGET, op->target); + crm_xml_add(data, F_STONITH_OPERATION, op->action); + +- reply = fenced_construct_reply(op->request, data, result); ++ reply = fenced_construct_reply(op->request, data, &op->result); + crm_xml_add(reply, F_STONITH_DELEGATE, op->delegate); + + /* Send fencing OP reply to local client that initiated fencing */ + do_local_reply(reply, op->client_id, op->call_options & st_opt_sync_call, FALSE); + + /* bcast to all local clients that the fencing operation happend */ +- notify_data = fencing_result2xml(op, result); +- fenced_send_notification(T_STONITH_NOTIFY_FENCE, result, notify_data); ++ notify_data = fencing_result2xml(op, &op->result); ++ fenced_send_notification(T_STONITH_NOTIFY_FENCE, &op->result, notify_data); + free_xml(notify_data); + fenced_send_notification(T_STONITH_NOTIFY_HISTORY, NULL, NULL); + +@@ -600,7 +598,7 @@ finalize_op(remote_fencing_op_t *op, xmlNode *data, bool dup) + ((op->result.exit_reason == NULL)? 
"" : op->result.exit_reason), + op->id); + +- handle_local_reply_and_notify(op, data, &op->result); ++ handle_local_reply_and_notify(op, data); + + if (!dup) { + finalize_op_duplicates(op, data); +-- +2.27.0 + + +From 194056d18d3b550d3a53b94d558ceed03b5e5442 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Thu, 6 Jan 2022 17:18:27 -0600 +Subject: [PATCH 06/15] Refactor: fencer: drop redundant argument from + fencing_result2xml() + +... now that the op includes the result +--- + daemons/fenced/fenced_remote.c | 9 ++++----- + 1 file changed, 4 insertions(+), 5 deletions(-) + +diff --git a/daemons/fenced/fenced_remote.c b/daemons/fenced/fenced_remote.c +index 2621cb2f19..8d4f53eef6 100644 +--- a/daemons/fenced/fenced_remote.c ++++ b/daemons/fenced/fenced_remote.c +@@ -362,13 +362,12 @@ undo_op_remap(remote_fencing_op_t *op) + * \brief Create notification data XML for a fencing operation result + * + * \param[in] op Fencer operation that completed +- * \param[in] result Full operation result + * + * \return Newly created XML to add as notification data + * \note The caller is responsible for freeing the result. 
+ */ + static xmlNode * +-fencing_result2xml(remote_fencing_op_t *op, pcmk__action_result_t *result) ++fencing_result2xml(remote_fencing_op_t *op) + { + xmlNode *notify_data = create_xml_node(NULL, T_STONITH_NOTIFY_FENCE); + +@@ -381,7 +380,7 @@ fencing_result2xml(remote_fencing_op_t *op, pcmk__action_result_t *result) + crm_xml_add(notify_data, F_STONITH_CLIENTID, op->client_id); + crm_xml_add(notify_data, F_STONITH_CLIENTNAME, op->client_name); + +- stonith__xe_set_result(notify_data, result); ++ stonith__xe_set_result(notify_data, &op->result); + return notify_data; + } + +@@ -397,7 +396,7 @@ fenced_broadcast_op_result(remote_fencing_op_t *op, bool op_merged) + { + static int count = 0; + xmlNode *bcast = create_xml_node(NULL, T_STONITH_REPLY); +- xmlNode *notify_data = fencing_result2xml(op, &op->result); ++ xmlNode *notify_data = fencing_result2xml(op); + + count++; + crm_trace("Broadcasting result to peers"); +@@ -448,7 +447,7 @@ handle_local_reply_and_notify(remote_fencing_op_t *op, xmlNode *data) + do_local_reply(reply, op->client_id, op->call_options & st_opt_sync_call, FALSE); + + /* bcast to all local clients that the fencing operation happend */ +- notify_data = fencing_result2xml(op, &op->result); ++ notify_data = fencing_result2xml(op); + fenced_send_notification(T_STONITH_NOTIFY_FENCE, &op->result, notify_data); + free_xml(notify_data); + fenced_send_notification(T_STONITH_NOTIFY_HISTORY, NULL, NULL); +-- +2.27.0 + + +From c5d38cb201a1219ca95127cba9c3a778e31966a2 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Thu, 6 Jan 2022 17:35:43 -0600 +Subject: [PATCH 07/15] Refactor: fencer: drop redundant argument from + request_peer_fencing() + +... 
now that the op includes the result +--- + daemons/fenced/fenced_remote.c | 66 +++++++++++++--------------------- + 1 file changed, 25 insertions(+), 41 deletions(-) + +diff --git a/daemons/fenced/fenced_remote.c b/daemons/fenced/fenced_remote.c +index 8d4f53eef6..7fb7695fba 100644 +--- a/daemons/fenced/fenced_remote.c ++++ b/daemons/fenced/fenced_remote.c +@@ -80,8 +80,7 @@ extern xmlNode *stonith_create_op(int call_id, const char *token, const char *op + int call_options); + + static void request_peer_fencing(remote_fencing_op_t *op, +- peer_device_info_t *peer, +- pcmk__action_result_t *result); ++ peer_device_info_t *peer); + static void finalize_op(remote_fencing_op_t *op, xmlNode *data, bool dup); + static void report_timeout_period(remote_fencing_op_t * op, int op_timeout); + static int get_op_total_timeout(const remote_fencing_op_t *op, +@@ -646,18 +645,16 @@ static gboolean + remote_op_timeout_one(gpointer userdata) + { + remote_fencing_op_t *op = userdata; +- pcmk__action_result_t result = PCMK__UNKNOWN_RESULT; + + op->op_timer_one = 0; + + crm_notice("Peer's '%s' action targeting %s for client %s timed out " CRM_XS + " id=%.8s", op->action, op->target, op->client_name, op->id); +- pcmk__set_result(&result, CRM_EX_ERROR, PCMK_EXEC_TIMEOUT, ++ pcmk__set_result(&op->result, CRM_EX_ERROR, PCMK_EXEC_TIMEOUT, + "Peer did not return fence result within timeout"); + +- + // Try another device, if appropriate +- request_peer_fencing(op, NULL, &result); ++ request_peer_fencing(op, NULL); + return FALSE; + } + +@@ -730,13 +727,10 @@ remote_op_query_timeout(gpointer data) + crm_debug("Operation %.8s targeting %s already in progress", + op->id, op->target); + } else if (op->query_results) { +- // Result won't be used in this case, but we need to pass something +- pcmk__action_result_t result = PCMK__UNKNOWN_RESULT; +- + // Query succeeded, so attempt the actual fencing + crm_debug("Query %.8s targeting %s complete (state=%s)", + op->id, op->target, 
stonith_op_state_str(op->state)); +- request_peer_fencing(op, NULL, &result); ++ request_peer_fencing(op, NULL); + } else { + crm_debug("Query %.8s targeting %s timed out (state=%s)", + op->id, op->target, stonith_op_state_str(op->state)); +@@ -1622,11 +1616,10 @@ advance_topology_device_in_level(remote_fencing_op_t *op, const char *device, + op_phase_on(op); + } + +- if (op->devices) { +- pcmk__action_result_t result = PCMK__UNKNOWN_RESULT; +- +- pcmk__set_result(&result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); ++ // This function is only called if the previous device succeeded ++ pcmk__set_result(&op->result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); + ++ if (op->devices) { + /* Necessary devices remain, so execute the next one */ + crm_trace("Next targeting %s on behalf of %s@%s", + op->target, op->client_name, op->originator); +@@ -1636,13 +1629,12 @@ advance_topology_device_in_level(remote_fencing_op_t *op, const char *device, + op->delay = 0; + } + +- request_peer_fencing(op, NULL, &result); ++ request_peer_fencing(op, NULL); + } else { + /* We're done with all devices and phases, so finalize operation */ + crm_trace("Marking complex fencing op targeting %s as complete", + op->target); + op->state = st_done; +- pcmk__set_result(&op->result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); + finalize_op(op, msg, false); + } + } +@@ -1673,13 +1665,9 @@ check_watchdog_fencing_and_wait(remote_fencing_op_t * op) + * \param[in] op Fencing operation to be executed + * \param[in] peer If NULL or topology is in use, choose best peer to execute + * the fencing, otherwise use this peer +- * \param[in] result Full result of previous failed attempt, if any (used as +- * final result only if a previous attempt failed, topology +- * is not in use, and no devices remain to be attempted) + */ + static void +-request_peer_fencing(remote_fencing_op_t *op, peer_device_info_t *peer, +- pcmk__action_result_t *result) ++request_peer_fencing(remote_fencing_op_t *op, peer_device_info_t *peer) + { + const char 
*device = NULL; + int timeout; +@@ -1822,27 +1810,26 @@ request_peer_fencing(remote_fencing_op_t *op, peer_device_info_t *peer, + } + } + +- // This is the only case in which result will be used +- CRM_CHECK(result != NULL, return); +- + if (op->state == st_query) { + crm_info("No peers (out of %d) have devices capable of fencing " + "(%s) %s for client %s " CRM_XS " state=%s", + op->replies, op->action, op->target, op->client_name, + stonith_op_state_str(op->state)); + +- pcmk__reset_result(result); +- pcmk__set_result(result, CRM_EX_ERROR, PCMK_EXEC_NO_FENCE_DEVICE, +- NULL); ++ pcmk__reset_result(&op->result); ++ pcmk__set_result(&op->result, CRM_EX_ERROR, ++ PCMK_EXEC_NO_FENCE_DEVICE, NULL); + } else { + if (pcmk_is_set(op->call_options, st_opt_topology)) { +- pcmk__reset_result(result); +- pcmk__set_result(result, CRM_EX_ERROR, ++ pcmk__reset_result(&op->result); ++ pcmk__set_result(&op->result, CRM_EX_ERROR, + PCMK_EXEC_NO_FENCE_DEVICE, NULL); + } +- /* ... else use result provided by caller -- overwriting it with +- PCMK_EXEC_NO_FENCE_DEVICE would prevent finalize_op() from +- setting the correct delegate if needed. ++ /* ... else use existing result from previous failed attempt ++ * (topology is not in use, and no devices remain to be attempted). ++ * Overwriting the result with PCMK_EXEC_NO_FENCE_DEVICE would ++ * prevent finalize_op() from setting the correct delegate if ++ * needed. 
+ */ + + crm_info("No peers (out of %d) are capable of fencing (%s) %s " +@@ -1852,8 +1839,6 @@ request_peer_fencing(remote_fencing_op_t *op, peer_device_info_t *peer, + } + + op->state = st_failed; +- pcmk__set_result(&op->result, result->exit_status, +- result->execution_status, result->exit_reason); + finalize_op(op, NULL, false); + + } else { +@@ -2104,7 +2089,6 @@ process_remote_stonith_query(xmlNode * msg) + peer_device_info_t *peer = NULL; + uint32_t replies_expected; + xmlNode *dev = get_xpath_object("//@" F_STONITH_REMOTE_OP_ID, msg, LOG_ERR); +- pcmk__action_result_t result = PCMK__UNKNOWN_RESULT; + + CRM_CHECK(dev != NULL, return -EPROTO); + +@@ -2139,7 +2123,7 @@ process_remote_stonith_query(xmlNode * msg) + peer = add_result(op, host, ndevices, dev); + } + +- pcmk__set_result(&result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); ++ pcmk__set_result(&op->result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); + + if (pcmk_is_set(op->call_options, st_opt_topology)) { + /* If we start the fencing before all the topology results are in, +@@ -2148,12 +2132,12 @@ process_remote_stonith_query(xmlNode * msg) + if (op->state == st_query && all_topology_devices_found(op)) { + /* All the query results are in for the topology, start the fencing ops. 
*/ + crm_trace("All topology devices found"); +- request_peer_fencing(op, peer, &result); ++ request_peer_fencing(op, peer); + + } else if (have_all_replies) { + crm_info("All topology query replies have arrived, continuing (%d expected/%d received) ", + replies_expected, op->replies); +- request_peer_fencing(op, NULL, &result); ++ request_peer_fencing(op, NULL); + } + + } else if (op->state == st_query) { +@@ -2165,12 +2149,12 @@ process_remote_stonith_query(xmlNode * msg) + /* we have a verified device living on a peer that is not the target */ + crm_trace("Found %d verified device%s", + nverified, pcmk__plural_s(nverified)); +- request_peer_fencing(op, peer, &result); ++ request_peer_fencing(op, peer); + + } else if (have_all_replies) { + crm_info("All query replies have arrived, continuing (%d expected/%d received) ", + replies_expected, op->replies); +- request_peer_fencing(op, NULL, &result); ++ request_peer_fencing(op, NULL); + + } else { + crm_trace("Waiting for more peer results before launching fencing operation"); +@@ -2336,7 +2320,7 @@ fenced_process_fencing_reply(xmlNode *msg) + crm_trace("Next for %s on behalf of %s@%s (result was: %s)", + op->target, op->originator, op->client_name, + pcmk_exec_status_str(op->result.execution_status)); +- request_peer_fencing(op, NULL, &op->result); ++ request_peer_fencing(op, NULL); + } + + gboolean +-- +2.27.0 + + +From be0a0b652c13161a82b05d3104449b7bfc06e8ac Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Thu, 6 Jan 2022 17:56:24 -0600 +Subject: [PATCH 08/15] Feature: fencer: track full result in fencing history + +Add fencing operation results when creating XML in +stonith_local_history_diff_and_merge(), and parse the results from the received +XML in stonith_xml_history_to_list(). + +With this, the fencer now always has full results in its op list, and returns +them in the reply for STONITH_OP_FENCE_HISTORY requests (though nothing uses +that as of this commit). 
+--- + daemons/fenced/fenced_history.c | 2 ++ + 1 file changed, 2 insertions(+) + +diff --git a/daemons/fenced/fenced_history.c b/daemons/fenced/fenced_history.c +index 5cacf36ca8..3ebf016e67 100644 +--- a/daemons/fenced/fenced_history.c ++++ b/daemons/fenced/fenced_history.c +@@ -257,6 +257,7 @@ stonith_xml_history_to_list(xmlNode *history) + op->completed_nsec = completed_nsec; + crm_element_value_int(xml_op, F_STONITH_STATE, &state); + op->state = (enum op_state) state; ++ stonith__xe_get_result(xml_op, &op->result); + + g_hash_table_replace(rv, id, op); + CRM_LOG_ASSERT(g_hash_table_lookup(rv, id) != NULL); +@@ -355,6 +356,7 @@ stonith_local_history_diff_and_merge(GHashTable *remote_history, + crm_xml_add_ll(entry, F_STONITH_DATE, op->completed); + crm_xml_add_ll(entry, F_STONITH_DATE_NSEC, op->completed_nsec); + crm_xml_add_int(entry, F_STONITH_STATE, op->state); ++ stonith__xe_set_result(entry, &op->result); + } + } + +-- +2.27.0 + + +From afc5292036e212bcfc7475893e0b326b2a69ac58 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Wed, 22 Dec 2021 17:17:21 -0600 +Subject: [PATCH 09/15] API: libstonithd: add exit_reason member to + stonith_history_t + +not yet used, but will be +--- + include/crm/stonith-ng.h | 3 ++- + lib/fencing/st_client.c | 3 ++- + 2 files changed, 4 insertions(+), 2 deletions(-) + +diff --git a/include/crm/stonith-ng.h b/include/crm/stonith-ng.h +index 3fe9cf54f8..2c79bfa579 100644 +--- a/include/crm/stonith-ng.h ++++ b/include/crm/stonith-ng.h +@@ -1,5 +1,5 @@ + /* +- * Copyright 2004-2021 the Pacemaker project contributors ++ * Copyright 2004-2022 the Pacemaker project contributors + * + * The version control history for this file may have further details. 
+ * +@@ -111,6 +111,7 @@ typedef struct stonith_history_s { + time_t completed; + struct stonith_history_s *next; + long completed_nsec; ++ char *exit_reason; + } stonith_history_t; + + typedef struct stonith_s stonith_t; +diff --git a/lib/fencing/st_client.c b/lib/fencing/st_client.c +index 718739b321..57a2e03361 100644 +--- a/lib/fencing/st_client.c ++++ b/lib/fencing/st_client.c +@@ -1,5 +1,5 @@ + /* +- * Copyright 2004-2021 the Pacemaker project contributors ++ * Copyright 2004-2022 the Pacemaker project contributors + * + * The version control history for this file may have further details. + * +@@ -735,6 +735,7 @@ void stonith_history_free(stonith_history_t *history) + free(hp->origin); + free(hp->delegate); + free(hp->client); ++ free(hp->exit_reason); + } + } + +-- +2.27.0 + + +From 1b9e2896322849002a5c0a3a34c9375ea32571d6 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Thu, 6 Jan 2022 18:04:15 -0600 +Subject: [PATCH 10/15] Feature: fencing: return exit reason with fencing + history + +libstonithd's stonith_t:cmds->history() method now parses exit reasons from the +fencer reply, and returns them in the stonith_history_t results. 
+--- + lib/fencing/st_client.c | 6 ++++++ + 1 file changed, 6 insertions(+) + +diff --git a/lib/fencing/st_client.c b/lib/fencing/st_client.c +index 57a2e03361..d229b34805 100644 +--- a/lib/fencing/st_client.c ++++ b/lib/fencing/st_client.c +@@ -698,6 +698,7 @@ stonith_api_history(stonith_t * stonith, int call_options, const char *node, + stonith_history_t *kvp; + long long completed; + long long completed_nsec = 0L; ++ pcmk__action_result_t result = PCMK__UNKNOWN_RESULT; + + kvp = calloc(1, sizeof(stonith_history_t)); + kvp->target = crm_element_value_copy(op, F_STONITH_TARGET); +@@ -711,6 +712,11 @@ stonith_api_history(stonith_t * stonith, int call_options, const char *node, + kvp->completed_nsec = completed_nsec; + crm_element_value_int(op, F_STONITH_STATE, &kvp->state); + ++ stonith__xe_get_result(op, &result); ++ kvp->exit_reason = result.exit_reason; ++ result.exit_reason = NULL; ++ pcmk__reset_result(&result); ++ + if (last) { + last->next = kvp; + } else { +-- +2.27.0 + + +From ba4e77242e9be4ebeb2843b444ee4afad43c29f3 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Fri, 7 Jan 2022 09:44:39 -0600 +Subject: [PATCH 11/15] Feature: fencing: display exit reasons with failed + fencing events + +... when available +--- + lib/fencing/st_output.c | 20 ++++++++++++++++---- + tools/crm_mon_curses.c | 9 +++++++-- + 2 files changed, 23 insertions(+), 6 deletions(-) + +diff --git a/lib/fencing/st_output.c b/lib/fencing/st_output.c +index e484278867..18924d795d 100644 +--- a/lib/fencing/st_output.c ++++ b/lib/fencing/st_output.c +@@ -1,5 +1,5 @@ + /* +- * Copyright 2019-2021 the Pacemaker project contributors ++ * Copyright 2019-2022 the Pacemaker project contributors + * + * The version control history for this file may have further details. 
+ * +@@ -11,6 +11,7 @@ + #include + + #include ++#include + #include + #include + #include +@@ -263,8 +264,12 @@ stonith_event_html(pcmk__output_t *out, va_list args) { + char *failed_s = time_t_string(event->completed); + + out->list_item(out, "failed-stonith-event", +- "%s of %s failed : delegate=%s, client=%s, origin=%s, %s='%s' %s", ++ "%s of %s failed%s%s%s: " ++ "delegate=%s, client=%s, origin=%s, %s='%s' %s", + stonith_action_str(event->action), event->target, ++ (event->exit_reason == NULL)? "" : " (", ++ (event->exit_reason == NULL)? "" : event->exit_reason, ++ (event->exit_reason == NULL)? "" : ")", + event->delegate ? event->delegate : "", + event->client, event->origin, + full_history ? "completed" : "last-failed", +@@ -296,8 +301,13 @@ stonith_event_text(pcmk__output_t *out, va_list args) { + + switch (event->state) { + case st_failed: +- pcmk__indented_printf(out, "%s of %s failed: delegate=%s, client=%s, origin=%s, %s='%s' %s\n", ++ pcmk__indented_printf(out, ++ "%s of %s failed%s%s%s: " ++ "delegate=%s, client=%s, origin=%s, %s='%s' %s\n", + stonith_action_str(event->action), event->target, ++ (event->exit_reason == NULL)? "" : " (", ++ (event->exit_reason == NULL)? "" : event->exit_reason, ++ (event->exit_reason == NULL)? "" : ")", + event->delegate ? event->delegate : "", + event->client, event->origin, + full_history ? 
"completed" : "last-failed", buf, +@@ -341,7 +351,9 @@ stonith_event_xml(pcmk__output_t *out, va_list args) { + + switch (event->state) { + case st_failed: +- crm_xml_add(node, "status", "failed"); ++ pcmk__xe_set_props(node, "status", "failed", ++ XML_LRM_ATTR_EXIT_REASON, event->exit_reason, ++ NULL); + break; + + case st_done: +diff --git a/tools/crm_mon_curses.c b/tools/crm_mon_curses.c +index bae3710c44..73c8516a8c 100644 +--- a/tools/crm_mon_curses.c ++++ b/tools/crm_mon_curses.c +@@ -1,5 +1,5 @@ + /* +- * Copyright 2019-2021 the Pacemaker project contributors ++ * Copyright 2019-2022 the Pacemaker project contributors + * + * The version control history for this file may have further details. + * +@@ -463,8 +463,13 @@ stonith_event_console(pcmk__output_t *out, va_list args) { + + switch (event->state) { + case st_failed: +- curses_indented_printf(out, "%s of %s failed: delegate=%s, client=%s, origin=%s, %s='%s'%s\n", ++ curses_indented_printf(out, ++ "%s of %s failed%s%s%s: " ++ "delegate=%s, client=%s, origin=%s, %s='%s' %s\n", + stonith_action_str(event->action), event->target, ++ (event->exit_reason == NULL)? "" : " (", ++ (event->exit_reason == NULL)? "" : event->exit_reason, ++ (event->exit_reason == NULL)? "" : ")", + event->delegate ? event->delegate : "", + event->client, event->origin, + full_history ? 
"completed" : "last-failed", buf, +-- +2.27.0 + + +From 8105fb4a3a786780fdf85b3d0308eaf6df1ea434 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Fri, 7 Jan 2022 09:45:22 -0600 +Subject: [PATCH 12/15] Low: schemas: copy fence-event API schema in + preparation for changes + +--- + include/crm/common/output_internal.h | 2 +- + xml/api/fence-event-2.15.rng | 33 ++++++++++++++++++++++++++++ + 2 files changed, 34 insertions(+), 1 deletion(-) + create mode 100644 xml/api/fence-event-2.15.rng + +diff --git a/include/crm/common/output_internal.h b/include/crm/common/output_internal.h +index 479f0e4b43..8c5dcee17c 100644 +--- a/include/crm/common/output_internal.h ++++ b/include/crm/common/output_internal.h +@@ -27,7 +27,7 @@ extern "C" { + # include + # include + +-# define PCMK__API_VERSION "2.14" ++# define PCMK__API_VERSION "2.15" + + #if defined(PCMK__WITH_ATTRIBUTE_OUTPUT_ARGS) + # define PCMK__OUTPUT_ARGS(ARGS...) __attribute__((output_args(ARGS))) +diff --git a/xml/api/fence-event-2.15.rng b/xml/api/fence-event-2.15.rng +new file mode 100644 +index 0000000000..e54687cd25 +--- /dev/null ++++ b/xml/api/fence-event-2.15.rng +@@ -0,0 +1,33 @@ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ failed ++ success ++ pending ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ +-- +2.27.0 + + +From 46dd9b74d2ee8f7ab70a0c7fe3a998954d4029e8 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Fri, 7 Jan 2022 09:47:16 -0600 +Subject: [PATCH 13/15] Low: schemas: update fence-event API schema for recent + change + +--- + xml/api/fence-event-2.15.rng | 3 +++ + 1 file changed, 3 insertions(+) + +diff --git a/xml/api/fence-event-2.15.rng b/xml/api/fence-event-2.15.rng +index e54687cd25..8e000cafa5 100644 +--- a/xml/api/fence-event-2.15.rng ++++ b/xml/api/fence-event-2.15.rng +@@ -18,6 +18,9 @@ + + + ++ ++ ++ + + + +-- +2.27.0 + + +From 350e71772f67f28af6b67f864cbabc481730035c Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Fri, 7 Jan 2022 11:32:09 -0600 +Subject: [PATCH 14/15] Build: 
libstonithd: bump shared library version + +... for stonith_history_t change since 2.1.2. + +The struct should only ever be returned by the library as a pointer, so the +changes can be considered backward-compatible. Normally we wouldn't bump shared +library versions mid-cycle, but this will simplify expected backports of this +change. +--- + lib/fencing/Makefile.am | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/lib/fencing/Makefile.am b/lib/fencing/Makefile.am +index 1ffa3e051b..a10ddb88ec 100644 +--- a/lib/fencing/Makefile.am ++++ b/lib/fencing/Makefile.am +@@ -2,7 +2,7 @@ + # Original Author: Sun Jiang Dong + # Copyright 2004 International Business Machines + # +-# with later changes copyright 2004-2021 the Pacemaker project contributors. ++# with later changes copyright 2004-2022 the Pacemaker project contributors. + # The version control history for this file may have further details. + # + # This source code is licensed under the GNU General Public License version 2 +@@ -14,7 +14,7 @@ noinst_HEADERS = fencing_private.h + + lib_LTLIBRARIES = libstonithd.la + +-libstonithd_la_LDFLAGS = -version-info 33:0:7 ++libstonithd_la_LDFLAGS = -version-info 34:0:8 + + libstonithd_la_CFLAGS = $(CFLAGS_HARDENED_LIB) + libstonithd_la_LDFLAGS += $(LDFLAGS_HARDENED_LIB) +-- +2.27.0 + + +From 63ea88620a62ff0759560a02bb5e284ebdd03eb6 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Wed, 19 Jan 2022 16:53:45 -0600 +Subject: [PATCH 15/15] Low: fencer: reset op result before grabbing new one + +just in case +--- + daemons/fenced/fenced_remote.c | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/daemons/fenced/fenced_remote.c b/daemons/fenced/fenced_remote.c +index 7fb7695fba..dc4649e0fc 100644 +--- a/daemons/fenced/fenced_remote.c ++++ b/daemons/fenced/fenced_remote.c +@@ -2219,6 +2219,7 @@ fenced_process_fencing_reply(xmlNode *msg) + return; + } + ++ pcmk__reset_result(&op->result); + op->result = result; // The operation takes ownership of the 
result + + if (op->devices && device && !pcmk__str_eq(op->devices->data, device, pcmk__str_casei)) { +-- +2.27.0 + diff --git a/SOURCES/022-memory-leak.patch b/SOURCES/022-memory-leak.patch new file mode 100644 index 0000000..3970dd3 --- /dev/null +++ b/SOURCES/022-memory-leak.patch @@ -0,0 +1,82 @@ +From 8034a203bbff0aa3b53f2946dc58e409bd7246c9 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Thu, 20 Jan 2022 15:03:31 -0600 +Subject: [PATCH] Fix: scheduler: avoid memory leak when displaying clones + +Previously, pe__clone_default() unconditionally created a hash table for +stopped instances, but didn't free it in every code path. + +Now, only create the table when we have something to put in it and might +actually use it, and ensure it always gets freed. +--- + lib/pengine/clone.c | 18 +++++++++++++----- + 1 file changed, 13 insertions(+), 5 deletions(-) + +diff --git a/lib/pengine/clone.c b/lib/pengine/clone.c +index 742e2920b0..920a04c32c 100644 +--- a/lib/pengine/clone.c ++++ b/lib/pengine/clone.c +@@ -761,7 +761,7 @@ pe__clone_default(pcmk__output_t *out, va_list args) + GList *only_node = va_arg(args, GList *); + GList *only_rsc = va_arg(args, GList *); + +- GHashTable *stopped = pcmk__strkey_table(free, free); ++ GHashTable *stopped = NULL; + + char *list_text = NULL; + size_t list_text_len = 0; +@@ -818,7 +818,11 @@ pe__clone_default(pcmk__output_t *out, va_list args) + } else if (partially_active == FALSE) { + // List stopped instances when requested (except orphans) + if (!pcmk_is_set(child_rsc->flags, pe_rsc_orphan) ++ && !pcmk_is_set(show_opts, pcmk_show_clone_detail) + && pcmk_is_set(show_opts, pcmk_show_inactive_rscs)) { ++ if (stopped == NULL) { ++ stopped = pcmk__strkey_table(free, free); ++ } + g_hash_table_insert(stopped, strdup(child_rsc->id), strdup("Stopped")); + } + +@@ -873,7 +877,6 @@ pe__clone_default(pcmk__output_t *out, va_list args) + } + + if (pcmk_is_set(show_opts, pcmk_show_clone_detail)) { +- g_hash_table_destroy(stopped); + 
PCMK__OUTPUT_LIST_FOOTER(out, rc); + return pcmk_rc_ok; + } +@@ -948,8 +951,10 @@ pe__clone_default(pcmk__output_t *out, va_list args) + GList *list = g_hash_table_get_values(rsc->allowed_nodes); + + /* Custom stopped table for non-unique clones */ +- g_hash_table_destroy(stopped); +- stopped = pcmk__strkey_table(free, free); ++ if (stopped != NULL) { ++ g_hash_table_destroy(stopped); ++ stopped = NULL; ++ } + + if (list == NULL) { + /* Clusters with symmetrical=false haven't calculated allowed_nodes yet +@@ -972,6 +977,9 @@ pe__clone_default(pcmk__output_t *out, va_list args) + state = "Stopped (disabled)"; + } + ++ if (stopped == NULL) { ++ stopped = pcmk__strkey_table(free, free); ++ } + if (probe_op != NULL) { + int rc; + +@@ -987,7 +995,7 @@ pe__clone_default(pcmk__output_t *out, va_list args) + g_list_free(list); + } + +- if (g_hash_table_size(stopped) > 0) { ++ if (stopped != NULL) { + GList *list = sorted_hash_table_values(stopped); + + clone_header(out, &rc, rsc, clone_data); +-- +2.27.0 + diff --git a/SOURCES/023-regression.patch b/SOURCES/023-regression.patch new file mode 100644 index 0000000..62d2a46 --- /dev/null +++ b/SOURCES/023-regression.patch @@ -0,0 +1,30 @@ +From 16928cfc69136bc56b1574bee9966e0d5de73abd Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Wed, 26 Jan 2022 09:15:43 -0600 +Subject: [PATCH] Fix: controller: correctly match "node down" events + +regression introduced in 2.1.2 by 03ce7376e + +The symptom that led to this was that removing a remote node connection +resource would lead to the remote node getting fenced when the connection stop +was not recognized as an expected down event. 
+--- + daemons/controld/controld_te_events.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/daemons/controld/controld_te_events.c b/daemons/controld/controld_te_events.c +index 36fd832ba0..1fd7129922 100644 +--- a/daemons/controld/controld_te_events.c ++++ b/daemons/controld/controld_te_events.c +@@ -304,7 +304,7 @@ match_down_event(const char *target) + gIter2 = gIter2->next) { + + match = (crm_action_t*)gIter2->data; +- if (pcmk_is_set(match->flags, pcmk__graph_action_confirmed)) { ++ if (pcmk_is_set(match->flags, pcmk__graph_action_executed)) { + xpath_ret = xpath_search(match->xml, xpath); + if (numXpathResults(xpath_ret) < 1) { + match = NULL; +-- +2.27.0 + diff --git a/SPECS/pacemaker.spec b/SPECS/pacemaker.spec index 65a1299..d0b0903 100644 --- a/SPECS/pacemaker.spec +++ b/SPECS/pacemaker.spec @@ -35,11 +35,11 @@ ## Upstream pacemaker version, and its package version (specversion ## can be incremented to build packages reliably considered "newer" ## than previously built packages with the same pcmkversion) -%global pcmkversion 2.1.0 -%global specversion 8 +%global pcmkversion 2.1.2 +%global specversion 4 ## Upstream commit (full commit ID, abbreviated commit ID, or tag) to build -%global commit 7c3f660707a495a1331716ad32cd3ac9d9f8ff58 +%global commit ada5c3b36e2adf1703d54d39f40a4b8628eca175 ## Since git v2.11, the extent of abbreviation is autoscaled by default ## (used to be constant of 7), so we need to convey it for non-tags, too. 
@@ -263,26 +263,29 @@ Source0: https://codeload.github.com/%{github_owner}/%{name}/tar.gz/%{arch Source1: nagios-agents-metadata-%{nagios_hash}.tar.gz # upstream commits -Patch1: 001-ping-agent.patch -Patch2: 002-pacemakerd-options.patch -Patch3: 003-pacemakerd-output.patch -Patch4: 004-check-level.patch -Patch5: 005-crm_resource.patch -Patch6: 006-crm_simulate.patch -Patch7: 007-unfencing-loop.patch -Patch8: 008-dynamic-list-fencing.patch -Patch9: 009-crm_resource-messages.patch -Patch10: 010-probe-pending.patch -Patch11: 011-crm_attribute-regression.patch -Patch12: 012-string-arguments.patch -Patch13: 013-leaks.patch -Patch14: 014-str-list.patch -Patch15: 015-sbd.patch -Patch16: 016-cts.patch -Patch17: 017-watchdog-fixes.patch -Patch18: 018-controller.patch -Patch19: 019-crm_resource.patch -Patch20: 020-fence_watchdog.patch +Patch1: 001-acl-group-schema.patch +Patch2: 002-fencing-reasons.patch +Patch3: 003-fencing-reasons.patch +Patch4: 004-systemd-metadata.patch +Patch5: 005-fencing-reasons.patch +Patch6: 006-stateful-metadata.patch +Patch7: 007-memory-leak.patch +Patch8: 008-fencing-history.patch +Patch9: 009-fencing-reasons.patch +Patch10: 010-probe-failures.patch +Patch11: 011-fencing-reasons.patch +Patch12: 012-notify-crash.patch +Patch13: 013-probe-failures.patch +Patch14: 014-pcmk_delay_base.patch +Patch15: 015-fencing-reasons.patch +Patch16: 016-fencing-crash.patch +Patch17: 017-fencing-reasons.patch +Patch18: 018-failure-messages.patch +Patch19: 019-corosync-tracking.patch +Patch20: 020-systemd-unit.patch +Patch21: 021-failure-messages.patch +Patch22: 022-memory-leak.patch +Patch23: 023-regression.patch # downstream-only commits #Patch1xx: 1xx-xxxx.patch @@ -342,6 +345,9 @@ BuildRequires: help2man BuildRequires: ncurses-devel BuildRequires: pam-devel +# Required for "make check" +BuildRequires: libcmocka-devel + %if %{systemd_native} BuildRequires: pkgconfig(systemd) %endif @@ -349,8 +355,11 @@ BuildRequires: pkgconfig(systemd) # RH patches are created 
by git, so we need git to apply them BuildRequires: git -Requires: corosync >= 2.0.0 -BuildRequires: corosync-devel >= 2.0.0 +# The RHEL 8.5+ build root has corosync_cfg_trackstart() available, so +# Pacemaker's configure script will build support for it. Add a hard dependency +# to ensure users have compatible Corosync libraries if they upgrade Pacemaker. +Requires: corosync >= 3.1.1 +BuildRequires: corosync-devel >= 3.1.1 %if %{with stonithd} BuildRequires: %{pkgname_glue_libs}-devel @@ -368,7 +377,7 @@ Provides: pcmk-cluster-manager%{?_isa} = %{version}-%{release} # Bundled bits ## Pacemaker uses the crypto/md5-buffer module from gnulib %if 0%{?fedora} || 0%{?rhel} -Provides: bundled(gnulib) +Provides: bundled(gnulib) = 20200404 %endif %description @@ -646,12 +655,6 @@ find %{buildroot} -name '*o2cb*' -type f -print0 | xargs -0 rm -f rm -f %{buildroot}/%{_sbindir}/notifyServicelogEvent rm -f %{buildroot}/%{_sbindir}/ipmiservicelogd -# Don't ship init scripts for systemd based platforms -%if %{defined _unitdir} -rm -f %{buildroot}/%{_initrddir}/pacemaker -rm -f %{buildroot}/%{_initrddir}/pacemaker_remote -%endif - # Byte-compile Python sources where suitable and the distro procedures known %if %{defined py_byte_compile} %{py_byte_compile %{python_path} %{buildroot}%{_datadir}/pacemaker/tests} @@ -980,6 +983,46 @@ exit 0 %license %{nagios_name}-%{nagios_hash}/COPYING %changelog +* Wed Jan 26 2022 Ken Gaillot - 2.1.2-4 +- Fix regression in down event detection that affects remote nodes +- Resolves: rhbz2046446 + +* Fri Jan 21 2022 Ken Gaillot - 2.1.2-3 +- Improve display of failed actions +- Handle certain probe failures as stopped instead of failed +- Update pcmk_delay_base description in option meta-data +- Avoid crash when using clone notifications +- Retry Corosync shutdown tracking if first attempt fails +- Resolves: rhbz1470834 +- Resolves: rhbz1506372 +- Resolves: rhbz2027370 +- Resolves: rhbz2039675 +- Resolves: rhbz2042550 + +* Thu Dec 16 2021 Ken Gaillot 
- 2.1.2-2 +- Correctly get metadata for systemd agent names that end in '@' +- Use correct OCF 1.1 syntax in ocf:pacemaker:Stateful meta-data +- Fix regression in displayed times in crm_mon's fence history +- Resolves: rhbz2003151 +- Resolves: rhbz2027370 +- Resolves: rhbz2032027 + +* Tue Nov 30 2021 Ken Gaillot - 2.1.2-1 +- Allow per-host fence delays for a single fence device +- Use OCF 1.1 enum type in cluster option metadata for better validation +- crm-resource --force-* now works with LSB resources +- Allow spaces in pcmk_host_map +- ACL group names are no longer restricted to a unique XML id +- Rebase on upstream 2.1.2 +- Ensure upgrades get compatible Corosync libraries +- Resolves: rhbz1082146 +- Resolves: rhbz1281463 +- Resolves: rhbz1346014 +- Resolves: rhbz1376538 +- Resolves: rhbz1384420 +- Resolves: rhbz2011973 +- Resolves: rhbz2027006 + * Fri Aug 20 2021 Ken Gaillot - 2.1.0-8 - Fix XML issue in fence_watchdog meta-data - Resolves: rhbz1443666