From f2e51898735b5e9990464141fc4aea3dd83f5067 Mon Sep 17 00:00:00 2001 From: Chris Lumens Date: Thu, 4 Nov 2021 14:36:41 -0400 Subject: [PATCH 01/21] Refactor: scheduler: Use bool in unpack_rsc_op. Previously, we were using bool but TRUE/FALSE. Instead, use the actual values. --- lib/pengine/unpack.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/pengine/unpack.c b/lib/pengine/unpack.c index b1e84110a2..ecc7275e15 100644 --- a/lib/pengine/unpack.c +++ b/lib/pengine/unpack.c @@ -3671,7 +3671,7 @@ unpack_rsc_op(pe_resource_t *rsc, pe_node_t *node, xmlNode *xml_op, const char *task = NULL; const char *task_key = NULL; const char *exit_reason = NULL; - bool expired = FALSE; + bool expired = false; pe_resource_t *parent = rsc; enum action_fail_response failure_strategy = action_fail_recover; @@ -3727,7 +3727,7 @@ unpack_rsc_op(pe_resource_t *rsc, pe_node_t *node, xmlNode *xml_op, if ((status != PCMK_EXEC_NOT_INSTALLED) && check_operation_expiry(rsc, node, rc, xml_op, data_set)) { - expired = TRUE; + expired = true; } if (!strcmp(task, CRMD_ACTION_STATUS)) { -- 2.27.0 From 4c961b8e670d336a368c7fd1535c247e40c6b48e Mon Sep 17 00:00:00 2001 From: Chris Lumens Date: Thu, 4 Nov 2021 15:07:01 -0400 Subject: [PATCH 02/21] Refactor: scheduler: Add functions for determining if an op is a probe. 
--- include/crm/common/util.h | 3 + lib/common/operations.c | 21 +++++++ lib/common/tests/operations/Makefile.am | 6 +- .../tests/operations/pcmk_is_probe_test.c | 37 +++++++++++++ .../tests/operations/pcmk_xe_is_probe_test.c | 55 +++++++++++++++++++ lib/pengine/unpack.c | 12 ++-- lib/pengine/utils.c | 5 +- 7 files changed, 127 insertions(+), 12 deletions(-) create mode 100644 lib/common/tests/operations/pcmk_is_probe_test.c create mode 100644 lib/common/tests/operations/pcmk_xe_is_probe_test.c diff --git a/include/crm/common/util.h b/include/crm/common/util.h index 2728b64492..fbea6e560c 100644 --- a/include/crm/common/util.h +++ b/include/crm/common/util.h @@ -72,6 +72,9 @@ xmlNode *crm_create_op_xml(xmlNode *parent, const char *prefix, const char *timeout); #define CRM_DEFAULT_OP_TIMEOUT_S "20s" +bool pcmk_is_probe(const char *task, guint interval); +bool pcmk_xe_is_probe(xmlNode *xml_op); + int compare_version(const char *version1, const char *version2); /* coverity[+kill] */ diff --git a/lib/common/operations.c b/lib/common/operations.c index 366c189702..978df79082 100644 --- a/lib/common/operations.c +++ b/lib/common/operations.c @@ -537,3 +537,24 @@ pcmk__is_fencing_action(const char *action) { return pcmk__str_any_of(action, "off", "reboot", "poweroff", NULL); } + +bool +pcmk_is_probe(const char *task, guint interval) +{ + if (task == NULL) { + return false; + } + + return (interval == 0) && pcmk__str_eq(task, CRMD_ACTION_STATUS, pcmk__str_none); +} + +bool +pcmk_xe_is_probe(xmlNode *xml_op) +{ + const char *task = crm_element_value(xml_op, XML_LRM_ATTR_TASK); + const char *interval_ms_s = crm_element_value(xml_op, XML_LRM_ATTR_INTERVAL_MS); + int interval_ms; + + pcmk__scan_min_int(interval_ms_s, &interval_ms, 0); + return pcmk_is_probe(task, interval_ms); +} diff --git a/lib/common/tests/operations/Makefile.am b/lib/common/tests/operations/Makefile.am index c8814ff0a8..2e3d0b0679 100644 --- a/lib/common/tests/operations/Makefile.am +++ 
b/lib/common/tests/operations/Makefile.am @@ -1,5 +1,5 @@ # -# Copyright 2020 the Pacemaker project contributors +# Copyright 2020-2021 the Pacemaker project contributors # # The version control history for this file may have further details. # @@ -12,6 +12,8 @@ LDADD = $(top_builddir)/lib/common/libcrmcommon.la -lcmocka include $(top_srcdir)/mk/tap.mk # Add "_test" to the end of all test program names to simplify .gitignore. -check_PROGRAMS = parse_op_key_test +check_PROGRAMS = parse_op_key_test \ + pcmk_is_probe_test \ + pcmk_xe_is_probe_test TESTS = $(check_PROGRAMS) diff --git a/lib/common/tests/operations/pcmk_is_probe_test.c b/lib/common/tests/operations/pcmk_is_probe_test.c new file mode 100644 index 0000000000..9b449f1a70 --- /dev/null +++ b/lib/common/tests/operations/pcmk_is_probe_test.c @@ -0,0 +1,37 @@ +/* + * Copyright 2021 the Pacemaker project contributors + * + * The version control history for this file may have further details. + * + * This source code is licensed under the GNU Lesser General Public License + * version 2.1 or later (LGPLv2.1+) WITHOUT ANY WARRANTY. 
+ */ + +#include + +#include +#include +#include +#include +#include +#include + +static void +is_probe_test(void **state) +{ + assert_false(pcmk_is_probe(NULL, 0)); + assert_false(pcmk_is_probe("", 0)); + assert_false(pcmk_is_probe("blahblah", 0)); + assert_false(pcmk_is_probe("monitor", 1)); + assert_true(pcmk_is_probe("monitor", 0)); +} + +int main(int argc, char **argv) +{ + const struct CMUnitTest tests[] = { + cmocka_unit_test(is_probe_test), + }; + + cmocka_set_message_output(CM_OUTPUT_TAP); + return cmocka_run_group_tests(tests, NULL, NULL); +} diff --git a/lib/common/tests/operations/pcmk_xe_is_probe_test.c b/lib/common/tests/operations/pcmk_xe_is_probe_test.c new file mode 100644 index 0000000000..0283d1c145 --- /dev/null +++ b/lib/common/tests/operations/pcmk_xe_is_probe_test.c @@ -0,0 +1,55 @@ +/* + * Copyright 2021 the Pacemaker project contributors + * + * The version control history for this file may have further details. + * + * This source code is licensed under the GNU Lesser General Public License + * version 2.1 or later (LGPLv2.1+) WITHOUT ANY WARRANTY. 
+ */ + +#include + +#include +#include +#include +#include +#include +#include + +static void +op_is_probe_test(void **state) +{ + xmlNode *node = NULL; + + assert_false(pcmk_xe_is_probe(NULL)); + + node = string2xml(""); + assert_false(pcmk_xe_is_probe(node)); + free_xml(node); + + node = string2xml(""); + assert_false(pcmk_xe_is_probe(node)); + free_xml(node); + + node = string2xml(""); + assert_false(pcmk_xe_is_probe(node)); + free_xml(node); + + node = string2xml(""); + assert_false(pcmk_xe_is_probe(node)); + free_xml(node); + + node = string2xml(""); + assert_true(pcmk_xe_is_probe(node)); + free_xml(node); +} + +int main(int argc, char **argv) +{ + const struct CMUnitTest tests[] = { + cmocka_unit_test(op_is_probe_test), + }; + + cmocka_set_message_output(CM_OUTPUT_TAP); + return cmocka_run_group_tests(tests, NULL, NULL); +} diff --git a/lib/pengine/unpack.c b/lib/pengine/unpack.c index ecc7275e15..7c0c66e696 100644 --- a/lib/pengine/unpack.c +++ b/lib/pengine/unpack.c @@ -83,7 +83,6 @@ is_dangling_guest_node(pe_node_t *node) return FALSE; } - /*! 
* \brief Schedule a fence action for a node * @@ -2984,7 +2983,6 @@ static void unpack_rsc_op_failure(pe_resource_t * rsc, pe_node_t * node, int rc, xmlNode * xml_op, xmlNode ** last_failure, enum action_fail_response * on_fail, pe_working_set_t * data_set) { - guint interval_ms = 0; bool is_probe = false; pe_action_t *action = NULL; @@ -2998,10 +2996,7 @@ unpack_rsc_op_failure(pe_resource_t * rsc, pe_node_t * node, int rc, xmlNode * x *last_failure = xml_op; - crm_element_value_ms(xml_op, XML_LRM_ATTR_INTERVAL_MS, &interval_ms); - if ((interval_ms == 0) && !strcmp(task, CRMD_ACTION_STATUS)) { - is_probe = true; - } + is_probe = pcmk_xe_is_probe(xml_op); if (exit_reason == NULL) { exit_reason = ""; @@ -3163,8 +3158,9 @@ determine_op_status( } crm_element_value_ms(xml_op, XML_LRM_ATTR_INTERVAL_MS, &interval_ms); - if ((interval_ms == 0) && !strcmp(task, CRMD_ACTION_STATUS)) { - is_probe = true; + is_probe = pcmk_xe_is_probe(xml_op); + + if (is_probe) { task = "probe"; } diff --git a/lib/pengine/utils.c b/lib/pengine/utils.c index c5eda3898e..07753e173a 100644 --- a/lib/pengine/utils.c +++ b/lib/pengine/utils.c @@ -1066,8 +1066,7 @@ unpack_operation(pe_action_t * action, xmlNode * xml_obj, pe_resource_t * contai { int timeout_ms = 0; const char *value = NULL; - bool is_probe = pcmk__str_eq(action->task, RSC_STATUS, pcmk__str_casei) - && (interval_ms == 0); + bool is_probe = false; #if ENABLE_VERSIONED_ATTRS pe_rsc_action_details_t *rsc_details = NULL; #endif @@ -1094,6 +1093,8 @@ unpack_operation(pe_action_t * action, xmlNode * xml_obj, pe_resource_t * contai CRM_CHECK(action && action->rsc, return); + is_probe = pcmk_is_probe(action->task, interval_ms); + // Cluster-wide pe__unpack_dataset_nvpairs(data_set->op_defaults, XML_TAG_META_SETS, &rule_data, action->meta, NULL, FALSE, data_set); -- 2.27.0 From 09f32df97ab5064a15ba5a1fb3970d5c64ee7b30 Mon Sep 17 00:00:00 2001 From: Chris Lumens Date: Fri, 19 Nov 2021 14:47:22 -0500 Subject: [PATCH 03/21] Refactor: scheduler: 
Move setting interval_ms in determine_op_status. This can now happen in the only place it's being used. --- lib/pengine/unpack.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/lib/pengine/unpack.c b/lib/pengine/unpack.c index 7c0c66e696..b9986d2462 100644 --- a/lib/pengine/unpack.c +++ b/lib/pengine/unpack.c @@ -3142,7 +3142,6 @@ static int determine_op_status( pe_resource_t *rsc, int rc, int target_rc, pe_node_t * node, xmlNode * xml_op, enum action_fail_response * on_fail, pe_working_set_t * data_set) { - guint interval_ms = 0; bool is_probe = false; int result = PCMK_EXEC_DONE; const char *key = get_op_key(xml_op); @@ -3157,7 +3156,6 @@ determine_op_status( exit_reason = ""; } - crm_element_value_ms(xml_op, XML_LRM_ATTR_INTERVAL_MS, &interval_ms); is_probe = pcmk_xe_is_probe(xml_op); if (is_probe) { @@ -3230,12 +3228,17 @@ determine_op_status( result = PCMK_EXEC_ERROR_FATAL; break; - case PCMK_OCF_UNIMPLEMENT_FEATURE: + case PCMK_OCF_UNIMPLEMENT_FEATURE: { + guint interval_ms = 0; + crm_element_value_ms(xml_op, XML_LRM_ATTR_INTERVAL_MS, &interval_ms); + if (interval_ms > 0) { result = PCMK_EXEC_NOT_SUPPORTED; break; } // fall through + } + case PCMK_OCF_NOT_INSTALLED: case PCMK_OCF_INVALID_PARAM: case PCMK_OCF_INSUFFICIENT_PRIV: -- 2.27.0 From 6c8f47453afd6c100fddc45187faff17e15f7bfe Mon Sep 17 00:00:00 2001 From: Chris Lumens Date: Fri, 19 Nov 2021 14:57:57 -0500 Subject: [PATCH 04/21] Refactor: scheduler: Add pcmk_xe_mask_probe_failure. Given an xmlNodePtr for a resource operation, this function will determine whether it is a failed probe operation that should not be displayed in crm_mon (or other places, I suppose) or not. 
--- include/crm/common/util.h | 1 + lib/common/operations.c | 17 ++ lib/common/tests/operations/Makefile.am | 3 +- .../pcmk_xe_mask_probe_failure_test.c | 162 ++++++++++++++++++ 4 files changed, 182 insertions(+), 1 deletion(-) create mode 100644 lib/common/tests/operations/pcmk_xe_mask_probe_failure_test.c diff --git a/include/crm/common/util.h b/include/crm/common/util.h index fbea6e560c..784069ba1b 100644 --- a/include/crm/common/util.h +++ b/include/crm/common/util.h @@ -74,6 +74,7 @@ xmlNode *crm_create_op_xml(xmlNode *parent, const char *prefix, bool pcmk_is_probe(const char *task, guint interval); bool pcmk_xe_is_probe(xmlNode *xml_op); +bool pcmk_xe_mask_probe_failure(xmlNode *xml_op); int compare_version(const char *version1, const char *version2); diff --git a/lib/common/operations.c b/lib/common/operations.c index 978df79082..54482b8863 100644 --- a/lib/common/operations.c +++ b/lib/common/operations.c @@ -558,3 +558,20 @@ pcmk_xe_is_probe(xmlNode *xml_op) pcmk__scan_min_int(interval_ms_s, &interval_ms, 0); return pcmk_is_probe(task, interval_ms); } + +bool +pcmk_xe_mask_probe_failure(xmlNode *xml_op) +{ + int status = PCMK_EXEC_UNKNOWN; + int rc = PCMK_OCF_OK; + + if (!pcmk_xe_is_probe(xml_op)) { + return false; + } + + crm_element_value_int(xml_op, XML_LRM_ATTR_OPSTATUS, &status); + crm_element_value_int(xml_op, XML_LRM_ATTR_RC, &rc); + + return rc == PCMK_OCF_NOT_INSTALLED || rc == PCMK_OCF_INVALID_PARAM || + status == PCMK_EXEC_NOT_INSTALLED; +} diff --git a/lib/common/tests/operations/Makefile.am b/lib/common/tests/operations/Makefile.am index 2e3d0b0679..457c5f7c7a 100644 --- a/lib/common/tests/operations/Makefile.am +++ b/lib/common/tests/operations/Makefile.am @@ -14,6 +14,7 @@ include $(top_srcdir)/mk/tap.mk # Add "_test" to the end of all test program names to simplify .gitignore. 
check_PROGRAMS = parse_op_key_test \ pcmk_is_probe_test \ - pcmk_xe_is_probe_test + pcmk_xe_is_probe_test \ + pcmk_xe_mask_probe_failure_test TESTS = $(check_PROGRAMS) diff --git a/lib/common/tests/operations/pcmk_xe_mask_probe_failure_test.c b/lib/common/tests/operations/pcmk_xe_mask_probe_failure_test.c new file mode 100644 index 0000000000..a13f6d98f4 --- /dev/null +++ b/lib/common/tests/operations/pcmk_xe_mask_probe_failure_test.c @@ -0,0 +1,162 @@ +/* + * Copyright 2021 the Pacemaker project contributors + * + * The version control history for this file may have further details. + * + * This source code is licensed under the GNU Lesser General Public License + * version 2.1 or later (LGPLv2.1+) WITHOUT ANY WARRANTY. + */ + +#include + +#include +#include +#include +#include +#include +#include + +static void +op_is_not_probe_test(void **state) { + xmlNode *node = NULL; + + /* Not worth testing this thoroughly since it's just a duplicate of whether + * pcmk_xe_is_probe works or not. 
+ */ + + node = string2xml(""); + assert_false(pcmk_xe_mask_probe_failure(node)); + free_xml(node); +} + +static void +op_does_not_have_right_values_test(void **state) { + xmlNode *node = NULL; + + node = string2xml(""); + assert_false(pcmk_xe_mask_probe_failure(node)); + free_xml(node); + + node = string2xml(""); + assert_false(pcmk_xe_mask_probe_failure(node)); + free_xml(node); +} + +static void +check_values_test(void **state) { + xmlNode *node = NULL; + + /* PCMK_EXEC_NOT_SUPPORTED */ + node = string2xml(""); + assert_false(pcmk_xe_mask_probe_failure(node)); + free_xml(node); + + node = string2xml(""); + assert_true(pcmk_xe_mask_probe_failure(node)); + free_xml(node); + + /* PCMK_EXEC_DONE */ + node = string2xml(""); + assert_false(pcmk_xe_mask_probe_failure(node)); + free_xml(node); + + node = string2xml(""); + assert_true(pcmk_xe_mask_probe_failure(node)); + free_xml(node); + + node = string2xml(""); + assert_true(pcmk_xe_mask_probe_failure(node)); + free_xml(node); + + node = string2xml(""); + assert_false(pcmk_xe_mask_probe_failure(node)); + free_xml(node); + + node = string2xml(""); + assert_false(pcmk_xe_mask_probe_failure(node)); + free_xml(node); + + /* PCMK_EXEC_NOT_INSTALLED */ + node = string2xml(""); + assert_true(pcmk_xe_mask_probe_failure(node)); + free_xml(node); + + node = string2xml(""); + assert_true(pcmk_xe_mask_probe_failure(node)); + free_xml(node); + + /* PCMK_EXEC_ERROR */ + node = string2xml(""); + assert_false(pcmk_xe_mask_probe_failure(node)); + free_xml(node); + + node = string2xml(""); + assert_true(pcmk_xe_mask_probe_failure(node)); + free_xml(node); + + node = string2xml(""); + assert_true(pcmk_xe_mask_probe_failure(node)); + free_xml(node); + + node = string2xml(""); + assert_false(pcmk_xe_mask_probe_failure(node)); + free_xml(node); + + node = string2xml(""); + assert_false(pcmk_xe_mask_probe_failure(node)); + free_xml(node); + + /* PCMK_EXEC_ERROR_HARD */ + node = string2xml(""); + 
assert_false(pcmk_xe_mask_probe_failure(node)); + free_xml(node); + + node = string2xml(""); + assert_true(pcmk_xe_mask_probe_failure(node)); + free_xml(node); + + node = string2xml(""); + assert_true(pcmk_xe_mask_probe_failure(node)); + free_xml(node); + + node = string2xml(""); + assert_false(pcmk_xe_mask_probe_failure(node)); + free_xml(node); + + node = string2xml(""); + assert_false(pcmk_xe_mask_probe_failure(node)); + free_xml(node); + + /* PCMK_EXEC_ERROR_FATAL */ + node = string2xml(""); + assert_false(pcmk_xe_mask_probe_failure(node)); + free_xml(node); + + node = string2xml(""); + assert_true(pcmk_xe_mask_probe_failure(node)); + free_xml(node); + + node = string2xml(""); + assert_true(pcmk_xe_mask_probe_failure(node)); + free_xml(node); + + node = string2xml(""); + assert_false(pcmk_xe_mask_probe_failure(node)); + free_xml(node); + + node = string2xml(""); + assert_false(pcmk_xe_mask_probe_failure(node)); + free_xml(node); +} + +int main(int argc, char **argv) +{ + const struct CMUnitTest tests[] = { + cmocka_unit_test(op_is_not_probe_test), + cmocka_unit_test(op_does_not_have_right_values_test), + cmocka_unit_test(check_values_test), + }; + + cmocka_set_message_output(CM_OUTPUT_TAP); + return cmocka_run_group_tests(tests, NULL, NULL); +} -- 2.27.0 From c9ce1aaf93cd20bb01e80102dda0ffffb07e6472 Mon Sep 17 00:00:00 2001 From: Chris Lumens Date: Wed, 1 Dec 2021 14:26:31 -0500 Subject: [PATCH 05/21] Refactor: scheduler: Combine op status and rc remapping into one function. Well, not quite. Doing the remapping is complicated enough to where it makes sense to have them in separate functions. However, they can both be called from a single new function that takes the place of the previous two calls in unpack_rsc_op. 
--- lib/pengine/unpack.c | 157 ++++++++++++++++++++----------------------- 1 file changed, 72 insertions(+), 85 deletions(-) diff --git a/lib/pengine/unpack.c b/lib/pengine/unpack.c index b9986d2462..b659f319fb 100644 --- a/lib/pengine/unpack.c +++ b/lib/pengine/unpack.c @@ -3121,36 +3121,68 @@ unpack_rsc_op_failure(pe_resource_t * rsc, pe_node_t * node, int rc, xmlNode * x /*! * \internal - * \brief Remap operation status based on action result + * \brief Remap informational monitor results and operation status * - * Given an action result, determine an appropriate operation status for the - * purposes of responding to the action (the status provided by the executor is - * not directly usable since the executor does not know what was expected). + * For the monitor results, certain OCF codes are for providing extended information + * to the user about services that aren't yet failed but not entirely healthy either. + * These must be treated as the "normal" result by Pacemaker. + * + * For operation status, the action result can be used to determine an appropriate + * status for the purposes of responding to the action. The status provided by the + * executor is not directly usable since the executor does not know what was expected. 
* + * \param[in] xml_op Operation history entry XML from CIB status * \param[in,out] rsc Resource that operation history entry is for - * \param[in] rc Actual return code of operation - * \param[in] target_rc Expected return code of operation * \param[in] node Node where operation was executed - * \param[in] xml_op Operation history entry XML from CIB status - * \param[in,out] on_fail What should be done about the result * \param[in] data_set Current cluster working set + * \param[in,out] on_fail What should be done about the result + * \param[in] target_rc Expected return code of operation + * \param[in,out] rc Actual return code of operation + * \param[in,out] status Operation execution status + * + * \note If the result is remapped and the node is not shutting down or failed, + * the operation will be recorded in the data set's list of failed operations + * to highlight it for the user. * - * \return Operation status based on return code and action info * \note This may update the resource's current and next role. 
*/ -static int -determine_op_status( - pe_resource_t *rsc, int rc, int target_rc, pe_node_t * node, xmlNode * xml_op, enum action_fail_response * on_fail, pe_working_set_t * data_set) -{ +static void +remap_operation(xmlNode *xml_op, pe_resource_t *rsc, pe_node_t *node, + pe_working_set_t *data_set, enum action_fail_response *on_fail, + int target_rc, int *rc, int *status) { bool is_probe = false; - int result = PCMK_EXEC_DONE; - const char *key = get_op_key(xml_op); const char *task = crm_element_value(xml_op, XML_LRM_ATTR_TASK); + const char *key = get_op_key(xml_op); const char *exit_reason = crm_element_value(xml_op, XML_LRM_ATTR_EXIT_REASON); + if (pcmk__str_eq(task, CRMD_ACTION_STATUS, pcmk__str_none)) { + int remapped_rc = pcmk__effective_rc(*rc); + + if (*rc != remapped_rc) { + crm_trace("Remapping monitor result %d to %d", *rc, remapped_rc); + if (!node->details->shutdown || node->details->online) { + record_failed_op(xml_op, node, rsc, data_set); + } + + *rc = remapped_rc; + } + } + + /* If the executor reported an operation status of anything but done or + * error, consider that final. But for done or error, we know better whether + * it should be treated as a failure or not, because we know the expected + * result. + */ + if (*status != PCMK_EXEC_DONE && *status != PCMK_EXEC_ERROR) { + return; + } + CRM_ASSERT(rsc); - CRM_CHECK(task != NULL, return PCMK_EXEC_ERROR); + CRM_CHECK(task != NULL, + *status = PCMK_EXEC_ERROR; return); + + *status = PCMK_EXEC_DONE; if (exit_reason == NULL) { exit_reason = ""; @@ -3171,23 +3203,23 @@ determine_op_status( * those versions or processing of saved CIB files from those versions, * so we do not need to care much about this case. 
*/ - result = PCMK_EXEC_ERROR; + *status = PCMK_EXEC_ERROR; crm_warn("Expected result not found for %s on %s (corrupt or obsolete CIB?)", key, node->details->uname); - } else if (target_rc != rc) { - result = PCMK_EXEC_ERROR; + } else if (target_rc != *rc) { + *status = PCMK_EXEC_ERROR; pe_rsc_debug(rsc, "%s on %s: expected %d (%s), got %d (%s%s%s)", key, node->details->uname, target_rc, services_ocf_exitcode_str(target_rc), - rc, services_ocf_exitcode_str(rc), + *rc, services_ocf_exitcode_str(*rc), (*exit_reason? ": " : ""), exit_reason); } - switch (rc) { + switch (*rc) { case PCMK_OCF_OK: if (is_probe && (target_rc == PCMK_OCF_NOT_RUNNING)) { - result = PCMK_EXEC_DONE; + *status = PCMK_EXEC_DONE; pe_rsc_info(rsc, "Probe found %s active on %s at %s", rsc->id, node->details->uname, last_change_str(xml_op)); @@ -3195,10 +3227,10 @@ determine_op_status( break; case PCMK_OCF_NOT_RUNNING: - if (is_probe || (target_rc == rc) + if (is_probe || (target_rc == *rc) || !pcmk_is_set(rsc->flags, pe_rsc_managed)) { - result = PCMK_EXEC_DONE; + *status = PCMK_EXEC_DONE; rsc->role = RSC_ROLE_STOPPED; /* clear any previous failure actions */ @@ -3208,8 +3240,8 @@ determine_op_status( break; case PCMK_OCF_RUNNING_PROMOTED: - if (is_probe && (rc != target_rc)) { - result = PCMK_EXEC_DONE; + if (is_probe && (*rc != target_rc)) { + *status = PCMK_EXEC_DONE; pe_rsc_info(rsc, "Probe found %s active and promoted on %s at %s", rsc->id, node->details->uname, @@ -3221,11 +3253,11 @@ determine_op_status( case PCMK_OCF_DEGRADED_PROMOTED: case PCMK_OCF_FAILED_PROMOTED: rsc->role = RSC_ROLE_PROMOTED; - result = PCMK_EXEC_ERROR; + *status = PCMK_EXEC_ERROR; break; case PCMK_OCF_NOT_CONFIGURED: - result = PCMK_EXEC_ERROR_FATAL; + *status = PCMK_EXEC_ERROR_FATAL; break; case PCMK_OCF_UNIMPLEMENT_FEATURE: { @@ -3233,7 +3265,7 @@ determine_op_status( crm_element_value_ms(xml_op, XML_LRM_ATTR_INTERVAL_MS, &interval_ms); if (interval_ms > 0) { - result = PCMK_EXEC_NOT_SUPPORTED; + *status = 
PCMK_EXEC_NOT_SUPPORTED; break; } // fall through @@ -3248,26 +3280,27 @@ determine_op_status( pe_proc_err("No further recovery can be attempted for %s " "because %s on %s failed (%s%s%s) at %s " CRM_XS " rc=%d id=%s", rsc->id, task, - node->details->uname, services_ocf_exitcode_str(rc), + node->details->uname, services_ocf_exitcode_str(*rc), (*exit_reason? ": " : ""), exit_reason, - last_change_str(xml_op), rc, ID(xml_op)); + last_change_str(xml_op), *rc, ID(xml_op)); pe__clear_resource_flags(rsc, pe_rsc_managed); pe__set_resource_flags(rsc, pe_rsc_block); } - result = PCMK_EXEC_ERROR_HARD; + *status = PCMK_EXEC_ERROR_HARD; break; default: - if (result == PCMK_EXEC_DONE) { + if (*status == PCMK_EXEC_DONE) { crm_info("Treating unknown exit status %d from %s of %s " "on %s at %s as failure", - rc, task, rsc->id, node->details->uname, + *rc, task, rsc->id, node->details->uname, last_change_str(xml_op)); - result = PCMK_EXEC_ERROR; + *status = PCMK_EXEC_ERROR; } break; } - return result; + + pe_rsc_trace(rsc, "Remapped %s status to %d", key, *status); } // return TRUE if start or monitor last failure but parameters changed @@ -3622,41 +3655,6 @@ update_resource_state(pe_resource_t * rsc, pe_node_t * node, xmlNode * xml_op, c } } -/*! - * \internal - * \brief Remap informational monitor results to usual values - * - * Certain OCF result codes are for providing extended information to the - * user about services that aren't yet failed but not entirely healthy either. - * These must be treated as the "normal" result by Pacemaker. 
- * - * \param[in] rc Actual result of a monitor action - * \param[in] xml_op Operation history XML - * \param[in] node Node that operation happened on - * \param[in] rsc Resource that operation happened to - * \param[in] data_set Cluster working set - * - * \return Result code that pacemaker should use - * - * \note If the result is remapped, and the node is not shutting down or failed, - * the operation will be recorded in the data set's list of failed - * operations, to highlight it for the user. - */ -static int -remap_monitor_rc(int rc, xmlNode *xml_op, const pe_node_t *node, - const pe_resource_t *rsc, pe_working_set_t *data_set) -{ - int remapped_rc = pcmk__effective_rc(rc); - - if (rc != remapped_rc) { - crm_trace("Remapping monitor result %d to %d", rc, remapped_rc); - if (!node->details->shutdown || node->details->online) { - record_failed_op(xml_op, node, rsc, data_set); - } - } - return remapped_rc; -} - static void unpack_rsc_op(pe_resource_t *rsc, pe_node_t *node, xmlNode *xml_op, xmlNode **last_failure, enum action_fail_response *on_fail, @@ -3712,7 +3710,7 @@ unpack_rsc_op(pe_resource_t *rsc, pe_node_t *node, xmlNode *xml_op, node->details->uname, rsc->id); } - /* It should be possible to call remap_monitor_rc() first then call + /* It should be possible to call remap_operation() first then call * check_operation_expiry() only if rc != target_rc, because there should * never be a fail count without at least one unexpected result in the * resource history. 
That would be more efficient by avoiding having to call @@ -3729,9 +3727,8 @@ unpack_rsc_op(pe_resource_t *rsc, pe_node_t *node, xmlNode *xml_op, expired = true; } - if (!strcmp(task, CRMD_ACTION_STATUS)) { - rc = remap_monitor_rc(rc, xml_op, node, rsc, data_set); - } + remap_operation(xml_op, rsc, node, data_set, on_fail, target_rc, + &rc, &status); if (expired && (rc != target_rc)) { const char *magic = crm_element_value(xml_op, XML_ATTR_TRANSITION_MAGIC); @@ -3761,16 +3758,6 @@ unpack_rsc_op(pe_resource_t *rsc, pe_node_t *node, xmlNode *xml_op, } } - /* If the executor reported an operation status of anything but done or - * error, consider that final. But for done or error, we know better whether - * it should be treated as a failure or not, because we know the expected - * result. - */ - if(status == PCMK_EXEC_DONE || status == PCMK_EXEC_ERROR) { - status = determine_op_status(rsc, rc, target_rc, node, xml_op, on_fail, data_set); - pe_rsc_trace(rsc, "Remapped %s status to %d", task_key, status); - } - switch (status) { case PCMK_EXEC_CANCELLED: // Should never happen -- 2.27.0 From 9fdca1999872b3930cf18b7d807ddb259f23e8a5 Mon Sep 17 00:00:00 2001 From: Chris Lumens Date: Fri, 19 Nov 2021 15:08:16 -0500 Subject: [PATCH 06/21] Test: cts-cli: Add test output for a native resource with a failed probe op. There are no code changes yet to properly handle displaying these operations, so the results here just reflect the current handling. 
--- cts/cli/crm_mon-partial.xml | 16 +++++++++++ cts/cli/regression.crm_mon.exp | 50 ++++++++++++++++++++++++++-------- 2 files changed, 55 insertions(+), 11 deletions(-) diff --git a/cts/cli/crm_mon-partial.xml b/cts/cli/crm_mon-partial.xml index e6c6894b6f..b7817e4775 100644 --- a/cts/cli/crm_mon-partial.xml +++ b/cts/cli/crm_mon-partial.xml @@ -60,6 +60,16 @@ + + + + + + + + + + @@ -94,6 +104,9 @@ + + + @@ -135,6 +148,9 @@ + + + diff --git a/cts/cli/regression.crm_mon.exp b/cts/cli/regression.crm_mon.exp index 8714f917a9..d12dce3ae8 100644 --- a/cts/cli/regression.crm_mon.exp +++ b/cts/cli/regression.crm_mon.exp @@ -3470,7 +3470,7 @@ Cluster Summary: * Last updated: * Last change: * 4 nodes configured - * 13 resource instances configured (1 DISABLED) + * 14 resource instances configured (1 DISABLED) Node List: * Online: [ cluster01 cluster02 ] @@ -3485,6 +3485,9 @@ Active Resources: * httpd-bundle-1 (192.168.122.132) (ocf:heartbeat:apache): Stopped cluster01 * Resource Group: partially-active-group (1 member inactive): * dummy-1 (ocf:pacemaker:Dummy): Started cluster02 + +Failed Resource Actions: + * smart-mon probe on cluster02 returned 'not installed' at Tue Nov 9 15:38:55 2021 after 33ms =#=#=#= End test: Text output of partially active resources - OK (0) =#=#=#= * Passed: crm_mon - Text output of partially active resources =#=#=#= Begin test: XML output of partially active resources =#=#=#= @@ -3495,7 +3498,7 @@ Active Resources: - + @@ -3548,6 +3551,7 @@ Active Resources: + @@ -3574,6 +3578,9 @@ Active Resources: + + + @@ -3603,6 +3610,9 @@ Active Resources: + + + =#=#=#= End test: XML output of partially active resources - OK (0) =#=#=#= @@ -3614,7 +3624,7 @@ Cluster Summary: * Last updated: * Last change: * 4 nodes configured - * 13 resource instances configured (1 DISABLED) + * 14 resource instances configured (1 DISABLED) Node List: * Online: [ cluster01 cluster02 ] @@ -3631,6 +3641,10 @@ Full List of Resources: * Resource Group: partially-active-group: 
* dummy-1 (ocf:pacemaker:Dummy): Started cluster02 * dummy-2 (ocf:pacemaker:Dummy): Stopped (disabled) + * smart-mon (ocf:pacemaker:HealthSMART): Stopped + +Failed Resource Actions: + * smart-mon probe on cluster02 returned 'not installed' at Tue Nov 9 15:38:55 2021 after 33ms =#=#=#= End test: Text output of partially active resources, with inactive resources - OK (0) =#=#=#= * Passed: crm_mon - Text output of partially active resources, with inactive resources =#=#=#= Begin test: Complete brief text output, with inactive resources =#=#=#= @@ -3640,13 +3654,14 @@ Cluster Summary: * Last updated: * Last change: * 4 nodes configured - * 13 resource instances configured (1 DISABLED) + * 14 resource instances configured (1 DISABLED) Node List: * Online: [ cluster01 cluster02 ] * GuestOnline: [ httpd-bundle-0@cluster02 httpd-bundle-1@cluster01 ] Full List of Resources: + * 0/1 (ocf:pacemaker:HealthSMART): Active * 1/1 (stonith:fence_xvm): Active cluster01 * Clone Set: ping-clone [ping]: * Started: [ cluster01 ] @@ -3676,6 +3691,8 @@ Operations: * (3) monitor: interval="30000ms" * dummy-1: migration-threshold=1000000: * (2) start + * smart-mon: migration-threshold=1000000: + * (9) probe * Node: cluster01: * Fencing: migration-threshold=1000000: * (15) start @@ -3695,6 +3712,9 @@ Operations: * Node: httpd-bundle-0@cluster02: * httpd: migration-threshold=1000000: * (1) start + +Failed Resource Actions: + * smart-mon probe on cluster02 returned 'not installed' at Tue Nov 9 15:38:55 2021 after 33ms =#=#=#= End test: Complete brief text output, with inactive resources - OK (0) =#=#=#= * Passed: crm_mon - Complete brief text output, with inactive resources =#=#=#= Begin test: Text output of partially active group =#=#=#= @@ -3704,7 +3724,7 @@ Cluster Summary: * Last updated: * Last change: * 4 nodes configured - * 13 resource instances configured (1 DISABLED) + * 14 resource instances configured (1 DISABLED) Node List: * Online: [ cluster01 cluster02 ] @@ -3722,7 +3742,7 @@ 
Cluster Summary: * Last updated: * Last change: * 4 nodes configured - * 13 resource instances configured (1 DISABLED) + * 14 resource instances configured (1 DISABLED) Node List: * Online: [ cluster01 cluster02 ] @@ -3741,7 +3761,7 @@ Cluster Summary: * Last updated: * Last change: * 4 nodes configured - * 13 resource instances configured (1 DISABLED) + * 14 resource instances configured (1 DISABLED) Node List: * Online: [ cluster01 cluster02 ] @@ -3759,7 +3779,7 @@ Cluster Summary: * Last updated: * Last change: * 4 nodes configured - * 13 resource instances configured (1 DISABLED) + * 14 resource instances configured (1 DISABLED) Node List: * Online: [ cluster01 cluster02 ] @@ -3777,7 +3797,7 @@ Cluster Summary: * Last updated: * Last change: * 4 nodes configured - * 13 resource instances configured (1 DISABLED) + * 14 resource instances configured (1 DISABLED) Node List: * Node cluster01: online: @@ -3806,6 +3826,7 @@ Inactive Resources: * httpd-bundle-1 (192.168.122.132) (ocf:heartbeat:apache): Stopped cluster01 * Resource Group: partially-active-group: * 1/2 (ocf:pacemaker:Dummy): Active cluster02 + * smart-mon (ocf:pacemaker:HealthSMART): Stopped Node Attributes: * Node: cluster01: @@ -3826,6 +3847,8 @@ Operations: * (3) monitor: interval="30000ms" * dummy-1: migration-threshold=1000000: * (2) start + * smart-mon: migration-threshold=1000000: + * (9) probe * Node: cluster01: * Fencing: migration-threshold=1000000: * (15) start @@ -3845,6 +3868,9 @@ Operations: * Node: httpd-bundle-0@cluster02: * httpd: migration-threshold=1000000: * (1) start + +Failed Resource Actions: + * smart-mon probe on cluster02 returned 'not installed' at Tue Nov 9 15:38:55 2021 after 33ms =#=#=#= End test: Complete brief text output grouped by node, with inactive resources - OK (0) =#=#=#= * Passed: crm_mon - Complete brief text output grouped by node, with inactive resources =#=#=#= Begin test: Text output of partially active resources, with inactive resources, filtered by node 
=#=#=#= @@ -3854,7 +3880,7 @@ Cluster Summary: * Last updated: * Last change: * 4 nodes configured - * 13 resource instances configured (1 DISABLED) + * 14 resource instances configured (1 DISABLED) Node List: * Online: [ cluster01 ] @@ -3865,6 +3891,7 @@ Full List of Resources: * Fencing (stonith:fence_xvm): Started cluster01 * Container bundle set: httpd-bundle [pcmk:http]: * httpd-bundle-1 (192.168.122.132) (ocf:heartbeat:apache): Stopped cluster01 + * smart-mon (ocf:pacemaker:HealthSMART): Stopped =#=#=#= End test: Text output of partially active resources, with inactive resources, filtered by node - OK (0) =#=#=#= * Passed: crm_mon - Text output of partially active resources, with inactive resources, filtered by node =#=#=#= Begin test: Text output of partially active resources, filtered by node =#=#=#= @@ -3875,7 +3902,7 @@ Full List of Resources: - + @@ -3905,6 +3932,7 @@ Full List of Resources: + -- 2.27.0 From 1c54d0bbb74d066d55a56eae28d1a579b8854604 Mon Sep 17 00:00:00 2001 From: Chris Lumens Date: Fri, 19 Nov 2021 15:17:52 -0500 Subject: [PATCH 07/21] Test: cts-cli: Add test output for a cloned resource with a failed probe op. There are no code changes yet to properly handle displaying these operations, so the results here just reflect the current handling. 
--- cts/cli/crm_mon-partial.xml | 3 +++ cts/cli/regression.crm_mon.exp | 12 ++++++++++++ 2 files changed, 15 insertions(+) diff --git a/cts/cli/crm_mon-partial.xml b/cts/cli/crm_mon-partial.xml index b7817e4775..1f9dc156aa 100644 --- a/cts/cli/crm_mon-partial.xml +++ b/cts/cli/crm_mon-partial.xml @@ -107,6 +107,9 @@ + + + diff --git a/cts/cli/regression.crm_mon.exp b/cts/cli/regression.crm_mon.exp index d12dce3ae8..d093bd8106 100644 --- a/cts/cli/regression.crm_mon.exp +++ b/cts/cli/regression.crm_mon.exp @@ -3488,6 +3488,7 @@ Active Resources: Failed Resource Actions: * smart-mon probe on cluster02 returned 'not installed' at Tue Nov 9 15:38:55 2021 after 33ms + * ping probe on cluster02 returned 'not installed' at Thu Nov 18 13:11:42 2021 =#=#=#= End test: Text output of partially active resources - OK (0) =#=#=#= * Passed: crm_mon - Text output of partially active resources =#=#=#= Begin test: XML output of partially active resources =#=#=#= @@ -3581,6 +3582,9 @@ Failed Resource Actions: + + + @@ -3612,6 +3616,7 @@ Failed Resource Actions: + @@ -3645,6 +3650,7 @@ Full List of Resources: Failed Resource Actions: * smart-mon probe on cluster02 returned 'not installed' at Tue Nov 9 15:38:55 2021 after 33ms + * ping probe on cluster02 returned 'not installed' at Thu Nov 18 13:11:42 2021 =#=#=#= End test: Text output of partially active resources, with inactive resources - OK (0) =#=#=#= * Passed: crm_mon - Text output of partially active resources, with inactive resources =#=#=#= Begin test: Complete brief text output, with inactive resources =#=#=#= @@ -3693,6 +3699,8 @@ Operations: * (2) start * smart-mon: migration-threshold=1000000: * (9) probe + * ping: migration-threshold=1000000: + * (6) probe * Node: cluster01: * Fencing: migration-threshold=1000000: * (15) start @@ -3715,6 +3723,7 @@ Operations: Failed Resource Actions: * smart-mon probe on cluster02 returned 'not installed' at Tue Nov 9 15:38:55 2021 after 33ms + * ping probe on cluster02 returned 'not 
installed' at Thu Nov 18 13:11:42 2021 =#=#=#= End test: Complete brief text output, with inactive resources - OK (0) =#=#=#= * Passed: crm_mon - Complete brief text output, with inactive resources =#=#=#= Begin test: Text output of partially active group =#=#=#= @@ -3849,6 +3858,8 @@ Operations: * (2) start * smart-mon: migration-threshold=1000000: * (9) probe + * ping: migration-threshold=1000000: + * (6) probe * Node: cluster01: * Fencing: migration-threshold=1000000: * (15) start @@ -3871,6 +3882,7 @@ Operations: Failed Resource Actions: * smart-mon probe on cluster02 returned 'not installed' at Tue Nov 9 15:38:55 2021 after 33ms + * ping probe on cluster02 returned 'not installed' at Thu Nov 18 13:11:42 2021 =#=#=#= End test: Complete brief text output grouped by node, with inactive resources - OK (0) =#=#=#= * Passed: crm_mon - Complete brief text output grouped by node, with inactive resources =#=#=#= Begin test: Text output of partially active resources, with inactive resources, filtered by node =#=#=#= -- 2.27.0 From 9408f08c07eb531ff84b07bf959f3d681ebf2b78 Mon Sep 17 00:00:00 2001 From: Chris Lumens Date: Fri, 19 Nov 2021 15:48:16 -0500 Subject: [PATCH 08/21] Test: cts-cli: Change the resources in partially-active-group. dummy-2 is now not running because it failed to start due to an unimplemented feature. I don't know what could possibly be unimplemented about a dummy resource, but it's not important. There is also a new dummy-3 resource that acts exactly the same as dummy-2. This preserves checking that the inactive member output can still be displayed. There are no code changes yet to properly handle displaying these operations, so the results here just reflect the current handling. 
--- cts/cli/crm_mon-partial.xml | 6 +++- cts/cli/regression.crm_mon.exp | 62 +++++++++++++++++++++++----------- 2 files changed, 47 insertions(+), 21 deletions(-) diff --git a/cts/cli/crm_mon-partial.xml b/cts/cli/crm_mon-partial.xml index 1f9dc156aa..1ce80ea58a 100644 --- a/cts/cli/crm_mon-partial.xml +++ b/cts/cli/crm_mon-partial.xml @@ -54,7 +54,8 @@ - + + @@ -104,6 +105,9 @@ + + + diff --git a/cts/cli/regression.crm_mon.exp b/cts/cli/regression.crm_mon.exp index d093bd8106..8cf3a1215e 100644 --- a/cts/cli/regression.crm_mon.exp +++ b/cts/cli/regression.crm_mon.exp @@ -3470,7 +3470,7 @@ Cluster Summary: * Last updated: * Last change: * 4 nodes configured - * 14 resource instances configured (1 DISABLED) + * 15 resource instances configured (1 DISABLED) Node List: * Online: [ cluster01 cluster02 ] @@ -3485,8 +3485,10 @@ Active Resources: * httpd-bundle-1 (192.168.122.132) (ocf:heartbeat:apache): Stopped cluster01 * Resource Group: partially-active-group (1 member inactive): * dummy-1 (ocf:pacemaker:Dummy): Started cluster02 + * dummy-2 (ocf:pacemaker:Dummy): FAILED cluster02 Failed Resource Actions: + * dummy-2 probe on cluster02 returned 'unimplemented feature' at Wed Sep 2 12:17:38 2020 after 33ms * smart-mon probe on cluster02 returned 'not installed' at Tue Nov 9 15:38:55 2021 after 33ms * ping probe on cluster02 returned 'not installed' at Thu Nov 18 13:11:42 2021 =#=#=#= End test: Text output of partially active resources - OK (0) =#=#=#= @@ -3499,12 +3501,12 @@ Failed Resource Actions: - + - + @@ -3546,11 +3548,14 @@ Failed Resource Actions: - + - + + + + @@ -3579,6 +3584,9 @@ Failed Resource Actions: + + + @@ -3615,6 +3623,7 @@ Failed Resource Actions: + @@ -3629,7 +3638,7 @@ Cluster Summary: * Last updated: * Last change: * 4 nodes configured - * 14 resource instances configured (1 DISABLED) + * 15 resource instances configured (1 DISABLED) Node List: * Online: [ cluster01 cluster02 ] @@ -3645,10 +3654,12 @@ Full List of Resources: * httpd-bundle-1 
(192.168.122.132) (ocf:heartbeat:apache): Stopped cluster01 * Resource Group: partially-active-group: * dummy-1 (ocf:pacemaker:Dummy): Started cluster02 - * dummy-2 (ocf:pacemaker:Dummy): Stopped (disabled) + * dummy-2 (ocf:pacemaker:Dummy): FAILED cluster02 + * dummy-3 (ocf:pacemaker:Dummy): Stopped (disabled) * smart-mon (ocf:pacemaker:HealthSMART): Stopped Failed Resource Actions: + * dummy-2 probe on cluster02 returned 'unimplemented feature' at Wed Sep 2 12:17:38 2020 after 33ms * smart-mon probe on cluster02 returned 'not installed' at Tue Nov 9 15:38:55 2021 after 33ms * ping probe on cluster02 returned 'not installed' at Thu Nov 18 13:11:42 2021 =#=#=#= End test: Text output of partially active resources, with inactive resources - OK (0) =#=#=#= @@ -3660,7 +3671,7 @@ Cluster Summary: * Last updated: * Last change: * 4 nodes configured - * 14 resource instances configured (1 DISABLED) + * 15 resource instances configured (1 DISABLED) Node List: * Online: [ cluster01 cluster02 ] @@ -3676,7 +3687,7 @@ Full List of Resources: * httpd-bundle-0 (192.168.122.131) (ocf:heartbeat:apache): Started cluster02 * httpd-bundle-1 (192.168.122.132) (ocf:heartbeat:apache): Stopped cluster01 * Resource Group: partially-active-group: - * 1/2 (ocf:pacemaker:Dummy): Active cluster02 + * 2/3 (ocf:pacemaker:Dummy): Active cluster02 Node Attributes: * Node: cluster01: @@ -3697,6 +3708,8 @@ Operations: * (3) monitor: interval="30000ms" * dummy-1: migration-threshold=1000000: * (2) start + * dummy-2: migration-threshold=1000000: + * (2) probe * smart-mon: migration-threshold=1000000: * (9) probe * ping: migration-threshold=1000000: @@ -3722,6 +3735,7 @@ Operations: * (1) start Failed Resource Actions: + * dummy-2 probe on cluster02 returned 'unimplemented feature' at Wed Sep 2 12:17:38 2020 after 33ms * smart-mon probe on cluster02 returned 'not installed' at Tue Nov 9 15:38:55 2021 after 33ms * ping probe on cluster02 returned 'not installed' at Thu Nov 18 13:11:42 2021 =#=#=#= End 
test: Complete brief text output, with inactive resources - OK (0) =#=#=#= @@ -3733,7 +3747,7 @@ Cluster Summary: * Last updated: * Last change: * 4 nodes configured - * 14 resource instances configured (1 DISABLED) + * 15 resource instances configured (1 DISABLED) Node List: * Online: [ cluster01 cluster02 ] @@ -3742,6 +3756,7 @@ Node List: Active Resources: * Resource Group: partially-active-group (1 member inactive): * dummy-1 (ocf:pacemaker:Dummy): Started cluster02 + * dummy-2 (ocf:pacemaker:Dummy): FAILED cluster02 =#=#=#= End test: Text output of partially active group - OK (0) =#=#=#= * Passed: crm_mon - Text output of partially active group =#=#=#= Begin test: Text output of partially active group, with inactive resources =#=#=#= @@ -3751,7 +3766,7 @@ Cluster Summary: * Last updated: * Last change: * 4 nodes configured - * 14 resource instances configured (1 DISABLED) + * 15 resource instances configured (1 DISABLED) Node List: * Online: [ cluster01 cluster02 ] @@ -3760,7 +3775,8 @@ Node List: Full List of Resources: * Resource Group: partially-active-group: * dummy-1 (ocf:pacemaker:Dummy): Started cluster02 - * dummy-2 (ocf:pacemaker:Dummy): Stopped (disabled) + * dummy-2 (ocf:pacemaker:Dummy): FAILED cluster02 + * dummy-3 (ocf:pacemaker:Dummy): Stopped (disabled) =#=#=#= End test: Text output of partially active group, with inactive resources - OK (0) =#=#=#= * Passed: crm_mon - Text output of partially active group, with inactive resources =#=#=#= Begin test: Text output of active member of partially active group =#=#=#= @@ -3770,7 +3786,7 @@ Cluster Summary: * Last updated: * Last change: * 4 nodes configured - * 14 resource instances configured (1 DISABLED) + * 15 resource instances configured (1 DISABLED) Node List: * Online: [ cluster01 cluster02 ] @@ -3788,7 +3804,7 @@ Cluster Summary: * Last updated: * Last change: * 4 nodes configured - * 14 resource instances configured (1 DISABLED) + * 15 resource instances configured (1 DISABLED) Node List: * 
Online: [ cluster01 cluster02 ] @@ -3796,7 +3812,10 @@ Node List: Active Resources: * Resource Group: partially-active-group (1 member inactive): - * dummy-2 (ocf:pacemaker:Dummy): Stopped (disabled) + * dummy-2 (ocf:pacemaker:Dummy): FAILED cluster02 + +Failed Resource Actions: + * dummy-2 probe on cluster02 returned 'unimplemented feature' at Wed Sep 2 12:17:38 2020 after 33ms =#=#=#= End test: Text output of inactive member of partially active group - OK (0) =#=#=#= * Passed: crm_mon - Text output of inactive member of partially active group =#=#=#= Begin test: Complete brief text output grouped by node, with inactive resources =#=#=#= @@ -3806,7 +3825,7 @@ Cluster Summary: * Last updated: * Last change: * 4 nodes configured - * 14 resource instances configured (1 DISABLED) + * 15 resource instances configured (1 DISABLED) Node List: * Node cluster01: online: @@ -3820,7 +3839,7 @@ Node List: * Resources: * 1 (ocf:heartbeat:IPaddr2): Active * 1 (ocf:heartbeat:docker): Active - * 1 (ocf:pacemaker:Dummy): Active + * 2 (ocf:pacemaker:Dummy): Active * 1 (ocf:pacemaker:remote): Active * GuestNode httpd-bundle-0@cluster02: online: * Resources: @@ -3834,7 +3853,7 @@ Inactive Resources: * httpd-bundle-0 (192.168.122.131) (ocf:heartbeat:apache): Started cluster02 * httpd-bundle-1 (192.168.122.132) (ocf:heartbeat:apache): Stopped cluster01 * Resource Group: partially-active-group: - * 1/2 (ocf:pacemaker:Dummy): Active cluster02 + * 2/3 (ocf:pacemaker:Dummy): Active cluster02 * smart-mon (ocf:pacemaker:HealthSMART): Stopped Node Attributes: @@ -3856,6 +3875,8 @@ Operations: * (3) monitor: interval="30000ms" * dummy-1: migration-threshold=1000000: * (2) start + * dummy-2: migration-threshold=1000000: + * (2) probe * smart-mon: migration-threshold=1000000: * (9) probe * ping: migration-threshold=1000000: @@ -3881,6 +3902,7 @@ Operations: * (1) start Failed Resource Actions: + * dummy-2 probe on cluster02 returned 'unimplemented feature' at Wed Sep 2 12:17:38 2020 after 33ms * 
smart-mon probe on cluster02 returned 'not installed' at Tue Nov 9 15:38:55 2021 after 33ms * ping probe on cluster02 returned 'not installed' at Thu Nov 18 13:11:42 2021 =#=#=#= End test: Complete brief text output grouped by node, with inactive resources - OK (0) =#=#=#= @@ -3892,7 +3914,7 @@ Cluster Summary: * Last updated: * Last change: * 4 nodes configured - * 14 resource instances configured (1 DISABLED) + * 15 resource instances configured (1 DISABLED) Node List: * Online: [ cluster01 ] @@ -3914,7 +3936,7 @@ Full List of Resources: - + -- 2.27.0 From 85e76b8bdb4de261a9cb4858eeedd49fba0346a1 Mon Sep 17 00:00:00 2001 From: Chris Lumens Date: Fri, 19 Nov 2021 15:55:51 -0500 Subject: [PATCH 09/21] Test: cts-cli: Add a failed probe on a new dummy-4 resource. This is to verify that these resources which are part of a group are displayed properly. No code changes will be necessary, since groups are just several other resources all in the same pile. There are no code changes yet to properly handle displaying these operations, so the results here just reflect the current handling. 
--- cts/cli/crm_mon-partial.xml | 4 +++ cts/cli/regression.crm_mon.exp | 51 ++++++++++++++++++++++------------ 2 files changed, 37 insertions(+), 18 deletions(-) diff --git a/cts/cli/crm_mon-partial.xml b/cts/cli/crm_mon-partial.xml index 1ce80ea58a..d4d4a70848 100644 --- a/cts/cli/crm_mon-partial.xml +++ b/cts/cli/crm_mon-partial.xml @@ -60,6 +60,7 @@ + @@ -108,6 +109,9 @@ + + + diff --git a/cts/cli/regression.crm_mon.exp b/cts/cli/regression.crm_mon.exp index 8cf3a1215e..c524b199e3 100644 --- a/cts/cli/regression.crm_mon.exp +++ b/cts/cli/regression.crm_mon.exp @@ -3470,7 +3470,7 @@ Cluster Summary: * Last updated: * Last change: * 4 nodes configured - * 15 resource instances configured (1 DISABLED) + * 16 resource instances configured (1 DISABLED) Node List: * Online: [ cluster01 cluster02 ] @@ -3483,12 +3483,13 @@ Active Resources: * Container bundle set: httpd-bundle [pcmk:http]: * httpd-bundle-0 (192.168.122.131) (ocf:heartbeat:apache): Started cluster02 * httpd-bundle-1 (192.168.122.132) (ocf:heartbeat:apache): Stopped cluster01 - * Resource Group: partially-active-group (1 member inactive): + * Resource Group: partially-active-group (2 members inactive): * dummy-1 (ocf:pacemaker:Dummy): Started cluster02 * dummy-2 (ocf:pacemaker:Dummy): FAILED cluster02 Failed Resource Actions: * dummy-2 probe on cluster02 returned 'unimplemented feature' at Wed Sep 2 12:17:38 2020 after 33ms + * dummy-4 probe on cluster02 returned 'not installed' at Wed Sep 2 12:17:38 2020 * smart-mon probe on cluster02 returned 'not installed' at Tue Nov 9 15:38:55 2021 after 33ms * ping probe on cluster02 returned 'not installed' at Thu Nov 18 13:11:42 2021 =#=#=#= End test: Text output of partially active resources - OK (0) =#=#=#= @@ -3501,7 +3502,7 @@ Failed Resource Actions: - + @@ -3548,7 +3549,7 @@ Failed Resource Actions: - + @@ -3556,6 +3557,7 @@ Failed Resource Actions: + @@ -3587,6 +3589,9 @@ Failed Resource Actions: + + + @@ -3624,6 +3629,7 @@ Failed Resource Actions: + @@ 
-3638,7 +3644,7 @@ Cluster Summary: * Last updated: * Last change: * 4 nodes configured - * 15 resource instances configured (1 DISABLED) + * 16 resource instances configured (1 DISABLED) Node List: * Online: [ cluster01 cluster02 ] @@ -3656,10 +3662,12 @@ Full List of Resources: * dummy-1 (ocf:pacemaker:Dummy): Started cluster02 * dummy-2 (ocf:pacemaker:Dummy): FAILED cluster02 * dummy-3 (ocf:pacemaker:Dummy): Stopped (disabled) + * dummy-4 (ocf:pacemaker:Dummy): Stopped * smart-mon (ocf:pacemaker:HealthSMART): Stopped Failed Resource Actions: * dummy-2 probe on cluster02 returned 'unimplemented feature' at Wed Sep 2 12:17:38 2020 after 33ms + * dummy-4 probe on cluster02 returned 'not installed' at Wed Sep 2 12:17:38 2020 * smart-mon probe on cluster02 returned 'not installed' at Tue Nov 9 15:38:55 2021 after 33ms * ping probe on cluster02 returned 'not installed' at Thu Nov 18 13:11:42 2021 =#=#=#= End test: Text output of partially active resources, with inactive resources - OK (0) =#=#=#= @@ -3671,7 +3679,7 @@ Cluster Summary: * Last updated: * Last change: * 4 nodes configured - * 15 resource instances configured (1 DISABLED) + * 16 resource instances configured (1 DISABLED) Node List: * Online: [ cluster01 cluster02 ] @@ -3687,7 +3695,7 @@ Full List of Resources: * httpd-bundle-0 (192.168.122.131) (ocf:heartbeat:apache): Started cluster02 * httpd-bundle-1 (192.168.122.132) (ocf:heartbeat:apache): Stopped cluster01 * Resource Group: partially-active-group: - * 2/3 (ocf:pacemaker:Dummy): Active cluster02 + * 2/4 (ocf:pacemaker:Dummy): Active cluster02 Node Attributes: * Node: cluster01: @@ -3710,6 +3718,8 @@ Operations: * (2) start * dummy-2: migration-threshold=1000000: * (2) probe + * dummy-4: migration-threshold=1000000: + * (2) probe * smart-mon: migration-threshold=1000000: * (9) probe * ping: migration-threshold=1000000: @@ -3736,6 +3746,7 @@ Operations: Failed Resource Actions: * dummy-2 probe on cluster02 returned 'unimplemented feature' at Wed Sep 2 
12:17:38 2020 after 33ms + * dummy-4 probe on cluster02 returned 'not installed' at Wed Sep 2 12:17:38 2020 * smart-mon probe on cluster02 returned 'not installed' at Tue Nov 9 15:38:55 2021 after 33ms * ping probe on cluster02 returned 'not installed' at Thu Nov 18 13:11:42 2021 =#=#=#= End test: Complete brief text output, with inactive resources - OK (0) =#=#=#= @@ -3747,14 +3758,14 @@ Cluster Summary: * Last updated: * Last change: * 4 nodes configured - * 15 resource instances configured (1 DISABLED) + * 16 resource instances configured (1 DISABLED) Node List: * Online: [ cluster01 cluster02 ] * GuestOnline: [ httpd-bundle-0@cluster02 httpd-bundle-1@cluster01 ] Active Resources: - * Resource Group: partially-active-group (1 member inactive): + * Resource Group: partially-active-group (2 members inactive): * dummy-1 (ocf:pacemaker:Dummy): Started cluster02 * dummy-2 (ocf:pacemaker:Dummy): FAILED cluster02 =#=#=#= End test: Text output of partially active group - OK (0) =#=#=#= @@ -3766,7 +3777,7 @@ Cluster Summary: * Last updated: * Last change: * 4 nodes configured - * 15 resource instances configured (1 DISABLED) + * 16 resource instances configured (1 DISABLED) Node List: * Online: [ cluster01 cluster02 ] @@ -3777,6 +3788,7 @@ Full List of Resources: * dummy-1 (ocf:pacemaker:Dummy): Started cluster02 * dummy-2 (ocf:pacemaker:Dummy): FAILED cluster02 * dummy-3 (ocf:pacemaker:Dummy): Stopped (disabled) + * dummy-4 (ocf:pacemaker:Dummy): Stopped =#=#=#= End test: Text output of partially active group, with inactive resources - OK (0) =#=#=#= * Passed: crm_mon - Text output of partially active group, with inactive resources =#=#=#= Begin test: Text output of active member of partially active group =#=#=#= @@ -3786,14 +3798,14 @@ Cluster Summary: * Last updated: * Last change: * 4 nodes configured - * 15 resource instances configured (1 DISABLED) + * 16 resource instances configured (1 DISABLED) Node List: * Online: [ cluster01 cluster02 ] * GuestOnline: [ 
httpd-bundle-0@cluster02 httpd-bundle-1@cluster01 ] Active Resources: - * Resource Group: partially-active-group (1 member inactive): + * Resource Group: partially-active-group (2 members inactive): * dummy-1 (ocf:pacemaker:Dummy): Started cluster02 =#=#=#= End test: Text output of active member of partially active group - OK (0) =#=#=#= * Passed: crm_mon - Text output of active member of partially active group @@ -3804,14 +3816,14 @@ Cluster Summary: * Last updated: * Last change: * 4 nodes configured - * 15 resource instances configured (1 DISABLED) + * 16 resource instances configured (1 DISABLED) Node List: * Online: [ cluster01 cluster02 ] * GuestOnline: [ httpd-bundle-0@cluster02 httpd-bundle-1@cluster01 ] Active Resources: - * Resource Group: partially-active-group (1 member inactive): + * Resource Group: partially-active-group (2 members inactive): * dummy-2 (ocf:pacemaker:Dummy): FAILED cluster02 Failed Resource Actions: @@ -3825,7 +3837,7 @@ Cluster Summary: * Last updated: * Last change: * 4 nodes configured - * 15 resource instances configured (1 DISABLED) + * 16 resource instances configured (1 DISABLED) Node List: * Node cluster01: online: @@ -3853,7 +3865,7 @@ Inactive Resources: * httpd-bundle-0 (192.168.122.131) (ocf:heartbeat:apache): Started cluster02 * httpd-bundle-1 (192.168.122.132) (ocf:heartbeat:apache): Stopped cluster01 * Resource Group: partially-active-group: - * 2/3 (ocf:pacemaker:Dummy): Active cluster02 + * 2/4 (ocf:pacemaker:Dummy): Active cluster02 * smart-mon (ocf:pacemaker:HealthSMART): Stopped Node Attributes: @@ -3877,6 +3889,8 @@ Operations: * (2) start * dummy-2: migration-threshold=1000000: * (2) probe + * dummy-4: migration-threshold=1000000: + * (2) probe * smart-mon: migration-threshold=1000000: * (9) probe * ping: migration-threshold=1000000: @@ -3903,6 +3917,7 @@ Operations: Failed Resource Actions: * dummy-2 probe on cluster02 returned 'unimplemented feature' at Wed Sep 2 12:17:38 2020 after 33ms + * dummy-4 probe on 
cluster02 returned 'not installed' at Wed Sep 2 12:17:38 2020 * smart-mon probe on cluster02 returned 'not installed' at Tue Nov 9 15:38:55 2021 after 33ms * ping probe on cluster02 returned 'not installed' at Thu Nov 18 13:11:42 2021 =#=#=#= End test: Complete brief text output grouped by node, with inactive resources - OK (0) =#=#=#= @@ -3914,7 +3929,7 @@ Cluster Summary: * Last updated: * Last change: * 4 nodes configured - * 15 resource instances configured (1 DISABLED) + * 16 resource instances configured (1 DISABLED) Node List: * Online: [ cluster01 ] @@ -3936,7 +3951,7 @@ Full List of Resources: - + -- 2.27.0 From 206d733b6ce8e0ffcad243d282e8baa8c3ff72b4 Mon Sep 17 00:00:00 2001 From: Chris Lumens Date: Tue, 23 Nov 2021 14:33:47 -0500 Subject: [PATCH 10/21] Test: cts-cli: Add test output for a bundle resource with a failed probe op. This just changes the existing failed bundle resource from not starting to failing with a reason. There are no code changes yet to properly handle displaying these operations, so the results here just reflect the current handling. 
--- cts/cli/crm_mon-partial.xml | 9 ++++++++ cts/cli/regression.crm_mon.exp | 40 +++++++++++++++++++++++++--------- 2 files changed, 39 insertions(+), 10 deletions(-) diff --git a/cts/cli/crm_mon-partial.xml b/cts/cli/crm_mon-partial.xml index d4d4a70848..5981fc653c 100644 --- a/cts/cli/crm_mon-partial.xml +++ b/cts/cli/crm_mon-partial.xml @@ -178,5 +178,14 @@ + + + + + + + + + diff --git a/cts/cli/regression.crm_mon.exp b/cts/cli/regression.crm_mon.exp index c524b199e3..b690a26fb6 100644 --- a/cts/cli/regression.crm_mon.exp +++ b/cts/cli/regression.crm_mon.exp @@ -3482,7 +3482,7 @@ Active Resources: * Fencing (stonith:fence_xvm): Started cluster01 * Container bundle set: httpd-bundle [pcmk:http]: * httpd-bundle-0 (192.168.122.131) (ocf:heartbeat:apache): Started cluster02 - * httpd-bundle-1 (192.168.122.132) (ocf:heartbeat:apache): Stopped cluster01 + * httpd-bundle-1 (192.168.122.132) (ocf:heartbeat:apache): FAILED cluster01 * Resource Group: partially-active-group (2 members inactive): * dummy-1 (ocf:pacemaker:Dummy): Started cluster02 * dummy-2 (ocf:pacemaker:Dummy): FAILED cluster02 @@ -3492,6 +3492,7 @@ Failed Resource Actions: * dummy-4 probe on cluster02 returned 'not installed' at Wed Sep 2 12:17:38 2020 * smart-mon probe on cluster02 returned 'not installed' at Tue Nov 9 15:38:55 2021 after 33ms * ping probe on cluster02 returned 'not installed' at Thu Nov 18 13:11:42 2021 + * httpd probe on httpd-bundle-1 returned 'invalid parameter' at Wed May 27 15:43:09 2020 =#=#=#= End test: Text output of partially active resources - OK (0) =#=#=#= * Passed: crm_mon - Text output of partially active resources =#=#=#= Begin test: XML output of partially active resources =#=#=#= @@ -3509,7 +3510,7 @@ Failed Resource Actions: - + @@ -3540,7 +3541,9 @@ Failed Resource Actions: - + + + @@ -3626,12 +3629,18 @@ Failed Resource Actions: + + + + + + @@ -3657,7 +3666,7 @@ Full List of Resources: * Fencing (stonith:fence_xvm): Started cluster01 * Container bundle set: 
httpd-bundle [pcmk:http]: * httpd-bundle-0 (192.168.122.131) (ocf:heartbeat:apache): Started cluster02 - * httpd-bundle-1 (192.168.122.132) (ocf:heartbeat:apache): Stopped cluster01 + * httpd-bundle-1 (192.168.122.132) (ocf:heartbeat:apache): FAILED cluster01 * Resource Group: partially-active-group: * dummy-1 (ocf:pacemaker:Dummy): Started cluster02 * dummy-2 (ocf:pacemaker:Dummy): FAILED cluster02 @@ -3670,6 +3679,7 @@ Failed Resource Actions: * dummy-4 probe on cluster02 returned 'not installed' at Wed Sep 2 12:17:38 2020 * smart-mon probe on cluster02 returned 'not installed' at Tue Nov 9 15:38:55 2021 after 33ms * ping probe on cluster02 returned 'not installed' at Thu Nov 18 13:11:42 2021 + * httpd probe on httpd-bundle-1 returned 'invalid parameter' at Wed May 27 15:43:09 2020 =#=#=#= End test: Text output of partially active resources, with inactive resources - OK (0) =#=#=#= * Passed: crm_mon - Text output of partially active resources, with inactive resources =#=#=#= Begin test: Complete brief text output, with inactive resources =#=#=#= @@ -3693,7 +3703,7 @@ Full List of Resources: * Stopped: [ cluster02 ] * Container bundle set: httpd-bundle [pcmk:http]: * httpd-bundle-0 (192.168.122.131) (ocf:heartbeat:apache): Started cluster02 - * httpd-bundle-1 (192.168.122.132) (ocf:heartbeat:apache): Stopped cluster01 + * httpd-bundle-1 (192.168.122.132) (ocf:heartbeat:apache): FAILED cluster01 * Resource Group: partially-active-group: * 2/4 (ocf:pacemaker:Dummy): Active cluster02 @@ -3743,12 +3753,16 @@ Operations: * Node: httpd-bundle-0@cluster02: * httpd: migration-threshold=1000000: * (1) start + * Node: httpd-bundle-1@cluster01: + * httpd: migration-threshold=1000000: + * (1) probe Failed Resource Actions: * dummy-2 probe on cluster02 returned 'unimplemented feature' at Wed Sep 2 12:17:38 2020 after 33ms * dummy-4 probe on cluster02 returned 'not installed' at Wed Sep 2 12:17:38 2020 * smart-mon probe on cluster02 returned 'not installed' at Tue Nov 9 
15:38:55 2021 after 33ms * ping probe on cluster02 returned 'not installed' at Thu Nov 18 13:11:42 2021 + * httpd probe on httpd-bundle-1 returned 'invalid parameter' at Wed May 27 15:43:09 2020 =#=#=#= End test: Complete brief text output, with inactive resources - OK (0) =#=#=#= * Passed: crm_mon - Complete brief text output, with inactive resources =#=#=#= Begin test: Text output of partially active group =#=#=#= @@ -3856,14 +3870,14 @@ Node List: * GuestNode httpd-bundle-0@cluster02: online: * Resources: * 1 (ocf:heartbeat:apache): Active + * GuestNode httpd-bundle-1@cluster01: online: + * Resources: + * 1 (ocf:heartbeat:apache): Active Inactive Resources: * Clone Set: ping-clone [ping]: * Started: [ cluster01 ] * Stopped: [ cluster02 ] - * Container bundle set: httpd-bundle [pcmk:http]: - * httpd-bundle-0 (192.168.122.131) (ocf:heartbeat:apache): Started cluster02 - * httpd-bundle-1 (192.168.122.132) (ocf:heartbeat:apache): Stopped cluster01 * Resource Group: partially-active-group: * 2/4 (ocf:pacemaker:Dummy): Active cluster02 * smart-mon (ocf:pacemaker:HealthSMART): Stopped @@ -3914,12 +3928,16 @@ Operations: * Node: httpd-bundle-0@cluster02: * httpd: migration-threshold=1000000: * (1) start + * Node: httpd-bundle-1@cluster01: + * httpd: migration-threshold=1000000: + * (1) probe Failed Resource Actions: * dummy-2 probe on cluster02 returned 'unimplemented feature' at Wed Sep 2 12:17:38 2020 after 33ms * dummy-4 probe on cluster02 returned 'not installed' at Wed Sep 2 12:17:38 2020 * smart-mon probe on cluster02 returned 'not installed' at Tue Nov 9 15:38:55 2021 after 33ms * ping probe on cluster02 returned 'not installed' at Thu Nov 18 13:11:42 2021 + * httpd probe on httpd-bundle-1 returned 'invalid parameter' at Wed May 27 15:43:09 2020 =#=#=#= End test: Complete brief text output grouped by node, with inactive resources - OK (0) =#=#=#= * Passed: crm_mon - Complete brief text output grouped by node, with inactive resources =#=#=#= Begin test: Text 
output of partially active resources, with inactive resources, filtered by node =#=#=#= @@ -3939,7 +3957,7 @@ Full List of Resources: * Started: [ cluster01 ] * Fencing (stonith:fence_xvm): Started cluster01 * Container bundle set: httpd-bundle [pcmk:http]: - * httpd-bundle-1 (192.168.122.132) (ocf:heartbeat:apache): Stopped cluster01 + * httpd-bundle-1 (192.168.122.132) (ocf:heartbeat:apache): FAILED cluster01 * smart-mon (ocf:pacemaker:HealthSMART): Stopped =#=#=#= End test: Text output of partially active resources, with inactive resources, filtered by node - OK (0) =#=#=#= * Passed: crm_mon - Text output of partially active resources, with inactive resources, filtered by node @@ -3972,7 +3990,9 @@ Full List of Resources: - + + + -- 2.27.0 From 6240a28d36c0349e3b1d7f52c36106580c53bb01 Mon Sep 17 00:00:00 2001 From: Chris Lumens Date: Mon, 22 Nov 2021 10:59:10 -0500 Subject: [PATCH 11/21] Test: cts: Add --show-detail to a couple of the crm_mon tests. This straightens out a couple differences in output between running tests locally (where --enable-compat-2.0 is not given, which would automatically add --show-detail) and running tests under mock (where that option is given). Note that this only really matters for failed resource actions, which were not previously output as part of any crm_mon regression test. It is only the patches in this series that have introduced those, and thus this difference. 
--- cts/cli/regression.crm_mon.exp | 131 ++++++++++++++++++++------------- cts/cts-cli.in | 10 +-- 2 files changed, 83 insertions(+), 58 deletions(-) diff --git a/cts/cli/regression.crm_mon.exp b/cts/cli/regression.crm_mon.exp index b690a26fb6..d7b9d98e2c 100644 --- a/cts/cli/regression.crm_mon.exp +++ b/cts/cli/regression.crm_mon.exp @@ -3466,33 +3466,42 @@ Operations: =#=#=#= Begin test: Text output of partially active resources =#=#=#= Cluster Summary: * Stack: corosync - * Current DC: cluster02 (version) - partition with quorum + * Current DC: cluster02 (2) (version) - partition with quorum * Last updated: * Last change: * 4 nodes configured * 16 resource instances configured (1 DISABLED) Node List: - * Online: [ cluster01 cluster02 ] + * Online: [ cluster01 (1) cluster02 (2) ] * GuestOnline: [ httpd-bundle-0@cluster02 httpd-bundle-1@cluster01 ] Active Resources: * Clone Set: ping-clone [ping]: - * Started: [ cluster01 ] + * ping (ocf:pacemaker:ping): Started cluster01 + * ping (ocf:pacemaker:ping): Stopped * Fencing (stonith:fence_xvm): Started cluster01 * Container bundle set: httpd-bundle [pcmk:http]: - * httpd-bundle-0 (192.168.122.131) (ocf:heartbeat:apache): Started cluster02 - * httpd-bundle-1 (192.168.122.132) (ocf:heartbeat:apache): FAILED cluster01 + * Replica[0] + * httpd-bundle-ip-192.168.122.131 (ocf:heartbeat:IPaddr2): Started cluster02 + * httpd (ocf:heartbeat:apache): Started httpd-bundle-0 + * httpd-bundle-docker-0 (ocf:heartbeat:docker): Started cluster02 + * httpd-bundle-0 (ocf:pacemaker:remote): Started cluster02 + * Replica[1] + * httpd-bundle-ip-192.168.122.132 (ocf:heartbeat:IPaddr2): Started cluster01 + * httpd (ocf:heartbeat:apache): FAILED httpd-bundle-1 + * httpd-bundle-docker-1 (ocf:heartbeat:docker): Started cluster01 + * httpd-bundle-1 (ocf:pacemaker:remote): Started cluster01 * Resource Group: partially-active-group (2 members inactive): * dummy-1 (ocf:pacemaker:Dummy): Started cluster02 * dummy-2 (ocf:pacemaker:Dummy): FAILED 
cluster02 Failed Resource Actions: - * dummy-2 probe on cluster02 returned 'unimplemented feature' at Wed Sep 2 12:17:38 2020 after 33ms - * dummy-4 probe on cluster02 returned 'not installed' at Wed Sep 2 12:17:38 2020 - * smart-mon probe on cluster02 returned 'not installed' at Tue Nov 9 15:38:55 2021 after 33ms - * ping probe on cluster02 returned 'not installed' at Thu Nov 18 13:11:42 2021 - * httpd probe on httpd-bundle-1 returned 'invalid parameter' at Wed May 27 15:43:09 2020 + * dummy-2_monitor_0 on cluster02 'unimplemented feature' (3): call=2, status='complete', last-rc-change='Wed Sep 2 12:17:38 2020', queued=0ms, exec=33ms + * dummy-4_monitor_0 on cluster02 'not installed' (5): call=2, status='complete', last-rc-change='Wed Sep 2 12:17:38 2020', queued=0ms, exec=0ms + * smart-mon_monitor_0 on cluster02 'not installed' (5): call=9, status='complete', last-rc-change='Tue Nov 9 15:38:55 2021', queued=0ms, exec=33ms + * ping_monitor_0 on cluster02 'not installed' (5): call=6, status='complete', last-rc-change='Thu Nov 18 13:11:42 2021', queued=0ms, exec=0ms + * httpd_monitor_0 on httpd-bundle-1 'invalid parameter' (2): call=1, status='complete', last-rc-change='Wed May 27 15:43:09 2020', queued=0ms, exec=0ms =#=#=#= End test: Text output of partially active resources - OK (0) =#=#=#= * Passed: crm_mon - Text output of partially active resources =#=#=#= Begin test: XML output of partially active resources =#=#=#= @@ -3649,24 +3658,32 @@ Failed Resource Actions: =#=#=#= Begin test: Text output of partially active resources, with inactive resources =#=#=#= Cluster Summary: * Stack: corosync - * Current DC: cluster02 (version) - partition with quorum + * Current DC: cluster02 (2) (version) - partition with quorum * Last updated: * Last change: * 4 nodes configured * 16 resource instances configured (1 DISABLED) Node List: - * Online: [ cluster01 cluster02 ] + * Online: [ cluster01 (1) cluster02 (2) ] * GuestOnline: [ httpd-bundle-0@cluster02 
httpd-bundle-1@cluster01 ] Full List of Resources: * Clone Set: ping-clone [ping]: - * Started: [ cluster01 ] - * Stopped: [ cluster02 ] + * ping (ocf:pacemaker:ping): Started cluster01 + * ping (ocf:pacemaker:ping): Stopped * Fencing (stonith:fence_xvm): Started cluster01 * Container bundle set: httpd-bundle [pcmk:http]: - * httpd-bundle-0 (192.168.122.131) (ocf:heartbeat:apache): Started cluster02 - * httpd-bundle-1 (192.168.122.132) (ocf:heartbeat:apache): FAILED cluster01 + * Replica[0] + * httpd-bundle-ip-192.168.122.131 (ocf:heartbeat:IPaddr2): Started cluster02 + * httpd (ocf:heartbeat:apache): Started httpd-bundle-0 + * httpd-bundle-docker-0 (ocf:heartbeat:docker): Started cluster02 + * httpd-bundle-0 (ocf:pacemaker:remote): Started cluster02 + * Replica[1] + * httpd-bundle-ip-192.168.122.132 (ocf:heartbeat:IPaddr2): Started cluster01 + * httpd (ocf:heartbeat:apache): FAILED httpd-bundle-1 + * httpd-bundle-docker-1 (ocf:heartbeat:docker): Started cluster01 + * httpd-bundle-1 (ocf:pacemaker:remote): Started cluster01 * Resource Group: partially-active-group: * dummy-1 (ocf:pacemaker:Dummy): Started cluster02 * dummy-2 (ocf:pacemaker:Dummy): FAILED cluster02 @@ -3675,46 +3692,54 @@ Full List of Resources: * smart-mon (ocf:pacemaker:HealthSMART): Stopped Failed Resource Actions: - * dummy-2 probe on cluster02 returned 'unimplemented feature' at Wed Sep 2 12:17:38 2020 after 33ms - * dummy-4 probe on cluster02 returned 'not installed' at Wed Sep 2 12:17:38 2020 - * smart-mon probe on cluster02 returned 'not installed' at Tue Nov 9 15:38:55 2021 after 33ms - * ping probe on cluster02 returned 'not installed' at Thu Nov 18 13:11:42 2021 - * httpd probe on httpd-bundle-1 returned 'invalid parameter' at Wed May 27 15:43:09 2020 + * dummy-2_monitor_0 on cluster02 'unimplemented feature' (3): call=2, status='complete', last-rc-change='Wed Sep 2 12:17:38 2020', queued=0ms, exec=33ms + * dummy-4_monitor_0 on cluster02 'not installed' (5): call=2, status='complete', 
last-rc-change='Wed Sep 2 12:17:38 2020', queued=0ms, exec=0ms + * smart-mon_monitor_0 on cluster02 'not installed' (5): call=9, status='complete', last-rc-change='Tue Nov 9 15:38:55 2021', queued=0ms, exec=33ms + * ping_monitor_0 on cluster02 'not installed' (5): call=6, status='complete', last-rc-change='Thu Nov 18 13:11:42 2021', queued=0ms, exec=0ms + * httpd_monitor_0 on httpd-bundle-1 'invalid parameter' (2): call=1, status='complete', last-rc-change='Wed May 27 15:43:09 2020', queued=0ms, exec=0ms =#=#=#= End test: Text output of partially active resources, with inactive resources - OK (0) =#=#=#= * Passed: crm_mon - Text output of partially active resources, with inactive resources =#=#=#= Begin test: Complete brief text output, with inactive resources =#=#=#= Cluster Summary: * Stack: corosync - * Current DC: cluster02 (version) - partition with quorum + * Current DC: cluster02 (2) (version) - partition with quorum * Last updated: * Last change: * 4 nodes configured * 16 resource instances configured (1 DISABLED) Node List: - * Online: [ cluster01 cluster02 ] + * Online: [ cluster01 (1) cluster02 (2) ] * GuestOnline: [ httpd-bundle-0@cluster02 httpd-bundle-1@cluster01 ] Full List of Resources: * 0/1 (ocf:pacemaker:HealthSMART): Active * 1/1 (stonith:fence_xvm): Active cluster01 * Clone Set: ping-clone [ping]: - * Started: [ cluster01 ] - * Stopped: [ cluster02 ] + * ping (ocf:pacemaker:ping): Started cluster01 + * ping (ocf:pacemaker:ping): Stopped * Container bundle set: httpd-bundle [pcmk:http]: - * httpd-bundle-0 (192.168.122.131) (ocf:heartbeat:apache): Started cluster02 - * httpd-bundle-1 (192.168.122.132) (ocf:heartbeat:apache): FAILED cluster01 + * Replica[0] + * httpd-bundle-ip-192.168.122.131 (ocf:heartbeat:IPaddr2): Started cluster02 + * httpd (ocf:heartbeat:apache): Started httpd-bundle-0 + * httpd-bundle-docker-0 (ocf:heartbeat:docker): Started cluster02 + * httpd-bundle-0 (ocf:pacemaker:remote): Started cluster02 + * Replica[1] + * 
httpd-bundle-ip-192.168.122.132 (ocf:heartbeat:IPaddr2): Started cluster01 + * httpd (ocf:heartbeat:apache): FAILED httpd-bundle-1 + * httpd-bundle-docker-1 (ocf:heartbeat:docker): Started cluster01 + * httpd-bundle-1 (ocf:pacemaker:remote): Started cluster01 * Resource Group: partially-active-group: * 2/4 (ocf:pacemaker:Dummy): Active cluster02 Node Attributes: - * Node: cluster01: + * Node: cluster01 (1): * pingd : 1000 - * Node: cluster02: + * Node: cluster02 (2): * pingd : 1000 Operations: - * Node: cluster02: + * Node: cluster02 (2): * httpd-bundle-ip-192.168.122.131: migration-threshold=1000000: * (2) start * (3) monitor: interval="60000ms" @@ -3734,7 +3759,7 @@ Operations: * (9) probe * ping: migration-threshold=1000000: * (6) probe - * Node: cluster01: + * Node: cluster01 (1): * Fencing: migration-threshold=1000000: * (15) start * (20) monitor: interval="60000ms" @@ -3758,11 +3783,11 @@ Operations: * (1) probe Failed Resource Actions: - * dummy-2 probe on cluster02 returned 'unimplemented feature' at Wed Sep 2 12:17:38 2020 after 33ms - * dummy-4 probe on cluster02 returned 'not installed' at Wed Sep 2 12:17:38 2020 - * smart-mon probe on cluster02 returned 'not installed' at Tue Nov 9 15:38:55 2021 after 33ms - * ping probe on cluster02 returned 'not installed' at Thu Nov 18 13:11:42 2021 - * httpd probe on httpd-bundle-1 returned 'invalid parameter' at Wed May 27 15:43:09 2020 + * dummy-2_monitor_0 on cluster02 'unimplemented feature' (3): call=2, status='complete', last-rc-change='Wed Sep 2 12:17:38 2020', queued=0ms, exec=33ms + * dummy-4_monitor_0 on cluster02 'not installed' (5): call=2, status='complete', last-rc-change='Wed Sep 2 12:17:38 2020', queued=0ms, exec=0ms + * smart-mon_monitor_0 on cluster02 'not installed' (5): call=9, status='complete', last-rc-change='Tue Nov 9 15:38:55 2021', queued=0ms, exec=33ms + * ping_monitor_0 on cluster02 'not installed' (5): call=6, status='complete', last-rc-change='Thu Nov 18 13:11:42 2021', queued=0ms, 
exec=0ms + * httpd_monitor_0 on httpd-bundle-1 'invalid parameter' (2): call=1, status='complete', last-rc-change='Wed May 27 15:43:09 2020', queued=0ms, exec=0ms =#=#=#= End test: Complete brief text output, with inactive resources - OK (0) =#=#=#= * Passed: crm_mon - Complete brief text output, with inactive resources =#=#=#= Begin test: Text output of partially active group =#=#=#= @@ -3826,14 +3851,14 @@ Active Resources: =#=#=#= Begin test: Text output of inactive member of partially active group =#=#=#= Cluster Summary: * Stack: corosync - * Current DC: cluster02 (version) - partition with quorum + * Current DC: cluster02 (2) (version) - partition with quorum * Last updated: * Last change: * 4 nodes configured * 16 resource instances configured (1 DISABLED) Node List: - * Online: [ cluster01 cluster02 ] + * Online: [ cluster01 (1) cluster02 (2) ] * GuestOnline: [ httpd-bundle-0@cluster02 httpd-bundle-1@cluster01 ] Active Resources: @@ -3841,27 +3866,27 @@ Active Resources: * dummy-2 (ocf:pacemaker:Dummy): FAILED cluster02 Failed Resource Actions: - * dummy-2 probe on cluster02 returned 'unimplemented feature' at Wed Sep 2 12:17:38 2020 after 33ms + * dummy-2_monitor_0 on cluster02 'unimplemented feature' (3): call=2, status='complete', last-rc-change='Wed Sep 2 12:17:38 2020', queued=0ms, exec=33ms =#=#=#= End test: Text output of inactive member of partially active group - OK (0) =#=#=#= * Passed: crm_mon - Text output of inactive member of partially active group =#=#=#= Begin test: Complete brief text output grouped by node, with inactive resources =#=#=#= Cluster Summary: * Stack: corosync - * Current DC: cluster02 (version) - partition with quorum + * Current DC: cluster02 (2) (version) - partition with quorum * Last updated: * Last change: * 4 nodes configured * 16 resource instances configured (1 DISABLED) Node List: - * Node cluster01: online: + * Node cluster01 (1): online: * Resources: * 1 (ocf:heartbeat:IPaddr2): Active * 1 (ocf:heartbeat:docker): 
Active * 1 (ocf:pacemaker:ping): Active * 1 (ocf:pacemaker:remote): Active * 1 (stonith:fence_xvm): Active - * Node cluster02: online: + * Node cluster02 (2): online: * Resources: * 1 (ocf:heartbeat:IPaddr2): Active * 1 (ocf:heartbeat:docker): Active @@ -3876,20 +3901,20 @@ Node List: Inactive Resources: * Clone Set: ping-clone [ping]: - * Started: [ cluster01 ] - * Stopped: [ cluster02 ] + * ping (ocf:pacemaker:ping): Started cluster01 + * ping (ocf:pacemaker:ping): Stopped * Resource Group: partially-active-group: * 2/4 (ocf:pacemaker:Dummy): Active cluster02 * smart-mon (ocf:pacemaker:HealthSMART): Stopped Node Attributes: - * Node: cluster01: + * Node: cluster01 (1): * pingd : 1000 - * Node: cluster02: + * Node: cluster02 (2): * pingd : 1000 Operations: - * Node: cluster02: + * Node: cluster02 (2): * httpd-bundle-ip-192.168.122.131: migration-threshold=1000000: * (2) start * (3) monitor: interval="60000ms" @@ -3909,7 +3934,7 @@ Operations: * (9) probe * ping: migration-threshold=1000000: * (6) probe - * Node: cluster01: + * Node: cluster01 (1): * Fencing: migration-threshold=1000000: * (15) start * (20) monitor: interval="60000ms" @@ -3933,11 +3958,11 @@ Operations: * (1) probe Failed Resource Actions: - * dummy-2 probe on cluster02 returned 'unimplemented feature' at Wed Sep 2 12:17:38 2020 after 33ms - * dummy-4 probe on cluster02 returned 'not installed' at Wed Sep 2 12:17:38 2020 - * smart-mon probe on cluster02 returned 'not installed' at Tue Nov 9 15:38:55 2021 after 33ms - * ping probe on cluster02 returned 'not installed' at Thu Nov 18 13:11:42 2021 - * httpd probe on httpd-bundle-1 returned 'invalid parameter' at Wed May 27 15:43:09 2020 + * dummy-2_monitor_0 on cluster02 'unimplemented feature' (3): call=2, status='complete', last-rc-change='Wed Sep 2 12:17:38 2020', queued=0ms, exec=33ms + * dummy-4_monitor_0 on cluster02 'not installed' (5): call=2, status='complete', last-rc-change='Wed Sep 2 12:17:38 2020', queued=0ms, exec=0ms + * 
smart-mon_monitor_0 on cluster02 'not installed' (5): call=9, status='complete', last-rc-change='Tue Nov 9 15:38:55 2021', queued=0ms, exec=33ms + * ping_monitor_0 on cluster02 'not installed' (5): call=6, status='complete', last-rc-change='Thu Nov 18 13:11:42 2021', queued=0ms, exec=0ms + * httpd_monitor_0 on httpd-bundle-1 'invalid parameter' (2): call=1, status='complete', last-rc-change='Wed May 27 15:43:09 2020', queued=0ms, exec=0ms =#=#=#= End test: Complete brief text output grouped by node, with inactive resources - OK (0) =#=#=#= * Passed: crm_mon - Complete brief text output grouped by node, with inactive resources =#=#=#= Begin test: Text output of partially active resources, with inactive resources, filtered by node =#=#=#= diff --git a/cts/cts-cli.in b/cts/cts-cli.in index d32bfb7ed1..457816afab 100755 --- a/cts/cts-cli.in +++ b/cts/cts-cli.in @@ -420,7 +420,7 @@ function test_crm_mon() { export CIB_file="$test_home/cli/crm_mon-partial.xml" desc="Text output of partially active resources" - cmd="crm_mon -1" + cmd="crm_mon -1 --show-detail" test_assert $CRM_EX_OK 0 desc="XML output of partially active resources" @@ -428,13 +428,13 @@ function test_crm_mon() { test_assert_validate $CRM_EX_OK 0 desc="Text output of partially active resources, with inactive resources" - cmd="crm_mon -1 -r" + cmd="crm_mon -1 -r --show-detail" test_assert $CRM_EX_OK 0 # XML already includes inactive resources desc="Complete brief text output, with inactive resources" - cmd="crm_mon -1 -r --include=all --brief" + cmd="crm_mon -1 -r --include=all --brief --show-detail" test_assert $CRM_EX_OK 0 # XML does not have a brief output option @@ -452,11 +452,11 @@ function test_crm_mon() { test_assert $CRM_EX_OK 0 desc="Text output of inactive member of partially active group" - cmd="crm_mon -1 --resource=dummy-2" + cmd="crm_mon -1 --resource=dummy-2 --show-detail" test_assert $CRM_EX_OK 0 desc="Complete brief text output grouped by node, with inactive resources" - cmd="crm_mon -1 -r 
--include=all --group-by-node --brief" + cmd="crm_mon -1 -r --include=all --group-by-node --brief --show-detail" test_assert $CRM_EX_OK 0 desc="Text output of partially active resources, with inactive resources, filtered by node" -- 2.27.0 From da14053e5957d84ed0647688d37733adc2f988a3 Mon Sep 17 00:00:00 2001 From: Chris Lumens Date: Mon, 29 Nov 2021 15:05:42 -0500 Subject: [PATCH 12/21] Test: scheduler: Add tests for failed probe operations. This adds identical sets of tests for primitive resources and cloned resources. For the moment, the output reflects the current state of the code. No changes have been made to properly handle these operations yet. Each set has three resources, and each is set up with a slightly different configuration of probe failures: (1) - Maskable probe failure on each node. (2) - Maskable probe failure on one node, successful "not running" probe on the other node. The resource should be started on the node where "not running" was returned. (3) - Maskable probe failure on one node, non-maskable probe failure on the other node. The resource should not be running anywhere, and should be stopped on the node with the non-maskable failure. 
--- cts/cts-scheduler.in | 2 + cts/scheduler/dot/failed-probe-clone.dot | 30 ++++ cts/scheduler/dot/failed-probe-primitive.dot | 4 + cts/scheduler/exp/failed-probe-clone.exp | 141 ++++++++++++++++++ cts/scheduler/exp/failed-probe-primitive.exp | 20 +++ .../scores/failed-probe-clone.scores | 33 ++++ .../scores/failed-probe-primitive.scores | 9 ++ .../summary/failed-probe-clone.summary | 46 ++++++ .../summary/failed-probe-primitive.summary | 27 ++++ cts/scheduler/xml/failed-probe-clone.xml | 110 ++++++++++++++ cts/scheduler/xml/failed-probe-primitive.xml | 71 +++++++++ 11 files changed, 493 insertions(+) create mode 100644 cts/scheduler/dot/failed-probe-clone.dot create mode 100644 cts/scheduler/dot/failed-probe-primitive.dot create mode 100644 cts/scheduler/exp/failed-probe-clone.exp create mode 100644 cts/scheduler/exp/failed-probe-primitive.exp create mode 100644 cts/scheduler/scores/failed-probe-clone.scores create mode 100644 cts/scheduler/scores/failed-probe-primitive.scores create mode 100644 cts/scheduler/summary/failed-probe-clone.summary create mode 100644 cts/scheduler/summary/failed-probe-primitive.summary create mode 100644 cts/scheduler/xml/failed-probe-clone.xml create mode 100644 cts/scheduler/xml/failed-probe-primitive.xml diff --git a/cts/cts-scheduler.in b/cts/cts-scheduler.in index 17fd6cefdf..3abcbc6c9d 100644 --- a/cts/cts-scheduler.in +++ b/cts/cts-scheduler.in @@ -113,6 +113,8 @@ TESTS = [ [ "probe-3", "Probe (pending node)" ], [ "probe-4", "Probe (pending node + stopped resource)" ], [ "probe-pending-node", "Probe (pending node + unmanaged resource)" ], + [ "failed-probe-primitive", "Maskable vs. unmaskable probe failures on primitive resources" ], + [ "failed-probe-clone", "Maskable vs. 
unmaskable probe failures on cloned resources" ], [ "standby", "Standby" ], [ "comments", "Comments" ], ], diff --git a/cts/scheduler/dot/failed-probe-clone.dot b/cts/scheduler/dot/failed-probe-clone.dot new file mode 100644 index 0000000000..90536b46ed --- /dev/null +++ b/cts/scheduler/dot/failed-probe-clone.dot @@ -0,0 +1,30 @@ + digraph "g" { +"ping-1_clear_failcount_0 cluster01" [ style=bold color="green" fontcolor="black"] +"ping-1_clear_failcount_0 cluster02" [ style=bold color="green" fontcolor="black"] +"ping-2-clone_running_0" [ style=bold color="green" fontcolor="orange"] +"ping-2-clone_start_0" -> "ping-2-clone_running_0" [ style = bold] +"ping-2-clone_start_0" -> "ping-2_start_0 cluster02" [ style = bold] +"ping-2-clone_start_0" [ style=bold color="green" fontcolor="orange"] +"ping-2_clear_failcount_0 cluster01" [ style=bold color="green" fontcolor="black"] +"ping-2_clear_failcount_0 cluster02" [ style=bold color="green" fontcolor="black"] +"ping-2_monitor_10000 cluster02" [ style=bold color="green" fontcolor="black"] +"ping-2_start_0 cluster02" -> "ping-2-clone_running_0" [ style = bold] +"ping-2_start_0 cluster02" -> "ping-2_monitor_10000 cluster02" [ style = bold] +"ping-2_start_0 cluster02" [ style=bold color="green" fontcolor="black"] +"ping-3-clone_running_0" [ style=dashed color="red" fontcolor="orange"] +"ping-3-clone_start_0" -> "ping-3-clone_running_0" [ style = dashed] +"ping-3-clone_start_0" -> "ping-3_start_0 " [ style = dashed] +"ping-3-clone_start_0" [ style=dashed color="red" fontcolor="orange"] +"ping-3-clone_stop_0" -> "ping-3-clone_stopped_0" [ style = bold] +"ping-3-clone_stop_0" -> "ping-3_stop_0 cluster01" [ style = bold] +"ping-3-clone_stop_0" [ style=bold color="green" fontcolor="orange"] +"ping-3-clone_stopped_0" -> "ping-3-clone_start_0" [ style = dashed] +"ping-3-clone_stopped_0" [ style=bold color="green" fontcolor="orange"] +"ping-3_clear_failcount_0 cluster01" [ style=bold color="green" fontcolor="black"] 
+"ping-3_clear_failcount_0 cluster02" [ style=bold color="green" fontcolor="black"] +"ping-3_start_0 " -> "ping-3-clone_running_0" [ style = dashed] +"ping-3_start_0 " [ style=dashed color="red" fontcolor="black"] +"ping-3_stop_0 cluster01" -> "ping-3-clone_stopped_0" [ style = bold] +"ping-3_stop_0 cluster01" -> "ping-3_start_0 " [ style = dashed] +"ping-3_stop_0 cluster01" [ style=bold color="green" fontcolor="black"] +} diff --git a/cts/scheduler/dot/failed-probe-primitive.dot b/cts/scheduler/dot/failed-probe-primitive.dot new file mode 100644 index 0000000000..6e0c83216a --- /dev/null +++ b/cts/scheduler/dot/failed-probe-primitive.dot @@ -0,0 +1,4 @@ + digraph "g" { +"dummy-2_start_0 cluster02" [ style=bold color="green" fontcolor="black"] +"dummy-3_stop_0 cluster01" [ style=bold color="green" fontcolor="black"] +} diff --git a/cts/scheduler/exp/failed-probe-clone.exp b/cts/scheduler/exp/failed-probe-clone.exp new file mode 100644 index 0000000000..6be18935bf --- /dev/null +++ b/cts/scheduler/exp/failed-probe-clone.exp @@ -0,0 +1,141 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/cts/scheduler/exp/failed-probe-primitive.exp b/cts/scheduler/exp/failed-probe-primitive.exp new file mode 100644 index 0000000000..d0d8aa44dc --- /dev/null +++ b/cts/scheduler/exp/failed-probe-primitive.exp @@ -0,0 +1,20 @@ + + + + + + + + + + + + + + + + + + + + diff --git a/cts/scheduler/scores/failed-probe-clone.scores b/cts/scheduler/scores/failed-probe-clone.scores new file mode 100644 index 0000000000..7418b7f153 --- /dev/null +++ b/cts/scheduler/scores/failed-probe-clone.scores @@ -0,0 +1,33 @@ + +pcmk__clone_allocate: ping-1-clone allocation score on cluster01: -INFINITY +pcmk__clone_allocate: ping-1-clone allocation score on 
cluster02: -INFINITY +pcmk__clone_allocate: ping-1:0 allocation score on cluster01: -INFINITY +pcmk__clone_allocate: ping-1:0 allocation score on cluster02: -INFINITY +pcmk__clone_allocate: ping-1:1 allocation score on cluster01: -INFINITY +pcmk__clone_allocate: ping-1:1 allocation score on cluster02: -INFINITY +pcmk__clone_allocate: ping-2-clone allocation score on cluster01: -INFINITY +pcmk__clone_allocate: ping-2-clone allocation score on cluster02: 0 +pcmk__clone_allocate: ping-2:0 allocation score on cluster01: -INFINITY +pcmk__clone_allocate: ping-2:0 allocation score on cluster02: 0 +pcmk__clone_allocate: ping-2:1 allocation score on cluster01: -INFINITY +pcmk__clone_allocate: ping-2:1 allocation score on cluster02: 0 +pcmk__clone_allocate: ping-3-clone allocation score on cluster01: -INFINITY +pcmk__clone_allocate: ping-3-clone allocation score on cluster02: -INFINITY +pcmk__clone_allocate: ping-3:0 allocation score on cluster01: -INFINITY +pcmk__clone_allocate: ping-3:0 allocation score on cluster02: -INFINITY +pcmk__clone_allocate: ping-3:1 allocation score on cluster01: -INFINITY +pcmk__clone_allocate: ping-3:1 allocation score on cluster02: -INFINITY +pcmk__native_allocate: Fencing allocation score on cluster01: 0 +pcmk__native_allocate: Fencing allocation score on cluster02: 0 +pcmk__native_allocate: ping-1:0 allocation score on cluster01: -INFINITY +pcmk__native_allocate: ping-1:0 allocation score on cluster02: -INFINITY +pcmk__native_allocate: ping-1:1 allocation score on cluster01: -INFINITY +pcmk__native_allocate: ping-1:1 allocation score on cluster02: -INFINITY +pcmk__native_allocate: ping-2:0 allocation score on cluster01: -INFINITY +pcmk__native_allocate: ping-2:0 allocation score on cluster02: 0 +pcmk__native_allocate: ping-2:1 allocation score on cluster01: -INFINITY +pcmk__native_allocate: ping-2:1 allocation score on cluster02: -INFINITY +pcmk__native_allocate: ping-3:0 allocation score on cluster01: -INFINITY +pcmk__native_allocate: 
ping-3:0 allocation score on cluster02: -INFINITY +pcmk__native_allocate: ping-3:1 allocation score on cluster01: -INFINITY +pcmk__native_allocate: ping-3:1 allocation score on cluster02: -INFINITY diff --git a/cts/scheduler/scores/failed-probe-primitive.scores b/cts/scheduler/scores/failed-probe-primitive.scores new file mode 100644 index 0000000000..f313029451 --- /dev/null +++ b/cts/scheduler/scores/failed-probe-primitive.scores @@ -0,0 +1,9 @@ + +pcmk__native_allocate: Fencing allocation score on cluster01: 0 +pcmk__native_allocate: Fencing allocation score on cluster02: 0 +pcmk__native_allocate: dummy-1 allocation score on cluster01: -INFINITY +pcmk__native_allocate: dummy-1 allocation score on cluster02: -INFINITY +pcmk__native_allocate: dummy-2 allocation score on cluster01: -INFINITY +pcmk__native_allocate: dummy-2 allocation score on cluster02: 0 +pcmk__native_allocate: dummy-3 allocation score on cluster01: -INFINITY +pcmk__native_allocate: dummy-3 allocation score on cluster02: -INFINITY diff --git a/cts/scheduler/summary/failed-probe-clone.summary b/cts/scheduler/summary/failed-probe-clone.summary new file mode 100644 index 0000000000..ca15c302aa --- /dev/null +++ b/cts/scheduler/summary/failed-probe-clone.summary @@ -0,0 +1,46 @@ +Current cluster status: + * Node List: + * Online: [ cluster01 cluster02 ] + + * Full List of Resources: + * Fencing (stonith:fence_xvm): Started cluster01 + * Clone Set: ping-1-clone [ping-1]: + * Stopped: [ cluster01 cluster02 ] + * Clone Set: ping-2-clone [ping-2]: + * Stopped: [ cluster01 cluster02 ] + * Clone Set: ping-3-clone [ping-3]: + * ping-3 (ocf:pacemaker:ping): FAILED cluster01 + * Stopped: [ cluster02 ] + +Transition Summary: + * Start ping-2:0 ( cluster02 ) + * Stop ping-3:0 ( cluster01 ) due to node availability + +Executing Cluster Transition: + * Cluster action: clear_failcount for ping-1 on cluster02 + * Cluster action: clear_failcount for ping-1 on cluster01 + * Cluster action: clear_failcount for ping-2 
on cluster02 + * Cluster action: clear_failcount for ping-2 on cluster01 + * Pseudo action: ping-2-clone_start_0 + * Cluster action: clear_failcount for ping-3 on cluster01 + * Cluster action: clear_failcount for ping-3 on cluster02 + * Pseudo action: ping-3-clone_stop_0 + * Resource action: ping-2 start on cluster02 + * Pseudo action: ping-2-clone_running_0 + * Resource action: ping-3 stop on cluster01 + * Pseudo action: ping-3-clone_stopped_0 + * Resource action: ping-2 monitor=10000 on cluster02 + +Revised Cluster Status: + * Node List: + * Online: [ cluster01 cluster02 ] + + * Full List of Resources: + * Fencing (stonith:fence_xvm): Started cluster01 + * Clone Set: ping-1-clone [ping-1]: + * Stopped: [ cluster01 cluster02 ] + * Clone Set: ping-2-clone [ping-2]: + * Started: [ cluster02 ] + * Stopped: [ cluster01 ] + * Clone Set: ping-3-clone [ping-3]: + * Stopped: [ cluster01 cluster02 ] diff --git a/cts/scheduler/summary/failed-probe-primitive.summary b/cts/scheduler/summary/failed-probe-primitive.summary new file mode 100644 index 0000000000..a634e7f00b --- /dev/null +++ b/cts/scheduler/summary/failed-probe-primitive.summary @@ -0,0 +1,27 @@ +Current cluster status: + * Node List: + * Online: [ cluster01 cluster02 ] + + * Full List of Resources: + * Fencing (stonith:fence_xvm): Started cluster01 + * dummy-1 (ocf:pacemaker:Dummy): Stopped + * dummy-2 (ocf:pacemaker:Dummy): Stopped + * dummy-3 (ocf:pacemaker:Dummy): FAILED cluster01 + +Transition Summary: + * Start dummy-2 ( cluster02 ) + * Stop dummy-3 ( cluster01 ) due to node availability + +Executing Cluster Transition: + * Resource action: dummy-2 start on cluster02 + * Resource action: dummy-3 stop on cluster01 + +Revised Cluster Status: + * Node List: + * Online: [ cluster01 cluster02 ] + + * Full List of Resources: + * Fencing (stonith:fence_xvm): Started cluster01 + * dummy-1 (ocf:pacemaker:Dummy): Stopped + * dummy-2 (ocf:pacemaker:Dummy): Started cluster02 + * dummy-3 (ocf:pacemaker:Dummy): Stopped 
diff --git a/cts/scheduler/xml/failed-probe-clone.xml b/cts/scheduler/xml/failed-probe-clone.xml new file mode 100644 index 0000000000..f677585bab --- /dev/null +++ b/cts/scheduler/xml/failed-probe-clone.xml @@ -0,0 +1,110 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/cts/scheduler/xml/failed-probe-primitive.xml b/cts/scheduler/xml/failed-probe-primitive.xml new file mode 100644 index 0000000000..0c2f6416f5 --- /dev/null +++ b/cts/scheduler/xml/failed-probe-primitive.xml @@ -0,0 +1,71 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + -- 2.27.0 From 271d50e7d6b0ee5ef670b571c6d7aae9272b75ad Mon Sep 17 00:00:00 2001 From: Chris Lumens Date: Thu, 11 Nov 2021 13:57:05 -0500 Subject: [PATCH 13/21] Feature: scheduler: Don't output failed resource probes... in the crm_mon "Failed Resource Actions" section. It is expected that these one-off probes will fail, in which case displaying them in that section can just come across as confusing to the user. And update the crm_mon test output to account for these changes. 
See: rhbz#1506372 --- cts/cli/regression.crm_mon.exp | 20 -------------------- lib/pengine/pe_output.c | 4 ++++ 2 files changed, 4 insertions(+), 20 deletions(-) diff --git a/cts/cli/regression.crm_mon.exp b/cts/cli/regression.crm_mon.exp index d7b9d98e2c..b1643f8b29 100644 --- a/cts/cli/regression.crm_mon.exp +++ b/cts/cli/regression.crm_mon.exp @@ -3498,10 +3498,6 @@ Active Resources: Failed Resource Actions: * dummy-2_monitor_0 on cluster02 'unimplemented feature' (3): call=2, status='complete', last-rc-change='Wed Sep 2 12:17:38 2020', queued=0ms, exec=33ms - * dummy-4_monitor_0 on cluster02 'not installed' (5): call=2, status='complete', last-rc-change='Wed Sep 2 12:17:38 2020', queued=0ms, exec=0ms - * smart-mon_monitor_0 on cluster02 'not installed' (5): call=9, status='complete', last-rc-change='Tue Nov 9 15:38:55 2021', queued=0ms, exec=33ms - * ping_monitor_0 on cluster02 'not installed' (5): call=6, status='complete', last-rc-change='Thu Nov 18 13:11:42 2021', queued=0ms, exec=0ms - * httpd_monitor_0 on httpd-bundle-1 'invalid parameter' (2): call=1, status='complete', last-rc-change='Wed May 27 15:43:09 2020', queued=0ms, exec=0ms =#=#=#= End test: Text output of partially active resources - OK (0) =#=#=#= * Passed: crm_mon - Text output of partially active resources =#=#=#= Begin test: XML output of partially active resources =#=#=#= @@ -3646,10 +3642,6 @@ Failed Resource Actions: - - - - @@ -3693,10 +3685,6 @@ Full List of Resources: Failed Resource Actions: * dummy-2_monitor_0 on cluster02 'unimplemented feature' (3): call=2, status='complete', last-rc-change='Wed Sep 2 12:17:38 2020', queued=0ms, exec=33ms - * dummy-4_monitor_0 on cluster02 'not installed' (5): call=2, status='complete', last-rc-change='Wed Sep 2 12:17:38 2020', queued=0ms, exec=0ms - * smart-mon_monitor_0 on cluster02 'not installed' (5): call=9, status='complete', last-rc-change='Tue Nov 9 15:38:55 2021', queued=0ms, exec=33ms - * ping_monitor_0 on cluster02 'not installed' (5): 
call=6, status='complete', last-rc-change='Thu Nov 18 13:11:42 2021', queued=0ms, exec=0ms - * httpd_monitor_0 on httpd-bundle-1 'invalid parameter' (2): call=1, status='complete', last-rc-change='Wed May 27 15:43:09 2020', queued=0ms, exec=0ms =#=#=#= End test: Text output of partially active resources, with inactive resources - OK (0) =#=#=#= * Passed: crm_mon - Text output of partially active resources, with inactive resources =#=#=#= Begin test: Complete brief text output, with inactive resources =#=#=#= @@ -3784,10 +3772,6 @@ Operations: Failed Resource Actions: * dummy-2_monitor_0 on cluster02 'unimplemented feature' (3): call=2, status='complete', last-rc-change='Wed Sep 2 12:17:38 2020', queued=0ms, exec=33ms - * dummy-4_monitor_0 on cluster02 'not installed' (5): call=2, status='complete', last-rc-change='Wed Sep 2 12:17:38 2020', queued=0ms, exec=0ms - * smart-mon_monitor_0 on cluster02 'not installed' (5): call=9, status='complete', last-rc-change='Tue Nov 9 15:38:55 2021', queued=0ms, exec=33ms - * ping_monitor_0 on cluster02 'not installed' (5): call=6, status='complete', last-rc-change='Thu Nov 18 13:11:42 2021', queued=0ms, exec=0ms - * httpd_monitor_0 on httpd-bundle-1 'invalid parameter' (2): call=1, status='complete', last-rc-change='Wed May 27 15:43:09 2020', queued=0ms, exec=0ms =#=#=#= End test: Complete brief text output, with inactive resources - OK (0) =#=#=#= * Passed: crm_mon - Complete brief text output, with inactive resources =#=#=#= Begin test: Text output of partially active group =#=#=#= @@ -3959,10 +3943,6 @@ Operations: Failed Resource Actions: * dummy-2_monitor_0 on cluster02 'unimplemented feature' (3): call=2, status='complete', last-rc-change='Wed Sep 2 12:17:38 2020', queued=0ms, exec=33ms - * dummy-4_monitor_0 on cluster02 'not installed' (5): call=2, status='complete', last-rc-change='Wed Sep 2 12:17:38 2020', queued=0ms, exec=0ms - * smart-mon_monitor_0 on cluster02 'not installed' (5): call=9, status='complete', 
last-rc-change='Tue Nov 9 15:38:55 2021', queued=0ms, exec=33ms - * ping_monitor_0 on cluster02 'not installed' (5): call=6, status='complete', last-rc-change='Thu Nov 18 13:11:42 2021', queued=0ms, exec=0ms - * httpd_monitor_0 on httpd-bundle-1 'invalid parameter' (2): call=1, status='complete', last-rc-change='Wed May 27 15:43:09 2020', queued=0ms, exec=0ms =#=#=#= End test: Complete brief text output grouped by node, with inactive resources - OK (0) =#=#=#= * Passed: crm_mon - Complete brief text output grouped by node, with inactive resources =#=#=#= Begin test: Text output of partially active resources, with inactive resources, filtered by node =#=#=#= diff --git a/lib/pengine/pe_output.c b/lib/pengine/pe_output.c index 715e001d51..84684598dd 100644 --- a/lib/pengine/pe_output.c +++ b/lib/pengine/pe_output.c @@ -1370,6 +1370,10 @@ failed_action_list(pcmk__output_t *out, va_list args) { continue; } + if (pcmk_xe_mask_probe_failure(xml_op)) { + continue; + } + id = crm_element_value(xml_op, XML_LRM_ATTR_TASK_KEY); if (parse_op_key(id ? id : ID(xml_op), &rsc, NULL, NULL) == FALSE) { continue; -- 2.27.0 From 90f641b9223c64701d494297ce3dd3382365acb8 Mon Sep 17 00:00:00 2001 From: Chris Lumens Date: Tue, 9 Nov 2021 10:11:19 -0500 Subject: [PATCH 14/21] Feature: scheduler: Add a function for finding a failed probe action... for a given resource ID. Optionally, a node ID can also be given to restrict the failed probe action to one run on the given node. Otherwise, just the first failed probe action for the resource ID will be returned. 
See: rhbz#1506372 --- include/crm/pengine/internal.h | 2 ++ lib/pengine/utils.c | 42 ++++++++++++++++++++++++++++++++++ 2 files changed, 44 insertions(+) diff --git a/include/crm/pengine/internal.h b/include/crm/pengine/internal.h index 8c8fbaca90..58dd2e8727 100644 --- a/include/crm/pengine/internal.h +++ b/include/crm/pengine/internal.h @@ -574,4 +574,6 @@ gboolean pe__clone_is_filtered(pe_resource_t *rsc, GList *only_rsc, gboolean che gboolean pe__group_is_filtered(pe_resource_t *rsc, GList *only_rsc, gboolean check_parent); gboolean pe__native_is_filtered(pe_resource_t *rsc, GList *only_rsc, gboolean check_parent); +xmlNode *pe__failed_probe_for_rsc(pe_resource_t *rsc, const char *name); + #endif diff --git a/lib/pengine/utils.c b/lib/pengine/utils.c index 07753e173a..3151f0120b 100644 --- a/lib/pengine/utils.c +++ b/lib/pengine/utils.c @@ -2569,3 +2569,45 @@ pe__build_rsc_list(pe_working_set_t *data_set, const char *s) { return resources; } + +xmlNode * +pe__failed_probe_for_rsc(pe_resource_t *rsc, const char *name) +{ + const char *rsc_id = rsc->id; + + for (xmlNode *xml_op = pcmk__xml_first_child(rsc->cluster->failed); xml_op != NULL; + xml_op = pcmk__xml_next(xml_op)) { + const char *value = NULL; + char *op_id = NULL; + + /* This resource operation is not a failed probe. */ + if (!pcmk_xe_mask_probe_failure(xml_op)) { + continue; + } + + /* This resource operation was not run on the given node. Note that if name is + * NULL, this will always succeed. + */ + value = crm_element_value(xml_op, XML_LRM_ATTR_TARGET); + if (value == NULL || !pcmk__str_eq(value, name, pcmk__str_casei|pcmk__str_null_matches)) { + continue; + } + + /* This resource operation has no operation_key. */ + value = crm_element_value(xml_op, XML_LRM_ATTR_TASK_KEY); + if (!parse_op_key(value ? value : ID(xml_op), &op_id, NULL, NULL)) { + continue; + } + + /* This resource operation's ID does not match the rsc_id we are looking for. 
*/ + if (!pcmk__str_eq(op_id, rsc_id, pcmk__str_none)) { + free(op_id); + continue; + } + + free(op_id); + return xml_op; + } + + return NULL; +} -- 2.27.0 From 2ad9774fe994554243078b131799fed0d1a6dffd Mon Sep 17 00:00:00 2001 From: Chris Lumens Date: Tue, 9 Nov 2021 15:43:24 -0500 Subject: [PATCH 15/21] Feature: scheduler: Display the reason why a native rsc probe failed. If inactive resources are being shown, add an extra blurb of text to any stopped resources that have a failed probe action indicating why the probe failed. And then add a new primitive resource to crm_mon-partial.xml with a failed probe operation and update the expected test output. See: rhbz#1506372 --- cts/cli/regression.crm_mon.exp | 10 +++++----- cts/scheduler/summary/failed-probe-primitive.summary | 8 ++++---- cts/scheduler/summary/multiply-active-stonith.summary | 2 +- lib/pengine/native.c | 11 +++++++++++ 4 files changed, 21 insertions(+), 10 deletions(-) diff --git a/cts/cli/regression.crm_mon.exp b/cts/cli/regression.crm_mon.exp index b1643f8b29..4333caa11c 100644 --- a/cts/cli/regression.crm_mon.exp +++ b/cts/cli/regression.crm_mon.exp @@ -3680,8 +3680,8 @@ Full List of Resources: * dummy-1 (ocf:pacemaker:Dummy): Started cluster02 * dummy-2 (ocf:pacemaker:Dummy): FAILED cluster02 * dummy-3 (ocf:pacemaker:Dummy): Stopped (disabled) - * dummy-4 (ocf:pacemaker:Dummy): Stopped - * smart-mon (ocf:pacemaker:HealthSMART): Stopped + * dummy-4 (ocf:pacemaker:Dummy): Stopped (not installed) + * smart-mon (ocf:pacemaker:HealthSMART): Stopped (not installed) Failed Resource Actions: * dummy-2_monitor_0 on cluster02 'unimplemented feature' (3): call=2, status='complete', last-rc-change='Wed Sep 2 12:17:38 2020', queued=0ms, exec=33ms @@ -3811,7 +3811,7 @@ Full List of Resources: * dummy-1 (ocf:pacemaker:Dummy): Started cluster02 * dummy-2 (ocf:pacemaker:Dummy): FAILED cluster02 * dummy-3 (ocf:pacemaker:Dummy): Stopped (disabled) - * dummy-4 (ocf:pacemaker:Dummy): Stopped + * dummy-4 
(ocf:pacemaker:Dummy): Stopped (not installed) =#=#=#= End test: Text output of partially active group, with inactive resources - OK (0) =#=#=#= * Passed: crm_mon - Text output of partially active group, with inactive resources =#=#=#= Begin test: Text output of active member of partially active group =#=#=#= @@ -3889,7 +3889,7 @@ Inactive Resources: * ping (ocf:pacemaker:ping): Stopped * Resource Group: partially-active-group: * 2/4 (ocf:pacemaker:Dummy): Active cluster02 - * smart-mon (ocf:pacemaker:HealthSMART): Stopped + * smart-mon (ocf:pacemaker:HealthSMART): Stopped (not installed) Node Attributes: * Node: cluster01 (1): @@ -3963,7 +3963,7 @@ Full List of Resources: * Fencing (stonith:fence_xvm): Started cluster01 * Container bundle set: httpd-bundle [pcmk:http]: * httpd-bundle-1 (192.168.122.132) (ocf:heartbeat:apache): FAILED cluster01 - * smart-mon (ocf:pacemaker:HealthSMART): Stopped + * smart-mon (ocf:pacemaker:HealthSMART): Stopped (not installed) =#=#=#= End test: Text output of partially active resources, with inactive resources, filtered by node - OK (0) =#=#=#= * Passed: crm_mon - Text output of partially active resources, with inactive resources, filtered by node =#=#=#= Begin test: Text output of partially active resources, filtered by node =#=#=#= diff --git a/cts/scheduler/summary/failed-probe-primitive.summary b/cts/scheduler/summary/failed-probe-primitive.summary index a634e7f00b..ea8edae494 100644 --- a/cts/scheduler/summary/failed-probe-primitive.summary +++ b/cts/scheduler/summary/failed-probe-primitive.summary @@ -4,8 +4,8 @@ Current cluster status: * Full List of Resources: * Fencing (stonith:fence_xvm): Started cluster01 - * dummy-1 (ocf:pacemaker:Dummy): Stopped - * dummy-2 (ocf:pacemaker:Dummy): Stopped + * dummy-1 (ocf:pacemaker:Dummy): Stopped (not installed) + * dummy-2 (ocf:pacemaker:Dummy): Stopped (not installed) * dummy-3 (ocf:pacemaker:Dummy): FAILED cluster01 Transition Summary: @@ -22,6 +22,6 @@ Revised Cluster Status: * 
Full List of Resources: * Fencing (stonith:fence_xvm): Started cluster01 - * dummy-1 (ocf:pacemaker:Dummy): Stopped + * dummy-1 (ocf:pacemaker:Dummy): Stopped (not installed) * dummy-2 (ocf:pacemaker:Dummy): Started cluster02 - * dummy-3 (ocf:pacemaker:Dummy): Stopped + * dummy-3 (ocf:pacemaker:Dummy): Stopped (not installed) diff --git a/cts/scheduler/summary/multiply-active-stonith.summary b/cts/scheduler/summary/multiply-active-stonith.summary index 8ce21d68ee..ec37de03b0 100644 --- a/cts/scheduler/summary/multiply-active-stonith.summary +++ b/cts/scheduler/summary/multiply-active-stonith.summary @@ -25,4 +25,4 @@ Revised Cluster Status: * Full List of Resources: * fencer (stonith:fence_ipmilan): Started node3 - * rsc1 (lsb:rsc1): Stopped + * rsc1 (lsb:rsc1): Stopped (not installed) diff --git a/lib/pengine/native.c b/lib/pengine/native.c index 36121c527f..a95c90c09a 100644 --- a/lib/pengine/native.c +++ b/lib/pengine/native.c @@ -599,6 +599,17 @@ pcmk__native_output_string(pe_resource_t *rsc, const char *name, pe_node_t *node g_string_append_printf(outstr, " %s", node->details->uname); } + // Failed probe operation + if (native_displayable_role(rsc) == RSC_ROLE_STOPPED) { + xmlNode *probe_op = pe__failed_probe_for_rsc(rsc, node ? node->details->uname : NULL); + if (probe_op != NULL) { + int rc; + + pcmk__scan_min_int(crm_element_value(probe_op, XML_LRM_ATTR_RC), &rc, 0); + g_string_append_printf(outstr, " (%s) ", services_ocf_exitcode_str(rc)); + } + } + // Flags, as: ( [...]) if (node && !(node->details->online) && node->details->unclean) { have_flags = add_output_flag(outstr, "UNCLEAN", have_flags); -- 2.27.0 From b9ca2e834ee01b35c03f153438ef8828b609fb38 Mon Sep 17 00:00:00 2001 From: Chris Lumens Date: Thu, 18 Nov 2021 10:41:42 -0500 Subject: [PATCH 16/21] Refactor: scheduler: Rearrange pe__clone_default. Instead of the single stopped list, maintain a hash table where the keys are nodes and the values are the status of the node. 
For now, this is just "Stopped" or "Stopped (disabled)" but in the future will be expanded to cover failed probe operations. --- lib/pengine/clone.c | 103 +++++++++++++++++++++++++++++++++++--------- 1 file changed, 82 insertions(+), 21 deletions(-) diff --git a/lib/pengine/clone.c b/lib/pengine/clone.c index 5569c6b6e9..58fb24d24e 100644 --- a/lib/pengine/clone.c +++ b/lib/pengine/clone.c @@ -28,6 +28,55 @@ #define UNPROMOTED_INSTANCES RSC_ROLE_UNPROMOTED_S #endif +static GList * +sorted_hash_table_values(GHashTable *table) +{ + GList *retval = NULL; + GHashTableIter iter; + gpointer key, value; + + g_hash_table_iter_init(&iter, table); + while (g_hash_table_iter_next(&iter, &key, &value)) { + if (!g_list_find_custom(retval, value, (GCompareFunc) strcmp)) { + retval = g_list_prepend(retval, (char *) value); + } + } + + retval = g_list_sort(retval, (GCompareFunc) strcmp); + return retval; +} + +static GList * +nodes_with_status(GHashTable *table, const char *status) +{ + GList *retval = NULL; + GHashTableIter iter; + gpointer key, value; + + g_hash_table_iter_init(&iter, table); + while (g_hash_table_iter_next(&iter, &key, &value)) { + if (!strcmp((char *) value, status)) { + retval = g_list_prepend(retval, key); + } + } + + retval = g_list_sort(retval, (GCompareFunc) pcmk__numeric_strcasecmp); + return retval; +} + +static char * +node_list_to_str(GList *list) +{ + char *retval = NULL; + size_t len = 0; + + for (GList *iter = list; iter != NULL; iter = iter->next) { + pcmk__add_word(&retval, &len, (char *) iter->data); + } + + return retval; +} + static void clone_header(pcmk__output_t *out, int *rc, pe_resource_t *rsc, clone_variant_data_t *clone_data) { @@ -710,10 +759,10 @@ pe__clone_default(pcmk__output_t *out, va_list args) GList *only_node = va_arg(args, GList *); GList *only_rsc = va_arg(args, GList *); + GHashTable *stopped = pcmk__strkey_table(free, free); + char *list_text = NULL; - char *stopped_list = NULL; size_t list_text_len = 0; - size_t 
stopped_list_len = 0; GList *promoted_list = NULL; GList *started_list = NULL; @@ -768,7 +817,7 @@ pe__clone_default(pcmk__output_t *out, va_list args) // List stopped instances when requested (except orphans) if (!pcmk_is_set(child_rsc->flags, pe_rsc_orphan) && pcmk_is_set(show_opts, pcmk_show_inactive_rscs)) { - pcmk__add_word(&stopped_list, &stopped_list_len, child_rsc->id); + g_hash_table_insert(stopped, strdup(child_rsc->id), strdup("Stopped")); } } else if (is_set_recursive(child_rsc, pe_rsc_orphan, TRUE) @@ -822,7 +871,7 @@ pe__clone_default(pcmk__output_t *out, va_list args) } if (pcmk_is_set(show_opts, pcmk_show_clone_detail)) { - free(stopped_list); + g_hash_table_destroy(stopped); PCMK__OUTPUT_LIST_FOOTER(out, rc); return pcmk_rc_ok; } @@ -890,23 +939,15 @@ pe__clone_default(pcmk__output_t *out, va_list args) } if (pcmk_is_set(show_opts, pcmk_show_inactive_rscs)) { - const char *state = "Stopped"; - enum rsc_role_e role = configured_role(rsc); - - if (role == RSC_ROLE_STOPPED) { - state = "Stopped (disabled)"; - } - if (!pcmk_is_set(rsc->flags, pe_rsc_unique) && (clone_data->clone_max > active_instances)) { GList *nIter; GList *list = g_hash_table_get_values(rsc->allowed_nodes); - /* Custom stopped list for non-unique clones */ - free(stopped_list); - stopped_list = NULL; - stopped_list_len = 0; + /* Custom stopped table for non-unique clones */ + g_hash_table_destroy(stopped); + stopped = pcmk__strkey_table(free, free); if (list == NULL) { /* Clusters with symmetrical=false haven't calculated allowed_nodes yet @@ -922,19 +963,39 @@ pe__clone_default(pcmk__output_t *out, va_list args) if (pe_find_node(rsc->running_on, node->details->uname) == NULL && pcmk__str_in_list(node->details->uname, only_node, pcmk__str_star_matches|pcmk__str_casei)) { - pcmk__add_word(&stopped_list, &stopped_list_len, - node->details->uname); + const char *state = "Stopped"; + + if (configured_role(rsc) == RSC_ROLE_STOPPED) { + state = "Stopped (disabled)"; + } + + 
g_hash_table_insert(stopped, strdup(node->details->uname), + strdup(state)); } } g_list_free(list); } - if (stopped_list != NULL) { + if (g_hash_table_size(stopped) > 0) { + GList *list = sorted_hash_table_values(stopped); + clone_header(out, &rc, rsc, clone_data); - out->list_item(out, NULL, "%s: [ %s ]", state, stopped_list); - free(stopped_list); - stopped_list_len = 0; + for (GList *status_iter = list; status_iter != NULL; status_iter = status_iter->next) { + const char *status = status_iter->data; + GList *nodes = nodes_with_status(stopped, status); + char *str = node_list_to_str(nodes); + + if (str != NULL) { + out->list_item(out, NULL, "%s: [ %s ]", status, str); + free(str); + } + + g_list_free(nodes); + } + + g_list_free(list); + g_hash_table_destroy(stopped); /* If there are no instances of this clone (perhaps because there are no * nodes configured), simply output the clone header by itself. This can -- 2.27.0 From 0228a64cea412936fb8ee91b0f83f9800048d3ba Mon Sep 17 00:00:00 2001 From: Chris Lumens Date: Fri, 19 Nov 2021 10:06:18 -0500 Subject: [PATCH 17/21] Feature: scheduler: Display the reason why a clone rsc probe failed. This is similar to the previous commit that adds reasons for primitive resources. 
See: rhbz#1506372 --- cts/cli/regression.crm_mon.exp | 8 +++---- .../summary/failed-probe-clone.summary | 14 +++++++------ include/crm/pengine/internal.h | 2 ++ lib/pengine/clone.c | 21 +++++++++++++++++-- lib/pengine/utils.c | 7 +++++++ 5 files changed, 40 insertions(+), 12 deletions(-) diff --git a/cts/cli/regression.crm_mon.exp b/cts/cli/regression.crm_mon.exp index 4333caa11c..5688500ce5 100644 --- a/cts/cli/regression.crm_mon.exp +++ b/cts/cli/regression.crm_mon.exp @@ -3479,7 +3479,7 @@ Node List: Active Resources: * Clone Set: ping-clone [ping]: * ping (ocf:pacemaker:ping): Started cluster01 - * ping (ocf:pacemaker:ping): Stopped + * ping (ocf:pacemaker:ping): Stopped (not installed) * Fencing (stonith:fence_xvm): Started cluster01 * Container bundle set: httpd-bundle [pcmk:http]: * Replica[0] @@ -3663,7 +3663,7 @@ Node List: Full List of Resources: * Clone Set: ping-clone [ping]: * ping (ocf:pacemaker:ping): Started cluster01 - * ping (ocf:pacemaker:ping): Stopped + * ping (ocf:pacemaker:ping): Stopped (not installed) * Fencing (stonith:fence_xvm): Started cluster01 * Container bundle set: httpd-bundle [pcmk:http]: * Replica[0] @@ -3705,7 +3705,7 @@ Full List of Resources: * 1/1 (stonith:fence_xvm): Active cluster01 * Clone Set: ping-clone [ping]: * ping (ocf:pacemaker:ping): Started cluster01 - * ping (ocf:pacemaker:ping): Stopped + * ping (ocf:pacemaker:ping): Stopped (not installed) * Container bundle set: httpd-bundle [pcmk:http]: * Replica[0] * httpd-bundle-ip-192.168.122.131 (ocf:heartbeat:IPaddr2): Started cluster02 @@ -3886,7 +3886,7 @@ Node List: Inactive Resources: * Clone Set: ping-clone [ping]: * ping (ocf:pacemaker:ping): Started cluster01 - * ping (ocf:pacemaker:ping): Stopped + * ping (ocf:pacemaker:ping): Stopped (not installed) * Resource Group: partially-active-group: * 2/4 (ocf:pacemaker:Dummy): Active cluster02 * smart-mon (ocf:pacemaker:HealthSMART): Stopped (not installed) diff --git a/cts/scheduler/summary/failed-probe-clone.summary 
b/cts/scheduler/summary/failed-probe-clone.summary index ca15c302aa..febee14400 100644 --- a/cts/scheduler/summary/failed-probe-clone.summary +++ b/cts/scheduler/summary/failed-probe-clone.summary @@ -5,12 +5,13 @@ Current cluster status: * Full List of Resources: * Fencing (stonith:fence_xvm): Started cluster01 * Clone Set: ping-1-clone [ping-1]: - * Stopped: [ cluster01 cluster02 ] + * Stopped (not installed): [ cluster01 cluster02 ] * Clone Set: ping-2-clone [ping-2]: - * Stopped: [ cluster01 cluster02 ] + * Stopped: [ cluster02 ] + * Stopped (not installed): [ cluster01 ] * Clone Set: ping-3-clone [ping-3]: * ping-3 (ocf:pacemaker:ping): FAILED cluster01 - * Stopped: [ cluster02 ] + * Stopped (not installed): [ cluster02 ] Transition Summary: * Start ping-2:0 ( cluster02 ) @@ -38,9 +39,10 @@ Revised Cluster Status: * Full List of Resources: * Fencing (stonith:fence_xvm): Started cluster01 * Clone Set: ping-1-clone [ping-1]: - * Stopped: [ cluster01 cluster02 ] + * Stopped (not installed): [ cluster01 cluster02 ] * Clone Set: ping-2-clone [ping-2]: * Started: [ cluster02 ] - * Stopped: [ cluster01 ] + * Stopped (not installed): [ cluster01 ] * Clone Set: ping-3-clone [ping-3]: - * Stopped: [ cluster01 cluster02 ] + * Stopped: [ cluster01 ] + * Stopped (not installed): [ cluster02 ] diff --git a/include/crm/pengine/internal.h b/include/crm/pengine/internal.h index 58dd2e8727..2b20da6e5f 100644 --- a/include/crm/pengine/internal.h +++ b/include/crm/pengine/internal.h @@ -576,4 +576,6 @@ gboolean pe__native_is_filtered(pe_resource_t *rsc, GList *only_rsc, gboolean ch xmlNode *pe__failed_probe_for_rsc(pe_resource_t *rsc, const char *name); +const char *pe__clone_child_id(pe_resource_t *rsc); + #endif diff --git a/lib/pengine/clone.c b/lib/pengine/clone.c index 58fb24d24e..ef4bdc0edf 100644 --- a/lib/pengine/clone.c +++ b/lib/pengine/clone.c @@ -963,14 +963,23 @@ pe__clone_default(pcmk__output_t *out, va_list args) if (pe_find_node(rsc->running_on, 
node->details->uname) == NULL && pcmk__str_in_list(node->details->uname, only_node, pcmk__str_star_matches|pcmk__str_casei)) { + xmlNode *probe_op = pe__failed_probe_for_rsc(rsc, node->details->uname); const char *state = "Stopped"; if (configured_role(rsc) == RSC_ROLE_STOPPED) { state = "Stopped (disabled)"; } - g_hash_table_insert(stopped, strdup(node->details->uname), - strdup(state)); + if (probe_op != NULL) { + int rc; + + pcmk__scan_min_int(crm_element_value(probe_op, XML_LRM_ATTR_RC), &rc, 0); + g_hash_table_insert(stopped, strdup(node->details->uname), + crm_strdup_printf("Stopped (%s)", services_ocf_exitcode_str(rc))); + } else { + g_hash_table_insert(stopped, strdup(node->details->uname), + strdup(state)); + } } } g_list_free(list); @@ -1113,3 +1122,11 @@ pe__clone_is_filtered(pe_resource_t *rsc, GList *only_rsc, gboolean check_parent return !passes; } + +const char * +pe__clone_child_id(pe_resource_t *rsc) +{ + clone_variant_data_t *clone_data = NULL; + get_clone_variant_data(clone_data, rsc); + return ID(clone_data->xml_obj_child); +} diff --git a/lib/pengine/utils.c b/lib/pengine/utils.c index 3151f0120b..6c4f3b6971 100644 --- a/lib/pengine/utils.c +++ b/lib/pengine/utils.c @@ -2573,8 +2573,15 @@ pe__build_rsc_list(pe_working_set_t *data_set, const char *s) { xmlNode * pe__failed_probe_for_rsc(pe_resource_t *rsc, const char *name) { + pe_resource_t *parent = uber_parent(rsc); const char *rsc_id = rsc->id; + if (rsc->variant == pe_clone) { + rsc_id = pe__clone_child_id(rsc); + } else if (parent->variant == pe_clone) { + rsc_id = pe__clone_child_id(parent); + } + for (xmlNode *xml_op = pcmk__xml_first_child(rsc->cluster->failed); xml_op != NULL; xml_op = pcmk__xml_next(xml_op)) { const char *value = NULL; -- 2.27.0 From cf8b01da93fce87526617fefdcee6eb9f6ecdbd1 Mon Sep 17 00:00:00 2001 From: Chris Lumens Date: Wed, 24 Nov 2021 10:57:05 -0500 Subject: [PATCH 18/21] Test: cts-cli: Update the last-rc-change sed expression. 
This can now occur in both the XML output (where it's wrapped in double quotes) and the text output (where it's wrapped in single quotes and followed by a comma). In addition, a plus or minus can occur in the time string. The "{0,1}" syntax takes the place of a "?" for marking the optional comma. In FreeBSD sed, "?" doesn't mean anything special. --- cts/cli/regression.crm_mon.exp | 12 ++++++------ cts/cts-cli.in | 2 +- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/cts/cli/regression.crm_mon.exp b/cts/cli/regression.crm_mon.exp index 5688500ce5..957758832d 100644 --- a/cts/cli/regression.crm_mon.exp +++ b/cts/cli/regression.crm_mon.exp @@ -3497,7 +3497,7 @@ Active Resources: * dummy-2 (ocf:pacemaker:Dummy): FAILED cluster02 Failed Resource Actions: - * dummy-2_monitor_0 on cluster02 'unimplemented feature' (3): call=2, status='complete', last-rc-change='Wed Sep 2 12:17:38 2020', queued=0ms, exec=33ms + * dummy-2_monitor_0 on cluster02 'unimplemented feature' (3): call=2, status='complete', queued=0ms, exec=33ms =#=#=#= End test: Text output of partially active resources - OK (0) =#=#=#= * Passed: crm_mon - Text output of partially active resources =#=#=#= Begin test: XML output of partially active resources =#=#=#= @@ -3641,7 +3641,7 @@ Failed Resource Actions: - + @@ -3684,7 +3684,7 @@ Full List of Resources: * smart-mon (ocf:pacemaker:HealthSMART): Stopped (not installed) Failed Resource Actions: - * dummy-2_monitor_0 on cluster02 'unimplemented feature' (3): call=2, status='complete', last-rc-change='Wed Sep 2 12:17:38 2020', queued=0ms, exec=33ms + * dummy-2_monitor_0 on cluster02 'unimplemented feature' (3): call=2, status='complete', queued=0ms, exec=33ms =#=#=#= End test: Text output of partially active resources, with inactive resources - OK (0) =#=#=#= * Passed: crm_mon - Text output of partially active resources, with inactive resources =#=#=#= Begin test: Complete brief text output, with inactive resources =#=#=#= @@ -3771,7 +3771,7 @@ 
Operations: * (1) probe Failed Resource Actions: - * dummy-2_monitor_0 on cluster02 'unimplemented feature' (3): call=2, status='complete', last-rc-change='Wed Sep 2 12:17:38 2020', queued=0ms, exec=33ms + * dummy-2_monitor_0 on cluster02 'unimplemented feature' (3): call=2, status='complete', queued=0ms, exec=33ms =#=#=#= End test: Complete brief text output, with inactive resources - OK (0) =#=#=#= * Passed: crm_mon - Complete brief text output, with inactive resources =#=#=#= Begin test: Text output of partially active group =#=#=#= @@ -3850,7 +3850,7 @@ Active Resources: * dummy-2 (ocf:pacemaker:Dummy): FAILED cluster02 Failed Resource Actions: - * dummy-2_monitor_0 on cluster02 'unimplemented feature' (3): call=2, status='complete', last-rc-change='Wed Sep 2 12:17:38 2020', queued=0ms, exec=33ms + * dummy-2_monitor_0 on cluster02 'unimplemented feature' (3): call=2, status='complete', queued=0ms, exec=33ms =#=#=#= End test: Text output of inactive member of partially active group - OK (0) =#=#=#= * Passed: crm_mon - Text output of inactive member of partially active group =#=#=#= Begin test: Complete brief text output grouped by node, with inactive resources =#=#=#= @@ -3942,7 +3942,7 @@ Operations: * (1) probe Failed Resource Actions: - * dummy-2_monitor_0 on cluster02 'unimplemented feature' (3): call=2, status='complete', last-rc-change='Wed Sep 2 12:17:38 2020', queued=0ms, exec=33ms + * dummy-2_monitor_0 on cluster02 'unimplemented feature' (3): call=2, status='complete', queued=0ms, exec=33ms =#=#=#= End test: Complete brief text output grouped by node, with inactive resources - OK (0) =#=#=#= * Passed: crm_mon - Complete brief text output grouped by node, with inactive resources =#=#=#= Begin test: Text output of partially active resources, with inactive resources, filtered by node =#=#=#= diff --git a/cts/cts-cli.in b/cts/cts-cli.in index 457816afab..72e9a1e912 100755 --- a/cts/cts-cli.in +++ b/cts/cts-cli.in @@ -1870,7 +1870,7 @@ for t in $tests; do 
-e 's/.*\(unpack_.*\)@.*\.c:[0-9][0-9]*)/\1/g' \ -e 's/.*\(update_validation\)@.*\.c:[0-9][0-9]*)/\1/g' \ -e 's/.*\(apply_upgrade\)@.*\.c:[0-9][0-9]*)/\1/g' \ - -e 's/ last-rc-change=\"[A-Za-z0-9: ]*\"//'\ + -e "s/ last-rc-change=['\"][-+A-Za-z0-9: ]*['\"],\{0,1\}//" \ -e 's|^/tmp/cts-cli\.validity\.bad.xml\.[^:]*:|validity.bad.xml:|'\ -e 's/^Entity: line [0-9][0-9]*: //'\ -e 's/\(validation ([0-9][0-9]* of \)[0-9][0-9]*\().*\)/\1X\2/' \ -- 2.27.0 From dea61f1b6507fbc978e040c1555384d8d7ffa9f3 Mon Sep 17 00:00:00 2001 From: Chris Lumens Date: Wed, 1 Dec 2021 16:23:14 -0500 Subject: [PATCH 19/21] Fix: include: Bump feature set to 3.12.0. This is for the scheduler handling changing regarding maskable probe failures. See: rhbz#1506372. --- include/crm/crm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/crm/crm.h b/include/crm/crm.h index 04d2324d75..16b35e9c55 100644 --- a/include/crm/crm.h +++ b/include/crm/crm.h @@ -66,7 +66,7 @@ extern "C" { * >=3.0.13: Fail counts include operation name and interval * >=3.2.0: DC supports PCMK_EXEC_INVALID and PCMK_EXEC_NOT_CONNECTED */ -# define CRM_FEATURE_SET "3.11.0" +# define CRM_FEATURE_SET "3.12.0" /* Pacemaker's CPG protocols use fixed-width binary fields for the sender and * recipient of a CPG message. This imposes an arbitrary limit on cluster node -- 2.27.0 From fef2c61ef462c221809dc91467ea1e96d5478c74 Mon Sep 17 00:00:00 2001 From: Chris Lumens Date: Mon, 6 Dec 2021 16:42:15 -0500 Subject: [PATCH 20/21] Feature: scheduler: Handle masked probes in the scheduler. These probe operations get their rc/status codes mapped to not running/done, but still ensures they end up in the list of failed operations so tool output continues to display them properly. Note that failures on bundled resources do not get masked. There are no test case changes for this patch. See: rhbz#1506372. 
--- lib/pengine/unpack.c | 42 +++++++++++++++++++++++++++++++++++++----- 1 file changed, 37 insertions(+), 5 deletions(-) diff --git a/lib/pengine/unpack.c b/lib/pengine/unpack.c index b659f319fb..f3583e97d8 100644 --- a/lib/pengine/unpack.c +++ b/lib/pengine/unpack.c @@ -3169,6 +3169,11 @@ remap_operation(xmlNode *xml_op, pe_resource_t *rsc, pe_node_t *node, } } + if (!pe_rsc_is_bundled(rsc) && pcmk_xe_mask_probe_failure(xml_op)) { + *status = PCMK_EXEC_DONE; + *rc = PCMK_OCF_NOT_RUNNING; + } + /* If the executor reported an operation status of anything but done or * error, consider that final. But for done or error, we know better whether * it should be treated as a failure or not, because we know the expected @@ -3567,12 +3572,12 @@ update_resource_state(pe_resource_t * rsc, pe_node_t * node, xmlNode * xml_op, c CRM_ASSERT(rsc); CRM_ASSERT(xml_op); - if (rc == PCMK_OCF_NOT_RUNNING) { - clear_past_failure = TRUE; - - } else if (rc == PCMK_OCF_NOT_INSTALLED) { + if (rc == PCMK_OCF_NOT_INSTALLED || (!pe_rsc_is_bundled(rsc) && pcmk_xe_mask_probe_failure(xml_op))) { rsc->role = RSC_ROLE_STOPPED; + } else if (rc == PCMK_OCF_NOT_RUNNING) { + clear_past_failure = TRUE; + } else if (pcmk__str_eq(task, CRMD_ACTION_STATUS, pcmk__str_casei)) { if (last_failure) { const char *op_key = get_op_key(xml_op); @@ -3661,8 +3666,10 @@ unpack_rsc_op(pe_resource_t *rsc, pe_node_t *node, xmlNode *xml_op, pe_working_set_t *data_set) { int rc = 0; + int old_rc = 0; int task_id = 0; int target_rc = 0; + int old_target_rc = 0; int status = PCMK_EXEC_UNKNOWN; guint interval_ms = 0; const char *task = NULL; @@ -3671,6 +3678,7 @@ unpack_rsc_op(pe_resource_t *rsc, pe_node_t *node, xmlNode *xml_op, bool expired = false; pe_resource_t *parent = rsc; enum action_fail_response failure_strategy = action_fail_recover; + bool maskable_probe_failure = false; CRM_CHECK(rsc && node && xml_op, return); @@ -3727,10 +3735,22 @@ unpack_rsc_op(pe_resource_t *rsc, pe_node_t *node, xmlNode *xml_op, expired = 
true; } + old_rc = rc; + old_target_rc = target_rc; + remap_operation(xml_op, rsc, node, data_set, on_fail, target_rc, &rc, &status); - if (expired && (rc != target_rc)) { + maskable_probe_failure = !pe_rsc_is_bundled(rsc) && pcmk_xe_mask_probe_failure(xml_op); + + if (expired && maskable_probe_failure && old_rc != old_target_rc) { + if (rsc->role <= RSC_ROLE_STOPPED) { + rsc->role = RSC_ROLE_UNKNOWN; + } + + goto done; + + } else if (expired && (rc != target_rc)) { const char *magic = crm_element_value(xml_op, XML_ATTR_TRANSITION_MAGIC); if (interval_ms == 0) { @@ -3758,6 +3778,18 @@ unpack_rsc_op(pe_resource_t *rsc, pe_node_t *node, xmlNode *xml_op, } } + if (maskable_probe_failure) { + crm_notice("Treating probe result '%s' for %s on %s as 'not running'", + services_ocf_exitcode_str(rc), rsc->id, node->details->uname); + update_resource_state(rsc, node, xml_op, task, target_rc, *last_failure, + on_fail, data_set); + crm_xml_add(xml_op, XML_ATTR_UNAME, node->details->uname); + + record_failed_op(xml_op, node, rsc, data_set); + resource_location(parent, node, -INFINITY, "masked-probe-failure", data_set); + goto done; + } + switch (status) { case PCMK_EXEC_CANCELLED: // Should never happen -- 2.27.0 From ccff6eb60598f389008b0621447056457da79671 Mon Sep 17 00:00:00 2001 From: Chris Lumens Date: Tue, 4 Jan 2022 10:14:48 -0500 Subject: [PATCH 21/21] Test: scheduler: Add tests for expired, masked probe failures. dummy-1 is a stopped resource with an expired masked probe failure. This probe should be rescheduled. dummy-2 is a started resource with an expired masked probe failure. This probe should not be rescheduled. 
--- cts/cts-scheduler.in | 1 + .../dot/expired-failed-probe-primitive.dot | 8 ++ .../exp/expired-failed-probe-primitive.exp | 45 ++++++++++++ .../expired-failed-probe-primitive.scores | 7 ++ .../expired-failed-probe-primitive.summary | 26 +++++++ .../xml/expired-failed-probe-primitive.xml | 73 +++++++++++++++++++ 6 files changed, 160 insertions(+) create mode 100644 cts/scheduler/dot/expired-failed-probe-primitive.dot create mode 100644 cts/scheduler/exp/expired-failed-probe-primitive.exp create mode 100644 cts/scheduler/scores/expired-failed-probe-primitive.scores create mode 100644 cts/scheduler/summary/expired-failed-probe-primitive.summary create mode 100644 cts/scheduler/xml/expired-failed-probe-primitive.xml diff --git a/cts/cts-scheduler.in b/cts/cts-scheduler.in index 3abcbc6c9d..7bc41a0936 100644 --- a/cts/cts-scheduler.in +++ b/cts/cts-scheduler.in @@ -115,6 +115,7 @@ TESTS = [ [ "probe-pending-node", "Probe (pending node + unmanaged resource)" ], [ "failed-probe-primitive", "Maskable vs. unmaskable probe failures on primitive resources" ], [ "failed-probe-clone", "Maskable vs. 
unmaskable probe failures on cloned resources" ], + [ "expired-failed-probe-primitive", "Maskable, expired probe failure on primitive resources" ], [ "standby", "Standby" ], [ "comments", "Comments" ], ], diff --git a/cts/scheduler/dot/expired-failed-probe-primitive.dot b/cts/scheduler/dot/expired-failed-probe-primitive.dot new file mode 100644 index 0000000000..610c2b8047 --- /dev/null +++ b/cts/scheduler/dot/expired-failed-probe-primitive.dot @@ -0,0 +1,8 @@ + digraph "g" { +"dummy-1_monitor_0 cluster01" -> "dummy-1_start_0 cluster02" [ style = bold] +"dummy-1_monitor_0 cluster01" [ style=bold color="green" fontcolor="black"] +"dummy-1_monitor_0 cluster02" -> "dummy-1_start_0 cluster02" [ style = bold] +"dummy-1_monitor_0 cluster02" [ style=bold color="green" fontcolor="black"] +"dummy-1_start_0 cluster02" [ style=bold color="green" fontcolor="black"] +"dummy-2_monitor_0 cluster01" [ style=bold color="green" fontcolor="black"] +} diff --git a/cts/scheduler/exp/expired-failed-probe-primitive.exp b/cts/scheduler/exp/expired-failed-probe-primitive.exp new file mode 100644 index 0000000000..3c2cbfe411 --- /dev/null +++ b/cts/scheduler/exp/expired-failed-probe-primitive.exp @@ -0,0 +1,45 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/cts/scheduler/scores/expired-failed-probe-primitive.scores b/cts/scheduler/scores/expired-failed-probe-primitive.scores new file mode 100644 index 0000000000..51ae5510e6 --- /dev/null +++ b/cts/scheduler/scores/expired-failed-probe-primitive.scores @@ -0,0 +1,7 @@ + +pcmk__native_allocate: Fencing allocation score on cluster01: 0 +pcmk__native_allocate: Fencing allocation score on cluster02: 0 +pcmk__native_allocate: dummy-1 allocation score on cluster01: 0 +pcmk__native_allocate: dummy-1 allocation score on cluster02: 0 +pcmk__native_allocate: dummy-2 allocation score on cluster01: 0 +pcmk__native_allocate: dummy-2 allocation score on cluster02: 0 diff --git 
a/cts/scheduler/summary/expired-failed-probe-primitive.summary b/cts/scheduler/summary/expired-failed-probe-primitive.summary new file mode 100644 index 0000000000..ac0604e84f --- /dev/null +++ b/cts/scheduler/summary/expired-failed-probe-primitive.summary @@ -0,0 +1,26 @@ +Current cluster status: + * Node List: + * Online: [ cluster01 cluster02 ] + + * Full List of Resources: + * Fencing (stonith:fence_xvm): Started cluster01 + * dummy-1 (ocf:pacemaker:Dummy): Stopped + * dummy-2 (ocf:pacemaker:Dummy): Started cluster02 + +Transition Summary: + * Start dummy-1 ( cluster02 ) + +Executing Cluster Transition: + * Resource action: dummy-1 monitor on cluster02 + * Resource action: dummy-1 monitor on cluster01 + * Resource action: dummy-2 monitor on cluster01 + * Resource action: dummy-1 start on cluster02 + +Revised Cluster Status: + * Node List: + * Online: [ cluster01 cluster02 ] + + * Full List of Resources: + * Fencing (stonith:fence_xvm): Started cluster01 + * dummy-1 (ocf:pacemaker:Dummy): Started cluster02 + * dummy-2 (ocf:pacemaker:Dummy): Started cluster02 diff --git a/cts/scheduler/xml/expired-failed-probe-primitive.xml b/cts/scheduler/xml/expired-failed-probe-primitive.xml new file mode 100644 index 0000000000..684aa73f92 --- /dev/null +++ b/cts/scheduler/xml/expired-failed-probe-primitive.xml @@ -0,0 +1,73 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + -- 2.27.0