diff --git a/SOURCES/007-unfencing.patch b/SOURCES/007-unfencing.patch new file mode 100644 index 0000000..1ce5c0a --- /dev/null +++ b/SOURCES/007-unfencing.patch @@ -0,0 +1,309 @@ +From 28566d6832274c59f27bb7b2f1f54420a3f3d822 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Thu, 9 May 2019 20:26:08 -0500 +Subject: [PATCH 1/2] Refactor: libpe_status: functionize unfencing digest code + more + +... for readability, reusability, and avoiding unnecessary function calls or +memory allocation. +--- + lib/pengine/utils.c | 159 ++++++++++++++++++++++++++++++++++++++-------------- + 1 file changed, 118 insertions(+), 41 deletions(-) + +diff --git a/lib/pengine/utils.c b/lib/pengine/utils.c +index 2f4dc1e..f80f8d4 100644 +--- a/lib/pengine/utils.c ++++ b/lib/pengine/utils.c +@@ -2080,57 +2080,134 @@ rsc_action_digest_cmp(resource_t * rsc, xmlNode * xml_op, node_t * node, + return data; + } + ++/*! ++ * \internal ++ * \brief Create an unfencing summary for use in special node attribute ++ * ++ * Create a string combining a fence device's resource ID, agent type, and ++ * parameter digest (whether for all parameters or just non-private parameters). ++ * This can be stored in a special node attribute, allowing us to detect changes ++ * in either the agent type or parameters, to know whether unfencing must be ++ * redone or can be safely skipped when the device's history is cleaned. ++ * ++ * \param[in] rsc_id Fence device resource ID ++ * \param[in] agent_type Fence device agent ++ * \param[in] param_digest Fence device parameter digest ++ * ++ * \return Newly allocated string with unfencing digest ++ * \note The caller is responsible for freeing the result. ++ */ ++static inline char * ++create_unfencing_summary(const char *rsc_id, const char *agent_type, ++ const char *param_digest) ++{ ++ return crm_strdup_printf("%s:%s:%s", rsc_id, agent_type, param_digest); ++} ++ ++/*! 
++ * \internal ++ * \brief Check whether a node can skip unfencing ++ * ++ * Check whether a fence device's current definition matches a node's ++ * stored summary of when it was last unfenced by the device. ++ * ++ * \param[in] rsc_id Fence device's resource ID ++ * \param[in] agent Fence device's agent type ++ * \param[in] digest_calc Fence device's current parameter digest ++ * \param[in] node_summary Value of node's special unfencing node attribute ++ * (a comma-separated list of unfencing summaries for ++ * all devices that have unfenced this node) ++ * ++ * \return TRUE if digest matches, FALSE otherwise ++ */ ++static bool ++unfencing_digest_matches(const char *rsc_id, const char *agent, ++ const char *digest_calc, const char *node_summary) ++{ ++ bool matches = FALSE; ++ ++ if (rsc_id && agent && digest_calc && node_summary) { ++ char *search_secure = create_unfencing_summary(rsc_id, agent, ++ digest_calc); ++ ++ /* The digest was calculated including the device ID and agent, ++ * so there is no risk of collision using strstr(). ++ */ ++ matches = (strstr(node_summary, search_secure) != NULL); ++ crm_trace("Calculated unfencing digest '%s' %sfound in '%s'", ++ search_secure, matches? "" : "not ", node_summary); ++ free(search_secure); ++ } ++ return matches; ++} ++ ++/* Magic string to use as action name for digest cache entries used for ++ * unfencing checks. This is not a real action name (i.e. "on"), so ++ * check_action_definition() won't confuse these entries with real actions. ++ */ + #define STONITH_DIGEST_TASK "stonith-on" + ++/*! 
++ * \internal ++ * \brief Calculate fence device digests and digest comparison result ++ * ++ * \param[in] rsc Fence device resource ++ * \param[in] agent Fence device's agent type ++ * \param[in] node Node with digest cache to use ++ * \param[in] data_set Cluster working set ++ * ++ * \return Node's digest cache entry ++ */ + static op_digest_cache_t * +-fencing_action_digest_cmp(resource_t * rsc, node_t * node, pe_working_set_t * data_set) ++fencing_action_digest_cmp(pe_resource_t *rsc, const char *agent, ++ pe_node_t *node, pe_working_set_t *data_set) + { +- char *key = generate_op_key(rsc->id, STONITH_DIGEST_TASK, 0); +- op_digest_cache_t *data = rsc_action_digest(rsc, STONITH_DIGEST_TASK, key, node, NULL, data_set); ++ const char *node_summary = NULL; + +- const char *digest_all = pe_node_attribute_raw(node, CRM_ATTR_DIGESTS_ALL); +- const char *digest_secure = pe_node_attribute_raw(node, CRM_ATTR_DIGESTS_SECURE); ++ // Calculate device's current parameter digests ++ char *key = generate_op_key(rsc->id, STONITH_DIGEST_TASK, 0); ++ op_digest_cache_t *data = rsc_action_digest(rsc, STONITH_DIGEST_TASK, key, ++ node, NULL, data_set); + +- /* No 'reloads' for fencing device changes +- * +- * We use the resource id + agent + digest so that we can detect +- * changes to the agent and/or the parameters used +- */ +- char *search_all = crm_strdup_printf("%s:%s:%s", rsc->id, (const char*)g_hash_table_lookup(rsc->meta, XML_ATTR_TYPE), data->digest_all_calc); +- char *search_secure = crm_strdup_printf("%s:%s:%s", rsc->id, (const char*)g_hash_table_lookup(rsc->meta, XML_ATTR_TYPE), data->digest_secure_calc); ++ free(key); + +- data->rc = RSC_DIGEST_ALL; +- if (digest_all == NULL) { +- /* it is unknown what the previous op digest was */ ++ // Check whether node has special unfencing summary node attribute ++ node_summary = pe_node_attribute_raw(node, CRM_ATTR_DIGESTS_ALL); ++ if (node_summary == NULL) { + data->rc = RSC_DIGEST_UNKNOWN; ++ return data; ++ } + +- } else if 
(strstr(digest_all, search_all)) { ++ // Check whether full parameter digest matches ++ if (unfencing_digest_matches(rsc->id, agent, data->digest_all_calc, ++ node_summary)) { + data->rc = RSC_DIGEST_MATCH; ++ return data; ++ } + +- } else if(digest_secure && data->digest_secure_calc) { +- if(strstr(digest_secure, search_secure)) { +- if (is_set(data_set->flags, pe_flag_stdout)) { +- printf("Only 'private' parameters to %s for unfencing %s changed\n", +- rsc->id, node->details->uname); +- } +- data->rc = RSC_DIGEST_MATCH; ++ // Check whether secure parameter digest matches ++ node_summary = pe_node_attribute_raw(node, CRM_ATTR_DIGESTS_SECURE); ++ if (unfencing_digest_matches(rsc->id, agent, data->digest_secure_calc, ++ node_summary)) { ++ data->rc = RSC_DIGEST_MATCH; ++ if (is_set(data_set->flags, pe_flag_stdout)) { ++ printf("Only 'private' parameters to %s for unfencing %s changed\n", ++ rsc->id, node->details->uname); + } ++ return data; + } + +- if (is_set(data_set->flags, pe_flag_sanitized) +- && is_set(data_set->flags, pe_flag_stdout) +- && (data->rc == RSC_DIGEST_ALL) ++ // Parameters don't match ++ data->rc = RSC_DIGEST_ALL; ++ if (is_set(data_set->flags, (pe_flag_sanitized|pe_flag_stdout)) + && data->digest_secure_calc) { +- printf("Parameters to %s for unfencing %s changed, try '%s:%s:%s'\n", +- rsc->id, node->details->uname, rsc->id, +- (const char *) g_hash_table_lookup(rsc->meta, XML_ATTR_TYPE), +- data->digest_secure_calc); +- } +- +- free(key); +- free(search_all); +- free(search_secure); ++ char *digest = create_unfencing_summary(rsc->id, agent, ++ data->digest_secure_calc); + ++ printf("Parameters to %s for unfencing %s changed, try '%s'\n", ++ rsc->id, node->details->uname, digest); ++ free(digest); ++ } + return data; + } + +@@ -2218,9 +2295,6 @@ pe_fence_op(node_t * node, const char *op, bool optional, const char *reason, pe + * + * We may do this for all nodes in the future, but for now + * the check_action_definition() based stuff works fine. 
+- * +- * Use "stonith-on" to avoid creating cache entries for +- * operations check_action_definition() would look for. + */ + long max = 1024; + long digests_all_offset = 0; +@@ -2232,8 +2306,11 @@ pe_fence_op(node_t * node, const char *op, bool optional, const char *reason, pe + + for (GListPtr gIter = matches; gIter != NULL; gIter = gIter->next) { + resource_t *match = gIter->data; +- op_digest_cache_t *data = fencing_action_digest_cmp(match, node, data_set); ++ const char *agent = g_hash_table_lookup(match->meta, ++ XML_ATTR_TYPE); ++ op_digest_cache_t *data = NULL; + ++ data = fencing_action_digest_cmp(match, agent, node, data_set); + if(data->rc == RSC_DIGEST_ALL) { + optional = FALSE; + crm_notice("Unfencing %s (remote): because the definition of %s changed", node->details->uname, match->id); +@@ -2244,11 +2321,11 @@ pe_fence_op(node_t * node, const char *op, bool optional, const char *reason, pe + + digests_all_offset += snprintf( + digests_all+digests_all_offset, max-digests_all_offset, +- "%s:%s:%s,", match->id, (const char*)g_hash_table_lookup(match->meta, XML_ATTR_TYPE), data->digest_all_calc); ++ "%s:%s:%s,", match->id, agent, data->digest_all_calc); + + digests_secure_offset += snprintf( + digests_secure+digests_secure_offset, max-digests_secure_offset, +- "%s:%s:%s,", match->id, (const char*)g_hash_table_lookup(match->meta, XML_ATTR_TYPE), data->digest_secure_calc); ++ "%s:%s:%s,", match->id, agent, data->digest_secure_calc); + } + g_hash_table_insert(stonith_op->meta, + strdup(XML_OP_ATTR_DIGESTS_ALL), +-- +1.8.3.1 + + +From fd6e06ff419c95f4423202163d2d4dca3f03a4c5 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Fri, 10 May 2019 11:57:31 -0500 +Subject: [PATCH 2/2] Fix: libpe_status: calculate secure digests for unfencing + ops + +The calculation of digests for detection of when unfencing is needed reused +rsc_action_digest(). 
However that would only add secure digests when the +pe_flag_sanitized flag was set, which is only set by crm_simulate, so secure +digests would never be added in normal cluster operation. This led to +node attributes like name="#digests-secure" +value="stonith-fence_compute-fence-nova:fence_compute:(null),". + +Now, rsc_action_digest() takes a new argument to select whether secure digests +are added, which is always set to TRUE when calculating unfencing digests. +--- + lib/pengine/utils.c | 27 ++++++++++++++++++++++----- + 1 file changed, 22 insertions(+), 5 deletions(-) + +diff --git a/lib/pengine/utils.c b/lib/pengine/utils.c +index f80f8d4..5b893f7 100644 +--- a/lib/pengine/utils.c ++++ b/lib/pengine/utils.c +@@ -1936,9 +1936,24 @@ append_versioned_params(xmlNode *versioned_params, const char *ra_version, xmlNo + } + #endif + ++/*! ++ * \internal ++ * \brief Calculate action digests and store in node's digest cache ++ * ++ * \param[in] rsc Resource that action was for ++ * \param[in] task Name of action performed ++ * \param[in] key Action's task key ++ * \param[in] node Node action was performed on ++ * \param[in] xml_op XML of operation in CIB status (if available) ++ * \param[in] calc_secure Whether to calculate secure digest ++ * \param[in] data_set Cluster working set ++ * ++ * \return Pointer to node's digest cache entry ++ */ + static op_digest_cache_t * +-rsc_action_digest(resource_t * rsc, const char *task, const char *key, +- node_t * node, xmlNode * xml_op, pe_working_set_t * data_set) ++rsc_action_digest(pe_resource_t *rsc, const char *task, const char *key, ++ pe_node_t *node, xmlNode *xml_op, bool calc_secure, ++ pe_working_set_t *data_set) + { + op_digest_cache_t *data = NULL; + +@@ -2007,7 +2022,7 @@ rsc_action_digest(resource_t * rsc, const char *task, const char *key, + + data->digest_all_calc = calculate_operation_digest(data->params_all, op_version); + +- if (is_set(data_set->flags, pe_flag_sanitized)) { ++ if (calc_secure) { + 
data->params_secure = copy_xml(data->params_all); + if(secure_list) { + filter_parameters(data->params_secure, secure_list, FALSE); +@@ -2053,7 +2068,9 @@ rsc_action_digest_cmp(resource_t * rsc, xmlNode * xml_op, node_t * node, + + interval_ms = crm_parse_ms(interval_ms_s); + key = generate_op_key(rsc->id, task, interval_ms); +- data = rsc_action_digest(rsc, task, key, node, xml_op, data_set); ++ data = rsc_action_digest(rsc, task, key, node, xml_op, ++ is_set(data_set->flags, pe_flag_sanitized), ++ data_set); + + data->rc = RSC_DIGEST_MATCH; + if (digest_restart && data->digest_restart_calc && strcmp(data->digest_restart_calc, digest_restart) != 0) { +@@ -2167,7 +2184,7 @@ fencing_action_digest_cmp(pe_resource_t *rsc, const char *agent, + // Calculate device's current parameter digests + char *key = generate_op_key(rsc->id, STONITH_DIGEST_TASK, 0); + op_digest_cache_t *data = rsc_action_digest(rsc, STONITH_DIGEST_TASK, key, +- node, NULL, data_set); ++ node, NULL, TRUE, data_set); + + free(key); + +-- +1.8.3.1 + diff --git a/SOURCES/008-remote.patch b/SOURCES/008-remote.patch new file mode 100644 index 0000000..9760137 --- /dev/null +++ b/SOURCES/008-remote.patch @@ -0,0 +1,545 @@ +From dad337a96dfeca4dbde7bbd97f99f24956440fc2 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Sat, 8 Jun 2019 16:25:04 -0500 +Subject: [PATCH 1/4] Refactor: libpe_status: add function for checking + shutdown attribute + +... 
to reduce code duplication and allow further reuse +--- + include/crm/pengine/internal.h | 2 ++ + lib/pengine/unpack.c | 8 ++------ + lib/pengine/utils.c | 20 ++++++++++++++++++++ + 3 files changed, 24 insertions(+), 6 deletions(-) + +diff --git a/include/crm/pengine/internal.h b/include/crm/pengine/internal.h +index fd55bb9..a2a3d52 100644 +--- a/include/crm/pengine/internal.h ++++ b/include/crm/pengine/internal.h +@@ -359,4 +359,6 @@ void pe__foreach_param_check(pe_working_set_t *data_set, + enum pe_check_parameters, + pe_working_set_t*)); + void pe__free_param_checks(pe_working_set_t *data_set); ++ ++bool pe__shutdown_requested(pe_node_t *node); + #endif +diff --git a/lib/pengine/unpack.c b/lib/pengine/unpack.c +index 081df07..9d13a57 100644 +--- a/lib/pengine/unpack.c ++++ b/lib/pengine/unpack.c +@@ -909,7 +909,6 @@ unpack_handle_remote_attrs(node_t *this_node, xmlNode *state, pe_working_set_t * + const char *resource_discovery_enabled = NULL; + xmlNode *attrs = NULL; + resource_t *rsc = NULL; +- const char *shutdown = NULL; + + if (crm_str_eq((const char *)state->name, XML_CIB_TAG_STATE, TRUE) == FALSE) { + return; +@@ -931,8 +930,7 @@ unpack_handle_remote_attrs(node_t *this_node, xmlNode *state, pe_working_set_t * + attrs = find_xml_node(state, XML_TAG_TRANSIENT_NODEATTRS, FALSE); + add_node_attrs(attrs, this_node, TRUE, data_set); + +- shutdown = pe_node_attribute_raw(this_node, XML_CIB_ATTR_SHUTDOWN); +- if (shutdown != NULL && safe_str_neq("0", shutdown)) { ++ if (pe__shutdown_requested(this_node)) { + crm_info("Node %s is shutting down", this_node->details->uname); + this_node->details->shutdown = TRUE; + if (rsc) { +@@ -1392,7 +1390,6 @@ gboolean + determine_online_status(xmlNode * node_state, node_t * this_node, pe_working_set_t * data_set) + { + gboolean online = FALSE; +- const char *shutdown = NULL; + const char *exp_state = crm_element_value(node_state, XML_NODE_EXPECTED); + + if (this_node == NULL) { +@@ -1402,9 +1399,8 @@ 
determine_online_status(xmlNode * node_state, node_t * this_node, pe_working_set + + this_node->details->shutdown = FALSE; + this_node->details->expected_up = FALSE; +- shutdown = pe_node_attribute_raw(this_node, XML_CIB_ATTR_SHUTDOWN); + +- if (shutdown != NULL && safe_str_neq("0", shutdown)) { ++ if (pe__shutdown_requested(this_node)) { + this_node->details->shutdown = TRUE; + + } else if (safe_str_eq(exp_state, CRMD_JOINSTATE_MEMBER)) { +diff --git a/lib/pengine/utils.c b/lib/pengine/utils.c +index 5b893f7..c5fd0f7 100644 +--- a/lib/pengine/utils.c ++++ b/lib/pengine/utils.c +@@ -2510,3 +2510,23 @@ void pe_action_set_reason(pe_action_t *action, const char *reason, bool overwrit + } + } + } ++ ++/*! ++ * \internal ++ * \brief Check whether shutdown has been requested for a node ++ * ++ * \param[in] node Node to check ++ * ++ * \return TRUE if node has shutdown attribute set and nonzero, FALSE otherwise ++ * \note This differs from simply using node->details->shutdown in that it can ++ * be used before that has been determined (and in fact to determine it), ++ * and it can also be used to distinguish requested shutdown from implicit ++ * shutdown of remote nodes by virtue of their connection stopping. ++ */ ++bool ++pe__shutdown_requested(pe_node_t *node) ++{ ++ const char *shutdown = pe_node_attribute_raw(node, XML_CIB_ATTR_SHUTDOWN); ++ ++ return shutdown && strcmp(shutdown, "0"); ++} +-- +1.8.3.1 + + +From 1e9903326a59f58d9dd2f2618d709f8aa61e41e9 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Wed, 5 Jun 2019 16:37:26 -0500 +Subject: [PATCH 2/4] Fix: scheduler: remote state is failed if node is + shutting down with connection failure + +When determining remote state, if the connection resource is failed and not +being started again, we consider the state to be unknown if the connection has +a reconnect interval, because we won't know whether the connection can be +recovered until the interval expires and we re-attempt connection. 
+ +However, if the node is shutting down at the time, we won't re-attempt +connection, so consider the state failed in that case. (Note that we check the +actual shutdown node attribute, rather than node->details->shutdown, since that +is set for remote nodes whenever the connection is stopping.) + +This avoids a situation where actions that cannot succeed can be scheduled on a +remote node that's shutting down. +--- + daemons/schedulerd/sched_allocate.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/daemons/schedulerd/sched_allocate.c b/daemons/schedulerd/sched_allocate.c +index 3363a72..b7d1b48 100644 +--- a/daemons/schedulerd/sched_allocate.c ++++ b/daemons/schedulerd/sched_allocate.c +@@ -1972,7 +1972,8 @@ get_remote_node_state(pe_node_t *node) + + if ((remote_rsc->next_role == RSC_ROLE_STOPPED) + && remote_rsc->remote_reconnect_ms +- && node->details->remote_was_fenced) { ++ && node->details->remote_was_fenced ++ && !pe__shutdown_requested(node)) { + + /* We won't know whether the connection is recoverable until the + * reconnect interval expires and we reattempt connection. +-- +1.8.3.1 + + +From ea70750d04219618b5feeda04443b27616e441a0 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Wed, 5 Jun 2019 16:43:19 -0500 +Subject: [PATCH 3/4] Fix: libpe_status: don't order implied stops relative to + a remote connection + +Actions behind a remote connection are ordered relative to any start or stop of +the remote connection. However, if the action is a stop implied due to fencing, +it does not require the remote connection, and the ordering should not be done. + +This avoids a delay in the remote connection recovery if it is failed, e.g. +previously the ordering would look like: + + fence remote node -> implied stop of resource on remote -> stop connection + +Now, the connection stop can proceed simultaneously with the remote node +fencing. 
+--- + daemons/schedulerd/sched_allocate.c | 11 +++++------ + 1 file changed, 5 insertions(+), 6 deletions(-) + +diff --git a/daemons/schedulerd/sched_allocate.c b/daemons/schedulerd/sched_allocate.c +index b7d1b48..9f82c00 100644 +--- a/daemons/schedulerd/sched_allocate.c ++++ b/daemons/schedulerd/sched_allocate.c +@@ -2065,14 +2065,13 @@ apply_remote_ordering(action_t *action, pe_working_set_t *data_set) + pe_order_implies_first, data_set); + + } else if(state == remote_state_failed) { +- /* We would only be here if the resource is +- * running on the remote node. Since we have no +- * way to stop it, it is necessary to fence the +- * node. ++ /* The resource is active on the node, but since we don't have a ++ * valid connection, the only way to stop the resource is by ++ * fencing the node. There is no need to order the stop relative ++ * to the remote connection, since the stop will become implied ++ * by the fencing. + */ + pe_fence_node(data_set, action->node, "resources are active and the connection is unrecoverable"); +- order_action_then_stop(action, remote_rsc, +- pe_order_implies_first, data_set); + + } else if(remote_rsc->next_role == RSC_ROLE_STOPPED) { + /* State must be remote_state_unknown or remote_state_stopped. +-- +1.8.3.1 + + +From 091c367369b892d26fe0de99d35cf521b6249d10 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Sat, 8 Jun 2019 16:51:20 -0500 +Subject: [PATCH 4/4] Test: cts-scheduler: update regression tests for remote + connection ordering change + +Remote connection stops no longer have to wait for implied stops of resources +behind the connection. + +Unchanged from before, if the remote connection stops are implied themselves, +they can be confirmed immediately without waiting for their host's fencing, +because remote connections have "requires" set to "quorum" rather than +"fencing". 
+--- + cts/scheduler/order-expired-failure.dot | 1 - + cts/scheduler/order-expired-failure.exp | 6 +----- + cts/scheduler/order-expired-failure.summary | 2 +- + cts/scheduler/remote-connection-unrecoverable.dot | 1 - + cts/scheduler/remote-connection-unrecoverable.exp | 6 +----- + cts/scheduler/remote-connection-unrecoverable.summary | 2 +- + cts/scheduler/remote-fence-before-reconnect.dot | 1 - + cts/scheduler/remote-fence-before-reconnect.exp | 6 +----- + cts/scheduler/remote-fence-before-reconnect.summary | 2 +- + cts/scheduler/remote-recover-all.dot | 2 -- + cts/scheduler/remote-recover-all.exp | 12 ++---------- + cts/scheduler/remote-recover-all.summary | 4 ++-- + cts/scheduler/remote-recover-no-resources.dot | 1 - + cts/scheduler/remote-recover-no-resources.exp | 6 +----- + cts/scheduler/remote-recover-no-resources.summary | 2 +- + cts/scheduler/remote-recover-unknown.dot | 1 - + cts/scheduler/remote-recover-unknown.exp | 6 +----- + cts/scheduler/remote-recover-unknown.summary | 2 +- + 18 files changed, 14 insertions(+), 53 deletions(-) + +diff --git a/cts/scheduler/order-expired-failure.dot b/cts/scheduler/order-expired-failure.dot +index 2e9963b..5c21d5d 100644 +--- a/cts/scheduler/order-expired-failure.dot ++++ b/cts/scheduler/order-expired-failure.dot +@@ -4,7 +4,6 @@ digraph "g" { + "compute-unfence-trigger-clone_stop_0" [ style=bold color="green" fontcolor="orange"] + "compute-unfence-trigger-clone_stopped_0" [ style=bold color="green" fontcolor="orange"] + "compute-unfence-trigger_stop_0 overcloud-novacompute-1" -> "compute-unfence-trigger-clone_stopped_0" [ style = bold] +-"compute-unfence-trigger_stop_0 overcloud-novacompute-1" -> "overcloud-novacompute-1_stop_0 controller-1" [ style = bold] + "compute-unfence-trigger_stop_0 overcloud-novacompute-1" [ style=bold color="green" fontcolor="orange"] + "ip-10.0.0.110_monitor_10000 controller-1" [ style=bold color="green" fontcolor="black"] + "ip-10.0.0.110_start_0 controller-1" -> 
"ip-10.0.0.110_monitor_10000 controller-1" [ style = bold] +diff --git a/cts/scheduler/order-expired-failure.exp b/cts/scheduler/order-expired-failure.exp +index c476bc2..4a50493 100644 +--- a/cts/scheduler/order-expired-failure.exp ++++ b/cts/scheduler/order-expired-failure.exp +@@ -9,11 +9,7 @@ + + + +- +- +- +- +- ++ + + + +diff --git a/cts/scheduler/order-expired-failure.summary b/cts/scheduler/order-expired-failure.summary +index c86bb91..2cf43ed 100644 +--- a/cts/scheduler/order-expired-failure.summary ++++ b/cts/scheduler/order-expired-failure.summary +@@ -52,6 +52,7 @@ Transition Summary: + * Stop compute-unfence-trigger:1 ( overcloud-novacompute-1 ) due to node availability + + Executing cluster transition: ++ * Resource action: overcloud-novacompute-1 stop on controller-1 + * Resource action: stonith-fence_compute-fence-nova stop on controller-2 + * Fencing overcloud-novacompute-1 (reboot) + * Cluster action: clear_failcount for overcloud-novacompute-1 on controller-1 +@@ -62,7 +63,6 @@ Executing cluster transition: + * Resource action: ip-10.0.0.110 monitor=10000 on controller-1 + * Pseudo action: compute-unfence-trigger_stop_0 + * Pseudo action: compute-unfence-trigger-clone_stopped_0 +- * Resource action: overcloud-novacompute-1 stop on controller-1 + Using the original execution date of: 2018-04-09 07:55:35Z + + Revised cluster status: +diff --git a/cts/scheduler/remote-connection-unrecoverable.dot b/cts/scheduler/remote-connection-unrecoverable.dot +index c86bb91..2cf43ed 100644 +--- a/cts/scheduler/remote-connection-unrecoverable.dot ++++ b/cts/scheduler/remote-connection-unrecoverable.dot +@@ -12,7 +12,6 @@ + "rsc1_monitor_10000 node2" [ style=bold color="green" fontcolor="black"] + "rsc1_start_0 node2" -> "rsc1_monitor_10000 node2" [ style = bold] + "rsc1_start_0 node2" [ style=bold color="green" fontcolor="black"] +-"rsc1_stop_0 remote1" -> "remote1_stop_0 node1" [ style = bold] + "rsc1_stop_0 remote1" -> "rsc1_delete_0 remote1" [ style = dashed] 
+ "rsc1_stop_0 remote1" -> "rsc1_start_0 node2" [ style = bold] + "rsc1_stop_0 remote1" -> "rsc2-master_demote_0" [ style = bold] + +diff --git a/cts/scheduler/remote-connection-unrecoverable.exp b/cts/scheduler/remote-connection-unrecoverable.exp +index c86bb91..2cf43ed 100644 +--- a/cts/scheduler/remote-connection-unrecoverable.exp ++++ b/cts/scheduler/remote-connection-unrecoverable.exp +@@ -5,11 +5,7 @@ + + + +- +- +- +- +- ++ + + + + +diff --git a/cts/scheduler/remote-connection-unrecoverable.summary b/cts/scheduler/remote-connection-unrecoverable.summary +index 23fa9ca..caff564 100644 +--- a/cts/scheduler/remote-connection-unrecoverable.summary ++++ b/cts/scheduler/remote-connection-unrecoverable.summary +@@ -21,6 +21,7 @@ Transition Summary: + * Stop rsc2:0 ( Master node1 ) due to node availability + + Executing cluster transition: ++ * Pseudo action: remote1_stop_0 + * Resource action: killer stop on node2 + * Resource action: rsc1 monitor on node2 + * Fencing node1 (reboot) +@@ -29,7 +30,6 @@ Executing cluster transition: + * Resource action: killer monitor=60000 on node2 + * Pseudo action: rsc1_stop_0 + * Pseudo action: rsc2-master_demote_0 +- * Pseudo action: remote1_stop_0 + * Resource action: rsc1 start on node2 + * Pseudo action: rsc2_demote_0 + * Pseudo action: rsc2-master_demoted_0 +diff --git a/cts/scheduler/remote-fence-before-reconnect.dot b/cts/scheduler/remote-fence-before-reconnect.dot +index 4ced43e..5812b7f 100644 +--- a/cts/scheduler/remote-fence-before-reconnect.dot ++++ b/cts/scheduler/remote-fence-before-reconnect.dot +@@ -3,7 +3,6 @@ + "fake2_monitor_10000 c7auto1" [ style=bold color="green" fontcolor="black"] + "fake2_start_0 c7auto1" -> "fake2_monitor_10000 c7auto1" [ style = bold] + "fake2_start_0 c7auto1" [ style=bold color="green" fontcolor="black"] +-"fake2_stop_0 c7auto4" -> "c7auto4_stop_0 c7auto1" [ style = bold] + "fake2_stop_0 c7auto4" -> "fake2_start_0 c7auto1" [ style = bold] + "fake2_stop_0 c7auto4" [ style=bold 
color="green" fontcolor="orange"] + "stonith 'reboot' c7auto4" -> "fake2_start_0 c7auto1" [ style = bold] +diff --git a/cts/scheduler/remote-fence-before-reconnect.exp b/cts/scheduler/remote-fence-before-reconnect.exp +index f99d9ef..f506f85 100644 +--- a/cts/scheduler/remote-fence-before-reconnect.exp ++++ b/cts/scheduler/remote-fence-before-reconnect.exp +@@ -9,11 +9,7 @@ + + + +- +- +- +- +- ++ + + + +diff --git a/cts/scheduler/remote-fence-before-reconnect.summary b/cts/scheduler/remote-fence-before-reconnect.summary +index f61e18b..03eac20 100644 +--- a/cts/scheduler/remote-fence-before-reconnect.summary ++++ b/cts/scheduler/remote-fence-before-reconnect.summary +@@ -17,9 +17,9 @@ Transition Summary: + * Move fake2 ( c7auto4 -> c7auto1 ) + + Executing cluster transition: ++ * Resource action: c7auto4 stop on c7auto1 + * Fencing c7auto4 (reboot) + * Pseudo action: fake2_stop_0 +- * Resource action: c7auto4 stop on c7auto1 + * Resource action: fake2 start on c7auto1 + * Resource action: fake2 monitor=10000 on c7auto1 + +diff --git a/cts/scheduler/remote-recover-all.dot b/cts/scheduler/remote-recover-all.dot +index deed802..4128b10 100644 +--- a/cts/scheduler/remote-recover-all.dot ++++ b/cts/scheduler/remote-recover-all.dot +@@ -19,7 +19,6 @@ digraph "g" { + "galera_demote_0 galera-2" -> "galera_stop_0 galera-2" [ style = bold] + "galera_demote_0 galera-2" [ style=bold color="green" fontcolor="orange"] + "galera_monitor_10000 galera-0" [ style=bold color="green" fontcolor="black"] +-"galera_stop_0 galera-2" -> "galera-2_stop_0 controller-1" [ style = bold] + "galera_stop_0 galera-2" -> "galera-master_stopped_0" [ style = bold] + "galera_stop_0 galera-2" [ style=bold color="green" fontcolor="orange"] + "haproxy-clone_stop_0" -> "haproxy-clone_stopped_0" [ style = bold] +@@ -60,7 +59,6 @@ digraph "g" { + "rabbitmq_post_notify_stonith_0" -> "rabbitmq_post_notify_stonith_0 messaging-0" [ style = bold] + "rabbitmq_post_notify_stonith_0" -> 
"rabbitmq_post_notify_stonith_0 messaging-2" [ style = bold] + "rabbitmq_post_notify_stonith_0" [ style=bold color="green" fontcolor="orange"] +-"rabbitmq_stop_0 messaging-1" -> "messaging-1_stop_0 controller-1" [ style = bold] + "rabbitmq_stop_0 messaging-1" -> "rabbitmq-clone_stopped_0" [ style = bold] + "rabbitmq_stop_0 messaging-1" [ style=bold color="green" fontcolor="orange"] + "redis-master_confirmed-post_notify_stopped_0" [ style=bold color="green" fontcolor="orange"] +diff --git a/cts/scheduler/remote-recover-all.exp b/cts/scheduler/remote-recover-all.exp +index 8137ffb..0cb51f6 100644 +--- a/cts/scheduler/remote-recover-all.exp ++++ b/cts/scheduler/remote-recover-all.exp +@@ -5,11 +5,7 @@ + + + +- +- +- +- +- ++ + + + +@@ -57,11 +53,7 @@ + + + +- +- +- +- +- ++ + + + +diff --git a/cts/scheduler/remote-recover-all.summary b/cts/scheduler/remote-recover-all.summary +index 2ac0c6a..d095fdd 100644 +--- a/cts/scheduler/remote-recover-all.summary ++++ b/cts/scheduler/remote-recover-all.summary +@@ -56,7 +56,9 @@ Transition Summary: + * Move stonith-fence_ipmilan-5254005bdbb5 ( controller-1 -> controller-2 ) + + Executing cluster transition: ++ * Pseudo action: messaging-1_stop_0 + * Pseudo action: galera-0_stop_0 ++ * Pseudo action: galera-2_stop_0 + * Pseudo action: galera-master_demote_0 + * Pseudo action: redis-master_pre_notify_stop_0 + * Resource action: stonith-fence_ipmilan-525400bbf613 stop on controller-0 +@@ -94,7 +96,6 @@ Executing cluster transition: + * Resource action: stonith-fence_ipmilan-525400b4f6bd monitor=60000 on controller-0 + * Resource action: stonith-fence_ipmilan-5254005bdbb5 start on controller-2 + * Resource action: galera-0 monitor=20000 on controller-2 +- * Pseudo action: galera-2_stop_0 + * Resource action: rabbitmq notify on messaging-2 + * Resource action: rabbitmq notify on messaging-0 + * Pseudo action: rabbitmq_notified_0 +@@ -107,7 +108,6 @@ Executing cluster transition: + * Resource action: ip-172.17.1.17 start on 
controller-2 + * Resource action: ip-172.17.4.11 start on controller-2 + * Resource action: stonith-fence_ipmilan-5254005bdbb5 monitor=60000 on controller-2 +- * Pseudo action: messaging-1_stop_0 + * Pseudo action: redis_notified_0 + * Resource action: ip-172.17.1.14 monitor=10000 on controller-2 + * Resource action: ip-172.17.1.17 monitor=10000 on controller-2 +diff --git a/cts/scheduler/remote-recover-no-resources.dot b/cts/scheduler/remote-recover-no-resources.dot +index ef78aa6..a2f8ce0 100644 +--- a/cts/scheduler/remote-recover-no-resources.dot ++++ b/cts/scheduler/remote-recover-no-resources.dot +@@ -45,7 +45,6 @@ digraph "g" { + "rabbitmq_post_notify_stonith_0" -> "rabbitmq_post_notify_stonith_0 messaging-0" [ style = bold] + "rabbitmq_post_notify_stonith_0" -> "rabbitmq_post_notify_stonith_0 messaging-2" [ style = bold] + "rabbitmq_post_notify_stonith_0" [ style=bold color="green" fontcolor="orange"] +-"rabbitmq_stop_0 messaging-1" -> "messaging-1_stop_0 controller-1" [ style = bold] + "rabbitmq_stop_0 messaging-1" -> "rabbitmq-clone_stopped_0" [ style = bold] + "rabbitmq_stop_0 messaging-1" [ style=bold color="green" fontcolor="orange"] + "redis-master_confirmed-post_notify_stopped_0" [ style=bold color="green" fontcolor="orange"] +diff --git a/cts/scheduler/remote-recover-no-resources.exp b/cts/scheduler/remote-recover-no-resources.exp +index 8a67c11..90470fb 100644 +--- a/cts/scheduler/remote-recover-no-resources.exp ++++ b/cts/scheduler/remote-recover-no-resources.exp +@@ -5,11 +5,7 @@ + + + +- +- +- +- +- ++ + + + +diff --git a/cts/scheduler/remote-recover-no-resources.summary b/cts/scheduler/remote-recover-no-resources.summary +index 89da784..18a989b 100644 +--- a/cts/scheduler/remote-recover-no-resources.summary ++++ b/cts/scheduler/remote-recover-no-resources.summary +@@ -54,6 +54,7 @@ Transition Summary: + * Move stonith-fence_ipmilan-5254005bdbb5 ( controller-1 -> controller-2 ) + + Executing cluster transition: ++ * Pseudo action: 
messaging-1_stop_0 + * Pseudo action: galera-0_stop_0 + * Pseudo action: galera-2_stop_0 + * Pseudo action: redis-master_pre_notify_stop_0 +@@ -92,7 +93,6 @@ Executing cluster transition: + * Pseudo action: ip-172.17.1.17_stop_0 + * Pseudo action: ip-172.17.4.11_stop_0 + * Resource action: stonith-fence_ipmilan-5254005bdbb5 monitor=60000 on controller-2 +- * Pseudo action: messaging-1_stop_0 + * Resource action: redis notify on controller-0 + * Resource action: redis notify on controller-2 + * Pseudo action: redis-master_confirmed-post_notify_stopped_0 +diff --git a/cts/scheduler/remote-recover-unknown.dot b/cts/scheduler/remote-recover-unknown.dot +index 5cd760b..29ab59f 100644 +--- a/cts/scheduler/remote-recover-unknown.dot ++++ b/cts/scheduler/remote-recover-unknown.dot +@@ -46,7 +46,6 @@ digraph "g" { + "rabbitmq_post_notify_stonith_0" -> "rabbitmq_post_notify_stonith_0 messaging-0" [ style = bold] + "rabbitmq_post_notify_stonith_0" -> "rabbitmq_post_notify_stonith_0 messaging-2" [ style = bold] + "rabbitmq_post_notify_stonith_0" [ style=bold color="green" fontcolor="orange"] +-"rabbitmq_stop_0 messaging-1" -> "messaging-1_stop_0 controller-1" [ style = bold] + "rabbitmq_stop_0 messaging-1" -> "rabbitmq-clone_stopped_0" [ style = bold] + "rabbitmq_stop_0 messaging-1" [ style=bold color="green" fontcolor="orange"] + "redis-master_confirmed-post_notify_stopped_0" [ style=bold color="green" fontcolor="orange"] +diff --git a/cts/scheduler/remote-recover-unknown.exp b/cts/scheduler/remote-recover-unknown.exp +index ac6f004..82cb65f7 100644 +--- a/cts/scheduler/remote-recover-unknown.exp ++++ b/cts/scheduler/remote-recover-unknown.exp +@@ -5,11 +5,7 @@ + + + +- +- +- +- +- ++ + + + +diff --git a/cts/scheduler/remote-recover-unknown.summary b/cts/scheduler/remote-recover-unknown.summary +index 2c60713..4d7a411 100644 +--- a/cts/scheduler/remote-recover-unknown.summary ++++ b/cts/scheduler/remote-recover-unknown.summary +@@ -55,6 +55,7 @@ Transition Summary: + * Move 
stonith-fence_ipmilan-5254005bdbb5 ( controller-1 -> controller-2 ) + + Executing cluster transition: ++ * Pseudo action: messaging-1_stop_0 + * Pseudo action: galera-0_stop_0 + * Pseudo action: galera-2_stop_0 + * Pseudo action: redis-master_pre_notify_stop_0 +@@ -94,7 +95,6 @@ Executing cluster transition: + * Pseudo action: ip-172.17.1.17_stop_0 + * Pseudo action: ip-172.17.4.11_stop_0 + * Resource action: stonith-fence_ipmilan-5254005bdbb5 monitor=60000 on controller-2 +- * Pseudo action: messaging-1_stop_0 + * Resource action: redis notify on controller-0 + * Resource action: redis notify on controller-2 + * Pseudo action: redis-master_confirmed-post_notify_stopped_0 +-- +1.8.3.1 + diff --git a/SPECS/pacemaker.spec b/SPECS/pacemaker.spec index 5f30f93..4beba17 100644 --- a/SPECS/pacemaker.spec +++ b/SPECS/pacemaker.spec @@ -195,7 +195,7 @@ Name: pacemaker Summary: Scalable High-Availability cluster resource manager Version: %{pcmkversion} -Release: %{pcmk_release}%{?dist}.3 +Release: %{pcmk_release}%{?dist}.4 %if %{defined _unitdir} License: GPLv2+ and LGPLv2+ %else @@ -217,6 +217,8 @@ Patch3: 003-security-log.patch Patch4: 004-security-active.patch Patch5: 005-security-code.patch Patch6: 006-migration.patch +Patch7: 007-unfencing.patch +Patch8: 008-remote.patch # patches that aren't from upstream Patch100: rhbz-url.patch @@ -231,7 +233,7 @@ Requires: psmisc %endif %{?systemd_requires} -ExclusiveArch: aarch64 i686 ppc64le s390x x86_64 %{arm} +ExclusiveArch: aarch64 i686 ppc64le s390x x86_64 Requires: %{python_path} BuildRequires: %{python_name}-devel @@ -885,6 +887,10 @@ exit 0 %license %{nagios_name}-%{nagios_hash}/COPYING %changelog +* Mon Aug 5 2019 Ken Gaillot - 2.0.1-4.4 +- Handle losing remote node while it is shutting down +- Resolves: rhbz#1734066 + * Thu May 9 2019 Klaus Wenninger - 2.0.1-4.3 - New build with fixed test in gating.yaml - Resolves: rhbz#1694557