|
|
413fc7 |
From eb2854add713f22b083a54aa7caf04be5067b469 Mon Sep 17 00:00:00 2001
|
|
|
413fc7 |
From: Ken Gaillot <kgaillot@redhat.com>
|
|
|
413fc7 |
Date: Tue, 13 Nov 2018 18:05:04 -0600
|
|
|
413fc7 |
Subject: [PATCH] Low: scheduler: order start after particular stonith op
|
|
|
413fc7 |
|
|
|
413fc7 |
Previously, if a resource's status was unknown on a node about to be fenced,
|
|
|
413fc7 |
we ordered the resource start after all_stopped. This approximated stonith_done
|
|
|
413fc7 |
before that pseudo-op was available. However, it makes more sense to order the start
|
|
|
413fc7 |
after the particular stonith op for the node in question.
|
|
|
413fc7 |
|
|
|
413fc7 |
This improves recovery when multiple nodes are being fenced: resources can now
|
|
|
413fc7 |
be recovered from one node when it is successfully fenced, even if the fencing
|
|
|
413fc7 |
of another node fails.
|
|
|
413fc7 |
---
|
|
|
413fc7 |
pengine/native.c | 63 +++++++++++++++++++++++++++++++-------------------------
|
|
|
413fc7 |
1 file changed, 35 insertions(+), 28 deletions(-)
|
|
|
413fc7 |
|
|
|
413fc7 |
diff --git a/pengine/native.c b/pengine/native.c
|
|
|
413fc7 |
index c6c1d55..9ee5990 100644
|
|
|
413fc7 |
--- a/pengine/native.c
|
|
|
413fc7 |
+++ b/pengine/native.c
|
|
|
413fc7 |
@@ -2948,13 +2948,19 @@ native_create_probe(resource_t * rsc, node_t * node, action_t * complete,
|
|
|
413fc7 |
return TRUE;
|
|
|
413fc7 |
}
|
|
|
413fc7 |
|
|
|
413fc7 |
+/*!
|
|
|
413fc7 |
+ * \internal
|
|
|
413fc7 |
+ * \brief Order a resource's start and promote actions relative to fencing
|
|
|
413fc7 |
+ *
|
|
|
413fc7 |
+ * \param[in] rsc Resource to be ordered
|
|
|
413fc7 |
+ * \param[in] stonith_op Fence action
|
|
|
413fc7 |
+ * \param[in] data_set Cluster information
|
|
|
413fc7 |
+ */
|
|
|
413fc7 |
static void
|
|
|
413fc7 |
native_start_constraints(resource_t * rsc, action_t * stonith_op, pe_working_set_t * data_set)
|
|
|
413fc7 |
{
|
|
|
413fc7 |
node_t *target;
|
|
|
413fc7 |
GListPtr gIter = NULL;
|
|
|
413fc7 |
- action_t *all_stopped = get_pseudo_op(ALL_STOPPED, data_set);
|
|
|
413fc7 |
- action_t *stonith_done = get_pseudo_op(STONITH_DONE, data_set);
|
|
|
413fc7 |
|
|
|
413fc7 |
CRM_CHECK(stonith_op && stonith_op->node, return);
|
|
|
413fc7 |
target = stonith_op->node;
|
|
|
413fc7 |
@@ -2962,34 +2968,35 @@ native_start_constraints(resource_t * rsc, action_t * stonith_op, pe_working_set
|
|
|
413fc7 |
for (gIter = rsc->actions; gIter != NULL; gIter = gIter->next) {
|
|
|
413fc7 |
action_t *action = (action_t *) gIter->data;
|
|
|
413fc7 |
|
|
|
413fc7 |
- if(action->needs == rsc_req_nothing) {
|
|
|
413fc7 |
- /* Anything other than start or promote requires nothing */
|
|
|
413fc7 |
-
|
|
|
413fc7 |
- } else if (action->needs == rsc_req_stonith) {
|
|
|
413fc7 |
- order_actions(stonith_done, action, pe_order_optional);
|
|
|
413fc7 |
+ switch (action->needs) {
|
|
|
413fc7 |
+ case rsc_req_nothing:
|
|
|
413fc7 |
+ // Anything other than start or promote requires nothing
|
|
|
413fc7 |
+ break;
|
|
|
413fc7 |
|
|
|
413fc7 |
- } else if (safe_str_eq(action->task, RSC_START)
|
|
|
413fc7 |
- && NULL != pe_hash_table_lookup(rsc->allowed_nodes, target->details->id)
|
|
|
413fc7 |
- && NULL == pe_hash_table_lookup(rsc->known_on, target->details->id)) {
|
|
|
413fc7 |
- /* if known == NULL, then we don't know if
|
|
|
413fc7 |
- * the resource is active on the node
|
|
|
413fc7 |
- * we're about to shoot
|
|
|
413fc7 |
- *
|
|
|
413fc7 |
- * in this case, regardless of action->needs,
|
|
|
413fc7 |
- * the only safe option is to wait until
|
|
|
413fc7 |
- * the node is shot before doing anything
|
|
|
413fc7 |
- * to with the resource
|
|
|
413fc7 |
- *
|
|
|
413fc7 |
- * it's analogous to waiting for all the probes
|
|
|
413fc7 |
- * for rscX to complete before starting rscX
|
|
|
413fc7 |
- *
|
|
|
413fc7 |
- * the most likely explanation is that the
|
|
|
413fc7 |
- * DC died and took its status with it
|
|
|
413fc7 |
- */
|
|
|
413fc7 |
+ case rsc_req_stonith:
|
|
|
413fc7 |
+ order_actions(stonith_op, action, pe_order_optional);
|
|
|
413fc7 |
+ break;
|
|
|
413fc7 |
|
|
|
413fc7 |
- pe_rsc_debug(rsc, "Ordering %s after %s recovery", action->uuid,
|
|
|
413fc7 |
- target->details->uname);
|
|
|
413fc7 |
- order_actions(all_stopped, action, pe_order_optional | pe_order_runnable_left);
|
|
|
413fc7 |
+ case rsc_req_quorum:
|
|
|
413fc7 |
+ if (safe_str_eq(action->task, RSC_START)
|
|
|
413fc7 |
+ && pe_hash_table_lookup(rsc->allowed_nodes, target->details->id)
|
|
|
413fc7 |
+ && NULL == pe_hash_table_lookup(rsc->known_on, target->details->id)) {
|
|
|
413fc7 |
+
|
|
|
413fc7 |
+ /* If we don't know the status of the resource on the node
|
|
|
413fc7 |
+ * we're about to shoot, we have to assume it may be active
|
|
|
413fc7 |
+ * there. Order the resource start after the fencing. This
|
|
|
413fc7 |
+ * is analogous to waiting for all the probes for a resource
|
|
|
413fc7 |
+ * to complete before starting it.
|
|
|
413fc7 |
+ *
|
|
|
413fc7 |
+ * The most likely explanation is that the DC died and took
|
|
|
413fc7 |
+ * its status with it.
|
|
|
413fc7 |
+ */
|
|
|
413fc7 |
+ pe_rsc_debug(rsc, "Ordering %s after %s recovery", action->uuid,
|
|
|
413fc7 |
+ target->details->uname);
|
|
|
413fc7 |
+ order_actions(stonith_op, action,
|
|
|
413fc7 |
+ pe_order_optional | pe_order_runnable_left);
|
|
|
413fc7 |
+ }
|
|
|
413fc7 |
+ break;
|
|
|
413fc7 |
}
|
|
|
413fc7 |
}
|
|
|
413fc7 |
}
|
|
|
413fc7 |
--
|
|
|
413fc7 |
1.8.3.1
|
|
|
413fc7 |
|