From eb2854add713f22b083a54aa7caf04be5067b469 Mon Sep 17 00:00:00 2001
From: Ken Gaillot
Date: Tue, 13 Nov 2018 18:05:04 -0600
Subject: [PATCH] Low: scheduler: order start after particular stonith op

Previously, if a resource's status was unknown on a node about to be fenced,
we ordered the resource start after all_stopped. This approximated stonith_done
before that was available. However, it makes more sense to order the start
after the particular stonith op for the node in question.

This improves recovery when multiple nodes are being fenced: resources can now
be recovered from one node when it is successfully fenced, even if the fencing
of another node fails.
---
 pengine/native.c | 63 +++++++++++++++++++++++++++++++-------------------------
 1 file changed, 35 insertions(+), 28 deletions(-)

diff --git a/pengine/native.c b/pengine/native.c
index c6c1d55..9ee5990 100644
--- a/pengine/native.c
+++ b/pengine/native.c
@@ -2948,13 +2948,19 @@ native_create_probe(resource_t * rsc, node_t * node, action_t * complete,
     return TRUE;
 }
 
+/*!
+ * \internal
+ * \brief Order a resource's start and promote actions relative to fencing
+ *
+ * \param[in] rsc         Resource to be ordered
+ * \param[in] stonith_op  Fence action
+ * \param[in] data_set    Cluster information
+ */
 static void
 native_start_constraints(resource_t * rsc, action_t * stonith_op, pe_working_set_t * data_set)
 {
     node_t *target;
     GListPtr gIter = NULL;
-    action_t *all_stopped = get_pseudo_op(ALL_STOPPED, data_set);
-    action_t *stonith_done = get_pseudo_op(STONITH_DONE, data_set);
 
     CRM_CHECK(stonith_op && stonith_op->node, return);
     target = stonith_op->node;
@@ -2962,34 +2968,35 @@ native_start_constraints(resource_t * rsc, action_t * stonith_op, pe_working_set
     for (gIter = rsc->actions; gIter != NULL; gIter = gIter->next) {
         action_t *action = (action_t *) gIter->data;
 
-        if(action->needs == rsc_req_nothing) {
-            /* Anything other than start or promote requires nothing */
-
-        } else if (action->needs == rsc_req_stonith) {
-            order_actions(stonith_done, action, pe_order_optional);
+        switch (action->needs) {
+            case rsc_req_nothing:
+                // Anything other than start or promote requires nothing
+                break;
 
-        } else if (safe_str_eq(action->task, RSC_START)
-                   && NULL != pe_hash_table_lookup(rsc->allowed_nodes, target->details->id)
-                   && NULL == pe_hash_table_lookup(rsc->known_on, target->details->id)) {
-            /* if known == NULL, then we don't know if
-             * the resource is active on the node
-             * we're about to shoot
-             *
-             * in this case, regardless of action->needs,
-             * the only safe option is to wait until
-             * the node is shot before doing anything
-             * to with the resource
-             *
-             * it's analogous to waiting for all the probes
-             * for rscX to complete before starting rscX
-             *
-             * the most likely explanation is that the
-             * DC died and took its status with it
-             */
+            case rsc_req_stonith:
+                order_actions(stonith_op, action, pe_order_optional);
+                break;
 
-            pe_rsc_debug(rsc, "Ordering %s after %s recovery", action->uuid,
-                         target->details->uname);
-            order_actions(all_stopped, action, pe_order_optional | pe_order_runnable_left);
+            case rsc_req_quorum:
+                if (safe_str_eq(action->task, RSC_START)
+                    && pe_hash_table_lookup(rsc->allowed_nodes, target->details->id)
+                    && NULL == pe_hash_table_lookup(rsc->known_on, target->details->id)) {
+
+                    /* If we don't know the status of the resource on the node
+                     * we're about to shoot, we have to assume it may be active
+                     * there. Order the resource start after the fencing. This
+                     * is analogous to waiting for all the probes for a resource
+                     * to complete before starting it.
+                     *
+                     * The most likely explanation is that the DC died and took
+                     * its status with it.
+                     */
+                    pe_rsc_debug(rsc, "Ordering %s after %s recovery", action->uuid,
+                                 target->details->uname);
+                    order_actions(stonith_op, action,
+                                  pe_order_optional | pe_order_runnable_left);
+                }
+                break;
         }
     }
 }
-- 
1.8.3.1