From 92dd7d7616dc16d345ef73d0685b12e06d09b36b Mon Sep 17 00:00:00 2001 From: Ken Gaillot Date: Fri, 15 Apr 2016 15:04:03 -0500 Subject: [PATCH] Fix: crmd: update cache status for guest node whose host is fenced Normally, the remote RA's stop action handles setting the peer cache state to down (along with other side effects of a stop). However, if a guest node's host is fenced, the RA will not be called. Check for the fencing pseudo-action created by the pengine in this case. --- crmd/crmd_lrm.h | 1 + crmd/remote_lrmd_ra.c | 72 +++++++++++++++++++++++++++++++++++++++++++++++++-- crmd/te_actions.c | 4 +++ 3 files changed, 75 insertions(+), 2 deletions(-) diff --git a/crmd/crmd_lrm.h b/crmd/crmd_lrm.h index 412ce5b..08ba947 100644 --- a/crmd/crmd_lrm.h +++ b/crmd/crmd_lrm.h @@ -160,5 +160,6 @@ int remote_ra_exec(lrm_state_t * lrm_state, const char *rsc_id, const char *acti lrmd_key_value_t * params); void remote_ra_cleanup(lrm_state_t * lrm_state); void remote_ra_fail(const char *node_name); +void remote_ra_process_pseudo(xmlNode *xml); gboolean process_lrm_event(lrm_state_t * lrm_state, lrmd_event_data_t * op, struct recurring_op_s *pending); diff --git a/crmd/remote_lrmd_ra.c b/crmd/remote_lrmd_ra.c index b9c5068..eb995ea 100644 --- a/crmd/remote_lrmd_ra.c +++ b/crmd/remote_lrmd_ra.c @@ -226,14 +226,20 @@ remote_node_up(const char *node_name) free_xml(update); } +enum down_opts { + DOWN_KEEP_LRM, + DOWN_ERASE_LRM +}; + /*! 
* \internal * \brief Handle cluster communication related to pacemaker_remote node leaving * * \param[in] node_name Name of lost node + * \param[in] opts Whether to keep or erase LRM history */ static void -remote_node_down(const char *node_name) +remote_node_down(const char *node_name, const enum down_opts opts) { xmlNode *update; int call_id = 0; @@ -246,6 +252,14 @@ remote_node_down(const char *node_name) /* Purge node's transient attributes */ erase_status_tag(node_name, XML_TAG_TRANSIENT_NODEATTRS, call_opt); + /* Normally, the LRM operation history should be kept until the node comes + * back up. However, after a successful fence, we want to clear it, so we + * don't think resources are still running on the node. + */ + if (opts == DOWN_ERASE_LRM) { + erase_status_tag(node_name, XML_CIB_TAG_LRM, call_opt); + } + /* Ensure node is in the remote peer cache with lost state */ node = crm_remote_peer_get(node_name); CRM_CHECK(node != NULL, return); @@ -301,7 +315,7 @@ check_remote_node_state(remote_ra_cmd_t *cmd) if (ra_data) { if (ra_data->migrate_status != takeover_complete) { /* Stop means down if we didn't successfully migrate elsewhere */ - remote_node_down(cmd->rsc_id); + remote_node_down(cmd->rsc_id, DOWN_KEEP_LRM); } else if (AM_I_DC == FALSE) { /* Only the connection host and DC track node state, * so if the connection migrated elsewhere and we aren't DC, @@ -1072,3 +1086,57 @@ remote_ra_fail(const char *node_name) } } +/* A guest node fencing implied by host fencing looks like: + * + * <pseudo_event id="103" operation="stonith" operation_key="stonith-lxc1-off" + * on_node="lxc1" on_node_uuid="lxc1"> + * <attributes CRM_meta_master_lxc_ms="10" CRM_meta_on_node="lxc1" + * CRM_meta_on_node_uuid="lxc1" CRM_meta_stonith_action="off" + * crm_feature_set="3.0.12"/> + * <downed> + * <node id="lxc1"/> + * </downed> + * </pseudo_event> + */ +#define XPATH_PSEUDO_FENCE "//" XML_GRAPH_TAG_PSEUDO_EVENT \ + "[@" XML_LRM_ATTR_TASK "='stonith']/" XML_GRAPH_TAG_DOWNED \ + "/" XML_CIB_TAG_NODE + +/*!
+ * \internal + * \brief Check a pseudo-action for Pacemaker Remote node side effects + * + * \param[in] xml XML of pseudo-action to check + */ +void +remote_ra_process_pseudo(xmlNode *xml) +{ + xmlXPathObjectPtr search = xpath_search(xml, XPATH_PSEUDO_FENCE); + + if (numXpathResults(search) == 1) { + xmlNode *result = getXpathResult(search, 0); + + /* Normally, we handle the necessary side effects of a guest node stop + * action when reporting the remote agent's result. However, if the stop + * is implied due to fencing, it will be a fencing pseudo-event, and + * there won't be a result to report. Handle that case here. + * + * This will result in a duplicate call to remote_node_down() if the + * guest stop was real instead of implied, but that shouldn't hurt. + * + * There is still one corner case that isn't handled: if a guest node + * isn't running any resources when its host is fenced, it will appear + * to be cleanly stopped, so there will be no pseudo-fence, and our + * peer cache state will be incorrect unless and until the guest is + * recovered. + */ + if (result) { + const char *remote = ID(result); + + if (remote) { + remote_node_down(remote, DOWN_ERASE_LRM); + } + } + } + freeXpathObject(search); +} diff --git a/crmd/te_actions.c b/crmd/te_actions.c index c971273..01538af 100644 --- a/crmd/te_actions.c +++ b/crmd/te_actions.c @@ -27,6 +27,7 @@ #include #include +#include #include #include #include @@ -52,6 +53,9 @@ te_start_action_timer(crm_graph_t * graph, crm_action_t * action) static gboolean te_pseudo_action(crm_graph_t * graph, crm_action_t * pseudo) { + /* Check action for Pacemaker Remote node side effects */ + remote_ra_process_pseudo(pseudo->xml); + crm_debug("Pseudo-action %d (%s) fired and confirmed", pseudo->id, crm_element_value(pseudo->xml, XML_LRM_ATTR_TASK_KEY)); te_action_confirmed(pseudo); -- 1.8.3.1