From 92dd7d7616dc16d345ef73d0685b12e06d09b36b Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Fri, 15 Apr 2016 15:04:03 -0500
Subject: [PATCH] Fix: crmd: update cache status for guest node whose host is fenced
Normally, the remote RA's stop action handles setting the peer cache state
to down (along with other side effects of a stop). However, if a guest node's
host is fenced, the RA will not be called. Check for the fencing pseudo-action
created by the pengine in this case.
---
crmd/crmd_lrm.h | 1 +
crmd/remote_lrmd_ra.c | 72 +++++++++++++++++++++++++++++++++++++++++++++++++--
crmd/te_actions.c | 4 +++
3 files changed, 75 insertions(+), 2 deletions(-)
diff --git a/crmd/crmd_lrm.h b/crmd/crmd_lrm.h
index 412ce5b..08ba947 100644
--- a/crmd/crmd_lrm.h
+++ b/crmd/crmd_lrm.h
@@ -160,5 +160,6 @@ int remote_ra_exec(lrm_state_t * lrm_state, const char *rsc_id, const char *acti
lrmd_key_value_t * params);
void remote_ra_cleanup(lrm_state_t * lrm_state);
void remote_ra_fail(const char *node_name);
+void remote_ra_process_pseudo(xmlNode *xml);
gboolean process_lrm_event(lrm_state_t * lrm_state, lrmd_event_data_t * op, struct recurring_op_s *pending);
diff --git a/crmd/remote_lrmd_ra.c b/crmd/remote_lrmd_ra.c
index b9c5068..eb995ea 100644
--- a/crmd/remote_lrmd_ra.c
+++ b/crmd/remote_lrmd_ra.c
@@ -226,14 +226,20 @@ remote_node_up(const char *node_name)
free_xml(update);
}
+enum down_opts {
+ DOWN_KEEP_LRM, /* remote_node_down() skips its XML_CIB_TAG_LRM erase step */
+ DOWN_ERASE_LRM /* remote_node_down() also erases the node's XML_CIB_TAG_LRM status tag */
+};
+
/*!
* \internal
* \brief Handle cluster communication related to pacemaker_remote node leaving
*
* \param[in] node_name Name of lost node
+ * \param[in] opts Whether to keep or erase LRM history
*/
static void
-remote_node_down(const char *node_name)
+remote_node_down(const char *node_name, const enum down_opts opts)
{
xmlNode *update;
int call_id = 0;
@@ -246,6 +252,14 @@ remote_node_down(const char *node_name)
/* Purge node's transient attributes */
erase_status_tag(node_name, XML_TAG_TRANSIENT_NODEATTRS, call_opt);
+ /* Normally, the LRM operation history should be kept until the node comes
+ * back up. However, after a successful fence, we want to clear it, so we
+ * don't think resources are still running on the node.
+ */
+ if (opts == DOWN_ERASE_LRM) {
+ erase_status_tag(node_name, XML_CIB_TAG_LRM, call_opt);
+ }
+
/* Ensure node is in the remote peer cache with lost state */
node = crm_remote_peer_get(node_name);
CRM_CHECK(node != NULL, return);
@@ -301,7 +315,7 @@ check_remote_node_state(remote_ra_cmd_t *cmd)
if (ra_data) {
if (ra_data->migrate_status != takeover_complete) {
/* Stop means down if we didn't successfully migrate elsewhere */
- remote_node_down(cmd->rsc_id);
+ remote_node_down(cmd->rsc_id, DOWN_KEEP_LRM);
} else if (AM_I_DC == FALSE) {
/* Only the connection host and DC track node state,
* so if the connection migrated elsewhere and we aren't DC,
@@ -1072,3 +1086,57 @@ remote_ra_fail(const char *node_name)
}
}
+/* A guest node fencing implied by host fencing looks like:
+ *
+ * <pseudo_event id="103" operation="stonith" operation_key="stonith-lxc1-off"
+ * on_node="lxc1" on_node_uuid="lxc1">
+ * <attributes CRM_meta_master_lxc_ms="10" CRM_meta_on_node="lxc1"
+ * CRM_meta_on_node_uuid="lxc1" CRM_meta_stonith_action="off"
+ * crm_feature_set="3.0.12"/>
+ * <downed>
+ * <node id="lxc1"/>
+ * </downed>
+ * </pseudo_event>
+ *
+ * The XPath below selects each XML_CIB_TAG_NODE element that is a child of an
+ * XML_GRAPH_TAG_DOWNED element directly under any XML_GRAPH_TAG_PSEUDO_EVENT
+ * whose XML_LRM_ATTR_TASK attribute equals 'stonith'. Presumably these are the
+ * <node> entries under <downed> in the example above; the tag and attribute
+ * name constants are defined elsewhere -- confirm against their definitions.
+ */
+#define XPATH_PSEUDO_FENCE "//" XML_GRAPH_TAG_PSEUDO_EVENT \
+ "[@" XML_LRM_ATTR_TASK "='stonith']/" XML_GRAPH_TAG_DOWNED \
+ "/" XML_CIB_TAG_NODE
+
+/*!
+ * \internal
+ * \brief Check a pseudo-action for Pacemaker Remote node side effects
+ *
+ * Searches \p xml with XPATH_PSEUDO_FENCE. Only when the search yields exactly
+ * one result, that result is non-NULL, and ID() of it is non-NULL, is
+ * remote_node_down() called for that ID with DOWN_ERASE_LRM. Every other
+ * outcome is ignored without logging. The search object is freed on all paths.
+ *
+ * NOTE(review): because of the "== 1" test, a pseudo-action whose search
+ * returns more than one downed node is skipped entirely -- confirm that such
+ * a pseudo-action cannot occur or that skipping it is intended.
+ *
+ * \param[in] xml XML of pseudo-action to check
+ */
+void
+remote_ra_process_pseudo(xmlNode *xml)
+{
+ xmlXPathObjectPtr search = xpath_search(xml, XPATH_PSEUDO_FENCE);
+
+ if (numXpathResults(search) == 1) {
+ xmlNode *result = getXpathResult(search, 0);
+
+ /* Normally, we handle the necessary side effects of a guest node stop
+ * action when reporting the remote agent's result. However, if the stop
+ * is implied due to fencing, it will be a fencing pseudo-event, and
+ * there won't be a result to report. Handle that case here.
+ *
+ * This will result in a duplicate call to remote_node_down() if the
+ * guest stop was real instead of implied, but that shouldn't hurt.
+ *
+ * There is still one corner case that isn't handled: if a guest node
+ * isn't running any resources when its host is fenced, it will appear
+ * to be cleanly stopped, so there will be no pseudo-fence, and our
+ * peer cache state will be incorrect unless and until the guest is
+ * recovered.
+ */
+ if (result) {
+ const char *remote = ID(result);
+
+ if (remote) {
+ remote_node_down(remote, DOWN_ERASE_LRM);
+ }
+ }
+ }
+ freeXpathObject(search);
+}
diff --git a/crmd/te_actions.c b/crmd/te_actions.c
index c971273..01538af 100644
--- a/crmd/te_actions.c
+++ b/crmd/te_actions.c
@@ -27,6 +27,7 @@
#include <tengine.h>
#include <crmd_fsa.h>
+#include <crmd_lrm.h>
#include <crmd_messages.h>
#include <crm/cluster.h>
#include <throttle.h>
@@ -52,6 +53,9 @@ te_start_action_timer(crm_graph_t * graph, crm_action_t * action)
static gboolean
te_pseudo_action(crm_graph_t * graph, crm_action_t * pseudo)
{
+ /* Check action for Pacemaker Remote node side effects */
+ remote_ra_process_pseudo(pseudo->xml);
+
crm_debug("Pseudo-action %d (%s) fired and confirmed", pseudo->id,
crm_element_value(pseudo->xml, XML_LRM_ATTR_TASK_KEY));
te_action_confirmed(pseudo);
--
1.8.3.1