Blob Blame History Raw
From 92dd7d7616dc16d345ef73d0685b12e06d09b36b Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Fri, 15 Apr 2016 15:04:03 -0500
Subject: [PATCH] Fix: crmd: update cache status for guest node whose host is
 fenced

Normally, the remote RA's stop action handles setting the peer cache state
to down (along with other side effects of a stop). However, if a guest node's
host is fenced, the RA will not be called. Check for the fencing pseudo-action
created by the pengine in this case.
---
 crmd/crmd_lrm.h       |  1 +
 crmd/remote_lrmd_ra.c | 72 +++++++++++++++++++++++++++++++++++++++++++++++++--
 crmd/te_actions.c     |  4 +++
 3 files changed, 75 insertions(+), 2 deletions(-)

diff --git a/crmd/crmd_lrm.h b/crmd/crmd_lrm.h
index 412ce5b..08ba947 100644
--- a/crmd/crmd_lrm.h
+++ b/crmd/crmd_lrm.h
@@ -160,5 +160,6 @@ int remote_ra_exec(lrm_state_t * lrm_state, const char *rsc_id, const char *acti
                    lrmd_key_value_t * params);
 void remote_ra_cleanup(lrm_state_t * lrm_state);
 void remote_ra_fail(const char *node_name);
+void remote_ra_process_pseudo(xmlNode *xml);
 
 gboolean process_lrm_event(lrm_state_t * lrm_state, lrmd_event_data_t * op, struct recurring_op_s *pending);
diff --git a/crmd/remote_lrmd_ra.c b/crmd/remote_lrmd_ra.c
index b9c5068..eb995ea 100644
--- a/crmd/remote_lrmd_ra.c
+++ b/crmd/remote_lrmd_ra.c
@@ -226,14 +226,20 @@ remote_node_up(const char *node_name)
     free_xml(update);
 }
 
+enum down_opts {
+    DOWN_KEEP_LRM,      /* leave the node's LRM history in place */
+    DOWN_ERASE_LRM      /* additionally erase the node's LRM history */
+};
+
 /*!
  * \internal
  * \brief Handle cluster communication related to pacemaker_remote node leaving
  *
  * \param[in] node_name  Name of lost node
+ * \param[in] opts       Whether to keep or erase LRM history
  */
 static void
-remote_node_down(const char *node_name)
+remote_node_down(const char *node_name, const enum down_opts opts)
 {
     xmlNode *update;
     int call_id = 0;
@@ -246,6 +252,14 @@ remote_node_down(const char *node_name)
     /* Purge node's transient attributes */
     erase_status_tag(node_name, XML_TAG_TRANSIENT_NODEATTRS, call_opt);
 
+    /* Normally, the LRM operation history should be kept until the node comes
+     * back up. However, after a successful fence, we want to clear it, so we
+     * don't think resources are still running on the node.
+     */
+    if (opts == DOWN_ERASE_LRM) {
+        erase_status_tag(node_name, XML_CIB_TAG_LRM, call_opt);
+    }
+
     /* Ensure node is in the remote peer cache with lost state */
     node = crm_remote_peer_get(node_name);
     CRM_CHECK(node != NULL, return);
@@ -301,7 +315,7 @@ check_remote_node_state(remote_ra_cmd_t *cmd)
         if (ra_data) {
             if (ra_data->migrate_status != takeover_complete) {
                 /* Stop means down if we didn't successfully migrate elsewhere */
-                remote_node_down(cmd->rsc_id);
+                remote_node_down(cmd->rsc_id, DOWN_KEEP_LRM);
             } else if (AM_I_DC == FALSE) {
                 /* Only the connection host and DC track node state,
                  * so if the connection migrated elsewhere and we aren't DC,
@@ -1072,3 +1086,57 @@ remote_ra_fail(const char *node_name)
     }
 }
 
+/* A guest node fencing implied by host fencing looks like:
+ *
+ *  <pseudo_event id="103" operation="stonith" operation_key="stonith-lxc1-off"
+ *                on_node="lxc1" on_node_uuid="lxc1">
+ *     <attributes CRM_meta_master_lxc_ms="10" CRM_meta_on_node="lxc1"
+ *                 CRM_meta_on_node_uuid="lxc1" CRM_meta_stonith_action="off"
+ *                 crm_feature_set="3.0.12"/>
+ *     <downed>
+ *       <node id="lxc1"/>
+ *     </downed>
+ *  </pseudo_event>
+ */
+#define XPATH_PSEUDO_FENCE "//" XML_GRAPH_TAG_PSEUDO_EVENT \
+    "[@" XML_LRM_ATTR_TASK "='stonith']/" XML_GRAPH_TAG_DOWNED \
+    "/" XML_CIB_TAG_NODE
+
+/*!
+ * \internal
+ * \brief Check a pseudo-action for Pacemaker Remote node side effects
+ *
+ * \param[in] xml  XML of pseudo-action to check
+ */
+void
+remote_ra_process_pseudo(xmlNode *xml)
+{
+    xmlXPathObjectPtr search = xpath_search(xml, XPATH_PSEUDO_FENCE);
+
+    if (numXpathResults(search) == 1) {
+        xmlNode *result = getXpathResult(search, 0);
+
+        /* Normally, we handle the necessary side effects of a guest node stop
+         * action when reporting the remote agent's result. However, if the stop
+         * is implied due to fencing, it will be a fencing pseudo-event, and
+         * there won't be a result to report. Handle that case here.
+         *
+         * If the guest stop was real instead of implied, this results in a
+         * second call to remote_node_down() (this time also erasing the LRM
+         * history), but that shouldn't hurt.
+         *
+         * One corner case is still unhandled: a guest node running no
+         * resources when its host is fenced appears cleanly stopped, so there
+         * is no pseudo-fence, and our peer cache state stays incorrect unless
+         * and until the guest is recovered.
+         */
+        if (result) {
+            const char *remote = ID(result);
+
+            if (remote) {
+                remote_node_down(remote, DOWN_ERASE_LRM);
+            }
+        }
+    }
+    freeXpathObject(search);
+}
diff --git a/crmd/te_actions.c b/crmd/te_actions.c
index c971273..01538af 100644
--- a/crmd/te_actions.c
+++ b/crmd/te_actions.c
@@ -27,6 +27,7 @@
 #include <tengine.h>
 
 #include <crmd_fsa.h>
+#include <crmd_lrm.h>
 #include <crmd_messages.h>
 #include <crm/cluster.h>
 #include <throttle.h>
@@ -52,6 +53,9 @@ te_start_action_timer(crm_graph_t * graph, crm_action_t * action)
 static gboolean
 te_pseudo_action(crm_graph_t * graph, crm_action_t * pseudo)
 {
+    /* Check action for Pacemaker Remote node side effects */
+    remote_ra_process_pseudo(pseudo->xml);
+
     crm_debug("Pseudo-action %d (%s) fired and confirmed", pseudo->id,
               crm_element_value(pseudo->xml, XML_LRM_ATTR_TASK_KEY));
     te_action_confirmed(pseudo);
-- 
1.8.3.1