Blame SOURCES/019-guest-fencing-status.patch

eae27e
From 92dd7d7616dc16d345ef73d0685b12e06d09b36b Mon Sep 17 00:00:00 2001
eae27e
From: Ken Gaillot <kgaillot@redhat.com>
eae27e
Date: Fri, 15 Apr 2016 15:04:03 -0500
eae27e
Subject: [PATCH] Fix: crmd: update cache status for guest node whose host is
eae27e
 fenced
eae27e
eae27e
Normally, the remote RA's stop action handles setting the peer cache state
eae27e
to down (along with other side effects of a stop). However, if a guest node's
eae27e
host is fenced, the RA will not be called. Check for the fencing pseudo-action
eae27e
created by the pengine in this case.
eae27e
---
eae27e
 crmd/crmd_lrm.h       |  1 +
eae27e
 crmd/remote_lrmd_ra.c | 72 +++++++++++++++++++++++++++++++++++++++++++++++++--
eae27e
 crmd/te_actions.c     |  4 +++
eae27e
 3 files changed, 75 insertions(+), 2 deletions(-)
eae27e
eae27e
diff --git a/crmd/crmd_lrm.h b/crmd/crmd_lrm.h
eae27e
index 412ce5b..08ba947 100644
eae27e
--- a/crmd/crmd_lrm.h
eae27e
+++ b/crmd/crmd_lrm.h
eae27e
@@ -160,5 +160,6 @@ int remote_ra_exec(lrm_state_t * lrm_state, const char *rsc_id, const char *acti
eae27e
                    lrmd_key_value_t * params);
eae27e
 void remote_ra_cleanup(lrm_state_t * lrm_state);
eae27e
 void remote_ra_fail(const char *node_name);
eae27e
+void remote_ra_process_pseudo(xmlNode *xml);
eae27e
 
eae27e
 gboolean process_lrm_event(lrm_state_t * lrm_state, lrmd_event_data_t * op, struct recurring_op_s *pending);
eae27e
diff --git a/crmd/remote_lrmd_ra.c b/crmd/remote_lrmd_ra.c
eae27e
index b9c5068..eb995ea 100644
eae27e
--- a/crmd/remote_lrmd_ra.c
eae27e
+++ b/crmd/remote_lrmd_ra.c
eae27e
@@ -226,14 +226,20 @@ remote_node_up(const char *node_name)
eae27e
     free_xml(update);
eae27e
 }
eae27e
 
eae27e
+enum down_opts {
eae27e
+    DOWN_KEEP_LRM,
eae27e
+    DOWN_ERASE_LRM
eae27e
+};
eae27e
+
eae27e
 /*!
eae27e
  * \internal
eae27e
  * \brief Handle cluster communication related to pacemaker_remote node leaving
eae27e
  *
eae27e
  * \param[in] node_name  Name of lost node
eae27e
+ * \param[in] opts       Whether to keep or erase LRM history
eae27e
  */
eae27e
 static void
eae27e
-remote_node_down(const char *node_name)
eae27e
+remote_node_down(const char *node_name, const enum down_opts opts)
eae27e
 {
eae27e
     xmlNode *update;
eae27e
     int call_id = 0;
eae27e
@@ -246,6 +252,14 @@ remote_node_down(const char *node_name)
eae27e
     /* Purge node's transient attributes */
eae27e
     erase_status_tag(node_name, XML_TAG_TRANSIENT_NODEATTRS, call_opt);
eae27e
 
eae27e
+    /* Normally, the LRM operation history should be kept until the node comes
eae27e
+     * back up. However, after a successful fence, we want to clear it, so we
eae27e
+     * don't think resources are still running on the node.
eae27e
+     */
eae27e
+    if (opts == DOWN_ERASE_LRM) {
eae27e
+        erase_status_tag(node_name, XML_CIB_TAG_LRM, call_opt);
eae27e
+    }
eae27e
+
eae27e
     /* Ensure node is in the remote peer cache with lost state */
eae27e
     node = crm_remote_peer_get(node_name);
eae27e
     CRM_CHECK(node != NULL, return);
eae27e
@@ -301,7 +315,7 @@ check_remote_node_state(remote_ra_cmd_t *cmd)
eae27e
         if (ra_data) {
eae27e
             if (ra_data->migrate_status != takeover_complete) {
eae27e
                 /* Stop means down if we didn't successfully migrate elsewhere */
eae27e
-                remote_node_down(cmd->rsc_id);
eae27e
+                remote_node_down(cmd->rsc_id, DOWN_KEEP_LRM);
eae27e
             } else if (AM_I_DC == FALSE) {
eae27e
                 /* Only the connection host and DC track node state,
eae27e
                  * so if the connection migrated elsewhere and we aren't DC,
eae27e
@@ -1072,3 +1086,57 @@ remote_ra_fail(const char *node_name)
eae27e
     }
eae27e
 }
eae27e
 
eae27e
+/* A guest node fencing implied by host fencing looks like:
eae27e
+ *
eae27e
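+ *  <pseudo_event id="103" operation="stonith" operation_key="stonith-lxc1-off"
eae27e
+ *                on_node="lxc1" on_node_uuid="lxc1">
eae27e
+ *     <attributes CRM_meta_master_lxc_ms="10" CRM_meta_on_node="lxc1"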
eae27e
+ *                 CRM_meta_on_node_uuid="lxc1" CRM_meta_stonith_action="off"
eae27e
+ *                 crm_feature_set="3.0.12"/>
eae27e
+ *     <downed>
eae27e
+ *       <node id="lxc1"/>
eae27e
+ *     </downed>
eae27e
+ *  </pseudo_event>
eae27e
+ */
eae27e
+#define XPATH_PSEUDO_FENCE "//" XML_GRAPH_TAG_PSEUDO_EVENT \
eae27e
+    "[@" XML_LRM_ATTR_TASK "='stonith']/" XML_GRAPH_TAG_DOWNED \
eae27e
+    "/" XML_CIB_TAG_NODE
eae27e
+
eae27e
+/*!
eae27e
+ * \internal
eae27e
+ * \brief Check a pseudo-action for Pacemaker Remote node side effects
eae27e
+ *
eae27e
+ * \param[in] xml  XML of pseudo-action to check
eae27e
+ */
eae27e
+void
eae27e
+remote_ra_process_pseudo(xmlNode *xml)
eae27e
+{
eae27e
+    xmlXPathObjectPtr search = xpath_search(xml, XPATH_PSEUDO_FENCE);
eae27e
+
eae27e
+    if (numXpathResults(search) == 1) {
eae27e
+        xmlNode *result = getXpathResult(search, 0);
eae27e
+
eae27e
+        /* Normally, we handle the necessary side effects of a guest node stop
eae27e
+         * action when reporting the remote agent's result. However, if the stop
eae27e
+         * is implied due to fencing, it will be a fencing pseudo-event, and
eae27e
+         * there won't be a result to report. Handle that case here.
eae27e
+         *
eae27e
+         * This will result in a duplicate call to remote_node_down() if the
eae27e
+         * guest stop was real instead of implied, but that shouldn't hurt.
eae27e
+         *
eae27e
+         * There is still one corner case that isn't handled: if a guest node
eae27e
+         * isn't running any resources when its host is fenced, it will appear
eae27e
+         * to be cleanly stopped, so there will be no pseudo-fence, and our
eae27e
+         * peer cache state will be incorrect unless and until the guest is
eae27e
+         * recovered.
eae27e
+         */
eae27e
+        if (result) {
eae27e
+            const char *remote = ID(result);
eae27e
+
eae27e
+            if (remote) {
eae27e
+                remote_node_down(remote, DOWN_ERASE_LRM);
eae27e
+            }
eae27e
+        }
eae27e
+    }
eae27e
+    freeXpathObject(search);
eae27e
+}
eae27e
diff --git a/crmd/te_actions.c b/crmd/te_actions.c
eae27e
index c971273..01538af 100644
eae27e
--- a/crmd/te_actions.c
eae27e
+++ b/crmd/te_actions.c
eae27e
@@ -27,6 +27,7 @@
eae27e
 #include <tengine.h>
eae27e
 
eae27e
 #include <crmd_fsa.h>
eae27e
+#include <crmd_lrm.h>
eae27e
 #include <crmd_messages.h>
eae27e
 #include <crm/cluster.h>
eae27e
 #include <throttle.h>
eae27e
@@ -52,6 +53,9 @@ te_start_action_timer(crm_graph_t * graph, crm_action_t * action)
eae27e
 static gboolean
eae27e
 te_pseudo_action(crm_graph_t * graph, crm_action_t * pseudo)
eae27e
 {
eae27e
+    /* Check action for Pacemaker Remote node side effects */
eae27e
+    remote_ra_process_pseudo(pseudo->xml);
eae27e
+
eae27e
     crm_debug("Pseudo-action %d (%s) fired and confirmed", pseudo->id,
eae27e
               crm_element_value(pseudo->xml, XML_LRM_ATTR_TASK_KEY));
eae27e
     te_action_confirmed(pseudo);
eae27e
-- 
eae27e
1.8.3.1
eae27e