|
 |
eae27e |
From 92dd7d7616dc16d345ef73d0685b12e06d09b36b Mon Sep 17 00:00:00 2001
|
|
 |
eae27e |
From: Ken Gaillot <kgaillot@redhat.com>
|
|
 |
eae27e |
Date: Fri, 15 Apr 2016 15:04:03 -0500
|
|
 |
eae27e |
Subject: [PATCH] Fix: crmd: update cache status for guest node whose host is
|
|
 |
eae27e |
fenced
|
|
 |
eae27e |
|
|
 |
eae27e |
Normally, the remote RA's stop action handles setting the peer cache state
|
|
 |
eae27e |
to down (along with other side effects of a stop). However, if a guest node's
|
|
 |
eae27e |
host is fenced, the RA will not be called. Check for the fencing pseudo-action
|
|
 |
eae27e |
created by the pengine in this case.
|
|
 |
eae27e |
---
|
|
 |
eae27e |
crmd/crmd_lrm.h | 1 +
|
|
 |
eae27e |
crmd/remote_lrmd_ra.c | 72 +++++++++++++++++++++++++++++++++++++++++++++++++--
|
|
 |
eae27e |
crmd/te_actions.c | 4 +++
|
|
 |
eae27e |
3 files changed, 75 insertions(+), 2 deletions(-)
|
|
 |
eae27e |
|
|
 |
eae27e |
diff --git a/crmd/crmd_lrm.h b/crmd/crmd_lrm.h
|
|
 |
eae27e |
index 412ce5b..08ba947 100644
|
|
 |
eae27e |
--- a/crmd/crmd_lrm.h
|
|
 |
eae27e |
+++ b/crmd/crmd_lrm.h
|
|
 |
eae27e |
@@ -160,5 +160,6 @@ int remote_ra_exec(lrm_state_t * lrm_state, const char *rsc_id, const char *acti
|
|
 |
eae27e |
lrmd_key_value_t * params);
|
|
 |
eae27e |
void remote_ra_cleanup(lrm_state_t * lrm_state);
|
|
 |
eae27e |
void remote_ra_fail(const char *node_name);
|
|
 |
eae27e |
+void remote_ra_process_pseudo(xmlNode *xml);
|
|
 |
eae27e |
|
|
 |
eae27e |
gboolean process_lrm_event(lrm_state_t * lrm_state, lrmd_event_data_t * op, struct recurring_op_s *pending);
|
|
 |
eae27e |
diff --git a/crmd/remote_lrmd_ra.c b/crmd/remote_lrmd_ra.c
|
|
 |
eae27e |
index b9c5068..eb995ea 100644
|
|
 |
eae27e |
--- a/crmd/remote_lrmd_ra.c
|
|
 |
eae27e |
+++ b/crmd/remote_lrmd_ra.c
|
|
 |
eae27e |
@@ -226,14 +226,20 @@ remote_node_up(const char *node_name)
|
|
 |
eae27e |
free_xml(update);
|
|
 |
eae27e |
}
|
|
 |
eae27e |
|
|
 |
eae27e |
+enum down_opts {
|
|
 |
eae27e |
+ DOWN_KEEP_LRM,
|
|
 |
eae27e |
+ DOWN_ERASE_LRM
|
|
 |
eae27e |
+};
|
|
 |
eae27e |
+
|
|
 |
eae27e |
/*!
|
|
 |
eae27e |
* \internal
|
|
 |
eae27e |
* \brief Handle cluster communication related to pacemaker_remote node leaving
|
|
 |
eae27e |
*
|
|
 |
eae27e |
* \param[in] node_name Name of lost node
|
|
 |
eae27e |
+ * \param[in] opts Whether to keep or erase LRM history
|
|
 |
eae27e |
*/
|
|
 |
eae27e |
static void
|
|
 |
eae27e |
-remote_node_down(const char *node_name)
|
|
 |
eae27e |
+remote_node_down(const char *node_name, const enum down_opts opts)
|
|
 |
eae27e |
{
|
|
 |
eae27e |
xmlNode *update;
|
|
 |
eae27e |
int call_id = 0;
|
|
 |
eae27e |
@@ -246,6 +252,14 @@ remote_node_down(const char *node_name)
|
|
 |
eae27e |
/* Purge node's transient attributes */
|
|
 |
eae27e |
erase_status_tag(node_name, XML_TAG_TRANSIENT_NODEATTRS, call_opt);
|
|
 |
eae27e |
|
|
 |
eae27e |
+ /* Normally, the LRM operation history should be kept until the node comes
|
|
 |
eae27e |
+ * back up. However, after a successful fence, we want to clear it, so we
|
|
 |
eae27e |
+ * don't think resources are still running on the node.
|
|
 |
eae27e |
+ */
|
|
 |
eae27e |
+ if (opts == DOWN_ERASE_LRM) {
|
|
 |
eae27e |
+ erase_status_tag(node_name, XML_CIB_TAG_LRM, call_opt);
|
|
 |
eae27e |
+ }
|
|
 |
eae27e |
+
|
|
 |
eae27e |
/* Ensure node is in the remote peer cache with lost state */
|
|
 |
eae27e |
node = crm_remote_peer_get(node_name);
|
|
 |
eae27e |
CRM_CHECK(node != NULL, return);
|
|
 |
eae27e |
@@ -301,7 +315,7 @@ check_remote_node_state(remote_ra_cmd_t *cmd)
|
|
 |
eae27e |
if (ra_data) {
|
|
 |
eae27e |
if (ra_data->migrate_status != takeover_complete) {
|
|
 |
eae27e |
/* Stop means down if we didn't successfully migrate elsewhere */
|
|
 |
eae27e |
- remote_node_down(cmd->rsc_id);
|
|
 |
eae27e |
+ remote_node_down(cmd->rsc_id, DOWN_KEEP_LRM);
|
|
 |
eae27e |
} else if (AM_I_DC == FALSE) {
|
|
 |
eae27e |
/* Only the connection host and DC track node state,
|
|
 |
eae27e |
* so if the connection migrated elsewhere and we aren't DC,
|
|
 |
eae27e |
@@ -1072,3 +1086,57 @@ remote_ra_fail(const char *node_name)
|
|
 |
eae27e |
}
|
|
 |
eae27e |
}
|
|
 |
eae27e |
|
|
 |
eae27e |
+/* A guest node fencing implied by host fencing looks like:
|
|
 |
eae27e |
+ *
|
|
 |
eae27e |
+ * <pseudo_event id="103" operation="stonith" operation_key="stonith-lxc1-off"
|
|
 |
eae27e |
+ * on_node="lxc1" on_node_uuid="lxc1">
|
|
 |
eae27e |
+ * <attributes CRM_meta_master_lxc_ms="10" CRM_meta_on_node="lxc1"
|
|
 |
eae27e |
+ * CRM_meta_on_node_uuid="lxc1" CRM_meta_stonith_action="off"
|
|
 |
eae27e |
+ * crm_feature_set="3.0.12"/>
|
|
 |
eae27e |
+ * <downed>
|
|
 |
eae27e |
+ * <node id="lxc1"/>
|
|
 |
eae27e |
+ * </downed>
|
|
 |
eae27e |
+ * </pseudo_event>
|
|
 |
eae27e |
+ */
|
|
 |
eae27e |
+#define XPATH_PSEUDO_FENCE "//" XML_GRAPH_TAG_PSEUDO_EVENT \
|
|
 |
eae27e |
+ "[@" XML_LRM_ATTR_TASK "='stonith']/" XML_GRAPH_TAG_DOWNED \
|
|
 |
eae27e |
+ "/" XML_CIB_TAG_NODE
|
|
 |
eae27e |
+
|
|
 |
eae27e |
+/*!
|
|
 |
eae27e |
+ * \internal
|
|
 |
eae27e |
+ * \brief Check a pseudo-action for Pacemaker Remote node side effects
|
|
 |
eae27e |
+ *
|
|
 |
eae27e |
+ * \param[in] xml XML of pseudo-action to check
|
|
 |
eae27e |
+ */
|
|
 |
eae27e |
+void
|
|
 |
eae27e |
+remote_ra_process_pseudo(xmlNode *xml)
|
|
 |
eae27e |
+{
|
|
 |
eae27e |
+ xmlXPathObjectPtr search = xpath_search(xml, XPATH_PSEUDO_FENCE);
|
|
 |
eae27e |
+
|
|
 |
eae27e |
+ if (numXpathResults(search) == 1) {
|
|
 |
eae27e |
+ xmlNode *result = getXpathResult(search, 0);
|
|
 |
eae27e |
+
|
|
 |
eae27e |
+ /* Normally, we handle the necessary side effects of a guest node stop
|
|
 |
eae27e |
+ * action when reporting the remote agent's result. However, if the stop
|
|
 |
eae27e |
+ * is implied due to fencing, it will be a fencing pseudo-event, and
|
|
 |
eae27e |
+ * there won't be a result to report. Handle that case here.
|
|
 |
eae27e |
+ *
|
|
 |
eae27e |
+ * This will result in a duplicate call to remote_node_down() if the
|
|
 |
eae27e |
+ * guest stop was real instead of implied, but that shouldn't hurt.
|
|
 |
eae27e |
+ *
|
|
 |
eae27e |
+ * There is still one corner case that isn't handled: if a guest node
|
|
 |
eae27e |
+ * isn't running any resources when its host is fenced, it will appear
|
|
 |
eae27e |
+ * to be cleanly stopped, so there will be no pseudo-fence, and our
|
|
 |
eae27e |
+ * peer cache state will be incorrect unless and until the guest is
|
|
 |
eae27e |
+ * recovered.
|
|
 |
eae27e |
+ */
|
|
 |
eae27e |
+ if (result) {
|
|
 |
eae27e |
+ const char *remote = ID(result);
|
|
 |
eae27e |
+
|
|
 |
eae27e |
+ if (remote) {
|
|
 |
eae27e |
+ remote_node_down(remote, DOWN_ERASE_LRM);
|
|
 |
eae27e |
+ }
|
|
 |
eae27e |
+ }
|
|
 |
eae27e |
+ }
|
|
 |
eae27e |
+ freeXpathObject(search);
|
|
 |
eae27e |
+}
|
|
 |
eae27e |
diff --git a/crmd/te_actions.c b/crmd/te_actions.c
|
|
 |
eae27e |
index c971273..01538af 100644
|
|
 |
eae27e |
--- a/crmd/te_actions.c
|
|
 |
eae27e |
+++ b/crmd/te_actions.c
|
|
 |
eae27e |
@@ -27,6 +27,7 @@
|
|
 |
eae27e |
#include <tengine.h>
|
|
 |
eae27e |
|
|
 |
eae27e |
#include <crmd_fsa.h>
|
|
 |
eae27e |
+#include <crmd_lrm.h>
|
|
 |
eae27e |
#include <crmd_messages.h>
|
|
 |
eae27e |
#include <crm/cluster.h>
|
|
 |
eae27e |
#include <throttle.h>
|
|
 |
eae27e |
@@ -52,6 +53,9 @@ te_start_action_timer(crm_graph_t * graph, crm_action_t * action)
|
|
 |
eae27e |
static gboolean
|
|
 |
eae27e |
te_pseudo_action(crm_graph_t * graph, crm_action_t * pseudo)
|
|
 |
eae27e |
{
|
|
 |
eae27e |
+ /* Check action for Pacemaker Remote node side effects */
|
|
 |
eae27e |
+ remote_ra_process_pseudo(pseudo->xml);
|
|
 |
eae27e |
+
|
|
 |
eae27e |
crm_debug("Pseudo-action %d (%s) fired and confirmed", pseudo->id,
|
|
 |
eae27e |
crm_element_value(pseudo->xml, XML_LRM_ATTR_TASK_KEY));
|
|
 |
eae27e |
te_action_confirmed(pseudo);
|
|
 |
eae27e |
--
|
|
 |
eae27e |
1.8.3.1
|
|
 |
eae27e |
|