|
|
ed4e54 |
From 223ab7251adcb8c6f6b96def138be58b1478c42b Mon Sep 17 00:00:00 2001
|
|
|
ed4e54 |
From: Ken Gaillot <kgaillot@redhat.com>
|
|
|
ed4e54 |
Date: Fri, 22 Nov 2019 17:03:20 -0600
|
|
|
ed4e54 |
Subject: [PATCH 09/18] Low: controller: mark shutdown-locked resources in
|
|
|
ed4e54 |
resource history
|
|
|
ed4e54 |
|
|
|
ed4e54 |
When a graph action indicates that the resource should be shutdown-locked
|
|
|
ed4e54 |
to its node, record the shutdown lock time in active_op_t so that it is still
|
|
|
ed4e54 |
available when the result comes back. When the result does come back, add
|
|
|
ed4e54 |
"shutdown-lock" to its lrm_resource entry in the CIB status section -- as
|
|
|
ed4e54 |
the timestamp if it's a successful stop or a probe finding the resource
|
|
|
ed4e54 |
inactive, or as 0 to clear the lock for any other operation.
|
|
|
ed4e54 |
---
|
|
|
ed4e54 |
daemons/controld/controld_control.c | 9 ++++-
|
|
|
ed4e54 |
daemons/controld/controld_execd.c | 44 +++++++++++++++++++--
|
|
|
ed4e54 |
daemons/controld/controld_lrm.h | 1 +
|
|
|
ed4e54 |
daemons/controld/controld_te_callbacks.c | 65 ++++++++++++++++++++++----------
|
|
|
ed4e54 |
daemons/controld/controld_utils.h | 1 +
|
|
|
ed4e54 |
5 files changed, 95 insertions(+), 25 deletions(-)
|
|
|
ed4e54 |
|
|
|
ed4e54 |
diff --git a/daemons/controld/controld_control.c b/daemons/controld/controld_control.c
|
|
|
ed4e54 |
index 6c7f97c..c918a1e 100644
|
|
|
ed4e54 |
--- a/daemons/controld/controld_control.c
|
|
|
ed4e54 |
+++ b/daemons/controld/controld_control.c
|
|
|
ed4e54 |
@@ -1,5 +1,5 @@
|
|
|
ed4e54 |
/*
|
|
|
ed4e54 |
- * Copyright 2004-2019 the Pacemaker project contributors
|
|
|
ed4e54 |
+ * Copyright 2004-2020 the Pacemaker project contributors
|
|
|
ed4e54 |
*
|
|
|
ed4e54 |
* The version control history for this file may have further details.
|
|
|
ed4e54 |
*
|
|
|
ed4e54 |
@@ -35,6 +35,7 @@ gboolean fsa_has_quorum = FALSE;
|
|
|
ed4e54 |
crm_trigger_t *fsa_source = NULL;
|
|
|
ed4e54 |
crm_trigger_t *config_read = NULL;
|
|
|
ed4e54 |
bool no_quorum_suicide_escalation = FALSE;
|
|
|
ed4e54 |
+bool controld_shutdown_lock_enabled = false;
|
|
|
ed4e54 |
|
|
|
ed4e54 |
/* A_HA_CONNECT */
|
|
|
ed4e54 |
void
|
|
|
ed4e54 |
@@ -587,7 +588,10 @@ static pe_cluster_option crmd_opts[] = {
|
|
|
ed4e54 |
{ "stonith-max-attempts",NULL,"integer",NULL,"10",&check_positive_number,
|
|
|
ed4e54 |
"How many times stonith can fail before it will no longer be attempted on a target"
|
|
|
ed4e54 |
},
|
|
|
ed4e54 |
+
|
|
|
ed4e54 |
+ // Already documented in libpe_status (other values must be kept identical)
|
|
|
ed4e54 |
{ "no-quorum-policy", NULL, "enum", "stop, freeze, ignore, suicide", "stop", &check_quorum, NULL, NULL },
|
|
|
ed4e54 |
+ { XML_CONFIG_ATTR_SHUTDOWN_LOCK, NULL, "boolean", NULL, "false", &check_boolean, NULL, NULL },
|
|
|
ed4e54 |
};
|
|
|
ed4e54 |
/* *INDENT-ON* */
|
|
|
ed4e54 |
|
|
|
ed4e54 |
@@ -698,6 +702,9 @@ config_query_callback(xmlNode * msg, int call_id, int rc, xmlNode * output, void
|
|
|
ed4e54 |
value = crmd_pref(config_hash, "join-finalization-timeout");
|
|
|
ed4e54 |
finalization_timer->period_ms = crm_parse_interval_spec(value);
|
|
|
ed4e54 |
|
|
|
ed4e54 |
+ value = crmd_pref(config_hash, XML_CONFIG_ATTR_SHUTDOWN_LOCK);
|
|
|
ed4e54 |
+ controld_shutdown_lock_enabled = crm_is_true(value);
|
|
|
ed4e54 |
+
|
|
|
ed4e54 |
free(fsa_cluster_name);
|
|
|
ed4e54 |
fsa_cluster_name = NULL;
|
|
|
ed4e54 |
|
|
|
ed4e54 |
diff --git a/daemons/controld/controld_execd.c b/daemons/controld/controld_execd.c
|
|
|
ed4e54 |
index 17cc8d6..c0436a2 100644
|
|
|
ed4e54 |
--- a/daemons/controld/controld_execd.c
|
|
|
ed4e54 |
+++ b/daemons/controld/controld_execd.c
|
|
|
ed4e54 |
@@ -44,7 +44,8 @@ static void do_lrm_rsc_op(lrm_state_t *lrm_state, lrmd_rsc_info_t *rsc,
|
|
|
ed4e54 |
|
|
|
ed4e54 |
static gboolean lrm_state_verify_stopped(lrm_state_t * lrm_state, enum crmd_fsa_state cur_state,
|
|
|
ed4e54 |
int log_level);
|
|
|
ed4e54 |
-static int do_update_resource(const char *node_name, lrmd_rsc_info_t * rsc, lrmd_event_data_t * op);
|
|
|
ed4e54 |
+static int do_update_resource(const char *node_name, lrmd_rsc_info_t *rsc,
|
|
|
ed4e54 |
+ lrmd_event_data_t *op, time_t lock_time);
|
|
|
ed4e54 |
|
|
|
ed4e54 |
static void
|
|
|
ed4e54 |
lrm_connection_destroy(void)
|
|
|
ed4e54 |
@@ -2171,7 +2172,7 @@ record_pending_op(const char *node_name, lrmd_rsc_info_t *rsc, lrmd_event_data_t
|
|
|
ed4e54 |
crm_debug("Recording pending op " CRM_OP_FMT " on %s in the CIB",
|
|
|
ed4e54 |
op->rsc_id, op->op_type, op->interval_ms, node_name);
|
|
|
ed4e54 |
|
|
|
ed4e54 |
- do_update_resource(node_name, rsc, op);
|
|
|
ed4e54 |
+ do_update_resource(node_name, rsc, op, 0);
|
|
|
ed4e54 |
}
|
|
|
ed4e54 |
|
|
|
ed4e54 |
static void
|
|
|
ed4e54 |
@@ -2313,6 +2314,10 @@ do_lrm_rsc_op(lrm_state_t *lrm_state, lrmd_rsc_info_t *rsc,
|
|
|
ed4e54 |
pending->rsc_id = strdup(rsc->id);
|
|
|
ed4e54 |
pending->start_time = time(NULL);
|
|
|
ed4e54 |
pending->user_data = op->user_data? strdup(op->user_data) : NULL;
|
|
|
ed4e54 |
+ if (crm_element_value_epoch(msg, XML_CONFIG_ATTR_SHUTDOWN_LOCK,
|
|
|
ed4e54 |
+ &(pending->lock_time)) != pcmk_ok) {
|
|
|
ed4e54 |
+ pending->lock_time = 0;
|
|
|
ed4e54 |
+ }
|
|
|
ed4e54 |
g_hash_table_replace(lrm_state->pending_ops, call_id_s, pending);
|
|
|
ed4e54 |
|
|
|
ed4e54 |
if ((op->interval_ms > 0)
|
|
|
ed4e54 |
@@ -2356,8 +2361,28 @@ cib_rsc_callback(xmlNode * msg, int call_id, int rc, xmlNode * output, void *use
|
|
|
ed4e54 |
}
|
|
|
ed4e54 |
}
|
|
|
ed4e54 |
|
|
|
ed4e54 |
+/* Only successful stops, and probes that found the resource inactive, get locks
|
|
|
ed4e54 |
+ * recorded in the history. This ensures the resource stays locked to the node
|
|
|
ed4e54 |
+ * until it is active there again after the node comes back up.
|
|
|
ed4e54 |
+ */
|
|
|
ed4e54 |
+static bool
|
|
|
ed4e54 |
+should_preserve_lock(lrmd_event_data_t *op)
|
|
|
ed4e54 |
+{
|
|
|
ed4e54 |
+ if (!controld_shutdown_lock_enabled) {
|
|
|
ed4e54 |
+ return false;
|
|
|
ed4e54 |
+ }
|
|
|
ed4e54 |
+ if (!strcmp(op->op_type, RSC_STOP) && (op->rc == PCMK_OCF_OK)) {
|
|
|
ed4e54 |
+ return true;
|
|
|
ed4e54 |
+ }
|
|
|
ed4e54 |
+ if (!strcmp(op->op_type, RSC_STATUS) && (op->rc == PCMK_OCF_NOT_RUNNING)) {
|
|
|
ed4e54 |
+ return true;
|
|
|
ed4e54 |
+ }
|
|
|
ed4e54 |
+ return false;
|
|
|
ed4e54 |
+}
|
|
|
ed4e54 |
+
|
|
|
ed4e54 |
static int
|
|
|
ed4e54 |
-do_update_resource(const char *node_name, lrmd_rsc_info_t * rsc, lrmd_event_data_t * op)
|
|
|
ed4e54 |
+do_update_resource(const char *node_name, lrmd_rsc_info_t *rsc,
|
|
|
ed4e54 |
+ lrmd_event_data_t *op, time_t lock_time)
|
|
|
ed4e54 |
{
|
|
|
ed4e54 |
/*
|
|
|
ed4e54 |
<status>
|
|
|
ed4e54 |
@@ -2412,6 +2437,16 @@ do_update_resource(const char *node_name, lrmd_rsc_info_t * rsc, lrmd_event_data
|
|
|
ed4e54 |
crm_xml_add(iter, XML_ATTR_TYPE, rsc->type);
|
|
|
ed4e54 |
crm_xml_add(iter, XML_AGENT_ATTR_CLASS, rsc->standard);
|
|
|
ed4e54 |
crm_xml_add(iter, XML_AGENT_ATTR_PROVIDER, rsc->provider);
|
|
|
ed4e54 |
+ if (lock_time != 0) {
|
|
|
ed4e54 |
+ /* Actions on a locked resource should either preserve the lock by
|
|
|
ed4e54 |
+ * recording it with the action result, or clear it.
|
|
|
ed4e54 |
+ */
|
|
|
ed4e54 |
+ if (!should_preserve_lock(op)) {
|
|
|
ed4e54 |
+ lock_time = 0;
|
|
|
ed4e54 |
+ }
|
|
|
ed4e54 |
+ crm_xml_add_ll(iter, XML_CONFIG_ATTR_SHUTDOWN_LOCK,
|
|
|
ed4e54 |
+ (long long) lock_time);
|
|
|
ed4e54 |
+ }
|
|
|
ed4e54 |
|
|
|
ed4e54 |
if (op->params) {
|
|
|
ed4e54 |
container = g_hash_table_lookup(op->params, CRM_META"_"XML_RSC_ATTR_CONTAINER);
|
|
|
ed4e54 |
@@ -2616,7 +2651,8 @@ process_lrm_event(lrm_state_t *lrm_state, lrmd_event_data_t *op,
|
|
|
ed4e54 |
if (controld_action_is_recordable(op->op_type)) {
|
|
|
ed4e54 |
if (node_name && rsc) {
|
|
|
ed4e54 |
// We should record the result, and happily, we can
|
|
|
ed4e54 |
- update_id = do_update_resource(node_name, rsc, op);
|
|
|
ed4e54 |
+ update_id = do_update_resource(node_name, rsc, op,
|
|
|
ed4e54 |
+ pending? pending->lock_time : 0);
|
|
|
ed4e54 |
need_direct_ack = FALSE;
|
|
|
ed4e54 |
|
|
|
ed4e54 |
} else if (op->rsc_deleted) {
|
|
|
ed4e54 |
diff --git a/daemons/controld/controld_lrm.h b/daemons/controld/controld_lrm.h
|
|
|
ed4e54 |
index 7acac2a..da0582c 100644
|
|
|
ed4e54 |
--- a/daemons/controld/controld_lrm.h
|
|
|
ed4e54 |
+++ b/daemons/controld/controld_lrm.h
|
|
|
ed4e54 |
@@ -46,6 +46,7 @@ typedef struct active_op_s {
|
|
|
ed4e54 |
int call_id;
|
|
|
ed4e54 |
uint32_t flags; // bitmask of active_op_e
|
|
|
ed4e54 |
time_t start_time;
|
|
|
ed4e54 |
+ time_t lock_time;
|
|
|
ed4e54 |
char *rsc_id;
|
|
|
ed4e54 |
char *op_type;
|
|
|
ed4e54 |
char *op_key;
|
|
|
ed4e54 |
diff --git a/daemons/controld/controld_te_callbacks.c b/daemons/controld/controld_te_callbacks.c
|
|
|
ed4e54 |
index 25f0ab2..8506f26 100644
|
|
|
ed4e54 |
--- a/daemons/controld/controld_te_callbacks.c
|
|
|
ed4e54 |
+++ b/daemons/controld/controld_te_callbacks.c
|
|
|
ed4e54 |
@@ -1,5 +1,5 @@
|
|
|
ed4e54 |
/*
|
|
|
ed4e54 |
- * Copyright 2004-2019 the Pacemaker project contributors
|
|
|
ed4e54 |
+ * Copyright 2004-2020 the Pacemaker project contributors
|
|
|
ed4e54 |
*
|
|
|
ed4e54 |
* The version control history for this file may have further details.
|
|
|
ed4e54 |
*
|
|
|
ed4e54 |
@@ -28,6 +28,17 @@ crm_trigger_t *transition_trigger = NULL;
|
|
|
ed4e54 |
/* #define RSC_OP_TEMPLATE "//"XML_TAG_DIFF_ADDED"//"XML_TAG_CIB"//"XML_CIB_TAG_STATE"[@uname='%s']"//"XML_LRM_TAG_RSC_OP"[@id='%s]" */
|
|
|
ed4e54 |
#define RSC_OP_TEMPLATE "//"XML_TAG_DIFF_ADDED"//"XML_TAG_CIB"//"XML_LRM_TAG_RSC_OP"[@id='%s']"
|
|
|
ed4e54 |
|
|
|
ed4e54 |
+// An explicit shutdown-lock of 0 means the lock has been cleared
|
|
|
ed4e54 |
+static bool
|
|
|
ed4e54 |
+shutdown_lock_cleared(xmlNode *lrm_resource)
|
|
|
ed4e54 |
+{
|
|
|
ed4e54 |
+ time_t shutdown_lock = 0;
|
|
|
ed4e54 |
+
|
|
|
ed4e54 |
+ return (crm_element_value_epoch(lrm_resource, XML_CONFIG_ATTR_SHUTDOWN_LOCK,
|
|
|
ed4e54 |
+ &shutdown_lock) == pcmk_ok)
|
|
|
ed4e54 |
+ && (shutdown_lock == 0);
|
|
|
ed4e54 |
+}
|
|
|
ed4e54 |
+
|
|
|
ed4e54 |
static void
|
|
|
ed4e54 |
te_update_diff_v1(const char *event, xmlNode *diff)
|
|
|
ed4e54 |
{
|
|
|
ed4e54 |
@@ -106,33 +117,42 @@ te_update_diff_v1(const char *event, xmlNode *diff)
|
|
|
ed4e54 |
}
|
|
|
ed4e54 |
freeXpathObject(xpathObj);
|
|
|
ed4e54 |
|
|
|
ed4e54 |
+ // Check for lrm_resource entries
|
|
|
ed4e54 |
+ xpathObj = xpath_search(diff,
|
|
|
ed4e54 |
+ "//" F_CIB_UPDATE_RESULT
|
|
|
ed4e54 |
+ "//" XML_TAG_DIFF_ADDED
|
|
|
ed4e54 |
+ "//" XML_LRM_TAG_RESOURCE);
|
|
|
ed4e54 |
+ max = numXpathResults(xpathObj);
|
|
|
ed4e54 |
+
|
|
|
ed4e54 |
/*
|
|
|
ed4e54 |
- * Updates by, or in response to, TE actions will never contain updates
|
|
|
ed4e54 |
- * for more than one resource at a time, so such updates indicate an
|
|
|
ed4e54 |
- * LRM refresh.
|
|
|
ed4e54 |
- *
|
|
|
ed4e54 |
- * In that case, start a new transition rather than check each result
|
|
|
ed4e54 |
- * individually, which can result in _huge_ speedups in large clusters.
|
|
|
ed4e54 |
+ * Updates by, or in response to, graph actions will never affect more than
|
|
|
ed4e54 |
+ * one resource at a time, so such updates indicate an LRM refresh. In that
|
|
|
ed4e54 |
+ * case, start a new transition rather than check each result individually,
|
|
|
ed4e54 |
+ * which can result in _huge_ speedups in large clusters.
|
|
|
ed4e54 |
*
|
|
|
ed4e54 |
* Unfortunately, we can only do so when there are no pending actions.
|
|
|
ed4e54 |
* Otherwise, we could mistakenly throw away those results here, and
|
|
|
ed4e54 |
* the cluster will stall waiting for them and time out the operation.
|
|
|
ed4e54 |
*/
|
|
|
ed4e54 |
- if (transition_graph->pending == 0) {
|
|
|
ed4e54 |
- xpathObj = xpath_search(diff,
|
|
|
ed4e54 |
- "//" F_CIB_UPDATE_RESULT
|
|
|
ed4e54 |
- "//" XML_TAG_DIFF_ADDED
|
|
|
ed4e54 |
- "//" XML_LRM_TAG_RESOURCE);
|
|
|
ed4e54 |
- max = numXpathResults(xpathObj);
|
|
|
ed4e54 |
- if (max > 1) {
|
|
|
ed4e54 |
- crm_debug("Ignoring resource operation updates due to history refresh of %d resources",
|
|
|
ed4e54 |
- max);
|
|
|
ed4e54 |
- crm_log_xml_trace(diff, "lrm-refresh");
|
|
|
ed4e54 |
- abort_transition(INFINITY, tg_restart, "History refresh", NULL);
|
|
|
ed4e54 |
- goto bail;
|
|
|
ed4e54 |
+ if ((transition_graph->pending == 0) && (max > 1)) {
|
|
|
ed4e54 |
+ crm_debug("Ignoring resource operation updates due to history refresh of %d resources",
|
|
|
ed4e54 |
+ max);
|
|
|
ed4e54 |
+ crm_log_xml_trace(diff, "lrm-refresh");
|
|
|
ed4e54 |
+ abort_transition(INFINITY, tg_restart, "History refresh", NULL);
|
|
|
ed4e54 |
+ goto bail;
|
|
|
ed4e54 |
+ }
|
|
|
ed4e54 |
+
|
|
|
ed4e54 |
+ if (max == 1) {
|
|
|
ed4e54 |
+ xmlNode *lrm_resource = getXpathResult(xpathObj, 0);
|
|
|
ed4e54 |
+
|
|
|
ed4e54 |
+ if (shutdown_lock_cleared(lrm_resource)) {
|
|
|
ed4e54 |
+ // @TODO would be more efficient to abort once after transition done
|
|
|
ed4e54 |
+ abort_transition(INFINITY, tg_restart, "Shutdown lock cleared",
|
|
|
ed4e54 |
+ lrm_resource);
|
|
|
ed4e54 |
+ // Still process results, so we stop timers and update failcounts
|
|
|
ed4e54 |
}
|
|
|
ed4e54 |
- freeXpathObject(xpathObj);
|
|
|
ed4e54 |
}
|
|
|
ed4e54 |
+ freeXpathObject(xpathObj);
|
|
|
ed4e54 |
|
|
|
ed4e54 |
/* Process operation updates */
|
|
|
ed4e54 |
xpathObj =
|
|
|
ed4e54 |
@@ -205,6 +225,11 @@ process_lrm_resource_diff(xmlNode *lrm_resource, const char *node)
|
|
|
ed4e54 |
rsc_op = __xml_next(rsc_op)) {
|
|
|
ed4e54 |
process_graph_event(rsc_op, node);
|
|
|
ed4e54 |
}
|
|
|
ed4e54 |
+ if (shutdown_lock_cleared(lrm_resource)) {
|
|
|
ed4e54 |
+ // @TODO would be more efficient to abort once after transition done
|
|
|
ed4e54 |
+ abort_transition(INFINITY, tg_restart, "Shutdown lock cleared",
|
|
|
ed4e54 |
+ lrm_resource);
|
|
|
ed4e54 |
+ }
|
|
|
ed4e54 |
}
|
|
|
ed4e54 |
|
|
|
ed4e54 |
static void
|
|
|
ed4e54 |
diff --git a/daemons/controld/controld_utils.h b/daemons/controld/controld_utils.h
|
|
|
ed4e54 |
index ca8cddb..8e31007 100644
|
|
|
ed4e54 |
--- a/daemons/controld/controld_utils.h
|
|
|
ed4e54 |
+++ b/daemons/controld/controld_utils.h
|
|
|
ed4e54 |
@@ -41,6 +41,7 @@ fsa_cib_anon_update(const char *section, xmlNode *data) {
|
|
|
ed4e54 |
}
|
|
|
ed4e54 |
|
|
|
ed4e54 |
extern gboolean fsa_has_quorum;
|
|
|
ed4e54 |
+extern bool controld_shutdown_lock_enabled;
|
|
|
ed4e54 |
extern int last_peer_update;
|
|
|
ed4e54 |
extern int last_resource_update;
|
|
|
ed4e54 |
|
|
|
ed4e54 |
--
|
|
|
ed4e54 |
1.8.3.1
|
|
|
ed4e54 |
|