|
|
60de42 |
From 357cd703e99fbcf1a371f34966accaf5322b1c50 Mon Sep 17 00:00:00 2001
|
|
|
60de42 |
From: Ken Gaillot <kgaillot@redhat.com>
|
|
|
60de42 |
Date: Wed, 22 Feb 2017 14:14:48 -0600
|
|
|
60de42 |
Subject: [PATCH 1/2] Fix: pengine,libpe_status: don't clear same fail-count
|
|
|
60de42 |
twice
|
|
|
60de42 |
|
|
|
60de42 |
Previously, pengine and libpe_status were inconsistent when generating
|
|
|
60de42 |
a key to use for a fail-count op. This could lead to two identical ops
|
|
|
60de42 |
being scheduled, one of which would time out (during which time the resource
|
|
|
60de42 |
would not be recovered). Now, they consistently use generate_op_key().
|
|
|
60de42 |
---
|
|
|
60de42 |
lib/pengine/unpack.c | 6 +++---
|
|
|
60de42 |
pengine/allocate.c | 9 ++++-----
|
|
|
60de42 |
2 files changed, 7 insertions(+), 8 deletions(-)
|
|
|
60de42 |
|
|
|
60de42 |
diff --git a/lib/pengine/unpack.c b/lib/pengine/unpack.c
|
|
|
60de42 |
index 6737273..a357643 100644
|
|
|
60de42 |
--- a/lib/pengine/unpack.c
|
|
|
60de42 |
+++ b/lib/pengine/unpack.c
|
|
|
60de42 |
@@ -2934,10 +2934,10 @@ static bool check_operation_expiry(resource_t *rsc, node_t *node, int rc, xmlNod
|
|
|
60de42 |
}
|
|
|
60de42 |
|
|
|
60de42 |
if (clear_failcount) {
|
|
|
60de42 |
- action_t *clear_op = NULL;
|
|
|
60de42 |
+ char *key = generate_op_key(rsc->id, CRM_OP_CLEAR_FAILCOUNT, 0);
|
|
|
60de42 |
+ action_t *clear_op = custom_action(rsc, key, CRM_OP_CLEAR_FAILCOUNT,
|
|
|
60de42 |
+ node, FALSE, TRUE, data_set);
|
|
|
60de42 |
|
|
|
60de42 |
- clear_op = custom_action(rsc, crm_concat(rsc->id, CRM_OP_CLEAR_FAILCOUNT, '_'),
|
|
|
60de42 |
- CRM_OP_CLEAR_FAILCOUNT, node, FALSE, TRUE, data_set);
|
|
|
60de42 |
add_hash_param(clear_op->meta, XML_ATTR_TE_NOWAIT, XML_BOOLEAN_TRUE);
|
|
|
60de42 |
}
|
|
|
60de42 |
|
|
|
60de42 |
diff --git a/pengine/allocate.c b/pengine/allocate.c
|
|
|
60de42 |
index 74b57fb..4e8d68d 100644
|
|
|
60de42 |
--- a/pengine/allocate.c
|
|
|
60de42 |
+++ b/pengine/allocate.c
|
|
|
60de42 |
@@ -596,7 +596,7 @@ static gboolean
|
|
|
60de42 |
failcount_clear_action_exists(node_t * node, resource_t * rsc)
|
|
|
60de42 |
{
|
|
|
60de42 |
gboolean rc = FALSE;
|
|
|
60de42 |
- char *key = crm_concat(rsc->id, CRM_OP_CLEAR_FAILCOUNT, '_');
|
|
|
60de42 |
+ char *key = generate_op_key(rsc->id, CRM_OP_CLEAR_FAILCOUNT, 0);
|
|
|
60de42 |
GListPtr list = find_actions_exact(rsc->actions, key, node);
|
|
|
60de42 |
|
|
|
60de42 |
if (list) {
|
|
|
60de42 |
@@ -1195,10 +1195,9 @@ cleanup_orphans(resource_t * rsc, pe_working_set_t * data_set)
|
|
|
60de42 |
node_t *node = (node_t *) gIter->data;
|
|
|
60de42 |
|
|
|
60de42 |
if (node->details->online && get_failcount(node, rsc, NULL, data_set)) {
|
|
|
60de42 |
- action_t *clear_op = NULL;
|
|
|
60de42 |
-
|
|
|
60de42 |
- clear_op = custom_action(rsc, crm_concat(rsc->id, CRM_OP_CLEAR_FAILCOUNT, '_'),
|
|
|
60de42 |
- CRM_OP_CLEAR_FAILCOUNT, node, FALSE, TRUE, data_set);
|
|
|
60de42 |
+ char *key = generate_op_key(rsc->id, CRM_OP_CLEAR_FAILCOUNT, 0);
|
|
|
60de42 |
+ action_t *clear_op = custom_action(rsc, key, CRM_OP_CLEAR_FAILCOUNT,
|
|
|
60de42 |
+ node, FALSE, TRUE, data_set);
|
|
|
60de42 |
|
|
|
60de42 |
add_hash_param(clear_op->meta, XML_ATTR_TE_NOWAIT, XML_BOOLEAN_TRUE);
|
|
|
60de42 |
pe_rsc_info(rsc, "Clearing failcount (%d) for orphaned resource %s on %s (%s)",
|
|
|
60de42 |
--
|
|
|
60de42 |
1.8.3.1
|
|
|
60de42 |
|
|
|
60de42 |
|
|
|
60de42 |
From da8f425a37b844676ca468676b07e61c05ff2843 Mon Sep 17 00:00:00 2001
|
|
|
60de42 |
From: Ken Gaillot <kgaillot@redhat.com>
|
|
|
60de42 |
Date: Wed, 22 Feb 2017 14:33:28 -0600
|
|
|
60de42 |
Subject: [PATCH 2/2] Log: pengine,libpe_status: make failcount clearing
|
|
|
60de42 |
messages more helpful
|
|
|
60de42 |
|
|
|
60de42 |
---
|
|
|
60de42 |
lib/pengine/unpack.c | 17 ++++++++---------
|
|
|
60de42 |
pengine/allocate.c | 12 +++++++++---
|
|
|
60de42 |
2 files changed, 17 insertions(+), 12 deletions(-)
|
|
|
60de42 |
|
|
|
60de42 |
diff --git a/lib/pengine/unpack.c b/lib/pengine/unpack.c
|
|
|
60de42 |
index a357643..e0a3452 100644
|
|
|
60de42 |
--- a/lib/pengine/unpack.c
|
|
|
60de42 |
+++ b/lib/pengine/unpack.c
|
|
|
60de42 |
@@ -2851,11 +2851,11 @@ static bool check_operation_expiry(resource_t *rsc, node_t *node, int rc, xmlNod
|
|
|
60de42 |
{
|
|
|
60de42 |
bool expired = FALSE;
|
|
|
60de42 |
time_t last_failure = 0;
|
|
|
60de42 |
- int clear_failcount = 0;
|
|
|
60de42 |
int interval = 0;
|
|
|
60de42 |
int failure_timeout = rsc->failure_timeout;
|
|
|
60de42 |
const char *key = get_op_key(xml_op);
|
|
|
60de42 |
const char *task = crm_element_value(xml_op, XML_LRM_ATTR_TASK);
|
|
|
60de42 |
+ const char *clear_reason = NULL;
|
|
|
60de42 |
|
|
|
60de42 |
/* clearing recurring monitor operation failures automatically
|
|
|
60de42 |
* needs to be carefully considered */
|
|
|
60de42 |
@@ -2903,15 +2903,14 @@ static bool check_operation_expiry(resource_t *rsc, node_t *node, int rc, xmlNod
|
|
|
60de42 |
int fc = get_failcount_full(node, rsc, &last_failure, FALSE, xml_op, data_set);
|
|
|
60de42 |
if(fc) {
|
|
|
60de42 |
if (get_failcount_full(node, rsc, &last_failure, TRUE, xml_op, data_set) == 0) {
|
|
|
60de42 |
- clear_failcount = 1;
|
|
|
60de42 |
- crm_notice("Clearing expired failcount for %s on %s", rsc->id, node->details->uname);
|
|
|
60de42 |
+ clear_reason = "it expired";
|
|
|
60de42 |
|
|
|
60de42 |
} else {
|
|
|
60de42 |
expired = FALSE;
|
|
|
60de42 |
}
|
|
|
60de42 |
} else if (rsc->remote_reconnect_interval && strstr(ID(xml_op), "last_failure")) {
|
|
|
60de42 |
/* always clear last failure when reconnect interval is set */
|
|
|
60de42 |
- clear_failcount = 1;
|
|
|
60de42 |
+ clear_reason = "reconnect interval is set";
|
|
|
60de42 |
}
|
|
|
60de42 |
}
|
|
|
60de42 |
|
|
|
60de42 |
@@ -2926,19 +2925,19 @@ static bool check_operation_expiry(resource_t *rsc, node_t *node, int rc, xmlNod
|
|
|
60de42 |
crm_trace("rsc op %s/%s on node %s does not have a op digest to compare against", rsc->id,
|
|
|
60de42 |
key, node->details->id);
|
|
|
60de42 |
} else if (digest_data->rc != RSC_DIGEST_MATCH) {
|
|
|
60de42 |
- clear_failcount = 1;
|
|
|
60de42 |
- crm_info
|
|
|
60de42 |
- ("Clearing failcount for %s on %s, %s failed and now resource parameters have changed.",
|
|
|
60de42 |
- task, rsc->id, node->details->uname);
|
|
|
60de42 |
+ clear_reason = "resource parameters have changed";
|
|
|
60de42 |
}
|
|
|
60de42 |
}
|
|
|
60de42 |
|
|
|
60de42 |
- if (clear_failcount) {
|
|
|
60de42 |
+ if (clear_reason != NULL) {
|
|
|
60de42 |
char *key = generate_op_key(rsc->id, CRM_OP_CLEAR_FAILCOUNT, 0);
|
|
|
60de42 |
action_t *clear_op = custom_action(rsc, key, CRM_OP_CLEAR_FAILCOUNT,
|
|
|
60de42 |
node, FALSE, TRUE, data_set);
|
|
|
60de42 |
|
|
|
60de42 |
add_hash_param(clear_op->meta, XML_ATTR_TE_NOWAIT, XML_BOOLEAN_TRUE);
|
|
|
60de42 |
+
|
|
|
60de42 |
+ crm_notice("Clearing failure of %s on %s because %s " CRM_XS " %s",
|
|
|
60de42 |
+ rsc->id, node->details->uname, clear_reason, clear_op->uuid);
|
|
|
60de42 |
}
|
|
|
60de42 |
|
|
|
60de42 |
crm_element_value_int(xml_op, XML_LRM_ATTR_INTERVAL, &interval);
|
|
|
60de42 |
diff --git a/pengine/allocate.c b/pengine/allocate.c
|
|
|
60de42 |
index 4e8d68d..9a87816 100644
|
|
|
60de42 |
--- a/pengine/allocate.c
|
|
|
60de42 |
+++ b/pengine/allocate.c
|
|
|
60de42 |
@@ -449,6 +449,10 @@ check_actions_for(xmlNode * rsc_entry, resource_t * rsc, node_t * node, pe_worki
|
|
|
60de42 |
action_clear =
|
|
|
60de42 |
custom_action(rsc, key, CRM_OP_CLEAR_FAILCOUNT, node, FALSE, TRUE, data_set);
|
|
|
60de42 |
set_bit(action_clear->flags, pe_action_runnable);
|
|
|
60de42 |
+
|
|
|
60de42 |
+ crm_notice("Clearing failure of %s on %s "
|
|
|
60de42 |
+ "because action definition changed " CRM_XS " %s",
|
|
|
60de42 |
+ rsc->id, node->details->uname, action_clear->uuid);
|
|
|
60de42 |
}
|
|
|
60de42 |
}
|
|
|
60de42 |
|
|
|
60de42 |
@@ -1200,9 +1204,11 @@ cleanup_orphans(resource_t * rsc, pe_working_set_t * data_set)
|
|
|
60de42 |
node, FALSE, TRUE, data_set);
|
|
|
60de42 |
|
|
|
60de42 |
add_hash_param(clear_op->meta, XML_ATTR_TE_NOWAIT, XML_BOOLEAN_TRUE);
|
|
|
60de42 |
- pe_rsc_info(rsc, "Clearing failcount (%d) for orphaned resource %s on %s (%s)",
|
|
|
60de42 |
- get_failcount(node, rsc, NULL, data_set), rsc->id, node->details->uname,
|
|
|
60de42 |
- clear_op->uuid);
|
|
|
60de42 |
+
|
|
|
60de42 |
+ pe_rsc_info(rsc,
|
|
|
60de42 |
+ "Clearing failure of %s on %s because it is orphaned "
|
|
|
60de42 |
+ CRM_XS " %s",
|
|
|
60de42 |
+ rsc->id, node->details->uname, clear_op->uuid);
|
|
|
60de42 |
|
|
|
60de42 |
custom_action_order(rsc, NULL, clear_op,
|
|
|
60de42 |
rsc, generate_op_key(rsc->id, RSC_STOP, 0), NULL,
|
|
|
60de42 |
--
|
|
|
60de42 |
1.8.3.1
|
|
|
60de42 |
|