From 357cd703e99fbcf1a371f34966accaf5322b1c50 Mon Sep 17 00:00:00 2001 From: Ken Gaillot Date: Wed, 22 Feb 2017 14:14:48 -0600 Subject: [PATCH 1/2] Fix: pengine,libpe_status: don't clear same fail-count twice Previously, pengine and libpe_status were inconsistent when generating a key to use for a fail-count op. This could lead to two identical ops being scheduled, one of which would time out (during which time the resource would not be recovered). Now, they consistently use generate_op_key(). --- lib/pengine/unpack.c | 6 +++--- pengine/allocate.c | 9 ++++----- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/lib/pengine/unpack.c b/lib/pengine/unpack.c index 6737273..a357643 100644 --- a/lib/pengine/unpack.c +++ b/lib/pengine/unpack.c @@ -2934,10 +2934,10 @@ static bool check_operation_expiry(resource_t *rsc, node_t *node, int rc, xmlNod } if (clear_failcount) { - action_t *clear_op = NULL; + char *key = generate_op_key(rsc->id, CRM_OP_CLEAR_FAILCOUNT, 0); + action_t *clear_op = custom_action(rsc, key, CRM_OP_CLEAR_FAILCOUNT, + node, FALSE, TRUE, data_set); - clear_op = custom_action(rsc, crm_concat(rsc->id, CRM_OP_CLEAR_FAILCOUNT, '_'), - CRM_OP_CLEAR_FAILCOUNT, node, FALSE, TRUE, data_set); add_hash_param(clear_op->meta, XML_ATTR_TE_NOWAIT, XML_BOOLEAN_TRUE); } diff --git a/pengine/allocate.c b/pengine/allocate.c index 74b57fb..4e8d68d 100644 --- a/pengine/allocate.c +++ b/pengine/allocate.c @@ -596,7 +596,7 @@ static gboolean failcount_clear_action_exists(node_t * node, resource_t * rsc) { gboolean rc = FALSE; - char *key = crm_concat(rsc->id, CRM_OP_CLEAR_FAILCOUNT, '_'); + char *key = generate_op_key(rsc->id, CRM_OP_CLEAR_FAILCOUNT, 0); GListPtr list = find_actions_exact(rsc->actions, key, node); if (list) { @@ -1195,10 +1195,9 @@ cleanup_orphans(resource_t * rsc, pe_working_set_t * data_set) node_t *node = (node_t *) gIter->data; if (node->details->online && get_failcount(node, rsc, NULL, data_set)) { - action_t *clear_op = NULL; - - clear_op = 
custom_action(rsc, crm_concat(rsc->id, CRM_OP_CLEAR_FAILCOUNT, '_'), - CRM_OP_CLEAR_FAILCOUNT, node, FALSE, TRUE, data_set); + char *key = generate_op_key(rsc->id, CRM_OP_CLEAR_FAILCOUNT, 0); + action_t *clear_op = custom_action(rsc, key, CRM_OP_CLEAR_FAILCOUNT, + node, FALSE, TRUE, data_set); add_hash_param(clear_op->meta, XML_ATTR_TE_NOWAIT, XML_BOOLEAN_TRUE); pe_rsc_info(rsc, "Clearing failcount (%d) for orphaned resource %s on %s (%s)", -- 1.8.3.1 From da8f425a37b844676ca468676b07e61c05ff2843 Mon Sep 17 00:00:00 2001 From: Ken Gaillot Date: Wed, 22 Feb 2017 14:33:28 -0600 Subject: [PATCH 2/2] Log: pengine,libpe_status: make failcount clearing messages more helpful --- lib/pengine/unpack.c | 17 ++++++++--------- pengine/allocate.c | 12 +++++++++--- 2 files changed, 17 insertions(+), 12 deletions(-) diff --git a/lib/pengine/unpack.c b/lib/pengine/unpack.c index a357643..e0a3452 100644 --- a/lib/pengine/unpack.c +++ b/lib/pengine/unpack.c @@ -2851,11 +2851,11 @@ static bool check_operation_expiry(resource_t *rsc, node_t *node, int rc, xmlNod { bool expired = FALSE; time_t last_failure = 0; - int clear_failcount = 0; int interval = 0; int failure_timeout = rsc->failure_timeout; const char *key = get_op_key(xml_op); const char *task = crm_element_value(xml_op, XML_LRM_ATTR_TASK); + const char *clear_reason = NULL; /* clearing recurring monitor operation failures automatically * needs to be carefully considered */ @@ -2903,15 +2903,14 @@ static bool check_operation_expiry(resource_t *rsc, node_t *node, int rc, xmlNod int fc = get_failcount_full(node, rsc, &last_failure, FALSE, xml_op, data_set); if(fc) { if (get_failcount_full(node, rsc, &last_failure, TRUE, xml_op, data_set) == 0) { - clear_failcount = 1; - crm_notice("Clearing expired failcount for %s on %s", rsc->id, node->details->uname); + clear_reason = "it expired"; } else { expired = FALSE; } } else if (rsc->remote_reconnect_interval && strstr(ID(xml_op), "last_failure")) { /* always clear last failure when 
reconnect interval is set */ - clear_failcount = 1; + clear_reason = "reconnect interval is set"; } } @@ -2926,19 +2925,19 @@ static bool check_operation_expiry(resource_t *rsc, node_t *node, int rc, xmlNod crm_trace("rsc op %s/%s on node %s does not have a op digest to compare against", rsc->id, key, node->details->id); } else if (digest_data->rc != RSC_DIGEST_MATCH) { - clear_failcount = 1; - crm_info - ("Clearing failcount for %s on %s, %s failed and now resource parameters have changed.", - task, rsc->id, node->details->uname); + clear_reason = "resource parameters have changed"; } } - if (clear_failcount) { + if (clear_reason != NULL) { char *key = generate_op_key(rsc->id, CRM_OP_CLEAR_FAILCOUNT, 0); action_t *clear_op = custom_action(rsc, key, CRM_OP_CLEAR_FAILCOUNT, node, FALSE, TRUE, data_set); add_hash_param(clear_op->meta, XML_ATTR_TE_NOWAIT, XML_BOOLEAN_TRUE); + + crm_notice("Clearing failure of %s on %s because %s " CRM_XS " %s", + rsc->id, node->details->uname, clear_reason, clear_op->uuid); } crm_element_value_int(xml_op, XML_LRM_ATTR_INTERVAL, &interval); diff --git a/pengine/allocate.c b/pengine/allocate.c index 4e8d68d..9a87816 100644 --- a/pengine/allocate.c +++ b/pengine/allocate.c @@ -449,6 +449,10 @@ check_actions_for(xmlNode * rsc_entry, resource_t * rsc, node_t * node, pe_worki action_clear = custom_action(rsc, key, CRM_OP_CLEAR_FAILCOUNT, node, FALSE, TRUE, data_set); set_bit(action_clear->flags, pe_action_runnable); + + crm_notice("Clearing failure of %s on %s " + "because action definition changed " CRM_XS " %s", + rsc->id, node->details->uname, action_clear->uuid); } } @@ -1200,9 +1204,11 @@ cleanup_orphans(resource_t * rsc, pe_working_set_t * data_set) node, FALSE, TRUE, data_set); add_hash_param(clear_op->meta, XML_ATTR_TE_NOWAIT, XML_BOOLEAN_TRUE); - pe_rsc_info(rsc, "Clearing failcount (%d) for orphaned resource %s on %s (%s)", - get_failcount(node, rsc, NULL, data_set), rsc->id, node->details->uname, - clear_op->uuid); + + 
pe_rsc_info(rsc, + "Clearing failure of %s on %s because it is orphaned " + CRM_XS " %s", + rsc->id, node->details->uname, clear_op->uuid); custom_action_order(rsc, NULL, clear_op, rsc, generate_op_key(rsc->id, RSC_STOP, 0), NULL, -- 1.8.3.1