Blob Blame History Raw
From 357cd703e99fbcf1a371f34966accaf5322b1c50 Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Wed, 22 Feb 2017 14:14:48 -0600
Subject: [PATCH 1/2] Fix: pengine,libpe_status: don't clear same fail-count
 twice

Previously, pengine and libpe_status were inconsistent when generating
a key to use for a fail-count op. This could lead to two identical ops
being scheduled, one of which would time out (during which time the
resource would not be recovered). Now, they consistently use
generate_op_key().
---
 lib/pengine/unpack.c | 6 +++---
 pengine/allocate.c   | 9 ++++-----
 2 files changed, 7 insertions(+), 8 deletions(-)

diff --git a/lib/pengine/unpack.c b/lib/pengine/unpack.c
index 6737273..a357643 100644
--- a/lib/pengine/unpack.c
+++ b/lib/pengine/unpack.c
@@ -2934,10 +2934,10 @@ static bool check_operation_expiry(resource_t *rsc, node_t *node, int rc, xmlNod
     }
 
     if (clear_failcount) {
-        action_t *clear_op = NULL;
+        char *key = generate_op_key(rsc->id, CRM_OP_CLEAR_FAILCOUNT, 0);
+        action_t *clear_op = custom_action(rsc, key, CRM_OP_CLEAR_FAILCOUNT,
+                                           node, FALSE, TRUE, data_set);
 
-        clear_op = custom_action(rsc, crm_concat(rsc->id, CRM_OP_CLEAR_FAILCOUNT, '_'),
-                                 CRM_OP_CLEAR_FAILCOUNT, node, FALSE, TRUE, data_set);
         add_hash_param(clear_op->meta, XML_ATTR_TE_NOWAIT, XML_BOOLEAN_TRUE);
     }
 
diff --git a/pengine/allocate.c b/pengine/allocate.c
index 74b57fb..4e8d68d 100644
--- a/pengine/allocate.c
+++ b/pengine/allocate.c
@@ -596,7 +596,7 @@ static gboolean
 failcount_clear_action_exists(node_t * node, resource_t * rsc)
 {
     gboolean rc = FALSE;
-    char *key = crm_concat(rsc->id, CRM_OP_CLEAR_FAILCOUNT, '_');
+    char *key = generate_op_key(rsc->id, CRM_OP_CLEAR_FAILCOUNT, 0);
     GListPtr list = find_actions_exact(rsc->actions, key, node);
 
     if (list) {
@@ -1195,10 +1195,9 @@ cleanup_orphans(resource_t * rsc, pe_working_set_t * data_set)
         node_t *node = (node_t *) gIter->data;
 
         if (node->details->online && get_failcount(node, rsc, NULL, data_set)) {
-            action_t *clear_op = NULL;
-
-            clear_op = custom_action(rsc, crm_concat(rsc->id, CRM_OP_CLEAR_FAILCOUNT, '_'),
-                                     CRM_OP_CLEAR_FAILCOUNT, node, FALSE, TRUE, data_set);
+            char *key = generate_op_key(rsc->id, CRM_OP_CLEAR_FAILCOUNT, 0);
+            action_t *clear_op = custom_action(rsc, key, CRM_OP_CLEAR_FAILCOUNT,
+                                               node, FALSE, TRUE, data_set);
 
             add_hash_param(clear_op->meta, XML_ATTR_TE_NOWAIT, XML_BOOLEAN_TRUE);
             pe_rsc_info(rsc, "Clearing failcount (%d) for orphaned resource %s on %s (%s)",
-- 
1.8.3.1


From da8f425a37b844676ca468676b07e61c05ff2843 Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Wed, 22 Feb 2017 14:33:28 -0600
Subject: [PATCH 2/2] Log: pengine,libpe_status: make failcount clearing
 messages more helpful

---
 lib/pengine/unpack.c | 17 ++++++++---------
 pengine/allocate.c   | 12 +++++++++---
 2 files changed, 17 insertions(+), 12 deletions(-)

diff --git a/lib/pengine/unpack.c b/lib/pengine/unpack.c
index a357643..e0a3452 100644
--- a/lib/pengine/unpack.c
+++ b/lib/pengine/unpack.c
@@ -2851,11 +2851,11 @@ static bool check_operation_expiry(resource_t *rsc, node_t *node, int rc, xmlNod
 {
     bool expired = FALSE;
     time_t last_failure = 0;
-    int clear_failcount = 0;
     int interval = 0;
     int failure_timeout = rsc->failure_timeout;
     const char *key = get_op_key(xml_op);
     const char *task = crm_element_value(xml_op, XML_LRM_ATTR_TASK);
+    const char *clear_reason = NULL;
 
     /* clearing recurring monitor operation failures automatically
      * needs to be carefully considered */
@@ -2903,15 +2903,14 @@ static bool check_operation_expiry(resource_t *rsc, node_t *node, int rc, xmlNod
             int fc = get_failcount_full(node, rsc, &last_failure, FALSE, xml_op, data_set);
             if(fc) {
                 if (get_failcount_full(node, rsc, &last_failure, TRUE, xml_op, data_set) == 0) {
-                    clear_failcount = 1;
-                    crm_notice("Clearing expired failcount for %s on %s", rsc->id, node->details->uname);
+                    clear_reason = "it expired";
 
                 } else {
                     expired = FALSE;
                 }
             } else if (rsc->remote_reconnect_interval && strstr(ID(xml_op), "last_failure")) {
                 /* always clear last failure when reconnect interval is set */
-                clear_failcount = 1;
+                clear_reason = "reconnect interval is set";
             }
         }
 
@@ -2926,19 +2925,19 @@ static bool check_operation_expiry(resource_t *rsc, node_t *node, int rc, xmlNod
             crm_trace("rsc op %s/%s on node %s does not have a op digest to compare against", rsc->id,
                       key, node->details->id);
         } else if (digest_data->rc != RSC_DIGEST_MATCH) {
-            clear_failcount = 1;
-            crm_info
-                ("Clearing failcount for %s on %s, %s failed and now resource parameters have changed.",
-                 task, rsc->id, node->details->uname);
+            clear_reason = "resource parameters have changed";
         }
     }
 
-    if (clear_failcount) {
+    if (clear_reason != NULL) {
         char *key = generate_op_key(rsc->id, CRM_OP_CLEAR_FAILCOUNT, 0);
         action_t *clear_op = custom_action(rsc, key, CRM_OP_CLEAR_FAILCOUNT,
                                            node, FALSE, TRUE, data_set);
 
         add_hash_param(clear_op->meta, XML_ATTR_TE_NOWAIT, XML_BOOLEAN_TRUE);
+
+        crm_notice("Clearing failure of %s on %s because %s " CRM_XS " %s",
+                   rsc->id, node->details->uname, clear_reason, clear_op->uuid);
     }
 
     crm_element_value_int(xml_op, XML_LRM_ATTR_INTERVAL, &interval);
diff --git a/pengine/allocate.c b/pengine/allocate.c
index 4e8d68d..9a87816 100644
--- a/pengine/allocate.c
+++ b/pengine/allocate.c
@@ -449,6 +449,10 @@ check_actions_for(xmlNode * rsc_entry, resource_t * rsc, node_t * node, pe_worki
             action_clear =
                 custom_action(rsc, key, CRM_OP_CLEAR_FAILCOUNT, node, FALSE, TRUE, data_set);
             set_bit(action_clear->flags, pe_action_runnable);
+
+            crm_notice("Clearing failure of %s on %s "
+                       "because action definition changed " CRM_XS " %s",
+                       rsc->id, node->details->uname, action_clear->uuid);
         }
     }
 
@@ -1200,9 +1204,11 @@ cleanup_orphans(resource_t * rsc, pe_working_set_t * data_set)
                                                node, FALSE, TRUE, data_set);
 
             add_hash_param(clear_op->meta, XML_ATTR_TE_NOWAIT, XML_BOOLEAN_TRUE);
-            pe_rsc_info(rsc, "Clearing failcount (%d) for orphaned resource %s on %s (%s)",
-                        get_failcount(node, rsc, NULL, data_set), rsc->id, node->details->uname,
-                        clear_op->uuid);
+
+            pe_rsc_info(rsc,
+                        "Clearing failure of %s on %s because it is orphaned "
+                        CRM_XS " %s",
+                        rsc->id, node->details->uname, clear_op->uuid);
 
             custom_action_order(rsc, NULL, clear_op,
                             rsc, generate_op_key(rsc->id, RSC_STOP, 0), NULL,
-- 
1.8.3.1