Blame SOURCES/024-failcount-clear-fix.patch

60de42
From 357cd703e99fbcf1a371f34966accaf5322b1c50 Mon Sep 17 00:00:00 2001
60de42
From: Ken Gaillot <kgaillot@redhat.com>
60de42
Date: Wed, 22 Feb 2017 14:14:48 -0600
60de42
Subject: [PATCH 1/2] Fix: pengine,libpe_status: don't clear same fail-count
60de42
 twice
60de42
60de42
Previously, pengine and libpe_status were inconsistent when generating
60de42
a key to use for a fail-count op. This could lead to two identical ops
60de42
being scheduled, one of which would time out (during which time the resource
60de42
would not be recovered). Now, they consistently use generate_op_key().
60de42
---
60de42
 lib/pengine/unpack.c | 6 +++---
60de42
 pengine/allocate.c   | 9 ++++-----
60de42
 2 files changed, 7 insertions(+), 8 deletions(-)
60de42
60de42
diff --git a/lib/pengine/unpack.c b/lib/pengine/unpack.c
60de42
index 6737273..a357643 100644
60de42
--- a/lib/pengine/unpack.c
60de42
+++ b/lib/pengine/unpack.c
60de42
@@ -2934,10 +2934,10 @@ static bool check_operation_expiry(resource_t *rsc, node_t *node, int rc, xmlNod
60de42
     }
60de42
 
60de42
     if (clear_failcount) {
60de42
-        action_t *clear_op = NULL;
60de42
+        char *key = generate_op_key(rsc->id, CRM_OP_CLEAR_FAILCOUNT, 0);
60de42
+        action_t *clear_op = custom_action(rsc, key, CRM_OP_CLEAR_FAILCOUNT,
60de42
+                                           node, FALSE, TRUE, data_set);
60de42
 
60de42
-        clear_op = custom_action(rsc, crm_concat(rsc->id, CRM_OP_CLEAR_FAILCOUNT, '_'),
60de42
-                                 CRM_OP_CLEAR_FAILCOUNT, node, FALSE, TRUE, data_set);
60de42
         add_hash_param(clear_op->meta, XML_ATTR_TE_NOWAIT, XML_BOOLEAN_TRUE);
60de42
     }
60de42
 
60de42
diff --git a/pengine/allocate.c b/pengine/allocate.c
60de42
index 74b57fb..4e8d68d 100644
60de42
--- a/pengine/allocate.c
60de42
+++ b/pengine/allocate.c
60de42
@@ -596,7 +596,7 @@ static gboolean
60de42
 failcount_clear_action_exists(node_t * node, resource_t * rsc)
60de42
 {
60de42
     gboolean rc = FALSE;
60de42
-    char *key = crm_concat(rsc->id, CRM_OP_CLEAR_FAILCOUNT, '_');
60de42
+    char *key = generate_op_key(rsc->id, CRM_OP_CLEAR_FAILCOUNT, 0);
60de42
     GListPtr list = find_actions_exact(rsc->actions, key, node);
60de42
 
60de42
     if (list) {
60de42
@@ -1195,10 +1195,9 @@ cleanup_orphans(resource_t * rsc, pe_working_set_t * data_set)
60de42
         node_t *node = (node_t *) gIter->data;
60de42
 
60de42
         if (node->details->online && get_failcount(node, rsc, NULL, data_set)) {
60de42
-            action_t *clear_op = NULL;
60de42
-
60de42
-            clear_op = custom_action(rsc, crm_concat(rsc->id, CRM_OP_CLEAR_FAILCOUNT, '_'),
60de42
-                                     CRM_OP_CLEAR_FAILCOUNT, node, FALSE, TRUE, data_set);
60de42
+            char *key = generate_op_key(rsc->id, CRM_OP_CLEAR_FAILCOUNT, 0);
60de42
+            action_t *clear_op = custom_action(rsc, key, CRM_OP_CLEAR_FAILCOUNT,
60de42
+                                               node, FALSE, TRUE, data_set);
60de42
 
60de42
             add_hash_param(clear_op->meta, XML_ATTR_TE_NOWAIT, XML_BOOLEAN_TRUE);
60de42
             pe_rsc_info(rsc, "Clearing failcount (%d) for orphaned resource %s on %s (%s)",
60de42
-- 
60de42
1.8.3.1
60de42
60de42
60de42
From da8f425a37b844676ca468676b07e61c05ff2843 Mon Sep 17 00:00:00 2001
60de42
From: Ken Gaillot <kgaillot@redhat.com>
60de42
Date: Wed, 22 Feb 2017 14:33:28 -0600
60de42
Subject: [PATCH 2/2] Log: pengine,libpe_status: make failcount clearing
60de42
 messages more helpful
60de42
60de42
---
60de42
 lib/pengine/unpack.c | 17 ++++++++---------
60de42
 pengine/allocate.c   | 12 +++++++++---
60de42
 2 files changed, 17 insertions(+), 12 deletions(-)
60de42
60de42
diff --git a/lib/pengine/unpack.c b/lib/pengine/unpack.c
60de42
index a357643..e0a3452 100644
60de42
--- a/lib/pengine/unpack.c
60de42
+++ b/lib/pengine/unpack.c
60de42
@@ -2851,11 +2851,11 @@ static bool check_operation_expiry(resource_t *rsc, node_t *node, int rc, xmlNod
60de42
 {
60de42
     bool expired = FALSE;
60de42
     time_t last_failure = 0;
60de42
-    int clear_failcount = 0;
60de42
     int interval = 0;
60de42
     int failure_timeout = rsc->failure_timeout;
60de42
     const char *key = get_op_key(xml_op);
60de42
     const char *task = crm_element_value(xml_op, XML_LRM_ATTR_TASK);
60de42
+    const char *clear_reason = NULL;
60de42
 
60de42
     /* clearing recurring monitor operation failures automatically
60de42
      * needs to be carefully considered */
60de42
@@ -2903,15 +2903,14 @@ static bool check_operation_expiry(resource_t *rsc, node_t *node, int rc, xmlNod
60de42
             int fc = get_failcount_full(node, rsc, &last_failure, FALSE, xml_op, data_set);
60de42
             if(fc) {
60de42
                 if (get_failcount_full(node, rsc, &last_failure, TRUE, xml_op, data_set) == 0) {
60de42
-                    clear_failcount = 1;
60de42
-                    crm_notice("Clearing expired failcount for %s on %s", rsc->id, node->details->uname);
60de42
+                    clear_reason = "it expired";
60de42
 
60de42
                 } else {
60de42
                     expired = FALSE;
60de42
                 }
60de42
             } else if (rsc->remote_reconnect_interval && strstr(ID(xml_op), "last_failure")) {
60de42
                 /* always clear last failure when reconnect interval is set */
60de42
-                clear_failcount = 1;
60de42
+                clear_reason = "reconnect interval is set";
60de42
             }
60de42
         }
60de42
 
60de42
@@ -2926,19 +2925,19 @@ static bool check_operation_expiry(resource_t *rsc, node_t *node, int rc, xmlNod
60de42
             crm_trace("rsc op %s/%s on node %s does not have a op digest to compare against", rsc->id,
60de42
                       key, node->details->id);
60de42
         } else if (digest_data->rc != RSC_DIGEST_MATCH) {
60de42
-            clear_failcount = 1;
60de42
-            crm_info
60de42
-                ("Clearing failcount for %s on %s, %s failed and now resource parameters have changed.",
60de42
-                 task, rsc->id, node->details->uname);
60de42
+            clear_reason = "resource parameters have changed";
60de42
         }
60de42
     }
60de42
 
60de42
-    if (clear_failcount) {
60de42
+    if (clear_reason != NULL) {
60de42
         char *key = generate_op_key(rsc->id, CRM_OP_CLEAR_FAILCOUNT, 0);
60de42
         action_t *clear_op = custom_action(rsc, key, CRM_OP_CLEAR_FAILCOUNT,
60de42
                                            node, FALSE, TRUE, data_set);
60de42
 
60de42
         add_hash_param(clear_op->meta, XML_ATTR_TE_NOWAIT, XML_BOOLEAN_TRUE);
60de42
+
60de42
+        crm_notice("Clearing failure of %s on %s because %s " CRM_XS " %s",
60de42
+                   rsc->id, node->details->uname, clear_reason, clear_op->uuid);
60de42
     }
60de42
 
60de42
     crm_element_value_int(xml_op, XML_LRM_ATTR_INTERVAL, &interval);
60de42
diff --git a/pengine/allocate.c b/pengine/allocate.c
60de42
index 4e8d68d..9a87816 100644
60de42
--- a/pengine/allocate.c
60de42
+++ b/pengine/allocate.c
60de42
@@ -449,6 +449,10 @@ check_actions_for(xmlNode * rsc_entry, resource_t * rsc, node_t * node, pe_worki
60de42
             action_clear =
60de42
                 custom_action(rsc, key, CRM_OP_CLEAR_FAILCOUNT, node, FALSE, TRUE, data_set);
60de42
             set_bit(action_clear->flags, pe_action_runnable);
60de42
+
60de42
+            crm_notice("Clearing failure of %s on %s "
60de42
+                       "because action definition changed " CRM_XS " %s",
60de42
+                       rsc->id, node->details->uname, action_clear->uuid);
60de42
         }
60de42
     }
60de42
 
60de42
@@ -1200,9 +1204,11 @@ cleanup_orphans(resource_t * rsc, pe_working_set_t * data_set)
60de42
                                                node, FALSE, TRUE, data_set);
60de42
 
60de42
             add_hash_param(clear_op->meta, XML_ATTR_TE_NOWAIT, XML_BOOLEAN_TRUE);
60de42
-            pe_rsc_info(rsc, "Clearing failcount (%d) for orphaned resource %s on %s (%s)",
60de42
-                        get_failcount(node, rsc, NULL, data_set), rsc->id, node->details->uname,
60de42
-                        clear_op->uuid);
60de42
+
60de42
+            pe_rsc_info(rsc,
60de42
+                        "Clearing failure of %s on %s because it is orphaned "
60de42
+                        CRM_XS " %s",
60de42
+                        rsc->id, node->details->uname, clear_op->uuid);
60de42
 
60de42
             custom_action_order(rsc, NULL, clear_op,
60de42
                             rsc, generate_op_key(rsc->id, RSC_STOP, 0), NULL,
60de42
-- 
60de42
1.8.3.1
60de42