Blame SOURCES/017-cleanup-pending-op.patch

413fc7
From a48d73f23c6800ae51522c1a91d0f0e1eb967078 Mon Sep 17 00:00:00 2001
413fc7
From: Ken Gaillot <kgaillot@redhat.com>
413fc7
Date: Tue, 8 Jan 2019 15:31:14 -0600
413fc7
Subject: [PATCH] Fix: controller: directly acknowledge unrecordable operation
413fc7
 results
413fc7
413fc7
Regression introduced in 2.0.1-rc1 by 0363985dd
413fc7
413fc7
Before that commit, if an operation result arrived when there was no resource
413fc7
information available, a warning would be logged and the operation would be
413fc7
directly acknowledged. This could occur, for example, if resource history were
413fc7
cleaned while an operation was pending on that resource.
413fc7
413fc7
After that commit, in that situation, an assertion and error would be logged,
413fc7
and no acknowledgement would be sent, leading to a transition timeout.
413fc7
413fc7
Restore the direct ack. Also improve related log messages.
413fc7
---
413fc7
 crmd/lrm.c | 80 ++++++++++++++++++++++++++++++++++++++++++--------------------
413fc7
 1 file changed, 55 insertions(+), 25 deletions(-)
413fc7
413fc7
diff --git a/crmd/lrm.c b/crmd/lrm.c
413fc7
index 0d64f59..51cb50b 100644
413fc7
--- a/crmd/lrm.c
413fc7
+++ b/crmd/lrm.c
413fc7
@@ -2497,6 +2497,7 @@ process_lrm_event(lrm_state_t *lrm_state, lrmd_event_data_t *op,
413fc7
     int update_id = 0;
413fc7
     gboolean remove = FALSE;
413fc7
     gboolean removed = FALSE;
413fc7
+    bool need_direct_ack = FALSE;
413fc7
     lrmd_rsc_info_t *rsc = NULL;
413fc7
     const char *node_name = NULL;
413fc7
 
413fc7
@@ -2527,7 +2528,6 @@ process_lrm_event(lrm_state_t *lrm_state, lrmd_event_data_t *op,
413fc7
                     op_key, op->rsc_id);
413fc7
         }
413fc7
     }
413fc7
-    CRM_LOG_ASSERT(rsc != NULL); // If it's still NULL, there's a bug somewhere
413fc7
 
413fc7
     // Get node name if available (from executor state or action XML)
413fc7
     if (lrm_state) {
413fc7
@@ -2559,51 +2559,81 @@ process_lrm_event(lrm_state_t *lrm_state, lrmd_event_data_t *op,
413fc7
     }
413fc7
 
413fc7
     if (op->op_status != PCMK_LRM_OP_CANCELLED) {
413fc7
+        /* We might not record the result, so directly acknowledge it to the
413fc7
+         * originator instead, so it doesn't time out waiting for the result
413fc7
+         * (especially important if part of a transition).
413fc7
+         */
413fc7
+        need_direct_ack = TRUE;
413fc7
+
413fc7
         if (controld_action_is_recordable(op->op_type)) {
413fc7
             if (node_name && rsc) {
413fc7
+                // We should record the result, and happily, we can
413fc7
                 update_id = do_update_resource(node_name, rsc, op);
413fc7
+                need_direct_ack = FALSE;
413fc7
+
413fc7
+            } else if (op->rsc_deleted) {
413fc7
+                /* We shouldn't record the result (likely the resource was
413fc7
+                 * refreshed, cleaned, or removed while this operation was
413fc7
+                 * in flight).
413fc7
+                 */
413fc7
+                crm_notice("Not recording %s result in CIB because "
413fc7
+                           "resource information was removed since it was initiated",
413fc7
+                           op_key);
413fc7
             } else {
413fc7
-                // @TODO Should we direct ack?
413fc7
-                crm_err("Unable to record %s result in CIB: %s",
413fc7
-                        op_key,
413fc7
+                /* This shouldn't be possible; the executor didn't consider the
413fc7
+                 * resource deleted, but we couldn't find resource or node
413fc7
+                 * information.
413fc7
+                 */
413fc7
+                crm_err("Unable to record %s result in CIB: %s", op_key,
413fc7
                         (node_name? "No resource information" : "No node name"));
413fc7
             }
413fc7
-        } else {
413fc7
-            send_direct_ack(NULL, NULL, NULL, op, op->rsc_id);
413fc7
         }
413fc7
+
413fc7
     } else if (op->interval == 0) {
413fc7
-        /* This will occur when "crm resource cleanup" is called while actions are in-flight */
413fc7
-        crm_err("Op %s (call=%d): Cancelled", op_key, op->call_id);
413fc7
-        send_direct_ack(NULL, NULL, NULL, op, op->rsc_id);
413fc7
+        /* A non-recurring operation was cancelled. Most likely, the
413fc7
+         * never-initiated action was removed from the executor's pending
413fc7
+         * operations list upon resource removal.
413fc7
+         */
413fc7
+        need_direct_ack = TRUE;
413fc7
 
413fc7
     } else if (pending == NULL) {
413fc7
-        /* We don't need to do anything for cancelled ops
413fc7
-         * that are not in our pending op list. There are no
413fc7
-         * transition actions waiting on these operations. */
413fc7
+        /* This recurring operation was cancelled, but was not pending. No
413fc7
+         * transition actions are waiting on it, nothing needs to be done.
413fc7
+         */
413fc7
 
413fc7
     } else if (op->user_data == NULL) {
413fc7
-        /* At this point we have a pending entry, but no transition
413fc7
-         * key present in the user_data field. report this */
413fc7
-        crm_err("Op %s (call=%d): No user data", op_key, op->call_id);
413fc7
+        /* This recurring operation was cancelled and pending, but we don't
413fc7
+         * have a transition key. This should never happen.
413fc7
+         */
413fc7
+        crm_err("Recurring operation %s was cancelled without transition information",
413fc7
+                op_key);
413fc7
 
413fc7
     } else if (pending->remove) {
413fc7
-        /* The tengine canceled this op, we have been waiting for the cancel to finish. */
413fc7
+        /* This recurring operation was cancelled (by us) and pending, and we
413fc7
+         * have been waiting for it to finish.
413fc7
+         */
413fc7
         if (lrm_state) {
413fc7
             erase_lrm_history_by_op(lrm_state, op);
413fc7
         }
413fc7
 
413fc7
     } else if (op->rsc_deleted) {
413fc7
-        /* The tengine initiated this op, but it was cancelled outside of the
413fc7
-         * tengine's control during a resource cleanup/re-probe request. The tengine
413fc7
-         * must be alerted that this operation completed, otherwise the tengine
413fc7
-         * will continue waiting for this update to occur until it is timed out.
413fc7
-         * We don't want this update going to the cib though, so use a direct ack. */
413fc7
-        crm_trace("Op %s (call=%d): cancelled due to rsc deletion", op_key, op->call_id);
413fc7
-        send_direct_ack(NULL, NULL, NULL, op, op->rsc_id);
413fc7
+        /* This recurring operation was cancelled (but not by us, and the
413fc7
+         * executor does not have resource information, likely due to resource
413fc7
+         * cleanup, refresh, or removal) and pending.
413fc7
+         */
413fc7
+        crm_debug("Recurring op %s was cancelled due to resource deletion",
413fc7
+                  op_key);
413fc7
+        need_direct_ack = TRUE;
413fc7
 
413fc7
     } else {
413fc7
-        /* Before a stop is called, no need to direct ack */
413fc7
-        crm_trace("Op %s (call=%d): no delete event required", op_key, op->call_id);
413fc7
+        /* This recurring operation was cancelled (but not by us, likely by the
413fc7
+         * executor before stopping the resource) and pending. We don't need to
413fc7
+         * do anything special.
413fc7
+         */
413fc7
+    }
413fc7
+
413fc7
+    if (need_direct_ack) {
413fc7
+        send_direct_ack(NULL, NULL, NULL, op, op->rsc_id);
413fc7
     }
413fc7
 
413fc7
     if(remove == FALSE) {
413fc7
-- 
413fc7
1.8.3.1
413fc7