Blame SOURCES/011-resend-shutdown.patch

fada68
From 19d273ae5831f40e1816b138a56260ddd3448a4e Mon Sep 17 00:00:00 2001
fada68
From: Andrew Beekhof <andrew@beekhof.net>
fada68
Date: Fri, 12 Aug 2016 10:03:37 +1000
fada68
Subject: [PATCH] Fix: crmd: Resend the shutdown request if the DC forgets
fada68
fada68
As seen in:
fada68
   https://bugzilla.redhat.com/show_bug.cgi?id=1310486
fada68
fada68
Scenario needs very poor timing and some bad luck:
fada68
fada68
1. Start a node and wait for it to become the DC
fada68
2. Start a second node
fada68
3. Tell the second node to stop while it is in the process of
fada68
negotiating with the DC.
fada68
   Specifically, just after do_cl_join_finalize_respond() is called on
fada68
the second node.
fada68
4. Cross your fingers that somehow the shutdown=0 update makes it to
fada68
attrd _after_ the DC sets shutdown=${large int}
fada68
fada68
Under these conditions, the request to shut down will be lost and the DC
fada68
will feel free to start services on the second node.
fada68
---
fada68
 crmd/lrm.c | 36 ++++++++++++++++++++++++------------
fada68
 1 file changed, 24 insertions(+), 12 deletions(-)
fada68
fada68
diff --git a/crmd/lrm.c b/crmd/lrm.c
fada68
index c987e49..3e32f33 100644
fada68
--- a/crmd/lrm.c
fada68
+++ b/crmd/lrm.c
fada68
@@ -2025,6 +2025,7 @@ do_lrm_rsc_op(lrm_state_t * lrm_state, lrmd_rsc_info_t * rsc, const char *operat
fada68
     fsa_data_t *msg_data = NULL;
fada68
     const char *transition = NULL;
fada68
     gboolean stop_recurring = FALSE;
fada68
+    bool send_nack = FALSE;
fada68
 
fada68
     CRM_CHECK(rsc != NULL, return);
fada68
     CRM_CHECK(operation != NULL, return);
fada68
@@ -2075,18 +2076,29 @@ do_lrm_rsc_op(lrm_state_t * lrm_state, lrmd_rsc_info_t * rsc, const char *operat
fada68
     /* now do the op */
fada68
     crm_info("Performing key=%s op=%s_%s_%d", transition, rsc->id, operation, op->interval);
fada68
 
fada68
-    if (fsa_state != S_NOT_DC && fsa_state != S_POLICY_ENGINE && fsa_state != S_TRANSITION_ENGINE) {
fada68
-        if (safe_str_neq(operation, "fail")
fada68
-            && safe_str_neq(operation, CRMD_ACTION_STOP)) {
fada68
-            crm_info("Discarding attempt to perform action %s on %s in state %s",
fada68
-                     operation, rsc->id, fsa_state2string(fsa_state));
fada68
-            op->rc = CRM_DIRECT_NACK_RC;
fada68
-            op->op_status = PCMK_LRM_OP_ERROR;
fada68
-            send_direct_ack(NULL, NULL, rsc, op, rsc->id);
fada68
-            lrmd_free_event(op);
fada68
-            free(op_id);
fada68
-            return;
fada68
-        }
fada68
+    if (is_set(fsa_input_register, R_SHUTDOWN) && safe_str_eq(operation, RSC_START)) {
fada68
+        register_fsa_input(C_SHUTDOWN, I_SHUTDOWN, NULL);
fada68
+        send_nack = TRUE;
fada68
+
fada68
+    } else if (fsa_state != S_NOT_DC
fada68
+               && fsa_state != S_POLICY_ENGINE /* Recalculating */
fada68
+               && fsa_state != S_TRANSITION_ENGINE
fada68
+               && safe_str_neq(operation, "fail")
fada68
+               && safe_str_neq(operation, CRMD_ACTION_STOP)) {
fada68
+        send_nack = TRUE;
fada68
+    }
fada68
+
fada68
+    if(send_nack) {
fada68
+        crm_notice("Discarding attempt to perform action %s on %s in state %s (shutdown=%s)",
fada68
+                   operation, rsc->id, fsa_state2string(fsa_state),
fada68
+                   is_set(fsa_input_register, R_SHUTDOWN)?"true":"false");
fada68
+
fada68
+        op->rc = CRM_DIRECT_NACK_RC;
fada68
+        op->op_status = PCMK_LRM_OP_ERROR;
fada68
+        send_direct_ack(NULL, NULL, rsc, op, rsc->id);
fada68
+        lrmd_free_event(op);
fada68
+        free(op_id);
fada68
+        return;
fada68
     }
fada68
 
fada68
     op_id = generate_op_key(rsc->id, op->op_type, op->interval);
fada68
-- 
fada68
1.8.3.1
fada68