Blob Blame History Raw
From 19d273ae5831f40e1816b138a56260ddd3448a4e Mon Sep 17 00:00:00 2001
From: Andrew Beekhof <andrew@beekhof.net>
Date: Fri, 12 Aug 2016 10:03:37 +1000
Subject: [PATCH] Fix: crmd: Resend the shutdown request if the DC forgets

As seen in:
   https://bugzilla.redhat.com/show_bug.cgi?id=1310486

Scenario needs very poor timing and some bad luck:

1. Start a node and wait for it to become the DC
2. Start a second node
3. Tell the second node to stop while it is in the process of
negotiating with the DC.
   Specifically, just after do_cl_join_finalize_respond() is called on
the second node.
4. Cross your fingers that somehow the shutdown=0 update makes it to
attrd _after_ the DC sets shutdown=${large int}

Under these conditions, the request to shut down will be lost and the DC
will feel free to start services on the second node.
---
 crmd/lrm.c | 36 ++++++++++++++++++++++++------------
 1 file changed, 24 insertions(+), 12 deletions(-)

diff --git a/crmd/lrm.c b/crmd/lrm.c
index c987e49..3e32f33 100644
--- a/crmd/lrm.c
+++ b/crmd/lrm.c
@@ -2025,6 +2025,7 @@ do_lrm_rsc_op(lrm_state_t * lrm_state, lrmd_rsc_info_t * rsc, const char *operat
     fsa_data_t *msg_data = NULL;
     const char *transition = NULL;
     gboolean stop_recurring = FALSE;
+    bool send_nack = FALSE;
 
     CRM_CHECK(rsc != NULL, return);
     CRM_CHECK(operation != NULL, return);
@@ -2075,18 +2076,29 @@ do_lrm_rsc_op(lrm_state_t * lrm_state, lrmd_rsc_info_t * rsc, const char *operat
     /* now do the op */
     crm_info("Performing key=%s op=%s_%s_%d", transition, rsc->id, operation, op->interval);
 
-    if (fsa_state != S_NOT_DC && fsa_state != S_POLICY_ENGINE && fsa_state != S_TRANSITION_ENGINE) {
-        if (safe_str_neq(operation, "fail")
-            && safe_str_neq(operation, CRMD_ACTION_STOP)) {
-            crm_info("Discarding attempt to perform action %s on %s in state %s",
-                     operation, rsc->id, fsa_state2string(fsa_state));
-            op->rc = CRM_DIRECT_NACK_RC;
-            op->op_status = PCMK_LRM_OP_ERROR;
-            send_direct_ack(NULL, NULL, rsc, op, rsc->id);
-            lrmd_free_event(op);
-            free(op_id);
-            return;
-        }
+    if (is_set(fsa_input_register, R_SHUTDOWN) && safe_str_eq(operation, RSC_START)) {
+        register_fsa_input(C_SHUTDOWN, I_SHUTDOWN, NULL);
+        send_nack = TRUE;
+
+    } else if (fsa_state != S_NOT_DC
+               && fsa_state != S_POLICY_ENGINE /* Recalculating */
+               && fsa_state != S_TRANSITION_ENGINE
+               && safe_str_neq(operation, "fail")
+               && safe_str_neq(operation, CRMD_ACTION_STOP)) {
+        send_nack = TRUE;
+    }
+
+    if(send_nack) {
+        crm_notice("Discarding attempt to perform action %s on %s in state %s (shutdown=%s)",
+                   operation, rsc->id, fsa_state2string(fsa_state),
+                   is_set(fsa_input_register, R_SHUTDOWN)?"true":"false");
+
+        op->rc = CRM_DIRECT_NACK_RC;
+        op->op_status = PCMK_LRM_OP_ERROR;
+        send_direct_ack(NULL, NULL, rsc, op, rsc->id);
+        lrmd_free_event(op);
+        free(op_id);
+        return;
     }
 
     op_id = generate_op_key(rsc->id, op->op_type, op->interval);
-- 
1.8.3.1