Blame SOURCES/shutdown-lock-03.patch

af918f
From 749f6b256cb2864ce3e862442adc6d219eefeca3 Mon Sep 17 00:00:00 2001
af918f
From: Ken Gaillot <kgaillot@redhat.com>
af918f
Date: Fri, 6 Dec 2019 12:17:03 -0600
af918f
Subject: [PATCH 03/10] Low: scheduler: respect shutdown locks when placing
af918f
 inactive resources
af918f
af918f
When shutdown-lock is enabled, and we're either scheduling a resource stop
af918f
on a node that's cleanly shutting down or scheduling any action for a
af918f
previously locked resource, add "shutdown-lock=<shutdown-timestamp>" to the
af918f
graph action. The controller will be able to use this to know when to preserve
af918f
the lock (by adding the lock time to the resource state entry).
af918f
af918f
When the scheduler unpacks a resource state entry with a lock, it will remember
af918f
the lock node and lock time, which will trigger existing code for applying
af918f
shutdown locks.
af918f
---
af918f
 lib/pengine/unpack.c | 49 ++++++++++++++++++++++++++++++++++++++++++++-----
af918f
 pengine/allocate.c   | 17 ++++++++++++++++-
af918f
 pengine/graph.c      | 32 +++++++++++++++++++++++++++++++-
af918f
 3 files changed, 91 insertions(+), 7 deletions(-)
af918f
af918f
diff --git a/lib/pengine/unpack.c b/lib/pengine/unpack.c
af918f
index 7b0d837..bb5efa4 100644
af918f
--- a/lib/pengine/unpack.c
af918f
+++ b/lib/pengine/unpack.c
af918f
@@ -18,6 +18,7 @@
af918f
 #include <crm_internal.h>
af918f
 
af918f
 #include <glib.h>
af918f
+#include <time.h>
af918f
 
af918f
 #include <crm/crm.h>
af918f
 #include <crm/services.h>
af918f
@@ -1151,7 +1152,8 @@ unpack_node_loop(xmlNode * status, bool fence, pe_working_set_t * data_set)
af918f
                 crm_trace("Checking node %s/%s/%s status %d/%d/%d", id, rsc->id, rsc->container->id, fence, rsc->role, RSC_ROLE_STARTED);
af918f
 
af918f
             } else if (is_container_remote_node(this_node) == FALSE
af918f
-                       && rsc->role == RSC_ROLE_STARTED) {
af918f
+                       && ((rsc->role == RSC_ROLE_STARTED)
af918f
+                           || is_set(data_set->flags, pe_flag_shutdown_lock))) {
af918f
                 check = TRUE;
af918f
                 crm_trace("Checking node %s/%s status %d/%d/%d", id, rsc->id, fence, rsc->role, RSC_ROLE_STARTED);
af918f
             }
af918f
@@ -1167,6 +1169,9 @@ unpack_node_loop(xmlNode * status, bool fence, pe_working_set_t * data_set)
af918f
 
af918f
         } else if (fence) {
af918f
             process = TRUE;
af918f
+
af918f
+        } else if (is_set(data_set->flags, pe_flag_shutdown_lock)) {
af918f
+            process = TRUE;
af918f
         }
af918f
 
af918f
         if(process) {
af918f
@@ -2286,6 +2291,28 @@ calculate_active_ops(GListPtr sorted_op_list, int *start_index, int *stop_index)
af918f
     }
af918f
 }
af918f
 
af918f
+// If resource history entry has shutdown lock, remember lock node and time
af918f
+static void
af918f
+unpack_shutdown_lock(xmlNode *rsc_entry, pe_resource_t *rsc, pe_node_t *node,
af918f
+                     pe_working_set_t *data_set)
af918f
+{
af918f
+    time_t lock_time = 0;   // When lock started (i.e. node shutdown time)
af918f
+
af918f
+    if ((crm_element_value_epoch(rsc_entry, XML_CONFIG_ATTR_SHUTDOWN_LOCK,
af918f
+                                 &lock_time) == pcmk_ok) && (lock_time != 0)) {
af918f
+
af918f
+        if ((data_set->shutdown_lock > 0)
af918f
+            && (get_effective_time(data_set)
af918f
+                > (lock_time + data_set->shutdown_lock))) {
af918f
+            pe_rsc_info(rsc, "Shutdown lock for %s on %s expired",
af918f
+                        rsc->id, node->details->uname);
af918f
+        } else {
af918f
+            rsc->lock_node = node;
af918f
+            rsc->lock_time = lock_time;
af918f
+        }
af918f
+    }
af918f
+}
af918f
+
af918f
 static resource_t *
af918f
 unpack_lrm_rsc_state(node_t * node, xmlNode * rsc_entry, pe_working_set_t * data_set)
af918f
 {
af918f
@@ -2322,18 +2349,30 @@ unpack_lrm_rsc_state(node_t * node, xmlNode * rsc_entry, pe_working_set_t * data
af918f
         }
af918f
     }
af918f
 
af918f
-    if (op_list == NULL) {
af918f
-        /* if there are no operations, there is nothing to do */
af918f
-        return NULL;
af918f
+    if (is_not_set(data_set->flags, pe_flag_shutdown_lock)) {
af918f
+        if (op_list == NULL) {
af918f
+            // If there are no operations, there is nothing to do
af918f
+            return NULL;
af918f
+        }
af918f
     }
af918f
 
af918f
     /* find the resource */
af918f
     rsc = unpack_find_resource(data_set, node, rsc_id, rsc_entry);
af918f
     if (rsc == NULL) {
af918f
-        rsc = process_orphan_resource(rsc_entry, node, data_set);
af918f
+        if (op_list == NULL) {
af918f
+            // If there are no operations, there is nothing to do
af918f
+            return NULL;
af918f
+        } else {
af918f
+            rsc = process_orphan_resource(rsc_entry, node, data_set);
af918f
+        }
af918f
     }
af918f
     CRM_ASSERT(rsc != NULL);
af918f
 
af918f
+    // Check whether the resource is "shutdown-locked" to this node
af918f
+    if (is_set(data_set->flags, pe_flag_shutdown_lock)) {
af918f
+        unpack_shutdown_lock(rsc_entry, rsc, node, data_set);
af918f
+    }
af918f
+
af918f
     /* process operations */
af918f
     saved_role = rsc->role;
af918f
     on_fail = action_fail_ignore;
af918f
diff --git a/pengine/allocate.c b/pengine/allocate.c
af918f
index 09f9e51..7366716 100644
af918f
--- a/pengine/allocate.c
af918f
+++ b/pengine/allocate.c
af918f
@@ -1047,8 +1047,23 @@ apply_shutdown_lock(pe_resource_t *rsc, pe_working_set_t *data_set)
af918f
         return;
af918f
     }
af918f
 
af918f
+    if (rsc->lock_node != NULL) {
af918f
+        // The lock was obtained from resource history
af918f
+
af918f
+        if (rsc->running_on != NULL) {
af918f
+            /* The resource was started elsewhere even though it is now
af918f
+             * considered locked. This shouldn't be possible, but as a
af918f
+             * failsafe, we don't want to disturb the resource now.
af918f
+             */
af918f
+            pe_rsc_info(rsc,
af918f
+                        "Cancelling shutdown lock because %s is already active",
af918f
+                        rsc->id);
af918f
+            rsc->lock_node = NULL;
af918f
+            rsc->lock_time = 0;
af918f
+        }
af918f
+
af918f
     // Only a resource active on exactly one node can be locked
af918f
-    if (pcmk__list_of_1(rsc->running_on)) {
af918f
+    } else if (pcmk__list_of_1(rsc->running_on)) {
af918f
         pe_node_t *node = rsc->running_on->data;
af918f
 
af918f
         if (node->details->shutdown) {
af918f
diff --git a/pengine/graph.c b/pengine/graph.c
af918f
index cba30d0..33168ca 100644
af918f
--- a/pengine/graph.c
af918f
+++ b/pengine/graph.c
af918f
@@ -1,5 +1,7 @@
af918f
 /*
af918f
- * Copyright (C) 2004 Andrew Beekhof <andrew@beekhof.net>
af918f
+ * Copyright 2004-2020 the Pacemaker project contributors
af918f
+ *
af918f
+ * The version control history for this file may have further details.
af918f
  *
af918f
  * This program is free software; you can redistribute it and/or
af918f
  * modify it under the terms of the GNU General Public
af918f
@@ -998,6 +1000,26 @@ add_downed_nodes(xmlNode *xml, const action_t *action,
af918f
     }
af918f
 }
af918f
 
af918f
+static bool
af918f
+should_lock_action(pe_action_t *action)
af918f
+{
af918f
+    // Only actions taking place on resource's lock node are locked
af918f
+    if ((action->rsc->lock_node == NULL) || (action->node == NULL)
af918f
+        || (action->node->details != action->rsc->lock_node->details)) {
af918f
+        return false;
af918f
+    }
af918f
+
af918f
+    /* During shutdown, only stops are locked (otherwise, another action such as
af918f
+     * a demote would cause the controller to clear the lock)
af918f
+     */
af918f
+    if (action->node->details->shutdown && action->task
af918f
+        && strcmp(action->task, RSC_STOP)) {
af918f
+        return false;
af918f
+    }
af918f
+
af918f
+    return true;
af918f
+}
af918f
+
af918f
 static xmlNode *
af918f
 action2xml(action_t * action, gboolean as_input, pe_working_set_t *data_set)
af918f
 {
af918f
@@ -1104,6 +1126,14 @@ action2xml(action_t * action, gboolean as_input, pe_working_set_t *data_set)
af918f
             XML_ATTR_TYPE
af918f
         };
af918f
 
af918f
+        /* If a resource is locked to a node via shutdown-lock, mark its actions
af918f
+         * so the controller can preserve the lock when the action completes.
af918f
+         */
af918f
+        if (should_lock_action(action)) {
af918f
+            crm_xml_add_ll(action_xml, XML_CONFIG_ATTR_SHUTDOWN_LOCK,
af918f
+                           (long long) action->rsc->lock_time);
af918f
+        }
af918f
+
af918f
         // List affected resource
af918f
 
af918f
         rsc_xml = create_xml_node(action_xml,
af918f
-- 
af918f
1.8.3.1
af918f