|
 |
0240e4 |
From c83dc10b975aa70a3da85dc2e63cec99a0b729b2 Mon Sep 17 00:00:00 2001
|
|
 |
0240e4 |
From: Ken Gaillot <kgaillot@redhat.com>
|
|
 |
0240e4 |
Date: Wed, 23 Dec 2015 15:19:28 -0600
|
|
 |
0240e4 |
Subject: [PATCH] Feature: pacemaker_remote: support graceful stops
|
|
 |
0240e4 |
|
|
 |
0240e4 |
When pacemaker_remote gets an interrupt signal, if there are any connected
|
|
 |
0240e4 |
proxy providers, it will send an lrmd IPC op for a shutdown request,
|
|
 |
0240e4 |
and stop accepting new provider connections. If the provider acknowledges the
|
|
 |
0240e4 |
request, pacemaker_remote will wait until all providers disconnect before
|
|
 |
0240e4 |
exiting itself. This gives the cluster the opportunity to stop any resources
|
|
 |
0240e4 |
running on the node that is shutting down.
|
|
 |
0240e4 |
|
|
 |
0240e4 |
If the provider is an older version that does not support graceful stops,
|
|
 |
0240e4 |
pacemaker_remote will time out waiting for the ack, then exit immediately.
|
|
 |
0240e4 |
|
|
 |
0240e4 |
Since we are now waiting for resources to exit, the systemd stop timeout
|
|
 |
0240e4 |
for pacemaker_remote has been raised to match pacemaker's.
|
|
 |
0240e4 |
|
|
 |
0240e4 |
lrmd/ipc_proxy.c | 12 +++-
|
|
 |
0240e4 |
lrmd/lrmd_private.h | 4 +-
|
|
 |
0240e4 |
lrmd/main.c | 121 +++++++++++++++++++++++++++++++++++++
|
|
 |
0240e4 |
lrmd/pacemaker_remote.service.in | 4 +-
|
|
 |
0240e4 |
lrmd/tls_backend.c | 3 +-
|
|
 |
0240e4 |
5 files changed, 135 insertions(+), 9 deletions(-)
|
|
 |
0240e4 |
|
|
 |
0240e4 |
diff --git a/lrmd/ipc_proxy.c b/lrmd/ipc_proxy.c
|
|
 |
0240e4 |
index 9633a67..07c13ab 100644
|
|
 |
0240e4 |
|
|
 |
0240e4 |
|
|
 |
0240e4 |
@@ -152,9 +152,19 @@ ipc_proxy_forward_client(crm_client_t *ipc_proxy, xmlNode *xml)
|
|
 |
0240e4 |
const char *session = crm_element_value(xml, F_LRMD_IPC_SESSION);
|
|
 |
0240e4 |
const char *msg_type = crm_element_value(xml, F_LRMD_IPC_OP);
|
|
 |
0240e4 |
xmlNode *msg = get_message_xml(xml, F_LRMD_IPC_MSG);
|
|
 |
0240e4 |
- crm_client_t *ipc_client = crm_client_get_by_id(session);
|
|
 |
0240e4 |
+ crm_client_t *ipc_client;
|
|
 |
0240e4 |
int rc = 0;
|
|
 |
0240e4 |
|
|
 |
0240e4 |
+ /* If the IPC provider is acknowledging our shutdown request,
|
|
 |
0240e4 |
+ * defuse the short exit timer to give the cluster time to
|
|
 |
0240e4 |
+ * stop any resources we're running.
|
|
 |
0240e4 |
+ */
|
|
 |
0240e4 |
+ if (safe_str_eq(msg_type, LRMD_IPC_OP_SHUTDOWN_ACK)) {
|
|
 |
0240e4 |
+ handle_shutdown_ack();
|
|
 |
0240e4 |
+ return;
|
|
 |
0240e4 |
+ }
|
|
 |
0240e4 |
+
|
|
 |
0240e4 |
+ ipc_client = crm_client_get_by_id(session);
|
|
 |
0240e4 |
if (ipc_client == NULL) {
|
|
 |
0240e4 |
xmlNode *msg = create_xml_node(NULL, T_LRMD_IPC_PROXY);
|
|
 |
0240e4 |
crm_xml_add(msg, F_LRMD_IPC_OP, LRMD_IPC_OP_DESTROY);
|
|
 |
0240e4 |
diff --git a/lrmd/lrmd_private.h b/lrmd/lrmd_private.h
|
|
 |
0240e4 |
index 78f14c9..29146f5 100644
|
|
 |
0240e4 |
|
|
 |
0240e4 |
|
|
 |
0240e4 |
@@ -80,7 +80,9 @@ void process_lrmd_message(crm_client_t * client, uint32_t id, xmlNode * request)
|
|
 |
0240e4 |
|
|
 |
0240e4 |
void free_rsc(gpointer data);
|
|
 |
0240e4 |
|
|
 |
0240e4 |
-void lrmd_shutdown(int nsig);
|
|
 |
0240e4 |
+void handle_shutdown_ack(void);
|
|
 |
0240e4 |
+
|
|
 |
0240e4 |
+void lrmd_client_destroy(crm_client_t *client);
|
|
 |
0240e4 |
|
|
 |
0240e4 |
void client_disconnect_cleanup(const char *client_id);
|
|
 |
0240e4 |
|
|
 |
0240e4 |
diff --git a/lrmd/main.c b/lrmd/main.c
|
|
 |
0240e4 |
index 73519e2..98a1412 100644
|
|
 |
0240e4 |
|
|
 |
0240e4 |
|
|
 |
0240e4 |
@@ -40,6 +40,16 @@ static qb_ipcs_service_t *ipcs = NULL;
|
|
 |
0240e4 |
stonith_t *stonith_api = NULL;
|
|
 |
0240e4 |
int lrmd_call_id = 0;
|
|
 |
0240e4 |
|
|
 |
0240e4 |
+#ifdef ENABLE_PCMK_REMOTE
|
|
 |
0240e4 |
+
|
|
 |
0240e4 |
+static volatile sig_atomic_t shutting_down = FALSE;
|
|
 |
0240e4 |
+
|
|
 |
0240e4 |
+/* timer for waiting for acknowledgment of shutdown request */
|
|
 |
0240e4 |
+static volatile guint shutdown_ack_timer = 0;
|
|
 |
0240e4 |
+
|
|
 |
0240e4 |
+static gboolean lrmd_exit(gpointer data);
|
|
 |
0240e4 |
+#endif
|
|
 |
0240e4 |
+
|
|
 |
0240e4 |
static void
|
|
 |
0240e4 |
stonith_connection_destroy_cb(stonith_t * st, stonith_event_t * e)
|
|
 |
0240e4 |
{
|
|
 |
0240e4 |
@@ -151,6 +161,27 @@ lrmd_ipc_dispatch(qb_ipcs_connection_t * c, void *data, size_t size)
|
|
 |
0240e4 |
return 0;
|
|
 |
0240e4 |
}
|
|
 |
0240e4 |
|
|
 |
0240e4 |
+/*!
|
|
 |
0240e4 |
+ * \internal
|
|
 |
0240e4 |
+ * \brief Free a client connection, and exit if appropriate
|
|
 |
0240e4 |
+ *
|
|
 |
0240e4 |
+ * \param[in] client Client connection to free
|
|
 |
0240e4 |
+ */
|
|
 |
0240e4 |
+void
|
|
 |
0240e4 |
+lrmd_client_destroy(crm_client_t *client)
|
|
 |
0240e4 |
+{
|
|
 |
0240e4 |
+ crm_client_destroy(client);
|
|
 |
0240e4 |
+
|
|
 |
0240e4 |
+#ifdef ENABLE_PCMK_REMOTE
|
|
 |
0240e4 |
+ /* If we were waiting to shut down, we can now safely do so
|
|
 |
0240e4 |
+ * if there are no more proxied IPC providers
|
|
 |
0240e4 |
+ */
|
|
 |
0240e4 |
+ if (shutting_down && (ipc_proxy_get_provider() == NULL)) {
|
|
 |
0240e4 |
+ lrmd_exit(NULL);
|
|
 |
0240e4 |
+ }
|
|
 |
0240e4 |
+#endif
|
|
 |
0240e4 |
+}
|
|
 |
0240e4 |
+
|
|
 |
0240e4 |
static int32_t
|
|
 |
0240e4 |
lrmd_ipc_closed(qb_ipcs_connection_t * c)
|
|
 |
0240e4 |
{
|
|
 |
0240e4 |
@@ -165,7 +196,7 @@ lrmd_ipc_closed(qb_ipcs_connection_t * c)
|
|
 |
0240e4 |
#ifdef ENABLE_PCMK_REMOTE
|
|
 |
0240e4 |
ipc_proxy_remove_provider(client);
|
|
 |
0240e4 |
#endif
|
|
 |
0240e4 |
- crm_client_destroy(client);
|
|
 |
0240e4 |
+ lrmd_client_destroy(client);
|
|
 |
0240e4 |
return 0;
|
|
 |
0240e4 |
}
|
|
 |
0240e4 |
|
|
 |
0240e4 |
@@ -227,8 +258,17 @@ lrmd_server_send_notify(crm_client_t * client, xmlNode * msg)
|
|
 |
0240e4 |
return -1;
|
|
 |
0240e4 |
}
|
|
 |
0240e4 |
|
|
 |
0240e4 |
-void
|
|
 |
0240e4 |
-lrmd_shutdown(int nsig)
|
|
 |
0240e4 |
+/*!
|
|
 |
0240e4 |
+ * \internal
|
|
 |
0240e4 |
+ * \brief Clean up and exit immediately
|
|
 |
0240e4 |
+ *
|
|
 |
0240e4 |
+ * \param[in] data Ignored
|
|
 |
0240e4 |
+ *
|
|
 |
0240e4 |
+ * \return Doesn't return
|
|
 |
0240e4 |
+ * \note This can be used as a timer callback.
|
|
 |
0240e4 |
+ */
|
|
 |
0240e4 |
+static gboolean
|
|
 |
0240e4 |
+lrmd_exit(gpointer data)
|
|
 |
0240e4 |
{
|
|
 |
0240e4 |
crm_info("Terminating with %d clients", crm_hash_table_size(client_connections));
|
|
 |
0240e4 |
|
|
 |
0240e4 |
@@ -249,6 +289,79 @@ lrmd_shutdown(int nsig)
|
|
 |
0240e4 |
crm_client_cleanup();
|
|
 |
0240e4 |
g_hash_table_destroy(rsc_list);
|
|
 |
0240e4 |
crm_exit(pcmk_ok);
|
|
 |
0240e4 |
+ return FALSE;
|
|
 |
0240e4 |
+}
|
|
 |
0240e4 |
+
|
|
 |
0240e4 |
+/*!
|
|
 |
0240e4 |
+ * \internal
|
|
 |
0240e4 |
+ * \brief Request cluster shutdown if appropriate, otherwise exit immediately
|
|
 |
0240e4 |
+ *
|
|
 |
0240e4 |
+ * \param[in] nsig Signal that caused invocation (ignored)
|
|
 |
0240e4 |
+ */
|
|
 |
0240e4 |
+static void
|
|
 |
0240e4 |
+lrmd_shutdown(int nsig)
|
|
 |
0240e4 |
+{
|
|
 |
0240e4 |
+#ifdef ENABLE_PCMK_REMOTE
|
|
 |
0240e4 |
+ crm_client_t *ipc_proxy = ipc_proxy_get_provider();
|
|
 |
0240e4 |
+
|
|
 |
0240e4 |
+ /* If there are active proxied IPC providers, then we may be running
|
|
 |
0240e4 |
+ * resources, so notify the cluster that we wish to shut down.
|
|
 |
0240e4 |
+ */
|
|
 |
0240e4 |
+ if (ipc_proxy) {
|
|
 |
0240e4 |
+ if (shutting_down) {
|
|
 |
0240e4 |
+ crm_trace("Shutdown already in progress");
|
|
 |
0240e4 |
+ return;
|
|
 |
0240e4 |
+ }
|
|
 |
0240e4 |
+
|
|
 |
0240e4 |
+ crm_info("Sending shutdown request to cluster");
|
|
 |
0240e4 |
+ if (ipc_proxy_shutdown_req(ipc_proxy) < 0) {
|
|
 |
0240e4 |
+ crm_crit("Shutdown request failed, exiting immediately");
|
|
 |
0240e4 |
+
|
|
 |
0240e4 |
+ } else {
|
|
 |
0240e4 |
+ /* We requested a shutdown. Now, we need to wait for an
|
|
 |
0240e4 |
+ * acknowledgement from the proxy host (which ensures the proxy host
|
|
 |
0240e4 |
+ * supports shutdown requests), then wait for all proxy hosts to
|
|
 |
0240e4 |
+ * disconnect (which ensures that all resources have been stopped).
|
|
 |
0240e4 |
+ */
|
|
 |
0240e4 |
+ shutting_down = TRUE;
|
|
 |
0240e4 |
+
|
|
 |
0240e4 |
+ /* Stop accepting new proxy connections */
|
|
 |
0240e4 |
+ lrmd_tls_server_destroy();
|
|
 |
0240e4 |
+
|
|
 |
0240e4 |
+ /* Older crmd versions will never acknowledge our request, so set a
|
|
 |
0240e4 |
+ * fairly short timeout to exit quickly in that case. If we get the
|
|
 |
0240e4 |
+ * ack, we'll defuse this timer.
|
|
 |
0240e4 |
+ */
|
|
 |
0240e4 |
+ shutdown_ack_timer = g_timeout_add_seconds(20, lrmd_exit, NULL);
|
|
 |
0240e4 |
+
|
|
 |
0240e4 |
+ /* Currently, we let the OS kill us if the clients don't disconnect
|
|
 |
0240e4 |
+ * in a reasonable time. We could instead set a long timer here
|
|
 |
0240e4 |
+ * (shorter than what the OS is likely to use) and exit immediately
|
|
 |
0240e4 |
+ * if it pops.
|
|
 |
0240e4 |
+ */
|
|
 |
0240e4 |
+ return;
|
|
 |
0240e4 |
+ }
|
|
 |
0240e4 |
+ }
|
|
 |
0240e4 |
+#endif
|
|
 |
0240e4 |
+ lrmd_exit(NULL);
|
|
 |
0240e4 |
+}
|
|
 |
0240e4 |
+
|
|
 |
0240e4 |
+/*!
|
|
 |
0240e4 |
+ * \internal
|
|
 |
0240e4 |
+ * \brief Defuse short exit timer if shutting down
|
|
 |
0240e4 |
+ */
|
|
 |
0240e4 |
+void handle_shutdown_ack()
|
|
 |
0240e4 |
+{
|
|
 |
0240e4 |
+#ifdef ENABLE_PCMK_REMOTE
|
|
 |
0240e4 |
+ if (shutting_down) {
|
|
 |
0240e4 |
+ crm_info("Received shutdown ack");
|
|
 |
0240e4 |
+ if (shutdown_ack_timer > 0) {
|
|
 |
0240e4 |
+ g_source_remove(shutdown_ack_timer);
|
|
 |
0240e4 |
+ }
|
|
 |
0240e4 |
+ return;
|
|
 |
0240e4 |
+ }
|
|
 |
0240e4 |
+#endif
|
|
 |
0240e4 |
+ crm_debug("Ignoring unexpected shutdown ack");
|
|
 |
0240e4 |
}
|
|
 |
0240e4 |
|
|
 |
0240e4 |
/* *INDENT-OFF* */
|
|
 |
0240e4 |
@@ -363,6 +476,6 @@ main(int argc, char **argv)
|
|
 |
0240e4 |
g_main_run(mainloop);
|
|
 |
0240e4 |
|
|
 |
0240e4 |
|
|
 |
0240e4 |
- lrmd_shutdown(SIGTERM);
|
|
 |
0240e4 |
+ lrmd_exit(NULL);
|
|
 |
0240e4 |
return pcmk_ok;
|
|
 |
0240e4 |
}
|
|
 |
0240e4 |
diff --git a/lrmd/pacemaker_remote.service.in b/lrmd/pacemaker_remote.service.in
|
|
 |
0240e4 |
index 15e61fb..7252976 100644
|
|
 |
0240e4 |
|
|
 |
0240e4 |
+++ b/lrmd/pacemaker_remote.service.in
|
|
 |
0240e4 |
@@ -13,7 +13,9 @@ EnvironmentFile=-/etc/sysconfig/pacemaker
|
|
 |
0240e4 |
|
|
 |
0240e4 |
ExecStart=@sbindir@/pacemaker_remoted
|
|
 |
0240e4 |
|
|
 |
0240e4 |
-TimeoutStopSec=30s
|
|
 |
0240e4 |
+# Pacemaker Remote can exit only after all managed services have shut down;
|
|
 |
0240e4 |
+# an HA database could conceivably take even longer than this
|
|
 |
0240e4 |
+TimeoutStopSec=30min
|
|
 |
0240e4 |
TimeoutStartSec=30s
|
|
 |
0240e4 |
|
|
 |
0240e4 |
# Restart options include: no, on-success, on-failure, on-abort or always
|
|
 |
0240e4 |
diff --git a/lrmd/tls_backend.c b/lrmd/tls_backend.c
|
|
 |
0240e4 |
index df5387f..7b8ef9d 100644
|
|
 |
0240e4 |
|
|
 |
0240e4 |
|
|
 |
0240e4 |
@@ -163,8 +163,7 @@ lrmd_remote_client_destroy(gpointer user_data)
|
|
 |
0240e4 |
close(csock);
|
|
 |
0240e4 |
}
|
|
 |
0240e4 |
|
|
 |
0240e4 |
- crm_client_destroy(client);
|
|
 |
0240e4 |
-
|
|
 |
0240e4 |
+ lrmd_client_destroy(client);
|
|
 |
0240e4 |
return;
|
|
 |
0240e4 |
}
|
|
 |
0240e4 |
|
|
 |
0240e4 |
--
|
|
 |
0240e4 |
1.8.3.1
|
|
 |
0240e4 |
|