Blame SOURCES/bz1832321-rabbitmq-cluster-increase-wait-timeout.patch

42b704
From cf1e7bfab984b5e9451a63c25b39c0932e0d9116 Mon Sep 17 00:00:00 2001
42b704
From: Michele Baldessari <michele@acksyn.org>
42b704
Date: Wed, 6 May 2020 16:11:36 +0200
42b704
Subject: [PATCH] Increase the rabbitmqctl wait timeout during start()
42b704
42b704
After we start the rabbitmq process we wait for the pid to show up
42b704
and then declare the server to be started successfully.
42b704
This wait is done via 'rabbitmqctl wait'.
42b704
From https://www.rabbitmq.com/rabbitmqctl.8.html we have:
42b704
42b704
  If the specified pidfile is not created or erlang node is not started
42b704
  within --timeout the command will fail. Default timeout is 10 seconds.
42b704
42b704
This default of 10 seconds might not be enough in overloaded
42b704
environments. So what we want to do here is wait for as much time as
42b704
the start() operation allows us. So we wait for OCF_RESKEY_CRM_meta_timeout
42b704
minus 5 seconds. In the rare and nonsensical case that it is not more than
42b704
10s we do not pass a timeout string at all to rabbitmqctl.
42b704
42b704
Co-Authored-By: John Eckersberg <jeckersb@redhat.com>
42b704
---
42b704
 heartbeat/rabbitmq-cluster | 15 +++++++++++++--
42b704
 1 file changed, 13 insertions(+), 2 deletions(-)
42b704
42b704
diff --git a/heartbeat/rabbitmq-cluster b/heartbeat/rabbitmq-cluster
42b704
index a9ebd37ad..f7d48120c 100755
42b704
--- a/heartbeat/rabbitmq-cluster
42b704
+++ b/heartbeat/rabbitmq-cluster
42b704
@@ -294,6 +294,8 @@ rmq_monitor() {
42b704
 rmq_init_and_wait()
42b704
 {
42b704
 	local rc
42b704
+	local wait_timeout
42b704
+	local timeout_string
42b704
 
42b704
 	prepare_dir $RMQ_PID_DIR
42b704
 	prepare_dir $RMQ_LOG_DIR
42b704
@@ -305,11 +307,20 @@ rmq_init_and_wait()
42b704
 	setsid sh -c "$RMQ_SERVER > ${RMQ_LOG_DIR}/startup_log 2> ${RMQ_LOG_DIR}/startup_err" &
42b704
 
42b704
 	ocf_log info "Waiting for server to start"
42b704
-	$RMQ_CTL wait $RMQ_PID_FILE
42b704
+	# We want to give the wait command almost the full startup timeout we are given
42b704
+	# So we use the start operation timeout (in ms), convert it and subtract 5 seconds
42b704
+	# In the silly case that it is less than 10 seconds we just skip setting the timeout
42b704
+	wait_timeout=`expr $OCF_RESKEY_CRM_meta_timeout / 1000 - 5`
42b704
+	if [ $wait_timeout -gt 10 ]; then
42b704
+		timeout_string="--timeout ${wait_timeout}"
42b704
+	else
42b704
+		timeout_string=""
42b704
+	fi
42b704
+	$RMQ_CTL $timeout_string wait $RMQ_PID_FILE
42b704
 	rc=$?
42b704
 	if [ $rc -ne $OCF_SUCCESS ]; then
42b704
 		remove_pid
42b704
-		ocf_log info "rabbitmq-server start failed: $rc"
42b704
+		ocf_log info "rabbitmq-server start failed with a timeout of ($timeout_string): $rc"
42b704
 		return $OCF_ERR_GENERIC
42b704
 	fi
42b704