Blob Blame History Raw
From cf1e7bfab984b5e9451a63c25b39c0932e0d9116 Mon Sep 17 00:00:00 2001
From: Michele Baldessari <michele@acksyn.org>
Date: Wed, 6 May 2020 16:11:36 +0200
Subject: [PATCH] Increase the rabbitmqctl wait timeout during start()

After we start the rabbitmq process we wait for the pid to show up
and then declare the server to be started successfully.
This wait is done via 'rabbitmqctl wait'. From
https://www.rabbitmq.com/rabbitmqctl.8.html we have:

  If the specified pidfile is not created or erlang node is not started
  within --timeout the command will fail. Default timeout is 10 seconds.

This default of 10 seconds might not be enough in overloaded
environments. So what we want to do here is wait for as much time as
the start() operation allows us. So we wait for OCF_RESKEY_CRM_meta_timeout
(in milliseconds, converted to seconds) minus 5 seconds. In the rare and
nonsensical case that the result is 10 seconds or less we do not pass a
timeout string at all to rabbitmqctl.

Co-Authored-By: John Eckersberg <jeckersb@redhat.com>
---
 heartbeat/rabbitmq-cluster | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/heartbeat/rabbitmq-cluster b/heartbeat/rabbitmq-cluster
index a9ebd37ad..f7d48120c 100755
--- a/heartbeat/rabbitmq-cluster
+++ b/heartbeat/rabbitmq-cluster
@@ -294,6 +294,8 @@ rmq_monitor() {
 rmq_init_and_wait()
 {
 	local rc
+	local wait_timeout
+	local timeout_string
 
 	prepare_dir $RMQ_PID_DIR
 	prepare_dir $RMQ_LOG_DIR
@@ -305,11 +307,20 @@ rmq_init_and_wait()
 	setsid sh -c "$RMQ_SERVER > ${RMQ_LOG_DIR}/startup_log 2> ${RMQ_LOG_DIR}/startup_err" &
 
 	ocf_log info "Waiting for server to start"
-	$RMQ_CTL wait $RMQ_PID_FILE
+	# We want to give the wait command almost the full startup timeout we are given
+	# So we use the start operation timeout (in ms), convert it and subtract 5 seconds
+	# In the silly case that the result is 10 seconds or less we just skip setting the timeout
+	wait_timeout=`expr $OCF_RESKEY_CRM_meta_timeout / 1000 - 5`
+	if [ $wait_timeout -gt 10 ]; then
+		timeout_string="--timeout ${wait_timeout}"
+	else
+		timeout_string=""
+	fi
+	$RMQ_CTL $timeout_string wait $RMQ_PID_FILE
 	rc=$?
 	if [ $rc -ne $OCF_SUCCESS ]; then
 		remove_pid
-		ocf_log info "rabbitmq-server start failed: $rc"
+		ocf_log info "rabbitmq-server start failed with a timeout of ($timeout_string): $rc"
 		return $OCF_ERR_GENERIC
 	fi