|
|
42b704 |
From cf1e7bfab984b5e9451a63c25b39c0932e0d9116 Mon Sep 17 00:00:00 2001
|
|
|
42b704 |
From: Michele Baldessari <michele@acksyn.org>
|
|
|
42b704 |
Date: Wed, 6 May 2020 16:11:36 +0200
|
|
|
42b704 |
Subject: [PATCH] Increase the rabbitmqctl wait timeout during start()
|
|
|
42b704 |
|
|
|
42b704 |
After we start the rabbitmq process we wait for the pid to show up
|
|
|
42b704 |
and then declare the server to be started successfully.
|
|
|
42b704 |
This wait is done via 'rabbitmqctl wait'. Now from
|
|
|
42b704 |
https://www.rabbitmq.com/rabbitmqctl.8.html we have:
|
|
|
42b704 |
|
|
|
42b704 |
If the specified pidfile is not created or erlang node is not started
|
|
|
42b704 |
within --timeout the command will fail. Default timeout is 10 seconds.
|
|
|
42b704 |
|
|
|
42b704 |
This default of 10 seconds might not be enough in overloaded
|
|
|
42b704 |
environments. So what we want to do here is wait for as much time as
|
|
|
42b704 |
the start() operation allows us. So we wait for OCF_RESKEY_CRM_meta_timeout
|
|
|
42b704 |
minus 5 seconds. In the rare and nonsensical case that it is not more than
|
|
|
42b704 |
10s we do not pass a timeout string at all to rabbitmqctl.
|
|
|
42b704 |
|
|
|
42b704 |
Co-Authored-By: John Eckersberg <jeckersb@redhat.com>
|
|
|
42b704 |
---
|
|
|
42b704 |
heartbeat/rabbitmq-cluster | 15 +++++++++++++--
|
|
|
42b704 |
1 file changed, 13 insertions(+), 2 deletions(-)
|
|
|
42b704 |
|
|
|
42b704 |
diff --git a/heartbeat/rabbitmq-cluster b/heartbeat/rabbitmq-cluster
|
|
|
42b704 |
index a9ebd37ad..f7d48120c 100755
|
|
|
42b704 |
--- a/heartbeat/rabbitmq-cluster
|
|
|
42b704 |
+++ b/heartbeat/rabbitmq-cluster
|
|
|
42b704 |
@@ -294,6 +294,8 @@ rmq_monitor() {
|
|
|
42b704 |
rmq_init_and_wait()
|
|
|
42b704 |
{
|
|
|
42b704 |
local rc
|
|
|
42b704 |
+ local wait_timeout
|
|
|
42b704 |
+ local timeout_string
|
|
|
42b704 |
|
|
|
42b704 |
prepare_dir $RMQ_PID_DIR
|
|
|
42b704 |
prepare_dir $RMQ_LOG_DIR
|
|
|
42b704 |
@@ -305,11 +307,20 @@ rmq_init_and_wait()
|
|
|
42b704 |
setsid sh -c "$RMQ_SERVER > ${RMQ_LOG_DIR}/startup_log 2> ${RMQ_LOG_DIR}/startup_err" &
|
|
|
42b704 |
|
|
|
42b704 |
ocf_log info "Waiting for server to start"
|
|
|
42b704 |
- $RMQ_CTL wait $RMQ_PID_FILE
|
|
|
42b704 |
+ # We want to give the wait command almost the full startup timeout we are given
|
|
|
42b704 |
+ # So we use the start operation timeout (in ms), convert it and subtract 5 seconds
|
|
|
42b704 |
+ # In the silly case that it is less than 10 seconds we just skip setting the timeout
|
|
|
42b704 |
+ wait_timeout=`expr $OCF_RESKEY_CRM_meta_timeout / 1000 - 5`
|
|
|
42b704 |
+ if [ $wait_timeout -gt 10 ]; then
|
|
|
42b704 |
+ timeout_string="--timeout ${wait_timeout}"
|
|
|
42b704 |
+ else
|
|
|
42b704 |
+ timeout_string=""
|
|
|
42b704 |
+ fi
|
|
|
42b704 |
+ $RMQ_CTL $timeout_string wait $RMQ_PID_FILE
|
|
|
42b704 |
rc=$?
|
|
|
42b704 |
if [ $rc -ne $OCF_SUCCESS ]; then
|
|
|
42b704 |
remove_pid
|
|
|
42b704 |
- ocf_log info "rabbitmq-server start failed: $rc"
|
|
|
42b704 |
+ ocf_log info "rabbitmq-server start failed with a timeout of ($timeout_string): $rc"
|
|
|
42b704 |
return $OCF_ERR_GENERIC
|
|
|
42b704 |
fi
|
|
|
42b704 |
|