From cf1e7bfab984b5e9451a63c25b39c0932e0d9116 Mon Sep 17 00:00:00 2001 From: Michele Baldessari Date: Wed, 6 May 2020 16:11:36 +0200 Subject: [PATCH] Increase the rabbitmqctl wait timeout during start() After we start the rabbitmq process we wait for the pid to show up and then declare the server to be started successfully. This wait is done via 'rabbitmqctl wait'. Now from https://www.rabbitmq.com/rabbitmqctl.8.html we have: If the specified pidfile is not created or erlang node is not started within --timeout the command will fail. Default timeout is 10 seconds. This default of 10 seconds might not be enough in overloaded environments. So what we want to do here is wait for as much time as the start() operation allows us. So we wait for OCF_RESKEY_CRM_meta_timeout minus 5 seconds. In the rare and non-sensical case that it is less than 10s we do not pass a timeout string at all to rabbitmqctl. Co-Authored-By: John Eckersberg --- heartbeat/rabbitmq-cluster | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/heartbeat/rabbitmq-cluster b/heartbeat/rabbitmq-cluster index a9ebd37ad..f7d48120c 100755 --- a/heartbeat/rabbitmq-cluster +++ b/heartbeat/rabbitmq-cluster @@ -294,6 +294,8 @@ rmq_monitor() { rmq_init_and_wait() { local rc + local wait_timeout + local timeout_string prepare_dir $RMQ_PID_DIR prepare_dir $RMQ_LOG_DIR @@ -305,11 +307,20 @@ rmq_init_and_wait() setsid sh -c "$RMQ_SERVER > ${RMQ_LOG_DIR}/startup_log 2> ${RMQ_LOG_DIR}/startup_err" & ocf_log info "Waiting for server to start" - $RMQ_CTL wait $RMQ_PID_FILE + # We want to give the wait command almost the full startup timeout we are given + # So we use the start operation timeout (in ms), convert it and subtract 5 seconds + # In the silly case that it is less than 10 seconds we just skip setting the timeout + wait_timeout=`expr $OCF_RESKEY_CRM_meta_timeout / 1000 - 5` + if [ $wait_timeout -gt 10 ]; then + timeout_string="--timeout ${wait_timeout}" + else + 
timeout_string="" + fi + $RMQ_CTL $timeout_string wait $RMQ_PID_FILE rc=$? if [ $rc -ne $OCF_SUCCESS ]; then remove_pid - ocf_log info "rabbitmq-server start failed: $rc" + ocf_log info "rabbitmq-server start failed with a timeout of ($timeout_string): $rc" return $OCF_ERR_GENERIC fi