|
|
a89620 |
From 63c9449bfa9a7fecbc0f00394699a475a384671d Mon Sep 17 00:00:00 2001
|
|
|
a89620 |
From: Damien Ciabrini <dciabrin@redhat.com>
|
|
|
a89620 |
Date: Thu, 9 Aug 2018 16:33:26 +0200
|
|
|
a89620 |
Subject: [PATCH] rabbitmq-cluster: retry start when cluster join fails
|
|
|
a89620 |
|
|
|
a89620 |
When a node tries to join an existing cluster, it fetches a node
|
|
|
a89620 |
list to try to connect from any of those running nodes.
|
|
|
a89620 |
|
|
|
a89620 |
If the nodes from this list become unavailable while we're joining
|
|
|
a89620 |
the cluster, the rabbitmq server will fail to get clustered and
|
|
|
a89620 |
make the start operation fail.
|
|
|
a89620 |
|
|
|
a89620 |
Give the resource a chance to start anyway by retrying the entire
|
|
|
a89620 |
start sequence until it succeeds or until the start timeout is
|
|
|
a89620 |
reached and pacemaker stops the start operation.
|
|
|
a89620 |
|
|
|
a89620 |
Co-Authored-by: <michele@acksyn.org>
|
|
|
a89620 |
Suggested-by: <abeekhof@redhat.com>
|
|
|
a89620 |
---
|
|
|
a89620 |
heartbeat/rabbitmq-cluster | 29 ++++++++++++++++++++++++++---
|
|
|
a89620 |
1 file changed, 26 insertions(+), 3 deletions(-)
|
|
|
a89620 |
|
|
|
a89620 |
diff --git a/heartbeat/rabbitmq-cluster b/heartbeat/rabbitmq-cluster
|
|
|
a89620 |
index 9ff49e075..84f383460 100755
|
|
|
a89620 |
--- a/heartbeat/rabbitmq-cluster
|
|
|
a89620 |
+++ b/heartbeat/rabbitmq-cluster
|
|
|
a89620 |
@@ -31,6 +31,12 @@
|
|
|
a89620 |
|
|
|
a89620 |
#######################################################################
|
|
|
a89620 |
|
|
|
a89620 |
+# This arbitrary value here is used by the rmq_start action to
|
|
|
a89620 |
+# signify that the resource agent must retry the start process.
|
|
|
a89620 |
+# It might conflict with an OCF-assigned error code
|
|
|
a89620 |
+# in the future.
|
|
|
a89620 |
+RMQ_TRY_RESTART_ERROR_CODE=126
|
|
|
a89620 |
+
|
|
|
a89620 |
RMQ_SERVER=/usr/sbin/rabbitmq-server
|
|
|
a89620 |
RMQ_CTL=/usr/sbin/rabbitmqctl
|
|
|
a89620 |
RMQ_DATA_DIR="/var/lib/rabbitmq/mnesia"
|
|
|
a89620 |
@@ -354,7 +360,7 @@ rmq_notify() {
|
|
|
a89620 |
return $OCF_SUCCESS
|
|
|
a89620 |
}
|
|
|
a89620 |
|
|
|
a89620 |
-rmq_start() {
|
|
|
a89620 |
+rmq_try_start() {
|
|
|
a89620 |
local join_list=""
|
|
|
a89620 |
local rc
|
|
|
a89620 |
|
|
|
a89620 |
@@ -384,8 +390,16 @@ rmq_start() {
|
|
|
a89620 |
rc=$?
|
|
|
a89620 |
|
|
|
a89620 |
if [ $rc -ne 0 ]; then
|
|
|
a89620 |
- ocf_log info "node failed to join even after reseting local data. Check SELINUX policy"
|
|
|
a89620 |
- return $OCF_ERR_GENERIC
|
|
|
a89620 |
+ # we could not join the rabbitmq cluster from any of the running nodes
|
|
|
a89620 |
+ # this might be due to an unexpected reset of those nodes. Give ourselves
|
|
|
a89620 |
+ # a chance to start by retrying the entire start sequence.
|
|
|
a89620 |
+
|
|
|
a89620 |
+ ocf_log warn "Failed to join the RabbitMQ cluster from nodes ${join_list}. Stopping local unclustered rabbitmq"
|
|
|
a89620 |
+ rmq_stop
|
|
|
a89620 |
+
|
|
|
a89620 |
+ ocf_log warn "Re-detect available rabbitmq nodes and try to start again"
|
|
|
a89620 |
+ # return an unused OCF value to signify a "retry" condition
|
|
|
a89620 |
+ return $RMQ_TRY_RESTART_ERROR_CODE
|
|
|
a89620 |
fi
|
|
|
a89620 |
|
|
|
a89620 |
# Restore users, user permissions, and policies (if any)
|
|
|
a89620 |
@@ -443,6 +457,15 @@ rmq_start() {
|
|
|
a89620 |
return $OCF_SUCCESS
|
|
|
a89620 |
}
|
|
|
a89620 |
|
|
|
a89620 |
+rmq_start() {
|
|
|
a89620 |
+ local rc=$RMQ_TRY_RESTART_ERROR_CODE
|
|
|
a89620 |
+ while [ $rc -eq $RMQ_TRY_RESTART_ERROR_CODE ]; do
|
|
|
a89620 |
+ rmq_try_start
|
|
|
a89620 |
+ rc=$?
|
|
|
a89620 |
+ done
|
|
|
a89620 |
+ return $rc
|
|
|
a89620 |
+}
|
|
|
a89620 |
+
|
|
|
a89620 |
rmq_stop() {
|
|
|
a89620 |
# Backup users, user permissions, and policies
|
|
|
a89620 |
BaseDataDir=`dirname $RMQ_DATA_DIR`
|