Blame SOURCES/bz1745713-rabbitmq-cluster-4-retry-start-cluster-join-fails.patch

a89620
From 63c9449bfa9a7fecbc0f00394699a475a384671d Mon Sep 17 00:00:00 2001
a89620
From: Damien Ciabrini <dciabrin@redhat.com>
a89620
Date: Thu, 9 Aug 2018 16:33:26 +0200
a89620
Subject: [PATCH] rabbitmq-cluster: retry start when cluster join fails
a89620
a89620
When a node tries to join an existing cluster, it fetches a list of
a89620
running nodes and tries to join the cluster through any of them.
a89620
a89620
If the nodes from this list become unavailable while we're joining
a89620
the cluster, the rabbitmq server will fail to get clustered and
a89620
make the start operation fail.
a89620
a89620
Give the resource a chance to start anyway by retrying the entire
a89620
start actions until it succeeds or until the start timeout is
a89620
reached and pacemaker stops the start operation.
a89620
a89620
Co-Authored-by: <michele@acksyn.org>
a89620
Suggested-by: <abeekhof@redhat.com>
a89620
---
a89620
 heartbeat/rabbitmq-cluster | 29 ++++++++++++++++++++++++++---
a89620
 1 file changed, 26 insertions(+), 3 deletions(-)
a89620
a89620
diff --git a/heartbeat/rabbitmq-cluster b/heartbeat/rabbitmq-cluster
a89620
index 9ff49e075..84f383460 100755
a89620
--- a/heartbeat/rabbitmq-cluster
a89620
+++ b/heartbeat/rabbitmq-cluster
a89620
@@ -31,6 +31,12 @@
a89620
 
a89620
 #######################################################################
a89620
 
a89620
+# This arbitrary value here is used by the rmq_start action to
a89620
+# signify that the resource agent must retry the start process.
a89620
+# It might potentially conflict with OCF assigned error code
a89620
+# in the future.
a89620
+RMQ_TRY_RESTART_ERROR_CODE=126
a89620
+
a89620
 RMQ_SERVER=/usr/sbin/rabbitmq-server
a89620
 RMQ_CTL=/usr/sbin/rabbitmqctl
a89620
 RMQ_DATA_DIR="/var/lib/rabbitmq/mnesia"
a89620
@@ -354,7 +360,7 @@ rmq_notify() {
a89620
 	return $OCF_SUCCESS
a89620
 }
a89620
 
a89620
-rmq_start() {
a89620
+rmq_try_start() {
a89620
 	local join_list=""
a89620
 	local rc
a89620
 
a89620
@@ -384,8 +390,16 @@ rmq_start() {
a89620
 	rc=$?
a89620
 
a89620
 	if [ $rc -ne 0 ]; then
a89620
-		ocf_log info "node failed to join even after reseting local data. Check SELINUX policy"
a89620
-		return $OCF_ERR_GENERIC
a89620
+		# we could not join the rabbitmq cluster from any of the running nodes
a89620
+		# this might be due to an unexpected reset of those nodes. Give ourselves
a89620
+		# a chance to start by retrying the entire start sequence.
a89620
+
a89620
+		ocf_log warn "Failed to join the RabbitMQ cluster from nodes ${join_list}. Stopping local unclustered rabbitmq"
a89620
+		rmq_stop
a89620
+
a89620
+		ocf_log warn "Re-detect available rabbitmq nodes and try to start again"
a89620
+		# return an unused OCF value to signify a "retry" condition
a89620
+		return $RMQ_TRY_RESTART_ERROR_CODE
a89620
 	fi
a89620
 
a89620
 	# Restore users, user permissions, and policies (if any)
a89620
@@ -443,6 +457,15 @@ rmq_start() {
a89620
 	return $OCF_SUCCESS
a89620
 }
a89620
 
a89620
+rmq_start() {
a89620
+	local rc=$RMQ_TRY_RESTART_ERROR_CODE
a89620
+	while [ $rc -eq $RMQ_TRY_RESTART_ERROR_CODE ]; do
a89620
+		rmq_try_start
a89620
+		rc=$?
a89620
+	done
a89620
+	return $rc
a89620
+}
a89620
+
a89620
 rmq_stop() {
a89620
 	# Backup users, user permissions, and policies
a89620
 	BaseDataDir=`dirname $RMQ_DATA_DIR`