Blame SOURCES/bz1745713-rabbitmq-cluster-4-retry-start-cluster-join-fails.patch

734564
From 63c9449bfa9a7fecbc0f00394699a475a384671d Mon Sep 17 00:00:00 2001
734564
From: Damien Ciabrini <dciabrin@redhat.com>
734564
Date: Thu, 9 Aug 2018 16:33:26 +0200
734564
Subject: [PATCH] rabbitmq-cluster: retry start when cluster join fails
734564
734564
When a node tries to join an existing cluster, it fetches a list of
734564
running nodes and tries to join the cluster through any of them.
734564
734564
If the nodes from this list become unavailable while we're joining
734564
the cluster, the rabbitmq server will fail to get clustered and
734564
make the start operation fail.
734564
734564
Give the resource a chance to start anyway by retrying the entire
734564
start actions until it succeeds or until the start timeout is
734564
reached and pacemaker stops the start operation.
734564
734564
Co-Authored-by: <michele@acksyn.org>
734564
Suggested-by: <abeekhof@redhat.com>
734564
---
734564
 heartbeat/rabbitmq-cluster | 29 ++++++++++++++++++++++++++---
734564
 1 file changed, 26 insertions(+), 3 deletions(-)
734564
734564
diff --git a/heartbeat/rabbitmq-cluster b/heartbeat/rabbitmq-cluster
734564
index 9ff49e075..84f383460 100755
734564
--- a/heartbeat/rabbitmq-cluster
734564
+++ b/heartbeat/rabbitmq-cluster
734564
@@ -31,6 +31,12 @@
734564
 
734564
 #######################################################################
734564
 
734564
+# This arbitrary value here is used by the rmq_start action to
734564
+# signify that the resource agent must retry the start process.
734564
+# It might potentially conflict with an OCF-assigned error code
734564
+# in the future.
734564
+RMQ_TRY_RESTART_ERROR_CODE=126
734564
+
734564
 RMQ_SERVER=/usr/sbin/rabbitmq-server
734564
 RMQ_CTL=/usr/sbin/rabbitmqctl
734564
 RMQ_DATA_DIR="/var/lib/rabbitmq/mnesia"
734564
@@ -354,7 +360,7 @@ rmq_notify() {
734564
 	return $OCF_SUCCESS
734564
 }
734564
 
734564
-rmq_start() {
734564
+rmq_try_start() {
734564
 	local join_list=""
734564
 	local rc
734564
 
734564
@@ -384,8 +390,16 @@ rmq_start() {
734564
 	rc=$?
734564
 
734564
 	if [ $rc -ne 0 ]; then
734564
-		ocf_log info "node failed to join even after reseting local data. Check SELINUX policy"
734564
-		return $OCF_ERR_GENERIC
734564
+		# we could not join the rabbitmq cluster from any of the running nodes
734564
+		# this might be due to an unexpected reset of those nodes. Give ourselves
734564
+		# a chance to start by retrying the entire start sequence.
734564
+
734564
+		ocf_log warn "Failed to join the RabbitMQ cluster from nodes ${join_list}. Stopping local unclustered rabbitmq"
734564
+		rmq_stop
734564
+
734564
+		ocf_log warn "Re-detect available rabbitmq nodes and try to start again"
734564
+		# return an unused OCF value to signify a "retry" condition
734564
+		return $RMQ_TRY_RESTART_ERROR_CODE
734564
 	fi
734564
 
734564
 	# Restore users, user permissions, and policies (if any)
734564
@@ -443,6 +457,15 @@ rmq_start() {
734564
 	return $OCF_SUCCESS
734564
 }
734564
 
734564
+rmq_start() { # start entry point: re-runs rmq_try_start for as long as it asks for a retry
734564
+	local rc=$RMQ_TRY_RESTART_ERROR_CODE # seeded with the retry code so the loop body runs at least once
734564
+	while [ $rc -eq $RMQ_TRY_RESTART_ERROR_CODE ]; do # no retry cap here; per the commit message, pacemaker's start timeout bounds it
734564
+		rmq_try_start
734564
+		rc=$?
734564
+	done
734564
+	return $rc
734564
+}
734564
+
734564
 rmq_stop() {
734564
 	# Backup users, user permissions, and policies
734564
 	BaseDataDir=`dirname $RMQ_DATA_DIR`