Blame SOURCES/bz1656733-rabbitmq-cluster-retry-start-cluster-join-fails.patch

082cb3
From 63c9449bfa9a7fecbc0f00394699a475a384671d Mon Sep 17 00:00:00 2001
082cb3
From: Damien Ciabrini <dciabrin@redhat.com>
082cb3
Date: Thu, 9 Aug 2018 16:33:26 +0200
082cb3
Subject: [PATCH] rabbitmq-cluster: retry start when cluster join fails
082cb3
082cb3
When a node tries to join an existing cluster, it fetches a node
082cb3
list and tries to join the cluster through any of those running nodes.
082cb3
082cb3
If the nodes from this list become unavailable while we're joining
082cb3
the cluster, the rabbitmq server will fail to get clustered and
082cb3
make the start operation fail.
082cb3
082cb3
Give the resource a chance to start anyway by retrying the entire
082cb3
start actions until it succeeds or until the start timeout is
082cb3
reached and pacemaker stops the start operation.
082cb3
082cb3
Co-Authored-by: <michele@acksyn.org>
082cb3
Suggested-by: <abeekhof@redhat.com>
082cb3
---
082cb3
 heartbeat/rabbitmq-cluster | 29 ++++++++++++++++++++++++++---
082cb3
 1 file changed, 26 insertions(+), 3 deletions(-)
082cb3
082cb3
diff --git a/heartbeat/rabbitmq-cluster b/heartbeat/rabbitmq-cluster
082cb3
index 9ff49e075..84f383460 100755
082cb3
--- a/heartbeat/rabbitmq-cluster
082cb3
+++ b/heartbeat/rabbitmq-cluster
082cb3
@@ -31,6 +31,12 @@
082cb3
 
082cb3
 #######################################################################
082cb3
 
082cb3
+# This arbitrary value here is used by the rmq_start action to
082cb3
+# signify that the resource agent must retry the start process.
082cb3
+# It might potentially conflict with an OCF-assigned error code
082cb3
+# in the future.
082cb3
+RMQ_TRY_RESTART_ERROR_CODE=126
082cb3
+
082cb3
 RMQ_SERVER=/usr/sbin/rabbitmq-server
082cb3
 RMQ_CTL=/usr/sbin/rabbitmqctl
082cb3
 RMQ_DATA_DIR="/var/lib/rabbitmq/mnesia"
082cb3
@@ -354,7 +360,7 @@ rmq_notify() {
082cb3
 	return $OCF_SUCCESS
082cb3
 }
082cb3
 
082cb3
-rmq_start() {
082cb3
+rmq_try_start() {
082cb3
 	local join_list=""
082cb3
 	local rc
082cb3
 
082cb3
@@ -384,8 +390,16 @@ rmq_start() {
082cb3
 	rc=$?
082cb3
 
082cb3
 	if [ $rc -ne 0 ]; then
082cb3
-		ocf_log info "node failed to join even after reseting local data. Check SELINUX policy"
082cb3
-		return $OCF_ERR_GENERIC
082cb3
+		# we could not join the rabbitmq cluster from any of the running nodes
082cb3
+		# this might be due to an unexpected reset of those nodes. Give ourselves
082cb3
+		# a chance to start by retrying the entire start sequence.
082cb3
+
082cb3
+		ocf_log warn "Failed to join the RabbitMQ cluster from nodes ${join_list}. Stopping local unclustered rabbitmq"
082cb3
+		rmq_stop
082cb3
+
082cb3
+		ocf_log warn "Re-detect available rabbitmq nodes and try to start again"
082cb3
+		# return an unused OCF value to signify a "retry" condition
082cb3
+		return $RMQ_TRY_RESTART_ERROR_CODE
082cb3
 	fi
082cb3
 
082cb3
 	# Restore users, user permissions, and policies (if any)
082cb3
@@ -443,6 +457,15 @@ rmq_start() {
082cb3
 	return $OCF_SUCCESS
082cb3
 }
082cb3
 
082cb3
+rmq_start() {
082cb3
+	local rc=$RMQ_TRY_RESTART_ERROR_CODE
082cb3
+	while [ $rc -eq $RMQ_TRY_RESTART_ERROR_CODE ]; do
082cb3
+		rmq_try_start
082cb3
+		rc=$?
082cb3
+	done
082cb3
+	return $rc
082cb3
+}
082cb3
+
082cb3
 rmq_stop() {
082cb3
 	# Backup users, user permissions, and policies
082cb3
 	BaseDataDir=`dirname $RMQ_DATA_DIR`