Blame SOURCES/bz1745713-rabbitmq-cluster-4-retry-start-cluster-join-fails.patch

b4b3ce
From 63c9449bfa9a7fecbc0f00394699a475a384671d Mon Sep 17 00:00:00 2001
b4b3ce
From: Damien Ciabrini <dciabrin@redhat.com>
b4b3ce
Date: Thu, 9 Aug 2018 16:33:26 +0200
b4b3ce
Subject: [PATCH] rabbitmq-cluster: retry start when cluster join fails
b4b3ce
b4b3ce
When a node tries to join an existing cluster, it fetches a node
b4b3ce
list and tries to join through any of those running nodes.
b4b3ce
b4b3ce
If the nodes from this list become unavailable while we're joining
b4b3ce
the cluster, the rabbitmq server will fail to get clustered and
b4b3ce
make the start operation fail.
b4b3ce
b4b3ce
Give the resource a chance to start anyway by retrying the entire
b4b3ce
start sequence until it succeeds or until the start timeout is
b4b3ce
reached and pacemaker stops the start operation.
b4b3ce
b4b3ce
Co-Authored-by: <michele@acksyn.org>
b4b3ce
Suggested-by: <abeekhof@redhat.com>
b4b3ce
---
b4b3ce
 heartbeat/rabbitmq-cluster | 29 ++++++++++++++++++++++++++---
b4b3ce
 1 file changed, 26 insertions(+), 3 deletions(-)
b4b3ce
b4b3ce
diff --git a/heartbeat/rabbitmq-cluster b/heartbeat/rabbitmq-cluster
b4b3ce
index 9ff49e075..84f383460 100755
b4b3ce
--- a/heartbeat/rabbitmq-cluster
b4b3ce
+++ b/heartbeat/rabbitmq-cluster
b4b3ce
@@ -31,6 +31,12 @@
b4b3ce
 
b4b3ce
 #######################################################################
b4b3ce
 
b4b3ce
+# This arbitrary value here is used by the rmq_start action to
b4b3ce
+# signify that the resource agent must retry the start process
b4b3ce
+# It might potentially conflict with OCF assigned error code
b4b3ce
+# in the future.
b4b3ce
+RMQ_TRY_RESTART_ERROR_CODE=126
b4b3ce
+
b4b3ce
 RMQ_SERVER=/usr/sbin/rabbitmq-server
b4b3ce
 RMQ_CTL=/usr/sbin/rabbitmqctl
b4b3ce
 RMQ_DATA_DIR="/var/lib/rabbitmq/mnesia"
b4b3ce
@@ -354,7 +360,7 @@ rmq_notify() {
b4b3ce
 	return $OCF_SUCCESS
b4b3ce
 }
b4b3ce
 
b4b3ce
-rmq_start() {
b4b3ce
+rmq_try_start() {
b4b3ce
 	local join_list=""
b4b3ce
 	local rc
b4b3ce
 
b4b3ce
@@ -384,8 +390,16 @@ rmq_start() {
b4b3ce
 	rc=$?
b4b3ce
 
b4b3ce
 	if [ $rc -ne 0 ]; then
b4b3ce
-		ocf_log info "node failed to join even after reseting local data. Check SELINUX policy"
b4b3ce
-		return $OCF_ERR_GENERIC
b4b3ce
+		# we could not join the rabbitmq cluster from any of the running nodes
b4b3ce
+		# this might be due to a unexpected reset of those nodes. Give ourself
b4b3ce
+		# a chance to start by retrying the entire start sequence.
b4b3ce
+
b4b3ce
+		ocf_log warn "Failed to join the RabbitMQ cluster from nodes ${join_list}. Stopping local unclustered rabbitmq"
b4b3ce
+		rmq_stop
b4b3ce
+
b4b3ce
+		ocf_log warn "Re-detect available rabbitmq nodes and try to start again"
b4b3ce
+		# return an unused OCF value to signify a "retry" condition
b4b3ce
+		return $RMQ_TRY_RESTART_ERROR_CODE
b4b3ce
 	fi
b4b3ce
 
b4b3ce
 	# Restore users, user permissions, and policies (if any)
b4b3ce
@@ -443,6 +457,15 @@ rmq_start() {
b4b3ce
 	return $OCF_SUCCESS
b4b3ce
 }
b4b3ce
 
b4b3ce
+rmq_start() {
b4b3ce
+	local rc=$RMQ_TRY_RESTART_ERROR_CODE
b4b3ce
+	while [ $rc -eq $RMQ_TRY_RESTART_ERROR_CODE ]; do
b4b3ce
+		rmq_try_start
b4b3ce
+		rc=$?
b4b3ce
+	done
b4b3ce
+	return $rc
b4b3ce
+}
b4b3ce
+
b4b3ce
 rmq_stop() {
b4b3ce
 	# Backup users, user permissions, and policies
b4b3ce
 	BaseDataDir=`dirname $RMQ_DATA_DIR`