Blame SOURCES/bz1641946-1-rabbitmq-cluster-fail-in-minority-partition.patch

00d327
From cc23c5523a0185fa557a5ab9056d50a60300d12a Mon Sep 17 00:00:00 2001
00d327
From: John Eckersberg <jeckersb@redhat.com>
00d327
Date: Tue, 16 Oct 2018 16:21:25 -0400
00d327
Subject: [PATCH] rabbitmq-cluster: fail monitor when node is in minority
00d327
 partition
00d327
00d327
It's possible for mnesia to still be running while it is
00d327
partitioned.  It's also possible to get into this state without
00d327
pacemaker seeing the node go down, so no corrective action is taken.
00d327
00d327
When monitoring, check the number of nodes that pacemaker thinks are
00d327
running, and compare to the number of nodes that mnesia thinks are
00d327
running.  If mnesia only sees a minority of the total nodes, fail it
00d327
so corrective action can be taken to rejoin the cluster.
00d327
00d327
This also adds a new function, rmq_app_running, which simply checks
00d327
whether the app is running or not and does not care about the
00d327
partition status.  This is now used instead of the full monitor in a
00d327
few places where we don't care about partition state.
00d327
00d327
Resolves: RHBZ#1639826
00d327
---
00d327
 heartbeat/rabbitmq-cluster | 28 +++++++++++++++++++++++++---
00d327
 1 file changed, 25 insertions(+), 3 deletions(-)
00d327
00d327
diff --git a/heartbeat/rabbitmq-cluster b/heartbeat/rabbitmq-cluster
00d327
index 204917475..78b2bbadf 100755
00d327
--- a/heartbeat/rabbitmq-cluster
00d327
+++ b/heartbeat/rabbitmq-cluster
00d327
@@ -178,10 +178,31 @@ remove_pid () {
00d327
 	rm -f ${RMQ_PID_FILE} > /dev/null 2>&1
00d327
 }
00d327
 
00d327
+rmq_app_running() {
00d327
+	if $RMQ_CTL eval 'application:which_applications().' | grep -q '{rabbit,'; then
00d327
+		ocf_log debug "RabbitMQ application is running"
00d327
+		return $OCF_SUCCESS
00d327
+	else
00d327
+		ocf_log debug "RabbitMQ application is stopped"
00d327
+		return $OCF_NOT_RUNNING
00d327
+	fi
00d327
+}
00d327
+
00d327
 rmq_monitor() {
00d327
 	local rc
00d327
 
00d327
 	if $RMQ_CTL eval 'rabbit_mnesia:cluster_status_from_mnesia().' | grep -q '^{ok'; then
00d327
+		pcs_running=$(rmq_join_list | wc -w)
00d327
+		ocf_log debug "Pacemaker thinks ${pcs_running} RabbitMQ nodes are running"
00d327
+		rmq_running=$($RMQ_CTL eval 'length(mnesia:system_info(running_db_nodes)).')
00d327
+		ocf_log debug "RabbitMQ thinks ${rmq_running} RabbitMQ nodes are running"
00d327
+
00d327
+		if [ $(( $rmq_running * 2 )) -lt $pcs_running ]; then
00d327
+			ocf_log info "RabbitMQ is a minority partition, failing monitor"
00d327
+			rmq_delete_nodename
00d327
+			return $OCF_ERR_GENERIC
00d327
+		fi
00d327
+
00d327
 		ocf_log debug "RabbitMQ server is running normally"
00d327
 		rmq_write_nodename
00d327
 
00d327
@@ -215,7 +236,7 @@ rmq_init_and_wait()
00d327
 		return $OCF_ERR_GENERIC
00d327
 	fi
00d327
 
00d327
-	rmq_monitor
00d327
+	rmq_app_running
00d327
 	return $?
00d327
 }
00d327
 
00d327
@@ -236,6 +257,7 @@ rmq_start_first()
00d327
 	if [ $rc -eq 0 ]; then
00d327
 		rc=$OCF_SUCCESS
00d327
 		ocf_log info "cluster bootstrapped"
00d327
+		rmq_write_nodename
00d327
 
00d327
 		if [ -n "$OCF_RESKEY_set_policy" ]; then
00d327
 			# do not quote set_policy, we are passing in arguments
00d327
@@ -492,7 +514,7 @@ rmq_stop() {
00d327
 		end.
00d327
 	"
00d327
 
00d327
-	rmq_monitor
00d327
+	rmq_app_running
00d327
 	if [ $? -eq $OCF_NOT_RUNNING ]; then
00d327
 		return $OCF_SUCCESS
00d327
 	fi
00d327
@@ -508,7 +530,7 @@ rmq_stop() {
00d327
 	#TODO add kill logic
00d327
 	stop_wait=1
00d327
 	while [ $stop_wait = 1 ]; do
00d327
-		rmq_monitor
00d327
+		rmq_app_running
00d327
 		rc=$?
00d327
 		if [ "$rc" -eq $OCF_NOT_RUNNING ]; then
00d327
 			stop_wait=0