Blame SOURCES/bz1745713-rabbitmq-cluster-2-fail-when-in-minority-partition.patch

a89620
From cc23c5523a0185fa557a5ab9056d50a60300d12a Mon Sep 17 00:00:00 2001
a89620
From: John Eckersberg <jeckersb@redhat.com>
a89620
Date: Tue, 16 Oct 2018 16:21:25 -0400
a89620
Subject: [PATCH] rabbitmq-cluster: fail monitor when node is in minority
a89620
 partition
a89620
a89620
It's possible for mnesia to still be running, but for mnesia to be
a89620
partitioned.  And it's also possible to get into this state without
a89620
pacemaker seeing the node go down so no corrective action is taken.
a89620
a89620
When monitoring, check the number of nodes that pacemaker thinks are
a89620
running, and compare to the number of nodes that mnesia thinks are
a89620
running.  If mnesia only sees a minority of the total nodes, fail it
a89620
so corrective action can be taken to rejoin the cluster.
a89620
a89620
This also adds a new function, rmq_app_running, which simply checks
a89620
whether the app is running or not and does not care about the
a89620
partition status.  This is now used instead of the full monitor in a
a89620
few places where we don't care about partition state.
a89620
a89620
Resolves: RHBZ#1639826
a89620
---
a89620
 heartbeat/rabbitmq-cluster | 28 +++++++++++++++++++++++++---
a89620
 1 file changed, 25 insertions(+), 3 deletions(-)
a89620
a89620
diff --git a/heartbeat/rabbitmq-cluster b/heartbeat/rabbitmq-cluster
a89620
index 204917475..78b2bbadf 100755
a89620
--- a/heartbeat/rabbitmq-cluster
a89620
+++ b/heartbeat/rabbitmq-cluster
a89620
@@ -178,10 +178,31 @@ remove_pid () {
a89620
 	rm -f ${RMQ_PID_FILE} > /dev/null 2>&1
a89620
 }
a89620
 
a89620
+rmq_app_running() {
a89620
+	if $RMQ_CTL eval 'application:which_applications().' | grep -q '{rabbit,'; then
a89620
+		ocf_log debug "RabbitMQ application is running"
a89620
+		return $OCF_SUCCESS
a89620
+	else
a89620
+		ocf_log debug "RabbitMQ application is stopped"
a89620
+		return $OCF_NOT_RUNNING
a89620
+	fi
a89620
+}
a89620
+
a89620
 rmq_monitor() {
a89620
 	local rc
a89620
 
a89620
 	if $RMQ_CTL eval 'rabbit_mnesia:cluster_status_from_mnesia().' | grep -q '^{ok'; then
a89620
+		pcs_running=$(rmq_join_list | wc -w)
a89620
+		ocf_log debug "Pacemaker thinks ${pcs_running} RabbitMQ nodes are running"
a89620
+		rmq_running=$($RMQ_CTL eval 'length(mnesia:system_info(running_db_nodes)).')
a89620
+		ocf_log debug "RabbitMQ thinks ${rmq_running} RabbitMQ nodes are running"
a89620
+
a89620
+		if [ $(( $rmq_running * 2 )) -lt $pcs_running ]; then
a89620
+			ocf_log info "RabbitMQ is a minority partition, failing monitor"
a89620
+			rmq_delete_nodename
a89620
+			return $OCF_ERR_GENERIC
a89620
+		fi
a89620
+
a89620
 		ocf_log debug "RabbitMQ server is running normally"
a89620
 		rmq_write_nodename
a89620
 
a89620
@@ -215,7 +236,7 @@ rmq_init_and_wait()
a89620
 		return $OCF_ERR_GENERIC
a89620
 	fi
a89620
 
a89620
-	rmq_monitor
a89620
+	rmq_app_running
a89620
 	return $?
a89620
 }
a89620
 
a89620
@@ -236,6 +257,7 @@ rmq_start_first()
a89620
 	if [ $rc -eq 0 ]; then
a89620
 		rc=$OCF_SUCCESS
a89620
 		ocf_log info "cluster bootstrapped"
a89620
+		rmq_write_nodename
a89620
 
a89620
 		if [ -n "$OCF_RESKEY_set_policy" ]; then
a89620
 			# do not quote set_policy, we are passing in arguments
a89620
@@ -492,7 +514,7 @@ rmq_stop() {
a89620
 		end.
a89620
 	"
a89620
 
a89620
-	rmq_monitor
a89620
+	rmq_app_running
a89620
 	if [ $? -eq $OCF_NOT_RUNNING ]; then
a89620
 		return $OCF_SUCCESS
a89620
 	fi
a89620
@@ -508,7 +530,7 @@ rmq_stop() {
a89620
 	#TODO add kill logic
a89620
 	stop_wait=1
a89620
 	while [ $stop_wait = 1 ]; do
a89620
-		rmq_monitor
a89620
+		rmq_app_running
a89620
 		rc=$?
a89620
 		if [ "$rc" -eq $OCF_NOT_RUNNING ]; then
a89620
 			stop_wait=0