Blame SOURCES/bz1745713-rabbitmq-cluster-2-fail-when-in-minority-partition.patch

b4b3ce
From cc23c5523a0185fa557a5ab9056d50a60300d12a Mon Sep 17 00:00:00 2001
b4b3ce
From: John Eckersberg <jeckersb@redhat.com>
b4b3ce
Date: Tue, 16 Oct 2018 16:21:25 -0400
b4b3ce
Subject: [PATCH] rabbitmq-cluster: fail monitor when node is in minority
b4b3ce
 partition
b4b3ce
b4b3ce
It's possible for mnesia to still be running, but for mnesia to be
b4b3ce
partitioned.  And it's also possible to get into this state without
b4b3ce
pacemaker seeing the node go down so no corrective action is taken.
b4b3ce
b4b3ce
When monitoring, check the number of nodes that pacemaker thinks are
b4b3ce
running, and compare to the number of nodes that mnesia thinks are
b4b3ce
running.  If mnesia only sees a minority of the total nodes, fail the monitor
b4b3ce
so corrective action can be taken to rejoin the cluster.
b4b3ce
b4b3ce
This also adds a new function, rmq_app_running, which simply checks
b4b3ce
whether the app is running or not and does not care about the
b4b3ce
partition status.  This is now used instead of the full monitor in a
b4b3ce
few places where we don't care about partition state.
b4b3ce
b4b3ce
Resolves: RHBZ#1639826
b4b3ce
---
b4b3ce
 heartbeat/rabbitmq-cluster | 28 +++++++++++++++++++++++++---
b4b3ce
 1 file changed, 25 insertions(+), 3 deletions(-)
b4b3ce
b4b3ce
diff --git a/heartbeat/rabbitmq-cluster b/heartbeat/rabbitmq-cluster
b4b3ce
index 204917475..78b2bbadf 100755
b4b3ce
--- a/heartbeat/rabbitmq-cluster
b4b3ce
+++ b/heartbeat/rabbitmq-cluster
b4b3ce
@@ -178,10 +178,31 @@ remove_pid () {
b4b3ce
 	rm -f ${RMQ_PID_FILE} > /dev/null 2>&1
b4b3ce
 }
b4b3ce
 
b4b3ce
+rmq_app_running() {
b4b3ce
+	if $RMQ_CTL eval 'application:which_applications().' | grep -q '{rabbit,'; then
b4b3ce
+		ocf_log debug "RabbitMQ application is running"
b4b3ce
+		return $OCF_SUCCESS
b4b3ce
+	else
b4b3ce
+		ocf_log debug "RabbitMQ application is stopped"
b4b3ce
+		return $OCF_NOT_RUNNING
b4b3ce
+	fi
b4b3ce
+}
b4b3ce
+
b4b3ce
 rmq_monitor() {
b4b3ce
 	local rc
b4b3ce
 
b4b3ce
 	if $RMQ_CTL eval 'rabbit_mnesia:cluster_status_from_mnesia().' | grep -q '^{ok'; then
b4b3ce
+		pcs_running=$(rmq_join_list | wc -w)
b4b3ce
+		ocf_log debug "Pacemaker thinks ${pcs_running} RabbitMQ nodes are running"
b4b3ce
+		rmq_running=$($RMQ_CTL eval 'length(mnesia:system_info(running_db_nodes)).')
b4b3ce
+		ocf_log debug "RabbitMQ thinks ${rmq_running} RabbitMQ nodes are running"
b4b3ce
+
b4b3ce
+		if [ $(( $rmq_running * 2 )) -lt $pcs_running ]; then
b4b3ce
+			ocf_log info "RabbitMQ is a minority partition, failing monitor"
b4b3ce
+			rmq_delete_nodename
b4b3ce
+			return $OCF_ERR_GENERIC
b4b3ce
+		fi
b4b3ce
+
b4b3ce
 		ocf_log debug "RabbitMQ server is running normally"
b4b3ce
 		rmq_write_nodename
b4b3ce
 
b4b3ce
@@ -215,7 +236,7 @@ rmq_init_and_wait()
b4b3ce
 		return $OCF_ERR_GENERIC
b4b3ce
 	fi
b4b3ce
 
b4b3ce
-	rmq_monitor
b4b3ce
+	rmq_app_running
b4b3ce
 	return $?
b4b3ce
 }
b4b3ce
 
b4b3ce
@@ -236,6 +257,7 @@ rmq_start_first()
b4b3ce
 	if [ $rc -eq 0 ]; then
b4b3ce
 		rc=$OCF_SUCCESS
b4b3ce
 		ocf_log info "cluster bootstrapped"
b4b3ce
+		rmq_write_nodename
b4b3ce
 
b4b3ce
 		if [ -n "$OCF_RESKEY_set_policy" ]; then
b4b3ce
 			# do not quote set_policy, we are passing in arguments
b4b3ce
@@ -492,7 +514,7 @@ rmq_stop() {
b4b3ce
 		end.
b4b3ce
 	"
b4b3ce
 
b4b3ce
-	rmq_monitor
b4b3ce
+	rmq_app_running
b4b3ce
 	if [ $? -eq $OCF_NOT_RUNNING ]; then
b4b3ce
 		return $OCF_SUCCESS
b4b3ce
 	fi
b4b3ce
@@ -508,7 +530,7 @@ rmq_stop() {
b4b3ce
 	#TODO add kill logic
b4b3ce
 	stop_wait=1
b4b3ce
 	while [ $stop_wait = 1 ]; do
b4b3ce
-		rmq_monitor
b4b3ce
+		rmq_app_running
b4b3ce
 		rc=$?
b4b3ce
 		if [ "$rc" -eq $OCF_NOT_RUNNING ]; then
b4b3ce
 			stop_wait=0