Blob Blame History Raw
From cc23c5523a0185fa557a5ab9056d50a60300d12a Mon Sep 17 00:00:00 2001
From: John Eckersberg <jeckersb@redhat.com>
Date: Tue, 16 Oct 2018 16:21:25 -0400
Subject: [PATCH] rabbitmq-cluster: fail monitor when node is in minority
 partition

It's possible for mnesia to still be running while being partitioned.
It's also possible to get into this state without pacemaker seeing the
node go down, so no corrective action is taken.

When monitoring, check the number of nodes that pacemaker thinks are
running, and compare it to the number of nodes that mnesia thinks are
running.  If mnesia sees only a minority of the total nodes, fail the
monitor so corrective action can be taken to rejoin the cluster.

This also adds a new function, rmq_app_running, which simply checks
whether the app is running or not and does not care about the
partition status.  This is now used instead of the full monitor in a
few places where we don't care about partition state.

Resolves: RHBZ#1639826
---
 heartbeat/rabbitmq-cluster | 28 +++++++++++++++++++++++++---
 1 file changed, 25 insertions(+), 3 deletions(-)

diff --git a/heartbeat/rabbitmq-cluster b/heartbeat/rabbitmq-cluster
index 204917475..78b2bbadf 100755
--- a/heartbeat/rabbitmq-cluster
+++ b/heartbeat/rabbitmq-cluster
@@ -178,10 +178,31 @@ remove_pid () {
 	rm -f ${RMQ_PID_FILE} > /dev/null 2>&1
 }
 
+rmq_app_running() {
+	# Status only: the "rabbit" app must be listed; partitions are ignored.
+	if ! $RMQ_CTL eval 'application:which_applications().' | grep -q '{rabbit,'; then
+		ocf_log debug "RabbitMQ application is stopped"
+		return $OCF_NOT_RUNNING
+	fi
+	ocf_log debug "RabbitMQ application is running"
+	return $OCF_SUCCESS
+}
+
 rmq_monitor() {
 	local rc
 
 	if $RMQ_CTL eval 'rabbit_mnesia:cluster_status_from_mnesia().' | grep -q '^{ok'; then
+		pcs_running=$(rmq_join_list | wc -w)
+		ocf_log debug "Pacemaker thinks ${pcs_running} RabbitMQ nodes are running"
+		rmq_running=$($RMQ_CTL eval 'length(mnesia:system_info(running_db_nodes)).')
+		ocf_log debug "RabbitMQ thinks ${rmq_running} RabbitMQ nodes are running"
+
+		if [ $(( $rmq_running * 2 )) -lt $pcs_running ]; then
+			ocf_log info "RabbitMQ is a minority partition, failing monitor"
+			rmq_delete_nodename
+			return $OCF_ERR_GENERIC
+		fi
+
 		ocf_log debug "RabbitMQ server is running normally"
 		rmq_write_nodename
 
@@ -215,7 +236,7 @@ rmq_init_and_wait()
 		return $OCF_ERR_GENERIC
 	fi
 
-	rmq_monitor
+	rmq_app_running
 	return $?
 }
 
@@ -236,6 +257,7 @@ rmq_start_first()
 	if [ $rc -eq 0 ]; then
 		rc=$OCF_SUCCESS
 		ocf_log info "cluster bootstrapped"
+		rmq_write_nodename
 
 		if [ -n "$OCF_RESKEY_set_policy" ]; then
 			# do not quote set_policy, we are passing in arguments
@@ -492,7 +514,7 @@ rmq_stop() {
 		end.
 	"
 
-	rmq_monitor
+	rmq_app_running
 	if [ $? -eq $OCF_NOT_RUNNING ]; then
 		return $OCF_SUCCESS
 	fi
@@ -508,7 +530,7 @@ rmq_stop() {
 	#TODO add kill logic
 	stop_wait=1
 	while [ $stop_wait = 1 ]; do
-		rmq_monitor
+		rmq_app_running
 		rc=$?
 		if [ "$rc" -eq $OCF_NOT_RUNNING ]; then
 			stop_wait=0