|
|
a89620 |
From cc23c5523a0185fa557a5ab9056d50a60300d12a Mon Sep 17 00:00:00 2001
|
|
|
a89620 |
From: John Eckersberg <jeckersb@redhat.com>
|
|
|
a89620 |
Date: Tue, 16 Oct 2018 16:21:25 -0400
|
|
|
a89620 |
Subject: [PATCH] rabbitmq-cluster: fail monitor when node is in minority
|
|
|
a89620 |
partition
|
|
|
a89620 |
|
|
|
a89620 |
It's possible for mnesia to still be running while also being
|
|
|
a89620 |
partitioned. And it's also possible to get into this state without
|
|
|
a89620 |
pacemaker seeing the node go down so no corrective action is taken.
|
|
|
a89620 |
|
|
|
a89620 |
When monitoring, check the number of nodes that pacemaker thinks are
|
|
|
a89620 |
running, and compare to the number of nodes that mnesia thinks are
|
|
|
a89620 |
running. If mnesia only sees a minority of the total nodes, fail it
|
|
|
a89620 |
so corrective action can be taken to rejoin the cluster.
|
|
|
a89620 |
|
|
|
a89620 |
This also adds a new function, rmq_app_running, which simply checks
|
|
|
a89620 |
whether the app is running or not and does not care about the
|
|
|
a89620 |
partition status. This is now used instead of the full monitor in a
|
|
|
a89620 |
few places where we don't care about partition state.
|
|
|
a89620 |
|
|
|
a89620 |
Resolves: RHBZ#1639826
|
|
|
a89620 |
---
|
|
|
a89620 |
heartbeat/rabbitmq-cluster | 28 +++++++++++++++++++++++++---
|
|
|
a89620 |
1 file changed, 25 insertions(+), 3 deletions(-)
|
|
|
a89620 |
|
|
|
a89620 |
diff --git a/heartbeat/rabbitmq-cluster b/heartbeat/rabbitmq-cluster
|
|
|
a89620 |
index 204917475..78b2bbadf 100755
|
|
|
a89620 |
--- a/heartbeat/rabbitmq-cluster
|
|
|
a89620 |
+++ b/heartbeat/rabbitmq-cluster
|
|
|
a89620 |
@@ -178,10 +178,31 @@ remove_pid () {
|
|
|
a89620 |
rm -f ${RMQ_PID_FILE} > /dev/null 2>&1
|
|
|
a89620 |
}
|
|
|
a89620 |
|
|
|
a89620 |
+rmq_app_running() {
|
|
|
a89620 |
+ if $RMQ_CTL eval 'application:which_applications().' | grep -q '{rabbit,'; then
|
|
|
a89620 |
+ ocf_log debug "RabbitMQ application is running"
|
|
|
a89620 |
+ return $OCF_SUCCESS
|
|
|
a89620 |
+ else
|
|
|
a89620 |
+ ocf_log debug "RabbitMQ application is stopped"
|
|
|
a89620 |
+ return $OCF_NOT_RUNNING
|
|
|
a89620 |
+ fi
|
|
|
a89620 |
+}
|
|
|
a89620 |
+
|
|
|
a89620 |
rmq_monitor() {
|
|
|
a89620 |
local rc
|
|
|
a89620 |
|
|
|
a89620 |
if $RMQ_CTL eval 'rabbit_mnesia:cluster_status_from_mnesia().' | grep -q '^{ok'; then
|
|
|
a89620 |
+ pcs_running=$(rmq_join_list | wc -w)
|
|
|
a89620 |
+ ocf_log debug "Pacemaker thinks ${pcs_running} RabbitMQ nodes are running"
|
|
|
a89620 |
+ rmq_running=$($RMQ_CTL eval 'length(mnesia:system_info(running_db_nodes)).')
|
|
|
a89620 |
+ ocf_log debug "RabbitMQ thinks ${rmq_running} RabbitMQ nodes are running"
|
|
|
a89620 |
+
|
|
|
a89620 |
+ if [ $(( $rmq_running * 2 )) -lt $pcs_running ]; then
|
|
|
a89620 |
+ ocf_log info "RabbitMQ is a minority partition, failing monitor"
|
|
|
a89620 |
+ rmq_delete_nodename
|
|
|
a89620 |
+ return $OCF_ERR_GENERIC
|
|
|
a89620 |
+ fi
|
|
|
a89620 |
+
|
|
|
a89620 |
ocf_log debug "RabbitMQ server is running normally"
|
|
|
a89620 |
rmq_write_nodename
|
|
|
a89620 |
|
|
|
a89620 |
@@ -215,7 +236,7 @@ rmq_init_and_wait()
|
|
|
a89620 |
return $OCF_ERR_GENERIC
|
|
|
a89620 |
fi
|
|
|
a89620 |
|
|
|
a89620 |
- rmq_monitor
|
|
|
a89620 |
+ rmq_app_running
|
|
|
a89620 |
return $?
|
|
|
a89620 |
}
|
|
|
a89620 |
|
|
|
a89620 |
@@ -236,6 +257,7 @@ rmq_start_first()
|
|
|
a89620 |
if [ $rc -eq 0 ]; then
|
|
|
a89620 |
rc=$OCF_SUCCESS
|
|
|
a89620 |
ocf_log info "cluster bootstrapped"
|
|
|
a89620 |
+ rmq_write_nodename
|
|
|
a89620 |
|
|
|
a89620 |
if [ -n "$OCF_RESKEY_set_policy" ]; then
|
|
|
a89620 |
# do not quote set_policy, we are passing in arguments
|
|
|
a89620 |
@@ -492,7 +514,7 @@ rmq_stop() {
|
|
|
a89620 |
end.
|
|
|
a89620 |
"
|
|
|
a89620 |
|
|
|
a89620 |
- rmq_monitor
|
|
|
a89620 |
+ rmq_app_running
|
|
|
a89620 |
if [ $? -eq $OCF_NOT_RUNNING ]; then
|
|
|
a89620 |
return $OCF_SUCCESS
|
|
|
a89620 |
fi
|
|
|
a89620 |
@@ -508,7 +530,7 @@ rmq_stop() {
|
|
|
a89620 |
#TODO add kill logic
|
|
|
a89620 |
stop_wait=1
|
|
|
a89620 |
while [ $stop_wait = 1 ]; do
|
|
|
a89620 |
- rmq_monitor
|
|
|
a89620 |
+ rmq_app_running
|
|
|
a89620 |
rc=$?
|
|
|
a89620 |
if [ "$rc" -eq $OCF_NOT_RUNNING ]; then
|
|
|
a89620 |
stop_wait=0
|