Tree - rpms/resource-agents - CentOS Git server

rpms / resource-agents

Blame SOURCES/bz1745713-rabbitmq-cluster-2-fail-when-in-minority-partition.patch

Blob History Raw

		b4b3ce	`From cc23c5523a0185fa557a5ab9056d50a60300d12a Mon Sep 17 00:00:00 2001`
		b4b3ce	`From: John Eckersberg <jeckersb@redhat.com>`
		b4b3ce	`Date: Tue, 16 Oct 2018 16:21:25 -0400`
		b4b3ce	`Subject: [PATCH] rabbitmq-cluster: fail monitor when node is in minority`
		b4b3ce	`partition`
		b4b3ce
		b4b3ce	`It's possible for mnesia to still be running, but for mnesia to be`
		b4b3ce	`partitioned. And it's also possible to get into this state without`
		b4b3ce	`pacemaker seeing the node go down so no corrective action is taken.`
		b4b3ce
		b4b3ce	`When monitoring, check the number of nodes that pacemaker thinks are`
		b4b3ce	`running, and compare it to the number of nodes that mnesia thinks are`
		b4b3ce	`running. If mnesia only sees a minority of the total nodes, fail it`
		b4b3ce	`so corrective action can be taken to rejoin the cluster.`
		b4b3ce
		b4b3ce	`This also adds a new function, rmq_app_running, which simply checks`
		b4b3ce	`whether the app is running or not and does not care about the`
		b4b3ce	`partition status. This is now used instead of the full monitor in a`
		b4b3ce	`few places where we don't care about partition state.`
		b4b3ce
		b4b3ce	`Resolves: RHBZ#1639826`
		b4b3ce	`---`
		b4b3ce	`heartbeat/rabbitmq-cluster \| 28 +++++++++++++++++++++++++---`
		b4b3ce	`1 file changed, 25 insertions(+), 3 deletions(-)`
		b4b3ce
		b4b3ce	`diff --git a/heartbeat/rabbitmq-cluster b/heartbeat/rabbitmq-cluster`
		b4b3ce	`index 204917475..78b2bbadf 100755`
		b4b3ce	`--- a/heartbeat/rabbitmq-cluster`
		b4b3ce	`+++ b/heartbeat/rabbitmq-cluster`
		b4b3ce	`@@ -178,10 +178,31 @@ remove_pid () {`
		b4b3ce	`rm -f ${RMQ_PID_FILE} > /dev/null 2>&1`
		b4b3ce	`}`
		b4b3ce
		b4b3ce	`+rmq_app_running() {`
		b4b3ce	`+ if $RMQ_CTL eval 'application:which_applications().' \| grep -q '{rabbit,'; then`
		b4b3ce	`+ ocf_log debug "RabbitMQ application is running"`
		b4b3ce	`+ return $OCF_SUCCESS`
		b4b3ce	`+ else`
		b4b3ce	`+ ocf_log debug "RabbitMQ application is stopped"`
		b4b3ce	`+ return $OCF_NOT_RUNNING`
		b4b3ce	`+ fi`
		b4b3ce	`+}`
		b4b3ce	`+`
		b4b3ce	`rmq_monitor() {`
		b4b3ce	`local rc`
		b4b3ce
		b4b3ce	`if $RMQ_CTL eval 'rabbit_mnesia:cluster_status_from_mnesia().' \| grep -q '^{ok'; then`
		b4b3ce	`+ pcs_running=$(rmq_join_list \| wc -w)`
		b4b3ce	`+ ocf_log debug "Pacemaker thinks ${pcs_running} RabbitMQ nodes are running"`
		b4b3ce	`+ rmq_running=$($RMQ_CTL eval 'length(mnesia:system_info(running_db_nodes)).')`
		b4b3ce	`+ ocf_log debug "RabbitMQ thinks ${rmq_running} RabbitMQ nodes are running"`
		b4b3ce	`+`
		b4b3ce	`+ if [ $(( $rmq_running * 2 )) -lt $pcs_running ]; then`
		b4b3ce	`+ ocf_log info "RabbitMQ is a minority partition, failing monitor"`
		b4b3ce	`+ rmq_delete_nodename`
		b4b3ce	`+ return $OCF_ERR_GENERIC`
		b4b3ce	`+ fi`
		b4b3ce	`+`
		b4b3ce	`ocf_log debug "RabbitMQ server is running normally"`
		b4b3ce	`rmq_write_nodename`
		b4b3ce
		b4b3ce	`@@ -215,7 +236,7 @@ rmq_init_and_wait()`
		b4b3ce	`return $OCF_ERR_GENERIC`
		b4b3ce	`fi`
		b4b3ce
		b4b3ce	`- rmq_monitor`
		b4b3ce	`+ rmq_app_running`
		b4b3ce	`return $?`
		b4b3ce	`}`
		b4b3ce
		b4b3ce	`@@ -236,6 +257,7 @@ rmq_start_first()`
		b4b3ce	`if [ $rc -eq 0 ]; then`
		b4b3ce	`rc=$OCF_SUCCESS`
		b4b3ce	`ocf_log info "cluster bootstrapped"`
		b4b3ce	`+ rmq_write_nodename`
		b4b3ce
		b4b3ce	`if [ -n "$OCF_RESKEY_set_policy" ]; then`
		b4b3ce	`# do not quote set_policy, we are passing in arguments`
		b4b3ce	`@@ -492,7 +514,7 @@ rmq_stop() {`
		b4b3ce	`end.`
		b4b3ce	`"`
		b4b3ce
		b4b3ce	`- rmq_monitor`
		b4b3ce	`+ rmq_app_running`
		b4b3ce	`if [ $? -eq $OCF_NOT_RUNNING ]; then`
		b4b3ce	`return $OCF_SUCCESS`
		b4b3ce	`fi`
		b4b3ce	`@@ -508,7 +530,7 @@ rmq_stop() {`
		b4b3ce	`#TODO add kill logic`
		b4b3ce	`stop_wait=1`
		b4b3ce	`while [ $stop_wait = 1 ]; do`
		b4b3ce	`- rmq_monitor`
		b4b3ce	`+ rmq_app_running`
		b4b3ce	`rc=$?`
		b4b3ce	`if [ "$rc" -eq $OCF_NOT_RUNNING ]; then`
		b4b3ce	`stop_wait=0`

rpms / resource-agents

Source Code

Blame SOURCES/bz1745713-rabbitmq-cluster-2-fail-when-in-minority-partition.patch