From 00d327e44048d2acdbe9354f3b488721e501806e Mon Sep 17 00:00:00 2001 From: CentOS Sources Date: Nov 26 2018 15:29:42 +0000 Subject: import resource-agents-4.1.1-12.el7_6.6 --- diff --git a/SOURCES/bz1641944-rabbitmq-cluster-monitor-mnesia-status.patch b/SOURCES/bz1641944-rabbitmq-cluster-monitor-mnesia-status.patch new file mode 100644 index 0000000..fab8bfd --- /dev/null +++ b/SOURCES/bz1641944-rabbitmq-cluster-monitor-mnesia-status.patch @@ -0,0 +1,57 @@ +From fcaa52bb98a8686d993550c6f4ab7867625c8059 Mon Sep 17 00:00:00 2001 +From: John Eckersberg +Date: Wed, 29 Aug 2018 16:18:55 -0400 +Subject: [PATCH] rabbitmq-cluster: get cluster status from mnesia during + monitor + +If mnesia is not running (for example if `rabbitmqctl stop_app` has +been called, or the service has paused during partition due to the +pause_minority strategy) then the cluster_status command to +rabbitmqctl will read the cached cluster status from disk and the +command returns 0 even though the service isn't really running at all. + +Instead, force the cluster status to be read from mnesia. If mnesia +is not running due to the above or similar circumstances, the command +will catch that and properly fail the monitor action. + +Resolves: RHBZ#1595753 +--- + heartbeat/rabbitmq-cluster | 20 +++++--------------- + 1 file changed, 5 insertions(+), 15 deletions(-) + +diff --git a/heartbeat/rabbitmq-cluster b/heartbeat/rabbitmq-cluster +index a7d2db614..204917475 100755 +--- a/heartbeat/rabbitmq-cluster ++++ b/heartbeat/rabbitmq-cluster +@@ -181,26 +181,16 @@ remove_pid () { + rmq_monitor() { + local rc + +- $RMQ_CTL cluster_status > /dev/null 2>&1 +- rc=$? +- case "$rc" in +- 0) ++ if $RMQ_CTL eval 'rabbit_mnesia:cluster_status_from_mnesia().' | grep -q '^{ok'; then + ocf_log debug "RabbitMQ server is running normally" + rmq_write_nodename +- ++ + return $OCF_SUCCESS +- ;; +- 2|68|69|70|75|78) +- ocf_log info "RabbitMQ server is not running" ++ else ++ ocf_log info "RabbitMQ server could not get cluster status from mnesia" + rmq_delete_nodename + return $OCF_NOT_RUNNING +- ;; +- *) +- ocf_log err "Unexpected return code from '$RMQ_CTL cluster_status' exit code: $rc" +- rmq_delete_nodename +- return $OCF_ERR_GENERIC +- ;; +- esac ++ fi + } + + rmq_init_and_wait() diff --git a/SOURCES/bz1641946-1-rabbitmq-cluster-fail-when-in-minority-partition.patch b/SOURCES/bz1641946-1-rabbitmq-cluster-fail-when-in-minority-partition.patch new file mode 100644 index 0000000..72f5ff6 --- /dev/null +++ b/SOURCES/bz1641946-1-rabbitmq-cluster-fail-when-in-minority-partition.patch @@ -0,0 +1,96 @@ +From cc23c5523a0185fa557a5ab9056d50a60300d12a Mon Sep 17 00:00:00 2001 +From: John Eckersberg +Date: Tue, 16 Oct 2018 16:21:25 -0400 +Subject: [PATCH] rabbitmq-cluster: fail monitor when node is in minority + partition + +It's possible for mnesia to still be running, but for mnesia to be +partitioned. And it's also possible to get into this state without +pacemaker seeing the node go down so no corrective action is taken. + +When monitoring, check the number of nodes that pacemaker thinks is +running, and compare to the number of nodes that mnesia thinks is +running. If mnesia only sees a minority of the total nodes, fail it +so corrective action can be taken to rejoin the cluster. + +This also adds a new function, rmq_app_running, which simply checks +whether the app is running or not and does not care about the +partition status. This is now used instead of the full monitor in a +few places where we don't care about partition state. + +Resolves: RHBZ#1639826 +--- + heartbeat/rabbitmq-cluster | 28 +++++++++++++++++++++++++--- + 1 file changed, 25 insertions(+), 3 deletions(-) + +diff --git a/heartbeat/rabbitmq-cluster b/heartbeat/rabbitmq-cluster +index 204917475..78b2bbadf 100755 +--- a/heartbeat/rabbitmq-cluster ++++ b/heartbeat/rabbitmq-cluster +@@ -178,10 +178,31 @@ remove_pid () { + rm -f ${RMQ_PID_FILE} > /dev/null 2>&1 + } + ++rmq_app_running() { ++ if $RMQ_CTL eval 'application:which_applications().' | grep -q '{rabbit,'; then ++ ocf_log debug "RabbitMQ application is running" ++ return $OCF_SUCCESS ++ else ++ ocf_log debug "RabbitMQ application is stopped" ++ return $OCF_NOT_RUNNING ++ fi ++} ++ + rmq_monitor() { + local rc + + if $RMQ_CTL eval 'rabbit_mnesia:cluster_status_from_mnesia().' | grep -q '^{ok'; then ++ pcs_running=$(rmq_join_list | wc -w) ++ ocf_log debug "Pacemaker thinks ${pcs_running} RabbitMQ nodes are running" ++ rmq_running=$($RMQ_CTL eval 'length(mnesia:system_info(running_db_nodes)).') ++ ocf_log debug "RabbitMQ thinks ${rmq_running} RabbitMQ nodes are running" ++ ++ if [ $(( $rmq_running * 2 )) -lt $pcs_running ]; then ++ ocf_log info "RabbitMQ is a minority partition, failing monitor" ++ rmq_delete_nodename ++ return $OCF_ERR_GENERIC ++ fi ++ + ocf_log debug "RabbitMQ server is running normally" + rmq_write_nodename + +@@ -215,7 +236,7 @@ rmq_init_and_wait() + return $OCF_ERR_GENERIC + fi + +- rmq_monitor ++ rmq_app_running + return $? + } + +@@ -236,6 +257,7 @@ rmq_start_first() + if [ $rc -eq 0 ]; then + rc=$OCF_SUCCESS + ocf_log info "cluster bootstrapped" ++ rmq_write_nodename + + if [ -n "$OCF_RESKEY_set_policy" ]; then + # do not quote set_policy, we are passing in arguments +@@ -492,7 +514,7 @@ rmq_stop() { + end. + " + +- rmq_monitor ++ rmq_app_running + if [ $? -eq $OCF_NOT_RUNNING ]; then + return $OCF_SUCCESS + fi +@@ -508,7 +530,7 @@ rmq_stop() { + #TODO add kill logic + stop_wait=1 + while [ $stop_wait = 1 ]; do +- rmq_monitor ++ rmq_app_running + rc=$? + if [ "$rc" -eq $OCF_NOT_RUNNING ]; then + stop_wait=0 diff --git a/SOURCES/bz1641946-2-rabbitmq-cluster-fix-stop-regression.patch b/SOURCES/bz1641946-2-rabbitmq-cluster-fix-stop-regression.patch new file mode 100644 index 0000000..8b422eb --- /dev/null +++ b/SOURCES/bz1641946-2-rabbitmq-cluster-fix-stop-regression.patch @@ -0,0 +1,63 @@ +From 19ee29342f8bb573722991b8cbe4503309ad0bf9 Mon Sep 17 00:00:00 2001 +From: John Eckersberg +Date: Fri, 2 Nov 2018 13:12:53 -0400 +Subject: [PATCH] rabbitmq-cluster: fix regression in rmq_stop + +This regression was introduced in PR#1249 (cc23c55). The stop action +was modified to use rmq_app_running in order to check the service +status, which allows for the following sequence of events: + +- service is started, unclustered +- stop_app is called +- cluster_join is attempted and fails +- stop is called + +Because stop_app was called, rmq_app_running returns $OCF_NOT_RUNNING +and the stop action is a no-op. This means the erlang VM continues +running. + +When the start action is attempted again, a new erlang VM is launched, +but this VM fails to boot because the old one is still running and is +registered with the same name (rabbit@nodename). + +This adds a new function, rmq_node_alive, which does a simple eval to +test whether the erlang VM is up, independent of the rabbit app. The +stop action now uses rmq_node_alive to check the service status, so +even if stop_app was previously called, the erlang VM will be stopped +properly. + +Resolves: RHBZ#1639826 +--- + heartbeat/rabbitmq-cluster | 12 +++++++++++- + 1 file changed, 11 insertions(+), 1 deletion(-) + +diff --git a/heartbeat/rabbitmq-cluster b/heartbeat/rabbitmq-cluster +index 78b2bbadf..a2de9dc20 100755 +--- a/heartbeat/rabbitmq-cluster ++++ b/heartbeat/rabbitmq-cluster +@@ -188,6 +188,16 @@ rmq_app_running() { + fi + } + ++rmq_node_alive() { ++ if $RMQ_CTL eval 'ok.'; then ++ ocf_log debug "RabbitMQ node is alive" ++ return $OCF_SUCCESS ++ else ++ ocf_log debug "RabbitMQ node is down" ++ return $OCF_NOT_RUNNING ++ fi ++} ++ + rmq_monitor() { + local rc + +@@ -514,7 +524,7 @@ rmq_stop() { + end. + " + +- rmq_app_running ++ rmq_node_alive + if [ $? -eq $OCF_NOT_RUNNING ]; then + return $OCF_SUCCESS + fi diff --git a/SPECS/resource-agents.spec b/SPECS/resource-agents.spec index 9f0f228..2a4492e 100644 --- a/SPECS/resource-agents.spec +++ b/SPECS/resource-agents.spec @@ -89,7 +89,7 @@ Name: resource-agents Summary: Open Source HA Reusable Cluster Resource Scripts Version: 4.1.1 -Release: 12%{?dist}.4 +Release: 12%{?dist}.6 License: GPLv2+ and LGPLv2+ and ASL 2.0 URL: https://github.com/ClusterLabs/resource-agents %if 0%{?fedora} || 0%{?centos_version} || 0%{?rhel} @@ -137,6 +137,9 @@ Patch24: bz1523318-timeout-interval-add-s-suffix.patch Patch25: bz1619428-2-LVM-activate-parameters-access-mode-fixes.patch Patch26: bz1637823-1-nfsserver-mount-rpc_pipefs.patch Patch27: bz1637823-2-nfsserver-var-lib-nfs-fix.patch +Patch28: bz1641944-rabbitmq-cluster-monitor-mnesia-status.patch +Patch29: bz1641946-1-rabbitmq-cluster-fail-when-in-minority-partition.patch +Patch30: bz1641946-2-rabbitmq-cluster-fix-stop-regression.patch # bundle patches Patch1000: bz1568588-7-gcp-bundled.patch Patch1001: bz1568588-8-google-cloud-sdk-fixes.patch @@ -361,6 +364,9 @@ exit 1 %patch25 -p1 %patch26 -p1 %patch27 -p1 +%patch28 -p1 +%patch29 -p1 +%patch30 -p1 # add SAPHana agents to Makefile.am mv %{saphana_prefix}-%{saphana_hash}/SAPHana/ra/SAPHana* heartbeat @@ -880,7 +886,7 @@ ccs_update_schema > /dev/null 2>&1 ||: %ifarch x86_64 %files aliyun -%doc %{aliyuncli}_README.rst %{colorama}_README.rst %{jmespath}_README.rst %{pycryptodome}_README.rst aliyun*_README* +%doc aliyun*_README* %{colorama}_README.rst %{jmespath}_README.rst %{pycryptodome}_README.rst %license %{aliyuncli}_LICENSE %{colorama}_LICENSE.txt %{jmespath}_LICENSE.txt %{pycryptodome}_LICENSE.rst %defattr(-,root,root) /usr/lib/ocf/resource.d/heartbeat/aliyun-vpc-move-ip* @@ -937,6 +943,18 @@ ccs_update_schema > /dev/null 2>&1 ||: %endif %changelog +* Wed Nov 7 2018 Oyvind Albrigtsen - 4.1.1-12.6 +- rabbitmq-cluster: fix stop regression + + Resolves: rhbz#1641946 + +* Tue Oct 23 2018 Oyvind Albrigtsen - 4.1.1-12.5 +- rabbitmq-cluster: get cluster status from mnesia during monitor +- rabbitmq-cluster: fail monitor when node is in minority partition + + Resolves: rhbz#1641944 + Resolves: rhbz#1641946 + * Thu Oct 11 2018 Oyvind Albrigtsen - 4.1.1-12.4 - nfsserver: mount rpc_pipefs