From de56014fff5dc280f62a82ff85f13011a537cc2d Mon Sep 17 00:00:00 2001
From: CentOS Sources
Date: Dec 17 2018 19:06:10 +0000
Subject: import resource-agents-4.1.1-12.el7_6.7

---

diff --git a/SOURCES/bz1641946-1-rabbitmq-cluster-fail-in-minority-partition.patch b/SOURCES/bz1641946-1-rabbitmq-cluster-fail-in-minority-partition.patch
new file mode 100644
index 0000000..72f5ff6
--- /dev/null
+++ b/SOURCES/bz1641946-1-rabbitmq-cluster-fail-in-minority-partition.patch
@@ -0,0 +1,96 @@
+From cc23c5523a0185fa557a5ab9056d50a60300d12a Mon Sep 17 00:00:00 2001
+From: John Eckersberg
+Date: Tue, 16 Oct 2018 16:21:25 -0400
+Subject: [PATCH] rabbitmq-cluster: fail monitor when node is in minority
+ partition
+
+It's possible for mnesia to still be running, but for mnesia to be
+partitioned. And it's also possible to get into this state without
+pacemaker seeing the node go down so no corrective action is taken.
+
+When monitoring, check the number of nodes that pacemaker thinks is
+running, and compare to the number of nodes that mnesia thinks is
+running. If mnesia only sees a minority of the total nodes, fail it
+so corrective action can be taken to rejoin the cluster.
+
+This also adds a new function, rmq_app_running, which simply checks
+whether the app is running or not and does not care about the
+partition status. This is now used instead of the full monitor in a
+few places where we don't care about partition state.
+
+Resolves: RHBZ#1639826
+---
+ heartbeat/rabbitmq-cluster | 28 +++++++++++++++++++++++++---
+ 1 file changed, 25 insertions(+), 3 deletions(-)
+
+diff --git a/heartbeat/rabbitmq-cluster b/heartbeat/rabbitmq-cluster
+index 204917475..78b2bbadf 100755
+--- a/heartbeat/rabbitmq-cluster
++++ b/heartbeat/rabbitmq-cluster
+@@ -178,10 +178,31 @@ remove_pid () {
+ 	rm -f ${RMQ_PID_FILE} > /dev/null 2>&1
+ }
+ 
++rmq_app_running() {
++	if $RMQ_CTL eval 'application:which_applications().' | grep -q '{rabbit,'; then
++		ocf_log debug "RabbitMQ application is running"
++		return $OCF_SUCCESS
++	else
++		ocf_log debug "RabbitMQ application is stopped"
++		return $OCF_NOT_RUNNING
++	fi
++}
++
+ rmq_monitor() {
+ 	local rc
+ 
+ 	if $RMQ_CTL eval 'rabbit_mnesia:cluster_status_from_mnesia().' | grep -q '^{ok'; then
++		pcs_running=$(rmq_join_list | wc -w)
++		ocf_log debug "Pacemaker thinks ${pcs_running} RabbitMQ nodes are running"
++		rmq_running=$($RMQ_CTL eval 'length(mnesia:system_info(running_db_nodes)).')
++		ocf_log debug "RabbitMQ thinks ${rmq_running} RabbitMQ nodes are running"
++
++		if [ $(( $rmq_running * 2 )) -lt $pcs_running ]; then
++			ocf_log info "RabbitMQ is a minority partition, failing monitor"
++			rmq_delete_nodename
++			return $OCF_ERR_GENERIC
++		fi
++
+ 		ocf_log debug "RabbitMQ server is running normally"
+ 		rmq_write_nodename
+ 
+@@ -215,7 +236,7 @@ rmq_init_and_wait()
+ 		return $OCF_ERR_GENERIC
+ 	fi
+ 
+-	rmq_monitor
++	rmq_app_running
+ 	return $?
+ }
+ 
+@@ -236,6 +257,7 @@ rmq_start_first()
+ 	if [ $rc -eq 0 ]; then
+ 		rc=$OCF_SUCCESS
+ 		ocf_log info "cluster bootstrapped"
++		rmq_write_nodename
+ 
+ 		if [ -n "$OCF_RESKEY_set_policy" ]; then
+ 			# do not quote set_policy, we are passing in arguments
+@@ -492,7 +514,7 @@ rmq_stop() {
+ 		end.
+ 	"
+ 
+-	rmq_monitor
++	rmq_app_running
+ 	if [ $? -eq $OCF_NOT_RUNNING ]; then
+ 		return $OCF_SUCCESS
+ 	fi
+@@ -508,7 +530,7 @@ rmq_stop() {
+ 	#TODO add kill logic
+ 	stop_wait=1
+ 	while [ $stop_wait = 1 ]; do
+-		rmq_monitor
++		rmq_app_running
+ 		rc=$?
+ 		if [ "$rc" -eq $OCF_NOT_RUNNING ]; then
+ 			stop_wait=0
diff --git a/SOURCES/bz1641946-1-rabbitmq-cluster-fail-when-in-minority-partition.patch b/SOURCES/bz1641946-1-rabbitmq-cluster-fail-when-in-minority-partition.patch
deleted file mode 100644
index 72f5ff6..0000000
--- a/SOURCES/bz1641946-1-rabbitmq-cluster-fail-when-in-minority-partition.patch
+++ /dev/null
@@ -1,96 +0,0 @@
-From cc23c5523a0185fa557a5ab9056d50a60300d12a Mon Sep 17 00:00:00 2001
-From: John Eckersberg
-Date: Tue, 16 Oct 2018 16:21:25 -0400
-Subject: [PATCH] rabbitmq-cluster: fail monitor when node is in minority
- partition
-
-It's possible for mnesia to still be running, but for mnesia to be
-partitioned. And it's also possible to get into this state without
-pacemaker seeing the node go down so no corrective action is taken.
-
-When monitoring, check the number of nodes that pacemaker thinks is
-running, and compare to the number of nodes that mnesia thinks is
-running. If mnesia only sees a minority of the total nodes, fail it
-so corrective action can be taken to rejoin the cluster.
-
-This also adds a new function, rmq_app_running, which simply checks
-whether the app is running or not and does not care about the
-partition status. This is now used instead of the full monitor in a
-few places where we don't care about partition state.
-
-Resolves: RHBZ#1639826
----
- heartbeat/rabbitmq-cluster | 28 +++++++++++++++++++++++++---
- 1 file changed, 25 insertions(+), 3 deletions(-)
-
-diff --git a/heartbeat/rabbitmq-cluster b/heartbeat/rabbitmq-cluster
-index 204917475..78b2bbadf 100755
---- a/heartbeat/rabbitmq-cluster
-+++ b/heartbeat/rabbitmq-cluster
-@@ -178,10 +178,31 @@ remove_pid () {
- 	rm -f ${RMQ_PID_FILE} > /dev/null 2>&1
- }
- 
-+rmq_app_running() {
-+	if $RMQ_CTL eval 'application:which_applications().' | grep -q '{rabbit,'; then
-+		ocf_log debug "RabbitMQ application is running"
-+		return $OCF_SUCCESS
-+	else
-+		ocf_log debug "RabbitMQ application is stopped"
-+		return $OCF_NOT_RUNNING
-+	fi
-+}
-+
- rmq_monitor() {
- 	local rc
- 
- 	if $RMQ_CTL eval 'rabbit_mnesia:cluster_status_from_mnesia().' | grep -q '^{ok'; then
-+		pcs_running=$(rmq_join_list | wc -w)
-+		ocf_log debug "Pacemaker thinks ${pcs_running} RabbitMQ nodes are running"
-+		rmq_running=$($RMQ_CTL eval 'length(mnesia:system_info(running_db_nodes)).')
-+		ocf_log debug "RabbitMQ thinks ${rmq_running} RabbitMQ nodes are running"
-+
-+		if [ $(( $rmq_running * 2 )) -lt $pcs_running ]; then
-+			ocf_log info "RabbitMQ is a minority partition, failing monitor"
-+			rmq_delete_nodename
-+			return $OCF_ERR_GENERIC
-+		fi
-+
- 		ocf_log debug "RabbitMQ server is running normally"
- 		rmq_write_nodename
- 
-@@ -215,7 +236,7 @@ rmq_init_and_wait()
- 		return $OCF_ERR_GENERIC
- 	fi
- 
--	rmq_monitor
-+	rmq_app_running
- 	return $?
- }
- 
-@@ -236,6 +257,7 @@ rmq_start_first()
- 	if [ $rc -eq 0 ]; then
- 		rc=$OCF_SUCCESS
- 		ocf_log info "cluster bootstrapped"
-+		rmq_write_nodename
- 
- 		if [ -n "$OCF_RESKEY_set_policy" ]; then
- 			# do not quote set_policy, we are passing in arguments
-@@ -492,7 +514,7 @@ rmq_stop() {
- 		end.
- 	"
- 
--	rmq_monitor
-+	rmq_app_running
- 	if [ $? -eq $OCF_NOT_RUNNING ]; then
- 		return $OCF_SUCCESS
- 	fi
-@@ -508,7 +530,7 @@ rmq_stop() {
- 	#TODO add kill logic
- 	stop_wait=1
- 	while [ $stop_wait = 1 ]; do
--		rmq_monitor
-+		rmq_app_running
- 		rc=$?
- 		if [ "$rc" -eq $OCF_NOT_RUNNING ]; then
- 			stop_wait=0
diff --git a/SOURCES/bz1657138-rabbitmq-cluster-ensure-node-attribures-removed.patch b/SOURCES/bz1657138-rabbitmq-cluster-ensure-node-attribures-removed.patch
new file mode 100644
index 0000000..0a25333
--- /dev/null
+++ b/SOURCES/bz1657138-rabbitmq-cluster-ensure-node-attribures-removed.patch
@@ -0,0 +1,42 @@
+From 8ed87936e9ad06318cc49ea767885a405dfde11e Mon Sep 17 00:00:00 2001
+From: John Eckersberg
+Date: Wed, 5 Dec 2018 11:45:43 -0500
+Subject: [PATCH] rabbitmq-cluster: better ensure node attributes are removed
+
+Ensure that the attribute is removed at the end of the stop action.
+Also if rmq_app_running or rmq_node_alive shows the service as down,
+ensure the attribute is deleted as well.
+
+Resolves: RHBZ#1656368
+---
+ heartbeat/rabbitmq-cluster | 3 +++
+ 1 file changed, 3 insertions(+)
+
+diff --git a/heartbeat/rabbitmq-cluster b/heartbeat/rabbitmq-cluster
+index 1643dd1e7..2dca3e216 100755
+--- a/heartbeat/rabbitmq-cluster
++++ b/heartbeat/rabbitmq-cluster
+@@ -184,6 +184,7 @@ rmq_app_running() {
+ 		return $OCF_SUCCESS
+ 	else
+ 		ocf_log debug "RabbitMQ application is stopped"
++		rmq_delete_nodename
+ 		return $OCF_NOT_RUNNING
+ 	fi
+ }
+@@ -194,6 +195,7 @@ rmq_node_alive() {
+ 		return $OCF_SUCCESS
+ 	else
+ 		ocf_log debug "RabbitMQ node is down"
++		rmq_delete_nodename
+ 		return $OCF_NOT_RUNNING
+ 	fi
+ }
+@@ -554,6 +556,7 @@ rmq_stop() {
+ 		sleep 1
+ 	done
+ 
++	rmq_delete_nodename
+ 	remove_pid
+ 	return $OCF_SUCCESS
+ }
diff --git a/SPECS/resource-agents.spec b/SPECS/resource-agents.spec
index 2a4492e..941cb8b 100644
--- a/SPECS/resource-agents.spec
+++ b/SPECS/resource-agents.spec
@@ -89,7 +89,7 @@
 Name: resource-agents
 Summary: Open Source HA Reusable Cluster Resource Scripts
 Version: 4.1.1
-Release: 12%{?dist}.6
+Release: 12%{?dist}.7
 License: GPLv2+ and LGPLv2+ and ASL 2.0
 URL: https://github.com/ClusterLabs/resource-agents
 %if 0%{?fedora} || 0%{?centos_version} || 0%{?rhel}
@@ -138,8 +138,9 @@ Patch25: bz1619428-2-LVM-activate-parameters-access-mode-fixes.patch
 Patch26: bz1637823-1-nfsserver-mount-rpc_pipefs.patch
 Patch27: bz1637823-2-nfsserver-var-lib-nfs-fix.patch
 Patch28: bz1641944-rabbitmq-cluster-monitor-mnesia-status.patch
-Patch29: bz1641946-1-rabbitmq-cluster-fail-when-in-minority-partition.patch
+Patch29: bz1641946-1-rabbitmq-cluster-fail-in-minority-partition.patch
 Patch30: bz1641946-2-rabbitmq-cluster-fix-stop-regression.patch
+Patch31: bz1657138-rabbitmq-cluster-ensure-node-attribures-removed.patch
 # bundle patches
 Patch1000: bz1568588-7-gcp-bundled.patch
 Patch1001: bz1568588-8-google-cloud-sdk-fixes.patch
@@ -367,6 +368,7 @@ exit 1
 %patch28 -p1
 %patch29 -p1
 %patch30 -p1
+%patch31 -p1
 
 # add SAPHana agents to Makefile.am
 mv %{saphana_prefix}-%{saphana_hash}/SAPHana/ra/SAPHana* heartbeat
@@ -943,6 +945,11 @@ ccs_update_schema > /dev/null 2>&1 ||:
 %endif
 
 %changelog
+* Fri Dec 7 2018 Oyvind Albrigtsen - 4.1.1-12.7
+- rabbitmq-cluster: ensure node attributes are removed
+
+  Resolves: rhbz#1657138
+
 * Wed Nov 7 2018 Oyvind Albrigtsen - 4.1.1-12.6
 - rabbitmq-cluster: fix stop regression
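A standalone sketch of the quorum test that bz1641946-1 adds to rmq_monitor, since the one-line arithmetic is easy to misread. This is not part of the imported patches, and the pcs_running/rmq_running values below are hypothetical stand-ins for what the agent derives from rmq_join_list and from mnesia:system_info(running_db_nodes):

    #!/bin/sh
    # Hypothetical stand-in values; the real agent computes these from
    # pacemaker (rmq_join_list | wc -w) and from mnesia respectively.
    pcs_running=5
    rmq_running=2

    # The minority test rmq_running < pcs_running/2 is written with
    # multiplication, 2*rmq_running < pcs_running, so shell integer
    # division cannot truncate away the "half" boundary.
    if [ $(( rmq_running * 2 )) -lt "$pcs_running" ]; then
        echo "minority partition: fail the monitor (OCF_ERR_GENERIC)"
    else
        echo "quorum held: monitor proceeds"
    fi

With these sample numbers, 2*2 < 5 holds, so the node sees strictly fewer than half of the nodes pacemaker sees; the monitor then fails and pacemaker takes corrective action so the node can rejoin the majority partition. With pcs_running=4 and rmq_running=2 (an even split), 4 < 4 is false and the monitor passes, since the node is not strictly in the minority.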