# Summary of this change set (commentary only; it precedes the first
# `diff --git` header and is not part of any hunk):
#
# 1. Adds SOURCES/bz1656733-rabbitmq-cluster-retry-start-cluster-join-fails.patch,
#    an 83-line patch against heartbeat/rabbitmq-cluster that:
#      - defines RMQ_TRY_RESTART_ERROR_CODE=126 as a "retry the start" sentinel;
#      - renames the existing rmq_start() to rmq_try_start();
#      - on a failed cluster join, replaces the old
#        `return $OCF_ERR_GENERIC` with two warnings, a call to rmq_stop, and
#        `return $RMQ_TRY_RESTART_ERROR_CODE`;
#      - adds a new rmq_start() that calls rmq_try_start in a loop for as long
#        as it returns the sentinel, then returns the first other status.
#    The loop has no retry cap and no delay of its own; the embedded commit
#    message states that it ends when the start succeeds or when pacemaker
#    stops the start operation at the start timeout.
# 2. Updates SPECS/resource-agents.spec: Release 12%{?dist}.7 -> 12%{?dist}.8,
#    registers the new file as Patch32, applies it with `%patch32 -p1`, and
#    adds the 4.1.1-12.8 changelog entry (rhbz#1656733).
#
# NOTE(review): 126 is also the status a POSIX shell reports for "command
# found but not executable". The hunks shown here only return the sentinel
# explicitly; confirm that no other path in rmq_try_start() can propagate a raw
# command status of 126, since rmq_start() would treat it as "retry".
# NOTE(review): payload lines must not be edited by hand -- the hunk headers
# (`@@ -0,0 +1,83 @@`, `@@ -384,8 +390,16 @@`, ...) and the diffstat encode
# exact line counts.
diff --git a/SOURCES/bz1656733-rabbitmq-cluster-retry-start-cluster-join-fails.patch b/SOURCES/bz1656733-rabbitmq-cluster-retry-start-cluster-join-fails.patch
new file mode 100644
index 0000000..80fe18b
--- /dev/null
+++ b/SOURCES/bz1656733-rabbitmq-cluster-retry-start-cluster-join-fails.patch
@@ -0,0 +1,83 @@
+From 63c9449bfa9a7fecbc0f00394699a475a384671d Mon Sep 17 00:00:00 2001
+From: Damien Ciabrini
+Date: Thu, 9 Aug 2018 16:33:26 +0200
+Subject: [PATCH] rabbitmq-cluster: retry start when cluster join fails
+
+When a node tries to join an existing cluster, it fetches a node
+list to try to connect from any of those running nodes.
+
+If the nodes from this list become unavailable while we're joining
+the cluster, the rabbitmq server will fail to get clustered and
+make the start operation fail.
+
+Give the resource a chance to start anyway by retrying the entire
+start actions until it succeeds or until the start timeout is
+reached and pacemaker stops the start operation.
+
+Co-Authored-by:
+Suggested-by:
+---
+ heartbeat/rabbitmq-cluster | 29 ++++++++++++++++++++++++++---
+ 1 file changed, 26 insertions(+), 3 deletions(-)
+
+diff --git a/heartbeat/rabbitmq-cluster b/heartbeat/rabbitmq-cluster
+index 9ff49e075..84f383460 100755
+--- a/heartbeat/rabbitmq-cluster
++++ b/heartbeat/rabbitmq-cluster
+@@ -31,6 +31,12 @@
+ 
+ #######################################################################
+ 
++# This arbitrary value here is used by the rmq_start action to
++# signify that the resource agent must retry the start process
++# It might potentially conflict with OCF assigned error code
++# in the future.
++RMQ_TRY_RESTART_ERROR_CODE=126
++
+ RMQ_SERVER=/usr/sbin/rabbitmq-server
+ RMQ_CTL=/usr/sbin/rabbitmqctl
+ RMQ_DATA_DIR="/var/lib/rabbitmq/mnesia"
+@@ -354,7 +360,7 @@ rmq_notify() {
+ 	return $OCF_SUCCESS
+ }
+ 
+-rmq_start() {
++rmq_try_start() {
+ 	local join_list=""
+ 	local rc
+ 
+@@ -384,8 +390,16 @@ rmq_start() {
+ 	rc=$?
+ 
+ 	if [ $rc -ne 0 ]; then
+-		ocf_log info "node failed to join even after reseting local data. Check SELINUX policy"
+-		return $OCF_ERR_GENERIC
++		# we could not join the rabbitmq cluster from any of the running nodes
++		# this might be due to a unexpected reset of those nodes. Give ourself
++		# a chance to start by retrying the entire start sequence.
++
++		ocf_log warn "Failed to join the RabbitMQ cluster from nodes ${join_list}. Stopping local unclustered rabbitmq"
++		rmq_stop
++
++		ocf_log warn "Re-detect available rabbitmq nodes and try to start again"
++		# return an unused OCF value to signify a "retry" condition
++		return $RMQ_TRY_RESTART_ERROR_CODE
+ 	fi
+ 
+ 	# Restore users, user permissions, and policies (if any)
+@@ -443,6 +457,15 @@ rmq_start() {
+ 	return $OCF_SUCCESS
+ }
+ 
++rmq_start() {
++	local rc=$RMQ_TRY_RESTART_ERROR_CODE
++	while [ $rc -eq $RMQ_TRY_RESTART_ERROR_CODE ]; do
++		rmq_try_start
++		rc=$?
++	done
++	return $rc
++}
++
+ rmq_stop() {
+ 	# Backup users, user permissions, and policies
+ 	BaseDataDir=`dirname $RMQ_DATA_DIR`
diff --git a/SPECS/resource-agents.spec b/SPECS/resource-agents.spec
index 941cb8b..6c6b664 100644
--- a/SPECS/resource-agents.spec
+++ b/SPECS/resource-agents.spec
@@ -89,7 +89,7 @@
 Name: resource-agents
 Summary: Open Source HA Reusable Cluster Resource Scripts
 Version: 4.1.1
-Release: 12%{?dist}.7
+Release: 12%{?dist}.8
 License: GPLv2+ and LGPLv2+ and ASL 2.0
 URL: https://github.com/ClusterLabs/resource-agents
 %if 0%{?fedora} || 0%{?centos_version} || 0%{?rhel}
@@ -141,6 +141,7 @@ Patch28: bz1641944-rabbitmq-cluster-monitor-mnesia-status.patch
 Patch29: bz1641946-1-rabbitmq-cluster-fail-in-minority-partition.patch
 Patch30: bz1641946-2-rabbitmq-cluster-fix-stop-regression.patch
 Patch31: bz1657138-rabbitmq-cluster-ensure-node-attribures-removed.patch
+Patch32: bz1656733-rabbitmq-cluster-retry-start-cluster-join-fails.patch
 # bundle patches
 Patch1000: bz1568588-7-gcp-bundled.patch
 Patch1001: bz1568588-8-google-cloud-sdk-fixes.patch
@@ -369,6 +370,7 @@ exit 1
 %patch29 -p1
 %patch30 -p1
 %patch31 -p1
+%patch32 -p1
 
 # add SAPHana agents to Makefile.am
 mv %{saphana_prefix}-%{saphana_hash}/SAPHana/ra/SAPHana* heartbeat
@@ -945,6 +947,10 @@ ccs_update_schema > /dev/null 2>&1 ||:
 %endif
 
 %changelog
+* Thu Jan 17 2019 Oyvind Albrigtsen - 4.1.1-12.8
+- rabbitmq-cluster: retry start when cluster join fails
+  Resolves: rhbz#1656733
+
 * Fri Dec 7 2018 Oyvind Albrigtsen - 4.1.1-12.7
 - rabbitmq-cluster: ensure node attributes are removed