diff --git a/SOURCES/bz1832859-rabbitmq-cluster-increase-wait-timeout.patch b/SOURCES/bz1832859-rabbitmq-cluster-increase-wait-timeout.patch
new file mode 100644
index 0000000..558ecc6
--- /dev/null
+++ b/SOURCES/bz1832859-rabbitmq-cluster-increase-wait-timeout.patch
@@ -0,0 +1,60 @@
+From cf1e7bfab984b5e9451a63c25b39c0932e0d9116 Mon Sep 17 00:00:00 2001
+From: Michele Baldessari <michele@acksyn.org>
+Date: Wed, 6 May 2020 16:11:36 +0200
+Subject: [PATCH] Increase the rabbitmqctl wait timeout during start()
+
+After we start the rabbitmq process we wait for the pid to show up
+and then declare the server to be started successfully.
+This wait is done via 'rabbitmqctl wait'. Now, from
+https://www.rabbitmq.com/rabbitmqctl.8.html we have:
+
+  If the specified pidfile is not created or erlang node is not started
+  within --timeout the command will fail. Default timeout is 10 seconds.
+
+This default of 10 seconds might not be enough in overloaded
+environments. So what we want to do here is wait for as much time as
+the start() operation allows us. So we wait for OCF_RESKEY_CRM_meta_timeout
+minus 5 seconds. In the rare and non-sensical case that it is less than
+10s we do not pass a timeout string at all to rabbitmqctl.
+
+Co-Authored-By: John Eckersberg <jeckersb@redhat.com>
+---
+ heartbeat/rabbitmq-cluster | 15 +++++++++++++--
+ 1 file changed, 13 insertions(+), 2 deletions(-)
+
+diff --git a/heartbeat/rabbitmq-cluster b/heartbeat/rabbitmq-cluster
+index a9ebd37ad..f7d48120c 100755
+--- a/heartbeat/rabbitmq-cluster
++++ b/heartbeat/rabbitmq-cluster
+@@ -294,6 +294,8 @@ rmq_monitor() {
+ rmq_init_and_wait()
+ {
+ 	local rc
++	local wait_timeout
++	local timeout_string
+ 
+ 	prepare_dir $RMQ_PID_DIR
+ 	prepare_dir $RMQ_LOG_DIR
+@@ -305,11 +307,20 @@ rmq_init_and_wait()
+ 	setsid sh -c "$RMQ_SERVER > ${RMQ_LOG_DIR}/startup_log 2> ${RMQ_LOG_DIR}/startup_err" &
+ 
+ 	ocf_log info "Waiting for server to start"
+-	$RMQ_CTL wait $RMQ_PID_FILE
++	# We want to give the wait command almost the full startup timeout we are given
++	# So we use the start operation timeout (in ms), convert it and subtract 5 seconds
++	# In the silly case that it is less than 10 seconds we just skip setting the timeout
++	wait_timeout=`expr $OCF_RESKEY_CRM_meta_timeout / 1000 - 5`
++	if [ $wait_timeout -gt 10 ]; then
++		timeout_string="--timeout ${wait_timeout}"
++	else
++		timeout_string=""
++	fi
++	$RMQ_CTL $timeout_string wait $RMQ_PID_FILE
+ 	rc=$?
+ 	if [ $rc -ne $OCF_SUCCESS ]; then
+ 		remove_pid
+-		ocf_log info "rabbitmq-server start failed: $rc"
++		ocf_log info "rabbitmq-server start failed with a timeout of ($timeout_string): $rc"
+ 		return $OCF_ERR_GENERIC
+ 	fi
+ 
diff --git a/SOURCES/bz1841831-podman-force-rm-container-if-rm-fails.patch b/SOURCES/bz1841831-podman-force-rm-container-if-rm-fails.patch
new file mode 100644
index 0000000..89fbb06
--- /dev/null
+++ b/SOURCES/bz1841831-podman-force-rm-container-if-rm-fails.patch
@@ -0,0 +1,53 @@
+From 5a732511db2c49ff6afe0a20e738b565a35273ae Mon Sep 17 00:00:00 2001
+From: Damien Ciabrini <dciabrin@redhat.com>
+Date: Fri, 29 May 2020 11:57:29 +0200
+Subject: [PATCH] podman: make sure to remove containers with lingering exec
+ sessions
+
+It may happen that some "podman exec" commands don't finish
+cleanly and leave lingering "Exec sessions" in the container's
+state. In that case, a "podman rm" command will always fail.
+
+To overcome the podman bug, issue a "podman rm -f" command when
+we detect a container is stopped but still has some lingering
+"Exec sessions" associated with it.
+
+Related-Bug: rhbz#1839721
+---
+ heartbeat/podman | 16 ++++++++++++++++
+ 1 file changed, 16 insertions(+)
+
+diff --git a/heartbeat/podman b/heartbeat/podman
+index f77d988fc..e2f6e981b 100755
+--- a/heartbeat/podman
++++ b/heartbeat/podman
+@@ -232,6 +232,9 @@ container_exists()
+ 
+ remove_container()
+ {
++	local rc
++	local execids
++
+ 	if ocf_is_true "$OCF_RESKEY_reuse"; then
+ 		# never remove the container if we have reuse enabled.
+ 		return 0
+@@ -244,6 +247,19 @@ remove_container()
+ 	fi
+ 	ocf_log notice "Cleaning up inactive container, ${CONTAINER}."
+ 	ocf_run podman rm $CONTAINER
++	rc=$?
++	if [ $rc -ne 0 ]; then
++		# due to a podman bug (rhbz#1841485), sometimes a stopped
++		# container can still be associated with Exec sessions, in
++		# which case the "podman rm" has to be forced
++		execids=$(podman inspect $CONTAINER --format '{{len .ExecIDs}}')
++		if [ "$execids" -ne "0" ]; then
++			ocf_log warn "Inactive container ${CONTAINER} has lingering exec sessions. Force-remove it."
++			ocf_run podman rm -f $CONTAINER
++			rc=$?
++		fi
++	fi
++	return $rc
+ }
+ 
+ podman_simple_status()
diff --git a/SPECS/resource-agents.spec b/SPECS/resource-agents.spec
index aa69629..2e9e451 100644
--- a/SPECS/resource-agents.spec
+++ b/SPECS/resource-agents.spec
@@ -66,7 +66,7 @@
 Name: resource-agents
 Summary: Open Source HA Reusable Cluster Resource Scripts
 Version: 4.1.1
-Release: 44%{?rcver:%{rcver}}%{?numcomm:.%{numcomm}}%{?alphatag:.%{alphatag}}%{?dirty:.%{dirty}}%{?dist}.1
+Release: 44%{?rcver:%{rcver}}%{?numcomm:.%{numcomm}}%{?alphatag:.%{alphatag}}%{?dirty:.%{dirty}}%{?dist}.3
 License: GPLv2+ and LGPLv2+
 URL: https://github.com/ClusterLabs/resource-agents
 %if 0%{?fedora} || 0%{?centos_version} || 0%{?rhel}
@@ -189,6 +189,8 @@ Patch102: bz1792196-rabbitmq-cluster-delete-nodename-when-stop-fails.patch
 Patch103: bz1808468-1-lvmlockd-fix-conditionals.patch
 Patch104: bz1808468-2-remove-locking_type.patch
 Patch105: bz1822250-aws-vpc-move-ip-delete-remaining-route-entries.patch
+Patch106: bz1832859-rabbitmq-cluster-increase-wait-timeout.patch
+Patch107: bz1841831-podman-force-rm-container-if-rm-fails.patch
 
 # bundle patches
 Patch1000: 7-gcp-bundled.patch
@@ -443,6 +445,8 @@ exit 1
 %patch103 -p1
 %patch104 -p1
 %patch105 -p1
+%patch106 -p1
+%patch107 -p1
 
 chmod 755 heartbeat/nova-compute-wait
 chmod 755 heartbeat/NovaEvacuate
@@ -987,6 +991,16 @@ ccs_update_schema > /dev/null 2>&1 ||:
 %endif
 
 %changelog
+* Tue Jun 2 2020 Oyvind Albrigtsen <oalbrigt@redhat.com> - 4.1.1-44.3
+- podman: force remove container if remove fails
+
+  Resolves: rhbz#1841831
+
+* Thu May 7 2020 Oyvind Albrigtsen <oalbrigt@redhat.com> - 4.1.1-44.2
+- rabbitmq-cluster: increase rabbitmqctl wait timeout during start
+
+  Resolves: rhbz#1832859
+
 * Wed Apr 15 2020 Oyvind Albrigtsen <oalbrigt@redhat.com> - 4.1.1-44.1
 - aws-vpc-move-ip: delete remaining route entries
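
For reviewers who want to exercise the two changes above outside Pacemaker, here is a
minimal sh sketch of the same logic. It is not part of the dist-git diff; the 150000 ms
timeout and the container name "demo" are assumed example values, not taken from the spec.

	#!/bin/sh
	# rabbitmq-cluster change: derive the 'rabbitmqctl wait' timeout from the
	# start-operation timeout Pacemaker exports in milliseconds, minus 5s of
	# headroom; below 10s, omit --timeout so rabbitmqctl keeps its 10s default.
	OCF_RESKEY_CRM_meta_timeout=${OCF_RESKEY_CRM_meta_timeout:-150000}  # assumed example value
	wait_timeout=$((OCF_RESKEY_CRM_meta_timeout / 1000 - 5))
	if [ "$wait_timeout" -gt 10 ]; then
		timeout_string="--timeout ${wait_timeout}"
	else
		timeout_string=""
	fi
	# command is echoed rather than executed in this sketch
	echo "would run: rabbitmqctl ${timeout_string} wait <pidfile>"

	# podman change: a stopped container that still holds exec sessions makes a
	# plain 'podman rm' fail (rhbz#1841485); count the sessions via 'podman
	# inspect' and force-remove only when some actually linger.
	CONTAINER=demo  # assumed example container name
	if ! podman rm "$CONTAINER"; then
		execids=$(podman inspect "$CONTAINER" --format '{{len .ExecIDs}}')
		if [ -n "$execids" ] && [ "$execids" -ne 0 ]; then
			podman rm -f "$CONTAINER"
		fi
	fi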