diff --git a/SOURCES/bz1750704-docker-1-fail-gracefully-when-not-running.patch b/SOURCES/bz1750704-docker-1-fail-gracefully-when-not-running.patch new file mode 100644 index 0000000..1cc2bbb --- /dev/null +++ b/SOURCES/bz1750704-docker-1-fail-gracefully-when-not-running.patch @@ -0,0 +1,82 @@ +From 5941b98140b09e39b4dc2ee155817b287ef32859 Mon Sep 17 00:00:00 2001 +From: zaenk +Date: Thu, 16 May 2019 15:01:43 +0200 +Subject: [PATCH 1/2] Fails docker RA gracefully when command not found Fails + gracefully when daemon not running + +--- + heartbeat/docker | 25 +++++++++++++++++++++++++ + 1 file changed, 25 insertions(+) + +diff --git a/heartbeat/docker b/heartbeat/docker +index c206344ad..1942b8f2f 100755 +--- a/heartbeat/docker ++++ b/heartbeat/docker +@@ -33,6 +33,11 @@ + : ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} + . ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs + ++# Parameter defaults ++ ++OCF_RESKEY_daemon_pidfile_default="/var/run/docker.pid" ++: ${OCF_RESKEY_daemon_pidfile=${OCF_RESKEY_daemon_pidfile_default}} ++ + ####################################################################### + + meta_data() +@@ -176,6 +181,16 @@ container to be considered healthy. + + + ++ ++ ++ ++The RA will report not running status on hosts where the docker daemon ++is not running. ++ ++Name of the docker daemon pid file ++ ++ ++ + + + +@@ -277,6 +292,16 @@ docker_simple_status() + { + local val + ++ if [ ! -x "$(command -v docker)" ]; then ++ ocf_log err "docker is not installed on this host" ++ return $OCF_ERR_INSTALLED ++ fi ++ ++ if [ ! -e "$OCF_RESKEY_daemon_pidfile" ]; then ++ ocf_log err "docker daemon is not running, pid file $OCF_RESKEY_daemon_pidfile not exists" ++ return $OCF_NOT_RUNNING ++ fi ++ + container_exists + if [ $? 
-ne 0 ]; then + return $OCF_NOT_RUNNING + +From dca670318452a4666984b2087ea562987d7c5b4f Mon Sep 17 00:00:00 2001 +From: zaenk +Date: Thu, 16 May 2019 15:46:28 +0200 +Subject: [PATCH 2/2] Fixes parameter meta-data + +--- + heartbeat/docker | 1 - + 1 file changed, 1 deletion(-) + +diff --git a/heartbeat/docker b/heartbeat/docker +index 1942b8f2f..250714613 100755 +--- a/heartbeat/docker ++++ b/heartbeat/docker +@@ -181,7 +181,6 @@ container to be considered healthy. + + + +- + + + The RA will report not running status on hosts where the docker daemon diff --git a/SOURCES/bz1750704-docker-2-improve-daemon-check.patch b/SOURCES/bz1750704-docker-2-improve-daemon-check.patch new file mode 100644 index 0000000..fca2b95 --- /dev/null +++ b/SOURCES/bz1750704-docker-2-improve-daemon-check.patch @@ -0,0 +1,99 @@ +From b7ae1bfbe7d393a9e9d993da3da35cc4d1bc9eb4 Mon Sep 17 00:00:00 2001 +From: Michele Baldessari +Date: Wed, 28 Aug 2019 10:46:36 +0200 +Subject: [PATCH] Make the check for the docker daemon being up more robust + +This amends 5941b98140b09e39b4dc2ee155817b287ef32859 (Fails docker RA +gracefully when command not found). That commit checked for a pidfile +which tends to be less robust in the presence of stale pidfiles and +also adds a configuration option for the pidfile location which is +more churn than needed to simply check for a service availability. + +Let's simply call 'docker version'. When that command returns 1 the docker +daemon is not running and also return OCF_ERR_GENERIC instead of +OCF_NOT_RUNNING. This is a key point because if the docker daemon +is stopped and not running it can very well be that the containers +are still up (e.g. when you use live-restore in docker). In this +situation we want an explicit fence event to be triggered due to +the failure of stopping. 
+ +Not doing so would mean that the stop operation returned ok and +for example we'd be starting an A/P resource on a second node all +the while it was still running on the node where the docker daemon +was stopped. + +We also explicitly catch OCF_ERR_GENERIC in the docker_stop function +to make our intent clearer. + +Tested this in an Openstack deployment and observed the following: +A) All the usual pcmk operations still correctly work +B) A 'systemctl stop docker' will eventually trigger a fence operation + on the node. + +Co-Authored-By: Luca Miccini +Co-Authored-By: Damien Ciabrini +Signed-off-by: Michele Baldessari +--- + heartbeat/docker | 25 ++++++++++--------------- + 1 file changed, 10 insertions(+), 15 deletions(-) + +diff --git a/heartbeat/docker b/heartbeat/docker +index 60e163bda..7c587b962 100755 +--- a/heartbeat/docker ++++ b/heartbeat/docker +@@ -35,9 +35,6 @@ + + # Parameter defaults + +-OCF_RESKEY_daemon_pidfile_default="/var/run/docker.pid" +-: ${OCF_RESKEY_daemon_pidfile=${OCF_RESKEY_daemon_pidfile_default}} +- + ####################################################################### + + meta_data() +@@ -184,15 +182,6 @@ container to be considered healthy. + + + +- +- +-The RA will report not running status on hosts where the docker daemon +-is not running. +- +-Name of the docker daemon pid file +- +- +- + + + +@@ -299,9 +288,13 @@ docker_simple_status() + return $OCF_ERR_INSTALLED + fi + +- if [ ! -e "$OCF_RESKEY_daemon_pidfile" ]; then +- ocf_log err "docker daemon is not running, pid file $OCF_RESKEY_daemon_pidfile not exists" +- return $OCF_NOT_RUNNING ++ ++ # let's first check if the daemon is up and running. ++ VERSION_OUT=$(docker version) ++ version_ret=$? 
++ if [ $version_ret -eq 1 ]; then ++ ocf_exit_reason "Docker service is in error state while checking for ${CONTAINER}, based on image, ${OCF_RESKEY_image}: ${VERSION_OUT}" ++ return $OCF_ERR_GENERIC + fi + + container_exists +@@ -457,9 +450,11 @@ docker_stop() + { + local timeout=60 + docker_simple_status +- if [ $? -eq $OCF_NOT_RUNNING ]; then ++ if [ $? -eq $OCF_NOT_RUNNING ]; then + remove_container + return $OCF_SUCCESS ++ elif [ $? -eq $OCF_ERR_GENERIC ]; then ++ return $OCF_ERR_GENERIC + fi + + if [ -n "$OCF_RESKEY_CRM_meta_timeout" ]; then diff --git a/SOURCES/bz1750704-docker-3-fix-stop-return-code.patch b/SOURCES/bz1750704-docker-3-fix-stop-return-code.patch new file mode 100644 index 0000000..7029c6f --- /dev/null +++ b/SOURCES/bz1750704-docker-3-fix-stop-return-code.patch @@ -0,0 +1,41 @@ +From 3d52ec553fc3de82e1d1dcbef949882947915c49 Mon Sep 17 00:00:00 2001 +From: Michele Baldessari +Date: Thu, 29 Aug 2019 08:10:57 +0200 +Subject: [PATCH] Fixup docker_stop condition on error + +In docker_stop() let's store the return code of docker_simple_status +otherwise the second branch of the elif risks reflecting the return +code of the previous operation and not of docker_simple_status() + +Retested this with a bunch of stop/start commands on a rabbitmq bundle +and observed no issues. + +This fixes a stop() regression introduced in +b7ae1bfbe7d393a9e9d993da3da35cc4d1bc9eb4 ("Make the check for the docker +daemon being up more robust") + +Thanks to Kota Akatsuka for spotting it. + +Signed-off-by: Michele Baldessari +--- + heartbeat/docker | 5 +++-- + 1 file changed, 3 insertions(+), 2 deletions(-) + +diff --git a/heartbeat/docker b/heartbeat/docker +index 7c587b962..41ac42129 100755 +--- a/heartbeat/docker ++++ b/heartbeat/docker +@@ -450,10 +450,11 @@ docker_stop() + { + local timeout=60 + docker_simple_status +- if [ $? -eq $OCF_NOT_RUNNING ]; then ++ ret=$? ++ if [ $ret -eq $OCF_NOT_RUNNING ]; then + remove_container + return $OCF_SUCCESS +- elif [ $? 
-eq $OCF_ERR_GENERIC ]; then ++ elif [ $ret -eq $OCF_ERR_GENERIC ]; then + return $OCF_ERR_GENERIC + fi + diff --git a/SOURCES/bz1756262-NovaEvacuate-evacuate_delay.patch b/SOURCES/bz1756262-NovaEvacuate-evacuate_delay.patch new file mode 100644 index 0000000..9b429d7 --- /dev/null +++ b/SOURCES/bz1756262-NovaEvacuate-evacuate_delay.patch @@ -0,0 +1,50 @@ +From 8b9c49fd965f73709d5a6e2c21987ba26af4856b Mon Sep 17 00:00:00 2001 +From: Luca Miccini +Date: Wed, 25 Sep 2019 17:12:39 +0200 +Subject: [PATCH] Add a configurable delay to Nova Evacuate calls + +In case /var/lib/nova/instances resides on NFS we have seen migrations +failing with 'Failed to get "write" lock - Is another process using the +image' errors. + +This has been tracked down to grace/lease timeouts not having expired +before attempting the migration/evacuate, so in these cases it might be +desirable to delay the nova evacuate call to give the storage time to +release the locks. + +Change-Id: Ie2fe784202d754eda38092479b1ab3ff4d02136a +Resolves: rhbz#1740069 +--- + +diff --git a/heartbeat/NovaEvacuate b/heartbeat/NovaEvacuate +index 810f30a..596f520 100644 +--- a/heartbeat/NovaEvacuate ++++ b/heartbeat/NovaEvacuate +@@ -125,6 +125,15 @@ + + + ++ ++ ++Allows delaying the nova evacuate API call, e.g. to give a storage array time to clean ++up eventual locks/leases. ++ ++Nova evacuate delay ++ ++ ++ + + + +@@ -216,6 +225,11 @@ + fence_agent="fence_evacuate" + fi + ++ if [ ${OCF_RESKEY_evacuate_delay} != 0 ]; then ++ ocf_log info "Delaying nova evacuate by $OCF_RESKEY_evacuate_delay seconds" ++ sleep ${OCF_RESKEY_evacuate_delay} ++ fi ++ + ocf_log notice "Initiating evacuation of $node with $fence_agent" + $fence_agent ${fence_options} -o status -n ${node} + if [ $? 
= 1 ]; then diff --git a/SPECS/resource-agents.spec b/SPECS/resource-agents.spec index a7d6fe7..02d9d81 100644 --- a/SPECS/resource-agents.spec +++ b/SPECS/resource-agents.spec @@ -95,7 +95,7 @@ Name: resource-agents Summary: Open Source HA Reusable Cluster Resource Scripts Version: 4.1.1 -Release: 30%{?dist}.2 +Release: 30%{?dist}.4 License: GPLv2+ and LGPLv2+ and ASL 2.0 URL: https://github.com/ClusterLabs/resource-agents %if 0%{?fedora} || 0%{?centos_version} || 0%{?rhel} @@ -177,6 +177,10 @@ Patch56: bz1731426-dhcpd-keep-selinux-context.patch Patch57: bz1731427-CTDB-1-fixes.patch Patch58: bz1731427-CTDB-2-add-v4.9-support.patch Patch59: bz1744923-SAPHanaTopology-make-multi-instance-aware.patch +Patch60: bz1750704-docker-1-fail-gracefully-when-not-running.patch +Patch61: bz1750704-docker-2-improve-daemon-check.patch +Patch62: bz1750704-docker-3-fix-stop-return-code.patch +Patch63: bz1756262-NovaEvacuate-evacuate_delay.patch # bundle patches Patch1000: bz1568588-7-gcp-bundled.patch @@ -371,7 +375,7 @@ SAP instances to be managed in a cluster environment. License: GPLv2+ Summary: SAP HANA Scale-Out cluster resource agents Version: 0.163.2 -Release: 7%{?rcver:%{rcver}}%{?numcomm:.%{numcomm}}%{?alphatag:.%{alphatag}}%{?dirty:.%{dirty}}%{?dist}.2 +Release: 7%{?rcver:%{rcver}}%{?numcomm:.%{numcomm}}%{?alphatag:.%{alphatag}}%{?dirty:.%{dirty}}%{?dist}.4 %if 0%{?fedora} || 0%{?centos_version} || 0%{?rhel} Group: System Environment/Base %else @@ -391,7 +395,7 @@ environment. 
License: GPLv2+ Summary: SAP cluster connector script Version: 3.0.1 -Release: 7%{?rcver:%{rcver}}%{?numcomm:.%{numcomm}}%{?alphatag:.%{alphatag}}%{?dirty:.%{dirty}}%{?dist}.2 +Release: 7%{?rcver:%{rcver}}%{?numcomm:.%{numcomm}}%{?alphatag:.%{alphatag}}%{?dirty:.%{dirty}}%{?dist}.4 %if 0%{?fedora} || 0%{?centos_version} || 0%{?rhel} Group: System Environment/Base %else @@ -475,6 +479,10 @@ exit 1 %patch57 -p1 %patch58 -p1 -F1 %patch59 -p1 +%patch60 -p1 +%patch61 -p1 +%patch62 -p1 +%patch63 -p1 # add SAPHana agents to Makefile.am mv %{saphana_prefix}-%{saphana_hash}/SAPHana/ra/SAPHana* heartbeat @@ -1105,6 +1113,16 @@ ccs_update_schema > /dev/null 2>&1 ||: %endif %changelog +* Fri Sep 27 2019 Oyvind Albrigtsen - 4.1.1-30.4 +- NovaEvacuate: add "evacuate_delay" parameter + + Resolves: rhbz#1756262 + +* Tue Sep 10 2019 Oyvind Albrigtsen - 4.1.1-30.3 +- docker: improve daemon check to fence when docker is stopped manually + + Resolves: rhbz#1750704 + * Fri Aug 23 2019 Oyvind Albrigtsen - 4.1.1-30.2 - SAPHanaTopology: make multi instance aware (MCOS)