Blame SOURCES/bz1746266-docker-2-improve-daemon-check.patch

bafc1b
From b7ae1bfbe7d393a9e9d993da3da35cc4d1bc9eb4 Mon Sep 17 00:00:00 2001
bafc1b
From: Michele Baldessari <michele@acksyn.org>
bafc1b
Date: Wed, 28 Aug 2019 10:46:36 +0200
bafc1b
Subject: [PATCH] Make the check for the docker daemon being up more robust
bafc1b
bafc1b
This amends 5941b98140b09e39b4dc2ee155817b287ef32859 (Fails docker RA
bafc1b
gracefully when command not found). That commit checked for a pidfile
bafc1b
which tends to be less robust in the presence of stale pidfiles and
bafc1b
also adds a configuration option for the pidfile location which is
bafc1b
more churn than needed to simply check for a service availability.
bafc1b
bafc1b
Let's simply call 'docker version'. When that command returns 1 the docker
bafc1b
daemon is not running, and in that case we return OCF_ERR_GENERIC instead of
bafc1b
OCF_NOT_RUNNING. This is a key point because if the docker daemon
bafc1b
is stopped and not running it can very well be that the containers
bafc1b
are still up (e.g. when you use live-restore in docker). In this
bafc1b
situation we want an explicit fence event to be triggered due to
bafc1b
the failure of stopping.
bafc1b
bafc1b
Not doing so would mean that the stop operation returned ok and
bafc1b
for example we'd be starting an A/P resource on a second node all
bafc1b
the while it was still running on the node where the docker daemon
bafc1b
was stopped.
bafc1b
bafc1b
We also explicitly catch OCF_ERR_GENERIC in the docker_stop function
bafc1b
to make our intent clearer.
bafc1b
bafc1b
Tested this in an Openstack deployment and observed the following:
bafc1b
A) All the usual pcmk operations still correctly work
bafc1b
B) A 'systemctl stop docker' will eventually trigger a fence operation
bafc1b
   on the node.
bafc1b
bafc1b
Co-Authored-By: Luca Miccini <lmiccini@redhat.com>
bafc1b
Co-Authored-By: Damien Ciabrini <dciabrin@redhat.com>
bafc1b
Signed-off-by: Michele Baldessari <michele@acksyn.org>
bafc1b
---
bafc1b
 heartbeat/docker | 25 ++++++++++---------------
bafc1b
 1 file changed, 10 insertions(+), 15 deletions(-)
bafc1b
bafc1b
diff --git a/heartbeat/docker b/heartbeat/docker
bafc1b
index 60e163bda..7c587b962 100755
bafc1b
--- a/heartbeat/docker
bafc1b
+++ b/heartbeat/docker
bafc1b
@@ -35,9 +35,6 @@
bafc1b
 
bafc1b
 # Parameter defaults
bafc1b
 
bafc1b
-OCF_RESKEY_daemon_pidfile_default="/var/run/docker.pid"
bafc1b
-: ${OCF_RESKEY_daemon_pidfile=${OCF_RESKEY_daemon_pidfile_default}}
bafc1b
-
bafc1b
 #######################################################################
bafc1b
 
bafc1b
 meta_data()
bafc1b
@@ -184,15 +182,6 @@ container to be considered healthy.
bafc1b
 <content type="boolean"/>
bafc1b
 </parameter>
bafc1b
 
bafc1b
-<parameter name="daemon_pidfile" required="0" unique="0">
bafc1b
-<longdesc lang="en">
bafc1b
-The RA will report not running status on hosts where the docker daemon
bafc1b
-is not running.
bafc1b
-</longdesc>
bafc1b
-<shortdesc lang="en">Name of the docker daemon pid file</shortdesc>
bafc1b
-<content type="string" default="${OCF_RESKEY_daemon_pidfile_default}"/>
bafc1b
-</parameter>
bafc1b
-
bafc1b
 </parameters>
bafc1b
 
bafc1b
 <actions>
bafc1b
@@ -299,9 +288,13 @@ docker_simple_status()
bafc1b
 		return $OCF_ERR_INSTALLED
bafc1b
 	fi
bafc1b
 
bafc1b
-	if [ ! -e "$OCF_RESKEY_daemon_pidfile" ]; then
bafc1b
-		ocf_log err "docker daemon is not running, pid file $OCF_RESKEY_daemon_pidfile not exists"
bafc1b
-		return $OCF_NOT_RUNNING
bafc1b
+
bafc1b
+	# let's first check if the daemon is up and running.
bafc1b
+	VERSION_OUT=$(docker version)
bafc1b
+	version_ret=$?
bafc1b
+	if [ $version_ret -eq 1 ]; then
bafc1b
+		ocf_exit_reason "Docker service is in error state while checking for ${CONTAINER}, based on image, ${OCF_RESKEY_image}: ${VERSION_OUT}"
bafc1b
+		return $OCF_ERR_GENERIC
bafc1b
 	fi
bafc1b
 
bafc1b
 	container_exists
bafc1b
@@ -457,9 +450,11 @@ docker_stop()
bafc1b
 {
bafc1b
 	local timeout=60
bafc1b
 	docker_simple_status
bafc1b
-	if [ $? -eq  $OCF_NOT_RUNNING ]; then
bafc1b
+	if [ $? -eq $OCF_NOT_RUNNING ]; then
bafc1b
 		remove_container
bafc1b
 		return $OCF_SUCCESS
bafc1b
+        elif [ $? -eq $OCF_ERR_GENERIC ]; then
bafc1b
+               return $OCF_ERR_GENERIC
bafc1b
 	fi
bafc1b
 
bafc1b
 	if [ -n "$OCF_RESKEY_CRM_meta_timeout" ]; then