Blame SOURCES/bz1746266-docker-2-improve-daemon-check.patch

391384
From b7ae1bfbe7d393a9e9d993da3da35cc4d1bc9eb4 Mon Sep 17 00:00:00 2001
391384
From: Michele Baldessari <michele@acksyn.org>
391384
Date: Wed, 28 Aug 2019 10:46:36 +0200
391384
Subject: [PATCH] Make the check for the docker daemon being up more robust
391384
391384
This amends 5941b98140b09e39b4dc2ee155817b287ef32859 (Fails docker RA
391384
gracefully when command not found). That commit checked for a pidfile
391384
which tends to be less robust in the presence of stale pidfiles and
391384
also adds a configuration option for the pidfile location which is
391384
more churn than needed to simply check for a service availability.
391384
391384
Let's simply call 'docker version'. When that command returns 1 the docker
391384
daemon is not running, and we return OCF_ERR_GENERIC instead of
391384
OCF_NOT_RUNNING. This is a key point because if the docker daemon
391384
is stopped and not running it can very well be that the containers
391384
are still up (e.g. when you use live-restore in docker). In this
391384
situation we want an explicit fence event to be triggered due to
391384
the failure of stopping.
391384
391384
Not doing so would mean that the stop operation returned ok and
391384
for example we'd be starting an A/P resource on a second node all
391384
the while it was still running on the node where the docker daemon
391384
was stopped.
391384
391384
We also explicitly catch OCF_ERR_GENERIC in the docker_stop function
391384
to make our intent clearer.
391384
391384
Tested this in an Openstack deployment and observed the following:
391384
A) All the usual pcmk operations still correctly work
391384
B) A 'systemctl stop docker' will eventually trigger a fence operation
391384
   on the node.
391384
391384
Co-Authored-By: Luca Miccini <lmiccini@redhat.com>
391384
Co-Authored-By: Damien Ciabrini <dciabrin@redhat.com>
391384
Signed-off-by: Michele Baldessari <michele@acksyn.org>
391384
---
391384
 heartbeat/docker | 25 ++++++++++---------------
391384
 1 file changed, 10 insertions(+), 15 deletions(-)
391384
391384
diff --git a/heartbeat/docker b/heartbeat/docker
391384
index 60e163bda..7c587b962 100755
391384
--- a/heartbeat/docker
391384
+++ b/heartbeat/docker
391384
@@ -35,9 +35,6 @@
391384
 
391384
 # Parameter defaults
391384
 
391384
-OCF_RESKEY_daemon_pidfile_default="/var/run/docker.pid"
391384
-: ${OCF_RESKEY_daemon_pidfile=${OCF_RESKEY_daemon_pidfile_default}}
391384
-
391384
 #######################################################################
391384
 
391384
 meta_data()
391384
@@ -184,15 +182,6 @@ container to be considered healthy.
391384
 <content type="boolean"/>
391384
 </parameter>
391384
 
391384
-<parameter name="daemon_pidfile" required="0" unique="0">
391384
-<longdesc lang="en">
391384
-The RA will report not running status on hosts where the docker daemon
391384
-is not running.
391384
-</longdesc>
391384
-<shortdesc lang="en">Name of the docker daemon pid file</shortdesc>
391384
-<content type="string" default="${OCF_RESKEY_daemon_pidfile_default}"/>
391384
-</parameter>
391384
-
391384
 </parameters>
391384
 
391384
 <actions>
391384
@@ -299,9 +288,13 @@ docker_simple_status()
391384
 		return $OCF_ERR_INSTALLED
391384
 	fi
391384
 
391384
-	if [ ! -e "$OCF_RESKEY_daemon_pidfile" ]; then
391384
-		ocf_log err "docker daemon is not running, pid file $OCF_RESKEY_daemon_pidfile not exists"
391384
-		return $OCF_NOT_RUNNING
391384
+
391384
+	# let's first check if the daemon is up and running.
391384
+	VERSION_OUT=$(docker version)
391384
+	version_ret=$?
391384
+	if [ $version_ret -eq 1 ]; then
391384
+		ocf_exit_reason "Docker service is in error state while checking for ${CONTAINER}, based on image, ${OCF_RESKEY_image}: ${VERSION_OUT}"
391384
+		return $OCF_ERR_GENERIC
391384
 	fi
391384
 
391384
 	container_exists
391384
@@ -457,9 +450,11 @@ docker_stop()
391384
 {
391384
 	local timeout=60
391384
 	docker_simple_status
391384
-	if [ $? -eq  $OCF_NOT_RUNNING ]; then
391384
+	if [ $? -eq $OCF_NOT_RUNNING ]; then
391384
 		remove_container
391384
 		return $OCF_SUCCESS
391384
+        elif [ $? -eq $OCF_ERR_GENERIC ]; then
391384
+               return $OCF_ERR_GENERIC
391384
 	fi
391384
 
391384
 	if [ -n "$OCF_RESKEY_CRM_meta_timeout" ]; then