|
|
bafc1b |
From b7ae1bfbe7d393a9e9d993da3da35cc4d1bc9eb4 Mon Sep 17 00:00:00 2001
|
|
|
bafc1b |
From: Michele Baldessari <michele@acksyn.org>
|
|
|
bafc1b |
Date: Wed, 28 Aug 2019 10:46:36 +0200
|
|
|
bafc1b |
Subject: [PATCH] Make the check for the docker daemon being up more robust
|
|
|
bafc1b |
|
|
|
bafc1b |
This amends 5941b98140b09e39b4dc2ee155817b287ef32859 (Fails docker RA
|
|
|
bafc1b |
gracefully when command not found). That commit checked for a pidfile
|
|
|
bafc1b |
which tends to be less robust in the presence of stale pidfiles and
|
|
|
bafc1b |
also adds a configuration option for the pidfile location which is
|
|
|
bafc1b |
more churn than needed to simply check for a service availability.
|
|
|
bafc1b |
|
|
|
bafc1b |
Let's simply call 'docker version'. When that command returns 1 the docker
|
|
|
bafc1b |
daemon is not running, and we return OCF_ERR_GENERIC instead of
|
|
|
bafc1b |
OCF_NOT_RUNNING. This is a key point because if the docker daemon
|
|
|
bafc1b |
is stopped and not running it can very well be that the containers
|
|
|
bafc1b |
are still up (e.g. when you use live-restore in docker). In this
|
|
|
bafc1b |
situation we want an explicit fence event to be triggered due to
|
|
|
bafc1b |
the failure of stopping.
|
|
|
bafc1b |
|
|
|
bafc1b |
Not doing so would mean that the stop operation returned ok and
|
|
|
bafc1b |
for example we'd be starting an A/P resource on a second node all
|
|
|
bafc1b |
the while it was still running on the node where the docker daemon
|
|
|
bafc1b |
was stopped.
|
|
|
bafc1b |
|
|
|
bafc1b |
We also explicitly catch OCF_ERR_GENERIC in the docker_stop function
|
|
|
bafc1b |
to make our intent clearer.
|
|
|
bafc1b |
|
|
|
bafc1b |
Tested this in an Openstack deployment and observed the following:
|
|
|
bafc1b |
A) All the usual pcmk operations still correctly work
|
|
|
bafc1b |
B) A 'systemctl stop docker' will eventually trigger a fence operation
|
|
|
bafc1b |
on the node.
|
|
|
bafc1b |
|
|
|
bafc1b |
Co-Authored-By: Luca Miccini <lmiccini@redhat.com>
|
|
|
bafc1b |
Co-Authored-By: Damien Ciabrini <dciabrin@redhat.com>
|
|
|
bafc1b |
Signed-off-by: Michele Baldessari <michele@acksyn.org>
|
|
|
bafc1b |
---
|
|
|
bafc1b |
heartbeat/docker | 25 ++++++++++---------------
|
|
|
bafc1b |
1 file changed, 10 insertions(+), 15 deletions(-)
|
|
|
bafc1b |
|
|
|
bafc1b |
diff --git a/heartbeat/docker b/heartbeat/docker
|
|
|
bafc1b |
index 60e163bda..7c587b962 100755
|
|
|
bafc1b |
--- a/heartbeat/docker
|
|
|
bafc1b |
+++ b/heartbeat/docker
|
|
|
bafc1b |
@@ -35,9 +35,6 @@
|
|
|
bafc1b |
|
|
|
bafc1b |
# Parameter defaults
|
|
|
bafc1b |
|
|
|
bafc1b |
-OCF_RESKEY_daemon_pidfile_default="/var/run/docker.pid"
|
|
|
bafc1b |
-: ${OCF_RESKEY_daemon_pidfile=${OCF_RESKEY_daemon_pidfile_default}}
|
|
|
bafc1b |
-
|
|
|
bafc1b |
#######################################################################
|
|
|
bafc1b |
|
|
|
bafc1b |
meta_data()
|
|
|
bafc1b |
@@ -184,15 +182,6 @@ container to be considered healthy.
|
|
|
bafc1b |
<content type="boolean"/>
|
|
|
bafc1b |
</parameter>
|
|
|
bafc1b |
|
|
|
bafc1b |
-<parameter name="daemon_pidfile" required="0" unique="0">
|
|
|
bafc1b |
-<longdesc lang="en">
|
|
|
bafc1b |
-The RA will report not running status on hosts where the docker daemon
|
|
|
bafc1b |
-is not running.
|
|
|
bafc1b |
-</longdesc>
|
|
|
bafc1b |
-<shortdesc lang="en">Name of the docker daemon pid file</shortdesc>
|
|
|
bafc1b |
-<content type="string" default="${OCF_RESKEY_daemon_pidfile_default}"/>
|
|
|
bafc1b |
-</parameter>
|
|
|
bafc1b |
-
|
|
|
bafc1b |
</parameters>
|
|
|
bafc1b |
|
|
|
bafc1b |
<actions>
|
|
|
bafc1b |
@@ -299,9 +288,13 @@ docker_simple_status()
|
|
|
bafc1b |
return $OCF_ERR_INSTALLED
|
|
|
bafc1b |
fi
|
|
|
bafc1b |
|
|
|
bafc1b |
- if [ ! -e "$OCF_RESKEY_daemon_pidfile" ]; then
|
|
|
bafc1b |
- ocf_log err "docker daemon is not running, pid file $OCF_RESKEY_daemon_pidfile not exists"
|
|
|
bafc1b |
- return $OCF_NOT_RUNNING
|
|
|
bafc1b |
+
|
|
|
bafc1b |
+ # let's first check if the daemon is up and running.
|
|
|
bafc1b |
+ VERSION_OUT=$(docker version)
|
|
|
bafc1b |
+ version_ret=$?
|
|
|
bafc1b |
+ if [ $version_ret -eq 1 ]; then
|
|
|
bafc1b |
+ ocf_exit_reason "Docker service is in error state while checking for ${CONTAINER}, based on image, ${OCF_RESKEY_image}: ${VERSION_OUT}"
|
|
|
bafc1b |
+ return $OCF_ERR_GENERIC
|
|
|
bafc1b |
fi
|
|
|
bafc1b |
|
|
|
bafc1b |
container_exists
|
|
|
bafc1b |
@@ -457,9 +450,11 @@ docker_stop()
|
|
|
bafc1b |
{
|
|
|
bafc1b |
local timeout=60
|
|
|
bafc1b |
docker_simple_status
|
|
|
bafc1b |
- if [ $? -eq $OCF_NOT_RUNNING ]; then
|
|
|
bafc1b |
+ if [ $? -eq $OCF_NOT_RUNNING ]; then
|
|
|
bafc1b |
remove_container
|
|
|
bafc1b |
return $OCF_SUCCESS
|
|
|
bafc1b |
+ elif [ $? -eq $OCF_ERR_GENERIC ]; then
|
|
|
bafc1b |
+ return $OCF_ERR_GENERIC
|
|
|
bafc1b |
fi
|
|
|
bafc1b |
|
|
|
bafc1b |
if [ -n "$OCF_RESKEY_CRM_meta_timeout" ]; then
|