|
|
391384 |
From b7ae1bfbe7d393a9e9d993da3da35cc4d1bc9eb4 Mon Sep 17 00:00:00 2001
|
|
|
391384 |
From: Michele Baldessari <michele@acksyn.org>
|
|
|
391384 |
Date: Wed, 28 Aug 2019 10:46:36 +0200
|
|
|
391384 |
Subject: [PATCH] Make the check for the docker daemon being up more robust
|
|
|
391384 |
|
|
|
391384 |
This amends 5941b98140b09e39b4dc2ee155817b287ef32859 (Fails docker RA
|
|
|
391384 |
gracefully when command not found). That commit checked for a pidfile
|
|
|
391384 |
which tends to be less robust in the presence of stale pidfiles and
|
|
|
391384 |
also adds a configuration option for the pidfile location which is
|
|
|
391384 |
more churn than needed to simply check for a service availability.
|
|
|
391384 |
|
|
|
391384 |
Let's simply call 'docker version'. When that command returns 1, the docker
|
|
|
391384 |
daemon is not running, and we return OCF_ERR_GENERIC instead of
|
|
|
391384 |
OCF_NOT_RUNNING. This is a key point because if the docker daemon
|
|
|
391384 |
is stopped and not running it can very well be that the containers
|
|
|
391384 |
are still up (e.g. when you use live-restore in docker). In this
|
|
|
391384 |
situation we want an explicit fence event to be triggered due to
|
|
|
391384 |
the failure of stopping.
|
|
|
391384 |
|
|
|
391384 |
Not doing so would mean that the stop operation returned ok and
|
|
|
391384 |
for example we'd be starting an A/P resource on a second node all
|
|
|
391384 |
the while it was still running on the node where the docker daemon
|
|
|
391384 |
was stopped.
|
|
|
391384 |
|
|
|
391384 |
We also explicitly catch OCF_ERR_GENERIC in the docker_stop function
|
|
|
391384 |
to make our intent clearer.
|
|
|
391384 |
|
|
|
391384 |
Tested this in an Openstack deployment and observed the following:
|
|
|
391384 |
A) All the usual pcmk operations still correctly work
|
|
|
391384 |
B) A 'systemctl stop docker' will eventually trigger a fence operation
|
|
|
391384 |
on the node.
|
|
|
391384 |
|
|
|
391384 |
Co-Authored-By: Luca Miccini <lmiccini@redhat.com>
|
|
|
391384 |
Co-Authored-By: Damien Ciabrini <dciabrin@redhat.com>
|
|
|
391384 |
Signed-off-by: Michele Baldessari <michele@acksyn.org>
|
|
|
391384 |
---
|
|
|
391384 |
heartbeat/docker | 25 ++++++++++---------------
|
|
|
391384 |
1 file changed, 10 insertions(+), 15 deletions(-)
|
|
|
391384 |
|
|
|
391384 |
diff --git a/heartbeat/docker b/heartbeat/docker
|
|
|
391384 |
index 60e163bda..7c587b962 100755
|
|
|
391384 |
--- a/heartbeat/docker
|
|
|
391384 |
+++ b/heartbeat/docker
|
|
|
391384 |
@@ -35,9 +35,6 @@
|
|
|
391384 |
|
|
|
391384 |
# Parameter defaults
|
|
|
391384 |
|
|
|
391384 |
-OCF_RESKEY_daemon_pidfile_default="/var/run/docker.pid"
|
|
|
391384 |
-: ${OCF_RESKEY_daemon_pidfile=${OCF_RESKEY_daemon_pidfile_default}}
|
|
|
391384 |
-
|
|
|
391384 |
#######################################################################
|
|
|
391384 |
|
|
|
391384 |
meta_data()
|
|
|
391384 |
@@ -184,15 +182,6 @@ container to be considered healthy.
|
|
|
391384 |
<content type="boolean"/>
|
|
|
391384 |
</parameter>
|
|
|
391384 |
|
|
|
391384 |
-<parameter name="daemon_pidfile" required="0" unique="0">
|
|
|
391384 |
-<longdesc lang="en">
|
|
|
391384 |
-The RA will report not running status on hosts where the docker daemon
|
|
|
391384 |
-is not running.
|
|
|
391384 |
-</longdesc>
|
|
|
391384 |
-<shortdesc lang="en">Name of the docker daemon pid file</shortdesc>
|
|
|
391384 |
-<content type="string" default="${OCF_RESKEY_daemon_pidfile_default}"/>
|
|
|
391384 |
-</parameter>
|
|
|
391384 |
-
|
|
|
391384 |
</parameters>
|
|
|
391384 |
|
|
|
391384 |
<actions>
|
|
|
391384 |
@@ -299,9 +288,13 @@ docker_simple_status()
|
|
|
391384 |
return $OCF_ERR_INSTALLED
|
|
|
391384 |
fi
|
|
|
391384 |
|
|
|
391384 |
- if [ ! -e "$OCF_RESKEY_daemon_pidfile" ]; then
|
|
|
391384 |
- ocf_log err "docker daemon is not running, pid file $OCF_RESKEY_daemon_pidfile not exists"
|
|
|
391384 |
- return $OCF_NOT_RUNNING
|
|
|
391384 |
+
|
|
|
391384 |
+ # let's first check if the daemon is up and running.
|
|
|
391384 |
+ VERSION_OUT=$(docker version)
|
|
|
391384 |
+ version_ret=$?
|
|
|
391384 |
+ if [ $version_ret -eq 1 ]; then
|
|
|
391384 |
+ ocf_exit_reason "Docker service is in error state while checking for ${CONTAINER}, based on image, ${OCF_RESKEY_image}: ${VERSION_OUT}"
|
|
|
391384 |
+ return $OCF_ERR_GENERIC
|
|
|
391384 |
fi
|
|
|
391384 |
|
|
|
391384 |
container_exists
|
|
|
391384 |
@@ -457,9 +450,11 @@ docker_stop()
|
|
|
391384 |
{
|
|
|
391384 |
local timeout=60
|
|
|
391384 |
docker_simple_status
|
|
|
391384 |
- if [ $? -eq $OCF_NOT_RUNNING ]; then
|
|
|
391384 |
+ if [ $? -eq $OCF_NOT_RUNNING ]; then
|
|
|
391384 |
remove_container
|
|
|
391384 |
return $OCF_SUCCESS
|
|
|
391384 |
+ elif [ $? -eq $OCF_ERR_GENERIC ]; then
|
|
|
391384 |
+ return $OCF_ERR_GENERIC
|
|
|
391384 |
fi
|
|
|
391384 |
|
|
|
391384 |
if [ -n "$OCF_RESKEY_CRM_meta_timeout" ]; then
|