|
|
030326 |
From 3aa0dda4e0c2a3b801d65aeacc4fdfd713a604f2 Mon Sep 17 00:00:00 2001
|
|
|
030326 |
From: Damien Ciabrini <damien.ciabrini@gmail.com>
|
|
|
030326 |
Date: Tue, 27 Oct 2020 18:01:36 +0100
|
|
|
030326 |
Subject: [PATCH] podman: recover from killed conmon side process
|
|
|
030326 |
|
|
|
030326 |
When podman containers are created by the resource-agent, the podman
|
|
|
030326 |
runtime spawns a side process (conmon) to monitor the container and
|
|
|
030326 |
record the exit status.
|
|
|
030326 |
|
|
|
030326 |
If the conmon process dies unexpectedly (e.g. killed with kill -9), the podman
|
|
|
030326 |
container can still be stopped, even if the CLI returns a generic
|
|
|
030326 |
error.
|
|
|
030326 |
|
|
|
030326 |
Try to distinguish this specific failure condition and make the stop
|
|
|
030326 |
operation resilient; when it happens, just log a warning and finish
|
|
|
030326 |
the usual stop actions.
|
|
|
030326 |
---
|
|
|
030326 |
heartbeat/podman | 18 +++++++++++++++---
|
|
|
030326 |
1 file changed, 15 insertions(+), 3 deletions(-)
|
|
|
030326 |
|
|
|
030326 |
diff --git a/heartbeat/podman b/heartbeat/podman
|
|
|
030326 |
index 81b00ee6f..9f8c2a091 100755
|
|
|
030326 |
--- a/heartbeat/podman
|
|
|
030326 |
+++ b/heartbeat/podman
|
|
|
030326 |
@@ -419,6 +419,7 @@ podman_start()
|
|
|
030326 |
podman_stop()
|
|
|
030326 |
{
|
|
|
030326 |
local timeout=60
|
|
|
030326 |
+ local rc
|
|
|
030326 |
podman_simple_status
|
|
|
030326 |
if [ $? -eq $OCF_NOT_RUNNING ]; then
|
|
|
030326 |
remove_container
|
|
|
030326 |
@@ -434,16 +435,27 @@ podman_stop()
|
|
|
030326 |
|
|
|
030326 |
if ocf_is_true "$OCF_RESKEY_force_kill"; then
|
|
|
030326 |
ocf_run podman kill $CONTAINER
|
|
|
030326 |
+ rc=$?
|
|
|
030326 |
else
|
|
|
030326 |
ocf_log debug "waiting $timeout second[s] before killing container"
|
|
|
030326 |
ocf_run podman stop -t=$timeout $CONTAINER
|
|
|
030326 |
+ rc=$?
|
|
|
030326 |
# on stop, systemd will automatically delete any transient
|
|
|
030326 |
# drop-in conf that has been created earlier
|
|
|
030326 |
fi
|
|
|
030326 |
|
|
|
030326 |
- if [ $? -ne 0 ]; then
|
|
|
030326 |
- ocf_exit_reason "Failed to stop container, ${CONTAINER}, based on image, ${OCF_RESKEY_image}."
|
|
|
030326 |
- return $OCF_ERR_GENERIC
|
|
|
030326 |
+ if [ $rc -ne 0 ]; then
|
|
|
030326 |
+ # If the stop failed, it could be because the controlling conmon
|
|
|
030326 |
+ # process died unexpectedly. If so, a generic error code is returned
|
|
|
030326 |
+ # but the associated container exit code is -1. If that's the case,
|
|
|
030326 |
+ # assume there's no failure and continue with the rm as usual.
|
|
|
030326 |
+ if [ $rc -eq 125 ] && \
|
|
|
030326 |
+ podman inspect --format '{{.State.Status}}:{{.State.ExitCode}}' $CONTAINER | grep -wq "stopped:-1"; then
|
|
|
030326 |
+ ocf_log warn "Container ${CONTAINER} had an unexpected stop outcome. Trying to remove it anyway."
|
|
|
030326 |
+ else
|
|
|
030326 |
+ ocf_exit_reason "Failed to stop container, ${CONTAINER}, based on image, ${OCF_RESKEY_image}."
|
|
|
030326 |
+ return $OCF_ERR_GENERIC
|
|
|
030326 |
+ fi
|
|
|
030326 |
fi
|
|
|
030326 |
|
|
|
030326 |
remove_container
|