Blame SOURCES/bz1886262-podman-recover-from-killed-conmon.patch

184fb6
From 3aa0dda4e0c2a3b801d65aeacc4fdfd713a604f2 Mon Sep 17 00:00:00 2001
184fb6
From: Damien Ciabrini <damien.ciabrini@gmail.com>
184fb6
Date: Tue, 27 Oct 2020 18:01:36 +0100
184fb6
Subject: [PATCH] podman: recover from killed conmon side process
184fb6
184fb6
When podman containers are created by the resource-agent, the podman
184fb6
runtime spawns a side process (conmon) to monitor the container and
184fb6
record the exit status.
184fb6
184fb6
If the conmon process dies unexpectedly (e.g. kill -9), the podman
184fb6
container can still be stopped, even if the podman CLI returns a generic
184fb6
error.
184fb6
184fb6
Try to distinguish this specific failure condition and make the stop
184fb6
operation resilient; when it happens, just log a warning and finish
184fb6
the usual stop actions.
184fb6
---
184fb6
 heartbeat/podman | 18 +++++++++++++++---
184fb6
 1 file changed, 15 insertions(+), 3 deletions(-)
184fb6
184fb6
diff --git a/heartbeat/podman b/heartbeat/podman
184fb6
index 81b00ee6f..9f8c2a091 100755
184fb6
--- a/heartbeat/podman
184fb6
+++ b/heartbeat/podman
184fb6
@@ -419,6 +419,7 @@ podman_start()
184fb6
 podman_stop()
184fb6
 {
184fb6
 	local timeout=60
184fb6
+	local rc
184fb6
 	podman_simple_status
184fb6
 	if [ $? -eq  $OCF_NOT_RUNNING ]; then
184fb6
 		remove_container
184fb6
@@ -434,16 +435,27 @@ podman_stop()
184fb6
 
184fb6
 	if ocf_is_true "$OCF_RESKEY_force_kill"; then
184fb6
 		ocf_run podman kill $CONTAINER
184fb6
+		rc=$?
184fb6
 	else
184fb6
 		ocf_log debug "waiting $timeout second[s] before killing container"
184fb6
 		ocf_run podman stop -t=$timeout $CONTAINER
184fb6
+		rc=$?
184fb6
 		# on stop, systemd will automatically delete any transient
184fb6
 		# drop-in conf that has been created earlier
184fb6
 	fi
184fb6
 
184fb6
-	if [ $? -ne 0 ]; then
184fb6
-		ocf_exit_reason "Failed to stop container, ${CONTAINER}, based on image, ${OCF_RESKEY_image}."
184fb6
-		return $OCF_ERR_GENERIC
184fb6
+	if [ $rc -ne 0 ]; then
184fb6
+		# If the stop failed, it could be because the controlling conmon
184fb6
+		# process died unexpectedly. If so, a generic error code is returned
184fb6
+		# but the associated container exit code is -1. If that's the case,
184fb6
+		# assume there's no failure and continue with the rm as usual.
184fb6
+		if [ $rc -eq 125 ] && \
184fb6
+		   podman inspect --format '{{.State.Status}}:{{.State.ExitCode}}' $CONTAINER | grep -wq "stopped:-1"; then
184fb6
+			ocf_log warn "Container ${CONTAINER} had an unexpected stop outcome. Trying to remove it anyway."
184fb6
+		else
184fb6
+			ocf_exit_reason "Failed to stop container, ${CONTAINER}, based on image, ${OCF_RESKEY_image}."
184fb6
+			return $OCF_ERR_GENERIC
184fb6
+		fi
184fb6
 	fi
184fb6
 
184fb6
 	remove_container