Blame SOURCES/bz1972743-podman-fix-container-creation-race.patch

fea0d9
From 7850aea1600389beb16c7aad40bba1b76ae694c4 Mon Sep 17 00:00:00 2001
fea0d9
From: Damien Ciabrini <dciabrin@redhat.com>
fea0d9
Date: Tue, 15 Jun 2021 20:03:20 +0200
fea0d9
Subject: [PATCH] podman: work around race during container creation
fea0d9
fea0d9
podman and the OCI runtime have a race that sometimes causes
fea0d9
a container to fail to be created and run [1] if the
fea0d9
cgroup to be used is not available yet. When that happens,
fea0d9
try to recreate the container until it succeeds or the start
fea0d9
timeout is reached.
fea0d9
fea0d9
[1] https://bugzilla.redhat.com/show_bug.cgi?id=1972209
fea0d9
---
fea0d9
 heartbeat/podman | 32 ++++++++++++++++++++++++++++++--
fea0d9
 1 file changed, 30 insertions(+), 2 deletions(-)
fea0d9
fea0d9
diff --git a/heartbeat/podman b/heartbeat/podman
fea0d9
index 5b707f3f5..034dfff76 100755
fea0d9
--- a/heartbeat/podman
fea0d9
+++ b/heartbeat/podman
fea0d9
@@ -358,8 +358,18 @@ run_new_container()
fea0d9
 	local rc
fea0d9
 	
fea0d9
 	ocf_log info "running container $CONTAINER for the first time"
fea0d9
-	ocf_run podman run $opts $image $cmd
fea0d9
+	out=$(podman run $opts $image $cmd 2>&1)
fea0d9
 	rc=$?
fea0d9
+
fea0d9
+	if [ -n "$out" ]; then
fea0d9
+		out="$(echo "$out" | tr -s ' \t\r\n' ' ')"
fea0d9
+		if [ $rc -eq 0 ]; then
fea0d9
+			ocf_log info "$out"
fea0d9
+		else
fea0d9
+			ocf_log err "$out"
fea0d9
+		fi
fea0d9
+	fi
fea0d9
+
fea0d9
 	if [ $rc -eq 125 ]; then
fea0d9
 		# If an internal podman error occurred, it might be because
fea0d9
 		# the internal storage layer still references an old container
fea0d9
@@ -370,6 +380,24 @@ run_new_container()
fea0d9
 		ocf_run podman rm --storage $CONTAINER
fea0d9
 		ocf_run podman run $opts $image $cmd
fea0d9
 		rc=$?
fea0d9
+	elif [ $rc -eq 127 ]; then
fea0d9
+		# rhbz#1972209: podman 3.0.x seems to be hit by a race
fea0d9
+		# where the cgroup is not yet set up properly when the OCI
fea0d9
+		# runtime configures the container. If that happens, recreate
fea0d9
+		# the container as long as we get the same error code and message or
fea0d9
+		# until start timeout preempts us.
fea0d9
+		while [ $rc -eq 127 ] && (echo "$out" | grep -q "cgroup.*scope not found") ; do
fea0d9
+			ocf_log warn "Internal podman error while assigning cgroup. Retrying."
fea0d9
+			# Arbitrary sleep to prevent consuming all CPU while looping
fea0d9
+			sleep 1
fea0d9
+			podman rm -f "$CONTAINER"
fea0d9
+			out=$(podman run $opts $image $cmd 2>&1)
fea0d9
+			rc=$?
fea0d9
+		done
fea0d9
+		# Log the created container ID if it succeeded
fea0d9
+		if  [ $rc -eq 0 ]; then
fea0d9
+			ocf_log info "$out"
fea0d9
+		fi
fea0d9
 	fi
fea0d9
 	
fea0d9
 	return $rc
fea0d9
@@ -422,7 +450,7 @@ podman_start()
fea0d9
 	fi
fea0d9
 
fea0d9
 	if [ $rc -ne 0 ]; then
fea0d9
-		ocf_exit_reason "podman failed to launch container"
fea0d9
+		ocf_exit_reason "podman failed to launch container (rc: $rc)"
fea0d9
 		return $OCF_ERR_GENERIC
fea0d9
 	fi
fea0d9