|
|
fea0d9 |
From 7850aea1600389beb16c7aad40bba1b76ae694c4 Mon Sep 17 00:00:00 2001
|
|
|
fea0d9 |
From: Damien Ciabrini <dciabrin@redhat.com>
|
|
|
fea0d9 |
Date: Tue, 15 Jun 2021 20:03:20 +0200
|
|
|
fea0d9 |
Subject: [PATCH] podman: work around race during container creation
|
|
|
fea0d9 |
|
|
|
fea0d9 |
podman and the OCI runtime have a race that sometimes causes
|
|
|
fea0d9 |
a container to fail to be created and run [1] if the
|
|
|
fea0d9 |
cgroup to be used is not available yet. When that happens,
|
|
|
fea0d9 |
try to recreate the container until it succeeds or the start
|
|
|
fea0d9 |
timeout is reached.
|
|
|
fea0d9 |
|
|
|
fea0d9 |
[1] https://bugzilla.redhat.com/show_bug.cgi?id=1972209
|
|
|
fea0d9 |
---
|
|
|
fea0d9 |
heartbeat/podman | 32 ++++++++++++++++++++++++++++++--
|
|
|
fea0d9 |
1 file changed, 30 insertions(+), 2 deletions(-)
|
|
|
fea0d9 |
|
|
|
fea0d9 |
diff --git a/heartbeat/podman b/heartbeat/podman
|
|
|
fea0d9 |
index 5b707f3f5..034dfff76 100755
|
|
|
fea0d9 |
--- a/heartbeat/podman
|
|
|
fea0d9 |
+++ b/heartbeat/podman
|
|
|
fea0d9 |
@@ -358,8 +358,18 @@ run_new_container()
|
|
|
fea0d9 |
local rc
|
|
|
fea0d9 |
|
|
|
fea0d9 |
ocf_log info "running container $CONTAINER for the first time"
|
|
|
fea0d9 |
- ocf_run podman run $opts $image $cmd
|
|
|
fea0d9 |
+ out=$(podman run $opts $image $cmd 2>&1)
|
|
|
fea0d9 |
rc=$?
|
|
|
fea0d9 |
+
|
|
|
fea0d9 |
+ if [ -n "$out" ]; then
|
|
|
fea0d9 |
+ out="$(echo "$out" | tr -s ' \t\r\n' ' ')"
|
|
|
fea0d9 |
+ if [ $rc -eq 0 ]; then
|
|
|
fea0d9 |
+ ocf_log info "$out"
|
|
|
fea0d9 |
+ else
|
|
|
fea0d9 |
+ ocf_log err "$out"
|
|
|
fea0d9 |
+ fi
|
|
|
fea0d9 |
+ fi
|
|
|
fea0d9 |
+
|
|
|
fea0d9 |
if [ $rc -eq 125 ]; then
|
|
|
fea0d9 |
# If an internal podman error occurred, it might be because
|
|
|
fea0d9 |
# the internal storage layer still references an old container
|
|
|
fea0d9 |
@@ -370,6 +380,24 @@ run_new_container()
|
|
|
fea0d9 |
ocf_run podman rm --storage $CONTAINER
|
|
|
fea0d9 |
ocf_run podman run $opts $image $cmd
|
|
|
fea0d9 |
rc=$?
|
|
|
fea0d9 |
+ elif [ $rc -eq 127 ]; then
|
|
|
fea0d9 |
+ # rhbz#1972209: podman 3.0.x seems to be hit by a race
|
|
|
fea0d9 |
+ # where the cgroup is not yet set up properly when the OCI
|
|
|
fea0d9 |
+ # runtime configures the container. If that happens, recreate
|
|
|
fea0d9 |
+ # the container as long as we get the same error code or
|
|
|
fea0d9 |
+ # until start timeout preempts us.
|
|
|
fea0d9 |
+ while [ $rc -eq 127 ] && (echo "$out" | grep -q "cgroup.*scope not found") ; do
|
|
|
fea0d9 |
+ ocf_log warn "Internal podman error while assigning cgroup. Retrying."
|
|
|
fea0d9 |
+ # Arbitrary sleep to prevent consuming all CPU while looping
|
|
|
fea0d9 |
+ sleep 1
|
|
|
fea0d9 |
+ podman rm -f "$CONTAINER"
|
|
|
fea0d9 |
+ out=$(podman run $opts $image $cmd 2>&1)
|
|
|
fea0d9 |
+ rc=$?
|
|
|
fea0d9 |
+ done
|
|
|
fea0d9 |
+ # Log the created container ID if it succeeded
|
|
|
fea0d9 |
+ if [ $rc -eq 0 ]; then
|
|
|
fea0d9 |
+ ocf_log info "$out"
|
|
|
fea0d9 |
+ fi
|
|
|
fea0d9 |
fi
|
|
|
fea0d9 |
|
|
|
fea0d9 |
return $rc
|
|
|
fea0d9 |
@@ -422,7 +450,7 @@ podman_start()
|
|
|
fea0d9 |
fi
|
|
|
fea0d9 |
|
|
|
fea0d9 |
if [ $rc -ne 0 ]; then
|
|
|
fea0d9 |
- ocf_exit_reason "podman failed to launch container"
|
|
|
fea0d9 |
+ ocf_exit_reason "podman failed to launch container (rc: $rc)"
|
|
|
fea0d9 |
return $OCF_ERR_GENERIC
|
|
|
fea0d9 |
fi
|
|
|
fea0d9 |
|