0b7d0e
From 38447895a54daf52e9ec7670401554ae921a96b3 Mon Sep 17 00:00:00 2001
0b7d0e
From: Kir Kolyshkin <kolyshkin@gmail.com>
0b7d0e
Date: Tue, 29 Sep 2020 17:18:29 -0700
0b7d0e
Subject: [PATCH] libct/cgroups/systemd: eliminate runc/systemd race
0b7d0e
0b7d0e
In case it takes more than 1 second for systemd to create a unit,
0b7d0e
startUnit() times out with a warning and then runc proceeds
0b7d0e
(to create cgroups using fs manager and so on).
0b7d0e
0b7d0e
Now runc and systemd are racing, and multiple scenarios are possible.
0b7d0e
0b7d0e
In one such scenario, by the time runc calls systemd manager's Apply()
0b7d0e
the unit is not yet created, the dbusConnection.SetUnitProperties()
0b7d0e
call fails with "unit xxx.scope not found", and the whole container
0b7d0e
start also fails.
0b7d0e
0b7d0e
To eliminate the race, we need to return an error in case the timeout is
0b7d0e
hit.
0b7d0e
0b7d0e
To reduce the chance of failure, increase the timeout from 1 to 30 seconds,
0b7d0e
to not error out too early on a busy/slow system (and times like 3-5
0b7d0e
seconds are not unrealistic).
0b7d0e
0b7d0e
While at it, as the timeout is quite long now, make sure not to leave
0b7d0e
a stray timer.
0b7d0e
0b7d0e
Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
0b7d0e
---
0b7d0e
 libcontainer/cgroups/systemd/common.go | 8 ++++++--
0b7d0e
 1 file changed, 6 insertions(+), 2 deletions(-)
0b7d0e
0b7d0e
diff --git a/libcontainer/cgroups/systemd/common.go b/libcontainer/cgroups/systemd/common.go
0b7d0e
index b567f3e1fc..3f18f7cd0b 100644
0b7d0e
--- a/libcontainer/cgroups/systemd/common.go
0b7d0e
+++ b/libcontainer/cgroups/systemd/common.go
0b7d0e
@@ -325,6 +325,9 @@ func isUnitExists(err error) bool {
0b7d0e
 func startUnit(dbusConnection *systemdDbus.Conn, unitName string, properties []systemdDbus.Property) error {
0b7d0e
 	statusChan := make(chan string, 1)
0b7d0e
 	if _, err := dbusConnection.StartTransientUnit(unitName, "replace", properties, statusChan); err == nil {
0b7d0e
+		timeout := time.NewTimer(30 * time.Second)
0b7d0e
+		defer timeout.Stop()
0b7d0e
+
0b7d0e
 		select {
0b7d0e
 		case s := <-statusChan:
0b7d0e
 			close(statusChan)
0b7d0e
@@ -333,8 +336,9 @@ func startUnit(dbusConnection *systemdDbus.Conn, unitName string, properties []s
0b7d0e
 				dbusConnection.ResetFailedUnit(unitName)
0b7d0e
 				return errors.Errorf("error creating systemd unit `%s`: got `%s`", unitName, s)
0b7d0e
 			}
0b7d0e
-		case <-time.After(time.Second):
0b7d0e
-			logrus.Warnf("Timed out while waiting for StartTransientUnit(%s) completion signal from dbus. Continuing...", unitName)
0b7d0e
+		case <-timeout.C:
0b7d0e
+			dbusConnection.ResetFailedUnit(unitName)
0b7d0e
+			return errors.New("Timeout waiting for systemd to create " + unitName)
0b7d0e
 		}
0b7d0e
 	} else if !isUnitExists(err) {
0b7d0e
 		return err