0331be
From 38447895a54daf52e9ec7670401554ae921a96b3 Mon Sep 17 00:00:00 2001
0331be
From: Kir Kolyshkin <kolyshkin@gmail.com>
0331be
Date: Tue, 29 Sep 2020 17:18:29 -0700
0331be
Subject: [PATCH] libct/cgroups/systemd: eliminate runc/systemd race
0331be
0331be
In case it takes more than 1 second for systemd to create a unit,
0331be
startUnit() times out with a warning and then runc proceeds
0331be
(to create cgroups using fs manager and so on).
0331be
0331be
Now runc and systemd are racing, and multiple scenarios are possible.
0331be
0331be
In one such scenario, by the time runc calls the systemd manager's Apply(),
0331be
the unit is not yet created, the dbusConnection.SetUnitProperties()
0331be
call fails with "unit xxx.scope not found", and the whole container
0331be
start also fails.
0331be
0331be
To eliminate the race, we need to return an error in case the timeout is
0331be
hit.
0331be
0331be
To reduce the chance of failure, increase the timeout from 1 to 30 seconds,
0331be
to not error out too early on a busy/slow system (and times like 3-5
0331be
seconds are not unrealistic).
0331be
0331be
While at it, since the timeout is now quite long, make sure not to leave
0331be
a stray timer.
0331be
0331be
Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
0331be
---
0331be
 libcontainer/cgroups/systemd/common.go | 8 ++++++--
0331be
 1 file changed, 6 insertions(+), 2 deletions(-)
0331be
0331be
diff --git a/libcontainer/cgroups/systemd/common.go b/libcontainer/cgroups/systemd/common.go
0331be
index b567f3e1fc..3f18f7cd0b 100644
0331be
--- a/libcontainer/cgroups/systemd/common.go
0331be
+++ b/libcontainer/cgroups/systemd/common.go
0331be
@@ -325,6 +325,9 @@ func isUnitExists(err error) bool {
0331be
 func startUnit(dbusConnection *systemdDbus.Conn, unitName string, properties []systemdDbus.Property) error {
0331be
 	statusChan := make(chan string, 1)
0331be
 	if _, err := dbusConnection.StartTransientUnit(unitName, "replace", properties, statusChan); err == nil {
0331be
+		timeout := time.NewTimer(30 * time.Second)
0331be
+		defer timeout.Stop()
0331be
+
0331be
 		select {
0331be
 		case s := <-statusChan:
0331be
 			close(statusChan)
0331be
@@ -333,8 +336,9 @@ func startUnit(dbusConnection *systemdDbus.Conn, unitName string, properties []s
0331be
 				dbusConnection.ResetFailedUnit(unitName)
0331be
 				return errors.Errorf("error creating systemd unit `%s`: got `%s`", unitName, s)
0331be
 			}
0331be
-		case <-time.After(time.Second):
0331be
-			logrus.Warnf("Timed out while waiting for StartTransientUnit(%s) completion signal from dbus. Continuing...", unitName)
0331be
+		case <-timeout.C:
0331be
+			dbusConnection.ResetFailedUnit(unitName)
0331be
+			return errors.New("Timeout waiting for systemd to create " + unitName)
0331be
 		}
0331be
 	} else if !isUnitExists(err) {
0331be
 		return err