Blob Blame History Raw
From 38447895a54daf52e9ec7670401554ae921a96b3 Mon Sep 17 00:00:00 2001
From: Kir Kolyshkin <kolyshkin@gmail.com>
Date: Tue, 29 Sep 2020 17:18:29 -0700
Subject: [PATCH] libct/cgroups/systemd: eliminate runc/systemd race

In case it takes more than 1 second for systemd to create a unit,
startUnit() times out with a warning and then runc proceeds
(to create cgroups using fs manager and so on).

Now runc and systemd are racing, and multiple scenarios are possible.

In one such scenario, by the time runc calls systemd manager's Apply()
the unit is not yet created, so the dbusConnection.SetUnitProperties()
call fails with "unit xxx.scope not found", and the whole container
start also fails.

To eliminate the race, we need to return an error in case the timeout is
hit.

To reduce the chance of failure, increase the timeout from 1 to 30
seconds, so as not to error out too early on a busy/slow system (times
like 3-5 seconds are not unrealistic).

While at it, as the timeout is quite long now, make sure to not leave
a stray timer.

Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
---
 libcontainer/cgroups/systemd/common.go | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/libcontainer/cgroups/systemd/common.go b/libcontainer/cgroups/systemd/common.go
index b567f3e1fc..3f18f7cd0b 100644
--- a/libcontainer/cgroups/systemd/common.go
+++ b/libcontainer/cgroups/systemd/common.go
@@ -325,6 +325,9 @@ func isUnitExists(err error) bool {
 func startUnit(dbusConnection *systemdDbus.Conn, unitName string, properties []systemdDbus.Property) error {
 	statusChan := make(chan string, 1)
 	if _, err := dbusConnection.StartTransientUnit(unitName, "replace", properties, statusChan); err == nil {
+		timeout := time.NewTimer(30 * time.Second)
+		defer timeout.Stop()
+
 		select {
 		case s := <-statusChan:
 			close(statusChan)
@@ -333,8 +336,9 @@ func startUnit(dbusConnection *systemdDbus.Conn, unitName string, properties []s
 				dbusConnection.ResetFailedUnit(unitName)
 				return errors.Errorf("error creating systemd unit `%s`: got `%s`", unitName, s)
 			}
-		case <-time.After(time.Second):
-			logrus.Warnf("Timed out while waiting for StartTransientUnit(%s) completion signal from dbus. Continuing...", unitName)
+		case <-timeout.C:
+			dbusConnection.ResetFailedUnit(unitName)
+			return errors.New("Timeout waiting for systemd to create " + unitName)
 		}
 	} else if !isUnitExists(err) {
 		return err