|
|
0b7d0e |
From 38447895a54daf52e9ec7670401554ae921a96b3 Mon Sep 17 00:00:00 2001
|
|
|
0b7d0e |
From: Kir Kolyshkin <kolyshkin@gmail.com>
|
|
|
0b7d0e |
Date: Tue, 29 Sep 2020 17:18:29 -0700
|
|
|
0b7d0e |
Subject: [PATCH] libct/cgroups/systemd: eliminate runc/systemd race
|
|
|
0b7d0e |
|
|
|
0b7d0e |
In case it takes more than 1 second for systemd to create a unit,
|
|
|
0b7d0e |
startUnit() times out with a warning and then runc proceeds
|
|
|
0b7d0e |
(to create cgroups using fs manager and so on).
|
|
|
0b7d0e |
|
|
|
0b7d0e |
Now runc and systemd are racing, and multiple scenarios are possible.
|
|
|
0b7d0e |
|
|
|
0b7d0e |
In one such scenario, by the time runc calls systemd manager's Apply()
|
|
|
0b7d0e |
the unit is not yet created, so the dbusConnection.SetUnitProperties()
|
|
|
0b7d0e |
call fails with "unit xxx.scope not found", and the whole container
|
|
|
0b7d0e |
start also fails.
|
|
|
0b7d0e |
|
|
|
0b7d0e |
To eliminate the race, we need to return an error in case the timeout is
|
|
|
0b7d0e |
hit.
|
|
|
0b7d0e |
|
|
|
0b7d0e |
To reduce the chance of failure, increase the timeout from 1 to 30 seconds,
|
|
|
0b7d0e |
to not error out too early on a busy/slow system (and times like 3-5
|
|
|
0b7d0e |
seconds are not unrealistic).
|
|
|
0b7d0e |
|
|
|
0b7d0e |
While at it, as the timeout is quite long now, make sure to not leave
|
|
|
0b7d0e |
a stray timer.
|
|
|
0b7d0e |
|
|
|
0b7d0e |
Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
|
|
|
0b7d0e |
---
|
|
|
0b7d0e |
libcontainer/cgroups/systemd/common.go | 8 ++++++--
|
|
|
0b7d0e |
1 file changed, 6 insertions(+), 2 deletions(-)
|
|
|
0b7d0e |
|
|
|
0b7d0e |
diff --git a/libcontainer/cgroups/systemd/common.go b/libcontainer/cgroups/systemd/common.go
|
|
|
0b7d0e |
index b567f3e1fc..3f18f7cd0b 100644
|
|
|
0b7d0e |
--- a/libcontainer/cgroups/systemd/common.go
|
|
|
0b7d0e |
+++ b/libcontainer/cgroups/systemd/common.go
|
|
|
0b7d0e |
@@ -325,6 +325,9 @@ func isUnitExists(err error) bool {
|
|
|
0b7d0e |
func startUnit(dbusConnection *systemdDbus.Conn, unitName string, properties []systemdDbus.Property) error {
|
|
|
0b7d0e |
statusChan := make(chan string, 1)
|
|
|
0b7d0e |
if _, err := dbusConnection.StartTransientUnit(unitName, "replace", properties, statusChan); err == nil {
|
|
|
0b7d0e |
+ timeout := time.NewTimer(30 * time.Second)
|
|
|
0b7d0e |
+ defer timeout.Stop()
|
|
|
0b7d0e |
+
|
|
|
0b7d0e |
select {
|
|
|
0b7d0e |
case s := <-statusChan:
|
|
|
0b7d0e |
close(statusChan)
|
|
|
0b7d0e |
@@ -333,8 +336,9 @@ func startUnit(dbusConnection *systemdDbus.Conn, unitName string, properties []s
|
|
|
0b7d0e |
dbusConnection.ResetFailedUnit(unitName)
|
|
|
0b7d0e |
return errors.Errorf("error creating systemd unit `%s`: got `%s`", unitName, s)
|
|
|
0b7d0e |
}
|
|
|
0b7d0e |
- case <-time.After(time.Second):
|
|
|
0b7d0e |
- logrus.Warnf("Timed out while waiting for StartTransientUnit(%s) completion signal from dbus. Continuing...", unitName)
|
|
|
0b7d0e |
+ case <-timeout.C:
|
|
|
0b7d0e |
+ dbusConnection.ResetFailedUnit(unitName)
|
|
|
0b7d0e |
+ return errors.New("Timeout waiting for systemd to create " + unitName)
|
|
|
0b7d0e |
}
|
|
|
0b7d0e |
} else if !isUnitExists(err) {
|
|
|
0b7d0e |
return err
|