|
|
b9462c |
From 97eff6cf6c9b58f8239b28be2f080e23c9da62c0 Mon Sep 17 00:00:00 2001
|
|
|
b9462c |
From: Ulrich Obergfell <uobergfe@redhat.com>
|
|
|
b9462c |
Date: Thu, 11 Jul 2019 14:33:55 +0200
|
|
|
b9462c |
Subject: [PATCH] fix deadlock in restore() function
|
|
|
b9462c |
|
|
|
b9462c |
When containerd starts up, the restore() function walks through the directory
|
|
|
b9462c |
hierarchy under /run/docker/libcontainerd/containerd and imports the state of
|
|
|
b9462c |
processes from files in /run/docker/libcontainerd/containerd/CONTAINERID and
|
|
|
b9462c |
in /run/docker/libcontainerd/containerd/CONTAINERID/PROCESSID. The restore()
|
|
|
b9462c |
function adds an ExitTask entry to the s.tasks queue for each process that is
|
|
|
b9462c |
no longer in state 'running'. The size of the s.tasks queue is hard-coded and
|
|
|
b9462c |
limited to 2048 (defaultBufferSize). If more than 2048 ExitTask entries need
|
|
|
b9462c |
to be added to the queue, the restore() function gets blocked (queue is full).
|
|
|
b9462c |
If this happens, containerd is in a kind of deadlock situation because the
|
|
|
b9462c |
handleTask() function (which would drain the ExitTask entries from the queue)
|
|
|
b9462c |
has not been started in a separate goroutine yet, and the main goroutine is
|
|
|
b9462c |
blocked in the restore() function (unable to start the handleTask() function).
|
|
|
b9462c |
|
|
|
b9462c |
This patch introduces the dynamically allocated restoreExitTasks slice which
|
|
|
b9462c |
the restore() function uses to store the ExitTask entries separately instead
|
|
|
b9462c |
of adding them to the s.tasks queue. The task handler goroutine subsequently
|
|
|
b9462c |
drains all entries from restoreExitTasks first before it enters the loop that
|
|
|
b9462c |
handles entries from the s.tasks queue.
|
|
|
b9462c |
|
|
|
b9462c |
Signed-off-by: Ulrich Obergfell <uobergfe@redhat.com>
|
|
|
b9462c |
---
|
|
|
b9462c |
supervisor/supervisor.go | 35 +++++++++++++++++++++++++++++++++--
|
|
|
b9462c |
1 file changed, 33 insertions(+), 2 deletions(-)
|
|
|
b9462c |
|
|
|
b9462c |
diff --git a/supervisor/supervisor.go b/supervisor/supervisor.go
|
|
|
b9462c |
index 8a26af0..d92de8a 100644
|
|
|
b9462c |
--- a/supervisor/supervisor.go
|
|
|
b9462c |
+++ b/supervisor/supervisor.go
|
|
|
b9462c |
@@ -18,6 +18,16 @@ const (
|
|
|
b9462c |
defaultBufferSize = 2048 // size of queue in eventloop
|
|
|
b9462c |
)
|
|
|
b9462c |
|
|
|
b9462c |
+// Pointers to all ExitTask that are created by the restore() function are stored in this slice.
|
|
|
b9462c |
+var restoreExitTasks []*ExitTask
|
|
|
b9462c |
+
|
|
|
b9462c |
+func max(x, y int) int {
|
|
|
b9462c |
+ if x < y {
|
|
|
b9462c |
+ return y
|
|
|
b9462c |
+ }
|
|
|
b9462c |
+ return x
|
|
|
b9462c |
+}
|
|
|
b9462c |
+
|
|
|
b9462c |
// New returns an initialized Process supervisor.
|
|
|
b9462c |
func New(stateDir string, runtimeName, shimName string, runtimeArgs []string, timeout time.Duration, retainCount int) (*Supervisor, error) {
|
|
|
b9462c |
startTasks := make(chan *startTask, 10)
|
|
|
b9462c |
@@ -207,7 +217,9 @@ type eventV1 struct {
|
|
|
b9462c |
// Events returns an event channel that external consumers can use to receive updates
|
|
|
b9462c |
// on container events
|
|
|
b9462c |
func (s *Supervisor) Events(from time.Time, storedOnly bool, id string) chan Event {
|
|
|
b9462c |
- c := make(chan Event, defaultBufferSize)
|
|
|
b9462c |
+ var c chan Event
|
|
|
b9462c |
+
|
|
|
b9462c |
+ c = make(chan Event, defaultBufferSize)
|
|
|
b9462c |
if storedOnly {
|
|
|
b9462c |
defer s.Unsubscribe(c)
|
|
|
b9462c |
}
|
|
|
b9462c |
@@ -216,6 +228,9 @@ func (s *Supervisor) Events(from time.Time, storedOnly bool, id string) chan Eve
|
|
|
b9462c |
if !from.IsZero() {
|
|
|
b9462c |
// replay old event
|
|
|
b9462c |
s.eventLock.Lock()
|
|
|
b9462c |
+ close(c)
|
|
|
b9462c |
+ // Allocate a channel that has enough space for the entire event log.
|
|
|
b9462c |
+ c = make(chan Event, max(defaultBufferSize, len(s.eventLog)))
|
|
|
b9462c |
past := s.eventLog[:]
|
|
|
b9462c |
s.eventLock.Unlock()
|
|
|
b9462c |
for _, e := range past {
|
|
|
b9462c |
@@ -276,6 +291,21 @@ func (s *Supervisor) Start() error {
|
|
|
b9462c |
"cpus": s.machine.Cpus,
|
|
|
b9462c |
}).Debug("containerd: supervisor running")
|
|
|
b9462c |
go func() {
|
|
|
b9462c |
+ if (len(restoreExitTasks) > 0) {
|
|
|
b9462c |
+ logrus.Infof("containerd: found %d exited processes after restart", len(restoreExitTasks))
|
|
|
b9462c |
+ //
|
|
|
b9462c |
+ // If the restore() function stored any ExitTask in the dedicated slice,
|
|
|
b9462c |
+ // then handle those tasks first. The purpose of the one second delay is
|
|
|
b9462c |
+ // to give dockerd a chance to establish its event stream connection to
|
|
|
b9462c |
+ // containerd. If the connection is established before the ExitTask are
|
|
|
b9462c |
+ // being handled, then dockerd can receive exit notifications directly
|
|
|
b9462c |
+ // (rather than having to replay the notifications from the event log).
|
|
|
b9462c |
+ //
|
|
|
b9462c |
+ time.Sleep(time.Second)
|
|
|
b9462c |
+ for _, e := range restoreExitTasks {
|
|
|
b9462c |
+ s.handleTask(e)
|
|
|
b9462c |
+ }
|
|
|
b9462c |
+ }
|
|
|
b9462c |
for i := range s.tasks {
|
|
|
b9462c |
s.handleTask(i)
|
|
|
b9462c |
}
|
|
|
b9462c |
@@ -385,7 +415,8 @@ func (s *Supervisor) restore() error {
|
|
|
b9462c |
Process: p,
|
|
|
b9462c |
}
|
|
|
b9462c |
e.WithContext(context.Background())
|
|
|
b9462c |
- s.SendTask(e)
|
|
|
b9462c |
+ // Store pointer to ExitTask in dedicated slice.
|
|
|
b9462c |
+ restoreExitTasks = append(restoreExitTasks, e)
|
|
|
b9462c |
}
|
|
|
b9462c |
}
|
|
|
b9462c |
}
|