Blob Blame History Raw
From 97eff6cf6c9b58f8239b28be2f080e23c9da62c0 Mon Sep 17 00:00:00 2001
From: Ulrich Obergfell <uobergfe@redhat.com>
Date: Thu, 11 Jul 2019 14:33:55 +0200
Subject: [PATCH] fix deadlock in restore() function

When containerd starts up, the restore() function walks through the directory
hierarchy under /run/docker/libcontainerd/containerd and imports the state of
processes from files in /run/docker/libcontainerd/containerd/CONTAINERID and
in /run/docker/libcontainerd/containerd/CONTAINERID/PROCESSID. The restore()
function adds an ExitTask entry to the s.tasks queue for each process that is
no longer in state 'running'. The size of the s.tasks queue is hard-coded and
limited to 2048 (defaultBufferSize). If more than 2048 ExitTask entries need
to be added to the queue, the restore() function gets blocked (queue is full).
If this happens, containerd is in a kind of deadlock situation because the
handleTask() function (which would drain the ExitTask entries from the queue)
has not been started in a separate goroutine yet, and the main goroutine is
blocked in the restore() function (unable to start the handleTask() function).

This patch introduces the dynamically allocated restoreExitTasks slice which
the restore() function uses to store the ExitTask entries separately instead
of adding them to the s.tasks queue. The task handler goroutine subsequently
drains all entries from restoreExitTasks frist before it enters the loop that
handles entries from the s.tasks queue.

Signed-off-by: Ulrich Obergfell <uobergfe@redhat.com>
---
 supervisor/supervisor.go | 35 +++++++++++++++++++++++++++++++++--
 1 file changed, 33 insertions(+), 2 deletions(-)

diff --git a/supervisor/supervisor.go b/supervisor/supervisor.go
index 8a26af0..d92de8a 100644
--- a/supervisor/supervisor.go
+++ b/supervisor/supervisor.go
@@ -18,6 +18,16 @@ const (
 	defaultBufferSize = 2048 // size of queue in eventloop
 )
 
+// Pointers to all ExitTask that are created by the restore() function are stored in this slice.
+var restoreExitTasks []*ExitTask
+
+func max(x, y int) int {
+	if x < y {
+		return y
+	}
+	return x
+}
+
 // New returns an initialized Process supervisor.
 func New(stateDir string, runtimeName, shimName string, runtimeArgs []string, timeout time.Duration, retainCount int) (*Supervisor, error) {
 	startTasks := make(chan *startTask, 10)
@@ -207,7 +217,9 @@ type eventV1 struct {
 // Events returns an event channel that external consumers can use to receive updates
 // on container events
 func (s *Supervisor) Events(from time.Time, storedOnly bool, id string) chan Event {
-	c := make(chan Event, defaultBufferSize)
+	var c chan Event
+
+	c = make(chan Event, defaultBufferSize)
 	if storedOnly {
 		defer s.Unsubscribe(c)
 	}
@@ -216,6 +228,9 @@ func (s *Supervisor) Events(from time.Time, storedOnly bool, id string) chan Eve
 	if !from.IsZero() {
 		// replay old event
 		s.eventLock.Lock()
+		close(c)
+		// Allocate a channel that has enough space for the entire event log.
+		c = make(chan Event, max(defaultBufferSize, len(s.eventLog)))
 		past := s.eventLog[:]
 		s.eventLock.Unlock()
 		for _, e := range past {
@@ -276,6 +291,21 @@ func (s *Supervisor) Start() error {
 		"cpus":        s.machine.Cpus,
 	}).Debug("containerd: supervisor running")
 	go func() {
+		if (len(restoreExitTasks) > 0) {
+			logrus.Infof("containerd: found %d exited processes after restart", len(restoreExitTasks))
+			//
+			// If the restore() function stored any ExitTask in the dedicated slice,
+			// then handle those tasks first. The purpose of the one second delay is
+			// to give dockerd a chance to establish its event stream connection to
+			// containerd. If the connection is established before the ExitTask are
+			// being handled, then dockerd can receive exit notifications directly
+			// (rather than having to replay the notifications from the event log).
+			//
+			time.Sleep(time.Second)
+			for _, e := range restoreExitTasks {
+				s.handleTask(e)
+			}
+		}
 		for i := range s.tasks {
 			s.handleTask(i)
 		}
@@ -385,7 +415,8 @@ func (s *Supervisor) restore() error {
 					Process: p,
 				}
 				e.WithContext(context.Background())
-				s.SendTask(e)
+				// Store pointer to ExitTask in dedicated slice.
+				restoreExitTasks = append(restoreExitTasks, e)
 			}
 		}
 	}