From 97eff6cf6c9b58f8239b28be2f080e23c9da62c0 Mon Sep 17 00:00:00 2001
From: Ulrich Obergfell <uobergfe@redhat.com>
Date: Thu, 11 Jul 2019 14:33:55 +0200
Subject: [PATCH] fix deadlock in restore() function

When containerd starts up, the restore() function walks through the directory
hierarchy under /run/docker/libcontainerd/containerd and imports the state of
processes from files in /run/docker/libcontainerd/containerd/CONTAINERID and
in /run/docker/libcontainerd/containerd/CONTAINERID/PROCESSID. The restore()
function adds an ExitTask entry to the s.tasks queue for each process that is
no longer in state 'running'. The size of the s.tasks queue is hard-coded and
limited to 2048 (defaultBufferSize). If more than 2048 ExitTask entries need
to be added to the queue, the restore() function blocks (the queue is full).
If this happens, containerd is deadlocked: the handleTask() function, which
would drain the ExitTask entries from the queue, has not been started in a
separate goroutine yet, and the main goroutine is blocked in the restore()
function, unable to start the handleTask() function.
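
To make the failure mode concrete, here is a minimal, runnable Go sketch of
the same shape (illustrative only, not containerd code; the channel name and
the buffer size of 2 stand in for s.tasks and defaultBufferSize):

  package main

  func main() {
      tasks := make(chan int, 2) // stands in for s.tasks (defaultBufferSize)

      // restore(): enqueue one task per exited process. The third send
      // blocks forever: the buffer is full and no reader exists yet, so
      // the Go runtime reports "all goroutines are asleep - deadlock!".
      for i := 0; i < 3; i++ {
          tasks <- i
      }

      // Never reached: the consumer that would drain the queue.
      go func() {
          for range tasks {
          }
      }()
  }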

This patch introduces the dynamically allocated restoreExitTasks slice which
the restore() function uses to store the ExitTask entries separately instead
of adding them to the s.tasks queue. The task handler goroutine subsequently
drains all entries from restoreExitTasks before it enters the loop that
handles entries from the s.tasks queue. The Events() function is also changed
so that a consumer requesting a replay of stored events receives a channel
large enough to hold the entire event log (via the new max() helper), so that
replaying past events cannot block either.
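
As a standalone illustration of the pattern the patch adopts (again a sketch,
not the actual supervisor code): restore-time work is staged in a slice that
can grow without blocking, and the single handler goroutine drains that slice
before entering its normal channel loop.

  package main

  import "fmt"

  func main() {
      tasks := make(chan int, 2) // bounded live-task queue
      var restored []int         // grows as needed; appending never blocks

      // restore(): append instead of sending on the bounded channel.
      for i := 0; i < 3; i++ {
          restored = append(restored, i)
      }

      done := make(chan struct{})
      go func() {
          for _, t := range restored { // drain restore-time tasks first
              fmt.Println("restored task", t)
          }
          for t := range tasks { // then handle live tasks as usual
              fmt.Println("live task", t)
          }
          close(done)
      }()

      tasks <- 99
      close(tasks)
      <-done
  }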

Signed-off-by: Ulrich Obergfell <uobergfe@redhat.com>
---
 supervisor/supervisor.go | 35 +++++++++++++++++++++++++++++++++--
 1 file changed, 33 insertions(+), 2 deletions(-)

diff --git a/supervisor/supervisor.go b/supervisor/supervisor.go
index 8a26af0..d92de8a 100644
--- a/supervisor/supervisor.go
+++ b/supervisor/supervisor.go
@@ -18,6 +18,16 @@ const (
 	defaultBufferSize = 2048 // size of queue in eventloop
 )
 
+// Pointers to all ExitTask that are created by the restore() function are stored in this slice.
+var restoreExitTasks []*ExitTask
+
+func max(x, y int) int {
+	if x < y {
+		return y
+	}
+	return x
+}
+
 // New returns an initialized Process supervisor.
 func New(stateDir string, runtimeName, shimName string, runtimeArgs []string, timeout time.Duration, retainCount int) (*Supervisor, error) {
 	startTasks := make(chan *startTask, 10)
@@ -207,7 +217,9 @@ type eventV1 struct {
 // Events returns an event channel that external consumers can use to receive updates
 // on container events
 func (s *Supervisor) Events(from time.Time, storedOnly bool, id string) chan Event {
-	c := make(chan Event, defaultBufferSize)
+	var c chan Event
+
+	c = make(chan Event, defaultBufferSize)
 	if storedOnly {
 		defer s.Unsubscribe(c)
 	}
@@ -216,6 +228,9 @@ func (s *Supervisor) Events(from time.Time, storedOnly bool, id string) chan Eve
 	if !from.IsZero() {
 		// replay old event
 		s.eventLock.Lock()
+		close(c)
+		// Allocate a channel that has enough space for the entire event log.
+		c = make(chan Event, max(defaultBufferSize, len(s.eventLog)))
 		past := s.eventLog[:]
 		s.eventLock.Unlock()
 		for _, e := range past {
@@ -276,6 +291,21 @@ func (s *Supervisor) Start() error {
 		"cpus":        s.machine.Cpus,
 	}).Debug("containerd: supervisor running")
 	go func() {
+		if (len(restoreExitTasks) > 0) {
+			logrus.Infof("containerd: found %d exited processes after restart", len(restoreExitTasks))
+			//
+			// If the restore() function stored any ExitTask in the dedicated slice,
+			// then handle those tasks first. The purpose of the one second delay is
+			// to give dockerd a chance to establish its event stream connection to
+			// containerd. If the connection is established before the ExitTask are
+			// being handled, then dockerd can receive exit notifications directly
+			// (rather than having to replay the notifications from the event log).
+			//
+			time.Sleep(time.Second)
+			for _, e := range restoreExitTasks {
+				s.handleTask(e)
+			}
+		}
 		for i := range s.tasks {
 			s.handleTask(i)
 		}
@@ -385,7 +415,8 @@ func (s *Supervisor) restore() error {
 					Process: p,
 				}
 				e.WithContext(context.Background())
-				s.SendTask(e)
+				// Store pointer to ExitTask in dedicated slice.
+				restoreExitTasks = append(restoreExitTasks, e)
 			}
 		}
 	}