Blob Blame History Raw
From 21482202567979b8a17cc750b095272b3270ee76 Mon Sep 17 00:00:00 2001
From: Quentin Armitage <quentin@armitage.org.uk>
Date: Wed, 13 Nov 2019 10:37:38 +0000
Subject: [PATCH] Fix intermittent "child lost" messages

Issue #1364 identified that occasionally a "child lost" message could
be logged. Although keepalived continued working as expected, the
"child lost" message indicated that something wasn't working properly.

If a vrrp track script had a timeout in the script that was the
same as the script timeout configured in keepalived, when the system
was heavily loaded it was possible for the timeout to occur, followed
by the termination before the timeout thread was run, in which case
the termination would be lost because the child thread was no longer
on the child_pid queue, but on the ready queue.

This commit leaves a thread on the child_pid queue after a timeout, and
only removes it when the timeout thread is run. That means that if the
termination is received before the timeout thread is run, the thread
(now on the ready queue) can be updated to be a termination rather than
a timeout.

Signed-off-by: Quentin Armitage <quentin@armitage.org.uk>
---
 lib/scheduler.c | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/lib/scheduler.c b/lib/scheduler.c
index 0a1c334c..f6d9bad1 100644
--- a/lib/scheduler.c
+++ b/lib/scheduler.c
@@ -1708,6 +1708,14 @@ process_threads(thread_master_t *m)
 		 * We only want timer and signal fd, and don't want inotify, vrrp socket,
 		 * snmp_read, bfd_receiver, bfd pipe in vrrp/check, dbus pipe or netlink fds. */
 		thread = thread_trim_head(thread_list);
+
+		if (thread && thread->type == THREAD_CHILD_TIMEOUT) {
+			/* We remove the thread from the child_pid queue here so that
+			 * if the termination arrives before we processed the timeout
+			 * we can still handle the termination. */
+			rb_erase(&thread->rb_data, &master->child_pid);
+		}
+
 		if (!shutting_down ||
 		    (thread->type == THREAD_READY_FD &&
 		     (thread->u.fd == m->timer_fd || thread->u.fd == m->signal_fd)) ||
@@ -1773,6 +1781,12 @@ process_child_termination(pid_t pid, int status)
 
 		thread_add_terminate_event(m);
 	}
+	else if (thread->type == THREAD_CHILD_TIMEOUT) {
+		/* The child had been timed out, but we have not processed the timeout
+		 * and it is still on the thread->ready queue. Since we have now got
+		 * the termination, just handle the termination instead. */
+		thread->type = THREAD_CHILD_TERMINATED;
+	}
 	else
 		thread_move_ready(m, &m->child, thread, THREAD_CHILD_TERMINATED);
 }
-- 
2.26.2