Blob Blame History Raw
From 496f922dad9b58b891f417b937ac5fb0b0a8649a Mon Sep 17 00:00:00 2001
From: Eugene Syromyatnikov <evgsyr@gmail.com>
Date: Wed, 8 Aug 2018 21:41:39 +0200
Subject: [PATCH 27/27] Implement queueing of threads before dispatching them

It is possible that some tracees call a lot of cheap syscalls too fast,
and that can stress the tracer to the point some tracees are not served
for indefinite amount of time.  In order to avoid that unfairness, try
to collect all the pending tracees first along with the relevant
information and only then dispatch the events.

* defs.h: Include "list.h".
(struct tcb): Add wait_data_idx, delayed_wait_data, and wait_list
fields.
* strace.c (struct tcb_wait_data): Add "msg" field.
(tcb_wait_tab, tcb_wait_tab_size): New static variables.
(alloctcb): Initialize wait_list.
(droptcb): Remove tcp from wait_list.
(maybe_switch_tcbs): Get old pid from
tcb_wait_tab[tcp->wait_data_idx].msg instead of calling
ptrace(PTRACE_GETEVENTMSG).
(trace_wait_data_size, init_trace_wait_data, copy_trace_wait_data,
free_trace_wait_data, tcb_wait_tab_check_size): New functions, in order
to allow the code outside next_event to operate with wait_data as with
an opaque object (needed for dispatch_event and restart_delayed_tcb).
(next_event): Add pending_tcps, extra_tcp, wait_nohang, elem, and
wait_tab_pos variables; check for elements in pending_tcps and skip
waiting if the list is not empty; check for extra_tcp and skip waiting
along with swapping wait_data_idx with wait_extra_data_idx;
after the initial wait4(), call wait4() in loop with WNOHANG flag set;
fetch siginfo on signal and eventmsg on PTRACE_EVENT_EXEC;
return the first tcp in pending_tcps list.
(dispatch_event): Store a pointer to a copy of tcb_wait_data in
tcp->delayed_wait_data if tcp's restart has to be delayed.
(restart_delayed_tcb): Use tcp->delayed_wait_data, create a stub
tcb_wait_data if it is NULL, free temporary trace_wait_data.
* tests/Makefile.am (XFAIL_TEST): Remove looping_threads.test.

Resolves: https://bugzilla.redhat.com/show_bug.cgi?id=478419
Resolves: https://bugzilla.redhat.com/show_bug.cgi?id=526740
Resolves: https://bugzilla.redhat.com/show_bug.cgi?id=851457
Resolves: https://bugzilla.redhat.com/show_bug.cgi?id=1609318
Resolves: https://bugzilla.redhat.com/show_bug.cgi?id=1610774
Co-Authored-by: Dmitry V. Levin <ldv@altlinux.org>
Co-Authored-by: Denys Vlasenko <dvlasenk@redhat.com>
Co-Authored-by: Andreas Schwab <aschwab@redhat.com>
Co-Authored-by: Jeff Law <law@redhat.com>
Co-Authored-by: DJ Delorie <dj@redhat.com>

Conflicts:
	defs.h
	tests/Makefile.am
---
 defs.h            |  12 ++
 strace.c          | 393 +++++++++++++++++++++++++++++++++++++-----------------
 tests/Makefile.am |   3 +-
 3 files changed, 283 insertions(+), 125 deletions(-)

diff --git a/defs.h b/defs.h
index 811bb0d..2a614a7 100644
--- a/defs.h
+++ b/defs.h
@@ -36,6 +36,7 @@
 #include "arch_defs.h"
 #include "error_prints.h"
 #include "gcc_compat.h"
+#include "list.h"
 #include "kernel_types.h"
 #include "macros.h"
 #include "mpers_type.h"
@@ -218,6 +219,17 @@ struct tcb {
 
 	struct mmap_cache_t *mmap_cache;
 
+	/*
+	 * Data that is stored during process wait traversal.
+	 * We use indices as the actual data is stored in an array
+	 * that is realloc'ed at runtime.
+	 */
+	size_t wait_data_idx;
+	/** Wait data storage for a delayed process. */
+	struct tcb_wait_data *delayed_wait_data;
+	struct list_item wait_list;
+
+
 #ifdef HAVE_LINUX_KVM_H
 	struct vcpu_info *vcpu_info_list;
 #endif
diff --git a/strace.c b/strace.c
index 0745838..0914dc7 100644
--- a/strace.c
+++ b/strace.c
@@ -141,6 +141,7 @@ static struct tcb *current_tcp;
 struct tcb_wait_data {
 	enum trace_event te; /**< Event passed to dispatch_event() */
 	int status;          /**< status, returned by wait4() */
+	unsigned long msg;   /**< Value returned by PTRACE_GETEVENTMSG */
 	siginfo_t si;        /**< siginfo, returned by PTRACE_GETSIGINFO */
 };
 
@@ -148,6 +149,10 @@ static struct tcb **tcbtab;
 static unsigned int nprocs;
 static size_t tcbtabsize;
 
+static struct tcb_wait_data *tcb_wait_tab;
+static size_t tcb_wait_tab_size;
+
+
 #ifndef HAVE_PROGRAM_INVOCATION_NAME
 char *program_invocation_name;
 #endif
@@ -745,6 +750,7 @@ alloctcb(int pid)
 		tcp = tcbtab[i];
 		if (!tcp->pid) {
 			memset(tcp, 0, sizeof(*tcp));
+			list_init(&tcp->wait_list);
 			tcp->pid = pid;
 #if SUPPORTED_PERSONALITIES > 1
 			tcp->currpers = current_personality;
@@ -833,6 +839,8 @@ droptcb(struct tcb *tcp)
 	if (printing_tcp == tcp)
 		printing_tcp = NULL;
 
+	list_remove(&tcp->wait_list);
+
 	memset(tcp, 0, sizeof(*tcp));
 }
 
@@ -2051,10 +2059,8 @@ maybe_switch_tcbs(struct tcb *tcp, const int pid)
 {
 	FILE *fp;
 	struct tcb *execve_thread;
-	long old_pid = 0;
+	long old_pid = tcb_wait_tab[tcp->wait_data_idx].msg;
 
-	if (ptrace(PTRACE_GETEVENTMSG, pid, NULL, &old_pid) < 0)
-		return tcp;
 	/* Avoid truncation in pid2tcb() param passing */
 	if (old_pid <= 0 || old_pid == pid)
 		return tcp;
@@ -2207,20 +2213,74 @@ print_event_exit(struct tcb *tcp)
 	line_ended();
 }
 
-static const struct tcb_wait_data *
-next_event(void)
+static size_t
+trace_wait_data_size(struct tcb *tcp)
 {
-	static struct tcb_wait_data wait_data;
+	return sizeof(struct tcb_wait_data);
+}
 
-	int pid;
-	int status;
-	struct tcb *tcp;
-	struct tcb_wait_data *wd = &wait_data;
-	struct rusage ru;
+static struct tcb_wait_data *
+init_trace_wait_data(void *p)
+{
+	struct tcb_wait_data *wd = p;
+
+	memset(wd, 0, sizeof(*wd));
+
+	return wd;
+}
 
+static struct tcb_wait_data *
+copy_trace_wait_data(const struct tcb_wait_data *wd)
+{
+	struct tcb_wait_data *new_wd = xmalloc(sizeof(*new_wd));
+
+	memcpy(new_wd, wd, sizeof(*wd));
+
+	return new_wd;
+}
+
+static void
+free_trace_wait_data(struct tcb_wait_data *wd)
+{
+	free(wd);
+}
+
+static void
+tcb_wait_tab_check_size(const size_t size)
+{
+	while (size >= tcb_wait_tab_size) {
+		tcb_wait_tab = xgrowarray(tcb_wait_tab,
+					  &tcb_wait_tab_size,
+					  sizeof(tcb_wait_tab[0]));
+	}
+}
+
+static const struct tcb_wait_data *
+next_event(void)
+{
 	if (interrupted)
 		return NULL;
 
+	struct tcb *tcp = NULL;
+	struct list_item *elem;
+
+	static EMPTY_LIST(pending_tcps);
+	/* Handle the queued tcbs before waiting for new events.  */
+	if (!list_is_empty(&pending_tcps))
+		goto next_event_get_tcp;
+
+	static struct tcb *extra_tcp;
+	static size_t wait_extra_data_idx;
+	/* Handle the extra tcb event.  */
+	if (extra_tcp) {
+		tcp = extra_tcp;
+		extra_tcp = NULL;
+		tcp->wait_data_idx = wait_extra_data_idx;
+
+		debug_msg("dequeued extra event for pid %u", tcp->pid);
+		goto next_event_exit;
+	}
+
 	/*
 	 * Used to exit simply when nprocs hits zero, but in this testcase:
 	 *  int main(void) { _exit(!!fork()); }
@@ -2262,8 +2322,10 @@ next_event(void)
 	 * then the system call will be interrupted and
 	 * the expiration will be handled by the signal handler.
 	 */
-	pid = wait4(-1, &status, __WALL, (cflag ? &ru : NULL));
-	const int wait_errno = errno;
+	int status;
+	struct rusage ru;
+	int pid = wait4(-1, &status, __WALL, (cflag ? &ru : NULL));
+	int wait_errno = errno;
 
 	/*
 	 * The window of opportunity to handle expirations
@@ -2279,135 +2341,202 @@ next_event(void)
 			return NULL;
 	}
 
-	if (pid < 0) {
-		if (wait_errno == EINTR) {
-			wd->te = TE_NEXT;
-			return wd;
+	size_t wait_tab_pos = 0;
+	bool wait_nohang = false;
+
+	/*
+	 * Wait for new events until wait4() returns 0 (meaning that there's
+	 * nothing more to wait for for now), or a second event for some tcb
+	 * appears (which may happen if a tracee was SIGKILL'ed, for example).
+	 */
+	for (;;) {
+		struct tcb_wait_data *wd;
+
+		if (pid < 0) {
+			if (wait_errno == EINTR)
+				break;
+			if (wait_nohang)
+				break;
+			if (nprocs == 0 && wait_errno == ECHILD)
+				return NULL;
+			/*
+			 * If nprocs > 0, ECHILD is not expected,
+			 * treat it as any other error here:
+			 */
+			errno = wait_errno;
+			perror_msg_and_die("wait4(__WALL)");
 		}
-		if (nprocs == 0 && wait_errno == ECHILD)
-			return NULL;
-		/*
-		 * If nprocs > 0, ECHILD is not expected,
-		 * treat it as any other error here:
-		 */
-		errno = wait_errno;
-		perror_msg_and_die("wait4(__WALL)");
-	}
 
-	wd->status = status;
+		if (!pid)
+			break;
 
-	if (pid == popen_pid) {
-		if (!WIFSTOPPED(status))
-			popen_pid = 0;
-		wd->te = TE_NEXT;
-		return wd;
-	}
+		if (pid == popen_pid) {
+			if (!WIFSTOPPED(status))
+				popen_pid = 0;
+			break;
+		}
 
-	if (debug_flag)
-		print_debug_info(pid, status);
+		if (debug_flag)
+			print_debug_info(pid, status);
 
-	/* Look up 'pid' in our table. */
-	tcp = pid2tcb(pid);
+		/* Look up 'pid' in our table. */
+		tcp = pid2tcb(pid);
 
-	if (!tcp) {
-		tcp = maybe_allocate_tcb(pid, status);
 		if (!tcp) {
-			wd->te = TE_NEXT;
-			return wd;
+			tcp = maybe_allocate_tcb(pid, status);
+			if (!tcp)
+				goto next_event_wait_next;
 		}
-	}
 
-	clear_regs(tcp);
+		if (cflag) {
+			struct timespec stime = {
+				.tv_sec = ru.ru_stime.tv_sec,
+				.tv_nsec = ru.ru_stime.tv_usec * 1000
+			};
+			ts_sub(&tcp->dtime, &stime, &tcp->stime);
+			tcp->stime = stime;
+		}
 
-	/* Set current output file */
-	set_current_tcp(tcp);
+		tcb_wait_tab_check_size(wait_tab_pos);
 
-	if (cflag) {
-		struct timespec stime = {
-			.tv_sec = ru.ru_stime.tv_sec,
-			.tv_nsec = ru.ru_stime.tv_usec * 1000
-		};
-		ts_sub(&tcp->dtime, &stime, &tcp->stime);
-		tcp->stime = stime;
-	}
+		/* Initialise a new wait data structure.  */
+		wd = tcb_wait_tab + wait_tab_pos;
+		init_trace_wait_data(wd);
+		wd->status = status;
 
-	if (WIFSIGNALED(status)) {
-		wd->te = TE_SIGNALLED;
-		return wd;
-	}
+		if (WIFSIGNALED(status)) {
+			wd->te = TE_SIGNALLED;
+		} else if (WIFEXITED(status)) {
+			wd->te = TE_EXITED;
+		} else {
+			/*
+			 * As WCONTINUED flag has not been specified to wait4,
+			 * it cannot be WIFCONTINUED(status), so the only case
+			 * that remains is WIFSTOPPED(status).
+			 */
 
-	if (WIFEXITED(status)) {
-		wd->te = TE_EXITED;
-		return wd;
+			const unsigned int sig = WSTOPSIG(status);
+			const unsigned int event = (unsigned int) status >> 16;
+
+			switch (event) {
+			case 0:
+				/*
+				 * Is this post-attach SIGSTOP?
+				 * Interestingly, the process may stop
+				 * with STOPSIG equal to some other signal
+				 * than SIGSTOP if we happened to attach
+				 * just before the process takes a signal.
+				 */
+				if (sig == SIGSTOP &&
+				    (tcp->flags & TCB_IGNORE_ONE_SIGSTOP)) {
+					debug_func_msg("ignored SIGSTOP on "
+						       "pid %d", tcp->pid);
+					tcp->flags &= ~TCB_IGNORE_ONE_SIGSTOP;
+					wd->te = TE_RESTART;
+				} else if (sig == syscall_trap_sig) {
+					wd->te = TE_SYSCALL_STOP;
+				} else {
+					/*
+					 * True if tracee is stopped by signal
+					 * (as opposed to "tracee received
+					 * signal").
+					 * TODO: shouldn't we check for
+					 * errno == EINVAL too?
+					 * We can get ESRCH instead, you know...
+					 */
+					bool stopped = ptrace(PTRACE_GETSIGINFO,
+						pid, 0, &wd->si) < 0;
+
+					wd->te = stopped ? TE_GROUP_STOP
+							 : TE_SIGNAL_DELIVERY_STOP;
+				}
+				break;
+			case PTRACE_EVENT_STOP:
+				/*
+				 * PTRACE_INTERRUPT-stop or group-stop.
+				 * PTRACE_INTERRUPT-stop has sig == SIGTRAP here.
+				 */
+				switch (sig) {
+				case SIGSTOP:
+				case SIGTSTP:
+				case SIGTTIN:
+				case SIGTTOU:
+					wd->te = TE_GROUP_STOP;
+					break;
+				default:
+					wd->te = TE_RESTART;
+				}
+				break;
+			case PTRACE_EVENT_EXEC:
+					/*
+					 * TODO: shouldn't we check for
+					 * errno == EINVAL here, too?
+					 * We can get ESRCH instead, you know...
+					 */
+				if (ptrace(PTRACE_GETEVENTMSG, pid, NULL,
+				    &wd->msg) < 0)
+					wd->msg = 0;
+
+				wd->te = TE_STOP_BEFORE_EXECVE;
+				break;
+			case PTRACE_EVENT_EXIT:
+				wd->te = TE_STOP_BEFORE_EXIT;
+				break;
+			default:
+				wd->te = TE_RESTART;
+			}
+		}
+
+		if (!wd->te)
+			error_func_msg("Tracing event hasn't been determined "
+				       "for pid %d, status %0#x", pid, status);
+
+		if (!list_is_empty(&tcp->wait_list)) {
+			wait_extra_data_idx = wait_tab_pos;
+			extra_tcp = tcp;
+			debug_func_msg("queued extra pid %d", tcp->pid);
+		} else {
+			tcp->wait_data_idx = wait_tab_pos;
+			list_append(&pending_tcps, &tcp->wait_list);
+			debug_func_msg("queued pid %d", tcp->pid);
+		}
+
+		wait_tab_pos++;
+
+		if (extra_tcp)
+			break;
+
+next_event_wait_next:
+		pid = wait4(-1, &status, __WALL | WNOHANG, (cflag ? &ru : NULL));
+		wait_errno = errno;
+		wait_nohang = true;
 	}
 
-	/*
-	 * As WCONTINUED flag has not been specified to wait4,
-	 * it cannot be WIFCONTINUED(status), so the only case
-	 * that remains is WIFSTOPPED(status).
-	 */
+next_event_get_tcp:
+	elem = list_remove_head(&pending_tcps);
+
+	if (!elem) {
+		tcb_wait_tab_check_size(0);
+		memset(tcb_wait_tab, 0, sizeof(*tcb_wait_tab));
+		tcb_wait_tab->te = TE_NEXT;
 
+		return tcb_wait_tab;
+	} else {
+		tcp = list_elem(elem, struct tcb, wait_list);
+		debug_func_msg("dequeued pid %d", tcp->pid);
+	}
+
+next_event_exit:
 	/* Is this the very first time we see this tracee stopped? */
 	if (tcp->flags & TCB_STARTUP)
 		startup_tcb(tcp);
 
-	const unsigned int sig = WSTOPSIG(status);
-	const unsigned int event = (unsigned int) status >> 16;
+	clear_regs(tcp);
 
-	switch (event) {
-	case 0:
-		/*
-		 * Is this post-attach SIGSTOP?
-		 * Interestingly, the process may stop
-		 * with STOPSIG equal to some other signal
-		 * than SIGSTOP if we happened to attach
-		 * just before the process takes a signal.
-		 */
-		if (sig == SIGSTOP && (tcp->flags & TCB_IGNORE_ONE_SIGSTOP)) {
-			debug_func_msg("ignored SIGSTOP on pid %d", tcp->pid);
-			tcp->flags &= ~TCB_IGNORE_ONE_SIGSTOP;
-			wd->te = TE_RESTART;
-		} else if (sig == syscall_trap_sig) {
-			wd->te = TE_SYSCALL_STOP;
-		} else {
-			memset(&wd->si, 0, sizeof(wd->si));
-			/*
-			 * True if tracee is stopped by signal
-			 * (as opposed to "tracee received signal").
-			 * TODO: shouldn't we check for errno == EINVAL too?
-			 * We can get ESRCH instead, you know...
-			 */
-			bool stopped = ptrace(PTRACE_GETSIGINFO, pid, 0, &wd->si) < 0;
-			wd->te = stopped ? TE_GROUP_STOP : TE_SIGNAL_DELIVERY_STOP;
-		}
-		break;
-	case PTRACE_EVENT_STOP:
-		/*
-		 * PTRACE_INTERRUPT-stop or group-stop.
-		 * PTRACE_INTERRUPT-stop has sig == SIGTRAP here.
-		 */
-		switch (sig) {
-		case SIGSTOP:
-		case SIGTSTP:
-		case SIGTTIN:
-		case SIGTTOU:
-			wd->te = TE_GROUP_STOP;
-			break;
-		default:
-			wd->te = TE_RESTART;
-		}
-		break;
-	case PTRACE_EVENT_EXEC:
-		wd->te = TE_STOP_BEFORE_EXECVE;
-		break;
-	case PTRACE_EVENT_EXIT:
-		wd->te = TE_STOP_BEFORE_EXIT;
-		break;
-	default:
-		wd->te = TE_RESTART;
-	}
+	/* Set current output file */
+	set_current_tcp(tcp);
 
-	return wd;
+	return tcb_wait_tab + tcp->wait_data_idx;
 }
 
 static int
@@ -2569,8 +2698,15 @@ dispatch_event(const struct tcb_wait_data *wd)
 		return false;
 
 	/* If the process is being delayed, do not ptrace_restart just yet */
-	if (syscall_delayed(current_tcp))
+	if (syscall_delayed(current_tcp)) {
+		if (current_tcp->delayed_wait_data)
+			error_func_msg("pid %d has delayed wait data set"
+				       " already", current_tcp->pid);
+
+		current_tcp->delayed_wait_data = copy_trace_wait_data(wd);
+
 		return true;
+	}
 
 	if (ptrace_restart(restart_op, current_tcp, restart_sig) < 0) {
 		/* Note: ptrace_restart emitted error message */
@@ -2583,7 +2719,15 @@ dispatch_event(const struct tcb_wait_data *wd)
 static bool
 restart_delayed_tcb(struct tcb *const tcp)
 {
-	const struct tcb_wait_data wd = { .te = TE_RESTART };
+	struct tcb_wait_data *wd = tcp->delayed_wait_data;
+
+	if (!wd) {
+		error_func_msg("No delayed wait data found for pid %d",
+			       tcp->pid);
+		wd = init_trace_wait_data(alloca(trace_wait_data_size(tcp)));
+	}
+
+	wd->te = TE_RESTART;
 
 	debug_func_msg("pid %d", tcp->pid);
 
@@ -2591,9 +2735,12 @@ restart_delayed_tcb(struct tcb *const tcp)
 
 	struct tcb *const prev_tcp = current_tcp;
 	current_tcp = tcp;
-	bool ret = dispatch_event(&wd);
+	bool ret = dispatch_event(wd);
 	current_tcp = prev_tcp;
 
+	free_trace_wait_data(tcp->delayed_wait_data);
+	tcp->delayed_wait_data = NULL;
+
 	return ret;
 }
 
diff --git a/tests/Makefile.am b/tests/Makefile.am
index de7a3d2..51f00a9 100644
--- a/tests/Makefile.am
+++ b/tests/Makefile.am
@@ -350,8 +350,7 @@ XFAIL_TESTS_m32 = $(STACKTRACE_TESTS)
 XFAIL_TESTS_mx32 = $(STACKTRACE_TESTS)
 XFAIL_TESTS_x86_64 = int_0x80.gen.test
 XFAIL_TESTS_x32 = int_0x80.gen.test
-XFAIL_TESTS = $(XFAIL_TESTS_$(MPERS_NAME)) $(XFAIL_TESTS_$(ARCH)) \
-	      looping_threads.test
+XFAIL_TESTS = $(XFAIL_TESTS_$(MPERS_NAME)) $(XFAIL_TESTS_$(ARCH))
 
 TEST_LOG_COMPILER = env
 AM_TEST_LOG_FLAGS = STRACE_ARCH=$(ARCH) STRACE_NATIVE_ARCH=$(NATIVE_ARCH) \
-- 
2.1.4