Blob Blame History Raw
From 6681535d2cecea1f67255c140f11b217518a2f4e Mon Sep 17 00:00:00 2001
From: Pedro Alves <palves@redhat.com>
Date: Wed, 17 Jun 2015 20:41:05 +0100
Subject: [PATCH] Fix BZ1184724 - gdb output a internal-error: Assertion
 `num_lwps (GET_PID (inferior_ptid)) == 1'

The real issue with this bug is in the ptrace error:

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  (gdb) c
  Continuing.
  netcfStateCleanup () at interface/interface_backend_netcf.c:116
  116	}
  ptrace: No such process.
  ^^^^^^^^^^^^^^^^^^^^^^^^
  (gdb) quit
	  Inferior 1 [process 11205] will be detached.
  Quit anyway? (y or n) y
  [Thread 0x7fefdf322880 (LWP 11205) exited]

  ../../gdb/linux-nat.c:1869: internal-error: linux_nat_detach: Assertion `num_lwps (GET_PID (inferior_ptid)) == 1' failed.
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

The assertion is just a consequence.

So the main fix is to backport the GDB [1] bits of upstream 23f238d3
(Fix race exposed by gdb.threads/killed.exp).  That makes
linux_resume_one_lwp detect when the LWP being resumed disappears.
However, we need to backport also part of 8a99810d (linux-nat.c: clean
up pending status checking and resuming LWPs), which is what
introduced the centralized linux_resume_one_lwp in the first place,
and also one bit from 8817a6f2 (PR gdb/15713 - errors from
i386_linux_resume lead to lock-up), which makes linux_nat_resume not
clear the leader LWP's lwp->stopped flag too early, otherwise we'd hit
the new assertion in check_ptrace_stopped_lwp_gone.

The test is new (not a backport), and without the fix triggers the
"detach" assertion shown above.

[1] - The equivalent GDBserver bits are harder to backport, as they
rely on GDB exceptions, which GDBserver 7.6.1 doesn't support yet.
Fortunately, GDBserver already handles ESRCH from within its
linux_resume_one_lwp, so doesn't trigger this particular bug and
passes the test.

gdb/ChangeLog:
2015-06-17  Pedro Alves  <palves@redhat.com>

	* common/linux-procfs.c (linux_proc_pid_is_trace_stopped): New
	function.
	* common/linux-procfs.h (linux_proc_pid_is_trace_stopped): New
	declaration.
	* linux-nat.c (linux_resume_one_lwp_throw, check_ptrace_stopped_lwp_gone)
	(linux_resume_one_lwp): New functions.
	(resume_lwp): Use linux_resume_one_lwp.
	(linux_nat_resume_callback): Skip if LWP is the same as the passed
	in data pointer.
	(linux_nat_resume): Don't clear the selected LWP's stopped flag
	before resuming the sibling LWPs.  Instead pass LWP to
	linux_nat_resume_callback.  Use linux_resume_one_lwp.
	(linux_handle_extended_wait, linux_nat_filter_event)
	(linux_nat_wait_1): Use linux_resume_one_lwp.
	(resume_stopped_resumed_lwps): Try register reads in TRY/CATCH and
	swallows errors if the LWP is gone.  Use
	linux_resume_one_lwp_throw instead of linux_resume_one_lwp.

gdb/testsuite/ChangeLog:
2015-06-17  Pedro Alves  <palves@redhat.com>

	* gdb.threads/thread-kills-process.c: New file.
	* gdb.threads/thread-kills-process.exp: New file.
---
 gdb/ChangeLog                                      |  13 ++
 gdb/common/linux-procfs.c                          |   6 +
 gdb/common/linux-procfs.h                          |   2 +
 gdb/linux-nat.c                                    | 200 +++++++++++++--------
 gdb/testsuite/gdb.threads/thread-kills-process.c   |  62 +++++++
 gdb/testsuite/gdb.threads/thread-kills-process.exp |  70 ++++++++
 6 files changed, 277 insertions(+), 76 deletions(-)
 create mode 100644 gdb/testsuite/gdb.threads/thread-kills-process.c
 create mode 100644 gdb/testsuite/gdb.threads/thread-kills-process.exp

Index: gdb-7.6.1/gdb/common/linux-procfs.c
===================================================================
--- gdb-7.6.1.orig/gdb/common/linux-procfs.c
+++ gdb-7.6.1/gdb/common/linux-procfs.c
@@ -111,6 +111,12 @@ linux_proc_pid_is_stopped (pid_t pid)
   return linux_proc_pid_has_state (pid, "T (stopped)");
 }
 
+int
+linux_proc_pid_is_trace_stopped (pid_t pid)
+{
+  return linux_proc_pid_has_state (pid, "T (tracing stop)");
+}
+
 /* See linux-procfs.h declaration.  */
 
 int
Index: gdb-7.6.1/gdb/common/linux-procfs.h
===================================================================
--- gdb-7.6.1.orig/gdb/common/linux-procfs.h
+++ gdb-7.6.1/gdb/common/linux-procfs.h
@@ -36,6 +36,8 @@ extern pid_t linux_proc_get_tracerpid (p
 
 extern int linux_proc_pid_is_stopped (pid_t pid);
 
+extern int linux_proc_pid_is_trace_stopped (pid_t pid);
+
 /* Return non-zero if PID is a zombie.  */
 
 extern int linux_proc_pid_is_zombie (pid_t pid);
Index: gdb-7.6.1/gdb/linux-nat.c
===================================================================
--- gdb-7.6.1.orig/gdb/linux-nat.c
+++ gdb-7.6.1/gdb/linux-nat.c
@@ -1928,6 +1928,85 @@ linux_nat_detach (struct target_ops *ops
     linux_ops->to_detach (ops, args, from_tty);
 }
 
+/* Resume execution of the inferior process.  If STEP is nonzero,
+   single-step it.  If SIGNAL is nonzero, give it that signal.  */
+
+static void
+linux_resume_one_lwp_throw (struct lwp_info *lp, int step,
+			    enum gdb_signal signo)
+{
+  ptid_t ptid;
+
+  lp->step = step;
+
+  if (linux_nat_prepare_to_resume != NULL)
+    linux_nat_prepare_to_resume (lp);
+
+  ptid = pid_to_ptid (GET_LWP (lp->ptid));
+  linux_ops->to_resume (linux_ops, ptid, step, signo);
+
+  /* Successfully resumed.  Clear state that no longer makes sense,
+     and mark the LWP as running.  Must not do this before resuming
+     otherwise if that fails other code will be confused.  E.g., we'd
+     later try to stop the LWP and hang forever waiting for a stop
+     status.  Note that we must not throw after this is cleared,
+     otherwise handle_zombie_lwp_error would get confused.  */
+  lp->stopped = 0;
+  lp->stopped_by_watchpoint = 0;
+  registers_changed_ptid (lp->ptid);
+}
+
+/* Called when we try to resume a stopped LWP and that errors out.  If
+   the LWP is no longer in ptrace-stopped state (meaning it's zombie,
+   or about to become), discard the error, clear any pending status
+   the LWP may have, and return true (we'll collect the exit status
+   soon enough).  Otherwise, return false.  */
+
+static int
+check_ptrace_stopped_lwp_gone (struct lwp_info *lp)
+{
+  /* If we get an error after resuming the LWP successfully, we'd
+     confuse !T state for the LWP being gone.  */
+  gdb_assert (lp->stopped);
+
+  /* We can't just check whether the LWP is in 'Z (Zombie)' state,
+     because even if ptrace failed with ESRCH, the tracee may be "not
+     yet fully dead", but already refusing ptrace requests.  In that
+     case the tracee has 'R (Running)' state for a little bit
+     (observed in Linux 3.18).  See also the note on ESRCH in the
+     ptrace(2) man page.  Instead, check whether the LWP has any state
+     other than ptrace-stopped.  */
+
+  /* Don't assume anything if /proc/PID/status can't be read.  */
+  if (linux_proc_pid_is_trace_stopped (ptid_get_lwp (lp->ptid)) == 0)
+    {
+      lp->status = 0;
+      lp->waitstatus.kind = TARGET_WAITKIND_IGNORE;
+      lp->stopped_by_watchpoint = 0;
+      return 1;
+    }
+  return 0;
+}
+
+/* Like linux_resume_one_lwp_throw, but no error is thrown if the LWP
+   disappears while we try to resume it.  */
+
+static void
+linux_resume_one_lwp (struct lwp_info *lp, int step, enum gdb_signal signo)
+{
+  volatile struct gdb_exception ex;
+
+  TRY_CATCH (ex, RETURN_MASK_ERROR)
+    {
+      linux_resume_one_lwp_throw (lp, step, signo);
+    }
+  if (ex.reason < 0)
+    {
+      if (!check_ptrace_stopped_lwp_gone (lp))
+	throw_exception (ex);
+    }
+}
+
 /* Resume LP.  */
 
 static void
@@ -1956,14 +2035,7 @@ resume_lwp (struct lwp_info *lp, int ste
 				 : "0"),
 				step ? "step" : "resume");
 
-	  if (linux_nat_prepare_to_resume != NULL)
-	    linux_nat_prepare_to_resume (lp);
-	  linux_ops->to_resume (linux_ops,
-				pid_to_ptid (GET_LWP (lp->ptid)),
-				step, signo);
-	  lp->stopped = 0;
-	  lp->step = step;
-	  lp->stopped_by_watchpoint = 0;
+	  linux_resume_one_lwp (lp, step, signo);
 	}
       else
 	{
@@ -1982,13 +2054,17 @@ resume_lwp (struct lwp_info *lp, int ste
     }
 }
 
-/* Resume LWP, with the last stop signal, if it is in pass state.  */
+/* Callback for iterate_over_lwps.  If LWP is EXCEPT, do nothing.
+   Resume LWP with the last stop signal, if it is in pass state.  */
 
 static int
-linux_nat_resume_callback (struct lwp_info *lp, void *data)
+linux_nat_resume_callback (struct lwp_info *lp, void *except)
 {
   enum gdb_signal signo = GDB_SIGNAL_0;
 
+  if (lp == except)
+    return 0;
+
   if (lp->stopped)
     {
       struct thread_info *thread;
@@ -2108,20 +2184,10 @@ linux_nat_resume (struct target_ops *ops
       return;
     }
 
-  /* Mark LWP as not stopped to prevent it from being continued by
-     linux_nat_resume_callback.  */
-  lp->stopped = 0;
-
   if (resume_many)
-    iterate_over_lwps (ptid, linux_nat_resume_callback, NULL);
+    iterate_over_lwps (ptid, linux_nat_resume_callback, lp);
 
-  /* Convert to something the lower layer understands.  */
-  ptid = pid_to_ptid (GET_LWP (lp->ptid));
-
-  if (linux_nat_prepare_to_resume != NULL)
-    linux_nat_prepare_to_resume (lp);
-  linux_ops->to_resume (linux_ops, ptid, step, signo);
-  lp->stopped_by_watchpoint = 0;
+  linux_resume_one_lwp (lp, step, signo);
 
   if (debug_linux_nat)
     fprintf_unfiltered (gdb_stdlog,
@@ -2287,11 +2353,7 @@ linux_handle_syscall_trap (struct lwp_in
 
   /* Note that gdbarch_get_syscall_number may access registers, hence
      fill a regcache.  */
-  registers_changed ();
-  if (linux_nat_prepare_to_resume != NULL)
-    linux_nat_prepare_to_resume (lp);
-  linux_ops->to_resume (linux_ops, pid_to_ptid (GET_LWP (lp->ptid)),
-			lp->step, GDB_SIGNAL_0);
+  linux_resume_one_lwp (lp, lp->step, GDB_SIGNAL_0);
   return 1;
 }
 
@@ -2486,22 +2548,15 @@ linux_handle_extended_wait (struct lwp_i
 		    fprintf_unfiltered (gdb_stdlog,
 					"LHEW: resuming new LWP %ld\n",
 					GET_LWP (new_lp->ptid));
-		  if (linux_nat_prepare_to_resume != NULL)
-		    linux_nat_prepare_to_resume (new_lp);
-		  linux_ops->to_resume (linux_ops, pid_to_ptid (new_pid),
-					0, GDB_SIGNAL_0);
-		  new_lp->stopped = 0;
+
+		  linux_resume_one_lwp (new_lp, 0, GDB_SIGNAL_0);
 		}
 	    }
 
 	  if (debug_linux_nat)
 	    fprintf_unfiltered (gdb_stdlog,
 				"LHEW: resuming parent LWP %d\n", pid);
-	  if (linux_nat_prepare_to_resume != NULL)
-	    linux_nat_prepare_to_resume (lp);
-	  linux_ops->to_resume (linux_ops, pid_to_ptid (GET_LWP (lp->ptid)),
-				0, GDB_SIGNAL_0);
-
+	  linux_resume_one_lwp (lp, 0, GDB_SIGNAL_0);
 	  return 1;
 	}
 
@@ -3382,12 +3437,7 @@ linux_nat_filter_event (int lwpid, int s
 	{
 	  /* This is a delayed SIGSTOP.  */
 
-	  registers_changed ();
-
-	  if (linux_nat_prepare_to_resume != NULL)
-	    linux_nat_prepare_to_resume (lp);
-	  linux_ops->to_resume (linux_ops, pid_to_ptid (GET_LWP (lp->ptid)),
-			    lp->step, GDB_SIGNAL_0);
+	  linux_resume_one_lwp (lp, lp->step, GDB_SIGNAL_0);
 	  if (debug_linux_nat)
 	    fprintf_unfiltered (gdb_stdlog,
 				"LLW: %s %s, 0, 0 (discard SIGSTOP)\n",
@@ -3395,7 +3445,6 @@ linux_nat_filter_event (int lwpid, int s
 				"PTRACE_SINGLESTEP" : "PTRACE_CONT",
 				target_pid_to_str (lp->ptid));
 
-	  lp->stopped = 0;
 	  gdb_assert (lp->resumed);
 
 	  /* Discard the event.  */
@@ -3416,11 +3465,7 @@ linux_nat_filter_event (int lwpid, int s
       /* This is a delayed SIGINT.  */
       lp->ignore_sigint = 0;
 
-      registers_changed ();
-      if (linux_nat_prepare_to_resume != NULL)
-	linux_nat_prepare_to_resume (lp);
-      linux_ops->to_resume (linux_ops, pid_to_ptid (GET_LWP (lp->ptid)),
-			    lp->step, GDB_SIGNAL_0);
+      linux_resume_one_lwp (lp, lp->step, GDB_SIGNAL_0);
       if (debug_linux_nat)
 	fprintf_unfiltered (gdb_stdlog,
 			    "LLW: %s %s, 0, 0 (discard SIGINT)\n",
@@ -3428,7 +3473,6 @@ linux_nat_filter_event (int lwpid, int s
 			    "PTRACE_SINGLESTEP" : "PTRACE_CONT",
 			    target_pid_to_str (lp->ptid));
 
-      lp->stopped = 0;
       gdb_assert (lp->resumed);
 
       /* Discard the event.  */
@@ -3796,11 +3840,7 @@ retry:
 	     other threads to run.  On the other hand, not resuming
 	     newly attached threads may cause an unwanted delay in
 	     getting them running.  */
-	  registers_changed ();
-	  if (linux_nat_prepare_to_resume != NULL)
-	    linux_nat_prepare_to_resume (lp);
-	  linux_ops->to_resume (linux_ops, pid_to_ptid (GET_LWP (lp->ptid)),
-				lp->step, signo);
+	  linux_resume_one_lwp (lp, lp->step, signo);
 	  if (debug_linux_nat)
 	    fprintf_unfiltered (gdb_stdlog,
 				"LLW: %s %s, %s (preempt 'handle')\n",
@@ -3810,7 +3850,6 @@ retry:
 				(signo != GDB_SIGNAL_0
 				 ? strsignal (gdb_signal_to_host (signo))
 				 : "0"));
-	  lp->stopped = 0;
 	  goto retry;
 	}
 
@@ -3935,32 +3974,41 @@ resume_stopped_resumed_lwps (struct lwp_
     {
       struct regcache *regcache = get_thread_regcache (lp->ptid);
       struct gdbarch *gdbarch = get_regcache_arch (regcache);
-      CORE_ADDR pc = regcache_read_pc (regcache);
+      volatile struct gdb_exception ex;
 
       gdb_assert (is_executing (lp->ptid));
 
-      /* Don't bother if there's a breakpoint at PC that we'd hit
-	 immediately, and we're not waiting for this LWP.  */
-      if (!ptid_match (lp->ptid, *wait_ptid_p))
+      TRY_CATCH (ex, RETURN_MASK_ERROR)
 	{
-	  if (breakpoint_inserted_here_p (get_regcache_aspace (regcache), pc))
-	    return 0;
-	}
+	  CORE_ADDR pc = regcache_read_pc (regcache);
+	  int leave_stopped = 0;
 
-      if (debug_linux_nat)
-	fprintf_unfiltered (gdb_stdlog,
-			    "RSRL: resuming stopped-resumed LWP %s at %s: step=%d\n",
-			    target_pid_to_str (lp->ptid),
-			    paddress (gdbarch, pc),
-			    lp->step);
+	  /* Don't bother if there's a breakpoint at PC that we'd hit
+	     immediately, and we're not waiting for this LWP.  */
+	  if (!ptid_match (lp->ptid, *wait_ptid_p))
+	    {
+	      if (breakpoint_inserted_here_p (get_regcache_aspace (regcache), pc))
+		leave_stopped = 1;
+	    }
 
-      registers_changed ();
-      if (linux_nat_prepare_to_resume != NULL)
-	linux_nat_prepare_to_resume (lp);
-      linux_ops->to_resume (linux_ops, pid_to_ptid (GET_LWP (lp->ptid)),
-			    lp->step, GDB_SIGNAL_0);
-      lp->stopped = 0;
-      lp->stopped_by_watchpoint = 0;
+	  if (!leave_stopped)
+	    {
+	      if (debug_linux_nat)
+		fprintf_unfiltered (gdb_stdlog,
+				    "RSRL: resuming stopped-resumed LWP %s at "
+				    "%s: step=%d\n",
+				    target_pid_to_str (lp->ptid),
+				    paddress (gdbarch, pc),
+				    lp->step);
+
+	      linux_resume_one_lwp_throw (lp, lp->step, GDB_SIGNAL_0);
+	    }
+	}
+      if (ex.reason < 0)
+	{
+	  if (!check_ptrace_stopped_lwp_gone (lp))
+	    throw_exception (ex);
+	}
     }
 
   return 0;
Index: gdb-7.6.1/gdb/testsuite/gdb.threads/thread-kills-process.c
===================================================================
--- /dev/null
+++ gdb-7.6.1/gdb/testsuite/gdb.threads/thread-kills-process.c
@@ -0,0 +1,62 @@
+/* Copyright 2015 Free Software Foundation, Inc.
+
+   This file is part of GDB.
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
+
+#include <pthread.h>
+#include <unistd.h>
+
+volatile int call_exit;
+static pthread_barrier_t barrier;
+#define NUMTHREADS 256
+
+void *
+thread_function (void *arg)
+{
+  pthread_barrier_wait (&barrier);
+
+  while (!call_exit)
+    usleep (1);
+
+  _exit (0);
+  return NULL;
+}
+
+void
+all_threads_started (void)
+{
+  call_exit = 1;
+}
+
+int
+main (int argc, char **argv)
+{
+  pthread_t threads[NUMTHREADS];
+  int i;
+
+  pthread_barrier_init (&barrier, NULL, NUMTHREADS + 1);
+
+  for (i = 0; i < NUMTHREADS; ++i)
+    pthread_create (&threads[i], NULL, thread_function, NULL);
+
+  pthread_barrier_wait (&barrier);
+
+  all_threads_started ();
+
+  for (i = 0; i < NUMTHREADS; ++i)
+    pthread_join (threads[i], NULL);
+
+  return 0;
+}
Index: gdb-7.6.1/gdb/testsuite/gdb.threads/thread-kills-process.exp
===================================================================
--- /dev/null
+++ gdb-7.6.1/gdb/testsuite/gdb.threads/thread-kills-process.exp
@@ -0,0 +1,70 @@
+# Copyright (C) 2015 Free Software Foundation, Inc.
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+standard_testfile
+
+if {[gdb_compile_pthreads "${srcdir}/${subdir}/${srcfile}" "${binfile}" executable {debug}] != "" } {
+    return -1
+}
+
+clean_restart ${binfile}
+if {![runto_main]} {
+    fail "Couldn't run to main"
+    return
+}
+
+gdb_breakpoint "all_threads_started"
+
+gdb_continue_to_breakpoint "all_threads_started"
+
+# Update the thread list, otherwise when testing against GDBserver,
+# GDB won't know about thread 2.  (Only necessary with GDB < 7.9.)
+gdb_test "info threads" ".*"
+
+# Select any thread but the leader.
+gdb_test "thread 2" ".*" "switch to non-leader thread"
+
+# Delete breakpoints so that GDB doesn't switch back the to leader to
+# step over its breakpoint.
+delete_breakpoints
+
+# Let threads exit the process on next resume.
+gdb_test "p call_exit = 0" " = 0"
+
+# While GDB is busy resuming all threads one by one, one of the
+# threads should manage to exit the process.  GDB should handle that
+# gracefully instead of erroring out.
+#
+# gdb_continue_to_end doesn't work with GDBserver until the
+# introduction of the "exit_is_reliable" board variable
+# (b477a5e649150) in GDB 7.7.
+#gdb_continue_to_end "" continue 1
+gdb_test "continue" "$inferior_exited_re normally.*"
+
+# On the buggy GDB where the "continue" above would error out, a
+# subsequent "detach" (e.g., the user tries to quit GDB, and quit
+# offers to detach) would hit this assertion:
+#
+#   linux-nat.c:1869: internal-error: linux_nat_detach: Assertion `num_lwps (GET_PID (inferior_ptid)) == 1' failed.
+
+# That was a consequence of the original bug, but let's make sure that
+# even when "continue" is handled properly, detach doesn't stumble on
+# anything stale.
+gdb_test "detach" "The program is not being run\\." \
+    "detach after exit"
+
+# Likewise "continue".
+gdb_test "continue" "The program is not being run\\." \
+    "continue after exit"