From 6681535d2cecea1f67255c140f11b217518a2f4e Mon Sep 17 00:00:00 2001
From: Pedro Alves <palves@redhat.com>
Date: Wed, 17 Jun 2015 20:41:05 +0100
Subject: [PATCH] Fix BZ1184724 - gdb output a internal-error: Assertion
`num_lwps (GET_PID (inferior_ptid)) == 1'
The real issue with this bug is in the ptrace error:
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
(gdb) c
Continuing.
netcfStateCleanup () at interface/interface_backend_netcf.c:116
116 }
ptrace: No such process.
^^^^^^^^^^^^^^^^^^^^^^^^
(gdb) quit
Inferior 1 [process 11205] will be detached.
Quit anyway? (y or n) y
[Thread 0x7fefdf322880 (LWP 11205) exited]
../../gdb/linux-nat.c:1869: internal-error: linux_nat_detach: Assertion `num_lwps (GET_PID (inferior_ptid)) == 1' failed.
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The assertion is just a consequence.
So the main fix is to backport the GDB [1] bits of upstream 23f238d3
(Fix race exposed by gdb.threads/killed.exp). That makes
linux_resume_one_lwp detect when the LWP being resumed disappears.
However, we need to backport also part of 8a99810d (linux-nat.c: clean
up pending status checking and resuming LWPs), which is what
introduced the centralized linux_resume_one_lwp in the first place,
and also one bit from 8817a6f2 (PR gdb/15713 - errors from
i386_linux_resume lead to lock-up), which makes linux_nat_resume not
clear the leader LWP's lwp->stopped flag too early, otherwise we'd hit
the new assertion in check_ptrace_stopped_lwp_gone.
The test is new (not a backport), and without the fix triggers the
"detach" assertion shown above.
[1] - The equivalent GDBserver bits are harder to backport, as they
rely on GDB exceptions, which GDBserver 7.6.1 doesn't support yet.
Fortunately, GDBserver already handles ESRCH from within its
linux_resume_one_lwp, so doesn't trigger this particular bug and
passes the test.
gdb/ChangeLog:
2015-06-17 Pedro Alves <palves@redhat.com>
* common/linux-procfs.c (linux_proc_pid_is_trace_stopped): New
function.
* common/linux-procfs.h (linux_proc_pid_is_trace_stopped): New
declaration.
* linux-nat.c (linux_resume_one_lwp_throw, check_ptrace_stopped_lwp_gone)
(linux_resume_one_lwp): New functions.
(resume_lwp): Use linux_resume_one_lwp.
(linux_nat_resume_callback): Skip if LWP is the same as the passed
in data pointer.
(linux_nat_resume): Don't clear the selected LWP's stopped flag
before resuming the sibling LWPs. Instead pass LWP to
linux_nat_resume_callback. Use linux_resume_one_lwp.
(linux_handle_extended_wait, linux_nat_filter_event)
(linux_nat_wait_1): Use linux_resume_one_lwp.
(resume_stopped_resumed_lwps): Try register reads in TRY/CATCH and
swallows errors if the LWP is gone. Use
linux_resume_one_lwp_throw instead of linux_resume_one_lwp.
gdb/testsuite/ChangeLog:
2015-06-17 Pedro Alves <palves@redhat.com>
* gdb.threads/thread-kills-process.c: New file.
* gdb.threads/thread-kills-process.exp: New file.
---
gdb/ChangeLog | 13 ++
gdb/common/linux-procfs.c | 6 +
gdb/common/linux-procfs.h | 2 +
gdb/linux-nat.c | 200 +++++++++++++--------
gdb/testsuite/gdb.threads/thread-kills-process.c | 62 +++++++
gdb/testsuite/gdb.threads/thread-kills-process.exp | 70 ++++++++
6 files changed, 277 insertions(+), 76 deletions(-)
create mode 100644 gdb/testsuite/gdb.threads/thread-kills-process.c
create mode 100644 gdb/testsuite/gdb.threads/thread-kills-process.exp
Index: gdb-7.6.1/gdb/common/linux-procfs.c
===================================================================
--- gdb-7.6.1.orig/gdb/common/linux-procfs.c
+++ gdb-7.6.1/gdb/common/linux-procfs.c
@@ -111,6 +111,12 @@ linux_proc_pid_is_stopped (pid_t pid)
return linux_proc_pid_has_state (pid, "T (stopped)");
}
+int
+linux_proc_pid_is_trace_stopped (pid_t pid)
+{
+ return linux_proc_pid_has_state (pid, "T (tracing stop)");
+}
+
/* See linux-procfs.h declaration. */
int
Index: gdb-7.6.1/gdb/common/linux-procfs.h
===================================================================
--- gdb-7.6.1.orig/gdb/common/linux-procfs.h
+++ gdb-7.6.1/gdb/common/linux-procfs.h
@@ -36,6 +36,8 @@ extern pid_t linux_proc_get_tracerpid (p
extern int linux_proc_pid_is_stopped (pid_t pid);
+extern int linux_proc_pid_is_trace_stopped (pid_t pid);
+
/* Return non-zero if PID is a zombie. */
extern int linux_proc_pid_is_zombie (pid_t pid);
Index: gdb-7.6.1/gdb/linux-nat.c
===================================================================
--- gdb-7.6.1.orig/gdb/linux-nat.c
+++ gdb-7.6.1/gdb/linux-nat.c
@@ -1928,6 +1928,85 @@ linux_nat_detach (struct target_ops *ops
linux_ops->to_detach (ops, args, from_tty);
}
+/* Resume execution of the inferior process. If STEP is nonzero,
+ single-step it. If SIGNAL is nonzero, give it that signal. */
+
+static void
+linux_resume_one_lwp_throw (struct lwp_info *lp, int step,
+ enum gdb_signal signo)
+{
+ ptid_t ptid;
+
+ lp->step = step;
+
+ if (linux_nat_prepare_to_resume != NULL)
+ linux_nat_prepare_to_resume (lp);
+
+ ptid = pid_to_ptid (GET_LWP (lp->ptid));
+ linux_ops->to_resume (linux_ops, ptid, step, signo);
+
+ /* Successfully resumed. Clear state that no longer makes sense,
+ and mark the LWP as running. Must not do this before resuming
+ otherwise if that fails other code will be confused. E.g., we'd
+ later try to stop the LWP and hang forever waiting for a stop
+ status. Note that we must not throw after this is cleared,
+ otherwise handle_zombie_lwp_error would get confused. */
+ lp->stopped = 0;
+ lp->stopped_by_watchpoint = 0;
+ registers_changed_ptid (lp->ptid);
+}
+
+/* Called when we try to resume a stopped LWP and that errors out. If
+ the LWP is no longer in ptrace-stopped state (meaning it's zombie,
+ or about to become), discard the error, clear any pending status
+ the LWP may have, and return true (we'll collect the exit status
+ soon enough). Otherwise, return false. */
+
+static int
+check_ptrace_stopped_lwp_gone (struct lwp_info *lp)
+{
+ /* If we get an error after resuming the LWP successfully, we'd
+ confuse !T state for the LWP being gone. */
+ gdb_assert (lp->stopped);
+
+ /* We can't just check whether the LWP is in 'Z (Zombie)' state,
+ because even if ptrace failed with ESRCH, the tracee may be "not
+ yet fully dead", but already refusing ptrace requests. In that
+ case the tracee has 'R (Running)' state for a little bit
+ (observed in Linux 3.18). See also the note on ESRCH in the
+ ptrace(2) man page. Instead, check whether the LWP has any state
+ other than ptrace-stopped. */
+
+ /* Don't assume anything if /proc/PID/status can't be read. */
+ if (linux_proc_pid_is_trace_stopped (ptid_get_lwp (lp->ptid)) == 0)
+ {
+ lp->status = 0;
+ lp->waitstatus.kind = TARGET_WAITKIND_IGNORE;
+ lp->stopped_by_watchpoint = 0;
+ return 1;
+ }
+ return 0;
+}
+
+/* Like linux_resume_one_lwp_throw, but no error is thrown if the LWP
+ disappears while we try to resume it. */
+
+static void
+linux_resume_one_lwp (struct lwp_info *lp, int step, enum gdb_signal signo)
+{
+ volatile struct gdb_exception ex;
+
+ TRY_CATCH (ex, RETURN_MASK_ERROR)
+ {
+ linux_resume_one_lwp_throw (lp, step, signo);
+ }
+ if (ex.reason < 0)
+ {
+ if (!check_ptrace_stopped_lwp_gone (lp))
+ throw_exception (ex);
+ }
+}
+
/* Resume LP. */
static void
@@ -1956,14 +2035,7 @@ resume_lwp (struct lwp_info *lp, int ste
: "0"),
step ? "step" : "resume");
- if (linux_nat_prepare_to_resume != NULL)
- linux_nat_prepare_to_resume (lp);
- linux_ops->to_resume (linux_ops,
- pid_to_ptid (GET_LWP (lp->ptid)),
- step, signo);
- lp->stopped = 0;
- lp->step = step;
- lp->stopped_by_watchpoint = 0;
+ linux_resume_one_lwp (lp, step, signo);
}
else
{
@@ -1982,13 +2054,17 @@ resume_lwp (struct lwp_info *lp, int ste
}
}
-/* Resume LWP, with the last stop signal, if it is in pass state. */
+/* Callback for iterate_over_lwps. If LWP is EXCEPT, do nothing.
+ Resume LWP with the last stop signal, if it is in pass state. */
static int
-linux_nat_resume_callback (struct lwp_info *lp, void *data)
+linux_nat_resume_callback (struct lwp_info *lp, void *except)
{
enum gdb_signal signo = GDB_SIGNAL_0;
+ if (lp == except)
+ return 0;
+
if (lp->stopped)
{
struct thread_info *thread;
@@ -2108,20 +2184,10 @@ linux_nat_resume (struct target_ops *ops
return;
}
- /* Mark LWP as not stopped to prevent it from being continued by
- linux_nat_resume_callback. */
- lp->stopped = 0;
-
if (resume_many)
- iterate_over_lwps (ptid, linux_nat_resume_callback, NULL);
+ iterate_over_lwps (ptid, linux_nat_resume_callback, lp);
- /* Convert to something the lower layer understands. */
- ptid = pid_to_ptid (GET_LWP (lp->ptid));
-
- if (linux_nat_prepare_to_resume != NULL)
- linux_nat_prepare_to_resume (lp);
- linux_ops->to_resume (linux_ops, ptid, step, signo);
- lp->stopped_by_watchpoint = 0;
+ linux_resume_one_lwp (lp, step, signo);
if (debug_linux_nat)
fprintf_unfiltered (gdb_stdlog,
@@ -2287,11 +2353,7 @@ linux_handle_syscall_trap (struct lwp_in
/* Note that gdbarch_get_syscall_number may access registers, hence
fill a regcache. */
- registers_changed ();
- if (linux_nat_prepare_to_resume != NULL)
- linux_nat_prepare_to_resume (lp);
- linux_ops->to_resume (linux_ops, pid_to_ptid (GET_LWP (lp->ptid)),
- lp->step, GDB_SIGNAL_0);
+ linux_resume_one_lwp (lp, lp->step, GDB_SIGNAL_0);
return 1;
}
@@ -2486,22 +2548,15 @@ linux_handle_extended_wait (struct lwp_i
fprintf_unfiltered (gdb_stdlog,
"LHEW: resuming new LWP %ld\n",
GET_LWP (new_lp->ptid));
- if (linux_nat_prepare_to_resume != NULL)
- linux_nat_prepare_to_resume (new_lp);
- linux_ops->to_resume (linux_ops, pid_to_ptid (new_pid),
- 0, GDB_SIGNAL_0);
- new_lp->stopped = 0;
+
+ linux_resume_one_lwp (new_lp, 0, GDB_SIGNAL_0);
}
}
if (debug_linux_nat)
fprintf_unfiltered (gdb_stdlog,
"LHEW: resuming parent LWP %d\n", pid);
- if (linux_nat_prepare_to_resume != NULL)
- linux_nat_prepare_to_resume (lp);
- linux_ops->to_resume (linux_ops, pid_to_ptid (GET_LWP (lp->ptid)),
- 0, GDB_SIGNAL_0);
-
+ linux_resume_one_lwp (lp, 0, GDB_SIGNAL_0);
return 1;
}
@@ -3382,12 +3437,7 @@ linux_nat_filter_event (int lwpid, int s
{
/* This is a delayed SIGSTOP. */
- registers_changed ();
-
- if (linux_nat_prepare_to_resume != NULL)
- linux_nat_prepare_to_resume (lp);
- linux_ops->to_resume (linux_ops, pid_to_ptid (GET_LWP (lp->ptid)),
- lp->step, GDB_SIGNAL_0);
+ linux_resume_one_lwp (lp, lp->step, GDB_SIGNAL_0);
if (debug_linux_nat)
fprintf_unfiltered (gdb_stdlog,
"LLW: %s %s, 0, 0 (discard SIGSTOP)\n",
@@ -3395,7 +3445,6 @@ linux_nat_filter_event (int lwpid, int s
"PTRACE_SINGLESTEP" : "PTRACE_CONT",
target_pid_to_str (lp->ptid));
- lp->stopped = 0;
gdb_assert (lp->resumed);
/* Discard the event. */
@@ -3416,11 +3465,7 @@ linux_nat_filter_event (int lwpid, int s
/* This is a delayed SIGINT. */
lp->ignore_sigint = 0;
- registers_changed ();
- if (linux_nat_prepare_to_resume != NULL)
- linux_nat_prepare_to_resume (lp);
- linux_ops->to_resume (linux_ops, pid_to_ptid (GET_LWP (lp->ptid)),
- lp->step, GDB_SIGNAL_0);
+ linux_resume_one_lwp (lp, lp->step, GDB_SIGNAL_0);
if (debug_linux_nat)
fprintf_unfiltered (gdb_stdlog,
"LLW: %s %s, 0, 0 (discard SIGINT)\n",
@@ -3428,7 +3473,6 @@ linux_nat_filter_event (int lwpid, int s
"PTRACE_SINGLESTEP" : "PTRACE_CONT",
target_pid_to_str (lp->ptid));
- lp->stopped = 0;
gdb_assert (lp->resumed);
/* Discard the event. */
@@ -3796,11 +3840,7 @@ retry:
other threads to run. On the other hand, not resuming
newly attached threads may cause an unwanted delay in
getting them running. */
- registers_changed ();
- if (linux_nat_prepare_to_resume != NULL)
- linux_nat_prepare_to_resume (lp);
- linux_ops->to_resume (linux_ops, pid_to_ptid (GET_LWP (lp->ptid)),
- lp->step, signo);
+ linux_resume_one_lwp (lp, lp->step, signo);
if (debug_linux_nat)
fprintf_unfiltered (gdb_stdlog,
"LLW: %s %s, %s (preempt 'handle')\n",
@@ -3810,7 +3850,6 @@ retry:
(signo != GDB_SIGNAL_0
? strsignal (gdb_signal_to_host (signo))
: "0"));
- lp->stopped = 0;
goto retry;
}
@@ -3935,32 +3974,41 @@ resume_stopped_resumed_lwps (struct lwp_
{
struct regcache *regcache = get_thread_regcache (lp->ptid);
struct gdbarch *gdbarch = get_regcache_arch (regcache);
- CORE_ADDR pc = regcache_read_pc (regcache);
+ volatile struct gdb_exception ex;
gdb_assert (is_executing (lp->ptid));
- /* Don't bother if there's a breakpoint at PC that we'd hit
- immediately, and we're not waiting for this LWP. */
- if (!ptid_match (lp->ptid, *wait_ptid_p))
+ TRY_CATCH (ex, RETURN_MASK_ERROR)
{
- if (breakpoint_inserted_here_p (get_regcache_aspace (regcache), pc))
- return 0;
- }
+ CORE_ADDR pc = regcache_read_pc (regcache);
+ int leave_stopped = 0;
- if (debug_linux_nat)
- fprintf_unfiltered (gdb_stdlog,
- "RSRL: resuming stopped-resumed LWP %s at %s: step=%d\n",
- target_pid_to_str (lp->ptid),
- paddress (gdbarch, pc),
- lp->step);
+ /* Don't bother if there's a breakpoint at PC that we'd hit
+ immediately, and we're not waiting for this LWP. */
+ if (!ptid_match (lp->ptid, *wait_ptid_p))
+ {
+ if (breakpoint_inserted_here_p (get_regcache_aspace (regcache), pc))
+ leave_stopped = 1;
+ }
- registers_changed ();
- if (linux_nat_prepare_to_resume != NULL)
- linux_nat_prepare_to_resume (lp);
- linux_ops->to_resume (linux_ops, pid_to_ptid (GET_LWP (lp->ptid)),
- lp->step, GDB_SIGNAL_0);
- lp->stopped = 0;
- lp->stopped_by_watchpoint = 0;
+ if (!leave_stopped)
+ {
+ if (debug_linux_nat)
+ fprintf_unfiltered (gdb_stdlog,
+ "RSRL: resuming stopped-resumed LWP %s at "
+ "%s: step=%d\n",
+ target_pid_to_str (lp->ptid),
+ paddress (gdbarch, pc),
+ lp->step);
+
+ linux_resume_one_lwp_throw (lp, lp->step, GDB_SIGNAL_0);
+ }
+ }
+ if (ex.reason < 0)
+ {
+ if (!check_ptrace_stopped_lwp_gone (lp))
+ throw_exception (ex);
+ }
}
return 0;
Index: gdb-7.6.1/gdb/testsuite/gdb.threads/thread-kills-process.c
===================================================================
--- /dev/null
+++ gdb-7.6.1/gdb/testsuite/gdb.threads/thread-kills-process.c
@@ -0,0 +1,62 @@
+/* Copyright 2015 Free Software Foundation, Inc.
+
+ This file is part of GDB.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>. */
+
+#include <pthread.h>
+#include <unistd.h>
+
+volatile int call_exit;
+static pthread_barrier_t barrier;
+#define NUMTHREADS 256
+
+void *
+thread_function (void *arg)
+{
+ pthread_barrier_wait (&barrier);
+
+ while (!call_exit)
+ usleep (1);
+
+ _exit (0);
+ return NULL;
+}
+
+void
+all_threads_started (void)
+{
+ call_exit = 1;
+}
+
+int
+main (int argc, char **argv)
+{
+ pthread_t threads[NUMTHREADS];
+ int i;
+
+ pthread_barrier_init (&barrier, NULL, NUMTHREADS + 1);
+
+ for (i = 0; i < NUMTHREADS; ++i)
+ pthread_create (&threads[i], NULL, thread_function, NULL);
+
+ pthread_barrier_wait (&barrier);
+
+ all_threads_started ();
+
+ for (i = 0; i < NUMTHREADS; ++i)
+ pthread_join (threads[i], NULL);
+
+ return 0;
+}
Index: gdb-7.6.1/gdb/testsuite/gdb.threads/thread-kills-process.exp
===================================================================
--- /dev/null
+++ gdb-7.6.1/gdb/testsuite/gdb.threads/thread-kills-process.exp
@@ -0,0 +1,70 @@
+# Copyright (C) 2015 Free Software Foundation, Inc.
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+standard_testfile
+
+if {[gdb_compile_pthreads "${srcdir}/${subdir}/${srcfile}" "${binfile}" executable {debug}] != "" } {
+ return -1
+}
+
+clean_restart ${binfile}
+if {![runto_main]} {
+ fail "Couldn't run to main"
+ return
+}
+
+gdb_breakpoint "all_threads_started"
+
+gdb_continue_to_breakpoint "all_threads_started"
+
+# Update the thread list, otherwise when testing against GDBserver,
+# GDB won't know about thread 2. (Only necessary with GDB < 7.9.)
+gdb_test "info threads" ".*"
+
+# Select any thread but the leader.
+gdb_test "thread 2" ".*" "switch to non-leader thread"
+
+# Delete breakpoints so that GDB doesn't switch back the to leader to
+# step over its breakpoint.
+delete_breakpoints
+
+# Let threads exit the process on next resume.
+gdb_test "p call_exit = 0" " = 0"
+
+# While GDB is busy resuming all threads one by one, one of the
+# threads should manage to exit the process. GDB should handle that
+# gracefully instead of erroring out.
+#
+# gdb_continue_to_end doesn't work with GDBserver until the
+# introduction of the "exit_is_reliable" board variable
+# (b477a5e649150) in GDB 7.7.
+#gdb_continue_to_end "" continue 1
+gdb_test "continue" "$inferior_exited_re normally.*"
+
+# On the buggy GDB where the "continue" above would error out, a
+# subsequent "detach" (e.g., the user tries to quit GDB, and quit
+# offers to detach) would hit this assertion:
+#
+# linux-nat.c:1869: internal-error: linux_nat_detach: Assertion `num_lwps (GET_PID (inferior_ptid)) == 1' failed.
+
+# That was a consequence of the original bug, but let's make sure that
+# even when "continue" is handled properly, detach doesn't stumble on
+# anything stale.
+gdb_test "detach" "The program is not being run\\." \
+ "detach after exit"
+
+# Likewise "continue".
+gdb_test "continue" "The program is not being run\\." \
+ "continue after exit"