commit e5d262effe3a87164308a3f37e61b32d0348692a Author: Tulio Magno Quites Machado Filho Date: Fri Nov 30 18:05:32 2018 -0200 Fix _dl_profile_fixup data-dependency issue (Bug 23690) There is a data-dependency between the fields of struct l_reloc_result and the field used as the initialization guard. Users of the guard expect writes to the structure to be observable when they also observe the guard initialized. The solution for this problem is to use an acquire and release load and store to ensure previous writes to the structure are observable if the guard is initialized. The previous implementation used DL_FIXUP_VALUE_ADDR (l_reloc_result->addr) as the initialization guard, making it impossible for some architectures to load and store it atomically, i.e. hppa and ia64, due to its larger size. This commit adds an unsigned int to l_reloc_result to be used as the new initialization guard of the struct, making it possible to load and store it atomically in all architectures. The fix ensures that the values observed in l_reloc_result are consistent and do not lead to crashes. The algorithm is documented in the code in elf/dl-runtime.c (_dl_profile_fixup). Not all data races have been eliminated. Tested with build-many-glibcs and on powerpc, powerpc64, and powerpc64le. [BZ #23690] * elf/dl-runtime.c (_dl_profile_fixup): Guarantee memory modification order when accessing reloc_result->addr. * include/link.h (reloc_result): Add field init. * nptl/Makefile (tests): Add tst-audit-threads. (modules-names): Add tst-audit-threads-mod1 and tst-audit-threads-mod2. Add rules to build tst-audit-threads. * nptl/tst-audit-threads-mod1.c: New file. * nptl/tst-audit-threads-mod2.c: Likewise. * nptl/tst-audit-threads.c: Likewise. * nptl/tst-audit-threads.h: Likewise. Signed-off-by: Tulio Magno Quites Machado Filho Reviewed-by: Carlos O'Donell (elf/dl-runtime.c adjusted here for lack of __builtin_expect cleanup, nptl/Makefile for the usual test-related conflicts.) 
diff --git a/elf/dl-runtime.c b/elf/dl-runtime.c index a42e3c4924e067ba..3678a98c98d726f3 100644 --- a/elf/dl-runtime.c +++ b/elf/dl-runtime.c @@ -183,10 +183,36 @@ _dl_profile_fixup ( /* This is the address in the array where we store the result of previous relocations. */ struct reloc_result *reloc_result = &l->l_reloc_result[reloc_index]; - DL_FIXUP_VALUE_TYPE *resultp = &reloc_result->addr; - DL_FIXUP_VALUE_TYPE value = *resultp; - if (DL_FIXUP_VALUE_CODE_ADDR (value) == 0) + /* CONCURRENCY NOTES: + + Multiple threads may be calling the same PLT sequence and with + LD_AUDIT enabled they will be calling into _dl_profile_fixup to + update the reloc_result with the result of the lazy resolution. + The reloc_result guard variable is init, and we use + acquire/release loads and stores to it to ensure that the results of + the structure are consistent with the loaded value of the guard. + This does not fix all of the data races that occur when two or more + threads read reloc_result->init with a value of zero and read + and write to that reloc_result concurrently. The expectation is + generally that while this is a data race it works because the + threads write the same values. Until the data races are fixed + there is a potential for problems to arise from these data races. + The reloc result updates should happen in parallel but there should + be an atomic RMW which does the final update to the real result + entry (see bug 23790). + + The following code uses reloc_result->init set to 0 to indicate if it is + the first time this object is being relocated, otherwise 1 which + indicates the object has already been relocated. + + Reading/Writing from/to reloc_result->init must not happen + before previous writes to reloc_result complete as they could + end-up with an incomplete struct. 
*/ + DL_FIXUP_VALUE_TYPE value; + unsigned int init = atomic_load_acquire (&reloc_result->init); + + if (init == 0) { /* This is the first time we have to relocate this object. */ const ElfW(Sym) *const symtab @@ -347,20 +373,32 @@ _dl_profile_fixup ( #endif /* Store the result for later runs. */ - if (__builtin_expect (! GLRO(dl_bind_not), 1)) - *resultp = value; + if (__glibc_likely (! GLRO(dl_bind_not))) + { + reloc_result->addr = value; + /* Guarantee all previous writes complete before + init is updated. See CONCURRENCY NOTES earlier */ + atomic_store_release (&reloc_result->init, 1); + } + init = 1; } + else + value = reloc_result->addr; /* By default we do not call the pltexit function. */ long int framesize = -1; + #ifdef SHARED /* Auditing checkpoint: report the PLT entering and allow the auditors to change the value. */ - if (DL_FIXUP_VALUE_CODE_ADDR (value) != 0 && GLRO(dl_naudit) > 0 + if (GLRO(dl_naudit) > 0 /* Don't do anything if no auditor wants to intercept this call. */ && (reloc_result->enterexit & LA_SYMB_NOPLTENTER) == 0) { + /* Sanity check: DL_FIXUP_VALUE_CODE_ADDR (value) should have been + initialized earlier in this function or in another thread. */ + assert (DL_FIXUP_VALUE_CODE_ADDR (value) != 0); ElfW(Sym) *defsym = ((ElfW(Sym) *) D_PTR (reloc_result->bound, l_info[DT_SYMTAB]) + reloc_result->boundndx); diff --git a/include/link.h b/include/link.h index d7590640aa9285e5..22d020d833ae3a7c 100644 --- a/include/link.h +++ b/include/link.h @@ -206,6 +206,10 @@ struct link_map unsigned int boundndx; uint32_t enterexit; unsigned int flags; + /* CONCURRENCY NOTE: This is used to guard the concurrent initialization + of the relocation result across multiple threads. See the more + detailed notes in elf/dl-runtime.c. */ + unsigned int init; } *l_reloc_result; /* Pointer to the version information if available. 
*/ diff --git a/nptl/Makefile b/nptl/Makefile index cf47a6f097916766..1b9639f3566a63fd 100644 --- a/nptl/Makefile +++ b/nptl/Makefile @@ -298,7 +298,7 @@ tests += tst-cancelx2 tst-cancelx3 tst-cancelx4 tst-cancelx5 \ endif ifeq ($(build-shared),yes) tests += tst-atfork2 tst-tls3 tst-tls4 tst-tls5 tst-_res1 tst-fini1 \ - tst-stackguard1 + tst-stackguard1 tst-audit-threads tests-nolibpthread += tst-fini1 ifeq ($(have-z-execstack),yes) tests += tst-execstack @@ -309,7 +309,7 @@ modules-names = tst-atfork2mod tst-tls3mod tst-tls4moda tst-tls4modb \ tst-tls5mod tst-tls5moda tst-tls5modb tst-tls5modc \ tst-tls5modd tst-tls5mode tst-tls5modf tst-stack4mod \ tst-_res1mod1 tst-_res1mod2 tst-execstack-mod tst-fini1mod \ - tst-join7mod + tst-join7mod tst-audit-threads-mod1 tst-audit-threads-mod2 extra-test-objs += $(addsuffix .os,$(strip $(modules-names))) tst-cleanup4aux.o test-extras += $(modules-names) tst-cleanup4aux test-modules = $(addprefix $(objpfx),$(addsuffix .so,$(modules-names))) @@ -627,6 +627,14 @@ $(objpfx)tst-oddstacklimit.out: $(objpfx)tst-oddstacklimit $(objpfx)tst-basic1 $(run-program-prefix) $< --command '$(host-built-program-cmd)' > $@ endif +# Protect against a build using -Wl,-z,now. +LDFLAGS-tst-audit-threads-mod1.so = -Wl,-z,lazy +LDFLAGS-tst-audit-threads-mod2.so = -Wl,-z,lazy +LDFLAGS-tst-audit-threads = -Wl,-z,lazy +$(objpfx)tst-audit-threads: $(objpfx)tst-audit-threads-mod2.so +$(objpfx)tst-audit-threads.out: $(objpfx)tst-audit-threads-mod1.so +tst-audit-threads-ENV = LD_AUDIT=$(objpfx)tst-audit-threads-mod1.so + # The tests here better do not run in parallel ifneq ($(filter %tests,$(MAKECMDGOALS)),) .NOTPARALLEL: diff --git a/nptl/tst-audit-threads-mod1.c b/nptl/tst-audit-threads-mod1.c new file mode 100644 index 0000000000000000..615d5ee5121962df --- /dev/null +++ b/nptl/tst-audit-threads-mod1.c @@ -0,0 +1,74 @@ +/* Dummy audit library for test-audit-threads. + + Copyright (C) 2018 Free Software Foundation, Inc. 
+ This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#include +#include +#include +#include +#include + +/* We must use a dummy LD_AUDIT module to force the dynamic loader to + *not* update the real PLT, and instead use a cached value for the + lazy resolution result. It is the update of that cached value that + we are testing for correctness by doing this. */ + +/* Library to be audited. */ +#define LIB "tst-audit-threads-mod2.so" +/* CALLNUM is the highest index N among the retNumN functions. */ +#define CALLNUM 7999 + +#define CONCATX(a, b) __CONCAT (a, b) + +static int previous = 0; + +unsigned int +la_version (unsigned int ver) +{ + return 1; +} + +unsigned int +la_objopen (struct link_map *map, Lmid_t lmid, uintptr_t *cookie) +{ + return LA_FLG_BINDTO | LA_FLG_BINDFROM; +} + +uintptr_t +CONCATX(la_symbind, __ELF_NATIVE_CLASS) (ElfW(Sym) *sym, + unsigned int ndx, + uintptr_t *refcook, + uintptr_t *defcook, + unsigned int *flags, + const char *symname) +{ + const char * retnum = "retNum"; + char * num = strstr (symname, retnum); + int n; + /* Validate if the symbols are getting called in the correct order. + This code is here to verify binutils does not optimize out the PLT + entries that require the symbol binding. 
*/ + if (num != NULL) + { + n = atoi (num + strlen (retnum)); + assert (n >= previous); + assert (n <= CALLNUM); + previous = n; + } + return sym->st_value; +} diff --git a/nptl/tst-audit-threads-mod2.c b/nptl/tst-audit-threads-mod2.c new file mode 100644 index 0000000000000000..f9817dd3dc7f4910 --- /dev/null +++ b/nptl/tst-audit-threads-mod2.c @@ -0,0 +1,22 @@ +/* Shared object with a huge number of functions for tst-audit-threads. + + Copyright (C) 2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. 
+ + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +/* This test uses a dummy LD_AUDIT library (test-audit-threads-mod1) and a + library with a huge number of functions in order to validate lazy symbol + binding with an audit library. We use one thread per CPU to test that + concurrent lazy resolution does not have any defects which would cause + the process to fail. We use an LD_AUDIT library to force the testing of + the relocation resolution caching code in the dynamic loader i.e. + _dl_runtime_profile and _dl_profile_fixup. */ + +#include +#include +#include +#include + +static int do_test (void); + +/* This test usually takes less than 3s to run. However, there are cases that + take up to 30s. */ +#define TIMEOUT 60 +#define TEST_FUNCTION do_test () +#include "../test-skeleton.c" + +/* Declare the functions we are going to call. */ +#define externnum +#include "tst-audit-threads.h" +#undef externnum + +int num_threads; +pthread_barrier_t barrier; + +void +sync_all (int num) +{ + pthread_barrier_wait (&barrier); +} + +void +call_all_ret_nums (void) +{ + /* Call each function one at a time from all threads. */ +#define callnum +#include "tst-audit-threads.h" +#undef callnum +} + +void * +thread_main (void *unused) +{ + call_all_ret_nums (); + return NULL; +} + +#define STR2(X) #X +#define STR(X) STR2(X) + +static int +do_test (void) +{ + int i; + pthread_t *threads; + + num_threads = get_nprocs (); + if (num_threads <= 1) + num_threads = 2; + + /* Used to synchronize all the threads after calling each retNumN. 
*/ + xpthread_barrier_init (&barrier, NULL, num_threads); + + threads = (pthread_t *) xcalloc (num_threads, sizeof(pthread_t)); + for (i = 0; i < num_threads; i++) + threads[i] = xpthread_create(NULL, thread_main, NULL); + + for (i = 0; i < num_threads; i++) + xpthread_join(threads[i]); + + free (threads); + + return 0; +} diff --git a/nptl/tst-audit-threads.h b/nptl/tst-audit-threads.h new file mode 100644 index 0000000000000000..1c9ecc08dfcd3e65 --- /dev/null +++ b/nptl/tst-audit-threads.h @@ -0,0 +1,92 @@ +/* Helper header for test-audit-threads. + + Copyright (C) 2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +/* We use this helper to create a large number of functions, all of + which will be resolved lazily and thus have their PLT updated. + This is done to provide enough functions that we can statistically + observe a thread vs. PLT resolution failure if one exists. 
*/ + +#define CONCAT(a, b) a ## b +#define NUM(x, y) CONCAT (x, y) + +#define FUNC10(x) \ + FUNC (NUM (x, 0)); \ + FUNC (NUM (x, 1)); \ + FUNC (NUM (x, 2)); \ + FUNC (NUM (x, 3)); \ + FUNC (NUM (x, 4)); \ + FUNC (NUM (x, 5)); \ + FUNC (NUM (x, 6)); \ + FUNC (NUM (x, 7)); \ + FUNC (NUM (x, 8)); \ + FUNC (NUM (x, 9)) + +#define FUNC100(x) \ + FUNC10 (NUM (x, 0)); \ + FUNC10 (NUM (x, 1)); \ + FUNC10 (NUM (x, 2)); \ + FUNC10 (NUM (x, 3)); \ + FUNC10 (NUM (x, 4)); \ + FUNC10 (NUM (x, 5)); \ + FUNC10 (NUM (x, 6)); \ + FUNC10 (NUM (x, 7)); \ + FUNC10 (NUM (x, 8)); \ + FUNC10 (NUM (x, 9)) + +#define FUNC1000(x) \ + FUNC100 (NUM (x, 0)); \ + FUNC100 (NUM (x, 1)); \ + FUNC100 (NUM (x, 2)); \ + FUNC100 (NUM (x, 3)); \ + FUNC100 (NUM (x, 4)); \ + FUNC100 (NUM (x, 5)); \ + FUNC100 (NUM (x, 6)); \ + FUNC100 (NUM (x, 7)); \ + FUNC100 (NUM (x, 8)); \ + FUNC100 (NUM (x, 9)) + +#define FUNC7000() \ + FUNC1000 (1); \ + FUNC1000 (2); \ + FUNC1000 (3); \ + FUNC1000 (4); \ + FUNC1000 (5); \ + FUNC1000 (6); \ + FUNC1000 (7); + +#ifdef FUNC +# undef FUNC +#endif + +#ifdef externnum +# define FUNC(x) extern int CONCAT (retNum, x) (void) +#endif + +#ifdef definenum +# define FUNC(x) int CONCAT (retNum, x) (void) { return x; } +#endif + +#ifdef callnum +# define FUNC(x) CONCAT (retNum, x) (); sync_all (x) +#endif + +/* A value of 7000 functions is chosen as an arbitrarily large + number of functions that will allow us enough attempts to + verify lazy resolution operation. */ +FUNC7000 ();