b1dca6
commit ffb17e7ba3a5ba9632cee97330b325072fbe41dd
b1dca6
Author: Szabolcs Nagy <szabolcs.nagy@arm.com>
b1dca6
Date:   Wed Jun 10 13:40:40 2020 +0100
b1dca6
b1dca6
    rtld: Avoid using up static TLS surplus for optimizations [BZ #25051]
b1dca6
    
b1dca6
    On some targets static TLS surplus area can be used opportunistically
b1dca6
    for dynamically loaded modules such that the TLS access then becomes
b1dca6
    faster (TLSDESC and powerpc TLS optimization). However we don't want
b1dca6
    all surplus TLS to be used for this optimization because dynamically
b1dca6
    loaded modules with initial-exec model TLS can only use surplus TLS.
b1dca6
    
b1dca6
    The new contract for surplus static TLS use is:
b1dca6
    
b1dca6
    - libc.so can have up to 192 bytes of IE TLS,
b1dca6
    - other system libraries together can have up to 144 bytes of IE TLS.
b1dca6
    - Some "optional" static TLS is available for opportunistic use.
b1dca6
    
b1dca6
    The optional TLS is now tunable: rtld.optional_static_tls, so users
b1dca6
    can directly affect the allocated static TLS size. (Note that module
b1dca6
    unloading with dlclose does not reclaim static TLS. After the optional
b1dca6
    TLS runs out, TLS access is no longer optimized to use static TLS.)
b1dca6
    
b1dca6
    The default setting of rtld.optional_static_tls is 512 so the surplus
b1dca6
    TLS is 3*192 + 4*144 + 512 = 1664 by default, the same as before.
b1dca6
    
b1dca6
    Fixes BZ #25051.
b1dca6
    
b1dca6
    Tested on aarch64-linux-gnu and x86_64-linux-gnu.
b1dca6
    
b1dca6
    Reviewed-by: Carlos O'Donell <carlos@redhat.com>
b1dca6
b1dca6
Conflicts:
b1dca6
	elf/Makefile
b1dca6
	  (Missing __libc_single_threaded downstream.)
b1dca6
b1dca6
diff --git a/csu/libc-tls.c b/csu/libc-tls.c
b1dca6
index 6f2a47dc86222407..76aa1b98ea059a43 100644
b1dca6
--- a/csu/libc-tls.c
b1dca6
+++ b/csu/libc-tls.c
b1dca6
@@ -62,6 +62,9 @@ size_t _dl_tls_static_align;
b1dca6
    loaded modules with IE-model TLS or for TLSDESC optimization.
b1dca6
    See comments in elf/dl-tls.c where it is initialized.  */
b1dca6
 size_t _dl_tls_static_surplus;
b1dca6
+/* Remaining amount of static TLS that may be used for optimizing
b1dca6
+   dynamic TLS access (e.g. with TLSDESC).  */
b1dca6
+size_t _dl_tls_static_optional;
b1dca6
 
b1dca6
 /* Generation counter for the dtv.  */
b1dca6
 size_t _dl_tls_generation;
b1dca6
diff --git a/elf/Makefile b/elf/Makefile
b1dca6
index cbced7605ebe2443..8b96bfefd852b79f 100644
b1dca6
--- a/elf/Makefile
b1dca6
+++ b/elf/Makefile
b1dca6
@@ -197,7 +197,8 @@ tests += restest1 preloadtest loadfail multiload origtest resolvfail \
b1dca6
 	 tst-auditmany tst-initfinilazyfail \
b1dca6
 	 tst-dlopenfail tst-dlopenfail-2 \
b1dca6
 	 tst-filterobj tst-filterobj-dlopen tst-auxobj tst-auxobj-dlopen \
b1dca6
-	 tst-audit14 tst-audit15 tst-audit16
b1dca6
+	 tst-audit14 tst-audit15 tst-audit16 \
b1dca6
+	 tst-tls-ie tst-tls-ie-dlmopen
b1dca6
 #	 reldep9
b1dca6
 tests-internal += loadtest unload unload2 circleload1 \
b1dca6
 	 neededtest neededtest2 neededtest3 neededtest4 \
b1dca6
@@ -313,7 +314,10 @@ modules-names = testobj1 testobj2 testobj3 testobj4 testobj5 testobj6 \
b1dca6
 		tst-dlopenfailmod1 tst-dlopenfaillinkmod tst-dlopenfailmod2 \
b1dca6
 		tst-dlopenfailmod3 \
b1dca6
 		tst-filterobj-flt tst-filterobj-aux tst-filterobj-filtee \
b1dca6
-		tst-auditlogmod-1 tst-auditlogmod-2 tst-auditlogmod-3
b1dca6
+		tst-auditlogmod-1 tst-auditlogmod-2 tst-auditlogmod-3 \
b1dca6
+		tst-tls-ie-mod0 tst-tls-ie-mod1 tst-tls-ie-mod2 \
b1dca6
+		tst-tls-ie-mod3 tst-tls-ie-mod4 tst-tls-ie-mod5 \
b1dca6
+		tst-tls-ie-mod6
b1dca6
 
b1dca6
 # Most modules build with _ISOMAC defined, but those filtered out
b1dca6
 # depend on internal headers.
b1dca6
@@ -1690,3 +1694,23 @@ $(objpfx)tst-auxobj: $(objpfx)tst-filterobj-aux.so
b1dca6
 $(objpfx)tst-auxobj-dlopen: $(libdl)
b1dca6
 $(objpfx)tst-auxobj.out: $(objpfx)tst-filterobj-filtee.so
b1dca6
 $(objpfx)tst-auxobj-dlopen.out: $(objpfx)tst-filterobj-filtee.so
b1dca6
+
b1dca6
+$(objpfx)tst-tls-ie: $(libdl) $(shared-thread-library)
b1dca6
+$(objpfx)tst-tls-ie.out: \
b1dca6
+  $(objpfx)tst-tls-ie-mod0.so \
b1dca6
+  $(objpfx)tst-tls-ie-mod1.so \
b1dca6
+  $(objpfx)tst-tls-ie-mod2.so \
b1dca6
+  $(objpfx)tst-tls-ie-mod3.so \
b1dca6
+  $(objpfx)tst-tls-ie-mod4.so \
b1dca6
+  $(objpfx)tst-tls-ie-mod5.so \
b1dca6
+  $(objpfx)tst-tls-ie-mod6.so
b1dca6
+
b1dca6
+$(objpfx)tst-tls-ie-dlmopen: $(libdl) $(shared-thread-library)
b1dca6
+$(objpfx)tst-tls-ie-dlmopen.out: \
b1dca6
+  $(objpfx)tst-tls-ie-mod0.so \
b1dca6
+  $(objpfx)tst-tls-ie-mod1.so \
b1dca6
+  $(objpfx)tst-tls-ie-mod2.so \
b1dca6
+  $(objpfx)tst-tls-ie-mod3.so \
b1dca6
+  $(objpfx)tst-tls-ie-mod4.so \
b1dca6
+  $(objpfx)tst-tls-ie-mod5.so \
b1dca6
+  $(objpfx)tst-tls-ie-mod6.so
b1dca6
diff --git a/elf/dl-reloc.c b/elf/dl-reloc.c
b1dca6
index afeace4d3e49180c..c6139b89d4ecddc8 100644
b1dca6
--- a/elf/dl-reloc.c
b1dca6
+++ b/elf/dl-reloc.c
b1dca6
@@ -39,13 +39,16 @@
b1dca6
 /* We are trying to perform a static TLS relocation in MAP, but it was
b1dca6
    dynamically loaded.  This can only work if there is enough surplus in
b1dca6
    the static TLS area already allocated for each running thread.  If this
b1dca6
-   object's TLS segment is too big to fit, we fail.  If it fits,
b1dca6
-   we set MAP->l_tls_offset and return.
b1dca6
-   This function intentionally does not return any value but signals error
b1dca6
-   directly, as static TLS should be rare and code handling it should
b1dca6
-   not be inlined as much as possible.  */
b1dca6
+   object's TLS segment is too big to fit, we fail with -1.  If it fits,
b1dca6
+   we set MAP->l_tls_offset and return 0.
b1dca6
+   A portion of the surplus static TLS can be optionally used to optimize
b1dca6
+   dynamic TLS access (with TLSDESC or powerpc TLS optimizations).
b1dca6
+   If OPTIONAL is true then TLS is allocated for such optimization and
b1dca6
+   the caller must have a fallback in case the optional portion of surplus
b1dca6
+   TLS runs out.  If OPTIONAL is false then the entire surplus TLS area is
b1dca6
+   considered and the allocation only fails if that runs out.  */
b1dca6
 int
b1dca6
-_dl_try_allocate_static_tls (struct link_map *map)
b1dca6
+_dl_try_allocate_static_tls (struct link_map *map, bool optional)
b1dca6
 {
b1dca6
   /* If we've already used the variable with dynamic access, or if the
b1dca6
      alignment requirements are too high, fail.  */
b1dca6
@@ -68,8 +71,14 @@ _dl_try_allocate_static_tls (struct link_map *map)
b1dca6
 
b1dca6
   size_t n = (freebytes - blsize) / map->l_tls_align;
b1dca6
 
b1dca6
-  size_t offset = GL(dl_tls_static_used) + (freebytes - n * map->l_tls_align
b1dca6
-					    - map->l_tls_firstbyte_offset);
b1dca6
+  /* Account optional static TLS surplus usage.  */
b1dca6
+  size_t use = freebytes - n * map->l_tls_align - map->l_tls_firstbyte_offset;
b1dca6
+  if (optional && use > GL(dl_tls_static_optional))
b1dca6
+    goto fail;
b1dca6
+  else if (optional)
b1dca6
+    GL(dl_tls_static_optional) -= use;
b1dca6
+
b1dca6
+  size_t offset = GL(dl_tls_static_used) + use;
b1dca6
 
b1dca6
   map->l_tls_offset = GL(dl_tls_static_used) = offset;
b1dca6
 #elif TLS_DTV_AT_TP
b1dca6
@@ -83,6 +92,13 @@ _dl_try_allocate_static_tls (struct link_map *map)
b1dca6
   if (used > GL(dl_tls_static_size))
b1dca6
     goto fail;
b1dca6
 
b1dca6
+  /* Account optional static TLS surplus usage.  */
b1dca6
+  size_t use = used - GL(dl_tls_static_used);
b1dca6
+  if (optional && use > GL(dl_tls_static_optional))
b1dca6
+    goto fail;
b1dca6
+  else if (optional)
b1dca6
+    GL(dl_tls_static_optional) -= use;
b1dca6
+
b1dca6
   map->l_tls_offset = offset;
b1dca6
   map->l_tls_firstbyte_offset = GL(dl_tls_static_used);
b1dca6
   GL(dl_tls_static_used) = used;
b1dca6
@@ -110,12 +126,15 @@ _dl_try_allocate_static_tls (struct link_map *map)
b1dca6
   return 0;
b1dca6
 }
b1dca6
 
b1dca6
+/* This function intentionally does not return any value but signals error
b1dca6
+   directly, as static TLS should be rare and code handling it should
b1dca6
+   not be inlined as much as possible.  */
b1dca6
 void
b1dca6
 __attribute_noinline__
b1dca6
 _dl_allocate_static_tls (struct link_map *map)
b1dca6
 {
b1dca6
   if (map->l_tls_offset == FORCED_DYNAMIC_TLS_OFFSET
b1dca6
-      || _dl_try_allocate_static_tls (map))
b1dca6
+      || _dl_try_allocate_static_tls (map, false))
b1dca6
     {
b1dca6
       _dl_signal_error (0, map->l_name, NULL, N_("\
b1dca6
 cannot allocate memory in static TLS block"));
b1dca6
diff --git a/elf/dl-tls.c b/elf/dl-tls.c
b1dca6
index cfda76f6de96df57..4f8c35b7d37bfc18 100644
b1dca6
--- a/elf/dl-tls.c
b1dca6
+++ b/elf/dl-tls.c
b1dca6
@@ -60,8 +60,6 @@
b1dca6
    This should be large enough to cover runtime libraries of the
b1dca6
    compiler such as libgomp and libraries in libc other than libc.so.  */
b1dca6
 #define OTHER_IE_TLS 144
b1dca6
-/* Size of additional surplus TLS, placeholder for TLS optimizations.  */
b1dca6
-#define OPT_SURPLUS_TLS 512
b1dca6
 
b1dca6
 /* Calculate the size of the static TLS surplus, when the given
b1dca6
    number of audit modules are loaded.  Must be called after the
b1dca6
@@ -69,13 +67,15 @@
b1dca6
 void
b1dca6
 _dl_tls_static_surplus_init (size_t naudit)
b1dca6
 {
b1dca6
-  size_t nns;
b1dca6
+  size_t nns, opt_tls;
b1dca6
 
b1dca6
 #if HAVE_TUNABLES
b1dca6
   nns = TUNABLE_GET (nns, size_t, NULL);
b1dca6
+  opt_tls = TUNABLE_GET (optional_static_tls, size_t, NULL);
b1dca6
 #else
b1dca6
   /* Default values of the tunables.  */
b1dca6
   nns = 4;
b1dca6
+  opt_tls = 512;
b1dca6
 #endif
b1dca6
   if (nns > DL_NNS)
b1dca6
     nns = DL_NNS;
b1dca6
@@ -84,9 +84,10 @@ _dl_tls_static_surplus_init (size_t naudit)
b1dca6
 		      (unsigned long) naudit, (unsigned long) (DL_NNS - nns));
b1dca6
   nns += naudit;
b1dca6
 
b1dca6
+  GL(dl_tls_static_optional) = opt_tls;
b1dca6
   GLRO(dl_tls_static_surplus) = ((nns - 1) * LIBC_IE_TLS
b1dca6
 				 + nns * OTHER_IE_TLS
b1dca6
-				 + OPT_SURPLUS_TLS);
b1dca6
+				 + opt_tls);
b1dca6
 }
b1dca6
 
b1dca6
 /* Out-of-memory handler.  */
b1dca6
diff --git a/elf/dl-tunables.list b/elf/dl-tunables.list
b1dca6
index 7337fb85062c91a7..6408a8e5ae92d2c6 100644
b1dca6
--- a/elf/dl-tunables.list
b1dca6
+++ b/elf/dl-tunables.list
b1dca6
@@ -134,5 +134,10 @@ glibc {
b1dca6
       maxval: 16
b1dca6
       default: 4
b1dca6
     }
b1dca6
+    optional_static_tls {
b1dca6
+      type: SIZE_T
b1dca6
+      minval: 0
b1dca6
+      default: 512
b1dca6
+    }
b1dca6
   }
b1dca6
 }
b1dca6
diff --git a/elf/dynamic-link.h b/elf/dynamic-link.h
b1dca6
index 9e9d5a3b28bc06c5..2fc3c91b7defe84e 100644
b1dca6
--- a/elf/dynamic-link.h
b1dca6
+++ b/elf/dynamic-link.h
b1dca6
@@ -40,9 +40,10 @@
b1dca6
     (__builtin_expect ((sym_map)->l_tls_offset				\
b1dca6
 		       != FORCED_DYNAMIC_TLS_OFFSET, 1)			\
b1dca6
      && (__builtin_expect ((sym_map)->l_tls_offset != NO_TLS_OFFSET, 1)	\
b1dca6
-	 || _dl_try_allocate_static_tls (sym_map) == 0))
b1dca6
+	 || _dl_try_allocate_static_tls (sym_map, true) == 0))
b1dca6
 
b1dca6
-int _dl_try_allocate_static_tls (struct link_map *map) attribute_hidden;
b1dca6
+int _dl_try_allocate_static_tls (struct link_map *map, bool optional)
b1dca6
+  attribute_hidden;
b1dca6
 
b1dca6
 #include <elf.h>
b1dca6
 
b1dca6
diff --git a/elf/tst-tls-ie-dlmopen.c b/elf/tst-tls-ie-dlmopen.c
b1dca6
new file mode 100644
b1dca6
index 0000000000000000..c7b5c688e362c861
b1dca6
--- /dev/null
b1dca6
+++ b/elf/tst-tls-ie-dlmopen.c
b1dca6
@@ -0,0 +1,112 @@
b1dca6
+/* Test dlopen of modules with initial-exec TLS after dlmopen.
b1dca6
+   Copyright (C) 2016-2020 Free Software Foundation, Inc.
b1dca6
+   This file is part of the GNU C Library.
b1dca6
+
b1dca6
+   The GNU C Library is free software; you can redistribute it and/or
b1dca6
+   modify it under the terms of the GNU Lesser General Public
b1dca6
+   License as published by the Free Software Foundation; either
b1dca6
+   version 2.1 of the License, or (at your option) any later version.
b1dca6
+
b1dca6
+   The GNU C Library is distributed in the hope that it will be useful,
b1dca6
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
b1dca6
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
b1dca6
+   Lesser General Public License for more details.
b1dca6
+
b1dca6
+   You should have received a copy of the GNU Lesser General Public
b1dca6
+   License along with the GNU C Library; if not, see
b1dca6
+   <https://www.gnu.org/licenses/>.  */
b1dca6
+
b1dca6
+/* This test tries to check that surplus static TLS is not used up for
b1dca6
+   dynamic TLS optimizations and 4*144 = 576 bytes of static TLS is
b1dca6
+   still available for dlopening modules with initial-exec TLS after 3
b1dca6
+   new dlmopen namespaces are created.  It depends on rtld.nns=4 and
b1dca6
+   rtld.optional_static_tls=512 tunable settings.  */
b1dca6
+
b1dca6
+#include <errno.h>
b1dca6
+#include <pthread.h>
b1dca6
+#include <stdio.h>
b1dca6
+#include <stdlib.h>
b1dca6
+#include <string.h>
b1dca6
+
b1dca6
+static int do_test (void);
b1dca6
+#include <support/xthread.h>
b1dca6
+#include <support/xdlfcn.h>
b1dca6
+#include <support/check.h>
b1dca6
+#include <support/test-driver.c>
b1dca6
+
b1dca6
+/* Have some big TLS in the main exe: should not use surplus TLS.  */
b1dca6
+__thread char maintls[1000];
b1dca6
+
b1dca6
+static pthread_barrier_t barrier;
b1dca6
+
b1dca6
+/* Forces multi-threaded behaviour.  */
b1dca6
+static void *
b1dca6
+blocked_thread_func (void *closure)
b1dca6
+{
b1dca6
+  xpthread_barrier_wait (&barrier);
b1dca6
+  /* TLS load and access tests run here in the main thread.  */
b1dca6
+  xpthread_barrier_wait (&barrier);
b1dca6
+  return NULL;
b1dca6
+}
b1dca6
+
b1dca6
+static void *
b1dca6
+load_and_access (Lmid_t lmid, const char *mod, const char *func)
b1dca6
+{
b1dca6
+  /* Load module with TLS.  */
b1dca6
+  void *p = xdlmopen (lmid, mod, RTLD_NOW);
b1dca6
+  /* Access the TLS variable to ensure it is allocated.  */
b1dca6
+  void (*f) (void) = (void (*) (void))xdlsym (p, func);
b1dca6
+  f ();
b1dca6
+  return p;
b1dca6
+}
b1dca6
+
b1dca6
+static int
b1dca6
+do_test (void)
b1dca6
+{
b1dca6
+  void *mods[5];
b1dca6
+
b1dca6
+  {
b1dca6
+    int ret = pthread_barrier_init (&barrier, NULL, 2);
b1dca6
+    if (ret != 0)
b1dca6
+      {
b1dca6
+        errno = ret;
b1dca6
+        printf ("error: pthread_barrier_init: %m\n");
b1dca6
+        exit (1);
b1dca6
+      }
b1dca6
+  }
b1dca6
+
b1dca6
+  pthread_t blocked_thread = xpthread_create (NULL, blocked_thread_func, NULL);
b1dca6
+  xpthread_barrier_wait (&barrier);
b1dca6
+
b1dca6
+  printf ("maintls[%zu]:\t %p .. %p\n",
b1dca6
+	   sizeof maintls, maintls, maintls + sizeof maintls);
b1dca6
+  memset (maintls, 1, sizeof maintls);
b1dca6
+
b1dca6
+  /* Load modules with dynamic TLS (use surplus static TLS for libc
b1dca6
+     in new namespaces and possibly for TLS optimizations too).  */
b1dca6
+  mods[0] = load_and_access (LM_ID_BASE, "tst-tls-ie-mod0.so", "access0");
b1dca6
+  mods[1] = load_and_access (LM_ID_NEWLM, "tst-tls-ie-mod1.so", "access1");
b1dca6
+  mods[2] = load_and_access (LM_ID_NEWLM, "tst-tls-ie-mod2.so", "access2");
b1dca6
+  mods[3] = load_and_access (LM_ID_NEWLM, "tst-tls-ie-mod3.so", "access3");
b1dca6
+  /* Load modules with initial-exec TLS (can only use surplus static TLS).  */
b1dca6
+  mods[4] = load_and_access (LM_ID_BASE, "tst-tls-ie-mod6.so", "access6");
b1dca6
+
b1dca6
+  /* Here 576 bytes + 3 * libc use of surplus static TLS is in use so less
b1dca6
+     than 1024 bytes are available (exact number depends on TLS optimizations
b1dca6
+     and the libc TLS use).  */
b1dca6
+  printf ("The next dlmopen should fail...\n");
b1dca6
+  void *p = dlmopen (LM_ID_BASE, "tst-tls-ie-mod4.so", RTLD_NOW);
b1dca6
+  if (p != NULL)
b1dca6
+    FAIL_EXIT1 ("error: expected dlmopen to fail because there is "
b1dca6
+		"not enough surplus static TLS.\n");
b1dca6
+  printf ("...OK failed with: %s.\n", dlerror ());
b1dca6
+
b1dca6
+  xpthread_barrier_wait (&barrier);
b1dca6
+  xpthread_join (blocked_thread);
b1dca6
+
b1dca6
+  /* Close the modules.  */
b1dca6
+  for (int i = 0; i < 5; ++i)
b1dca6
+    xdlclose (mods[i]);
b1dca6
+
b1dca6
+  return 0;
b1dca6
+}
b1dca6
diff --git a/elf/tst-tls-ie-mod.h b/elf/tst-tls-ie-mod.h
b1dca6
new file mode 100644
b1dca6
index 0000000000000000..46b362a9b783d214
b1dca6
--- /dev/null
b1dca6
+++ b/elf/tst-tls-ie-mod.h
b1dca6
@@ -0,0 +1,40 @@
b1dca6
+/* Module with specified TLS size and model.
b1dca6
+   Copyright (C) 2020 Free Software Foundation, Inc.
b1dca6
+   This file is part of the GNU C Library.
b1dca6
+
b1dca6
+   The GNU C Library is free software; you can redistribute it and/or
b1dca6
+   modify it under the terms of the GNU Lesser General Public
b1dca6
+   License as published by the Free Software Foundation; either
b1dca6
+   version 2.1 of the License, or (at your option) any later version.
b1dca6
+
b1dca6
+   The GNU C Library is distributed in the hope that it will be useful,
b1dca6
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
b1dca6
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
b1dca6
+   Lesser General Public License for more details.
b1dca6
+
b1dca6
+   You should have received a copy of the GNU Lesser General Public
b1dca6
+   License along with the GNU C Library; if not, see
b1dca6
+   <https://www.gnu.org/licenses/>.  */
b1dca6
+
b1dca6
+/* This file is parameterized by macros N, SIZE and MODEL.  */
b1dca6
+
b1dca6
+#include <stdio.h>
b1dca6
+#include <string.h>
b1dca6
+
b1dca6
+#define CONCATX(x, y) x ## y
b1dca6
+#define CONCAT(x, y) CONCATX (x, y)
b1dca6
+#define STRX(x) #x
b1dca6
+#define STR(x) STRX (x)
b1dca6
+
b1dca6
+#define VAR CONCAT (var, N)
b1dca6
+
b1dca6
+__attribute__ ((aligned (8), tls_model (MODEL)))
b1dca6
+__thread char VAR[SIZE];
b1dca6
+
b1dca6
+void
b1dca6
+CONCAT (access, N) (void)
b1dca6
+{
b1dca6
+  printf (STR (VAR) "[%d]:\t %p .. %p " MODEL "\n", SIZE, VAR, VAR + SIZE);
b1dca6
+  fflush (stdout);
b1dca6
+  memset (VAR, 1, SIZE);
b1dca6
+}
b1dca6
diff --git a/elf/tst-tls-ie-mod0.c b/elf/tst-tls-ie-mod0.c
b1dca6
new file mode 100644
b1dca6
index 0000000000000000..2450686e400e1141
b1dca6
--- /dev/null
b1dca6
+++ b/elf/tst-tls-ie-mod0.c
b1dca6
@@ -0,0 +1,4 @@
b1dca6
+#define N 0
b1dca6
+#define SIZE 480
b1dca6
+#define MODEL "global-dynamic"
b1dca6
+#include "tst-tls-ie-mod.h"
b1dca6
diff --git a/elf/tst-tls-ie-mod1.c b/elf/tst-tls-ie-mod1.c
b1dca6
new file mode 100644
b1dca6
index 0000000000000000..849ff91e53b0a518
b1dca6
--- /dev/null
b1dca6
+++ b/elf/tst-tls-ie-mod1.c
b1dca6
@@ -0,0 +1,4 @@
b1dca6
+#define N 1
b1dca6
+#define SIZE 120
b1dca6
+#define MODEL "global-dynamic"
b1dca6
+#include "tst-tls-ie-mod.h"
b1dca6
diff --git a/elf/tst-tls-ie-mod2.c b/elf/tst-tls-ie-mod2.c
b1dca6
new file mode 100644
b1dca6
index 0000000000000000..23915ab67bab0ada
b1dca6
--- /dev/null
b1dca6
+++ b/elf/tst-tls-ie-mod2.c
b1dca6
@@ -0,0 +1,4 @@
b1dca6
+#define N 2
b1dca6
+#define SIZE 24
b1dca6
+#define MODEL "global-dynamic"
b1dca6
+#include "tst-tls-ie-mod.h"
b1dca6
diff --git a/elf/tst-tls-ie-mod3.c b/elf/tst-tls-ie-mod3.c
b1dca6
new file mode 100644
b1dca6
index 0000000000000000..5395f844a5999ea9
b1dca6
--- /dev/null
b1dca6
+++ b/elf/tst-tls-ie-mod3.c
b1dca6
@@ -0,0 +1,4 @@
b1dca6
+#define N 3
b1dca6
+#define SIZE 16
b1dca6
+#define MODEL "global-dynamic"
b1dca6
+#include "tst-tls-ie-mod.h"
b1dca6
diff --git a/elf/tst-tls-ie-mod4.c b/elf/tst-tls-ie-mod4.c
b1dca6
new file mode 100644
b1dca6
index 0000000000000000..93ac2eacae292d86
b1dca6
--- /dev/null
b1dca6
+++ b/elf/tst-tls-ie-mod4.c
b1dca6
@@ -0,0 +1,4 @@
b1dca6
+#define N 4
b1dca6
+#define SIZE 1024
b1dca6
+#define MODEL "initial-exec"
b1dca6
+#include "tst-tls-ie-mod.h"
b1dca6
diff --git a/elf/tst-tls-ie-mod5.c b/elf/tst-tls-ie-mod5.c
b1dca6
new file mode 100644
b1dca6
index 0000000000000000..84b3fd285b5b5a3e
b1dca6
--- /dev/null
b1dca6
+++ b/elf/tst-tls-ie-mod5.c
b1dca6
@@ -0,0 +1,4 @@
b1dca6
+#define N 5
b1dca6
+#define SIZE 128
b1dca6
+#define MODEL "initial-exec"
b1dca6
+#include "tst-tls-ie-mod.h"
b1dca6
diff --git a/elf/tst-tls-ie-mod6.c b/elf/tst-tls-ie-mod6.c
b1dca6
new file mode 100644
b1dca6
index 0000000000000000..c736bf0684f3b08f
b1dca6
--- /dev/null
b1dca6
+++ b/elf/tst-tls-ie-mod6.c
b1dca6
@@ -0,0 +1,4 @@
b1dca6
+#define N 6
b1dca6
+#define SIZE 576
b1dca6
+#define MODEL "initial-exec"
b1dca6
+#include "tst-tls-ie-mod.h"
b1dca6
diff --git a/elf/tst-tls-ie.c b/elf/tst-tls-ie.c
b1dca6
new file mode 100644
b1dca6
index 0000000000000000..2dc0894480417389
b1dca6
--- /dev/null
b1dca6
+++ b/elf/tst-tls-ie.c
b1dca6
@@ -0,0 +1,111 @@
b1dca6
+/* Test dlopen of modules with initial-exec TLS.
b1dca6
+   Copyright (C) 2016-2020 Free Software Foundation, Inc.
b1dca6
+   This file is part of the GNU C Library.
b1dca6
+
b1dca6
+   The GNU C Library is free software; you can redistribute it and/or
b1dca6
+   modify it under the terms of the GNU Lesser General Public
b1dca6
+   License as published by the Free Software Foundation; either
b1dca6
+   version 2.1 of the License, or (at your option) any later version.
b1dca6
+
b1dca6
+   The GNU C Library is distributed in the hope that it will be useful,
b1dca6
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
b1dca6
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
b1dca6
+   Lesser General Public License for more details.
b1dca6
+
b1dca6
+   You should have received a copy of the GNU Lesser General Public
b1dca6
+   License along with the GNU C Library; if not, see
b1dca6
+   <https://www.gnu.org/licenses/>.  */
b1dca6
+
b1dca6
+/* This test tries to check that surplus static TLS is not used up for
b1dca6
+   dynamic TLS optimizations and 3*192 + 4*144 = 1152 bytes of static
b1dca6
+   TLS is available for dlopening modules with initial-exec TLS.  It
b1dca6
+   depends on rtld.nns=4 and rtld.optional_static_tls=512 tunable setting.  */
b1dca6
+
b1dca6
+#include <errno.h>
b1dca6
+#include <pthread.h>
b1dca6
+#include <stdio.h>
b1dca6
+#include <stdlib.h>
b1dca6
+#include <string.h>
b1dca6
+
b1dca6
+static int do_test (void);
b1dca6
+#include <support/xthread.h>
b1dca6
+#include <support/xdlfcn.h>
b1dca6
+#include <support/check.h>
b1dca6
+#include <support/test-driver.c>
b1dca6
+
b1dca6
+/* Have some big TLS in the main exe: should not use surplus TLS.  */
b1dca6
+__thread char maintls[1000];
b1dca6
+
b1dca6
+static pthread_barrier_t barrier;
b1dca6
+
b1dca6
+/* Forces multi-threaded behaviour.  */
b1dca6
+static void *
b1dca6
+blocked_thread_func (void *closure)
b1dca6
+{
b1dca6
+  xpthread_barrier_wait (&barrier);
b1dca6
+  /* TLS load and access tests run here in the main thread.  */
b1dca6
+  xpthread_barrier_wait (&barrier);
b1dca6
+  return NULL;
b1dca6
+}
b1dca6
+
b1dca6
+static void *
b1dca6
+load_and_access (const char *mod, const char *func)
b1dca6
+{
b1dca6
+  /* Load module with TLS.  */
b1dca6
+  void *p = xdlopen (mod, RTLD_NOW);
b1dca6
+  /* Access the TLS variable to ensure it is allocated.  */
b1dca6
+  void (*f) (void) = (void (*) (void))xdlsym (p, func);
b1dca6
+  f ();
b1dca6
+  return p;
b1dca6
+}
b1dca6
+
b1dca6
+static int
b1dca6
+do_test (void)
b1dca6
+{
b1dca6
+  void *mods[6];
b1dca6
+
b1dca6
+  {
b1dca6
+    int ret = pthread_barrier_init (&barrier, NULL, 2);
b1dca6
+    if (ret != 0)
b1dca6
+      {
b1dca6
+        errno = ret;
b1dca6
+        printf ("error: pthread_barrier_init: %m\n");
b1dca6
+        exit (1);
b1dca6
+      }
b1dca6
+  }
b1dca6
+
b1dca6
+  pthread_t blocked_thread = xpthread_create (NULL, blocked_thread_func, NULL);
b1dca6
+  xpthread_barrier_wait (&barrier);
b1dca6
+
b1dca6
+  printf ("maintls[%zu]:\t %p .. %p\n",
b1dca6
+	   sizeof maintls, maintls, maintls + sizeof maintls);
b1dca6
+  memset (maintls, 1, sizeof maintls);
b1dca6
+
b1dca6
+  /* Load modules with dynamic TLS (may use surplus static TLS
b1dca6
+     opportunistically).  */
b1dca6
+  mods[0] = load_and_access ("tst-tls-ie-mod0.so", "access0");
b1dca6
+  mods[1] = load_and_access ("tst-tls-ie-mod1.so", "access1");
b1dca6
+  mods[2] = load_and_access ("tst-tls-ie-mod2.so", "access2");
b1dca6
+  mods[3] = load_and_access ("tst-tls-ie-mod3.so", "access3");
b1dca6
+  /* Load modules with initial-exec TLS (can only use surplus static TLS).  */
b1dca6
+  mods[4] = load_and_access ("tst-tls-ie-mod4.so", "access4");
b1dca6
+  mods[5] = load_and_access ("tst-tls-ie-mod5.so", "access5");
b1dca6
+
b1dca6
+  /* Here 1152 bytes of surplus static TLS is in use and at most 512 bytes
b1dca6
+     are available (depending on TLS optimizations).  */
b1dca6
+  printf ("The next dlopen should fail...\n");
b1dca6
+  void *p = dlopen ("tst-tls-ie-mod6.so", RTLD_NOW);
b1dca6
+  if (p != NULL)
b1dca6
+    FAIL_EXIT1 ("error: expected dlopen to fail because there is "
b1dca6
+		"not enough surplus static TLS.\n");
b1dca6
+  printf ("...OK failed with: %s.\n", dlerror ());
b1dca6
+
b1dca6
+  xpthread_barrier_wait (&barrier);
b1dca6
+  xpthread_join (blocked_thread);
b1dca6
+
b1dca6
+  /* Close the modules.  */
b1dca6
+  for (int i = 0; i < 6; ++i)
b1dca6
+    xdlclose (mods[i]);
b1dca6
+
b1dca6
+  return 0;
b1dca6
+}
b1dca6
diff --git a/manual/tunables.texi b/manual/tunables.texi
b1dca6
index e6a3e9a2cf5c959c..bd737b5d57080462 100644
b1dca6
--- a/manual/tunables.texi
b1dca6
+++ b/manual/tunables.texi
b1dca6
@@ -249,6 +249,23 @@ increase the per-thread memory usage as necessary, so this tunable does
b1dca6
 not need to be changed to allow many audit modules e.g. via @env{LD_AUDIT}.
b1dca6
 @end deftp
b1dca6
 
b1dca6
+@deftp Tunable glibc.rtld.optional_static_tls
b1dca6
+Sets the amount of surplus static TLS in bytes to allocate at program
b1dca6
+startup.  Every thread created allocates the specified amount of surplus
b1dca6
+static TLS.  This is a minimum value and additional space may be allocated
b1dca6
+for internal purposes including alignment.  Optional static TLS is used for
b1dca6
+optimizing dynamic TLS access for platforms that support such optimizations
b1dca6
+e.g. TLS descriptors or optimized TLS access for POWER (@code{DT_PPC64_OPT}
b1dca6
+and @code{DT_PPC_OPT}).  In order to make the best use of such optimizations
b1dca6
+the value should be as many bytes as would be required to hold all TLS
b1dca6
+variables in all dynamically loaded shared libraries.  The value cannot be known
b1dca6
+by the dynamic loader because it doesn't know the expected set of shared
b1dca6
+libraries which will be loaded.  The existing static TLS space cannot be
b1dca6
+changed once allocated at process startup.  The default allocation of
b1dca6
+optional static TLS is 512 bytes and is allocated in every thread.
b1dca6
+@end deftp
b1dca6
+
b1dca6
+
b1dca6
 @node Elision Tunables
b1dca6
 @section Elision Tunables
b1dca6
 @cindex elision tunables
b1dca6
diff --git a/sysdeps/generic/ldsodefs.h b/sysdeps/generic/ldsodefs.h
b1dca6
index 293f3ab5a496afdf..37f1915b0c75a020 100644
b1dca6
--- a/sysdeps/generic/ldsodefs.h
b1dca6
+++ b/sysdeps/generic/ldsodefs.h
b1dca6
@@ -441,6 +441,9 @@ struct rtld_global
b1dca6
   EXTERN size_t _dl_tls_static_used;
b1dca6
   /* Alignment requirement of the static TLS block.  */
b1dca6
   EXTERN size_t _dl_tls_static_align;
b1dca6
+  /* Remaining amount of static TLS that may be used for optimizing
b1dca6
+     dynamic TLS access (e.g. with TLSDESC).  */
b1dca6
+  EXTERN size_t _dl_tls_static_optional;
b1dca6
 
b1dca6
 /* Number of additional entries in the slotinfo array of each slotinfo
b1dca6
    list element.  A large number makes it almost certain take we never