f5ed34
From d2123d68275acc0f061e73d5f86ca504e0d5a344 Mon Sep 17 00:00:00 2001
f5ed34
From: Szabolcs Nagy <szabolcs.nagy@arm.com>
f5ed34
Date: Tue, 16 Feb 2021 12:55:13 +0000
f5ed34
Subject: elf: Fix slow tls access after dlopen [BZ #19924]
f5ed34
f5ed34
In short: __tls_get_addr checks the global generation counter and if
f5ed34
the current dtv is older then _dl_update_slotinfo updates dtv up to the
f5ed34
generation of the accessed module. So if the global generation is newer
f5ed34
than generation of the module then __tls_get_addr keeps hitting the
f5ed34
slow dtv update path. The dtv update path includes a number of checks
f5ed34
to see if any update is needed and this already causes measurable tls
f5ed34
access slow down after dlopen.
f5ed34
f5ed34
It may be possible to detect up-to-date dtv faster.  But if there are
f5ed34
many modules loaded (> TLS_SLOTINFO_SURPLUS) then this requires at
f5ed34
least walking the slotinfo list.
f5ed34
f5ed34
This patch tries to update the dtv to the global generation instead, so
f5ed34
after a dlopen the tls access slow path is only hit once.  The modules
f5ed34
with larger generation than the accessed one were not necessarily
f5ed34
synchronized before, so additional synchronization is needed.
f5ed34
f5ed34
This patch uses acquire/release synchronization when accessing the
f5ed34
generation counter.
f5ed34
f5ed34
Note: in the x86_64 version of dl-tls.c the generation is only loaded
f5ed34
once, since relaxed mo is not faster than acquire mo load.
f5ed34
f5ed34
I have not benchmarked this. Tested by Adhemerval Zanella on aarch64,
f5ed34
powerpc, sparc, x86 who reported that it fixes the performance issue
f5ed34
of bug 19924.
f5ed34
f5ed34
Reviewed-by: Adhemerval Zanella  <adhemerval.zanella@linaro.org>
f5ed34
f5ed34
[rebased to c8s by DJ]
f5ed34
f5ed34
diff -rup a/elf/dl-close.c b/elf/dl-close.c
f5ed34
--- a/elf/dl-close.c	2023-10-13 16:24:27.068217519 -0400
f5ed34
+++ b/elf/dl-close.c	2023-10-13 16:28:59.936019397 -0400
f5ed34
@@ -739,7 +739,7 @@ _dl_close_worker (struct link_map *map,
f5ed34
       if (__glibc_unlikely (newgen == 0))
f5ed34
 	_dl_fatal_printf ("TLS generation counter wrapped!  Please report as described in "REPORT_BUGS_TO".\n");
f5ed34
       /* Can be read concurrently.  */
f5ed34
-      atomic_store_relaxed (&GL(dl_tls_generation), newgen);
f5ed34
+      atomic_store_release (&GL(dl_tls_generation), newgen);
f5ed34
 
f5ed34
       if (tls_free_end == GL(dl_tls_static_used))
f5ed34
 	GL(dl_tls_static_used) = tls_free_start;
f5ed34
diff -rup a/elf/dl-open.c b/elf/dl-open.c
f5ed34
--- a/elf/dl-open.c	2023-10-13 16:24:26.930212160 -0400
f5ed34
+++ b/elf/dl-open.c	2023-10-13 16:28:59.936019397 -0400
f5ed34
@@ -403,7 +403,7 @@ update_tls_slotinfo (struct link_map *ne
f5ed34
     _dl_fatal_printf (N_("\
f5ed34
 TLS generation counter wrapped!  Please report this."));
f5ed34
   /* Can be read concurrently.  */
f5ed34
-  atomic_store_relaxed (&GL(dl_tls_generation), newgen);
f5ed34
+  atomic_store_release (&GL(dl_tls_generation), newgen);
f5ed34
 
f5ed34
   /* We need a second pass for static tls data, because
f5ed34
      _dl_update_slotinfo must not be run while calls to
f5ed34
@@ -420,8 +420,8 @@ TLS generation counter wrapped!  Please
f5ed34
 	     now, but we can delay updating the DTV.  */
f5ed34
 	  imap->l_need_tls_init = 0;
f5ed34
 #ifdef SHARED
f5ed34
-	  /* Update the slot information data for at least the
f5ed34
-	     generation of the DSO we are allocating data for.  */
f5ed34
+	  /* Update the slot information data for the current
f5ed34
+	     generation.  */
f5ed34
 
f5ed34
 	  /* FIXME: This can terminate the process on memory
f5ed34
 	     allocation failure.  It is not possible to raise
f5ed34
@@ -429,7 +429,7 @@ TLS generation counter wrapped!  Please
f5ed34
 	     _dl_update_slotinfo would have to be split into two
f5ed34
 	     operations, similar to resize_scopes and update_scopes
f5ed34
 	     above.  This is related to bug 16134.  */
f5ed34
-	  _dl_update_slotinfo (imap->l_tls_modid);
f5ed34
+	  _dl_update_slotinfo (imap->l_tls_modid, newgen);
f5ed34
 #endif
f5ed34
 
f5ed34
 	  GL(dl_init_static_tls) (imap);
f5ed34
diff -rup a/elf/dl-reloc.c b/elf/dl-reloc.c
f5ed34
--- a/elf/dl-reloc.c	2023-10-13 16:24:26.390191189 -0400
f5ed34
+++ b/elf/dl-reloc.c	2023-10-13 16:28:59.937019438 -0400
f5ed34
@@ -111,11 +111,11 @@ _dl_try_allocate_static_tls (struct link
f5ed34
   if (map->l_real->l_relocated)
f5ed34
     {
f5ed34
 #ifdef SHARED
f5ed34
+      /* Update the DTV of the current thread.  Note: GL(dl_load_tls_lock)
f5ed34
+	 is held here so normal load of the generation counter is valid.  */
f5ed34
       if (__builtin_expect (THREAD_DTV()[0].counter != GL(dl_tls_generation),
f5ed34
 			    0))
f5ed34
-	/* Update the slot information data for at least the generation of
f5ed34
-	   the DSO we are allocating data for.  */
f5ed34
-	(void) _dl_update_slotinfo (map->l_tls_modid);
f5ed34
+	(void) _dl_update_slotinfo (map->l_tls_modid, GL(dl_tls_generation));
f5ed34
 #endif
f5ed34
 
f5ed34
       GL(dl_init_static_tls) (map);
f5ed34
diff -rup a/elf/dl-tls.c b/elf/dl-tls.c
f5ed34
--- a/elf/dl-tls.c	2023-10-13 16:24:26.564197946 -0400
f5ed34
+++ b/elf/dl-tls.c	2023-10-13 16:28:59.937019438 -0400
f5ed34
@@ -716,57 +716,57 @@ allocate_and_init (struct link_map *map)
f5ed34
 
f5ed34
 
f5ed34
 struct link_map *
f5ed34
-_dl_update_slotinfo (unsigned long int req_modid)
f5ed34
+_dl_update_slotinfo (unsigned long int req_modid, size_t new_gen)
f5ed34
 {
f5ed34
   struct link_map *the_map = NULL;
f5ed34
   dtv_t *dtv = THREAD_DTV ();
f5ed34
 
f5ed34
-  /* The global dl_tls_dtv_slotinfo array contains for each module
f5ed34
-     index the generation counter current when the entry was created.
f5ed34
+  /* CONCURRENCY NOTES:
f5ed34
+
f5ed34
+     The global dl_tls_dtv_slotinfo_list array contains for each module
f5ed34
+     index the generation counter current when that entry was updated.
f5ed34
      This array never shrinks so that all module indices which were
f5ed34
-     valid at some time can be used to access it.  Before the first
f5ed34
-     use of a new module index in this function the array was extended
f5ed34
-     appropriately.  Access also does not have to be guarded against
f5ed34
-     modifications of the array.  It is assumed that pointer-size
f5ed34
-     values can be read atomically even in SMP environments.  It is
f5ed34
-     possible that other threads at the same time dynamically load
f5ed34
-     code and therefore add to the slotinfo list.  This is a problem
f5ed34
-     since we must not pick up any information about incomplete work.
f5ed34
-     The solution to this is to ignore all dtv slots which were
f5ed34
-     created after the one we are currently interested.  We know that
f5ed34
-     dynamic loading for this module is completed and this is the last
f5ed34
-     load operation we know finished.  */
f5ed34
-  unsigned long int idx = req_modid;
f5ed34
+     valid at some time can be used to access it.  Concurrent loading
f5ed34
+     and unloading of modules can update slotinfo entries or extend
f5ed34
+     the array.  The updates happen under the GL(dl_load_tls_lock) and
f5ed34
+     finish with the release store of the generation counter to
f5ed34
+     GL(dl_tls_generation) which is synchronized with the load of
f5ed34
+     new_gen in the caller.  So updates up to new_gen are synchronized
f5ed34
+     but updates for later generations may not be.
f5ed34
+
f5ed34
+     Here we update the thread dtv from old_gen (== dtv[0].counter) to
f5ed34
+     new_gen generation.  For this, each dtv[i] entry is either set to
f5ed34
+     an unallocated state (set), or left unmodified (nop).  Where (set)
f5ed34
+     may resize the dtv first if modid i >= dtv[-1].counter. The rules
f5ed34
+     for the decision between (set) and (nop) are
f5ed34
+
f5ed34
+     (1) If slotinfo entry i is concurrently updated then either (set)
f5ed34
+         or (nop) is valid: TLS access cannot use dtv[i] unless it is
f5ed34
+         synchronized with a generation > new_gen.
f5ed34
+
f5ed34
+     Otherwise, if the generation of slotinfo entry i is gen and the
f5ed34
+     loaded module for this entry is map then
f5ed34
+
f5ed34
+     (2) If gen <= old_gen then do (nop).
f5ed34
+
f5ed34
+     (3) If old_gen < gen <= new_gen then
f5ed34
+         (3.1) if map != 0 then (set)
f5ed34
+         (3.2) if map == 0 then either (set) or (nop).
f5ed34
+
f5ed34
+     Note that (1) cannot be reliably detected, but since both actions
f5ed34
+     are valid it does not have to be.  Only (2) and (3.1) cases need
f5ed34
+     to be distinguished for which relaxed mo access of gen and map is
f5ed34
+     enough: their value is synchronized when it matters.
f5ed34
+
f5ed34
+     Note that a relaxed mo load may give an out-of-thin-air value since
f5ed34
+     it is used in decisions that can affect concurrent stores.  But this
f5ed34
+     should only happen if the OOTA value causes UB that justifies the
f5ed34
+     concurrent store of the value.  This is not expected to be an issue
f5ed34
+     in practice.  */
f5ed34
   struct dtv_slotinfo_list *listp = GL(dl_tls_dtv_slotinfo_list);
f5ed34
 
f5ed34
-  while (idx >= listp->len)
f5ed34
+  if (dtv[0].counter < new_gen)
f5ed34
     {
f5ed34
-      idx -= listp->len;
f5ed34
-      listp = listp->next;
f5ed34
-    }
f5ed34
-
f5ed34
-  if (dtv[0].counter < listp->slotinfo[idx].gen)
f5ed34
-    {
f5ed34
-      /* CONCURRENCY NOTES:
f5ed34
-
f5ed34
-	 Here the dtv needs to be updated to new_gen generation count.
f5ed34
-
f5ed34
-	 This code may be called during TLS access when GL(dl_load_tls_lock)
f5ed34
-	 is not held.  In that case the user code has to synchronize with
f5ed34
-	 dlopen and dlclose calls of relevant modules.  A module m is
f5ed34
-	 relevant if the generation of m <= new_gen and dlclose of m is
f5ed34
-	 synchronized: a memory access here happens after the dlopen and
f5ed34
-	 before the dlclose of relevant modules.  The dtv entries for
f5ed34
-	 relevant modules need to be updated, other entries can be
f5ed34
-	 arbitrary.
f5ed34
-
f5ed34
-	 This e.g. means that the first part of the slotinfo list can be
f5ed34
-	 accessed race free, but the tail may be concurrently extended.
f5ed34
-	 Similarly relevant slotinfo entries can be read race free, but
f5ed34
-	 other entries are racy.  However updating a non-relevant dtv
f5ed34
-	 entry does not affect correctness.  For a relevant module m,
f5ed34
-	 max_modid >= modid of m.  */
f5ed34
-      size_t new_gen = listp->slotinfo[idx].gen;
f5ed34
       size_t total = 0;
f5ed34
       size_t max_modid  = atomic_load_relaxed (&GL(dl_tls_max_dtv_idx));
f5ed34
       assert (max_modid >= req_modid);
f5ed34
@@ -779,31 +779,33 @@ _dl_update_slotinfo (unsigned long int r
f5ed34
 	    {
f5ed34
 	      size_t modid = total + cnt;
f5ed34
 
f5ed34
-	      /* Later entries are not relevant.  */
f5ed34
+	      /* Case (1) for all later modids.  */
f5ed34
 	      if (modid > max_modid)
f5ed34
 		break;
f5ed34
 
f5ed34
 	      size_t gen = atomic_load_relaxed (&listp->slotinfo[cnt].gen);
f5ed34
 
f5ed34
+	      /* Case (1).  */
f5ed34
 	      if (gen > new_gen)
f5ed34
-		/* Not relevant.  */
f5ed34
 		continue;
f5ed34
 
f5ed34
-	      /* If the entry is older than the current dtv layout we
f5ed34
-		 know we don't have to handle it.  */
f5ed34
+	      /* Case (2) or (1).  */
f5ed34
 	      if (gen <= dtv[0].counter)
f5ed34
 		continue;
f5ed34
 
f5ed34
+	      /* Case (3) or (1).  */
f5ed34
+
f5ed34
 	      /* If there is no map this means the entry is empty.  */
f5ed34
 	      struct link_map *map
f5ed34
 		= atomic_load_relaxed (&listp->slotinfo[cnt].map);
f5ed34
 	      /* Check whether the current dtv array is large enough.  */
f5ed34
 	      if (dtv[-1].counter < modid)
f5ed34
 		{
f5ed34
+		  /* Case (3.2) or (1).  */
f5ed34
 		  if (map == NULL)
f5ed34
 		    continue;
f5ed34
 
f5ed34
-		  /* Resize the dtv.  */
f5ed34
+		  /* Resizing the dtv aborts on failure: bug 16134.  */
f5ed34
 		  dtv = _dl_resize_dtv (dtv, max_modid);
f5ed34
 
f5ed34
 		  assert (modid <= dtv[-1].counter);
f5ed34
@@ -814,7 +816,7 @@ _dl_update_slotinfo (unsigned long int r
f5ed34
 		}
f5ed34
 
f5ed34
 	      /* If there is currently memory allocate for this
f5ed34
-		 dtv entry free it.  */
f5ed34
+		 dtv entry free it.  Note: this is not AS-safe.  */
f5ed34
 	      /* XXX Ideally we will at some point create a memory
f5ed34
 		 pool.  */
f5ed34
 	      free (dtv[modid].pointer.to_free);
f5ed34
@@ -909,9 +911,9 @@ tls_get_addr_tail (GET_ADDR_ARGS, dtv_t
f5ed34
 
f5ed34
 static struct link_map *
f5ed34
 __attribute_noinline__
f5ed34
-update_get_addr (GET_ADDR_ARGS)
f5ed34
+update_get_addr (GET_ADDR_ARGS, size_t gen)
f5ed34
 {
f5ed34
-  struct link_map *the_map = _dl_update_slotinfo (GET_ADDR_MODULE);
f5ed34
+  struct link_map *the_map = _dl_update_slotinfo (GET_ADDR_MODULE, gen);
f5ed34
   dtv_t *dtv = THREAD_DTV ();
f5ed34
 
f5ed34
   void *p = dtv[GET_ADDR_MODULE].pointer.val;
f5ed34
@@ -941,12 +943,17 @@ __tls_get_addr (GET_ADDR_ARGS)
f5ed34
   dtv_t *dtv = THREAD_DTV ();
f5ed34
 
f5ed34
   /* Update is needed if dtv[0].counter < the generation of the accessed
f5ed34
-     module.  The global generation counter is used here as it is easier
f5ed34
-     to check.  Synchronization for the relaxed MO access is guaranteed
f5ed34
-     by user code, see CONCURRENCY NOTES in _dl_update_slotinfo.  */
f5ed34
+     module, but the global generation counter is easier to check (which
f5ed34
+     must be synchronized up to the generation of the accessed module by
f5ed34
+     user code doing the TLS access so relaxed mo read is enough).  */
f5ed34
   size_t gen = atomic_load_relaxed (&GL(dl_tls_generation));
f5ed34
   if (__glibc_unlikely (dtv[0].counter != gen))
f5ed34
-    return update_get_addr (GET_ADDR_PARAM);
f5ed34
+    {
f5ed34
+      /* Update DTV up to the global generation, see CONCURRENCY NOTES
f5ed34
+         in _dl_update_slotinfo.  */
f5ed34
+      gen = atomic_load_acquire (&GL(dl_tls_generation));
f5ed34
+      return update_get_addr (GET_ADDR_PARAM, gen);
f5ed34
+    }
f5ed34
 
f5ed34
   void *p = dtv[GET_ADDR_MODULE].pointer.val;
f5ed34
 
f5ed34
diff -rup a/sysdeps/generic/ldsodefs.h b/sysdeps/generic/ldsodefs.h
f5ed34
--- a/sysdeps/generic/ldsodefs.h	2023-10-13 16:24:27.136220160 -0400
f5ed34
+++ b/sysdeps/generic/ldsodefs.h	2023-10-13 16:28:59.937019438 -0400
f5ed34
@@ -1231,7 +1231,8 @@ extern void _dl_add_to_slotinfo (struct
f5ed34
 
f5ed34
 /* Update slot information data for at least the generation of the
f5ed34
    module with the given index.  */
f5ed34
-extern struct link_map *_dl_update_slotinfo (unsigned long int req_modid)
f5ed34
+extern struct link_map *_dl_update_slotinfo (unsigned long int req_modid,
f5ed34
+					     size_t gen)
f5ed34
      attribute_hidden;
f5ed34
 
f5ed34
 /* Look up the module's TLS block as for __tls_get_addr,
f5ed34
diff -rup a/sysdeps/x86_64/dl-tls.c b/sysdeps/x86_64/dl-tls.c
f5ed34
--- a/sysdeps/x86_64/dl-tls.c	2023-10-13 16:24:24.948135189 -0400
f5ed34
+++ b/sysdeps/x86_64/dl-tls.c	2023-10-13 16:28:59.938019479 -0400
f5ed34
@@ -40,9 +40,9 @@ __tls_get_addr_slow (GET_ADDR_ARGS)
f5ed34
 {
f5ed34
   dtv_t *dtv = THREAD_DTV ();
f5ed34
 
f5ed34
-  size_t gen = atomic_load_relaxed (&GL(dl_tls_generation));
f5ed34
+  size_t gen = atomic_load_acquire (&GL(dl_tls_generation));
f5ed34
   if (__glibc_unlikely (dtv[0].counter != gen))
f5ed34
-    return update_get_addr (GET_ADDR_PARAM);
f5ed34
+    return update_get_addr (GET_ADDR_PARAM, gen);
f5ed34
 
f5ed34
   return tls_get_addr_tail (GET_ADDR_PARAM, dtv, NULL);
f5ed34
 }