93dc2d
commit 15a0c5730d1d5aeb95f50c9ec7470640084feae8
93dc2d
Author: Chung-Lin Tang <cltang@codesourcery.com>
93dc2d
Date:   Thu Oct 21 21:41:22 2021 +0800
93dc2d
93dc2d
    elf: Fix slow DSO sorting behavior in dynamic loader (BZ #17645)
93dc2d
    
93dc2d
    This second patch contains the actual implementation of a new sorting algorithm
93dc2d
    for shared objects in the dynamic loader, which solves the slow behavior that
93dc2d
    the current "old" algorithm falls into when the DSO set contains circular
93dc2d
    dependencies.
93dc2d
    
93dc2d
    The new algorithm implemented here is simply depth-first search (DFS) to obtain
93dc2d
    the Reverse-Post Order (RPO) sequence, a topological sort. A new l_visited:1
93dc2d
    bitfield is added to struct link_map to more elegantly facilitate such a search.
93dc2d
    
93dc2d
    The DFS algorithm is applied to the input maps[nmap-1] backwards towards
93dc2d
    maps[0]. This has the effect of a more "shallow" recursion depth in general
93dc2d
    since the input is in BFS. Also, when combined with the natural order of
93dc2d
    processing l_initfini[] at each node, this creates a resulting output sorting
93dc2d
    closer to the intuitive "left-to-right" order in most cases.
93dc2d
    
93dc2d
    Another notable implementation adjustment related to this _dl_sort_maps change
93dc2d
    is the removal of two char arrays 'used' and 'done' in _dl_close_worker to
93dc2d
    represent two per-map attributes. This has been changed to simply use two new
93dc2d
    bit-fields l_map_used:1, l_map_done:1 added to struct link_map. This also allows
93dc2d
    discarding the clunky 'used' array sorting that _dl_sort_maps had to sometimes
93dc2d
    do along the way.
93dc2d
    
93dc2d
    Tunable support for switching between different sorting algorithms at runtime is
93dc2d
    also added. A new tunable 'glibc.rtld.dynamic_sort' with current valid values 1
93dc2d
    (old algorithm) and 2 (new DFS algorithm) has been added. At time of commit
93dc2d
    of this patch, the default setting is 1 (old algorithm).
93dc2d
    
93dc2d
    Signed-off-by: Chung-Lin Tang  <cltang@codesourcery.com>
93dc2d
    Reviewed-by: Adhemerval Zanella  <adhemerval.zanella@linaro.org>
93dc2d
93dc2d
diff --git a/elf/dl-close.c b/elf/dl-close.c
93dc2d
index cd7b9c9fe83a1a44..f6fbf9de7d78555b 100644
93dc2d
--- a/elf/dl-close.c
93dc2d
+++ b/elf/dl-close.c
93dc2d
@@ -167,8 +167,6 @@ _dl_close_worker (struct link_map *map, bool force)
93dc2d
 
93dc2d
   bool any_tls = false;
93dc2d
   const unsigned int nloaded = ns->_ns_nloaded;
93dc2d
-  char used[nloaded];
93dc2d
-  char done[nloaded];
93dc2d
   struct link_map *maps[nloaded];
93dc2d
 
93dc2d
   /* Run over the list and assign indexes to the link maps and enter
93dc2d
@@ -176,24 +174,21 @@ _dl_close_worker (struct link_map *map, bool force)
93dc2d
   int idx = 0;
93dc2d
   for (struct link_map *l = ns->_ns_loaded; l != NULL; l = l->l_next)
93dc2d
     {
93dc2d
+      l->l_map_used = 0;
93dc2d
+      l->l_map_done = 0;
93dc2d
       l->l_idx = idx;
93dc2d
       maps[idx] = l;
93dc2d
       ++idx;
93dc2d
-
93dc2d
     }
93dc2d
   assert (idx == nloaded);
93dc2d
 
93dc2d
-  /* Prepare the bitmaps.  */
93dc2d
-  memset (used, '\0', sizeof (used));
93dc2d
-  memset (done, '\0', sizeof (done));
93dc2d
-
93dc2d
   /* Keep track of the lowest index link map we have covered already.  */
93dc2d
   int done_index = -1;
93dc2d
   while (++done_index < nloaded)
93dc2d
     {
93dc2d
       struct link_map *l = maps[done_index];
93dc2d
 
93dc2d
-      if (done[done_index])
93dc2d
+      if (l->l_map_done)
93dc2d
 	/* Already handled.  */
93dc2d
 	continue;
93dc2d
 
93dc2d
@@ -204,12 +199,12 @@ _dl_close_worker (struct link_map *map, bool force)
93dc2d
 	  /* See CONCURRENCY NOTES in cxa_thread_atexit_impl.c to know why
93dc2d
 	     acquire is sufficient and correct.  */
93dc2d
 	  && atomic_load_acquire (&l->l_tls_dtor_count) == 0
93dc2d
-	  && !used[done_index])
93dc2d
+	  && !l->l_map_used)
93dc2d
 	continue;
93dc2d
 
93dc2d
       /* We need this object and we handle it now.  */
93dc2d
-      done[done_index] = 1;
93dc2d
-      used[done_index] = 1;
93dc2d
+      l->l_map_used = 1;
93dc2d
+      l->l_map_done = 1;
93dc2d
       /* Signal the object is still needed.  */
93dc2d
       l->l_idx = IDX_STILL_USED;
93dc2d
 
93dc2d
@@ -225,9 +220,9 @@ _dl_close_worker (struct link_map *map, bool force)
93dc2d
 		{
93dc2d
 		  assert ((*lp)->l_idx >= 0 && (*lp)->l_idx < nloaded);
93dc2d
 
93dc2d
-		  if (!used[(*lp)->l_idx])
93dc2d
+		  if (!(*lp)->l_map_used)
93dc2d
 		    {
93dc2d
-		      used[(*lp)->l_idx] = 1;
93dc2d
+		      (*lp)->l_map_used = 1;
93dc2d
 		      /* If we marked a new object as used, and we've
93dc2d
 			 already processed it, then we need to go back
93dc2d
 			 and process again from that point forward to
93dc2d
@@ -250,9 +245,9 @@ _dl_close_worker (struct link_map *map, bool force)
93dc2d
 	      {
93dc2d
 		assert (jmap->l_idx >= 0 && jmap->l_idx < nloaded);
93dc2d
 
93dc2d
-		if (!used[jmap->l_idx])
93dc2d
+		if (!jmap->l_map_used)
93dc2d
 		  {
93dc2d
-		    used[jmap->l_idx] = 1;
93dc2d
+		    jmap->l_map_used = 1;
93dc2d
 		    if (jmap->l_idx - 1 < done_index)
93dc2d
 		      done_index = jmap->l_idx - 1;
93dc2d
 		  }
93dc2d
@@ -262,8 +257,7 @@ _dl_close_worker (struct link_map *map, bool force)
93dc2d
 
93dc2d
   /* Sort the entries.  We can skip looking for the binary itself which is
93dc2d
      at the front of the search list for the main namespace.  */
93dc2d
-  _dl_sort_maps (maps + (nsid == LM_ID_BASE), nloaded - (nsid == LM_ID_BASE),
93dc2d
-		 used + (nsid == LM_ID_BASE), true);
93dc2d
+  _dl_sort_maps (maps, nloaded, (nsid == LM_ID_BASE), true);
93dc2d
 
93dc2d
   /* Call all termination functions at once.  */
93dc2d
 #ifdef SHARED
93dc2d
@@ -280,7 +274,7 @@ _dl_close_worker (struct link_map *map, bool force)
93dc2d
       /* All elements must be in the same namespace.  */
93dc2d
       assert (imap->l_ns == nsid);
93dc2d
 
93dc2d
-      if (!used[i])
93dc2d
+      if (!imap->l_map_used)
93dc2d
 	{
93dc2d
 	  assert (imap->l_type == lt_loaded && !imap->l_nodelete_active);
93dc2d
 
93dc2d
@@ -333,7 +327,7 @@ _dl_close_worker (struct link_map *map, bool force)
93dc2d
 	  if (i < first_loaded)
93dc2d
 	    first_loaded = i;
93dc2d
 	}
93dc2d
-      /* Else used[i].  */
93dc2d
+      /* Else imap->l_map_used.  */
93dc2d
       else if (imap->l_type == lt_loaded)
93dc2d
 	{
93dc2d
 	  struct r_scope_elem *new_list = NULL;
93dc2d
@@ -560,7 +554,7 @@ _dl_close_worker (struct link_map *map, bool force)
93dc2d
   for (unsigned int i = first_loaded; i < nloaded; ++i)
93dc2d
     {
93dc2d
       struct link_map *imap = maps[i];
93dc2d
-      if (!used[i])
93dc2d
+      if (!imap->l_map_used)
93dc2d
 	{
93dc2d
 	  assert (imap->l_type == lt_loaded);
93dc2d
 
93dc2d
diff --git a/elf/dl-deps.c b/elf/dl-deps.c
93dc2d
index 087a49b212a96920..237d9636c5be780c 100644
93dc2d
--- a/elf/dl-deps.c
93dc2d
+++ b/elf/dl-deps.c
93dc2d
@@ -613,10 +613,9 @@ Filters not supported with LD_TRACE_PRELINKING"));
93dc2d
 
93dc2d
   /* If libc.so.6 is the main map, it participates in the sort, so
93dc2d
      that the relocation order is correct regarding libc.so.6.  */
93dc2d
-  if (l_initfini[0] == GL (dl_ns)[l_initfini[0]->l_ns].libc_map)
93dc2d
-    _dl_sort_maps (l_initfini, nlist, NULL, false);
93dc2d
-  else
93dc2d
-    _dl_sort_maps (&l_initfini[1], nlist - 1, NULL, false);
93dc2d
+  _dl_sort_maps (l_initfini, nlist,
93dc2d
+		 (l_initfini[0] != GL (dl_ns)[l_initfini[0]->l_ns].libc_map),
93dc2d
+		 false);
93dc2d
 
93dc2d
   /* Terminate the list of dependencies.  */
93dc2d
   l_initfini[nlist] = NULL;
93dc2d
diff --git a/elf/dl-fini.c b/elf/dl-fini.c
93dc2d
index 6dbdfe4b3ebbeb89..c683884c355dfd52 100644
93dc2d
--- a/elf/dl-fini.c
93dc2d
+++ b/elf/dl-fini.c
93dc2d
@@ -92,8 +92,7 @@ _dl_fini (void)
93dc2d
 	  /* Now we have to do the sorting.  We can skip looking for the
93dc2d
 	     binary itself which is at the front of the search list for
93dc2d
 	     the main namespace.  */
93dc2d
-	  _dl_sort_maps (maps + (ns == LM_ID_BASE), nmaps - (ns == LM_ID_BASE),
93dc2d
-			 NULL, true);
93dc2d
+	  _dl_sort_maps (maps, nmaps, (ns == LM_ID_BASE), true);
93dc2d
 
93dc2d
 	  /* We do not rely on the linked list of loaded object anymore
93dc2d
 	     from this point on.  We have our own list here (maps).  The
93dc2d
diff --git a/elf/dl-sort-maps.c b/elf/dl-sort-maps.c
93dc2d
index d21770267a37e128..a274ed66cc987735 100644
93dc2d
--- a/elf/dl-sort-maps.c
93dc2d
+++ b/elf/dl-sort-maps.c
93dc2d
@@ -16,16 +16,24 @@
93dc2d
    License along with the GNU C Library; if not, see
93dc2d
    <https://www.gnu.org/licenses/>.  */
93dc2d
 
93dc2d
+#include <assert.h>
93dc2d
 #include <ldsodefs.h>
93dc2d
+#include <elf/dl-tunables.h>
93dc2d
 
93dc2d
+/* Note: this is the older, "original" sorting algorithm, being used as
93dc2d
+   default up to 2.35.
93dc2d
 
93dc2d
-/* Sort array MAPS according to dependencies of the contained objects.
93dc2d
-   Array USED, if non-NULL, is permutated along MAPS.  If FOR_FINI this is
93dc2d
-   called for finishing an object.  */
93dc2d
-void
93dc2d
-_dl_sort_maps (struct link_map **maps, unsigned int nmaps, char *used,
93dc2d
-	       bool for_fini)
93dc2d
+   Sort array MAPS according to dependencies of the contained objects.
93dc2d
+   If FOR_FINI is true, this is called for finishing an object.  */
93dc2d
+static void
93dc2d
+_dl_sort_maps_original (struct link_map **maps, unsigned int nmaps,
93dc2d
+			unsigned int skip, bool for_fini)
93dc2d
 {
93dc2d
+  /* Allows caller to do the common optimization of skipping the first map,
93dc2d
+     usually the main binary.  */
93dc2d
+  maps += skip;
93dc2d
+  nmaps -= skip;
93dc2d
+
93dc2d
   /* A list of one element need not be sorted.  */
93dc2d
   if (nmaps <= 1)
93dc2d
     return;
93dc2d
@@ -66,14 +74,6 @@ _dl_sort_maps (struct link_map **maps, unsigned int nmaps, char *used,
93dc2d
 			   (k - i) * sizeof (maps[0]));
93dc2d
 		  maps[k] = thisp;
93dc2d
 
93dc2d
-		  if (used != NULL)
93dc2d
-		    {
93dc2d
-		      char here_used = used[i];
93dc2d
-		      memmove (&used[i], &used[i + 1],
93dc2d
-			       (k - i) * sizeof (used[0]));
93dc2d
-		      used[k] = here_used;
93dc2d
-		    }
93dc2d
-
93dc2d
 		  if (seen[i + 1] > nmaps - i)
93dc2d
 		    {
93dc2d
 		      ++i;
93dc2d
@@ -120,3 +120,183 @@ _dl_sort_maps (struct link_map **maps, unsigned int nmaps, char *used,
93dc2d
     next:;
93dc2d
     }
93dc2d
 }
93dc2d
+
93dc2d
+#if !HAVE_TUNABLES
93dc2d
+/* In this case, just default to the original algorithm.  */
93dc2d
+strong_alias (_dl_sort_maps_original, _dl_sort_maps);
93dc2d
+#else
93dc2d
+
93dc2d
+/* We use a recursive function due to its better clarity and ease of
93dc2d
+   implementation, as well as faster execution speed. We already use
93dc2d
+   alloca() for list allocation during the breadth-first search of
93dc2d
+   dependencies in _dl_map_object_deps(), and this should be on the
93dc2d
+   same order of worst-case stack usage.
93dc2d
+
93dc2d
+   Note: the '*rpo' parameter is supposed to point to one past the
93dc2d
+   last element of the array where we save the sort results, and is
93dc2d
+   decremented before storing the current map at each level.  */
93dc2d
+
93dc2d
+static void
93dc2d
+dfs_traversal (struct link_map ***rpo, struct link_map *map,
93dc2d
+	       bool *do_reldeps)
93dc2d
+{
93dc2d
+  if (map->l_visited)
93dc2d
+    return;
93dc2d
+
93dc2d
+  map->l_visited = 1;
93dc2d
+
93dc2d
+  if (map->l_initfini)
93dc2d
+    {
93dc2d
+      for (int i = 0; map->l_initfini[i] != NULL; i++)
93dc2d
+	{
93dc2d
+	  struct link_map *dep = map->l_initfini[i];
93dc2d
+	  if (dep->l_visited == 0
93dc2d
+	      && dep->l_main_map == 0)
93dc2d
+	    dfs_traversal (rpo, dep, do_reldeps);
93dc2d
+	}
93dc2d
+    }
93dc2d
+
93dc2d
+  if (__glibc_unlikely (do_reldeps != NULL && map->l_reldeps != NULL))
93dc2d
+    {
93dc2d
+      /* Indicate that we encountered relocation dependencies during
93dc2d
+	 traversal.  */
93dc2d
+      *do_reldeps = true;
93dc2d
+
93dc2d
+      for (int m = map->l_reldeps->act - 1; m >= 0; m--)
93dc2d
+	{
93dc2d
+	  struct link_map *dep = map->l_reldeps->list[m];
93dc2d
+	  if (dep->l_visited == 0
93dc2d
+	      && dep->l_main_map == 0)
93dc2d
+	    dfs_traversal (rpo, dep, do_reldeps);
93dc2d
+	}
93dc2d
+    }
93dc2d
+
93dc2d
+  *rpo -= 1;
93dc2d
+  **rpo = map;
93dc2d
+}
93dc2d
+
93dc2d
+/* Topologically sort array MAPS according to dependencies of the contained
93dc2d
+   objects.  */
93dc2d
+
93dc2d
+static void
93dc2d
+_dl_sort_maps_dfs (struct link_map **maps, unsigned int nmaps,
93dc2d
+		   unsigned int skip __attribute__ ((unused)), bool for_fini)
93dc2d
+{
93dc2d
+  for (int i = nmaps - 1; i >= 0; i--)
93dc2d
+    maps[i]->l_visited = 0;
93dc2d
+
93dc2d
+  /* We apply DFS traversal for each of maps[i] until the whole total order
93dc2d
+     is found and we're at the start of the Reverse-Postorder (RPO) sequence,
93dc2d
+     which is a topological sort.
93dc2d
+
93dc2d
+     We go from maps[nmaps - 1] backwards towards maps[0] at this level.
93dc2d
+     Due to the breadth-first search (BFS) ordering we receive, going
93dc2d
+     backwards usually gives a more shallow depth-first recursion depth,
93dc2d
+     adding more stack usage safety. Also, combined with the natural
93dc2d
+     processing order of l_initfini[] at each node during DFS, this maintains
93dc2d
+     an ordering closer to the original link ordering in the sorting results
93dc2d
+     under most simpler cases.
93dc2d
+
93dc2d
+     Another reason we order the top level backwards is that maps[0] is
93dc2d
+     usually exactly the main object of which we're in the midst of
93dc2d
+     _dl_map_object_deps() processing, and maps[0]->l_initfini[] is still
93dc2d
+     blank. If we start the traversal from maps[0], since having no
93dc2d
+     dependencies yet filled in, maps[0] will always be immediately
93dc2d
+     incorrectly placed at the last place in the order (first in reverse).
93dc2d
+     Adjusting the order so that maps[0] is last traversed naturally avoids
93dc2d
+     this problem.
93dc2d
+
93dc2d
+     Further, the old "optimization" of skipping the main object at maps[0]
93dc2d
+     from the call-site (i.e. _dl_sort_maps(maps+1,nmaps-1)) is in general
93dc2d
+     no longer valid, since traversing along object dependency-links
93dc2d
+     may "find" the main object even when it is not included in the initial
93dc2d
+     order (e.g. a dlopen()'ed shared object can have circular dependencies
93dc2d
+     linked back to itself). In such a case, traversing N-1 objects will
93dc2d
+     create a N-object result, and raise problems.
93dc2d
+
93dc2d
+     To summarize, just passing in the full list, and iterating from back
93dc2d
+     to front makes things much more straightforward.  */
93dc2d
+
93dc2d
+  /* Array to hold RPO sorting results, before we copy back to maps[].  */
93dc2d
+  struct link_map *rpo[nmaps];
93dc2d
+
93dc2d
+  /* The 'head' position during each DFS iteration. Note that we start at
93dc2d
+     one past the last element due to first-decrement-then-store (see the
93dc2d
+     bottom of above dfs_traversal() routine).  */
93dc2d
+  struct link_map **rpo_head = &rpo[nmaps];
93dc2d
+
93dc2d
+  bool do_reldeps = false;
93dc2d
+  bool *do_reldeps_ref = (for_fini ? &do_reldeps : NULL);
93dc2d
+
93dc2d
+  for (int i = nmaps - 1; i >= 0; i--)
93dc2d
+    {
93dc2d
+      dfs_traversal (&rpo_head, maps[i], do_reldeps_ref);
93dc2d
+
93dc2d
+      /* We can break early if all objects are already placed.  */
93dc2d
+      if (rpo_head == rpo)
93dc2d
+	goto end;
93dc2d
+    }
93dc2d
+  assert (rpo_head == rpo);
93dc2d
+
93dc2d
+ end:
93dc2d
+  /* Here we may do a second pass of sorting, using only l_initfini[]
93dc2d
+     static dependency links. This is avoided if !FOR_FINI or if we didn't
93dc2d
+     find any reldeps in the first DFS traversal.
93dc2d
+
93dc2d
+     The reason we do this is: while it is unspecified how circular
93dc2d
+     dependencies should be handled, the presumed reasonable behavior is to
93dc2d
+     have destructors to respect static dependency links as much as possible,
93dc2d
+     overriding reldeps if needed. And the first sorting pass, which takes
93dc2d
+     l_initfini/l_reldeps links equally, may not preserve this priority.
93dc2d
+
93dc2d
+     Hence we do a 2nd sorting pass, taking only DT_NEEDED links into account
93dc2d
+     (see how the do_reldeps argument to dfs_traversal() is NULL below).  */
93dc2d
+  if (do_reldeps)
93dc2d
+    {
93dc2d
+      for (int i = nmaps - 1; i >= 0; i--)
93dc2d
+	rpo[i]->l_visited = 0;
93dc2d
+
93dc2d
+      struct link_map **maps_head = &maps[nmaps];
93dc2d
+      for (int i = nmaps - 1; i >= 0; i--)
93dc2d
+	{
93dc2d
+	  dfs_traversal (&maps_head, rpo[i], NULL);
93dc2d
+
93dc2d
+	  /* We can break early if all objects are already placed.
93dc2d
+	     The below memcpy is not needed in the do_reldeps case here,
93dc2d
+	     since we wrote back to maps[] during DFS traversal.  */
93dc2d
+	  if (maps_head == maps)
93dc2d
+	    return;
93dc2d
+	}
93dc2d
+      assert (maps_head == maps);
93dc2d
+      return;
93dc2d
+    }
93dc2d
+
93dc2d
+  memcpy (maps, rpo, sizeof (struct link_map *) * nmaps);
93dc2d
+}
93dc2d
+
93dc2d
+void
93dc2d
+_dl_sort_maps_init (void)
93dc2d
+{
93dc2d
+  int32_t algorithm = TUNABLE_GET (glibc, rtld, dynamic_sort, int32_t, NULL);
93dc2d
+  GLRO(dl_dso_sort_algo) = algorithm == 1 ? dso_sort_algorithm_original
93dc2d
+					  : dso_sort_algorithm_dfs;
93dc2d
+}
93dc2d
+
93dc2d
+void
93dc2d
+_dl_sort_maps (struct link_map **maps, unsigned int nmaps,
93dc2d
+	       unsigned int skip, bool for_fini)
93dc2d
+{
93dc2d
+  /* It can be tempting to use a static function pointer to store and call
93dc2d
+     the current selected sorting algorithm routine, but experimentation
93dc2d
+     shows that current processors still do not handle indirect branches
93dc2d
+     that efficiently, plus a static function pointer will involve
93dc2d
+     PTR_MANGLE/DEMANGLE, further impairing performance of small, common
93dc2d
+     input cases. A simple if-case with direct function calls appears to
93dc2d
+     be the fastest.  */
93dc2d
+  if (__glibc_likely (GLRO(dl_dso_sort_algo) == dso_sort_algorithm_original))
93dc2d
+    _dl_sort_maps_original (maps, nmaps, skip, for_fini);
93dc2d
+  else
93dc2d
+    _dl_sort_maps_dfs (maps, nmaps, skip, for_fini);
93dc2d
+}
93dc2d
+
93dc2d
+#endif /* HAVE_TUNABLES.  */
93dc2d
diff --git a/elf/dl-support.c b/elf/dl-support.c
93dc2d
index d8c06ba7eb4c76ea..c5ee5d33aa7e1d65 100644
93dc2d
--- a/elf/dl-support.c
93dc2d
+++ b/elf/dl-support.c
93dc2d
@@ -166,6 +166,8 @@ size_t _dl_phnum;
93dc2d
 uint64_t _dl_hwcap;
93dc2d
 uint64_t _dl_hwcap2;
93dc2d
 
93dc2d
+enum dso_sort_algorithm _dl_dso_sort_algo;
93dc2d
+
93dc2d
 /* The value of the FPU control word the kernel will preset in hardware.  */
93dc2d
 fpu_control_t _dl_fpu_control = _FPU_DEFAULT;
93dc2d
 
93dc2d
diff --git a/elf/dl-sysdep.c b/elf/dl-sysdep.c
93dc2d
index 2c684c2db2a1f59b..4dc366eea445e974 100644
93dc2d
--- a/elf/dl-sysdep.c
93dc2d
+++ b/elf/dl-sysdep.c
93dc2d
@@ -231,6 +231,9 @@ _dl_sysdep_start (void **start_argptr,
93dc2d
 
93dc2d
   __tunables_init (_environ);
93dc2d
 
93dc2d
+  /* Initialize DSO sorting algorithm after tunables.  */
93dc2d
+  _dl_sort_maps_init ();
93dc2d
+
93dc2d
 #ifdef DL_SYSDEP_INIT
93dc2d
   DL_SYSDEP_INIT;
93dc2d
 #endif
93dc2d
diff --git a/elf/dl-tunables.list b/elf/dl-tunables.list
93dc2d
index 8ddd4a23142a941b..46ffb2378416f90f 100644
93dc2d
--- a/elf/dl-tunables.list
93dc2d
+++ b/elf/dl-tunables.list
93dc2d
@@ -156,4 +156,13 @@ glibc {
93dc2d
       security_level: SXID_IGNORE
93dc2d
     }
93dc2d
   }
93dc2d
+
93dc2d
+  rtld {
93dc2d
+    dynamic_sort {
93dc2d
+      type: INT_32
93dc2d
+      minval: 1
93dc2d
+      maxval: 2
93dc2d
+      default: 1
93dc2d
+    }
93dc2d
+  }
93dc2d
 }
93dc2d
diff --git a/elf/dso-sort-tests-1.def b/elf/dso-sort-tests-1.def
93dc2d
index 873ddf55d91155c6..5f7f18ef270bc12d 100644
93dc2d
--- a/elf/dso-sort-tests-1.def
93dc2d
+++ b/elf/dso-sort-tests-1.def
93dc2d
@@ -62,5 +62,5 @@ output: b>a>{}
93dc2d
 # The below expected outputs are what the two algorithms currently produce
93dc2d
 # respectively, for regression testing purposes.
93dc2d
 tst-bz15311: {+a;+e;+f;+g;+d;%d;-d;-g;-f;-e;-a};a->b->c->d;d=>[ba];c=>a;b=>e=>a;c=>f=>b;d=>g=>c
93dc2d
-xfail_output(glibc.rtld.dynamic_sort=1): {+a[d>c>b>a>];+e[e>];+f[f>];+g[g>];+d[];%d(b(e(a()))a()g(c(a()f(b(e(a()))))));-d[];-g[];-f[];-e[];-a[
93dc2d
+output(glibc.rtld.dynamic_sort=1): {+a[d>c>b>a>];+e[e>];+f[f>];+g[g>];+d[];%d(b(e(a()))a()g(c(a()f(b(e(a()))))));-d[];-g[];-f[];-e[];-a[
93dc2d
 output(glibc.rtld.dynamic_sort=2): {+a[d>c>b>a>];+e[e>];+f[f>];+g[g>];+d[];%d(b(e(a()))a()g(c(a()f(b(e(a()))))));-d[];-g[];-f[];-e[];-a[
93dc2d
diff --git a/elf/rtld.c b/elf/rtld.c
93dc2d
index 6bbb373c5743cb99..84eac9a8df7125a6 100644
93dc2d
--- a/elf/rtld.c
93dc2d
+++ b/elf/rtld.c
93dc2d
@@ -1426,6 +1426,9 @@ dl_main (const ElfW(Phdr) *phdr,
93dc2d
       main_map->l_name = (char *) "";
93dc2d
       *user_entry = main_map->l_entry;
93dc2d
 
93dc2d
+      /* Set bit indicating this is the main program map.  */
93dc2d
+      main_map->l_main_map = 1;
93dc2d
+
93dc2d
 #ifdef HAVE_AUX_VECTOR
93dc2d
       /* Adjust the on-stack auxiliary vector so that it looks like the
93dc2d
 	 binary was executed directly.  */
93dc2d
diff --git a/elf/tst-rtld-list-tunables.exp b/elf/tst-rtld-list-tunables.exp
93dc2d
index 9f66c528855fb21d..9bf572715f996ca6 100644
93dc2d
--- a/elf/tst-rtld-list-tunables.exp
93dc2d
+++ b/elf/tst-rtld-list-tunables.exp
93dc2d
@@ -10,5 +10,6 @@ glibc.malloc.tcache_max: 0x0 (min: 0x0, max: 0x[f]+)
93dc2d
 glibc.malloc.tcache_unsorted_limit: 0x0 (min: 0x0, max: 0x[f]+)
93dc2d
 glibc.malloc.top_pad: 0x0 (min: 0x0, max: 0x[f]+)
93dc2d
 glibc.malloc.trim_threshold: 0x0 (min: 0x0, max: 0x[f]+)
93dc2d
+glibc.rtld.dynamic_sort: 1 (min: 1, max: 2)
93dc2d
 glibc.rtld.nns: 0x4 (min: 0x1, max: 0x10)
93dc2d
 glibc.rtld.optional_static_tls: 0x200 (min: 0x0, max: 0x[f]+)
93dc2d
diff --git a/include/link.h b/include/link.h
93dc2d
index c46aced9f7b43ba0..4dcf01d8aea90bc2 100644
93dc2d
--- a/include/link.h
93dc2d
+++ b/include/link.h
93dc2d
@@ -181,6 +181,11 @@ struct link_map
93dc2d
     unsigned int l_init_called:1; /* Nonzero if DT_INIT function called.  */
93dc2d
     unsigned int l_global:1;	/* Nonzero if object in _dl_global_scope.  */
93dc2d
     unsigned int l_reserved:2;	/* Reserved for internal use.  */
93dc2d
+    unsigned int l_main_map:1;  /* Nonzero for the map of the main program.  */
93dc2d
+    unsigned int l_visited:1;   /* Used internally for map dependency
93dc2d
+				   graph traversal.  */
93dc2d
+    unsigned int l_map_used:1;  /* These two bits are used during traversal */
93dc2d
+    unsigned int l_map_done:1;  /* of maps in _dl_close_worker. */
93dc2d
     unsigned int l_phdr_allocated:1; /* Nonzero if the data structure pointed
93dc2d
 					to by `l_phdr' is allocated.  */
93dc2d
     unsigned int l_soname_added:1; /* Nonzero if the SONAME is for sure in
93dc2d
diff --git a/manual/tunables.texi b/manual/tunables.texi
93dc2d
index 658547c6137bf177..10f4d75993f9940f 100644
93dc2d
--- a/manual/tunables.texi
93dc2d
+++ b/manual/tunables.texi
93dc2d
@@ -309,6 +309,17 @@ changed once allocated at process startup.  The default allocation of
93dc2d
 optional static TLS is 512 bytes and is allocated in every thread.
93dc2d
 @end deftp
93dc2d
 
93dc2d
+@deftp Tunable glibc.rtld.dynamic_sort
93dc2d
+Sets the algorithm to use for DSO sorting, valid values are @samp{1} and
93dc2d
+@samp{2}.  For a value of @samp{1}, an older O(n^3) algorithm is used, which is
93dc2d
+long time tested, but may have performance issues when dependencies between
93dc2d
+shared objects contain cycles due to circular dependencies.  When set to the
93dc2d
+value of @samp{2}, a different algorithm is used, which implements a
93dc2d
+topological sort through depth-first search, and does not exhibit the
93dc2d
+performance issues of @samp{1}.
93dc2d
+
93dc2d
+The default value of this tunable is @samp{1}.
93dc2d
+@end deftp
93dc2d
 
93dc2d
 @node Elision Tunables
93dc2d
 @section Elision Tunables
93dc2d
diff --git a/sysdeps/generic/ldsodefs.h b/sysdeps/generic/ldsodefs.h
93dc2d
index fcbbf6974827cdf1..bcf1f199c5985c65 100644
93dc2d
--- a/sysdeps/generic/ldsodefs.h
93dc2d
+++ b/sysdeps/generic/ldsodefs.h
93dc2d
@@ -245,6 +245,13 @@ enum allowmask
93dc2d
   };
93dc2d
 
93dc2d
 
93dc2d
+/* DSO sort algorithm to use (check dl-sort-maps.c).  */
93dc2d
+enum dso_sort_algorithm
93dc2d
+  {
93dc2d
+    dso_sort_algorithm_original,
93dc2d
+    dso_sort_algorithm_dfs
93dc2d
+  };
93dc2d
+
93dc2d
 struct audit_ifaces
93dc2d
 {
93dc2d
   void (*activity) (uintptr_t *, unsigned int);
93dc2d
@@ -672,6 +679,8 @@ struct rtld_global_ro
93dc2d
      platforms.  */
93dc2d
   EXTERN uint64_t _dl_hwcap2;
93dc2d
 
93dc2d
+  EXTERN enum dso_sort_algorithm _dl_dso_sort_algo;
93dc2d
+
93dc2d
 #ifdef SHARED
93dc2d
   /* We add a function table to _rtld_global which is then used to
93dc2d
      call the function instead of going through the PLT.  The result
93dc2d
@@ -1098,7 +1107,7 @@ extern void _dl_fini (void) attribute_hidden;
93dc2d
 
93dc2d
 /* Sort array MAPS according to dependencies of the contained objects.  */
93dc2d
 extern void _dl_sort_maps (struct link_map **maps, unsigned int nmaps,
93dc2d
-			   char *used, bool for_fini) attribute_hidden;
93dc2d
+			   unsigned int skip, bool for_fini) attribute_hidden;
93dc2d
 
93dc2d
 /* The dynamic linker calls this function before and having changing
93dc2d
    any shared object mappings.  The `r_state' member of `struct r_debug'
93dc2d
@@ -1225,6 +1234,9 @@ extern struct link_map * _dl_get_dl_main_map (void)
93dc2d
 # endif
93dc2d
 #endif
93dc2d
 
93dc2d
+/* Initialize the DSO sort algorithm to use.  */
93dc2d
+extern void _dl_sort_maps_init (void) attribute_hidden;
93dc2d
+
93dc2d
 /* Initialization of libpthread for statically linked applications.
93dc2d
    If libpthread is not linked in, this is an empty function.  */
93dc2d
 void __pthread_initialize_minimal (void) weak_function;