1d4c55
commit 15a0c5730d1d5aeb95f50c9ec7470640084feae8
1d4c55
Author: Chung-Lin Tang <cltang@codesourcery.com>
1d4c55
Date:   Thu Oct 21 21:41:22 2021 +0800
1d4c55
1d4c55
    elf: Fix slow DSO sorting behavior in dynamic loader (BZ #17645)
1d4c55
    
1d4c55
    This second patch contains the actual implementation of a new sorting algorithm
1d4c55
    for shared objects in the dynamic loader, which solves the slow behavior that
1d4c55
    the current "old" algorithm falls into when the DSO set contains circular
1d4c55
    dependencies.
1d4c55
    
1d4c55
    The new algorithm implemented here is simply depth-first search (DFS) to obtain
1d4c55
    the Reverse-Post Order (RPO) sequence, a topological sort. A new l_visited:1
1d4c55
    bitfield is added to struct link_map to more elegantly facilitate such a search.
1d4c55
    
1d4c55
    The DFS algorithm is applied to the input maps[nmap-1] backwards towards
1d4c55
    maps[0]. This has the effect of a more "shallow" recursion depth in general
1d4c55
    since the input is in BFS. Also, when combined with the natural order of
1d4c55
    processing l_initfini[] at each node, this creates a resulting output sorting
1d4c55
    closer to the intuitive "left-to-right" order in most cases.
1d4c55
    
1d4c55
    Another notable implementation adjustment related to this _dl_sort_maps change
1d4c55
    is the removing of two char arrays 'used' and 'done' in _dl_close_worker to
1d4c55
    represent two per-map attributes. This has been changed to simply use two new
1d4c55
    bit-fields l_map_used:1, l_map_done:1 added to struct link_map. This also allows
1d4c55
    discarding the clunky 'used' array sorting that _dl_sort_maps had to sometimes
1d4c55
    do along the way.
1d4c55
    
1d4c55
    Tunable support for switching between different sorting algorithms at runtime is
1d4c55
    also added. A new tunable 'glibc.rtld.dynamic_sort' with current valid values 1
1d4c55
    (old algorithm) and 2 (new DFS algorithm) has been added. At time of commit
1d4c55
    of this patch, the default setting is 1 (old algorithm).
1d4c55
    
1d4c55
    Signed-off-by: Chung-Lin Tang  <cltang@codesourcery.com>
1d4c55
    Reviewed-by: Adhemerval Zanella  <adhemerval.zanella@linaro.org>
1d4c55
1d4c55
Conflicts:
1d4c55
	elf/dl-tunables.list
1d4c55
	  (No mem.tagging tunable downstream.)
1d4c55
1d4c55
diff --git a/elf/dl-close.c b/elf/dl-close.c
1d4c55
index 74ca9a85dd309780..22225efb3226c3e1 100644
1d4c55
--- a/elf/dl-close.c
1d4c55
+++ b/elf/dl-close.c
1d4c55
@@ -167,8 +167,6 @@ _dl_close_worker (struct link_map *map, bool force)
1d4c55
 
1d4c55
   bool any_tls = false;
1d4c55
   const unsigned int nloaded = ns->_ns_nloaded;
1d4c55
-  char used[nloaded];
1d4c55
-  char done[nloaded];
1d4c55
   struct link_map *maps[nloaded];
1d4c55
 
1d4c55
   /* Run over the list and assign indexes to the link maps and enter
1d4c55
@@ -176,24 +174,21 @@ _dl_close_worker (struct link_map *map, bool force)
1d4c55
   int idx = 0;
1d4c55
   for (struct link_map *l = ns->_ns_loaded; l != NULL; l = l->l_next)
1d4c55
     {
1d4c55
+      l->l_map_used = 0;
1d4c55
+      l->l_map_done = 0;
1d4c55
       l->l_idx = idx;
1d4c55
       maps[idx] = l;
1d4c55
       ++idx;
1d4c55
-
1d4c55
     }
1d4c55
   assert (idx == nloaded);
1d4c55
 
1d4c55
-  /* Prepare the bitmaps.  */
1d4c55
-  memset (used, '\0', sizeof (used));
1d4c55
-  memset (done, '\0', sizeof (done));
1d4c55
-
1d4c55
   /* Keep track of the lowest index link map we have covered already.  */
1d4c55
   int done_index = -1;
1d4c55
   while (++done_index < nloaded)
1d4c55
     {
1d4c55
       struct link_map *l = maps[done_index];
1d4c55
 
1d4c55
-      if (done[done_index])
1d4c55
+      if (l->l_map_done)
1d4c55
 	/* Already handled.  */
1d4c55
 	continue;
1d4c55
 
1d4c55
@@ -204,12 +199,12 @@ _dl_close_worker (struct link_map *map, bool force)
1d4c55
 	  /* See CONCURRENCY NOTES in cxa_thread_atexit_impl.c to know why
1d4c55
 	     acquire is sufficient and correct.  */
1d4c55
 	  && atomic_load_acquire (&l->l_tls_dtor_count) == 0
1d4c55
-	  && !used[done_index])
1d4c55
+	  && !l->l_map_used)
1d4c55
 	continue;
1d4c55
 
1d4c55
       /* We need this object and we handle it now.  */
1d4c55
-      done[done_index] = 1;
1d4c55
-      used[done_index] = 1;
1d4c55
+      l->l_map_used = 1;
1d4c55
+      l->l_map_done = 1;
1d4c55
       /* Signal the object is still needed.  */
1d4c55
       l->l_idx = IDX_STILL_USED;
1d4c55
 
1d4c55
@@ -225,9 +220,9 @@ _dl_close_worker (struct link_map *map, bool force)
1d4c55
 		{
1d4c55
 		  assert ((*lp)->l_idx >= 0 && (*lp)->l_idx < nloaded);
1d4c55
 
1d4c55
-		  if (!used[(*lp)->l_idx])
1d4c55
+		  if (!(*lp)->l_map_used)
1d4c55
 		    {
1d4c55
-		      used[(*lp)->l_idx] = 1;
1d4c55
+		      (*lp)->l_map_used = 1;
1d4c55
 		      /* If we marked a new object as used, and we've
1d4c55
 			 already processed it, then we need to go back
1d4c55
 			 and process again from that point forward to
1d4c55
@@ -250,9 +245,9 @@ _dl_close_worker (struct link_map *map, bool force)
1d4c55
 	      {
1d4c55
 		assert (jmap->l_idx >= 0 && jmap->l_idx < nloaded);
1d4c55
 
1d4c55
-		if (!used[jmap->l_idx])
1d4c55
+		if (!jmap->l_map_used)
1d4c55
 		  {
1d4c55
-		    used[jmap->l_idx] = 1;
1d4c55
+		    jmap->l_map_used = 1;
1d4c55
 		    if (jmap->l_idx - 1 < done_index)
1d4c55
 		      done_index = jmap->l_idx - 1;
1d4c55
 		  }
1d4c55
@@ -262,8 +257,7 @@ _dl_close_worker (struct link_map *map, bool force)
1d4c55
 
1d4c55
   /* Sort the entries.  We can skip looking for the binary itself which is
1d4c55
      at the front of the search list for the main namespace.  */
1d4c55
-  _dl_sort_maps (maps + (nsid == LM_ID_BASE), nloaded - (nsid == LM_ID_BASE),
1d4c55
-		 used + (nsid == LM_ID_BASE), true);
1d4c55
+  _dl_sort_maps (maps, nloaded, (nsid == LM_ID_BASE), true);
1d4c55
 
1d4c55
   /* Call all termination functions at once.  */
1d4c55
   bool unload_any = false;
1d4c55
@@ -277,7 +271,7 @@ _dl_close_worker (struct link_map *map, bool force)
1d4c55
       /* All elements must be in the same namespace.  */
1d4c55
       assert (imap->l_ns == nsid);
1d4c55
 
1d4c55
-      if (!used[i])
1d4c55
+      if (!imap->l_map_used)
1d4c55
 	{
1d4c55
 	  assert (imap->l_type == lt_loaded && !imap->l_nodelete_active);
1d4c55
 
1d4c55
@@ -315,7 +309,7 @@ _dl_close_worker (struct link_map *map, bool force)
1d4c55
 	  if (i < first_loaded)
1d4c55
 	    first_loaded = i;
1d4c55
 	}
1d4c55
-      /* Else used[i].  */
1d4c55
+      /* Else imap->l_map_used.  */
1d4c55
       else if (imap->l_type == lt_loaded)
1d4c55
 	{
1d4c55
 	  struct r_scope_elem *new_list = NULL;
1d4c55
@@ -524,7 +518,7 @@ _dl_close_worker (struct link_map *map, bool force)
1d4c55
   for (unsigned int i = first_loaded; i < nloaded; ++i)
1d4c55
     {
1d4c55
       struct link_map *imap = maps[i];
1d4c55
-      if (!used[i])
1d4c55
+      if (!imap->l_map_used)
1d4c55
 	{
1d4c55
 	  assert (imap->l_type == lt_loaded);
1d4c55
 
1d4c55
diff --git a/elf/dl-deps.c b/elf/dl-deps.c
1d4c55
index 007069f670eced95..9365d54c8e03e5f4 100644
1d4c55
--- a/elf/dl-deps.c
1d4c55
+++ b/elf/dl-deps.c
1d4c55
@@ -612,10 +612,9 @@ Filters not supported with LD_TRACE_PRELINKING"));
1d4c55
 
1d4c55
   /* If libc.so.6 is the main map, it participates in the sort, so
1d4c55
      that the relocation order is correct regarding libc.so.6.  */
1d4c55
-  if (l_initfini[0] == GL (dl_ns)[l_initfini[0]->l_ns].libc_map)
1d4c55
-    _dl_sort_maps (l_initfini, nlist, NULL, false);
1d4c55
-  else
1d4c55
-    _dl_sort_maps (&l_initfini[1], nlist - 1, NULL, false);
1d4c55
+  _dl_sort_maps (l_initfini, nlist,
1d4c55
+		 (l_initfini[0] != GL (dl_ns)[l_initfini[0]->l_ns].libc_map),
1d4c55
+		 false);
1d4c55
 
1d4c55
   /* Terminate the list of dependencies.  */
1d4c55
   l_initfini[nlist] = NULL;
1d4c55
diff --git a/elf/dl-fini.c b/elf/dl-fini.c
1d4c55
index eea9d8aad736a99e..e14259a3c8806e0d 100644
1d4c55
--- a/elf/dl-fini.c
1d4c55
+++ b/elf/dl-fini.c
1d4c55
@@ -95,8 +95,7 @@ _dl_fini (void)
1d4c55
 	  /* Now we have to do the sorting.  We can skip looking for the
1d4c55
 	     binary itself which is at the front of the search list for
1d4c55
 	     the main namespace.  */
1d4c55
-	  _dl_sort_maps (maps + (ns == LM_ID_BASE), nmaps - (ns == LM_ID_BASE),
1d4c55
-			 NULL, true);
1d4c55
+	  _dl_sort_maps (maps, nmaps, (ns == LM_ID_BASE), true);
1d4c55
 
1d4c55
 	  /* We do not rely on the linked list of loaded object anymore
1d4c55
 	     from this point on.  We have our own list here (maps).  The
1d4c55
diff --git a/elf/dl-sort-maps.c b/elf/dl-sort-maps.c
1d4c55
index b2a01ede627be1e9..398a08f28c4d9ff1 100644
1d4c55
--- a/elf/dl-sort-maps.c
1d4c55
+++ b/elf/dl-sort-maps.c
1d4c55
@@ -16,16 +16,24 @@
1d4c55
    License along with the GNU C Library; if not, see
1d4c55
    <http://www.gnu.org/licenses/>.  */
1d4c55
 
1d4c55
+#include <assert.h>
1d4c55
 #include <ldsodefs.h>
1d4c55
+#include <elf/dl-tunables.h>
1d4c55
 
1d4c55
+/* Note: this is the older, "original" sorting algorithm, being used as
1d4c55
+   default up to 2.35.
1d4c55
 
1d4c55
-/* Sort array MAPS according to dependencies of the contained objects.
1d4c55
-   Array USED, if non-NULL, is permutated along MAPS.  If FOR_FINI this is
1d4c55
-   called for finishing an object.  */
1d4c55
-void
1d4c55
-_dl_sort_maps (struct link_map **maps, unsigned int nmaps, char *used,
1d4c55
-	       bool for_fini)
1d4c55
+   Sort array MAPS according to dependencies of the contained objects.
1d4c55
+   If FOR_FINI is true, this is called for finishing an object.  */
1d4c55
+static void
1d4c55
+_dl_sort_maps_original (struct link_map **maps, unsigned int nmaps,
1d4c55
+			unsigned int skip, bool for_fini)
1d4c55
 {
1d4c55
+  /* Allows caller to do the common optimization of skipping the first map,
1d4c55
+     usually the main binary.  */
1d4c55
+  maps += skip;
1d4c55
+  nmaps -= skip;
1d4c55
+
1d4c55
   /* A list of one element need not be sorted.  */
1d4c55
   if (nmaps <= 1)
1d4c55
     return;
1d4c55
@@ -66,14 +74,6 @@ _dl_sort_maps (struct link_map **maps, unsigned int nmaps, char *used,
1d4c55
 			   (k - i) * sizeof (maps[0]));
1d4c55
 		  maps[k] = thisp;
1d4c55
 
1d4c55
-		  if (used != NULL)
1d4c55
-		    {
1d4c55
-		      char here_used = used[i];
1d4c55
-		      memmove (&used[i], &used[i + 1],
1d4c55
-			       (k - i) * sizeof (used[0]));
1d4c55
-		      used[k] = here_used;
1d4c55
-		    }
1d4c55
-
1d4c55
 		  if (seen[i + 1] > nmaps - i)
1d4c55
 		    {
1d4c55
 		      ++i;
1d4c55
@@ -120,3 +120,183 @@ _dl_sort_maps (struct link_map **maps, unsigned int nmaps, char *used,
1d4c55
     next:;
1d4c55
     }
1d4c55
 }
1d4c55
+
1d4c55
+#if !HAVE_TUNABLES
1d4c55
+/* In this case, just default to the original algorithm.  */
1d4c55
+strong_alias (_dl_sort_maps_original, _dl_sort_maps);
1d4c55
+#else
1d4c55
+
1d4c55
+/* We use a recursive function due to its better clarity and ease of
1d4c55
+   implementation, as well as faster execution speed. We already use
1d4c55
+   alloca() for list allocation during the breadth-first search of
1d4c55
+   dependencies in _dl_map_object_deps(), and this should be on the
1d4c55
+   same order of worst-case stack usage.
1d4c55
+
1d4c55
+   Note: the '*rpo' parameter is supposed to point to one past the
1d4c55
+   last element of the array where we save the sort results, and is
1d4c55
+   decremented before storing the current map at each level.  */
1d4c55
+
1d4c55
+static void
1d4c55
+dfs_traversal (struct link_map ***rpo, struct link_map *map,
1d4c55
+	       bool *do_reldeps)
1d4c55
+{
1d4c55
+  if (map->l_visited)
1d4c55
+    return;
1d4c55
+
1d4c55
+  map->l_visited = 1;
1d4c55
+
1d4c55
+  if (map->l_initfini)
1d4c55
+    {
1d4c55
+      for (int i = 0; map->l_initfini[i] != NULL; i++)
1d4c55
+	{
1d4c55
+	  struct link_map *dep = map->l_initfini[i];
1d4c55
+	  if (dep->l_visited == 0
1d4c55
+	      && dep->l_main_map == 0)
1d4c55
+	    dfs_traversal (rpo, dep, do_reldeps);
1d4c55
+	}
1d4c55
+    }
1d4c55
+
1d4c55
+  if (__glibc_unlikely (do_reldeps != NULL && map->l_reldeps != NULL))
1d4c55
+    {
1d4c55
+      /* Indicate that we encountered relocation dependencies during
1d4c55
+	 traversal.  */
1d4c55
+      *do_reldeps = true;
1d4c55
+
1d4c55
+      for (int m = map->l_reldeps->act - 1; m >= 0; m--)
1d4c55
+	{
1d4c55
+	  struct link_map *dep = map->l_reldeps->list[m];
1d4c55
+	  if (dep->l_visited == 0
1d4c55
+	      && dep->l_main_map == 0)
1d4c55
+	    dfs_traversal (rpo, dep, do_reldeps);
1d4c55
+	}
1d4c55
+    }
1d4c55
+
1d4c55
+  *rpo -= 1;
1d4c55
+  **rpo = map;
1d4c55
+}
1d4c55
+
1d4c55
+/* Topologically sort array MAPS according to dependencies of the contained
1d4c55
+   objects.  */
1d4c55
+
1d4c55
+static void
1d4c55
+_dl_sort_maps_dfs (struct link_map **maps, unsigned int nmaps,
1d4c55
+		   unsigned int skip __attribute__ ((unused)), bool for_fini)
1d4c55
+{
1d4c55
+  for (int i = nmaps - 1; i >= 0; i--)
1d4c55
+    maps[i]->l_visited = 0;
1d4c55
+
1d4c55
+  /* We apply DFS traversal for each of maps[i] until the whole total order
1d4c55
+     is found and we're at the start of the Reverse-Postorder (RPO) sequence,
1d4c55
+     which is a topological sort.
1d4c55
+
1d4c55
+     We go from maps[nmaps - 1] backwards towards maps[0] at this level.
1d4c55
+     Due to the breadth-first search (BFS) ordering we receive, going
1d4c55
+     backwards usually gives a more shallow depth-first recursion depth,
1d4c55
+     adding more stack usage safety. Also, combined with the natural
1d4c55
+     processing order of l_initfini[] at each node during DFS, this maintains
1d4c55
+     an ordering closer to the original link ordering in the sorting results
1d4c55
+     under most simpler cases.
1d4c55
+
1d4c55
+     Another reason we order the top level backwards is that maps[0] is
1d4c55
+     usually exactly the main object of which we're in the midst of
1d4c55
+     _dl_map_object_deps() processing, and maps[0]->l_initfini[] is still
1d4c55
+     blank. If we start the traversal from maps[0], since having no
1d4c55
+     dependencies yet filled in, maps[0] will always be immediately
1d4c55
+     incorrectly placed at the last place in the order (first in reverse).
1d4c55
+     Adjusting the order so that maps[0] is last traversed naturally avoids
1d4c55
+     this problem.
1d4c55
+
1d4c55
+     Further, the old "optimization" of skipping the main object at maps[0]
1d4c55
+     from the call-site (i.e. _dl_sort_maps(maps+1,nmaps-1)) is in general
1d4c55
+     no longer valid, since traversing along object dependency-links
1d4c55
+     may "find" the main object even when it is not included in the initial
1d4c55
+     order (e.g. a dlopen()'ed shared object can have circular dependencies
1d4c55
+     linked back to itself). In such a case, traversing N-1 objects will
1d4c55
+     create an N-object result, and raise problems.
1d4c55
+
1d4c55
+     To summarize, just passing in the full list, and iterating from back
1d4c55
+     to front makes things much more straightforward.  */
1d4c55
+
1d4c55
+  /* Array to hold RPO sorting results, before we copy back to maps[].  */
1d4c55
+  struct link_map *rpo[nmaps];
1d4c55
+
1d4c55
+  /* The 'head' position during each DFS iteration. Note that we start at
1d4c55
+     one past the last element due to first-decrement-then-store (see the
1d4c55
+     bottom of above dfs_traversal() routine).  */
1d4c55
+  struct link_map **rpo_head = &rpo[nmaps];
1d4c55
+
1d4c55
+  bool do_reldeps = false;
1d4c55
+  bool *do_reldeps_ref = (for_fini ? &do_reldeps : NULL);
1d4c55
+
1d4c55
+  for (int i = nmaps - 1; i >= 0; i--)
1d4c55
+    {
1d4c55
+      dfs_traversal (&rpo_head, maps[i], do_reldeps_ref);
1d4c55
+
1d4c55
+      /* We can break early if all objects are already placed.  */
1d4c55
+      if (rpo_head == rpo)
1d4c55
+	goto end;
1d4c55
+    }
1d4c55
+  assert (rpo_head == rpo);
1d4c55
+
1d4c55
+ end:
1d4c55
+  /* Here we may do a second pass of sorting, using only l_initfini[]
1d4c55
+     static dependency links. This is avoided if !FOR_FINI or if we didn't
1d4c55
+     find any reldeps in the first DFS traversal.
1d4c55
+
1d4c55
+     The reason we do this is: while it is unspecified how circular
1d4c55
+     dependencies should be handled, the presumed reasonable behavior is to
1d4c55
+     have destructors respect static dependency links as much as possible,
1d4c55
+     overriding reldeps if needed. And the first sorting pass, which takes
1d4c55
+     l_initfini/l_reldeps links equally, may not preserve this priority.
1d4c55
+
1d4c55
+     Hence we do a 2nd sorting pass, taking only DT_NEEDED links into account
1d4c55
+     (see how the do_reldeps argument to dfs_traversal() is NULL below).  */
1d4c55
+  if (do_reldeps)
1d4c55
+    {
1d4c55
+      for (int i = nmaps - 1; i >= 0; i--)
1d4c55
+	rpo[i]->l_visited = 0;
1d4c55
+
1d4c55
+      struct link_map **maps_head = &maps[nmaps];
1d4c55
+      for (int i = nmaps - 1; i >= 0; i--)
1d4c55
+	{
1d4c55
+	  dfs_traversal (&maps_head, rpo[i], NULL);
1d4c55
+
1d4c55
+	  /* We can break early if all objects are already placed.
1d4c55
+	     The below memcpy is not needed in the do_reldeps case here,
1d4c55
+	     since we wrote back to maps[] during DFS traversal.  */
1d4c55
+	  if (maps_head == maps)
1d4c55
+	    return;
1d4c55
+	}
1d4c55
+      assert (maps_head == maps);
1d4c55
+      return;
1d4c55
+    }
1d4c55
+
1d4c55
+  memcpy (maps, rpo, sizeof (struct link_map *) * nmaps);
1d4c55
+}
1d4c55
+
1d4c55
+void
1d4c55
+_dl_sort_maps_init (void)
1d4c55
+{
1d4c55
+  int32_t algorithm = TUNABLE_GET (glibc, rtld, dynamic_sort, int32_t, NULL);
1d4c55
+  GLRO(dl_dso_sort_algo) = algorithm == 1 ? dso_sort_algorithm_original
1d4c55
+					  : dso_sort_algorithm_dfs;
1d4c55
+}
1d4c55
+
1d4c55
+void
1d4c55
+_dl_sort_maps (struct link_map **maps, unsigned int nmaps,
1d4c55
+	       unsigned int skip, bool for_fini)
1d4c55
+{
1d4c55
+  /* It can be tempting to use a static function pointer to store and call
1d4c55
+     the current selected sorting algorithm routine, but experimentation
1d4c55
+     shows that current processors still do not handle indirect branches
1d4c55
+     that efficiently, plus a static function pointer will involve
1d4c55
+     PTR_MANGLE/DEMANGLE, further impairing performance of small, common
1d4c55
+     input cases. A simple if-case with direct function calls appears to
1d4c55
+     be the fastest.  */
1d4c55
+  if (__glibc_likely (GLRO(dl_dso_sort_algo) == dso_sort_algorithm_original))
1d4c55
+    _dl_sort_maps_original (maps, nmaps, skip, for_fini);
1d4c55
+  else
1d4c55
+    _dl_sort_maps_dfs (maps, nmaps, skip, for_fini);
1d4c55
+}
1d4c55
+
1d4c55
+#endif /* HAVE_TUNABLES.  */
1d4c55
diff --git a/elf/dl-support.c b/elf/dl-support.c
1d4c55
index e9943e889ef447ad..ae03aec9764e29d3 100644
1d4c55
--- a/elf/dl-support.c
1d4c55
+++ b/elf/dl-support.c
1d4c55
@@ -155,6 +155,8 @@ size_t _dl_phnum;
1d4c55
 uint64_t _dl_hwcap __attribute__ ((nocommon));
1d4c55
 uint64_t _dl_hwcap2 __attribute__ ((nocommon));
1d4c55
 
1d4c55
+enum dso_sort_algorithm _dl_dso_sort_algo;
1d4c55
+
1d4c55
 /* The value of the FPU control word the kernel will preset in hardware.  */
1d4c55
 fpu_control_t _dl_fpu_control = _FPU_DEFAULT;
1d4c55
 
1d4c55
diff --git a/elf/dl-sysdep.c b/elf/dl-sysdep.c
1d4c55
index 998c5d52bcab8193..4e8a986541fc4c09 100644
1d4c55
--- a/elf/dl-sysdep.c
1d4c55
+++ b/elf/dl-sysdep.c
1d4c55
@@ -223,6 +223,9 @@ _dl_sysdep_start (void **start_argptr,
1d4c55
 
1d4c55
   __tunables_init (_environ);
1d4c55
 
1d4c55
+  /* Initialize DSO sorting algorithm after tunables.  */
1d4c55
+  _dl_sort_maps_init ();
1d4c55
+
1d4c55
 #ifdef DL_SYSDEP_INIT
1d4c55
   DL_SYSDEP_INIT;
1d4c55
 #endif
1d4c55
diff --git a/elf/dl-tunables.list b/elf/dl-tunables.list
1d4c55
index 6408a8e5ae92d2c6..54ef2a921310b229 100644
1d4c55
--- a/elf/dl-tunables.list
1d4c55
+++ b/elf/dl-tunables.list
1d4c55
@@ -140,4 +140,13 @@ glibc {
1d4c55
       default: 512
1d4c55
     }
1d4c55
   }
1d4c55
+
1d4c55
+  rtld {
1d4c55
+    dynamic_sort {
1d4c55
+      type: INT_32
1d4c55
+      minval: 1
1d4c55
+      maxval: 2
1d4c55
+      default: 1
1d4c55
+    }
1d4c55
+  }
1d4c55
 }
1d4c55
diff --git a/elf/dso-sort-tests-1.def b/elf/dso-sort-tests-1.def
1d4c55
index 873ddf55d91155c6..5f7f18ef270bc12d 100644
1d4c55
--- a/elf/dso-sort-tests-1.def
1d4c55
+++ b/elf/dso-sort-tests-1.def
1d4c55
@@ -62,5 +62,5 @@ output: b>a>{}
1d4c55
 # The below expected outputs are what the two algorithms currently produce
1d4c55
 # respectively, for regression testing purposes.
1d4c55
 tst-bz15311: {+a;+e;+f;+g;+d;%d;-d;-g;-f;-e;-a};a->b->c->d;d=>[ba];c=>a;b=>e=>a;c=>f=>b;d=>g=>c
1d4c55
-xfail_output(glibc.rtld.dynamic_sort=1): {+a[d>c>b>a>];+e[e>];+f[f>];+g[g>];+d[];%d(b(e(a()))a()g(c(a()f(b(e(a()))))));-d[];-g[];-f[];-e[];-a[
1d4c55
+output(glibc.rtld.dynamic_sort=1): {+a[d>c>b>a>];+e[e>];+f[f>];+g[g>];+d[];%d(b(e(a()))a()g(c(a()f(b(e(a()))))));-d[];-g[];-f[];-e[];-a[
1d4c55
 output(glibc.rtld.dynamic_sort=2): {+a[d>c>b>a>];+e[e>];+f[f>];+g[g>];+d[];%d(b(e(a()))a()g(c(a()f(b(e(a()))))));-d[];-g[];-f[];-e[];-a[
1d4c55
diff --git a/elf/rtld.c b/elf/rtld.c
1d4c55
index b47e84ca2fb6f03c..cd2cc4024a3581c2 100644
1d4c55
--- a/elf/rtld.c
1d4c55
+++ b/elf/rtld.c
1d4c55
@@ -1453,6 +1453,9 @@ dl_main (const ElfW(Phdr) *phdr,
1d4c55
       main_map->l_name = (char *) "";
1d4c55
       *user_entry = main_map->l_entry;
1d4c55
 
1d4c55
+      /* Set bit indicating this is the main program map.  */
1d4c55
+      main_map->l_main_map = 1;
1d4c55
+
1d4c55
 #ifdef HAVE_AUX_VECTOR
1d4c55
       /* Adjust the on-stack auxiliary vector so that it looks like the
1d4c55
 	 binary was executed directly.  */
1d4c55
diff --git a/elf/tst-rtld-list-tunables.exp b/elf/tst-rtld-list-tunables.exp
1d4c55
index 4f3f7ee4e30a2b42..118afc271057afd4 100644
1d4c55
--- a/elf/tst-rtld-list-tunables.exp
1d4c55
+++ b/elf/tst-rtld-list-tunables.exp
1d4c55
@@ -10,5 +10,6 @@ glibc.malloc.tcache_max: 0x0 (min: 0x0, max: 0x[f]+)
1d4c55
 glibc.malloc.tcache_unsorted_limit: 0x0 (min: 0x0, max: 0x[f]+)
1d4c55
 glibc.malloc.top_pad: 0x0 (min: 0x0, max: 0x[f]+)
1d4c55
 glibc.malloc.trim_threshold: 0x0 (min: 0x0, max: 0x[f]+)
1d4c55
+glibc.rtld.dynamic_sort: 1 (min: 1, max: 2)
1d4c55
 glibc.rtld.nns: 0x4 (min: 0x1, max: 0x10)
1d4c55
 glibc.rtld.optional_static_tls: 0x200 (min: 0x0, max: 0x[f]+)
1d4c55
diff --git a/include/link.h b/include/link.h
1d4c55
index dd491989beb41353..041ff5f753a9ee11 100644
1d4c55
--- a/include/link.h
1d4c55
+++ b/include/link.h
1d4c55
@@ -181,6 +181,11 @@ struct link_map
1d4c55
     unsigned int l_init_called:1; /* Nonzero if DT_INIT function called.  */
1d4c55
     unsigned int l_global:1;	/* Nonzero if object in _dl_global_scope.  */
1d4c55
     unsigned int l_reserved:2;	/* Reserved for internal use.  */
1d4c55
+    unsigned int l_main_map:1;  /* Nonzero for the map of the main program.  */
1d4c55
+    unsigned int l_visited:1;   /* Used internally for map dependency
1d4c55
+				   graph traversal.  */
1d4c55
+    unsigned int l_map_used:1;  /* These two bits are used during traversal */
1d4c55
+    unsigned int l_map_done:1;  /* of maps in _dl_close_worker. */
1d4c55
     unsigned int l_phdr_allocated:1; /* Nonzero if the data structure pointed
1d4c55
 					to by `l_phdr' is allocated.  */
1d4c55
     unsigned int l_soname_added:1; /* Nonzero if the SONAME is for sure in
1d4c55
diff --git a/manual/tunables.texi b/manual/tunables.texi
1d4c55
index 43272cf885d1e3e6..c3f96cdc85208926 100644
1d4c55
--- a/manual/tunables.texi
1d4c55
+++ b/manual/tunables.texi
1d4c55
@@ -303,6 +303,17 @@ changed once allocated at process startup.  The default allocation of
1d4c55
 optional static TLS is 512 bytes and is allocated in every thread.
1d4c55
 @end deftp
1d4c55
 
1d4c55
+@deftp Tunable glibc.rtld.dynamic_sort
1d4c55
+Sets the algorithm to use for DSO sorting; valid values are @samp{1} and
1d4c55
+@samp{2}.  For value of @samp{1}, an older O(n^3) algorithm is used, which is
1d4c55
+long time tested, but may have performance issues when dependencies between
1d4c55
+shared objects contain cycles due to circular dependencies.  When set to the
1d4c55
+value of @samp{2}, a different algorithm is used, which implements a
1d4c55
+topological sort through depth-first search, and does not exhibit the
1d4c55
+performance issues of @samp{1}.
1d4c55
+
1d4c55
+The default value of this tunable is @samp{1}.
1d4c55
+@end deftp
1d4c55
 
1d4c55
 @node Elision Tunables
1d4c55
 @section Elision Tunables
1d4c55
diff --git a/sysdeps/generic/ldsodefs.h b/sysdeps/generic/ldsodefs.h
1d4c55
index 5e56550a4d556fa7..9f09a4a280396659 100644
1d4c55
--- a/sysdeps/generic/ldsodefs.h
1d4c55
+++ b/sysdeps/generic/ldsodefs.h
1d4c55
@@ -240,6 +240,13 @@ enum allowmask
1d4c55
   };
1d4c55
 
1d4c55
 
1d4c55
+/* DSO sort algorithm to use (check dl-sort-maps.c).  */
1d4c55
+enum dso_sort_algorithm
1d4c55
+  {
1d4c55
+    dso_sort_algorithm_original,
1d4c55
+    dso_sort_algorithm_dfs
1d4c55
+  };
1d4c55
+
1d4c55
 struct audit_ifaces
1d4c55
 {
1d4c55
   void (*activity) (uintptr_t *, unsigned int);
1d4c55
@@ -633,6 +640,8 @@ struct rtld_global_ro
1d4c55
      platforms.  */
1d4c55
   EXTERN uint64_t _dl_hwcap2;
1d4c55
 
1d4c55
+  EXTERN enum dso_sort_algorithm _dl_dso_sort_algo;
1d4c55
+
1d4c55
 #ifdef SHARED
1d4c55
   /* We add a function table to _rtld_global which is then used to
1d4c55
      call the function instead of going through the PLT.  The result
1d4c55
@@ -1049,7 +1058,7 @@ extern void _dl_fini (void) attribute_hidden;
1d4c55
 
1d4c55
 /* Sort array MAPS according to dependencies of the contained objects.  */
1d4c55
 extern void _dl_sort_maps (struct link_map **maps, unsigned int nmaps,
1d4c55
-			   char *used, bool for_fini) attribute_hidden;
1d4c55
+			   unsigned int skip, bool for_fini) attribute_hidden;
1d4c55
 
1d4c55
 /* The dynamic linker calls this function before and having changing
1d4c55
    any shared object mappings.  The `r_state' member of `struct r_debug'
1d4c55
@@ -1167,6 +1176,9 @@ extern struct link_map * _dl_get_dl_main_map (void)
1d4c55
 # endif
1d4c55
 #endif
1d4c55
 
1d4c55
+/* Initialize the DSO sort algorithm to use.  */
1d4c55
+extern void _dl_sort_maps_init (void) attribute_hidden;
1d4c55
+
1d4c55
 /* Initialization of libpthread for statically linked applications.
1d4c55
    If libpthread is not linked in, this is an empty function.  */
1d4c55
 void __pthread_initialize_minimal (void) weak_function;