1f556a
commit d3c57027470b78dba79c6d931e4e409b1fecfc80
1f556a
Author: Patrick McGehearty <patrick.mcgehearty@oracle.com>
1f556a
Date:   Mon Sep 28 20:11:28 2020 +0000
1f556a
1f556a
    Reversing calculation of __x86_shared_non_temporal_threshold
1f556a
1f556a
    The __x86_shared_non_temporal_threshold determines when memcpy on x86
1f556a
    uses non_temporal stores to avoid pushing other data out of the last
1f556a
    level cache.
1f556a
1f556a
    This patch proposes to revert the calculation change made by H.J. Lu's
1f556a
    patch of June 2, 2017.
1f556a
1f556a
    H.J. Lu's patch selected a threshold suitable for a single thread
1f556a
    getting maximum performance. It was tuned using the single threaded
1f556a
    large memcpy micro benchmark on an 8 core processor. The last change
1f556a
    changes the threshold from using 3/4 of one thread's share of the
1f556a
    cache to using 3/4 of the entire cache of a multi-threaded system
1f556a
    before switching to non-temporal stores. Multi-threaded systems with
1f556a
    more than a few threads are server-class and typically have many
1f556a
    active threads. If one thread consumes 3/4 of the available cache for
1f556a
    all threads, it will cause other active threads to have data removed
1f556a
    from the cache. Two examples show the range of the effect. John
1f556a
    McCalpin's widely parallel Stream benchmark, which runs in parallel
1f556a
    and fetches data sequentially, saw a 20% slowdown with this patch on
1f556a
    an internal system test of 128 threads. This regression was discovered
1f556a
    when comparing OL8 performance to OL7.  An example that compares
1f556a
    normal stores to non-temporal stores may be found at
1f556a
    https://vgatherps.github.io/2018-09-02-nontemporal/.  A simple test
1f556a
    shows performance loss of 400 to 500% due to a failure to use
1f556a
    nontemporal stores. These performance losses are most likely to occur
1f556a
    when the system load is heaviest and good performance is critical.
1f556a
1f556a
    The tunable x86_non_temporal_threshold can be used to override the
1f556a
    default for the knowledgeable user who really wants maximum cache
1f556a
    allocation to a single thread in a multi-threaded system.
1f556a
    The manual entry for the tunable has been expanded to provide
1f556a
    more information about its purpose.
1f556a
1f556a
            modified: sysdeps/x86/cacheinfo.c
1f556a
            modified: manual/tunables.texi
1f556a
1f556a
Conflicts:
1f556a
	manual/tunables.texi
1f556a
	  (Downstream uses the glibc.tune namespace, upstream uses
1f556a
	  glibc.cpu.)
1f556a
	sysdeps/x86/cacheinfo.c
1f556a
	  (Downstream does not have rep_movsb_threshold,
1f556a
	  x86_rep_stosb_threshold tunables.)
1f556a
1f556a
diff --git a/manual/tunables.texi b/manual/tunables.texi
1f556a
index 3dc6f9a44592c030..3e1e519dff153b09 100644
1f556a
--- a/manual/tunables.texi
1f556a
+++ b/manual/tunables.texi
1f556a
@@ -364,7 +364,11 @@ set shared cache size in bytes for use in memory and string routines.
1f556a
 
1f556a
 @deftp Tunable glibc.tune.x86_non_temporal_threshold
1f556a
 The @code{glibc.tune.x86_non_temporal_threshold} tunable allows the user
1f556a
-to set threshold in bytes for non temporal store.
1f556a
+to set threshold in bytes for non temporal store. Non temporal stores
1f556a
+give a hint to the hardware to move data directly to memory without
1f556a
+displacing other data from the cache. This tunable is used by some
1f556a
+platforms to determine when to use non temporal stores in operations
1f556a
+like memmove and memcpy.
1f556a
 
1f556a
 This tunable is specific to i386 and x86-64.
1f556a
 @end deftp
1f556a
diff --git a/sysdeps/x86/cacheinfo.c b/sysdeps/x86/cacheinfo.c
1f556a
index b9444ddd52051e05..42b468d0c4885bad 100644
1f556a
--- a/sysdeps/x86/cacheinfo.c
1f556a
+++ b/sysdeps/x86/cacheinfo.c
1f556a
@@ -778,14 +778,20 @@ intel_bug_no_cache_info:
1f556a
       __x86_shared_cache_size = shared;
1f556a
     }
1f556a
 
1f556a
-  /* The large memcpy micro benchmark in glibc shows that 6 times of
1f556a
-     shared cache size is the approximate value above which non-temporal
1f556a
-     store becomes faster on a 8-core processor.  This is the 3/4 of the
1f556a
-     total shared cache size.  */
1f556a
+  /* The default setting for the non_temporal threshold is 3/4 of one
1f556a
+     thread's share of the chip's cache. For most Intel and AMD processors
1f556a
+     with an initial release date between 2017 and 2020, a thread's typical
1f556a
+     share of the cache is from 500 KBytes to 2 MBytes. Using the 3/4
1f556a
+     threshold leaves 125 KBytes to 500 KBytes of the thread's data
1f556a
+     in cache after a maximum temporal copy, which will maintain
1f556a
+     in cache a reasonable portion of the thread's stack and other
1f556a
+     active data. If the threshold is set higher than one thread's
1f556a
+     share of the cache, it has a substantial risk of negatively
1f556a
+     impacting the performance of other threads running on the chip. */
1f556a
   __x86_shared_non_temporal_threshold
1f556a
     = (cpu_features->non_temporal_threshold != 0
1f556a
        ? cpu_features->non_temporal_threshold
1f556a
-       : __x86_shared_cache_size * threads * 3 / 4);
1f556a
+       : __x86_shared_cache_size * 3 / 4);
1f556a
 }
1f556a
 
1f556a
 #endif