b18e99
From af992e7abdc9049714da76cae1e5e18bc4838fb8 Mon Sep 17 00:00:00 2001
b18e99
From: Noah Goldstein <goldstein.w.n@gmail.com>
b18e99
Date: Wed, 7 Jun 2023 13:18:01 -0500
b18e99
Subject: [PATCH] x86: Increase `non_temporal_threshold` to roughly `sizeof_L3
b18e99
 / 4`
b18e99
Content-type: text/plain; charset=UTF-8
b18e99
b18e99
Current `non_temporal_threshold` set to roughly '3/4 * sizeof_L3 /
b18e99
ncores_per_socket'. This patch updates that value to roughly
b18e99
'sizeof_L3 / 4'.
b18e99
b18e99
The original value (specifically dividing by `ncores_per_socket`) was
b18e99
done to limit the amount of other threads' data a `memcpy`/`memset`
b18e99
could evict.
b18e99
b18e99
Dividing by 'ncores_per_socket', however, leads to exceedingly low
b18e99
non-temporal thresholds and leads to using non-temporal stores in
b18e99
cases where REP MOVSB is multiple times faster.
b18e99
b18e99
Furthermore, non-temporal stores are written directly to main memory
b18e99
so using it at a size much smaller than L3 can place soon to be
b18e99
accessed data much further away than it otherwise could be. As well,
b18e99
modern machines are able to detect streaming patterns (especially if
b18e99
REP MOVSB is used) and provide LRU hints to the memory subsystem. This
b18e99
in effect caps the total amount of eviction at 1/cache_associativity,
b18e99
far below meaningfully thrashing the entire cache.
b18e99
b18e99
As best I can tell, the benchmarks that led to this small threshold
b18e99
were done comparing non-temporal stores versus standard cacheable
b18e99
stores. A better comparison (linked below) is to REP MOVSB which,
b18e99
on the measured systems, is nearly 2x faster than non-temporal stores
b18e99
at the low-end of the previous threshold, and within 10% for over
b18e99
100MB copies (well past even the current threshold). In cases with a
b18e99
low number of threads competing for bandwidth, REP MOVSB is ~2x faster
b18e99
up to `sizeof_L3`.
b18e99
b18e99
The divisor of `4` is a somewhat arbitrary value. From benchmarks it
b18e99
seems Skylake and Icelake both prefer a divisor of `2`, but older CPUs
b18e99
such as Broadwell prefer something closer to `8`. This patch is meant
b18e99
to be followed up by another one to make the divisor cpu-specific, but
b18e99
in the meantime (and for easier backporting), this patch settles on
b18e99
`4` as a middle-ground.
b18e99
b18e99
Benchmarks comparing non-temporal stores, REP MOVSB, and cacheable
b18e99
stores were done using:
b18e99
https://github.com/goldsteinn/memcpy-nt-benchmarks
b18e99
b18e99
Sheets results (also available in pdf on the github):
b18e99
https://docs.google.com/spreadsheets/d/e/2PACX-1vS183r0rW_jRX6tG_E90m9qVuFiMbRIJvi5VAE8yYOvEOIEEc3aSNuEsrFbuXw5c3nGboxMmrupZD7K/pubhtml
b18e99
Reviewed-by: DJ Delorie <dj@redhat.com>
b18e99
Reviewed-by: Carlos O'Donell <carlos@redhat.com>
b18e99
---
b18e99
 sysdeps/x86/dl-cacheinfo.h | 70 +++++++++++++++++++++++---------------
b18e99
 1 file changed, 43 insertions(+), 27 deletions(-)
b18e99
b18e99
[DJ - ported to C8S]
b18e99
b18e99
diff -rup a/sysdeps/x86/cacheinfo.h b/sysdeps/x86/cacheinfo.h
b18e99
--- a/sysdeps/x86/cacheinfo.h	2023-08-08 11:54:09.969791421 -0400
b18e99
+++ b/sysdeps/x86/cacheinfo.h	2023-08-08 13:44:55.185333601 -0400
b18e99
@@ -46,7 +46,7 @@ long int __x86_rep_movsb_threshold attri
b18e99
 long int __x86_rep_stosb_threshold attribute_hidden = 2048;
b18e99
 
b18e99
 static void
b18e99
-get_common_cache_info (long int *shared_ptr, unsigned int *threads_ptr,
b18e99
+get_common_cache_info (long int *shared_ptr, long int * shared_per_thread_ptr, unsigned int *threads_ptr,
b18e99
 		       long int core)
b18e99
 {
b18e99
   unsigned int eax;
b18e99
@@ -65,6 +65,7 @@ get_common_cache_info (long int *shared_
b18e99
   unsigned int family = cpu_features->basic.family;
b18e99
   unsigned int model = cpu_features->basic.model;
b18e99
   long int shared = *shared_ptr;
b18e99
+  long int shared_per_thread = *shared_per_thread_ptr;
b18e99
   unsigned int threads = *threads_ptr;
b18e99
   bool inclusive_cache = true;
b18e99
   bool support_count_mask = true;
b18e99
@@ -80,6 +81,7 @@ get_common_cache_info (long int *shared_
b18e99
       /* Try L2 otherwise.  */
b18e99
       level  = 2;
b18e99
       shared = core;
b18e99
+      shared_per_thread = core;
b18e99
       threads_l2 = 0;
b18e99
       threads_l3 = -1;
b18e99
     }
b18e99
@@ -236,29 +238,28 @@ get_common_cache_info (long int *shared_
b18e99
         }
b18e99
       else
b18e99
         {
b18e99
-intel_bug_no_cache_info:
b18e99
-          /* Assume that all logical threads share the highest cache
b18e99
-             level.  */
b18e99
-          threads
b18e99
-            = ((cpu_features->features[COMMON_CPUID_INDEX_1].cpuid.ebx
b18e99
-                >> 16) & 0xff);
b18e99
-        }
b18e99
-
b18e99
-        /* Cap usage of highest cache level to the number of supported
b18e99
-           threads.  */
b18e99
-        if (shared > 0 && threads > 0)
b18e99
-          shared /= threads;
b18e99
+	intel_bug_no_cache_info:
b18e99
+	  /* Assume that all logical threads share the highest cache
b18e99
+	     level.  */
b18e99
+	  threads = ((cpu_features->features[COMMON_CPUID_INDEX_1].cpuid.ebx >> 16)
b18e99
+		     & 0xff);
b18e99
+
b18e99
+	  /* Get per-thread size of highest level cache.  */
b18e99
+	  if (shared_per_thread > 0 && threads > 0)
b18e99
+	    shared_per_thread /= threads;
b18e99
+	}
b18e99
     }
b18e99
 
b18e99
   /* Account for non-inclusive L2 and L3 caches.  */
b18e99
   if (!inclusive_cache)
b18e99
     {
b18e99
       if (threads_l2 > 0)
b18e99
-        core /= threads_l2;
b18e99
+	shared_per_thread += core / threads_l2;
b18e99
       shared += core;
b18e99
     }
b18e99
 
b18e99
   *shared_ptr = shared;
b18e99
+  *shared_per_thread_ptr = shared_per_thread;
b18e99
   *threads_ptr = threads;
b18e99
 }
b18e99
 
b18e99
@@ -272,6 +273,7 @@ init_cacheinfo (void)
b18e99
   int max_cpuid_ex;
b18e99
   long int data = -1;
b18e99
   long int shared = -1;
b18e99
+  long int shared_per_thread = -1;
b18e99
   long int core;
b18e99
   unsigned int threads = 0;
b18e99
   const struct cpu_features *cpu_features = __get_cpu_features ();
b18e99
@@ -287,22 +289,25 @@ init_cacheinfo (void)
b18e99
       data = handle_intel (_SC_LEVEL1_DCACHE_SIZE, cpu_features);
b18e99
       core = handle_intel (_SC_LEVEL2_CACHE_SIZE, cpu_features);
b18e99
       shared = handle_intel (_SC_LEVEL3_CACHE_SIZE, cpu_features);
b18e99
+      shared_per_thread = shared;
b18e99
 
b18e99
-      get_common_cache_info (&shared, &threads, core);
b18e99
+      get_common_cache_info (&shared, &shared_per_thread, &threads, core);
b18e99
     }
b18e99
   else if (cpu_features->basic.kind == arch_kind_zhaoxin)
b18e99
     {
b18e99
       data = handle_zhaoxin (_SC_LEVEL1_DCACHE_SIZE);
b18e99
       core = handle_zhaoxin (_SC_LEVEL2_CACHE_SIZE);
b18e99
       shared = handle_zhaoxin (_SC_LEVEL3_CACHE_SIZE);
b18e99
+      shared_per_thread = shared;
b18e99
 
b18e99
-      get_common_cache_info (&shared, &threads, core);
b18e99
+      get_common_cache_info (&shared, &shared_per_thread, &threads, core);
b18e99
     }
b18e99
   else if (cpu_features->basic.kind == arch_kind_amd)
b18e99
     {
b18e99
       data   = handle_amd (_SC_LEVEL1_DCACHE_SIZE);
b18e99
       long int core = handle_amd (_SC_LEVEL2_CACHE_SIZE);
b18e99
       shared = handle_amd (_SC_LEVEL3_CACHE_SIZE);
b18e99
+      shared_per_thread = shared;
b18e99
 
b18e99
       /* Get maximum extended function. */
b18e99
       __cpuid (0x80000000, max_cpuid_ex, ebx, ecx, edx);
b18e99
@@ -352,6 +357,9 @@ init_cacheinfo (void)
b18e99
 	      shared += core;
b18e99
             }
b18e99
 	}
b18e99
+
b18e99
+      if (shared_per_thread <= 0)
b18e99
+	shared_per_thread = shared;
b18e99
     }
b18e99
 
b18e99
   if (cpu_features->data_cache_size != 0)
b18e99
@@ -380,20 +388,30 @@ init_cacheinfo (void)
b18e99
       __x86_shared_cache_size = shared;
b18e99
     }
b18e99
 
b18e99
-  /* The default setting for the non_temporal threshold is 3/4 of one
b18e99
-     thread's share of the chip's cache. For most Intel and AMD processors
b18e99
-     with an initial release date between 2017 and 2020, a thread's typical
b18e99
-     share of the cache is from 500 KBytes to 2 MBytes. Using the 3/4
b18e99
-     threshold leaves 125 KBytes to 500 KBytes of the thread's data
b18e99
-     in cache after a maximum temporal copy, which will maintain
b18e99
-     in cache a reasonable portion of the thread's stack and other
b18e99
-     active data. If the threshold is set higher than one thread's
b18e99
-     share of the cache, it has a substantial risk of negatively
b18e99
-     impacting the performance of other threads running on the chip. */
b18e99
+  /* The default setting for the non_temporal threshold is 1/4 of size
b18e99
+     of the chip's cache. For most Intel and AMD processors with an
b18e99
+     initial release date between 2017 and 2023, a thread's typical
b18e99
+     share of the cache is from 18-64MB. Using the 1/4 L3 is meant to
b18e99
+     estimate the point where non-temporal stores begin out-competing
b18e99
+     REP MOVSB. As well the point where the fact that non-temporal
b18e99
+     stores are forced back to main memory would already have occurred to the
b18e99
+     majority of the lines in the copy. Note, concerns about the
b18e99
+     entire L3 cache being evicted by the copy are mostly alleviated
b18e99
+     by the fact that modern HW detects streaming patterns and
b18e99
+     provides proper LRU hints so that the maximum thrashing is
b18e99
+     capped at 1/associativity. */
b18e99
+  unsigned long int non_temporal_threshold = shared / 4;
b18e99
+  /* If no ERMS, we use the per-thread L3 chunking. Normal cacheable stores run
b18e99
+     a higher risk of actually thrashing the cache as they don't have a HW LRU
b18e99
+     hint. As well, their performance in highly parallel situations is
b18e99
+     noticeably worse.  */
b18e99
+  if (!CPU_FEATURE_USABLE_P (cpu_features, ERMS))
b18e99
+    non_temporal_threshold = shared_per_thread * 3 / 4;
b18e99
+
b18e99
   __x86_shared_non_temporal_threshold
b18e99
     = (cpu_features->non_temporal_threshold != 0
b18e99
        ? cpu_features->non_temporal_threshold
b18e99
-       : __x86_shared_cache_size * 3 / 4);
b18e99
+       : non_temporal_threshold);
b18e99
 
b18e99
   /* NB: The REP MOVSB threshold must be greater than VEC_SIZE * 8.  */
b18e99
   unsigned int minimum_rep_movsb_threshold;
b18e99
Only in b/sysdeps/x86: cacheinfo.h~