7d916a
commit 8813b2682e4094e43b0cf1634e99619f1b8b2c62
7d916a
Author: Sajan Karumanchi <sajan.karumanchi@amd.com>
7d916a
Date:   Wed Oct 28 13:05:33 2020 +0530
7d916a
7d916a
    x86: Optimizing memcpy for AMD Zen architecture.
7d916a
7d916a
    Modify the shareable cache size '__x86_shared_cache_size', which is a
7d916a
    factor in computing the non-temporal threshold parameter
7d916a
    '__x86_shared_non_temporal_threshold', to optimize memcpy for AMD Zen
7d916a
    architectures.
7d916a
    In the existing implementation, the shareable cache is computed as 'L3
7d916a
    per thread, L2 per core'. Recomputing this shareable cache as 'L3 per
7d916a
    CCX (Core Complex)' has brought performance gains.
7d916a
    As per the large bench variant results, this patch also addresses the
7d916a
    regression problem on AMD Zen architectures.
7d916a
7d916a
    Backport of commit 59803e81f96b479c17f583b31eac44b57591a1bf upstream,
7d916a
    with the fix from cb3a749a22a55645dc6a52659eea765300623f98 ("x86:
7d916a
    Restore processing of cache size tunables in init_cacheinfo") applied.
7d916a
7d916a
    Reviewed-by: Premachandra Mallappa <premachandra.mallappa@amd.com>
7d916a
    Co-Authored-by: Florian Weimer <fweimer@redhat.com>
7d916a
7d916a
Backported from the upstream release/2.32/master branch, to minimize
7d916a
conflicts.  Adjusted for missing "basic" member in struct cpu_features.
7d916a
7d916a
diff --git a/sysdeps/x86/cacheinfo.c b/sysdeps/x86/cacheinfo.c
7d916a
index 42b468d0c4885bad..57c36d030a76c8b2 100644
7d916a
--- a/sysdeps/x86/cacheinfo.c
7d916a
+++ b/sysdeps/x86/cacheinfo.c
7d916a
@@ -722,7 +722,7 @@ intel_bug_no_cache_info:
7d916a
 	      threads = 1 << ((ecx >> 12) & 0x0f);
7d916a
 	    }
7d916a
 
7d916a
-	  if (threads == 0)
7d916a
+	  if (threads == 0 || cpu_features->family >= 0x17)
7d916a
 	    {
7d916a
 	      /* If APIC ID width is not available, use logical
7d916a
 		 processor count.  */
7d916a
@@ -737,8 +737,22 @@ intel_bug_no_cache_info:
7d916a
 	  if (threads > 0)
7d916a
 	    shared /= threads;
7d916a
 
7d916a
-	  /* Account for exclusive L2 and L3 caches.  */
7d916a
-	  shared += core;
7d916a
+	  /* Get shared cache per ccx for Zen architectures.  */
7d916a
+	  if (cpu_features->family >= 0x17)
7d916a
+	    {
7d916a
+	      unsigned int eax;
7d916a
+
7d916a
+	      /* Get number of threads sharing the L3 cache in CCX.  */
7d916a
+	      __cpuid_count (0x8000001D, 0x3, eax, ebx, ecx, edx);
7d916a
+
7d916a
+	      unsigned int threads_per_ccx = ((eax >> 14) & 0xfff) + 1;
7d916a
+	      shared *= threads_per_ccx;
7d916a
+	    }
7d916a
+	  else
7d916a
+	    {
7d916a
+	      /* Account for exclusive L2 and L3 caches.  */
7d916a
+	      shared += core;
7d916a
+            }
7d916a
 	}
7d916a
 
7d916a
 #ifndef DISABLE_PREFETCHW