1f556a
commit 8813b2682e4094e43b0cf1634e99619f1b8b2c62
1f556a
Author: Sajan Karumanchi <sajan.karumanchi@amd.com>
1f556a
Date:   Wed Oct 28 13:05:33 2020 +0530
1f556a
1f556a
    x86: Optimizing memcpy for AMD Zen architecture.
1f556a
1f556a
    Modify the shareable cache size '__x86_shared_cache_size', which is a
1f556a
    factor in computing the non-temporal threshold parameter
1f556a
    '__x86_shared_non_temporal_threshold', to optimize memcpy for AMD Zen
1f556a
    architectures.
1f556a
    In the existing implementation, the shareable cache is computed as 'L3
1f556a
    per thread, L2 per core'. Recomputing this shareable cache as 'L3 per
1f556a
    CCX (Core Complex)' has brought performance gains.
1f556a
    As per the large bench variant results, this patch also addresses the
1f556a
    regression problem on AMD Zen architectures.
1f556a
1f556a
    Backport of commit 59803e81f96b479c17f583b31eac44b57591a1bf upstream,
1f556a
    with the fix from cb3a749a22a55645dc6a52659eea765300623f98 ("x86:
1f556a
    Restore processing of cache size tunables in init_cacheinfo") applied.
1f556a
1f556a
    Reviewed-by: Premachandra Mallappa <premachandra.mallappa@amd.com>
1f556a
    Co-Authored-by: Florian Weimer <fweimer@redhat.com>
1f556a
1f556a
This backport is taken from the upstream release/2.32/master branch, to minimize
1f556a
conflicts.  Adjusted for the missing "basic" member in struct cpu_features.
1f556a
1f556a
diff --git a/sysdeps/x86/cacheinfo.c b/sysdeps/x86/cacheinfo.c
1f556a
index 42b468d0c4885bad..57c36d030a76c8b2 100644
1f556a
--- a/sysdeps/x86/cacheinfo.c
1f556a
+++ b/sysdeps/x86/cacheinfo.c
1f556a
@@ -722,7 +722,7 @@ intel_bug_no_cache_info:
1f556a
 	      threads = 1 << ((ecx >> 12) & 0x0f);
1f556a
 	    }
1f556a
 
1f556a
-	  if (threads == 0)
1f556a
+	  if (threads == 0 || cpu_features->family >= 0x17)
1f556a
 	    {
1f556a
 	      /* If APIC ID width is not available, use logical
1f556a
 		 processor count.  */
1f556a
@@ -737,8 +737,22 @@ intel_bug_no_cache_info:
1f556a
 	  if (threads > 0)
1f556a
 	    shared /= threads;
1f556a
 
1f556a
-	  /* Account for exclusive L2 and L3 caches.  */
1f556a
-	  shared += core;
1f556a
+	  /* Get shared cache per ccx for Zen architectures.  */
1f556a
+	  if (cpu_features->family >= 0x17)
1f556a
+	    {
1f556a
+	      unsigned int eax;
1f556a
+
1f556a
+	      /* Get the number of threads sharing the L3 cache in the CCX.  */
1f556a
+	      __cpuid_count (0x8000001D, 0x3, eax, ebx, ecx, edx);
1f556a
+
1f556a
+	      unsigned int threads_per_ccx = ((eax >> 14) & 0xfff) + 1;
1f556a
+	      shared *= threads_per_ccx;
1f556a
+	    }
1f556a
+	  else
1f556a
+	    {
1f556a
+	      /* Account for exclusive L2 and L3 caches.  */
1f556a
+	      shared += core;
1f556a
+            }
1f556a
 	}
1f556a
 
1f556a
 #ifndef DISABLE_PREFETCHW