|
|
7d916a |
commit 8813b2682e4094e43b0cf1634e99619f1b8b2c62
|
|
|
7d916a |
Author: Sajan Karumanchi <sajan.karumanchi@amd.com>
|
|
|
7d916a |
Date: Wed Oct 28 13:05:33 2020 +0530
|
|
|
7d916a |
|
|
|
7d916a |
x86: Optimizing memcpy for AMD Zen architecture.
|
|
|
7d916a |
|
|
|
7d916a |
Modifying the shareable cache '__x86_shared_cache_size', which is a
|
|
|
7d916a |
factor in computing the non-temporal threshold parameter
|
|
|
7d916a |
'__x86_shared_non_temporal_threshold' to optimize memcpy for AMD Zen
|
|
|
7d916a |
architectures.
|
|
|
7d916a |
In the existing implementation, the shareable cache is computed as 'L3
|
|
|
7d916a |
per thread, L2 per core'. Recomputing this shareable cache as 'L3 per
|
|
|
7d916a |
CCX(Core-Complex)' has brought in performance gains.
|
|
|
7d916a |
As per the large bench variant results, this patch also addresses the
|
|
|
7d916a |
regression problem on AMD Zen architectures.
|
|
|
7d916a |
|
|
|
7d916a |
Backport of commit 59803e81f96b479c17f583b31eac44b57591a1bf upstream,
|
|
|
7d916a |
with the fix from cb3a749a22a55645dc6a52659eea765300623f98 ("x86:
|
|
|
7d916a |
Restore processing of cache size tunables in init_cacheinfo") applied.
|
|
|
7d916a |
|
|
|
7d916a |
Reviewed-by: Premachandra Mallappa <premachandra.mallappa@amd.com>
|
|
|
7d916a |
Co-Authored-by: Florian Weimer <fweimer@redhat.com>
|
|
|
7d916a |
|
|
|
7d916a |
Backport is off the release/2.32/master branch upstream, to minimize
|
|
|
7d916a |
conflicts. Adjusted for missing "basic" member in struct cpu_features.
|
|
|
7d916a |
|
|
|
7d916a |
diff --git a/sysdeps/x86/cacheinfo.c b/sysdeps/x86/cacheinfo.c
|
|
|
7d916a |
index 42b468d0c4885bad..57c36d030a76c8b2 100644
|
|
|
7d916a |
--- a/sysdeps/x86/cacheinfo.c
|
|
|
7d916a |
+++ b/sysdeps/x86/cacheinfo.c
|
|
|
7d916a |
@@ -722,7 +722,7 @@ intel_bug_no_cache_info:
|
|
|
7d916a |
threads = 1 << ((ecx >> 12) & 0x0f);
|
|
|
7d916a |
}
|
|
|
7d916a |
|
|
|
7d916a |
- if (threads == 0)
|
|
|
7d916a |
+ if (threads == 0 || cpu_features->family >= 0x17)
|
|
|
7d916a |
{
|
|
|
7d916a |
/* If APIC ID width is not available, use logical
|
|
|
7d916a |
processor count. */
|
|
|
7d916a |
@@ -737,8 +737,22 @@ intel_bug_no_cache_info:
|
|
|
7d916a |
if (threads > 0)
|
|
|
7d916a |
shared /= threads;
|
|
|
7d916a |
|
|
|
7d916a |
- /* Account for exclusive L2 and L3 caches. */
|
|
|
7d916a |
- shared += core;
|
|
|
7d916a |
+ /* Get shared cache per ccx for Zen architectures. */
|
|
|
7d916a |
+ if (cpu_features->family >= 0x17)
|
|
|
7d916a |
+ {
|
|
|
7d916a |
+ unsigned int eax;
|
|
|
7d916a |
+
|
|
|
7d916a |
+ /* Get number of threads sharing the L3 cache in CCX. */
|
|
|
7d916a |
+ __cpuid_count (0x8000001D, 0x3, eax, ebx, ecx, edx);
|
|
|
7d916a |
+
|
|
|
7d916a |
+ unsigned int threads_per_ccx = ((eax >> 14) & 0xfff) + 1;
|
|
|
7d916a |
+ shared *= threads_per_ccx;
|
|
|
7d916a |
+ }
|
|
|
7d916a |
+ else
|
|
|
7d916a |
+ {
|
|
|
7d916a |
+ /* Account for exclusive L2 and L3 caches. */
|
|
|
7d916a |
+ shared += core;
|
|
|
7d916a |
+ }
|
|
|
7d916a |
}
|
|
|
7d916a |
|
|
|
7d916a |
#ifndef DISABLE_PREFETCHW
|