From 571a3ddd938b742af8fc2b02f26b4b3296ea8a94 Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Wed, 2 Mar 2022 16:12:40 -0800
Subject: [PATCH] x86: Adding an upper bound for Enhanced REP MOVSB.

In the process of optimizing memcpy for AMD machines, we have found
that vector move operations outperform enhanced REP MOVSB for data
transfers above the L2 cache size on Zen3 architectures.

To handle this use case, we are adding an upper bound parameter on
enhanced REP MOVSB: '__x86_rep_movsb_stop_threshold'.

As per the large-bench results, we are setting this parameter to the
L2 cache size for AMD machines, applicable from the Zen3 architecture
onward, which supports the ERMS feature.  For architectures other than
AMD, it is set to the computed value of the non-temporal threshold
parameter.
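
As an illustration of the selection logic described above, here is a
minimal C sketch.  It is not part of the original commit; the function
and parameter names are hypothetical placeholders, and the real logic
lives in init_cacheinfo in sysdeps/x86/cacheinfo.h:

/* Illustrative sketch only: pick the size at which memmove should stop
   using Enhanced REP MOVSB, mirroring what this patch sets up.  */
long int
choose_rep_movsb_stop_threshold (int is_amd, long int l2_cache_size,
                                 long int non_temporal_threshold)
{
  if (is_amd)
    /* Zen3 and later implement ERMS, but vector moves win above the
       L2 cache size, so cap REP MOVSB at the L2 cache size.  */
    return l2_cache_size;

  /* Other vendors keep the computed non-temporal threshold.  */
  return non_temporal_threshold;
}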

Reviewed-by: Premachandra Mallappa <premachandra.mallappa@amd.com>

(cherry picked from commit 6e02b3e9327b7dbb063958d2b124b64fcb4bbe3f)
---
 sysdeps/x86/cacheinfo.h                            | 14 ++++++++++++++
 .../x86_64/multiarch/memmove-vec-unaligned-erms.S  |  7 +++++--
 2 files changed, 19 insertions(+), 2 deletions(-)
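
For a rough picture of the resulting behaviour on a machine with the
ERMS feature, the sketch below (not glibc code; the threshold values
are made-up examples) models only the size ranges from steps 6 and 7 of
the updated algorithm comment in memmove-vec-unaligned-erms.S, ignoring
the overlap and small-size handling done in the real assembly:

#include <stdio.h>

/* Which copy strategy the thresholds select for a given size.  */
static const char *
copy_strategy (unsigned long size, int no_overlap,
               unsigned long rep_movsb_threshold,
               unsigned long rep_movsb_stop_threshold,
               unsigned long non_temporal_threshold)
{
  if (size >= rep_movsb_threshold && size < rep_movsb_stop_threshold)
    return "REP MOVSB";                 /* step 6 */
  if (size >= non_temporal_threshold && no_overlap)
    return "non-temporal stores";       /* step 7 */
  return "vector loads and stores";     /* the remaining cases */
}

int
main (void)
{
  unsigned long movsb = 2048;            /* __x86_rep_movsb_threshold */
  unsigned long stop = 512 * 1024;       /* e.g. L2 size of a Zen3 core */
  unsigned long nt = 3UL * 1024 * 1024;  /* non-temporal threshold */

  printf ("64 KiB copy: %s\n", copy_strategy (64 * 1024, 1, movsb, stop, nt));
  printf ("8 MiB copy:  %s\n", copy_strategy (8UL << 20, 1, movsb, stop, nt));
  return 0;
}
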
diff --git a/sysdeps/x86/cacheinfo.h b/sysdeps/x86/cacheinfo.h
index 02556961..b982982f 100644
--- a/sysdeps/x86/cacheinfo.h
+++ b/sysdeps/x86/cacheinfo.h
@@ -45,6 +45,9 @@ long int __x86_rep_movsb_threshold attribute_hidden = 2048;
 /* Threshold to use Enhanced REP STOSB.  */
 long int __x86_rep_stosb_threshold attribute_hidden = 2048;
 
+/* Threshold to stop using Enhanced REP MOVSB.  */
+long int __x86_rep_movsb_stop_threshold attribute_hidden;
+
 static void
 get_common_cache_info (long int *shared_ptr, unsigned int *threads_ptr,
                        long int core)
@@ -352,6 +355,12 @@ init_cacheinfo (void)
               shared += core;
             }
         }
+
+      /* The ERMS feature is implemented from the AMD Zen3 architecture
+         and performs poorly for data above the L2 cache size.  Hence,
+         add an upper bound threshold to limit the use of Enhanced REP
+         MOVSB and set its value to the L2 cache size.  */
+      __x86_rep_movsb_stop_threshold = core;
     }
 
   if (cpu_features->data_cache_size != 0)
@@ -421,6 +430,11 @@ init_cacheinfo (void)
   else
     __x86_rep_movsb_threshold = rep_movsb_threshold;
 
+  /* Set the upper bound of ERMS to the computed value of the
+     non-temporal threshold for architectures other than AMD.  */
+  if (cpu_features->basic.kind != arch_kind_amd)
+    __x86_rep_movsb_stop_threshold = __x86_shared_non_temporal_threshold;
+
 # if HAVE_TUNABLES
   __x86_rep_stosb_threshold = cpu_features->rep_stosb_threshold;
 # endif
diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
index 572cef04..620ce3a8 100644
--- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
@@ -30,7 +30,10 @@
       load and aligned store.  Load the last 4 * VEC and first VEC
       before the loop and store them after the loop to support
       overlapping addresses.
-   6. If size >= __x86_shared_non_temporal_threshold and there is no
+   6. On machines with the ERMS feature, if the size is greater than
+      or equal to __x86_rep_movsb_threshold and less than
+      __x86_rep_movsb_stop_threshold, then REP MOVSB will be used.
+   7. If size >= __x86_shared_non_temporal_threshold and there is no
       overlap between destination and source, use non-temporal store
       instead of aligned store copying from either 2 or 4 pages at
       once.
@@ -311,7 +314,7 @@ L(return):
 #endif
 
 L(movsb):
-	cmp	__x86_shared_non_temporal_threshold(%rip), %RDX_LP
+	cmp	__x86_rep_movsb_stop_threshold(%rip), %RDX_LP
 	jae	L(more_8x_vec)
 	cmpq	%rsi, %rdi
 	jb	1f
--
GitLab