Blame SOURCES/ia-upperbound-enh-rep_movsb.patch

190885
From 571a3ddd938b742af8fc2b02f26b4b3296ea8a94 Mon Sep 17 00:00:00 2001
190885
From: "H.J. Lu" <hjl.tools@gmail.com>
190885
Date: Wed, 2 Mar 2022 16:12:40 -0800
190885
Subject: [PATCH] x86: Adding an upper bound for Enhanced REP MOVSB.
190885
190885
In the process of optimizing memcpy for AMD machines, we have found the
190885
vector move operations are outperforming enhanced REP MOVSB for data
190885
transfers above the L2 cache size on Zen3 architectures.
190885
To handle this use case, we are adding an upper bound parameter on
190885
enhanced REP MOVSB: '__x86_rep_movsb_stop_threshold'.
190885
As per large-bench results, we are configuring this parameter to the
190885
L2 cache size for AMD machines and applicable from Zen3 architecture
190885
supporting the ERMS feature.
190885
For architectures other than AMD, it is the computed value of
190885
the non-temporal threshold parameter.
190885
190885
Reviewed-by: Premachandra Mallappa <premachandra.mallappa@amd.com>
190885
190885
(cherry picked from commit 6e02b3e9327b7dbb063958d2b124b64fcb4bbe3f)
190885
---
190885
 sysdeps/x86/cacheinfo.h                            | 14 ++++++++++++++
190885
 .../x86_64/multiarch/memmove-vec-unaligned-erms.S  |  7 +++++--
190885
 2 files changed, 19 insertions(+), 2 deletions(-)
190885
190885
diff --git a/sysdeps/x86/cacheinfo.h b/sysdeps/x86/cacheinfo.h
190885
index 02556961..b982982f 100644
190885
--- a/sysdeps/x86/cacheinfo.h
190885
+++ b/sysdeps/x86/cacheinfo.h
190885
@@ -45,6 +45,9 @@ long int __x86_rep_movsb_threshold attribute_hidden = 2048;
190885
 /* Threshold to use Enhanced REP STOSB.  */
190885
 long int __x86_rep_stosb_threshold attribute_hidden = 2048;
190885
 
190885
+/* Threshold to stop using Enhanced REP MOVSB.  */
190885
+long int __x86_rep_movsb_stop_threshold attribute_hidden;
190885
+
190885
 static void
190885
 get_common_cache_info (long int *shared_ptr, unsigned int *threads_ptr,
190885
 		       long int core)
190885
@@ -352,6 +355,12 @@ init_cacheinfo (void)
190885
 	      shared += core;
190885
             }
190885
 	}
190885
+
190885
+      /* ERMS feature is implemented from AMD Zen3 architecture and it is
190885
+	 performing poorly for data above L2 cache size. Henceforth, adding
190885
+	 an upper bound threshold parameter to limit the usage of Enhanced
190885
+	 REP MOVSB operations and setting its value to L2 cache size.  */
190885
+      __x86_rep_movsb_stop_threshold = core;
190885
     }
190885
 
190885
   if (cpu_features->data_cache_size != 0)
190885
@@ -421,6 +430,11 @@ init_cacheinfo (void)
190885
   else
190885
     __x86_rep_movsb_threshold = rep_movsb_threshold;
190885
 
190885
+  /* Setting the upper bound of ERMS to the computed value of
190885
+     non-temporal threshold for architectures other than AMD.  */
190885
+  if (cpu_features->basic.kind != arch_kind_amd)
190885
+    __x86_rep_movsb_stop_threshold = __x86_shared_non_temporal_threshold;
190885
+
190885
 # if HAVE_TUNABLES
190885
   __x86_rep_stosb_threshold = cpu_features->rep_stosb_threshold;
190885
 # endif
190885
diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
190885
index 572cef04..620ce3a8 100644
190885
--- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
190885
+++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
190885
@@ -30,7 +30,10 @@
190885
       load and aligned store.  Load the last 4 * VEC and first VEC
190885
       before the loop and store them after the loop to support
190885
       overlapping addresses.
190885
-   6. If size >= __x86_shared_non_temporal_threshold and there is no
190885
+   6. On machines with ERMS feature, if size is greater than or equal to
190885
+      __x86_rep_movsb_threshold and less than
190885
+      __x86_rep_movsb_stop_threshold, then REP MOVSB will be used.
190885
+   7. If size >= __x86_shared_non_temporal_threshold and there is no
190885
       overlap between destination and source, use non-temporal store
190885
       instead of aligned store copying from either 2 or 4 pages at
190885
       once.
190885
@@ -311,7 +314,7 @@ L(return):
190885
 #endif
190885
 
190885
 L(movsb):
190885
-	cmp	__x86_shared_non_temporal_threshold(%rip), %RDX_LP
190885
+	cmp     __x86_rep_movsb_stop_threshold(%rip), %RDX_LP
190885
 	jae	L(more_8x_vec)
190885
 	cmpq	%rsi, %rdi
190885
 	jb	1f
190885
-- 
190885
GitLab
190885