190885
From c2440c1e45d53140531105de024f7b9ceb53c51e Mon Sep 17 00:00:00 2001
190885
From: "H.J. Lu" <hjl.tools@gmail.com>
190885
Date: Fri, 5 Mar 2021 06:46:08 -0800
190885
Subject: [PATCH] x86-64: Add memmove family functions with 256-bit EVEX
190885
190885
Update ifunc-memmove.h to select, when AVX512VL is usable, the functions
190885
optimized with 256-bit EVEX instructions using YMM16-YMM31 registers.  This
190885
avoids RTM aborts, since VZEROUPPER isn't needed at function exit.
190885
190885
(cherry picked from commit 63ad43566f7a25d140dc723598aeb441ad657eed)
190885
---
190885
 sysdeps/x86_64/multiarch/Makefile             |  1 +
190885
 sysdeps/x86_64/multiarch/ifunc-impl-list.c    | 36 +++++++++++++++++++
190885
 sysdeps/x86_64/multiarch/ifunc-memmove.h      | 21 +++++++++--
190885
 .../multiarch/memmove-evex-unaligned-erms.S   | 33 +++++++++++++++++
190885
 .../multiarch/memmove-vec-unaligned-erms.S    | 24 ++++++++-----
190885
 5 files changed, 104 insertions(+), 11 deletions(-)
190885
 create mode 100644 sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S
190885
190885
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
190885
index 46783cd1..4563fc56 100644
190885
--- a/sysdeps/x86_64/multiarch/Makefile
190885
+++ b/sysdeps/x86_64/multiarch/Makefile
190885
@@ -41,6 +41,7 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c \
190885
 		   memset-avx2-unaligned-erms \
190885
 		   memset-avx512-unaligned-erms \
190885
 		   memchr-evex \
190885
+		   memmove-evex-unaligned-erms \
190885
 		   memrchr-evex \
190885
 		   rawmemchr-evex \
190885
 		   stpcpy-evex \
190885
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
190885
index 082e4da3..6bd3abfc 100644
190885
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
190885
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
190885
@@ -80,6 +80,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
190885
 	      IFUNC_IMPL_ADD (array, i, __memmove_chk,
190885
 			      CPU_FEATURE_USABLE (AVX),
190885
 			      __memmove_chk_avx_unaligned_erms)
190885
+	      IFUNC_IMPL_ADD (array, i, __memmove_chk,
190885
+			      CPU_FEATURE_USABLE (AVX512VL),
190885
+			      __memmove_chk_evex_unaligned)
190885
+	      IFUNC_IMPL_ADD (array, i, __memmove_chk,
190885
+			      CPU_FEATURE_USABLE (AVX512VL),
190885
+			      __memmove_chk_evex_unaligned_erms)
190885
 	      IFUNC_IMPL_ADD (array, i, __memmove_chk,
190885
 			      CPU_FEATURE_USABLE (SSSE3),
190885
 			      __memmove_chk_ssse3_back)
190885
@@ -102,6 +108,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
190885
 	      IFUNC_IMPL_ADD (array, i, memmove,
190885
 			      CPU_FEATURE_USABLE (AVX),
190885
 			      __memmove_avx_unaligned_erms)
190885
+	      IFUNC_IMPL_ADD (array, i, memmove,
190885
+			      CPU_FEATURE_USABLE (AVX512VL),
190885
+			      __memmove_evex_unaligned)
190885
+	      IFUNC_IMPL_ADD (array, i, memmove,
190885
+			      CPU_FEATURE_USABLE (AVX512VL),
190885
+			      __memmove_evex_unaligned_erms)
190885
 	      IFUNC_IMPL_ADD (array, i, memmove,
190885
 			      CPU_FEATURE_USABLE (AVX512F),
190885
 			      __memmove_avx512_no_vzeroupper)
190885
@@ -565,6 +577,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
190885
 	      IFUNC_IMPL_ADD (array, i, __memcpy_chk,
190885
 			      CPU_FEATURE_USABLE (AVX),
190885
 			      __memcpy_chk_avx_unaligned_erms)
190885
+	      IFUNC_IMPL_ADD (array, i, __memcpy_chk,
190885
+			      CPU_FEATURE_USABLE (AVX512VL),
190885
+			      __memcpy_chk_evex_unaligned)
190885
+	      IFUNC_IMPL_ADD (array, i, __memcpy_chk,
190885
+			      CPU_FEATURE_USABLE (AVX512VL),
190885
+			      __memcpy_chk_evex_unaligned_erms)
190885
 	      IFUNC_IMPL_ADD (array, i, __memcpy_chk,
190885
 			      CPU_FEATURE_USABLE (SSSE3),
190885
 			      __memcpy_chk_ssse3_back)
190885
@@ -587,6 +605,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
190885
 	      IFUNC_IMPL_ADD (array, i, memcpy,
190885
 			      CPU_FEATURE_USABLE (AVX),
190885
 			      __memcpy_avx_unaligned_erms)
190885
+	      IFUNC_IMPL_ADD (array, i, memcpy,
190885
+			      CPU_FEATURE_USABLE (AVX512VL),
190885
+			      __memcpy_evex_unaligned)
190885
+	      IFUNC_IMPL_ADD (array, i, memcpy,
190885
+			      CPU_FEATURE_USABLE (AVX512VL),
190885
+			      __memcpy_evex_unaligned_erms)
190885
 	      IFUNC_IMPL_ADD (array, i, memcpy, CPU_FEATURE_USABLE (SSSE3),
190885
 			      __memcpy_ssse3_back)
190885
 	      IFUNC_IMPL_ADD (array, i, memcpy, CPU_FEATURE_USABLE (SSSE3),
190885
@@ -623,6 +647,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
190885
 	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
190885
 			      CPU_FEATURE_USABLE (AVX),
190885
 			      __mempcpy_chk_avx_unaligned_erms)
190885
+	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
190885
+			      CPU_FEATURE_USABLE (AVX512VL),
190885
+			      __mempcpy_chk_evex_unaligned)
190885
+	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
190885
+			      CPU_FEATURE_USABLE (AVX512VL),
190885
+			      __mempcpy_chk_evex_unaligned_erms)
190885
 	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
190885
 			      CPU_FEATURE_USABLE (SSSE3),
190885
 			      __mempcpy_chk_ssse3_back)
190885
@@ -654,6 +684,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
190885
 	      IFUNC_IMPL_ADD (array, i, mempcpy,
190885
 			      CPU_FEATURE_USABLE (AVX),
190885
 			      __mempcpy_avx_unaligned_erms)
190885
+	      IFUNC_IMPL_ADD (array, i, mempcpy,
190885
+			      CPU_FEATURE_USABLE (AVX512VL),
190885
+			      __mempcpy_evex_unaligned)
190885
+	      IFUNC_IMPL_ADD (array, i, mempcpy,
190885
+			      CPU_FEATURE_USABLE (AVX512VL),
190885
+			      __mempcpy_evex_unaligned_erms)
190885
 	      IFUNC_IMPL_ADD (array, i, mempcpy, CPU_FEATURE_USABLE (SSSE3),
190885
 			      __mempcpy_ssse3_back)
190885
 	      IFUNC_IMPL_ADD (array, i, mempcpy, CPU_FEATURE_USABLE (SSSE3),
190885
diff --git a/sysdeps/x86_64/multiarch/ifunc-memmove.h b/sysdeps/x86_64/multiarch/ifunc-memmove.h
190885
index 5e5f0299..6f8bce5f 100644
190885
--- a/sysdeps/x86_64/multiarch/ifunc-memmove.h
190885
+++ b/sysdeps/x86_64/multiarch/ifunc-memmove.h
190885
@@ -29,6 +29,10 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3_back) attribute_hidden;
190885
 extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned) attribute_hidden;
190885
 extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned_erms)
190885
   attribute_hidden;
190885
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned)
190885
+  attribute_hidden;
190885
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned_erms)
190885
+  attribute_hidden;
190885
 extern __typeof (REDIRECT_NAME) OPTIMIZE (avx512_unaligned)
190885
   attribute_hidden;
190885
 extern __typeof (REDIRECT_NAME) OPTIMIZE (avx512_unaligned_erms)
190885
@@ -59,10 +63,21 @@ IFUNC_SELECTOR (void)
190885
 
190885
   if (CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
190885
     {
190885
-      if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
190885
-	return OPTIMIZE (avx_unaligned_erms);
190885
+      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL))
190885
+	{
190885
+	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
190885
+	    return OPTIMIZE (evex_unaligned_erms);
190885
+
190885
+	  return OPTIMIZE (evex_unaligned);
190885
+	}
190885
+
190885
+      if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
190885
+	{
190885
+	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
190885
+	    return OPTIMIZE (avx_unaligned_erms);
190885
 
190885
-      return OPTIMIZE (avx_unaligned);
190885
+	  return OPTIMIZE (avx_unaligned);
190885
+	}
190885
     }
190885
 
190885
   if (!CPU_FEATURE_USABLE_P (cpu_features, SSSE3)
190885
diff --git a/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S
190885
new file mode 100644
190885
index 00000000..0cbce8f9
190885
--- /dev/null
190885
+++ b/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S
190885
@@ -0,0 +1,33 @@
190885
+#if IS_IN (libc)
190885
+# define VEC_SIZE	32
190885
+# define XMM0		xmm16
190885
+# define XMM1		xmm17
190885
+# define YMM0		ymm16
190885
+# define YMM1		ymm17
190885
+# define VEC0		ymm16
190885
+# define VEC1		ymm17
190885
+# define VEC2		ymm18
190885
+# define VEC3		ymm19
190885
+# define VEC4		ymm20
190885
+# define VEC5		ymm21
190885
+# define VEC6		ymm22
190885
+# define VEC7		ymm23
190885
+# define VEC8		ymm24
190885
+# define VEC9		ymm25
190885
+# define VEC10		ymm26
190885
+# define VEC11		ymm27
190885
+# define VEC12		ymm28
190885
+# define VEC13		ymm29
190885
+# define VEC14		ymm30
190885
+# define VEC15		ymm31
190885
+# define VEC(i)		VEC##i
190885
+# define VMOVNT		vmovntdq
190885
+# define VMOVU		vmovdqu64
190885
+# define VMOVA		vmovdqa64
190885
+# define VZEROUPPER
190885
+
190885
+# define SECTION(p)		p##.evex
190885
+# define MEMMOVE_SYMBOL(p,s)	p##_evex_##s
190885
+
190885
+# include "memmove-vec-unaligned-erms.S"
190885
+#endif
190885
diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
190885
index 274aa1c7..08e21692 100644
190885
--- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
190885
+++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
190885
@@ -48,6 +48,14 @@
190885
 # define MEMMOVE_CHK_SYMBOL(p,s)	MEMMOVE_SYMBOL(p, s)
190885
 #endif
190885
 
190885
+#ifndef XMM0
190885
+# define XMM0				xmm0
190885
+#endif
190885
+
190885
+#ifndef YMM0
190885
+# define YMM0				ymm0
190885
+#endif
190885
+
190885
 #ifndef VZEROUPPER
190885
 # if VEC_SIZE > 16
190885
 #  define VZEROUPPER vzeroupper
190885
@@ -277,20 +285,20 @@ L(less_vec):
190885
 #if VEC_SIZE > 32
190885
 L(between_32_63):
190885
 	/* From 32 to 63.  No branch when size == 32.  */
190885
-	vmovdqu	(%rsi), %ymm0
190885
-	vmovdqu	-32(%rsi,%rdx), %ymm1
190885
-	vmovdqu	%ymm0, (%rdi)
190885
-	vmovdqu	%ymm1, -32(%rdi,%rdx)
190885
+	VMOVU	(%rsi), %YMM0
190885
+	VMOVU	-32(%rsi,%rdx), %YMM1
190885
+	VMOVU	%YMM0, (%rdi)
190885
+	VMOVU	%YMM1, -32(%rdi,%rdx)
190885
 	VZEROUPPER
190885
 	ret
190885
 #endif
190885
 #if VEC_SIZE > 16
190885
 	/* From 16 to 31.  No branch when size == 16.  */
190885
 L(between_16_31):
190885
-	vmovdqu	(%rsi), %xmm0
190885
-	vmovdqu	-16(%rsi,%rdx), %xmm1
190885
-	vmovdqu	%xmm0, (%rdi)
190885
-	vmovdqu	%xmm1, -16(%rdi,%rdx)
190885
+	VMOVU	(%rsi), %XMM0
190885
+	VMOVU	-16(%rsi,%rdx), %XMM1
190885
+	VMOVU	%XMM0, (%rdi)
190885
+	VMOVU	%XMM1, -16(%rdi,%rdx)
190885
 	ret
190885
 #endif
190885
 L(between_8_15):
190885
-- 
190885
GitLab
190885