Blame SOURCES/ia-upd-256bit-evex-instr-4.patch

190885
From 3d5101ddb7a4004459ca3f894caa47cfe9208be6 Mon Sep 17 00:00:00 2001
190885
From: "H.J. Lu" <hjl.tools@gmail.com>
190885
Date: Fri, 5 Mar 2021 07:15:03 -0800
190885
Subject: [PATCH] x86-64: Add memset family functions with 256-bit EVEX
190885
190885
Update ifunc-memset.h/ifunc-wmemset.h to select the functions optimized
190885
with 256-bit EVEX instructions when AVX512VL and AVX512BW are usable.
190885
They use YMM16-YMM31 registers, so VZEROUPPER isn't needed at function
190885
exit, which avoids RTM aborts.
190885
190885
(cherry picked from commit 1b968b6b9b3aac702ac2f133e0dd16cfdbb415ee)
190885
---
190885
 sysdeps/x86_64/multiarch/Makefile             |  1 +
190885
 sysdeps/x86_64/multiarch/ifunc-impl-list.c    | 22 +++++++++++++++++
190885
 sysdeps/x86_64/multiarch/ifunc-memset.h       | 24 +++++++++++++++----
190885
 sysdeps/x86_64/multiarch/ifunc-wmemset.h      | 13 ++++++----
190885
 .../multiarch/memset-evex-unaligned-erms.S    | 24 +++++++++++++++++++
190885
 .../multiarch/memset-vec-unaligned-erms.S     | 20 +++++++++++-----
190885
 6 files changed, 90 insertions(+), 14 deletions(-)
190885
 create mode 100644 sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
190885
190885
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
190885
index 4563fc56..1cc0a10e 100644
190885
--- a/sysdeps/x86_64/multiarch/Makefile
190885
+++ b/sysdeps/x86_64/multiarch/Makefile
190885
@@ -43,6 +43,7 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c \
190885
 		   memchr-evex \
190885
 		   memmove-evex-unaligned-erms \
190885
 		   memrchr-evex \
190885
+		   memset-evex-unaligned-erms \
190885
 		   rawmemchr-evex \
190885
 		   stpcpy-evex \
190885
 		   stpncpy-evex \
190885
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
190885
index 6bd3abfc..7cf83485 100644
190885
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
190885
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
190885
@@ -160,6 +160,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
190885
 	      IFUNC_IMPL_ADD (array, i, __memset_chk,
190885
 			      CPU_FEATURE_USABLE (AVX2),
190885
 			      __memset_chk_avx2_unaligned_erms)
190885
+	      IFUNC_IMPL_ADD (array, i, __memset_chk,
190885
+			      (CPU_FEATURE_USABLE (AVX512VL)
190885
+			       && CPU_FEATURE_USABLE (AVX512BW)),
190885
+			      __memset_chk_evex_unaligned)
190885
+	      IFUNC_IMPL_ADD (array, i, __memset_chk,
190885
+			      (CPU_FEATURE_USABLE (AVX512VL)
190885
+			       && CPU_FEATURE_USABLE (AVX512BW)),
190885
+			      __memset_chk_evex_unaligned_erms)
190885
 	      IFUNC_IMPL_ADD (array, i, __memset_chk,
190885
 			      CPU_FEATURE_USABLE (AVX512F),
190885
 			      __memset_chk_avx512_unaligned_erms)
190885
@@ -185,6 +193,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
190885
 	      IFUNC_IMPL_ADD (array, i, memset,
190885
 			      CPU_FEATURE_USABLE (AVX2),
190885
 			      __memset_avx2_unaligned_erms)
190885
+	      IFUNC_IMPL_ADD (array, i, memset,
190885
+			      (CPU_FEATURE_USABLE (AVX512VL)
190885
+			       && CPU_FEATURE_USABLE (AVX512BW)),
190885
+			      __memset_evex_unaligned)
190885
+	      IFUNC_IMPL_ADD (array, i, memset,
190885
+			      (CPU_FEATURE_USABLE (AVX512VL)
190885
+			       && CPU_FEATURE_USABLE (AVX512BW)),
190885
+			      __memset_evex_unaligned_erms)
190885
 	      IFUNC_IMPL_ADD (array, i, memset,
190885
 			      CPU_FEATURE_USABLE (AVX512F),
190885
 			      __memset_avx512_unaligned_erms)
190885
@@ -555,6 +571,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
190885
 	      IFUNC_IMPL_ADD (array, i, wmemset,
190885
 			      CPU_FEATURE_USABLE (AVX2),
190885
 			      __wmemset_avx2_unaligned)
190885
+	      IFUNC_IMPL_ADD (array, i, wmemset,
190885
+			      CPU_FEATURE_USABLE (AVX512VL),
190885
+			      __wmemset_evex_unaligned)
190885
 	      IFUNC_IMPL_ADD (array, i, wmemset,
190885
 			      CPU_FEATURE_USABLE (AVX512F),
190885
 			      __wmemset_avx512_unaligned))
190885
@@ -723,6 +742,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
190885
 	      IFUNC_IMPL_ADD (array, i, __wmemset_chk,
190885
 			      CPU_FEATURE_USABLE (AVX2),
190885
 			      __wmemset_chk_avx2_unaligned)
190885
+	      IFUNC_IMPL_ADD (array, i, __wmemset_chk,
190885
+			      CPU_FEATURE_USABLE (AVX512VL),
190885
+			      __wmemset_chk_evex_unaligned)
190885
 	      IFUNC_IMPL_ADD (array, i, __wmemset_chk,
190885
 			      CPU_FEATURE_USABLE (AVX512F),
190885
 			      __wmemset_chk_avx512_unaligned))
190885
diff --git a/sysdeps/x86_64/multiarch/ifunc-memset.h b/sysdeps/x86_64/multiarch/ifunc-memset.h
190885
index 708bd72e..6f31f4dc 100644
190885
--- a/sysdeps/x86_64/multiarch/ifunc-memset.h
190885
+++ b/sysdeps/x86_64/multiarch/ifunc-memset.h
190885
@@ -27,6 +27,10 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned_erms)
190885
 extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned) attribute_hidden;
190885
 extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned_erms)
190885
   attribute_hidden;
190885
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned)
190885
+  attribute_hidden;
190885
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned_erms)
190885
+  attribute_hidden;
190885
 extern __typeof (REDIRECT_NAME) OPTIMIZE (avx512_unaligned)
190885
   attribute_hidden;
190885
 extern __typeof (REDIRECT_NAME) OPTIMIZE (avx512_unaligned_erms)
190885
@@ -56,10 +60,22 @@ IFUNC_SELECTOR (void)
190885
 
190885
   if (CPU_FEATURE_USABLE_P (cpu_features, AVX2))
190885
     {
190885
-      if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
190885
-	return OPTIMIZE (avx2_unaligned_erms);
190885
-      else
190885
-	return OPTIMIZE (avx2_unaligned);
190885
+      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
190885
+	  && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
190885
+	{
190885
+	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
190885
+	    return OPTIMIZE (evex_unaligned_erms);
190885
+
190885
+	  return OPTIMIZE (evex_unaligned);
190885
+	}
190885
+
190885
+      if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
190885
+	{
190885
+	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
190885
+	    return OPTIMIZE (avx2_unaligned_erms);
190885
+
190885
+	  return OPTIMIZE (avx2_unaligned);
190885
+	}
190885
     }
190885
 
190885
   if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
190885
diff --git a/sysdeps/x86_64/multiarch/ifunc-wmemset.h b/sysdeps/x86_64/multiarch/ifunc-wmemset.h
190885
index eb242210..9290c4bf 100644
190885
--- a/sysdeps/x86_64/multiarch/ifunc-wmemset.h
190885
+++ b/sysdeps/x86_64/multiarch/ifunc-wmemset.h
190885
@@ -20,6 +20,7 @@
190885
 
190885
 extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned) attribute_hidden;
190885
 extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned) attribute_hidden;
190885
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned) attribute_hidden;
190885
 extern __typeof (REDIRECT_NAME) OPTIMIZE (avx512_unaligned) attribute_hidden;
190885
 
190885
 static inline void *
190885
@@ -27,14 +28,18 @@ IFUNC_SELECTOR (void)
190885
 {
190885
   const struct cpu_features* cpu_features = __get_cpu_features ();
190885
 
190885
-  if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)
190885
-      && CPU_FEATURE_USABLE_P (cpu_features, AVX2)
190885
+  if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
190885
       && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
190885
     {
190885
       if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F)
190885
-	  && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512))
190885
+	  && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512)
190885
+	  && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
190885
 	return OPTIMIZE (avx512_unaligned);
190885
-      else
190885
+
190885
+      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL))
190885
+	return OPTIMIZE (evex_unaligned);
190885
+
190885
+      if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
190885
 	return OPTIMIZE (avx2_unaligned);
190885
     }
190885
 
190885
diff --git a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
190885
new file mode 100644
190885
index 00000000..ae0a4d6e
190885
--- /dev/null
190885
+++ b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
190885
@@ -0,0 +1,24 @@
190885
+#if IS_IN (libc)
190885
+# define VEC_SIZE	32
190885
+# define XMM0		xmm16
190885
+# define YMM0		ymm16
190885
+# define VEC0		ymm16
190885
+# define VEC(i)		VEC##i
190885
+# define VMOVU		vmovdqu64
190885
+# define VMOVA		vmovdqa64
190885
+# define VZEROUPPER
190885
+
190885
+# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
190885
+  movq r, %rax; \
190885
+  vpbroadcastb d, %VEC0
190885
+
190885
+# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
190885
+  movq r, %rax; \
190885
+  vpbroadcastd d, %VEC0
190885
+
190885
+# define SECTION(p)		p##.evex
190885
+# define MEMSET_SYMBOL(p,s)	p##_evex_##s
190885
+# define WMEMSET_SYMBOL(p,s)	p##_evex_##s
190885
+
190885
+# include "memset-vec-unaligned-erms.S"
190885
+#endif
190885
diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
190885
index 9a0fd818..71e91a8f 100644
190885
--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
190885
+++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
190885
@@ -34,6 +34,14 @@
190885
 # define WMEMSET_CHK_SYMBOL(p,s)	WMEMSET_SYMBOL(p, s)
190885
 #endif
190885
 
190885
+#ifndef XMM0
190885
+# define XMM0				xmm0
190885
+#endif
190885
+
190885
+#ifndef YMM0
190885
+# define YMM0				ymm0
190885
+#endif
190885
+
190885
 #ifndef VZEROUPPER
190885
 # if VEC_SIZE > 16
190885
 #  define VZEROUPPER			vzeroupper
190885
@@ -67,7 +75,7 @@
190885
 ENTRY (__bzero)
190885
 	mov	%RDI_LP, %RAX_LP /* Set return value.  */
190885
 	mov	%RSI_LP, %RDX_LP /* Set n.  */
190885
-	pxor	%xmm0, %xmm0
190885
+	pxor	%XMM0, %XMM0
190885
 	jmp	L(entry_from_bzero)
190885
 END (__bzero)
190885
 weak_alias (__bzero, bzero)
190885
@@ -223,7 +231,7 @@ L(less_vec):
190885
 	cmpb	$16, %dl
190885
 	jae	L(between_16_31)
190885
 # endif
190885
-	MOVQ	%xmm0, %rcx
190885
+	MOVQ	%XMM0, %rcx
190885
 	cmpb	$8, %dl
190885
 	jae	L(between_8_15)
190885
 	cmpb	$4, %dl
190885
@@ -238,16 +246,16 @@ L(less_vec):
190885
 # if VEC_SIZE > 32
190885
 	/* From 32 to 63.  No branch when size == 32.  */
190885
 L(between_32_63):
190885
-	vmovdqu	%ymm0, -32(%rdi,%rdx)
190885
-	vmovdqu	%ymm0, (%rdi)
190885
+	VMOVU	%YMM0, -32(%rdi,%rdx)
190885
+	VMOVU	%YMM0, (%rdi)
190885
 	VZEROUPPER
190885
 	ret
190885
 # endif
190885
 # if VEC_SIZE > 16
190885
 	/* From 16 to 31.  No branch when size == 16.  */
190885
 L(between_16_31):
190885
-	vmovdqu	%xmm0, -16(%rdi,%rdx)
190885
-	vmovdqu	%xmm0, (%rdi)
190885
+	VMOVU	%XMM0, -16(%rdi,%rdx)
190885
+	VMOVU	%XMM0, (%rdi)
190885
 	VZEROUPPER
190885
 	ret
190885
 # endif
190885
-- 
190885
GitLab
190885