SOURCES/ia-upd-256bit-evex-instr-2.patch

From 98192464b47c056515b6ac5ff218c197bd75618d Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Fri, 5 Mar 2021 06:36:50 -0800
Subject: [PATCH] x86-64: Add strcpy family functions with 256-bit EVEX

Update ifunc-strcpy.h to select the function optimized with 256-bit EVEX
instructions using YMM16-YMM31 registers to avoid RTM abort with usable
AVX512VL and AVX512BW since VZEROUPPER isn't needed at function exit.

(cherry picked from commit 525bc2a32c9710df40371f951217c6ae7a923aee)
---
 sysdeps/x86_64/multiarch/Makefile          |    6 +
 sysdeps/x86_64/multiarch/ifunc-impl-list.c |   24 +
 sysdeps/x86_64/multiarch/ifunc-strcpy.h    |   13 +-
 sysdeps/x86_64/multiarch/stpcpy-evex.S     |    3 +
 sysdeps/x86_64/multiarch/stpncpy-evex.S    |    4 +
 sysdeps/x86_64/multiarch/strcat-evex.S     |  283 ++++++
 sysdeps/x86_64/multiarch/strcpy-evex.S     | 1003 ++++++++++++++++++++
 sysdeps/x86_64/multiarch/strncat-evex.S    |    3 +
 sysdeps/x86_64/multiarch/strncpy-evex.S    |    3 +
 9 files changed, 1339 insertions(+), 3 deletions(-)
 create mode 100644 sysdeps/x86_64/multiarch/stpcpy-evex.S
 create mode 100644 sysdeps/x86_64/multiarch/stpncpy-evex.S
 create mode 100644 sysdeps/x86_64/multiarch/strcat-evex.S
 create mode 100644 sysdeps/x86_64/multiarch/strcpy-evex.S
 create mode 100644 sysdeps/x86_64/multiarch/strncat-evex.S
 create mode 100644 sysdeps/x86_64/multiarch/strncpy-evex.S
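
Note on the ifunc-strcpy.h change below: the new selection order can be
summarized by the rough C sketch that follows.  The has_* () and
prefer_no_vzeroupper () helpers and the strcpy_* names are illustrative
stand-ins for the glibc CPU_FEATURE_USABLE_P / CPU_FEATURES_ARCH_P checks
and the OPTIMIZE (...) alternatives; this sketch is not part of the patch.

  static inline void *
  select_strcpy (void)
  {
    if (has_avx2 () && has_avx_fast_unaligned_load ())
      {
        /* The EVEX version uses only ymm16-ymm31, so no VZEROUPPER is
           needed at function exit and RTM transactions are not aborted.  */
        if (has_avx512vl () && has_avx512bw ())
          return strcpy_evex;

        /* The AVX2 version still ends with VZEROUPPER; choose it only
           when Prefer_No_VZEROUPPER is not set.  */
        if (!prefer_no_vzeroupper ())
          return strcpy_avx2;
      }

    if (has_fast_unaligned_load ())
      return strcpy_sse2_unaligned;

    /* Later fallbacks (outside this hunk) are unchanged.  */
    return strcpy_sse2;
  }

The practical effect is that machines with usable AVX512VL and AVX512BW now
get the EVEX implementation even when Prefer_No_VZEROUPPER is set, while
AVX2-only machines keep the AVX2 implementation unless VZEROUPPER must be
avoided.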
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 5ce85882..46783cd1 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -43,11 +43,17 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c \
 		   memchr-evex \
 		   memrchr-evex \
 		   rawmemchr-evex \
+		   stpcpy-evex \
+		   stpncpy-evex \
+		   strcat-evex \
 		   strchr-evex \
 		   strchrnul-evex \
 		   strcmp-evex \
+		   strcpy-evex \
 		   strlen-evex \
+		   strncat-evex \
 		   strncmp-evex \
+		   strncpy-evex \
 		   strnlen-evex \
 		   strrchr-evex
 CFLAGS-varshift.c += -msse4
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index bd7d9f19..082e4da3 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -224,6 +224,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      __stpncpy_ssse3)
 	      IFUNC_IMPL_ADD (array, i, stpncpy, CPU_FEATURE_USABLE (AVX2),
 			      __stpncpy_avx2)
+	      IFUNC_IMPL_ADD (array, i, stpncpy,
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)),
+			      __stpncpy_evex)
 	      IFUNC_IMPL_ADD (array, i, stpncpy, 1,
 			      __stpncpy_sse2_unaligned)
 	      IFUNC_IMPL_ADD (array, i, stpncpy, 1, __stpncpy_sse2))
@@ -234,6 +238,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      __stpcpy_ssse3)
 	      IFUNC_IMPL_ADD (array, i, stpcpy, CPU_FEATURE_USABLE (AVX2),
 			      __stpcpy_avx2)
+	      IFUNC_IMPL_ADD (array, i, stpcpy,
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)),
+			      __stpcpy_evex)
 	      IFUNC_IMPL_ADD (array, i, stpcpy, 1, __stpcpy_sse2_unaligned)
 	      IFUNC_IMPL_ADD (array, i, stpcpy, 1, __stpcpy_sse2))
 
@@ -268,6 +276,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
   IFUNC_IMPL (i, name, strcat,
 	      IFUNC_IMPL_ADD (array, i, strcat, CPU_FEATURE_USABLE (AVX2),
 			      __strcat_avx2)
+	      IFUNC_IMPL_ADD (array, i, strcat,
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)),
+			      __strcat_evex)
 	      IFUNC_IMPL_ADD (array, i, strcat, CPU_FEATURE_USABLE (SSSE3),
 			      __strcat_ssse3)
 	      IFUNC_IMPL_ADD (array, i, strcat, 1, __strcat_sse2_unaligned)
@@ -330,6 +342,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
   IFUNC_IMPL (i, name, strcpy,
 	      IFUNC_IMPL_ADD (array, i, strcpy, CPU_FEATURE_USABLE (AVX2),
 			      __strcpy_avx2)
+	      IFUNC_IMPL_ADD (array, i, strcpy,
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)),
+			      __strcpy_evex)
 	      IFUNC_IMPL_ADD (array, i, strcpy, CPU_FEATURE_USABLE (SSSE3),
 			      __strcpy_ssse3)
 	      IFUNC_IMPL_ADD (array, i, strcpy, 1, __strcpy_sse2_unaligned)
@@ -373,6 +389,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
   IFUNC_IMPL (i, name, strncat,
 	      IFUNC_IMPL_ADD (array, i, strncat, CPU_FEATURE_USABLE (AVX2),
 			      __strncat_avx2)
+	      IFUNC_IMPL_ADD (array, i, strncat,
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)),
+			      __strncat_evex)
 	      IFUNC_IMPL_ADD (array, i, strncat, CPU_FEATURE_USABLE (SSSE3),
 			      __strncat_ssse3)
 	      IFUNC_IMPL_ADD (array, i, strncat, 1,
@@ -383,6 +403,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
   IFUNC_IMPL (i, name, strncpy,
 	      IFUNC_IMPL_ADD (array, i, strncpy, CPU_FEATURE_USABLE (AVX2),
 			      __strncpy_avx2)
+	      IFUNC_IMPL_ADD (array, i, strncpy,
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)),
+			      __strncpy_evex)
 	      IFUNC_IMPL_ADD (array, i, strncpy, CPU_FEATURE_USABLE (SSSE3),
 			      __strncpy_ssse3)
 	      IFUNC_IMPL_ADD (array, i, strncpy, 1,
diff --git a/sysdeps/x86_64/multiarch/ifunc-strcpy.h b/sysdeps/x86_64/multiarch/ifunc-strcpy.h
index 100dca5c..deae6348 100644
--- a/sysdeps/x86_64/multiarch/ifunc-strcpy.h
+++ b/sysdeps/x86_64/multiarch/ifunc-strcpy.h
@@ -25,16 +25,23 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned)
   attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
 
 static inline void *
 IFUNC_SELECTOR (void)
 {
   const struct cpu_features* cpu_features = __get_cpu_features ();
 
-  if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)
-      && CPU_FEATURE_USABLE_P (cpu_features, AVX2)
+  if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
       && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
-    return OPTIMIZE (avx2);
+    {
+      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
+	  && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
+	return OPTIMIZE (evex);
+
+      if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
+	return OPTIMIZE (avx2);
+    }
 
   if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Load))
     return OPTIMIZE (sse2_unaligned);
diff --git a/sysdeps/x86_64/multiarch/stpcpy-evex.S b/sysdeps/x86_64/multiarch/stpcpy-evex.S
new file mode 100644
index 00000000..7c6f26cd
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/stpcpy-evex.S
@@ -0,0 +1,3 @@
+#define USE_AS_STPCPY
+#define STRCPY __stpcpy_evex
+#include "strcpy-evex.S"
diff --git a/sysdeps/x86_64/multiarch/stpncpy-evex.S b/sysdeps/x86_64/multiarch/stpncpy-evex.S
new file mode 100644
index 00000000..1570014d
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/stpncpy-evex.S
@@ -0,0 +1,4 @@
+#define USE_AS_STPCPY
+#define USE_AS_STRNCPY
+#define STRCPY __stpncpy_evex
+#include "strcpy-evex.S"
diff --git a/sysdeps/x86_64/multiarch/strcat-evex.S b/sysdeps/x86_64/multiarch/strcat-evex.S
new file mode 100644
index 00000000..97c3d85b
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strcat-evex.S
@@ -0,0 +1,283 @@
+/* strcat with 256-bit EVEX instructions.
190885
+   Copyright (C) 2021 Free Software Foundation, Inc.
190885
+   This file is part of the GNU C Library.
190885
+
190885
+   The GNU C Library is free software; you can redistribute it and/or
190885
+   modify it under the terms of the GNU Lesser General Public
190885
+   License as published by the Free Software Foundation; either
190885
+   version 2.1 of the License, or (at your option) any later version.
190885
+
190885
+   The GNU C Library is distributed in the hope that it will be useful,
190885
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
190885
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
190885
+   Lesser General Public License for more details.
190885
+
190885
+   You should have received a copy of the GNU Lesser General Public
190885
+   License along with the GNU C Library; if not, see
190885
+   <https://www.gnu.org/licenses/>.  */
190885
+
190885
+#if IS_IN (libc)
190885
+
190885
+# include <sysdep.h>
190885
+
190885
+# ifndef STRCAT
190885
+#  define STRCAT  __strcat_evex
190885
+# endif
190885
+
190885
+# define VMOVU		vmovdqu64
190885
+# define VMOVA		vmovdqa64
190885
+
190885
+/* zero register */
190885
+# define XMMZERO	xmm16
190885
+# define YMMZERO	ymm16
190885
+# define YMM0		ymm17
190885
+# define YMM1		ymm18
190885
+
190885
+# define USE_AS_STRCAT
190885
+
190885
+/* Number of bytes in a vector register */
190885
+# define VEC_SIZE	32
190885
+
190885
+	.section .text.evex,"ax",@progbits
190885
+ENTRY (STRCAT)
190885
+	mov	%rdi, %r9
190885
+# ifdef USE_AS_STRNCAT
190885
+	mov	%rdx, %r8
190885
+# endif
190885
+
190885
+	xor	%eax, %eax
190885
+	mov	%edi, %ecx
190885
+	and	$((VEC_SIZE * 4) - 1), %ecx
190885
+	vpxorq	%XMMZERO, %XMMZERO, %XMMZERO
190885
+	cmp	$(VEC_SIZE * 3), %ecx
190885
+	ja	L(fourth_vector_boundary)
190885
+	vpcmpb	$0, (%rdi), %YMMZERO, %k0
190885
+	kmovd	%k0, %edx
190885
+	test	%edx, %edx
190885
+	jnz	L(exit_null_on_first_vector)
190885
+	mov	%rdi, %rax
190885
+	and	$-VEC_SIZE, %rax
190885
+	jmp	L(align_vec_size_start)
190885
+L(fourth_vector_boundary):
190885
+	mov	%rdi, %rax
190885
+	and	$-VEC_SIZE, %rax
190885
+	vpcmpb	$0, (%rax), %YMMZERO, %k0
190885
+	mov	$-1, %r10d
190885
+	sub	%rax, %rcx
190885
+	shl	%cl, %r10d
190885
+	kmovd	%k0, %edx
190885
+	and	%r10d, %edx
190885
+	jnz	L(exit)
190885
+
190885
+L(align_vec_size_start):
190885
+	vpcmpb	$0, VEC_SIZE(%rax), %YMMZERO, %k0
190885
+	kmovd	%k0, %edx
190885
+	test	%edx, %edx
190885
+	jnz	L(exit_null_on_second_vector)
190885
+
190885
+	vpcmpb	$0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1
190885
+	kmovd	%k1, %edx
190885
+	test	%edx, %edx
190885
+	jnz	L(exit_null_on_third_vector)
190885
+
190885
+	vpcmpb	$0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2
190885
+	kmovd	%k2, %edx
190885
+	test	%edx, %edx
190885
+	jnz	L(exit_null_on_fourth_vector)
190885
+
190885
+	vpcmpb	$0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3
190885
+	kmovd	%k3, %edx
190885
+	test	%edx, %edx
190885
+	jnz	L(exit_null_on_fifth_vector)
190885
+
190885
+	vpcmpb	$0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4
190885
+	add	$(VEC_SIZE * 4), %rax
190885
+	kmovd	%k4, %edx
190885
+	test	%edx, %edx
190885
+	jnz	L(exit_null_on_second_vector)
190885
+
190885
+	vpcmpb	$0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1
190885
+	kmovd	%k1, %edx
190885
+	test	%edx, %edx
190885
+	jnz	L(exit_null_on_third_vector)
190885
+
190885
+	vpcmpb	$0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2
190885
+	kmovd	%k2, %edx
190885
+	test	%edx, %edx
190885
+	jnz	L(exit_null_on_fourth_vector)
190885
+
190885
+	vpcmpb	$0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3
190885
+	kmovd	%k3, %edx
190885
+	test	%edx, %edx
190885
+	jnz	L(exit_null_on_fifth_vector)
190885
+
190885
+	vpcmpb	$0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4
190885
+	kmovd	%k4, %edx
190885
+	add	$(VEC_SIZE * 4), %rax
190885
+	test	%edx, %edx
190885
+	jnz	L(exit_null_on_second_vector)
190885
+
190885
+	vpcmpb	$0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1
190885
+	kmovd	%k1, %edx
190885
+	test	%edx, %edx
190885
+	jnz	L(exit_null_on_third_vector)
190885
+
190885
+	vpcmpb	$0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2
190885
+	kmovd	%k2, %edx
190885
+	test	%edx, %edx
190885
+	jnz	L(exit_null_on_fourth_vector)
190885
+
190885
+	vpcmpb	$0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3
190885
+	kmovd	%k3, %edx
190885
+	test	%edx, %edx
190885
+	jnz	L(exit_null_on_fifth_vector)
190885
+
190885
+	vpcmpb	$0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4
190885
+	add	$(VEC_SIZE * 4), %rax
190885
+	kmovd	%k4, %edx
190885
+	test	%edx, %edx
190885
+	jnz	L(exit_null_on_second_vector)
190885
+
190885
+	vpcmpb	$0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1
190885
+	kmovd	%k1, %edx
190885
+	test	%edx, %edx
190885
+	jnz	L(exit_null_on_third_vector)
190885
+
190885
+	vpcmpb	$0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2
190885
+	kmovd	%k2, %edx
190885
+	test	%edx, %edx
190885
+	jnz	L(exit_null_on_fourth_vector)
190885
+
190885
+	vpcmpb	$0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3
190885
+	kmovd	%k3, %edx
190885
+	test	%edx, %edx
190885
+	jnz	L(exit_null_on_fifth_vector)
190885
+
190885
+	test	$((VEC_SIZE * 4) - 1), %rax
190885
+	jz	L(align_four_vec_loop)
190885
+
190885
+	vpcmpb	$0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4
190885
+	add	$(VEC_SIZE * 5), %rax
190885
+	kmovd	%k4, %edx
190885
+	test	%edx, %edx
190885
+	jnz	L(exit)
190885
+
190885
+	test	$((VEC_SIZE * 4) - 1), %rax
190885
+	jz	L(align_four_vec_loop)
190885
+
190885
+	vpcmpb	$0, VEC_SIZE(%rax), %YMMZERO, %k0
190885
+	add	$VEC_SIZE, %rax
190885
+	kmovd	%k0, %edx
190885
+	test	%edx, %edx
190885
+	jnz	L(exit)
190885
+
190885
+	test	$((VEC_SIZE * 4) - 1), %rax
190885
+	jz	L(align_four_vec_loop)
190885
+
190885
+	vpcmpb	$0, VEC_SIZE(%rax), %YMMZERO, %k0
190885
+	add	$VEC_SIZE, %rax
190885
+	kmovd	%k0, %edx
190885
+	test	%edx, %edx
190885
+	jnz	L(exit)
190885
+
190885
+	test	$((VEC_SIZE * 4) - 1), %rax
190885
+	jz	L(align_four_vec_loop)
190885
+
190885
+	vpcmpb	$0, VEC_SIZE(%rax), %YMMZERO, %k1
190885
+	add	$VEC_SIZE, %rax
190885
+	kmovd	%k1, %edx
190885
+	test	%edx, %edx
190885
+	jnz	L(exit)
190885
+
190885
+	add	$VEC_SIZE, %rax
190885
+
190885
+	.p2align 4
190885
+L(align_four_vec_loop):
190885
+	VMOVA	(%rax), %YMM0
190885
+	VMOVA	(VEC_SIZE * 2)(%rax), %YMM1
190885
+	vpminub	VEC_SIZE(%rax), %YMM0, %YMM0
190885
+	vpminub	(VEC_SIZE * 3)(%rax), %YMM1, %YMM1
190885
+	vpminub	%YMM0, %YMM1, %YMM0
190885
+	/* If K0 != 0, there is a null byte.  */
190885
+	vpcmpb	$0, %YMM0, %YMMZERO, %k0
190885
+	add	$(VEC_SIZE * 4), %rax
190885
+	ktestd	%k0, %k0
190885
+	jz	L(align_four_vec_loop)
190885
+
190885
+	vpcmpb	$0, -(VEC_SIZE * 4)(%rax), %YMMZERO, %k0
190885
+	sub	$(VEC_SIZE * 5), %rax
190885
+	kmovd	%k0, %edx
190885
+	test	%edx, %edx
190885
+	jnz	L(exit_null_on_second_vector)
190885
+
190885
+	vpcmpb	$0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1
190885
+	kmovd	%k1, %edx
190885
+	test	%edx, %edx
190885
+	jnz	L(exit_null_on_third_vector)
190885
+
190885
+	vpcmpb	$0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2
190885
+	kmovd	%k2, %edx
190885
+	test	%edx, %edx
190885
+	jnz	L(exit_null_on_fourth_vector)
190885
+
190885
+	vpcmpb	$0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3
190885
+	kmovd	%k3, %edx
190885
+	sub	%rdi, %rax
190885
+	bsf	%rdx, %rdx
190885
+	add	%rdx, %rax
190885
+	add	$(VEC_SIZE * 4), %rax
190885
+	jmp	L(StartStrcpyPart)
190885
+
190885
+	.p2align 4
190885
+L(exit):
190885
+	sub	%rdi, %rax
190885
+L(exit_null_on_first_vector):
190885
+	bsf	%rdx, %rdx
190885
+	add	%rdx, %rax
190885
+	jmp	L(StartStrcpyPart)
190885
+
190885
+	.p2align 4
190885
+L(exit_null_on_second_vector):
190885
+	sub	%rdi, %rax
190885
+	bsf	%rdx, %rdx
190885
+	add	%rdx, %rax
190885
+	add	$VEC_SIZE, %rax
190885
+	jmp	L(StartStrcpyPart)
190885
+
190885
+	.p2align 4
190885
+L(exit_null_on_third_vector):
190885
+	sub	%rdi, %rax
190885
+	bsf	%rdx, %rdx
190885
+	add	%rdx, %rax
190885
+	add	$(VEC_SIZE * 2), %rax
190885
+	jmp	L(StartStrcpyPart)
190885
+
190885
+	.p2align 4
190885
+L(exit_null_on_fourth_vector):
190885
+	sub	%rdi, %rax
190885
+	bsf	%rdx, %rdx
190885
+	add	%rdx, %rax
190885
+	add	$(VEC_SIZE * 3), %rax
190885
+	jmp	L(StartStrcpyPart)
190885
+
190885
+	.p2align 4
190885
+L(exit_null_on_fifth_vector):
190885
+	sub	%rdi, %rax
190885
+	bsf	%rdx, %rdx
190885
+	add	%rdx, %rax
190885
+	add	$(VEC_SIZE * 4), %rax
190885
+
190885
+	.p2align 4
190885
+L(StartStrcpyPart):
190885
+	lea	(%r9, %rax), %rdi
190885
+	mov	%rsi, %rcx
190885
+	mov	%r9, %rax      /* save result */
190885
+
190885
+# ifdef USE_AS_STRNCAT
190885
+	test	%r8, %r8
190885
+	jz	L(ExitZero)
190885
+#  define USE_AS_STRNCPY
190885
+# endif
190885
+
190885
+# include "strcpy-evex.S"
190885
+#endif
190885
diff --git a/sysdeps/x86_64/multiarch/strcpy-evex.S b/sysdeps/x86_64/multiarch/strcpy-evex.S
new file mode 100644
index 00000000..a343a1a6
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strcpy-evex.S
@@ -0,0 +1,1003 @@
+/* strcpy with 256-bit EVEX instructions.
190885
+   Copyright (C) 2021 Free Software Foundation, Inc.
190885
+   This file is part of the GNU C Library.
190885
+
190885
+   The GNU C Library is free software; you can redistribute it and/or
190885
+   modify it under the terms of the GNU Lesser General Public
190885
+   License as published by the Free Software Foundation; either
190885
+   version 2.1 of the License, or (at your option) any later version.
190885
+
190885
+   The GNU C Library is distributed in the hope that it will be useful,
190885
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
190885
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
190885
+   Lesser General Public License for more details.
190885
+
190885
+   You should have received a copy of the GNU Lesser General Public
190885
+   License along with the GNU C Library; if not, see
190885
+   <https://www.gnu.org/licenses/>.  */
190885
+
190885
+#if IS_IN (libc)
190885
+
190885
+# ifndef USE_AS_STRCAT
190885
+#  include <sysdep.h>
190885
+
190885
+#  ifndef STRCPY
190885
+#   define STRCPY  __strcpy_evex
190885
+#  endif
190885
+
190885
+# endif
190885
+
190885
+# define VMOVU		vmovdqu64
190885
+# define VMOVA		vmovdqa64
190885
+
190885
+/* Number of bytes in a vector register */
190885
+# ifndef VEC_SIZE
190885
+#  define VEC_SIZE	32
190885
+# endif
190885
+
190885
+# define XMM2		xmm18
190885
+# define XMM3		xmm19
190885
+
190885
+# define YMM2		ymm18
190885
+# define YMM3		ymm19
190885
+# define YMM4		ymm20
190885
+# define YMM5		ymm21
190885
+# define YMM6		ymm22
190885
+# define YMM7		ymm23
190885
+
190885
+# ifndef USE_AS_STRCAT
190885
+
190885
+/* zero register */
190885
+#  define XMMZERO	xmm16
190885
+#  define YMMZERO	ymm16
190885
+#  define YMM1		ymm17
190885
+
190885
+	.section .text.evex,"ax",@progbits
190885
+ENTRY (STRCPY)
190885
+#  ifdef USE_AS_STRNCPY
190885
+	mov	%RDX_LP, %R8_LP
190885
+	test	%R8_LP, %R8_LP
190885
+	jz	L(ExitZero)
190885
+#  endif
190885
+	mov	%rsi, %rcx
190885
+#  ifndef USE_AS_STPCPY
190885
+	mov	%rdi, %rax      /* save result */
190885
+#  endif
190885
+
190885
+	vpxorq	%XMMZERO, %XMMZERO, %XMMZERO
190885
+# endif
190885
+
190885
+	and	$((VEC_SIZE * 4) - 1), %ecx
190885
+	cmp	$(VEC_SIZE * 2), %ecx
190885
+	jbe	L(SourceStringAlignmentLessTwoVecSize)
190885
+
190885
+	and	$-VEC_SIZE, %rsi
190885
+	and	$(VEC_SIZE - 1), %ecx
190885
+
190885
+	vpcmpb	$0, (%rsi), %YMMZERO, %k0
190885
+	kmovd	%k0, %edx
190885
+	shr	%cl, %rdx
190885
+
190885
+# ifdef USE_AS_STRNCPY
190885
+#  if defined USE_AS_STPCPY || defined USE_AS_STRCAT
190885
+	mov	$VEC_SIZE, %r10
190885
+	sub	%rcx, %r10
190885
+	cmp	%r10, %r8
190885
+#  else
190885
+	mov	$(VEC_SIZE + 1), %r10
190885
+	sub	%rcx, %r10
190885
+	cmp	%r10, %r8
190885
+#  endif
190885
+	jbe	L(CopyVecSizeTailCase2OrCase3)
190885
+# endif
190885
+	test	%edx, %edx
190885
+	jnz	L(CopyVecSizeTail)
190885
+
190885
+	vpcmpb	$0, VEC_SIZE(%rsi), %YMMZERO, %k1
190885
+	kmovd	%k1, %edx
190885
+
190885
+# ifdef USE_AS_STRNCPY
190885
+	add	$VEC_SIZE, %r10
190885
+	cmp	%r10, %r8
190885
+	jbe	L(CopyTwoVecSizeCase2OrCase3)
190885
+# endif
190885
+	test	%edx, %edx
190885
+	jnz	L(CopyTwoVecSize)
190885
+
190885
+	VMOVU	(%rsi, %rcx), %YMM2   /* copy VEC_SIZE bytes */
190885
+	VMOVU	%YMM2, (%rdi)
190885
+
190885
+/* If source address alignment != destination address alignment */
190885
+	.p2align 4
190885
+L(UnalignVecSizeBoth):
190885
+	sub	%rcx, %rdi
190885
+# ifdef USE_AS_STRNCPY
190885
+	add	%rcx, %r8
190885
+	sbb	%rcx, %rcx
190885
+	or	%rcx, %r8
190885
+# endif
190885
+	mov	$VEC_SIZE, %rcx
190885
+	VMOVA	(%rsi, %rcx), %YMM2
190885
+	VMOVU	%YMM2, (%rdi, %rcx)
190885
+	VMOVA	VEC_SIZE(%rsi, %rcx), %YMM2
190885
+	vpcmpb	$0, %YMM2, %YMMZERO, %k0
190885
+	kmovd	%k0, %edx
190885
+	add	$VEC_SIZE, %rcx
190885
+# ifdef USE_AS_STRNCPY
190885
+	sub	$(VEC_SIZE * 3), %r8
190885
+	jbe	L(CopyVecSizeCase2OrCase3)
190885
+# endif
190885
+	test	%edx, %edx
190885
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
190885
+	jnz	L(CopyVecSizeUnalignedVec2)
190885
+# else
190885
+	jnz	L(CopyVecSize)
190885
+# endif
190885
+
190885
+	VMOVU	%YMM2, (%rdi, %rcx)
190885
+	VMOVA	VEC_SIZE(%rsi, %rcx), %YMM3
190885
+	vpcmpb	$0, %YMM3, %YMMZERO, %k0
190885
+	kmovd	%k0, %edx
190885
+	add	$VEC_SIZE, %rcx
190885
+# ifdef USE_AS_STRNCPY
190885
+	sub	$VEC_SIZE, %r8
190885
+	jbe	L(CopyVecSizeCase2OrCase3)
190885
+# endif
190885
+	test	%edx, %edx
190885
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
190885
+	jnz	L(CopyVecSizeUnalignedVec3)
190885
+# else
190885
+	jnz	L(CopyVecSize)
190885
+# endif
190885
+
190885
+	VMOVU	%YMM3, (%rdi, %rcx)
190885
+	VMOVA	VEC_SIZE(%rsi, %rcx), %YMM4
190885
+	vpcmpb	$0, %YMM4, %YMMZERO, %k0
190885
+	kmovd	%k0, %edx
190885
+	add	$VEC_SIZE, %rcx
190885
+# ifdef USE_AS_STRNCPY
190885
+	sub	$VEC_SIZE, %r8
190885
+	jbe	L(CopyVecSizeCase2OrCase3)
190885
+# endif
190885
+	test	%edx, %edx
190885
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
190885
+	jnz	L(CopyVecSizeUnalignedVec4)
190885
+# else
190885
+	jnz	L(CopyVecSize)
190885
+# endif
190885
+
190885
+	VMOVU	%YMM4, (%rdi, %rcx)
190885
+	VMOVA	VEC_SIZE(%rsi, %rcx), %YMM2
190885
+	vpcmpb	$0, %YMM2, %YMMZERO, %k0
190885
+	kmovd	%k0, %edx
190885
+	add	$VEC_SIZE, %rcx
190885
+# ifdef USE_AS_STRNCPY
190885
+	sub	$VEC_SIZE, %r8
190885
+	jbe	L(CopyVecSizeCase2OrCase3)
190885
+# endif
190885
+	test	%edx, %edx
190885
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
190885
+	jnz	L(CopyVecSizeUnalignedVec2)
190885
+# else
190885
+	jnz	L(CopyVecSize)
190885
+# endif
190885
+
190885
+	VMOVU	%YMM2, (%rdi, %rcx)
190885
+	VMOVA	VEC_SIZE(%rsi, %rcx), %YMM2
190885
+	vpcmpb	$0, %YMM2, %YMMZERO, %k0
190885
+	kmovd	%k0, %edx
190885
+	add	$VEC_SIZE, %rcx
190885
+# ifdef USE_AS_STRNCPY
190885
+	sub	$VEC_SIZE, %r8
190885
+	jbe	L(CopyVecSizeCase2OrCase3)
190885
+# endif
190885
+	test	%edx, %edx
190885
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
190885
+	jnz	L(CopyVecSizeUnalignedVec2)
190885
+# else
190885
+	jnz	L(CopyVecSize)
190885
+# endif
190885
+
190885
+	VMOVA	VEC_SIZE(%rsi, %rcx), %YMM3
190885
+	VMOVU	%YMM2, (%rdi, %rcx)
190885
+	vpcmpb	$0, %YMM3, %YMMZERO, %k0
190885
+	kmovd	%k0, %edx
190885
+	add	$VEC_SIZE, %rcx
190885
+# ifdef USE_AS_STRNCPY
190885
+	sub	$VEC_SIZE, %r8
190885
+	jbe	L(CopyVecSizeCase2OrCase3)
190885
+# endif
190885
+	test	%edx, %edx
190885
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
190885
+	jnz	L(CopyVecSizeUnalignedVec3)
190885
+# else
190885
+	jnz	L(CopyVecSize)
190885
+# endif
190885
+
190885
+	VMOVU	%YMM3, (%rdi, %rcx)
190885
+	mov	%rsi, %rdx
190885
+	lea	VEC_SIZE(%rsi, %rcx), %rsi
190885
+	and	$-(VEC_SIZE * 4), %rsi
190885
+	sub	%rsi, %rdx
190885
+	sub	%rdx, %rdi
190885
+# ifdef USE_AS_STRNCPY
190885
+	lea	(VEC_SIZE * 8)(%r8, %rdx), %r8
190885
+# endif
190885
+L(UnalignedFourVecSizeLoop):
190885
+	VMOVA	(%rsi), %YMM4
190885
+	VMOVA	VEC_SIZE(%rsi), %YMM5
190885
+	VMOVA	(VEC_SIZE * 2)(%rsi), %YMM6
190885
+	VMOVA	(VEC_SIZE * 3)(%rsi), %YMM7
190885
+	vpminub	%YMM5, %YMM4, %YMM2
190885
+	vpminub	%YMM7, %YMM6, %YMM3
190885
+	vpminub	%YMM2, %YMM3, %YMM2
190885
+	/* If K7 != 0, there is a null byte.  */
190885
+	vpcmpb	$0, %YMM2, %YMMZERO, %k7
190885
+	kmovd	%k7, %edx
190885
+# ifdef USE_AS_STRNCPY
190885
+	sub	$(VEC_SIZE * 4), %r8
190885
+	jbe	L(UnalignedLeaveCase2OrCase3)
190885
+# endif
190885
+	test	%edx, %edx
190885
+	jnz	L(UnalignedFourVecSizeLeave)
190885
+
190885
+L(UnalignedFourVecSizeLoop_start):
190885
+	add	$(VEC_SIZE * 4), %rdi
190885
+	add	$(VEC_SIZE * 4), %rsi
190885
+	VMOVU	%YMM4, -(VEC_SIZE * 4)(%rdi)
190885
+	VMOVA	(%rsi), %YMM4
190885
+	VMOVU	%YMM5, -(VEC_SIZE * 3)(%rdi)
190885
+	VMOVA	VEC_SIZE(%rsi), %YMM5
190885
+	vpminub	%YMM5, %YMM4, %YMM2
190885
+	VMOVU	%YMM6, -(VEC_SIZE * 2)(%rdi)
190885
+	VMOVA	(VEC_SIZE * 2)(%rsi), %YMM6
190885
+	VMOVU	%YMM7, -VEC_SIZE(%rdi)
190885
+	VMOVA	(VEC_SIZE * 3)(%rsi), %YMM7
190885
+	vpminub	%YMM7, %YMM6, %YMM3
190885
+	vpminub	%YMM2, %YMM3, %YMM2
190885
+	/* If K7 != 0, there is a null byte.  */
190885
+	vpcmpb	$0, %YMM2, %YMMZERO, %k7
190885
+	kmovd	%k7, %edx
190885
+# ifdef USE_AS_STRNCPY
190885
+	sub	$(VEC_SIZE * 4), %r8
190885
+	jbe	L(UnalignedLeaveCase2OrCase3)
190885
+# endif
190885
+	test	%edx, %edx
190885
+	jz	L(UnalignedFourVecSizeLoop_start)
190885
+
190885
+L(UnalignedFourVecSizeLeave):
190885
+	vpcmpb	$0, %YMM4, %YMMZERO, %k1
190885
+	kmovd	%k1, %edx
190885
+	test	%edx, %edx
190885
+	jnz	L(CopyVecSizeUnaligned_0)
190885
+
190885
+	vpcmpb	$0, %YMM5, %YMMZERO, %k2
190885
+	kmovd	%k2, %ecx
190885
+	test	%ecx, %ecx
190885
+	jnz	L(CopyVecSizeUnaligned_16)
190885
+
190885
+	vpcmpb	$0, %YMM6, %YMMZERO, %k3
190885
+	kmovd	%k3, %edx
190885
+	test	%edx, %edx
190885
+	jnz	L(CopyVecSizeUnaligned_32)
190885
+
190885
+	vpcmpb	$0, %YMM7, %YMMZERO, %k4
190885
+	kmovd	%k4, %ecx
190885
+	bsf	%ecx, %edx
190885
+	VMOVU	%YMM4, (%rdi)
190885
+	VMOVU	%YMM5, VEC_SIZE(%rdi)
190885
+	VMOVU	%YMM6, (VEC_SIZE * 2)(%rdi)
190885
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
190885
+# ifdef USE_AS_STPCPY
190885
+	lea	(VEC_SIZE * 3)(%rdi, %rdx), %rax
190885
+# endif
190885
+	VMOVU	%YMM7, (VEC_SIZE * 3)(%rdi)
190885
+	add	$(VEC_SIZE - 1), %r8
190885
+	sub	%rdx, %r8
190885
+	lea	((VEC_SIZE * 3) + 1)(%rdi, %rdx), %rdi
190885
+	jmp	L(StrncpyFillTailWithZero)
190885
+# else
190885
+	add	$(VEC_SIZE * 3), %rsi
190885
+	add	$(VEC_SIZE * 3), %rdi
190885
+	jmp	L(CopyVecSizeExit)
190885
+# endif
190885
+
190885
+/* If source address alignment == destination address alignment */
190885
+
190885
+L(SourceStringAlignmentLessTwoVecSize):
190885
+	VMOVU	(%rsi), %YMM3
190885
+	VMOVU	VEC_SIZE(%rsi), %YMM2
190885
+	vpcmpb	$0, %YMM3, %YMMZERO, %k0
190885
+	kmovd	%k0, %edx
190885
+
190885
+# ifdef USE_AS_STRNCPY
190885
+#  if defined USE_AS_STPCPY || defined USE_AS_STRCAT
190885
+	cmp	$VEC_SIZE, %r8
190885
+#  else
190885
+	cmp	$(VEC_SIZE + 1), %r8
190885
+#  endif
190885
+	jbe	L(CopyVecSizeTail1Case2OrCase3)
190885
+# endif
190885
+	test	%edx, %edx
190885
+	jnz	L(CopyVecSizeTail1)
190885
+
190885
+	VMOVU	%YMM3, (%rdi)
190885
+	vpcmpb	$0, %YMM2, %YMMZERO, %k0
190885
+	kmovd	%k0, %edx
190885
+
190885
+# ifdef USE_AS_STRNCPY
190885
+#  if defined USE_AS_STPCPY || defined USE_AS_STRCAT
190885
+	cmp	$(VEC_SIZE * 2), %r8
190885
+#  else
190885
+	cmp	$((VEC_SIZE * 2) + 1), %r8
190885
+#  endif
190885
+	jbe	L(CopyTwoVecSize1Case2OrCase3)
190885
+# endif
190885
+	test	%edx, %edx
190885
+	jnz	L(CopyTwoVecSize1)
190885
+
190885
+	and	$-VEC_SIZE, %rsi
190885
+	and	$(VEC_SIZE - 1), %ecx
190885
+	jmp	L(UnalignVecSizeBoth)
190885
+
190885
+/*------End of main part with loops---------------------*/
190885
+
190885
+/* Case1 */
190885
+
190885
+# if (!defined USE_AS_STRNCPY) || (defined USE_AS_STRCAT)
190885
+	.p2align 4
190885
+L(CopyVecSize):
190885
+	add	%rcx, %rdi
190885
+# endif
190885
+L(CopyVecSizeTail):
190885
+	add	%rcx, %rsi
190885
+L(CopyVecSizeTail1):
190885
+	bsf	%edx, %edx
190885
+L(CopyVecSizeExit):
190885
+	cmp	$32, %edx
190885
+	jae	L(Exit32_63)
190885
+	cmp	$16, %edx
190885
+	jae	L(Exit16_31)
190885
+	cmp	$8, %edx
190885
+	jae	L(Exit8_15)
190885
+	cmp	$4, %edx
190885
+	jae	L(Exit4_7)
190885
+	cmp	$3, %edx
190885
+	je	L(Exit3)
190885
+	cmp	$1, %edx
190885
+	ja	L(Exit2)
190885
+	je	L(Exit1)
190885
+	movb	$0, (%rdi)
190885
+# ifdef USE_AS_STPCPY
190885
+	lea	(%rdi), %rax
190885
+# endif
190885
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
190885
+	sub	$1, %r8
190885
+	lea	1(%rdi), %rdi
190885
+	jnz	L(StrncpyFillTailWithZero)
190885
+# endif
190885
+	ret
190885
+
190885
+	.p2align 4
190885
+L(CopyTwoVecSize1):
190885
+	add	$VEC_SIZE, %rsi
190885
+	add	$VEC_SIZE, %rdi
190885
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
190885
+	sub	$VEC_SIZE, %r8
190885
+# endif
190885
+	jmp	L(CopyVecSizeTail1)
190885
+
190885
+	.p2align 4
190885
+L(CopyTwoVecSize):
190885
+	bsf	%edx, %edx
190885
+	add	%rcx, %rsi
190885
+	add	$VEC_SIZE, %edx
190885
+	sub	%ecx, %edx
190885
+	jmp	L(CopyVecSizeExit)
190885
+
190885
+	.p2align 4
190885
+L(CopyVecSizeUnaligned_0):
190885
+	bsf	%edx, %edx
190885
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
190885
+# ifdef USE_AS_STPCPY
190885
+	lea	(%rdi, %rdx), %rax
190885
+# endif
190885
+	VMOVU	%YMM4, (%rdi)
190885
+	add	$((VEC_SIZE * 4) - 1), %r8
190885
+	sub	%rdx, %r8
190885
+	lea	1(%rdi, %rdx), %rdi
190885
+	jmp	L(StrncpyFillTailWithZero)
190885
+# else
190885
+	jmp	L(CopyVecSizeExit)
190885
+# endif
190885
+
190885
+	.p2align 4
190885
+L(CopyVecSizeUnaligned_16):
190885
+	bsf	%ecx, %edx
190885
+	VMOVU	%YMM4, (%rdi)
190885
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
190885
+# ifdef USE_AS_STPCPY
190885
+	lea	VEC_SIZE(%rdi, %rdx), %rax
190885
+# endif
190885
+	VMOVU	%YMM5, VEC_SIZE(%rdi)
190885
+	add	$((VEC_SIZE * 3) - 1), %r8
190885
+	sub	%rdx, %r8
190885
+	lea	(VEC_SIZE + 1)(%rdi, %rdx), %rdi
190885
+	jmp	L(StrncpyFillTailWithZero)
190885
+# else
190885
+	add	$VEC_SIZE, %rsi
190885
+	add	$VEC_SIZE, %rdi
190885
+	jmp	L(CopyVecSizeExit)
190885
+# endif
190885
+
190885
+	.p2align 4
190885
+L(CopyVecSizeUnaligned_32):
190885
+	bsf	%edx, %edx
190885
+	VMOVU	%YMM4, (%rdi)
190885
+	VMOVU	%YMM5, VEC_SIZE(%rdi)
190885
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
190885
+# ifdef USE_AS_STPCPY
190885
+	lea	(VEC_SIZE * 2)(%rdi, %rdx), %rax
190885
+# endif
190885
+	VMOVU	%YMM6, (VEC_SIZE * 2)(%rdi)
190885
+	add	$((VEC_SIZE * 2) - 1), %r8
190885
+	sub	%rdx, %r8
190885
+	lea	((VEC_SIZE * 2) + 1)(%rdi, %rdx), %rdi
190885
+	jmp	L(StrncpyFillTailWithZero)
190885
+# else
190885
+	add	$(VEC_SIZE * 2), %rsi
190885
+	add	$(VEC_SIZE * 2), %rdi
190885
+	jmp	L(CopyVecSizeExit)
190885
+# endif
190885
+
190885
+# ifdef USE_AS_STRNCPY
190885
+#  ifndef USE_AS_STRCAT
190885
+	.p2align 4
190885
+L(CopyVecSizeUnalignedVec6):
190885
+	VMOVU	%YMM6, (%rdi, %rcx)
190885
+	jmp	L(CopyVecSizeVecExit)
190885
+
190885
+	.p2align 4
190885
+L(CopyVecSizeUnalignedVec5):
190885
+	VMOVU	%YMM5, (%rdi, %rcx)
190885
+	jmp	L(CopyVecSizeVecExit)
190885
+
190885
+	.p2align 4
190885
+L(CopyVecSizeUnalignedVec4):
190885
+	VMOVU	%YMM4, (%rdi, %rcx)
190885
+	jmp	L(CopyVecSizeVecExit)
190885
+
190885
+	.p2align 4
190885
+L(CopyVecSizeUnalignedVec3):
190885
+	VMOVU	%YMM3, (%rdi, %rcx)
190885
+	jmp	L(CopyVecSizeVecExit)
190885
+#  endif
190885
+
190885
+/* Case2 */
190885
+
190885
+	.p2align 4
190885
+L(CopyVecSizeCase2):
190885
+	add	$VEC_SIZE, %r8
190885
+	add	%rcx, %rdi
190885
+	add	%rcx, %rsi
190885
+	bsf	%edx, %edx
190885
+	cmp	%r8d, %edx
190885
+	jb	L(CopyVecSizeExit)
190885
+	jmp	L(StrncpyExit)
190885
+
190885
+	.p2align 4
190885
+L(CopyTwoVecSizeCase2):
190885
+	add	%rcx, %rsi
190885
+	bsf	%edx, %edx
190885
+	add	$VEC_SIZE, %edx
190885
+	sub	%ecx, %edx
190885
+	cmp	%r8d, %edx
190885
+	jb	L(CopyVecSizeExit)
190885
+	jmp	L(StrncpyExit)
190885
+
190885
+L(CopyVecSizeTailCase2):
190885
+	add	%rcx, %rsi
190885
+	bsf	%edx, %edx
190885
+	cmp	%r8d, %edx
190885
+	jb	L(CopyVecSizeExit)
190885
+	jmp	L(StrncpyExit)
190885
+
190885
+L(CopyVecSizeTail1Case2):
190885
+	bsf	%edx, %edx
190885
+	cmp	%r8d, %edx
190885
+	jb	L(CopyVecSizeExit)
190885
+	jmp	L(StrncpyExit)
190885
+
190885
+/* Case2 or Case3,  Case3 */
190885
+
190885
+	.p2align 4
190885
+L(CopyVecSizeCase2OrCase3):
190885
+	test	%rdx, %rdx
190885
+	jnz	L(CopyVecSizeCase2)
190885
+L(CopyVecSizeCase3):
190885
+	add	$VEC_SIZE, %r8
190885
+	add	%rcx, %rdi
190885
+	add	%rcx, %rsi
190885
+	jmp	L(StrncpyExit)
190885
+
190885
+	.p2align 4
190885
+L(CopyTwoVecSizeCase2OrCase3):
190885
+	test	%rdx, %rdx
190885
+	jnz	L(CopyTwoVecSizeCase2)
190885
+	add	%rcx, %rsi
190885
+	jmp	L(StrncpyExit)
190885
+
190885
+	.p2align 4
190885
+L(CopyVecSizeTailCase2OrCase3):
190885
+	test	%rdx, %rdx
190885
+	jnz	L(CopyVecSizeTailCase2)
190885
+	add	%rcx, %rsi
190885
+	jmp	L(StrncpyExit)
190885
+
190885
+	.p2align 4
190885
+L(CopyTwoVecSize1Case2OrCase3):
190885
+	add	$VEC_SIZE, %rdi
190885
+	add	$VEC_SIZE, %rsi
190885
+	sub	$VEC_SIZE, %r8
190885
+L(CopyVecSizeTail1Case2OrCase3):
190885
+	test	%rdx, %rdx
190885
+	jnz	L(CopyVecSizeTail1Case2)
190885
+	jmp	L(StrncpyExit)
190885
+# endif
190885
+
190885
+/*------------End labels regarding with copying 1-VEC_SIZE bytes--and 1-(VEC_SIZE*2) bytes----*/
190885
+
190885
+	.p2align 4
190885
+L(Exit1):
190885
+	movzwl	(%rsi), %edx
190885
+	mov	%dx, (%rdi)
190885
+# ifdef USE_AS_STPCPY
190885
+	lea	1(%rdi), %rax
190885
+# endif
190885
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
190885
+	sub	$2, %r8
190885
+	lea	2(%rdi), %rdi
190885
+	jnz	L(StrncpyFillTailWithZero)
190885
+# endif
190885
+	ret
190885
+
190885
+	.p2align 4
190885
+L(Exit2):
190885
+	movzwl	(%rsi), %ecx
190885
+	mov	%cx, (%rdi)
190885
+	movb	$0, 2(%rdi)
190885
+# ifdef USE_AS_STPCPY
190885
+	lea	2(%rdi), %rax
190885
+# endif
190885
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
190885
+	sub	$3, %r8
190885
+	lea	3(%rdi), %rdi
190885
+	jnz	L(StrncpyFillTailWithZero)
190885
+# endif
190885
+	ret
190885
+
190885
+	.p2align 4
190885
+L(Exit3):
190885
+	mov	(%rsi), %edx
190885
+	mov	%edx, (%rdi)
190885
+# ifdef USE_AS_STPCPY
190885
+	lea	3(%rdi), %rax
190885
+# endif
190885
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
190885
+	sub	$4, %r8
190885
+	lea	4(%rdi), %rdi
190885
+	jnz	L(StrncpyFillTailWithZero)
190885
+# endif
190885
+	ret
190885
+
190885
+	.p2align 4
190885
+L(Exit4_7):
190885
+	mov	(%rsi), %ecx
190885
+	mov	%ecx, (%rdi)
190885
+	mov	-3(%rsi, %rdx), %ecx
190885
+	mov	%ecx, -3(%rdi, %rdx)
190885
+# ifdef USE_AS_STPCPY
190885
+	lea	(%rdi, %rdx), %rax
190885
+# endif
190885
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
190885
+	sub	%rdx, %r8
190885
+	sub	$1, %r8
190885
+	lea	1(%rdi, %rdx), %rdi
190885
+	jnz	L(StrncpyFillTailWithZero)
190885
+# endif
190885
+	ret
190885
+
190885
+	.p2align 4
190885
+L(Exit8_15):
190885
+	mov	(%rsi), %rcx
190885
+	mov	-7(%rsi, %rdx), %r9
190885
+	mov	%rcx, (%rdi)
190885
+	mov	%r9, -7(%rdi, %rdx)
190885
+# ifdef USE_AS_STPCPY
190885
+	lea	(%rdi, %rdx), %rax
190885
+# endif
190885
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
190885
+	sub	%rdx, %r8
190885
+	sub	$1, %r8
190885
+	lea	1(%rdi, %rdx), %rdi
190885
+	jnz	L(StrncpyFillTailWithZero)
190885
+# endif
190885
+	ret
190885
+
190885
+	.p2align 4
190885
+L(Exit16_31):
190885
+	VMOVU	(%rsi), %XMM2
190885
+	VMOVU	-15(%rsi, %rdx), %XMM3
190885
+	VMOVU	%XMM2, (%rdi)
190885
+	VMOVU	%XMM3, -15(%rdi, %rdx)
190885
+# ifdef USE_AS_STPCPY
190885
+	lea	(%rdi, %rdx), %rax
190885
+# endif
190885
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
190885
+	sub %rdx, %r8
190885
+	sub $1, %r8
190885
+	lea 1(%rdi, %rdx), %rdi
190885
+	jnz L(StrncpyFillTailWithZero)
190885
+# endif
190885
+	ret
190885
+
190885
+	.p2align 4
190885
+L(Exit32_63):
190885
+	VMOVU	(%rsi), %YMM2
190885
+	VMOVU	-31(%rsi, %rdx), %YMM3
190885
+	VMOVU	%YMM2, (%rdi)
190885
+	VMOVU	%YMM3, -31(%rdi, %rdx)
190885
+# ifdef USE_AS_STPCPY
190885
+	lea	(%rdi, %rdx), %rax
190885
+# endif
190885
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
190885
+	sub	%rdx, %r8
190885
+	sub	$1, %r8
190885
+	lea	1(%rdi, %rdx), %rdi
190885
+	jnz	L(StrncpyFillTailWithZero)
190885
+# endif
190885
+	ret
190885
+
190885
+# ifdef USE_AS_STRNCPY
190885
+
190885
+	.p2align 4
190885
+L(StrncpyExit1):
190885
+	movzbl	(%rsi), %edx
190885
+	mov	%dl, (%rdi)
190885
+#  ifdef USE_AS_STPCPY
190885
+	lea	1(%rdi), %rax
190885
+#  endif
190885
+#  ifdef USE_AS_STRCAT
190885
+	movb	$0, 1(%rdi)
190885
+#  endif
190885
+	ret
190885
+
190885
+	.p2align 4
190885
+L(StrncpyExit2):
190885
+	movzwl	(%rsi), %edx
190885
+	mov	%dx, (%rdi)
190885
+#  ifdef USE_AS_STPCPY
190885
+	lea	2(%rdi), %rax
190885
+#  endif
190885
+#  ifdef USE_AS_STRCAT
190885
+	movb	$0, 2(%rdi)
190885
+#  endif
190885
+	ret
190885
+
190885
+	.p2align 4
190885
+L(StrncpyExit3_4):
190885
+	movzwl	(%rsi), %ecx
190885
+	movzwl	-2(%rsi, %r8), %edx
190885
+	mov	%cx, (%rdi)
190885
+	mov	%dx, -2(%rdi, %r8)
190885
+#  ifdef USE_AS_STPCPY
190885
+	lea	(%rdi, %r8), %rax
190885
+#  endif
190885
+#  ifdef USE_AS_STRCAT
190885
+	movb	$0, (%rdi, %r8)
190885
+#  endif
190885
+	ret
190885
+
190885
+	.p2align 4
190885
+L(StrncpyExit5_8):
190885
+	mov	(%rsi), %ecx
190885
+	mov	-4(%rsi, %r8), %edx
190885
+	mov	%ecx, (%rdi)
190885
+	mov	%edx, -4(%rdi, %r8)
190885
+#  ifdef USE_AS_STPCPY
190885
+	lea	(%rdi, %r8), %rax
190885
+#  endif
190885
+#  ifdef USE_AS_STRCAT
190885
+	movb	$0, (%rdi, %r8)
190885
+#  endif
190885
+	ret
190885
+
190885
+	.p2align 4
190885
+L(StrncpyExit9_16):
190885
+	mov	(%rsi), %rcx
190885
+	mov	-8(%rsi, %r8), %rdx
190885
+	mov	%rcx, (%rdi)
190885
+	mov	%rdx, -8(%rdi, %r8)
190885
+#  ifdef USE_AS_STPCPY
190885
+	lea	(%rdi, %r8), %rax
190885
+#  endif
190885
+#  ifdef USE_AS_STRCAT
190885
+	movb	$0, (%rdi, %r8)
190885
+#  endif
190885
+	ret
190885
+
190885
+	.p2align 4
190885
+L(StrncpyExit17_32):
190885
+	VMOVU	(%rsi), %XMM2
190885
+	VMOVU	-16(%rsi, %r8), %XMM3
190885
+	VMOVU	%XMM2, (%rdi)
190885
+	VMOVU	%XMM3, -16(%rdi, %r8)
190885
+#  ifdef USE_AS_STPCPY
190885
+	lea	(%rdi, %r8), %rax
190885
+#  endif
190885
+#  ifdef USE_AS_STRCAT
190885
+	movb	$0, (%rdi, %r8)
190885
+#  endif
190885
+	ret
190885
+
190885
+	.p2align 4
190885
+L(StrncpyExit33_64):
190885
+	/*  0/32, 31/16 */
190885
+	VMOVU	(%rsi), %YMM2
190885
+	VMOVU	-VEC_SIZE(%rsi, %r8), %YMM3
190885
+	VMOVU	%YMM2, (%rdi)
190885
+	VMOVU	%YMM3, -VEC_SIZE(%rdi, %r8)
190885
+#  ifdef USE_AS_STPCPY
190885
+	lea	(%rdi, %r8), %rax
190885
+#  endif
190885
+#  ifdef USE_AS_STRCAT
190885
+	movb	$0, (%rdi, %r8)
190885
+#  endif
190885
+	ret
190885
+
190885
+	.p2align 4
190885
+L(StrncpyExit65):
190885
+	/* 0/32, 32/32, 64/1 */
190885
+	VMOVU	(%rsi), %YMM2
190885
+	VMOVU	32(%rsi), %YMM3
190885
+	mov	64(%rsi), %cl
190885
+	VMOVU	%YMM2, (%rdi)
190885
+	VMOVU	%YMM3, 32(%rdi)
190885
+	mov	%cl, 64(%rdi)
190885
+#  ifdef USE_AS_STPCPY
190885
+	lea	65(%rdi), %rax
190885
+#  endif
190885
+#  ifdef USE_AS_STRCAT
190885
+	movb	$0, 65(%rdi)
190885
+#  endif
190885
+	ret
190885
+
190885
+#  ifndef USE_AS_STRCAT
190885
+
190885
+	.p2align 4
190885
+L(Fill1):
190885
+	mov	%dl, (%rdi)
190885
+	ret
190885
+
190885
+	.p2align 4
190885
+L(Fill2):
190885
+	mov	%dx, (%rdi)
190885
+	ret
190885
+
190885
+	.p2align 4
190885
+L(Fill3_4):
190885
+	mov	%dx, (%rdi)
190885
+	mov     %dx, -2(%rdi, %r8)
190885
+	ret
190885
+
190885
+	.p2align 4
190885
+L(Fill5_8):
190885
+	mov	%edx, (%rdi)
190885
+	mov     %edx, -4(%rdi, %r8)
190885
+	ret
190885
+
190885
+	.p2align 4
190885
+L(Fill9_16):
190885
+	mov	%rdx, (%rdi)
190885
+	mov	%rdx, -8(%rdi, %r8)
190885
+	ret
190885
+
190885
+	.p2align 4
190885
+L(Fill17_32):
190885
+	VMOVU	%XMMZERO, (%rdi)
190885
+	VMOVU	%XMMZERO, -16(%rdi, %r8)
190885
+	ret
190885
+
190885
+	.p2align 4
190885
+L(CopyVecSizeUnalignedVec2):
190885
+	VMOVU	%YMM2, (%rdi, %rcx)
190885
+
190885
+	.p2align 4
190885
+L(CopyVecSizeVecExit):
190885
+	bsf	%edx, %edx
190885
+	add	$(VEC_SIZE - 1), %r8
190885
+	add	%rcx, %rdi
190885
+#   ifdef USE_AS_STPCPY
190885
+	lea	(%rdi, %rdx), %rax
190885
+#   endif
190885
+	sub	%rdx, %r8
190885
+	lea	1(%rdi, %rdx), %rdi
190885
+
190885
+	.p2align 4
190885
+L(StrncpyFillTailWithZero):
190885
+	xor	%edx, %edx
190885
+	sub	$VEC_SIZE, %r8
190885
+	jbe	L(StrncpyFillExit)
190885
+
190885
+	VMOVU	%YMMZERO, (%rdi)
190885
+	add	$VEC_SIZE, %rdi
190885
+
190885
+	mov	%rdi, %rsi
190885
+	and	$(VEC_SIZE - 1), %esi
190885
+	sub	%rsi, %rdi
190885
+	add	%rsi, %r8
190885
+	sub	$(VEC_SIZE * 4), %r8
190885
+	jb	L(StrncpyFillLessFourVecSize)
190885
+
190885
+L(StrncpyFillLoopVmovdqa):
190885
+	VMOVA	%YMMZERO, (%rdi)
190885
+	VMOVA	%YMMZERO, VEC_SIZE(%rdi)
190885
+	VMOVA	%YMMZERO, (VEC_SIZE * 2)(%rdi)
190885
+	VMOVA	%YMMZERO, (VEC_SIZE * 3)(%rdi)
190885
+	add	$(VEC_SIZE * 4), %rdi
190885
+	sub	$(VEC_SIZE * 4), %r8
190885
+	jae	L(StrncpyFillLoopVmovdqa)
190885
+
190885
+L(StrncpyFillLessFourVecSize):
190885
+	add	$(VEC_SIZE * 2), %r8
190885
+	jl	L(StrncpyFillLessTwoVecSize)
190885
+	VMOVA	%YMMZERO, (%rdi)
190885
+	VMOVA	%YMMZERO, VEC_SIZE(%rdi)
190885
+	add	$(VEC_SIZE * 2), %rdi
190885
+	sub	$VEC_SIZE, %r8
190885
+	jl	L(StrncpyFillExit)
190885
+	VMOVA	%YMMZERO, (%rdi)
190885
+	add	$VEC_SIZE, %rdi
190885
+	jmp	L(Fill)
190885
+
190885
+	.p2align 4
190885
+L(StrncpyFillLessTwoVecSize):
190885
+	add	$VEC_SIZE, %r8
190885
+	jl	L(StrncpyFillExit)
190885
+	VMOVA	%YMMZERO, (%rdi)
190885
+	add	$VEC_SIZE, %rdi
190885
+	jmp	L(Fill)
190885
+
190885
+	.p2align 4
190885
+L(StrncpyFillExit):
190885
+	add	$VEC_SIZE, %r8
190885
+L(Fill):
190885
+	cmp	$17, %r8d
190885
+	jae	L(Fill17_32)
190885
+	cmp	$9, %r8d
190885
+	jae	L(Fill9_16)
190885
+	cmp	$5, %r8d
190885
+	jae	L(Fill5_8)
190885
+	cmp	$3, %r8d
190885
+	jae	L(Fill3_4)
190885
+	cmp	$1, %r8d
190885
+	ja	L(Fill2)
190885
+	je	L(Fill1)
190885
+	ret
190885
+
190885
+/* end of ifndef USE_AS_STRCAT */
190885
+#  endif
190885
+
190885
+	.p2align 4
190885
+L(UnalignedLeaveCase2OrCase3):
190885
+	test	%rdx, %rdx
190885
+	jnz	L(UnalignedFourVecSizeLeaveCase2)
190885
+L(UnalignedFourVecSizeLeaveCase3):
190885
+	lea	(VEC_SIZE * 4)(%r8), %rcx
190885
+	and	$-VEC_SIZE, %rcx
190885
+	add	$(VEC_SIZE * 3), %r8
190885
+	jl	L(CopyVecSizeCase3)
190885
+	VMOVU	%YMM4, (%rdi)
190885
+	sub	$VEC_SIZE, %r8
190885
+	jb	L(CopyVecSizeCase3)
190885
+	VMOVU	%YMM5, VEC_SIZE(%rdi)
190885
+	sub	$VEC_SIZE, %r8
190885
+	jb	L(CopyVecSizeCase3)
190885
+	VMOVU	%YMM6, (VEC_SIZE * 2)(%rdi)
190885
+	sub	$VEC_SIZE, %r8
190885
+	jb	L(CopyVecSizeCase3)
190885
+	VMOVU	%YMM7, (VEC_SIZE * 3)(%rdi)
190885
+#  ifdef USE_AS_STPCPY
190885
+	lea	(VEC_SIZE * 4)(%rdi), %rax
190885
+#  endif
190885
+#  ifdef USE_AS_STRCAT
190885
+	movb	$0, (VEC_SIZE * 4)(%rdi)
190885
+#  endif
190885
+	ret
190885
+
190885
+	.p2align 4
190885
+L(UnalignedFourVecSizeLeaveCase2):
190885
+	xor	%ecx, %ecx
190885
+	vpcmpb	$0, %YMM4, %YMMZERO, %k1
190885
+	kmovd	%k1, %edx
190885
+	add	$(VEC_SIZE * 3), %r8
190885
+	jle	L(CopyVecSizeCase2OrCase3)
190885
+	test	%edx, %edx
190885
+#  ifndef USE_AS_STRCAT
190885
+	jnz	L(CopyVecSizeUnalignedVec4)
190885
+#  else
190885
+	jnz	L(CopyVecSize)
190885
+#  endif
190885
+	vpcmpb	$0, %YMM5, %YMMZERO, %k2
190885
+	kmovd	%k2, %edx
190885
+	VMOVU	%YMM4, (%rdi)
190885
+	add	$VEC_SIZE, %rcx
190885
+	sub	$VEC_SIZE, %r8
190885
+	jbe	L(CopyVecSizeCase2OrCase3)
190885
+	test	%edx, %edx
190885
+#  ifndef USE_AS_STRCAT
190885
+	jnz	L(CopyVecSizeUnalignedVec5)
190885
+#  else
190885
+	jnz	L(CopyVecSize)
190885
+#  endif
190885
+
190885
+	vpcmpb	$0, %YMM6, %YMMZERO, %k3
190885
+	kmovd	%k3, %edx
190885
+	VMOVU	%YMM5, VEC_SIZE(%rdi)
190885
+	add	$VEC_SIZE, %rcx
190885
+	sub	$VEC_SIZE, %r8
190885
+	jbe	L(CopyVecSizeCase2OrCase3)
190885
+	test	%edx, %edx
190885
+#  ifndef USE_AS_STRCAT
190885
+	jnz	L(CopyVecSizeUnalignedVec6)
190885
+#  else
190885
+	jnz	L(CopyVecSize)
190885
+#  endif
190885
+
190885
+	vpcmpb	$0, %YMM7, %YMMZERO, %k4
190885
+	kmovd	%k4, %edx
190885
+	VMOVU	%YMM6, (VEC_SIZE * 2)(%rdi)
190885
+	lea	VEC_SIZE(%rdi, %rcx), %rdi
190885
+	lea	VEC_SIZE(%rsi, %rcx), %rsi
190885
+	bsf	%edx, %edx
190885
+	cmp	%r8d, %edx
190885
+	jb	L(CopyVecSizeExit)
190885
+L(StrncpyExit):
190885
+	cmp	$65, %r8d
190885
+	je	L(StrncpyExit65)
190885
+	cmp	$33, %r8d
190885
+	jae	L(StrncpyExit33_64)
190885
+	cmp	$17, %r8d
190885
+	jae	L(StrncpyExit17_32)
190885
+	cmp	$9, %r8d
190885
+	jae	L(StrncpyExit9_16)
190885
+	cmp	$5, %r8d
190885
+	jae	L(StrncpyExit5_8)
190885
+	cmp	$3, %r8d
190885
+	jae	L(StrncpyExit3_4)
190885
+	cmp	$1, %r8d
190885
+	ja	L(StrncpyExit2)
190885
+	je	L(StrncpyExit1)
190885
+#  ifdef USE_AS_STPCPY
190885
+	mov	%rdi, %rax
190885
+#  endif
190885
+#  ifdef USE_AS_STRCAT
190885
+	movb	$0, (%rdi)
190885
+#  endif
190885
+	ret
190885
+
190885
+	.p2align 4
190885
+L(ExitZero):
190885
+#  ifndef USE_AS_STRCAT
190885
+	mov	%rdi, %rax
190885
+#  endif
190885
+	ret
190885
+
190885
+# endif
190885
+
190885
+# ifndef USE_AS_STRCAT
190885
+END (STRCPY)
190885
+# else
190885
+END (STRCAT)
190885
+# endif
190885
+#endif
190885
diff --git a/sysdeps/x86_64/multiarch/strncat-evex.S b/sysdeps/x86_64/multiarch/strncat-evex.S
new file mode 100644
index 00000000..8884f023
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strncat-evex.S
@@ -0,0 +1,3 @@
+#define USE_AS_STRNCAT
+#define STRCAT __strncat_evex
+#include "strcat-evex.S"
diff --git a/sysdeps/x86_64/multiarch/strncpy-evex.S b/sysdeps/x86_64/multiarch/strncpy-evex.S
new file mode 100644
index 00000000..40e391f0
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strncpy-evex.S
@@ -0,0 +1,3 @@
+#define USE_AS_STRNCPY
+#define STRCPY __strncpy_evex
+#include "strcpy-evex.S"
-- 
GitLab