commit 1a153e47fcc9401d8ea424ad86569a57ed0f8c52
Author: Leonardo Sandoval <leonardo.sandoval.gonzalez@linux.intel.com>
Date:   Mon Oct 8 08:59:50 2018 -0500

    x86-64: Optimize strcat/strncat, strcpy/strncpy and stpcpy/stpncpy with AVX2
    
    Optimize x86-64 strcat/strncat, strcpy/strncpy and stpcpy/stpncpy with
    AVX2.  The implementations use vector comparison as much as possible.
    In general, the larger the source string, the greater the performance
    gain observed, reaching speedups of up to 1.6x compared to the SSE2
    unaligned routines.  Select the AVX2 strcat/strncat, strcpy/strncpy
    and stpcpy/stpncpy implementations on AVX2 machines where vzeroupper
    is preferred and AVX unaligned load is fast.
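    The key primitive shared by all six routines is the AVX2 null-byte
    scan: compare a 32-byte vector against zero with vpcmpeqb, collapse
    the result into a bitmask with vpmovmskb, and locate the terminator
    with a bit scan.  As a rough illustration only (not part of this
    patch; the helper name is invented for the sketch), the same idea in
    C intrinsics:

        #include <immintrin.h>
        #include <stddef.h>

        /* Return the offset of the first NUL byte in the 32-byte-aligned
           32-byte block at S, or 32 if the block contains none.  */
        static inline size_t
        first_nul_in_vec (const char *s)
        {
          __m256i vec = _mm256_load_si256 ((const __m256i *) s);
          __m256i cmp = _mm256_cmpeq_epi8 (vec, _mm256_setzero_si256 ());
          unsigned int mask = (unsigned int) _mm256_movemask_epi8 (cmp);
          return mask == 0 ? 32 : (size_t) __builtin_ctz (mask);
        }

    The assembly goes further and unrolls this scan four vectors at a
    time, folding the loads with vpminub so that one vpcmpeqb/vpmovmskb
    pair checks 128 bytes per iteration (see L(align_four_vec_loop) in
    strcat-avx2.S and L(UnalignedFourVecSizeLoop) in strcpy-avx2.S).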
    
            * sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Add
            strcat-avx2, strncat-avx2, strcpy-avx2, strncpy-avx2,
            stpcpy-avx2 and stpncpy-avx2.
            * sysdeps/x86_64/multiarch/ifunc-impl-list.c
            (__libc_ifunc_impl_list): Add tests for __strcat_avx2,
            __strncat_avx2, __strcpy_avx2, __strncpy_avx2, __stpcpy_avx2
            and __stpncpy_avx2.
            * sysdeps/x86_64/multiarch/{ifunc-unaligned-ssse3.h =>
            ifunc-strcpy.h}: Rename header to a more generic name.
            * sysdeps/x86_64/multiarch/ifunc-strcpy.h
            (IFUNC_SELECTOR): Return OPTIMIZE (avx2) on AVX2 machines if
            AVX unaligned load is fast and vzeroupper is preferred.
            * sysdeps/x86_64/multiarch/stpcpy-avx2.S: New file.
            * sysdeps/x86_64/multiarch/stpncpy-avx2.S: Likewise.
            * sysdeps/x86_64/multiarch/strcat-avx2.S: Likewise.
            * sysdeps/x86_64/multiarch/strcpy-avx2.S: Likewise.
            * sysdeps/x86_64/multiarch/strncat-avx2.S: Likewise.
            * sysdeps/x86_64/multiarch/strncpy-avx2.S: Likewise.
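    A note for readers of strcpy-avx2.S: a sizable part of that file (the
    L(StrncpyFill*) and L(StrncpyExit*) paths) exists because strncpy and
    stpncpy must NUL-pad the destination out to exactly n bytes once the
    source terminator has been reached.  A scalar model of that contract,
    for illustration only (the function name is invented for this sketch):

        #include <stddef.h>

        /* Model of the strncpy contract that the AVX2 code implements:
           copy at most N bytes of SRC, then zero-fill the rest of the
           N-byte destination window.  */
        static char *
        model_strncpy (char *dst, const char *src, size_t n)
        {
          size_t i = 0;
          for (; i < n && src[i] != '\0'; i++)
            dst[i] = src[i];
          for (; i < n; i++)  /* the vectorized L(StrncpyFill*) tail */
            dst[i] = '\0';
          return dst;
        }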
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index bb5e97073520ee51..395e432c092ca17c 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -24,11 +24,14 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c \
 		   strchr-sse2 strchrnul-sse2 strchr-avx2 strchrnul-avx2 \
 		   strrchr-sse2 strrchr-avx2 \
 		   strlen-sse2 strnlen-sse2 strlen-avx2 strnlen-avx2 \
+		   strcat-avx2 strncat-avx2 \
 		   strcat-ssse3 strncat-ssse3\
+		   strcpy-avx2 strncpy-avx2 \
 		   strcpy-sse2 stpcpy-sse2 \
 		   strcpy-ssse3 strncpy-ssse3 stpcpy-ssse3 stpncpy-ssse3 \
 		   strcpy-sse2-unaligned strncpy-sse2-unaligned \
 		   stpcpy-sse2-unaligned stpncpy-sse2-unaligned \
+		   stpcpy-avx2 stpncpy-avx2 \
 		   strcat-sse2 \
 		   strcat-sse2-unaligned strncat-sse2-unaligned \
 		   strchr-sse2-no-bsf memcmp-ssse3 strstr-sse2-unaligned \
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 9aaaef7251b8edfe..8b55bb6954000cc2 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -199,6 +199,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
   IFUNC_IMPL (i, name, stpncpy,
 	      IFUNC_IMPL_ADD (array, i, stpncpy, HAS_CPU_FEATURE (SSSE3),
 			      __stpncpy_ssse3)
+	      IFUNC_IMPL_ADD (array, i, stpncpy, HAS_ARCH_FEATURE (AVX2_Usable),
+			      __stpncpy_avx2)
 	      IFUNC_IMPL_ADD (array, i, stpncpy, 1,
 			      __stpncpy_sse2_unaligned)
 	      IFUNC_IMPL_ADD (array, i, stpncpy, 1, __stpncpy_sse2))
@@ -207,6 +209,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
   IFUNC_IMPL (i, name, stpcpy,
 	      IFUNC_IMPL_ADD (array, i, stpcpy, HAS_CPU_FEATURE (SSSE3),
 			      __stpcpy_ssse3)
+	      IFUNC_IMPL_ADD (array, i, stpcpy, HAS_ARCH_FEATURE (AVX2_Usable),
+			      __stpcpy_avx2)
 	      IFUNC_IMPL_ADD (array, i, stpcpy, 1, __stpcpy_sse2_unaligned)
 	      IFUNC_IMPL_ADD (array, i, stpcpy, 1, __stpcpy_sse2))
 
@@ -239,6 +243,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/x86_64/multiarch/strcat.c.  */
   IFUNC_IMPL (i, name, strcat,
+	      IFUNC_IMPL_ADD (array, i, strcat, HAS_ARCH_FEATURE (AVX2_Usable),
+			      __strcat_avx2)
 	      IFUNC_IMPL_ADD (array, i, strcat, HAS_CPU_FEATURE (SSSE3),
 			      __strcat_ssse3)
 	      IFUNC_IMPL_ADD (array, i, strcat, 1, __strcat_sse2_unaligned)
@@ -280,6 +286,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/x86_64/multiarch/strcpy.c.  */
   IFUNC_IMPL (i, name, strcpy,
+	      IFUNC_IMPL_ADD (array, i, strcpy, HAS_ARCH_FEATURE (AVX2_Usable),
+			      __strcpy_avx2)
 	      IFUNC_IMPL_ADD (array, i, strcpy, HAS_CPU_FEATURE (SSSE3),
 			      __strcpy_ssse3)
 	      IFUNC_IMPL_ADD (array, i, strcpy, 1, __strcpy_sse2_unaligned)
@@ -321,6 +329,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/x86_64/multiarch/strncat.c.  */
   IFUNC_IMPL (i, name, strncat,
+	      IFUNC_IMPL_ADD (array, i, strncat, HAS_ARCH_FEATURE (AVX2_Usable),
+			      __strncat_avx2)
 	      IFUNC_IMPL_ADD (array, i, strncat, HAS_CPU_FEATURE (SSSE3),
 			      __strncat_ssse3)
 	      IFUNC_IMPL_ADD (array, i, strncat, 1,
@@ -329,6 +339,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/x86_64/multiarch/strncpy.c.  */
   IFUNC_IMPL (i, name, strncpy,
+	      IFUNC_IMPL_ADD (array, i, strncpy, HAS_ARCH_FEATURE (AVX2_Usable),
+			      __strncpy_avx2)
 	      IFUNC_IMPL_ADD (array, i, strncpy, HAS_CPU_FEATURE (SSSE3),
 			      __strncpy_ssse3)
 	      IFUNC_IMPL_ADD (array, i, strncpy, 1,
diff --git a/sysdeps/x86_64/multiarch/ifunc-unaligned-ssse3.h b/sysdeps/x86_64/multiarch/ifunc-strcpy.h
similarity index 83%
rename from sysdeps/x86_64/multiarch/ifunc-unaligned-ssse3.h
rename to sysdeps/x86_64/multiarch/ifunc-strcpy.h
index 81805f9832345923..4f2286fefccda069 100644
--- a/sysdeps/x86_64/multiarch/ifunc-unaligned-ssse3.h
+++ b/sysdeps/x86_64/multiarch/ifunc-strcpy.h
@@ -24,12 +24,18 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned)
   attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
 
 static inline void *
 IFUNC_SELECTOR (void)
 {
   const struct cpu_features* cpu_features = __get_cpu_features ();
 
+  if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)
+      && CPU_FEATURES_ARCH_P (cpu_features, AVX2_Usable)
+      && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
+    return OPTIMIZE (avx2);
+
   if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Load))
     return OPTIMIZE (sse2_unaligned);
 
diff --git a/sysdeps/x86_64/multiarch/stpcpy-avx2.S b/sysdeps/x86_64/multiarch/stpcpy-avx2.S
new file mode 100644
index 0000000000000000..f0bd3029fe3047ed
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/stpcpy-avx2.S
@@ -0,0 +1,3 @@
+#define USE_AS_STPCPY
+#define STRCPY __stpcpy_avx2
+#include "strcpy-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/stpcpy.c b/sysdeps/x86_64/multiarch/stpcpy.c
index 1e340fca991a021c..8ffd13b48c83ca8e 100644
--- a/sysdeps/x86_64/multiarch/stpcpy.c
+++ b/sysdeps/x86_64/multiarch/stpcpy.c
@@ -28,7 +28,7 @@
 # undef __stpcpy
 
 # define SYMBOL_NAME stpcpy
-# include "ifunc-unaligned-ssse3.h"
+# include "ifunc-strcpy.h"
 
 libc_ifunc_redirected (__redirect_stpcpy, __stpcpy, IFUNC_SELECTOR ());
 
diff --git a/sysdeps/x86_64/multiarch/stpncpy-avx2.S b/sysdeps/x86_64/multiarch/stpncpy-avx2.S
new file mode 100644
index 0000000000000000..032b0407d08c6a9d
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/stpncpy-avx2.S
@@ -0,0 +1,4 @@
+#define USE_AS_STPCPY
+#define USE_AS_STRNCPY
+#define STRCPY __stpncpy_avx2
+#include "strcpy-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/stpncpy.c b/sysdeps/x86_64/multiarch/stpncpy.c
index 28842ece2b0998e3..f3e203f78cca2e61 100644
--- a/sysdeps/x86_64/multiarch/stpncpy.c
+++ b/sysdeps/x86_64/multiarch/stpncpy.c
@@ -26,7 +26,7 @@
 # undef __stpncpy
 
 # define SYMBOL_NAME stpncpy
-# include "ifunc-unaligned-ssse3.h"
+# include "ifunc-strcpy.h"
 
 libc_ifunc_redirected (__redirect_stpncpy, __stpncpy, IFUNC_SELECTOR ());
 
diff --git a/sysdeps/x86_64/multiarch/strcat-avx2.S b/sysdeps/x86_64/multiarch/strcat-avx2.S
b1dca6
new file mode 100644
b1dca6
index 0000000000000000..b062356427677ca6
b1dca6
--- /dev/null
b1dca6
+++ b/sysdeps/x86_64/multiarch/strcat-avx2.S
b1dca6
@@ -0,0 +1,275 @@
b1dca6
+/* strcat with AVX2
b1dca6
+   Copyright (C) 2011-2018 Free Software Foundation, Inc.
b1dca6
+   Contributed by Intel Corporation.
b1dca6
+   This file is part of the GNU C Library.
b1dca6
+
b1dca6
+   The GNU C Library is free software; you can redistribute it and/or
b1dca6
+   modify it under the terms of the GNU Lesser General Public
b1dca6
+   License as published by the Free Software Foundation; either
b1dca6
+   version 2.1 of the License, or (at your option) any later version.
b1dca6
+
b1dca6
+   The GNU C Library is distributed in the hope that it will be useful,
b1dca6
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
b1dca6
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
b1dca6
+   Lesser General Public License for more details.
b1dca6
+
b1dca6
+   You should have received a copy of the GNU Lesser General Public
b1dca6
+   License along with the GNU C Library; if not, see
b1dca6
+   <http://www.gnu.org/licenses/>.  */
b1dca6
+
b1dca6
+#if IS_IN (libc)
b1dca6
+
b1dca6
+# include <sysdep.h>
b1dca6
+
b1dca6
+# ifndef STRCAT
b1dca6
+#  define STRCAT  __strcat_avx2
b1dca6
+# endif
b1dca6
+
b1dca6
+# define USE_AS_STRCAT
b1dca6
+
b1dca6
+/* Number of bytes in a vector register */
b1dca6
+# define VEC_SIZE	32
b1dca6
+
b1dca6
+	.section .text.avx,"ax",@progbits
b1dca6
+ENTRY (STRCAT)
b1dca6
+	mov	%rdi, %r9
b1dca6
+# ifdef USE_AS_STRNCAT
b1dca6
+	mov	%rdx, %r8
b1dca6
+# endif
b1dca6
+
b1dca6
+	xor	%eax, %eax
b1dca6
+	mov	%edi, %ecx
b1dca6
+	and	$((VEC_SIZE * 4) - 1), %ecx
b1dca6
+	vpxor	%xmm6, %xmm6, %xmm6
b1dca6
+	cmp	$(VEC_SIZE * 3), %ecx
b1dca6
+	ja	L(fourth_vector_boundary)
b1dca6
+	vpcmpeqb (%rdi), %ymm6, %ymm0
b1dca6
+	vpmovmskb %ymm0, %edx
b1dca6
+	test	%edx, %edx
b1dca6
+	jnz	L(exit_null_on_first_vector)
b1dca6
+	mov	%rdi, %rax
b1dca6
+	and	$-VEC_SIZE, %rax
b1dca6
+	jmp	L(align_vec_size_start)
b1dca6
+L(fourth_vector_boundary):
b1dca6
+	mov	%rdi, %rax
b1dca6
+	and	$-VEC_SIZE, %rax
b1dca6
+	vpcmpeqb	(%rax), %ymm6, %ymm0
b1dca6
+	mov	$-1, %r10d
b1dca6
+	sub	%rax, %rcx
b1dca6
+	shl	%cl, %r10d
b1dca6
+	vpmovmskb %ymm0, %edx
b1dca6
+	and	%r10d, %edx
b1dca6
+	jnz	L(exit)
b1dca6
+
b1dca6
+L(align_vec_size_start):
b1dca6
+	vpcmpeqb VEC_SIZE(%rax), %ymm6, %ymm0
b1dca6
+	vpmovmskb %ymm0, %edx
b1dca6
+	test	%edx, %edx
b1dca6
+	jnz	L(exit_null_on_second_vector)
b1dca6
+
b1dca6
+	vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1
b1dca6
+	vpmovmskb %ymm1, %edx
b1dca6
+	test	%edx, %edx
b1dca6
+	jnz	L(exit_null_on_third_vector)
b1dca6
+
b1dca6
+	vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2
b1dca6
+	vpmovmskb %ymm2, %edx
b1dca6
+	test	%edx, %edx
b1dca6
+	jnz	L(exit_null_on_fourth_vector)
b1dca6
+
b1dca6
+	vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3
b1dca6
+	vpmovmskb %ymm3, %edx
b1dca6
+	test	%edx, %edx
b1dca6
+	jnz	L(exit_null_on_fifth_vector)
b1dca6
+
b1dca6
+	vpcmpeqb (VEC_SIZE * 5)(%rax), %ymm6, %ymm0
b1dca6
+	add	$(VEC_SIZE * 4), %rax
b1dca6
+	vpmovmskb %ymm0, %edx
b1dca6
+	test	%edx, %edx
b1dca6
+	jnz	L(exit_null_on_second_vector)
b1dca6
+
b1dca6
+	vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1
b1dca6
+	vpmovmskb %ymm1, %edx
b1dca6
+	test	%edx, %edx
b1dca6
+	jnz	L(exit_null_on_third_vector)
b1dca6
+
b1dca6
+	vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2
b1dca6
+	vpmovmskb %ymm2, %edx
b1dca6
+	test	%edx, %edx
b1dca6
+	jnz	L(exit_null_on_fourth_vector)
b1dca6
+
b1dca6
+	vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3
b1dca6
+	vpmovmskb %ymm3, %edx
b1dca6
+	test	%edx, %edx
b1dca6
+	jnz	L(exit_null_on_fifth_vector)
b1dca6
+
b1dca6
+	vpcmpeqb (VEC_SIZE * 5)(%rax), %ymm6, %ymm0
b1dca6
+	add	$(VEC_SIZE * 4), %rax
b1dca6
+	vpmovmskb %ymm0, %edx
b1dca6
+	test	%edx, %edx
b1dca6
+	jnz	L(exit_null_on_second_vector)
b1dca6
+
b1dca6
+	vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1
b1dca6
+	vpmovmskb %ymm1, %edx
b1dca6
+	test	%edx, %edx
b1dca6
+	jnz	L(exit_null_on_third_vector)
b1dca6
+
b1dca6
+	vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2
b1dca6
+	vpmovmskb %ymm2, %edx
b1dca6
+	test	%edx, %edx
b1dca6
+	jnz	L(exit_null_on_fourth_vector)
b1dca6
+
b1dca6
+	vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3
b1dca6
+	vpmovmskb %ymm3, %edx
b1dca6
+	test	%edx, %edx
b1dca6
+	jnz	L(exit_null_on_fifth_vector)
b1dca6
+
b1dca6
+	vpcmpeqb (VEC_SIZE * 5)(%rax), %ymm6, %ymm0
b1dca6
+	add	$(VEC_SIZE * 4), %rax
b1dca6
+	vpmovmskb %ymm0, %edx
b1dca6
+	test	%edx, %edx
b1dca6
+	jnz	L(exit_null_on_second_vector)
b1dca6
+
b1dca6
+	vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1
b1dca6
+	vpmovmskb %ymm1, %edx
b1dca6
+	test	%edx, %edx
b1dca6
+	jnz	L(exit_null_on_third_vector)
b1dca6
+
b1dca6
+	vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2
b1dca6
+	vpmovmskb %ymm2, %edx
b1dca6
+	test	%edx, %edx
b1dca6
+	jnz	L(exit_null_on_fourth_vector)
b1dca6
+
b1dca6
+	vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3
b1dca6
+	vpmovmskb %ymm3, %edx
b1dca6
+	test	%edx, %edx
b1dca6
+	jnz	L(exit_null_on_fifth_vector)
b1dca6
+
b1dca6
+	test	$((VEC_SIZE * 4) - 1), %rax
b1dca6
+	jz	L(align_four_vec_loop)
b1dca6
+
b1dca6
+	vpcmpeqb (VEC_SIZE * 5)(%rax), %ymm6, %ymm0
b1dca6
+	add	$(VEC_SIZE * 5), %rax
b1dca6
+	vpmovmskb %ymm0, %edx
b1dca6
+	test	%edx, %edx
b1dca6
+	jnz	L(exit)
b1dca6
+
b1dca6
+	test	$((VEC_SIZE * 4) - 1), %rax
b1dca6
+	jz	L(align_four_vec_loop)
b1dca6
+
b1dca6
+	vpcmpeqb VEC_SIZE(%rax), %ymm6, %ymm1
b1dca6
+	add	$VEC_SIZE, %rax
b1dca6
+	vpmovmskb %ymm1, %edx
b1dca6
+	test	%edx, %edx
b1dca6
+	jnz	L(exit)
b1dca6
+
b1dca6
+	test	$((VEC_SIZE * 4) - 1), %rax
b1dca6
+	jz	L(align_four_vec_loop)
b1dca6
+
b1dca6
+	vpcmpeqb VEC_SIZE(%rax), %ymm6, %ymm2
b1dca6
+	add	$VEC_SIZE, %rax
b1dca6
+	vpmovmskb %ymm2, %edx
b1dca6
+	test	%edx, %edx
b1dca6
+	jnz	L(exit)
b1dca6
+
b1dca6
+	test	$((VEC_SIZE * 4) - 1), %rax
b1dca6
+	jz	L(align_four_vec_loop)
b1dca6
+
b1dca6
+	vpcmpeqb VEC_SIZE(%rax), %ymm6, %ymm3
b1dca6
+	add	$VEC_SIZE, %rax
b1dca6
+	vpmovmskb %ymm3, %edx
b1dca6
+	test	%edx, %edx
b1dca6
+	jnz	L(exit)
b1dca6
+
b1dca6
+	add	$VEC_SIZE, %rax
b1dca6
+
b1dca6
+	.p2align 4
b1dca6
+L(align_four_vec_loop):
b1dca6
+	vmovaps	(%rax),	%ymm4
b1dca6
+	vpminub	VEC_SIZE(%rax),	%ymm4, %ymm4
b1dca6
+	vmovaps	(VEC_SIZE * 2)(%rax),	%ymm5
b1dca6
+	vpminub	(VEC_SIZE * 3)(%rax),	%ymm5, %ymm5
b1dca6
+	add	$(VEC_SIZE * 4),	%rax
b1dca6
+	vpminub	%ymm4,	%ymm5, %ymm5
b1dca6
+	vpcmpeqb %ymm5,	%ymm6, %ymm5
b1dca6
+	vpmovmskb %ymm5,	%edx
b1dca6
+	test	%edx,	%edx
b1dca6
+	jz	L(align_four_vec_loop)
b1dca6
+
b1dca6
+	vpcmpeqb -(VEC_SIZE * 4)(%rax), %ymm6, %ymm0
b1dca6
+	sub	$(VEC_SIZE * 5),	%rax
b1dca6
+	vpmovmskb %ymm0, %edx
b1dca6
+	test	%edx, %edx
b1dca6
+	jnz	L(exit_null_on_second_vector)
b1dca6
+
b1dca6
+	vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1
b1dca6
+	vpmovmskb %ymm1, %edx
b1dca6
+	test	%edx, %edx
b1dca6
+	jnz	L(exit_null_on_third_vector)
b1dca6
+
b1dca6
+	vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2
b1dca6
+	vpmovmskb %ymm2, %edx
b1dca6
+	test	%edx, %edx
b1dca6
+	jnz	L(exit_null_on_fourth_vector)
b1dca6
+
b1dca6
+	vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3
b1dca6
+	vpmovmskb %ymm3, %edx
b1dca6
+	sub	%rdi, %rax
b1dca6
+	bsf	%rdx, %rdx
b1dca6
+	add	%rdx, %rax
b1dca6
+	add	$(VEC_SIZE * 4), %rax
b1dca6
+	jmp	L(StartStrcpyPart)
b1dca6
+
b1dca6
+	.p2align 4
b1dca6
+L(exit):
b1dca6
+	sub	%rdi, %rax
b1dca6
+L(exit_null_on_first_vector):
b1dca6
+	bsf	%rdx, %rdx
b1dca6
+	add	%rdx, %rax
b1dca6
+	jmp	L(StartStrcpyPart)
b1dca6
+
b1dca6
+	.p2align 4
b1dca6
+L(exit_null_on_second_vector):
b1dca6
+	sub	%rdi, %rax
b1dca6
+	bsf	%rdx, %rdx
b1dca6
+	add	%rdx, %rax
b1dca6
+	add	$VEC_SIZE, %rax
b1dca6
+	jmp	L(StartStrcpyPart)
b1dca6
+
b1dca6
+	.p2align 4
b1dca6
+L(exit_null_on_third_vector):
b1dca6
+	sub	%rdi, %rax
b1dca6
+	bsf	%rdx, %rdx
b1dca6
+	add	%rdx, %rax
b1dca6
+	add	$(VEC_SIZE * 2), %rax
b1dca6
+	jmp	L(StartStrcpyPart)
b1dca6
+
b1dca6
+	.p2align 4
b1dca6
+L(exit_null_on_fourth_vector):
b1dca6
+	sub	%rdi, %rax
b1dca6
+	bsf	%rdx, %rdx
b1dca6
+	add	%rdx, %rax
b1dca6
+	add	$(VEC_SIZE * 3), %rax
b1dca6
+	jmp	L(StartStrcpyPart)
b1dca6
+
b1dca6
+	.p2align 4
b1dca6
+L(exit_null_on_fifth_vector):
b1dca6
+	sub	%rdi, %rax
b1dca6
+	bsf	%rdx, %rdx
b1dca6
+	add	%rdx, %rax
b1dca6
+	add	$(VEC_SIZE * 4), %rax
b1dca6
+
b1dca6
+	.p2align 4
b1dca6
+L(StartStrcpyPart):
b1dca6
+	lea	(%r9, %rax), %rdi
b1dca6
+	mov	%rsi, %rcx
b1dca6
+	mov	%r9, %rax      /* save result */
b1dca6
+
b1dca6
+# ifdef USE_AS_STRNCAT
b1dca6
+	test	%r8, %r8
b1dca6
+	jz	L(ExitZero)
b1dca6
+#  define USE_AS_STRNCPY
b1dca6
+# endif
b1dca6
+
b1dca6
+# include "strcpy-avx2.S"
b1dca6
+#endif
b1dca6
diff --git a/sysdeps/x86_64/multiarch/strcat.c b/sysdeps/x86_64/multiarch/strcat.c
index 1f7f6263f35ba402..694b9b2405827bd4 100644
--- a/sysdeps/x86_64/multiarch/strcat.c
+++ b/sysdeps/x86_64/multiarch/strcat.c
@@ -24,7 +24,7 @@
 # undef strcat
 
 # define SYMBOL_NAME strcat
-# include "ifunc-unaligned-ssse3.h"
+# include "ifunc-strcpy.h"
 
 libc_ifunc_redirected (__redirect_strcat, strcat, IFUNC_SELECTOR ());
 
diff --git a/sysdeps/x86_64/multiarch/strcpy-avx2.S b/sysdeps/x86_64/multiarch/strcpy-avx2.S
b1dca6
new file mode 100644
b1dca6
index 0000000000000000..81677f9060773a49
b1dca6
--- /dev/null
b1dca6
+++ b/sysdeps/x86_64/multiarch/strcpy-avx2.S
b1dca6
@@ -0,0 +1,1022 @@
b1dca6
+/* strcpy with AVX2
b1dca6
+   Copyright (C) 2011-2018 Free Software Foundation, Inc.
b1dca6
+   Contributed by Intel Corporation.
b1dca6
+   This file is part of the GNU C Library.
b1dca6
+
b1dca6
+   The GNU C Library is free software; you can redistribute it and/or
b1dca6
+   modify it under the terms of the GNU Lesser General Public
b1dca6
+   License as published by the Free Software Foundation; either
b1dca6
+   version 2.1 of the License, or (at your option) any later version.
b1dca6
+
b1dca6
+   The GNU C Library is distributed in the hope that it will be useful,
b1dca6
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
b1dca6
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
b1dca6
+   Lesser General Public License for more details.
b1dca6
+
b1dca6
+   You should have received a copy of the GNU Lesser General Public
b1dca6
+   License along with the GNU C Library; if not, see
b1dca6
+   <http://www.gnu.org/licenses/>.  */
b1dca6
+
b1dca6
+#if IS_IN (libc)
b1dca6
+
b1dca6
+# ifndef USE_AS_STRCAT
b1dca6
+#  include <sysdep.h>
b1dca6
+
b1dca6
+#  ifndef STRCPY
b1dca6
+#   define STRCPY  __strcpy_avx2
b1dca6
+#  endif
b1dca6
+
b1dca6
+# endif
b1dca6
+
b1dca6
+/* Number of bytes in a vector register */
b1dca6
+# ifndef VEC_SIZE
b1dca6
+#  define VEC_SIZE	32
b1dca6
+# endif
b1dca6
+
b1dca6
+# ifndef VZEROUPPER
b1dca6
+#  define VZEROUPPER	vzeroupper
b1dca6
+# endif
b1dca6
+
b1dca6
+/* zero register */
b1dca6
+#define xmmZ	xmm0
b1dca6
+#define ymmZ	ymm0
b1dca6
+
b1dca6
+/* mask register */
b1dca6
+#define ymmM	ymm1
b1dca6
+
b1dca6
+# ifndef USE_AS_STRCAT
b1dca6
+
b1dca6
+	.section .text.avx,"ax",@progbits
b1dca6
+ENTRY (STRCPY)
b1dca6
+#  ifdef USE_AS_STRNCPY
b1dca6
+	mov	%rdx, %r8
b1dca6
+	test	%r8, %r8
b1dca6
+	jz	L(ExitZero)
b1dca6
+#  endif
b1dca6
+	mov	%rsi, %rcx
b1dca6
+#  ifndef USE_AS_STPCPY
b1dca6
+	mov	%rdi, %rax      /* save result */
b1dca6
+#  endif
b1dca6
+
b1dca6
+# endif
b1dca6
+
b1dca6
+	vpxor	%xmmZ, %xmmZ, %xmmZ
b1dca6
+
b1dca6
+	and	$((VEC_SIZE * 4) - 1), %ecx
b1dca6
+	cmp	$(VEC_SIZE * 2), %ecx
b1dca6
+	jbe	L(SourceStringAlignmentLessTwoVecSize)
b1dca6
+
b1dca6
+	and	$-VEC_SIZE, %rsi
b1dca6
+	and	$(VEC_SIZE - 1), %ecx
b1dca6
+
b1dca6
+	vpcmpeqb (%rsi), %ymmZ, %ymmM
b1dca6
+	vpmovmskb %ymmM, %edx
b1dca6
+	shr	%cl, %rdx
b1dca6
+
b1dca6
+# ifdef USE_AS_STRNCPY
b1dca6
+#  if defined USE_AS_STPCPY || defined USE_AS_STRCAT
b1dca6
+	mov	$VEC_SIZE, %r10
b1dca6
+	sub	%rcx, %r10
b1dca6
+	cmp	%r10, %r8
b1dca6
+#  else
b1dca6
+	mov	$(VEC_SIZE + 1), %r10
b1dca6
+	sub	%rcx, %r10
b1dca6
+	cmp	%r10, %r8
b1dca6
+#  endif
b1dca6
+	jbe	L(CopyVecSizeTailCase2OrCase3)
b1dca6
+# endif
b1dca6
+	test	%edx, %edx
b1dca6
+	jnz	L(CopyVecSizeTail)
b1dca6
+
b1dca6
+	vpcmpeqb VEC_SIZE(%rsi), %ymmZ, %ymm2
b1dca6
+	vpmovmskb %ymm2, %edx
b1dca6
+
b1dca6
+# ifdef USE_AS_STRNCPY
b1dca6
+	add	$VEC_SIZE, %r10
b1dca6
+	cmp	%r10, %r8
b1dca6
+	jbe	L(CopyTwoVecSizeCase2OrCase3)
b1dca6
+# endif
b1dca6
+	test	%edx, %edx
b1dca6
+	jnz	L(CopyTwoVecSize)
b1dca6
+
b1dca6
+	vmovdqu (%rsi, %rcx), %ymm2   /* copy VEC_SIZE bytes */
b1dca6
+	vmovdqu %ymm2, (%rdi)
b1dca6
+
b1dca6
+/* If source address alignment != destination address alignment */
b1dca6
+	.p2align 4
b1dca6
+L(UnalignVecSizeBoth):
b1dca6
+	sub	%rcx, %rdi
b1dca6
+# ifdef USE_AS_STRNCPY
b1dca6
+	add	%rcx, %r8
b1dca6
+	sbb	%rcx, %rcx
b1dca6
+	or	%rcx, %r8
b1dca6
+# endif
b1dca6
+	mov	$VEC_SIZE, %rcx
b1dca6
+	vmovdqa (%rsi, %rcx), %ymm2
b1dca6
+	vmovdqu %ymm2, (%rdi, %rcx)
b1dca6
+	vmovdqa VEC_SIZE(%rsi, %rcx), %ymm2
b1dca6
+	vpcmpeqb %ymm2, %ymmZ, %ymmM
b1dca6
+	vpmovmskb %ymmM, %edx
b1dca6
+	add	$VEC_SIZE, %rcx
b1dca6
+# ifdef USE_AS_STRNCPY
b1dca6
+	sub	$(VEC_SIZE * 3), %r8
b1dca6
+	jbe	L(CopyVecSizeCase2OrCase3)
b1dca6
+# endif
b1dca6
+	test	%edx, %edx
b1dca6
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
b1dca6
+	jnz	L(CopyVecSizeUnalignedVec2)
b1dca6
+# else
b1dca6
+	jnz	L(CopyVecSize)
b1dca6
+# endif
b1dca6
+
b1dca6
+	vmovdqu %ymm2, (%rdi, %rcx)
b1dca6
+	vmovdqa VEC_SIZE(%rsi, %rcx), %ymm3
b1dca6
+	vpcmpeqb %ymm3, %ymmZ, %ymmM
b1dca6
+	vpmovmskb %ymmM, %edx
b1dca6
+	add	$VEC_SIZE, %rcx
b1dca6
+# ifdef USE_AS_STRNCPY
b1dca6
+	sub	$VEC_SIZE, %r8
b1dca6
+	jbe	L(CopyVecSizeCase2OrCase3)
b1dca6
+# endif
b1dca6
+	test	%edx, %edx
b1dca6
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
b1dca6
+	jnz	L(CopyVecSizeUnalignedVec3)
b1dca6
+# else
b1dca6
+	jnz	L(CopyVecSize)
b1dca6
+# endif
b1dca6
+
b1dca6
+	vmovdqu %ymm3, (%rdi, %rcx)
b1dca6
+	vmovdqa VEC_SIZE(%rsi, %rcx), %ymm4
b1dca6
+	vpcmpeqb %ymm4, %ymmZ, %ymmM
b1dca6
+	vpmovmskb %ymmM, %edx
b1dca6
+	add	$VEC_SIZE, %rcx
b1dca6
+# ifdef USE_AS_STRNCPY
b1dca6
+	sub	$VEC_SIZE, %r8
b1dca6
+	jbe	L(CopyVecSizeCase2OrCase3)
b1dca6
+# endif
b1dca6
+	test	%edx, %edx
b1dca6
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
b1dca6
+	jnz	L(CopyVecSizeUnalignedVec4)
b1dca6
+# else
b1dca6
+	jnz	L(CopyVecSize)
b1dca6
+# endif
b1dca6
+
b1dca6
+	vmovdqu %ymm4, (%rdi, %rcx)
b1dca6
+	vmovdqa VEC_SIZE(%rsi, %rcx), %ymm2
b1dca6
+	vpcmpeqb %ymm2, %ymmZ, %ymmM
b1dca6
+	vpmovmskb %ymmM, %edx
b1dca6
+	add	$VEC_SIZE, %rcx
b1dca6
+# ifdef USE_AS_STRNCPY
b1dca6
+	sub	$VEC_SIZE, %r8
b1dca6
+	jbe	L(CopyVecSizeCase2OrCase3)
b1dca6
+# endif
b1dca6
+	test	%edx, %edx
b1dca6
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
b1dca6
+	jnz	L(CopyVecSizeUnalignedVec2)
b1dca6
+# else
b1dca6
+	jnz	L(CopyVecSize)
b1dca6
+# endif
b1dca6
+
b1dca6
+	vmovdqu %ymm2, (%rdi, %rcx)
b1dca6
+	vmovdqa VEC_SIZE(%rsi, %rcx), %ymm2
b1dca6
+	vpcmpeqb %ymm2, %ymmZ, %ymmM
b1dca6
+	vpmovmskb %ymmM, %edx
b1dca6
+	add	$VEC_SIZE, %rcx
b1dca6
+# ifdef USE_AS_STRNCPY
b1dca6
+	sub	$VEC_SIZE, %r8
b1dca6
+	jbe	L(CopyVecSizeCase2OrCase3)
b1dca6
+# endif
b1dca6
+	test	%edx, %edx
b1dca6
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
b1dca6
+	jnz	L(CopyVecSizeUnalignedVec2)
b1dca6
+# else
b1dca6
+	jnz	L(CopyVecSize)
b1dca6
+# endif
b1dca6
+
b1dca6
+	vmovdqa VEC_SIZE(%rsi, %rcx), %ymm3
b1dca6
+	vmovdqu %ymm2, (%rdi, %rcx)
b1dca6
+	vpcmpeqb %ymm3, %ymmZ, %ymmM
b1dca6
+	vpmovmskb %ymmM, %edx
b1dca6
+	add	$VEC_SIZE, %rcx
b1dca6
+# ifdef USE_AS_STRNCPY
b1dca6
+	sub	$VEC_SIZE, %r8
b1dca6
+	jbe	L(CopyVecSizeCase2OrCase3)
b1dca6
+# endif
b1dca6
+	test	%edx, %edx
b1dca6
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
b1dca6
+	jnz	L(CopyVecSizeUnalignedVec3)
b1dca6
+# else
b1dca6
+	jnz	L(CopyVecSize)
b1dca6
+# endif
b1dca6
+
b1dca6
+	vmovdqu %ymm3, (%rdi, %rcx)
b1dca6
+	mov	%rsi, %rdx
b1dca6
+	lea	VEC_SIZE(%rsi, %rcx), %rsi
b1dca6
+	and	$-(VEC_SIZE * 4), %rsi
b1dca6
+	sub	%rsi, %rdx
b1dca6
+	sub	%rdx, %rdi
b1dca6
+# ifdef USE_AS_STRNCPY
b1dca6
+	lea	(VEC_SIZE * 8)(%r8, %rdx), %r8
b1dca6
+# endif
b1dca6
+L(UnalignedFourVecSizeLoop):
b1dca6
+	vmovdqa (%rsi), %ymm4
b1dca6
+	vmovdqa VEC_SIZE(%rsi), %ymm5
b1dca6
+	vmovdqa (VEC_SIZE * 2)(%rsi), %ymm6
b1dca6
+	vmovdqa (VEC_SIZE * 3)(%rsi), %ymm7
b1dca6
+	vpminub %ymm5, %ymm4, %ymm2
b1dca6
+	vpminub %ymm7, %ymm6, %ymm3
b1dca6
+	vpminub %ymm2, %ymm3, %ymm3
b1dca6
+	vpcmpeqb %ymmM, %ymm3, %ymm3
b1dca6
+	vpmovmskb %ymm3, %edx
b1dca6
+# ifdef USE_AS_STRNCPY
b1dca6
+	sub	$(VEC_SIZE * 4), %r8
b1dca6
+	jbe	L(UnalignedLeaveCase2OrCase3)
b1dca6
+# endif
b1dca6
+	test	%edx, %edx
b1dca6
+	jnz	L(UnalignedFourVecSizeLeave)
b1dca6
+
b1dca6
+L(UnalignedFourVecSizeLoop_start):
b1dca6
+	add	$(VEC_SIZE * 4), %rdi
b1dca6
+	add	$(VEC_SIZE * 4), %rsi
b1dca6
+	vmovdqu %ymm4, -(VEC_SIZE * 4)(%rdi)
b1dca6
+	vmovdqa (%rsi), %ymm4
b1dca6
+	vmovdqu %ymm5, -(VEC_SIZE * 3)(%rdi)
b1dca6
+	vmovdqa VEC_SIZE(%rsi), %ymm5
b1dca6
+	vpminub %ymm5, %ymm4, %ymm2
b1dca6
+	vmovdqu %ymm6, -(VEC_SIZE * 2)(%rdi)
b1dca6
+	vmovdqa (VEC_SIZE * 2)(%rsi), %ymm6
b1dca6
+	vmovdqu %ymm7, -VEC_SIZE(%rdi)
b1dca6
+	vmovdqa (VEC_SIZE * 3)(%rsi), %ymm7
b1dca6
+	vpminub %ymm7, %ymm6, %ymm3
b1dca6
+	vpminub %ymm2, %ymm3, %ymm3
b1dca6
+	vpcmpeqb %ymmM, %ymm3, %ymm3
b1dca6
+	vpmovmskb %ymm3, %edx
b1dca6
+# ifdef USE_AS_STRNCPY
b1dca6
+	sub	$(VEC_SIZE * 4), %r8
b1dca6
+	jbe	L(UnalignedLeaveCase2OrCase3)
b1dca6
+# endif
b1dca6
+	test	%edx, %edx
b1dca6
+	jz	L(UnalignedFourVecSizeLoop_start)
b1dca6
+
b1dca6
+L(UnalignedFourVecSizeLeave):
b1dca6
+	vpcmpeqb %ymm4, %ymmZ, %ymmM
b1dca6
+	vpmovmskb %ymmM, %edx
b1dca6
+	test	%edx, %edx
b1dca6
+	jnz	L(CopyVecSizeUnaligned_0)
b1dca6
+
b1dca6
+	vpcmpeqb %ymm5, %ymmZ, %ymmM
b1dca6
+	vpmovmskb %ymmM, %ecx
b1dca6
+	test	%ecx, %ecx
b1dca6
+	jnz	L(CopyVecSizeUnaligned_16)
b1dca6
+
b1dca6
+	vpcmpeqb %ymm6, %ymmZ, %ymmM
b1dca6
+	vpmovmskb %ymmM, %edx
b1dca6
+	test	%edx, %edx
b1dca6
+	jnz	L(CopyVecSizeUnaligned_32)
b1dca6
+
b1dca6
+	vpcmpeqb %ymm7, %ymmZ, %ymmM
b1dca6
+	vpmovmskb %ymmM, %ecx
b1dca6
+	bsf	%ecx, %edx
b1dca6
+	vmovdqu %ymm4, (%rdi)
b1dca6
+	vmovdqu %ymm5, VEC_SIZE(%rdi)
b1dca6
+	vmovdqu %ymm6, (VEC_SIZE * 2)(%rdi)
b1dca6
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
b1dca6
+# ifdef USE_AS_STPCPY
b1dca6
+	lea	(VEC_SIZE * 3)(%rdi, %rdx), %rax
b1dca6
+# endif
b1dca6
+	vmovdqu %ymm7, (VEC_SIZE * 3)(%rdi)
b1dca6
+	add	$(VEC_SIZE - 1), %r8
b1dca6
+	sub	%rdx, %r8
b1dca6
+	lea	((VEC_SIZE * 3) + 1)(%rdi, %rdx), %rdi
b1dca6
+	jmp	L(StrncpyFillTailWithZero)
b1dca6
+# else
b1dca6
+	add	$(VEC_SIZE * 3), %rsi
b1dca6
+	add	$(VEC_SIZE * 3), %rdi
b1dca6
+	jmp	L(CopyVecSizeExit)
b1dca6
+# endif
b1dca6
+
b1dca6
+/* If source address alignment == destination address alignment */
b1dca6
+
b1dca6
+L(SourceStringAlignmentLessTwoVecSize):
b1dca6
+	vmovdqu (%rsi), %ymm3
b1dca6
+	vmovdqu VEC_SIZE(%rsi), %ymm2
b1dca6
+	vpcmpeqb %ymm3, %ymmZ, %ymmM
b1dca6
+	vpmovmskb %ymmM, %edx
b1dca6
+
b1dca6
+# ifdef USE_AS_STRNCPY
b1dca6
+#  if defined USE_AS_STPCPY || defined USE_AS_STRCAT
b1dca6
+	cmp	$VEC_SIZE, %r8
b1dca6
+#  else
b1dca6
+	cmp	$(VEC_SIZE + 1), %r8
b1dca6
+#  endif
b1dca6
+	jbe	L(CopyVecSizeTail1Case2OrCase3)
b1dca6
+# endif
b1dca6
+	test	%edx, %edx
b1dca6
+	jnz	L(CopyVecSizeTail1)
b1dca6
+
b1dca6
+	vmovdqu %ymm3, (%rdi)
b1dca6
+	vpcmpeqb %ymm2, %ymmZ, %ymmM
b1dca6
+	vpmovmskb %ymmM, %edx
b1dca6
+
b1dca6
+# ifdef USE_AS_STRNCPY
b1dca6
+#  if defined USE_AS_STPCPY || defined USE_AS_STRCAT
b1dca6
+	cmp	$(VEC_SIZE * 2), %r8
b1dca6
+#  else
b1dca6
+	cmp	$((VEC_SIZE * 2) + 1), %r8
b1dca6
+#  endif
b1dca6
+	jbe	L(CopyTwoVecSize1Case2OrCase3)
b1dca6
+# endif
b1dca6
+	test	%edx, %edx
b1dca6
+	jnz	L(CopyTwoVecSize1)
b1dca6
+
b1dca6
+	and	$-VEC_SIZE, %rsi
b1dca6
+	and	$(VEC_SIZE - 1), %ecx
b1dca6
+	jmp	L(UnalignVecSizeBoth)
b1dca6
+
b1dca6
+/*------End of main part with loops---------------------*/
b1dca6
+
b1dca6
+/* Case1 */
b1dca6
+
b1dca6
+# if (!defined USE_AS_STRNCPY) || (defined USE_AS_STRCAT)
b1dca6
+	.p2align 4
b1dca6
+L(CopyVecSize):
b1dca6
+	add	%rcx, %rdi
b1dca6
+# endif
b1dca6
+L(CopyVecSizeTail):
b1dca6
+	add	%rcx, %rsi
b1dca6
+L(CopyVecSizeTail1):
b1dca6
+	bsf	%edx, %edx
b1dca6
+L(CopyVecSizeExit):
b1dca6
+	cmp	$32, %edx
b1dca6
+	jae	L(Exit32_63)
b1dca6
+	cmp	$16, %edx
b1dca6
+	jae	L(Exit16_31)
b1dca6
+	cmp	$8, %edx
b1dca6
+	jae	L(Exit8_15)
b1dca6
+	cmp	$4, %edx
b1dca6
+	jae	L(Exit4_7)
b1dca6
+	cmp	$3, %edx
b1dca6
+	je	L(Exit3)
b1dca6
+	cmp	$1, %edx
b1dca6
+	ja	L(Exit2)
b1dca6
+	je	L(Exit1)
b1dca6
+	movb	$0, (%rdi)
b1dca6
+# ifdef USE_AS_STPCPY
b1dca6
+	lea	(%rdi), %rax
b1dca6
+# endif
b1dca6
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
b1dca6
+	sub	$1, %r8
b1dca6
+	lea	1(%rdi), %rdi
b1dca6
+	jnz	L(StrncpyFillTailWithZero)
b1dca6
+# endif
b1dca6
+	VZEROUPPER
b1dca6
+	ret
b1dca6
+
b1dca6
+	.p2align 4
b1dca6
+L(CopyTwoVecSize1):
b1dca6
+	add	$VEC_SIZE, %rsi
b1dca6
+	add	$VEC_SIZE, %rdi
b1dca6
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
b1dca6
+	sub	$VEC_SIZE, %r8
b1dca6
+# endif
b1dca6
+	jmp	L(CopyVecSizeTail1)
b1dca6
+
b1dca6
+	.p2align 4
b1dca6
+L(CopyTwoVecSize):
b1dca6
+	bsf	%edx, %edx
b1dca6
+	add	%rcx, %rsi
b1dca6
+	add	$VEC_SIZE, %edx
b1dca6
+	sub	%ecx, %edx
b1dca6
+	jmp	L(CopyVecSizeExit)
b1dca6
+
b1dca6
+	.p2align 4
b1dca6
+L(CopyVecSizeUnaligned_0):
b1dca6
+	bsf	%edx, %edx
b1dca6
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
b1dca6
+# ifdef USE_AS_STPCPY
b1dca6
+	lea	(%rdi, %rdx), %rax
b1dca6
+# endif
b1dca6
+	vmovdqu %ymm4, (%rdi)
b1dca6
+	add	$((VEC_SIZE * 4) - 1), %r8
b1dca6
+	sub	%rdx, %r8
b1dca6
+	lea	1(%rdi, %rdx), %rdi
b1dca6
+	jmp	L(StrncpyFillTailWithZero)
b1dca6
+# else
b1dca6
+	jmp	L(CopyVecSizeExit)
b1dca6
+# endif
b1dca6
+
b1dca6
+	.p2align 4
b1dca6
+L(CopyVecSizeUnaligned_16):
b1dca6
+	bsf	%ecx, %edx
b1dca6
+	vmovdqu %ymm4, (%rdi)
b1dca6
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
b1dca6
+# ifdef USE_AS_STPCPY
b1dca6
+	lea	VEC_SIZE(%rdi, %rdx), %rax
b1dca6
+# endif
b1dca6
+	vmovdqu %ymm5, VEC_SIZE(%rdi)
b1dca6
+	add	$((VEC_SIZE * 3) - 1), %r8
b1dca6
+	sub	%rdx, %r8
b1dca6
+	lea	(VEC_SIZE + 1)(%rdi, %rdx), %rdi
b1dca6
+	jmp	L(StrncpyFillTailWithZero)
b1dca6
+# else
b1dca6
+	add	$VEC_SIZE, %rsi
b1dca6
+	add	$VEC_SIZE, %rdi
b1dca6
+	jmp	L(CopyVecSizeExit)
b1dca6
+# endif
b1dca6
+
b1dca6
+	.p2align 4
b1dca6
+L(CopyVecSizeUnaligned_32):
b1dca6
+	bsf	%edx, %edx
b1dca6
+	vmovdqu %ymm4, (%rdi)
b1dca6
+	vmovdqu %ymm5, VEC_SIZE(%rdi)
b1dca6
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
b1dca6
+# ifdef USE_AS_STPCPY
b1dca6
+	lea	(VEC_SIZE * 2)(%rdi, %rdx), %rax
b1dca6
+# endif
b1dca6
+	vmovdqu %ymm6, (VEC_SIZE * 2)(%rdi)
b1dca6
+	add	$((VEC_SIZE * 2) - 1), %r8
b1dca6
+	sub	%rdx, %r8
b1dca6
+	lea	((VEC_SIZE * 2) + 1)(%rdi, %rdx), %rdi
b1dca6
+	jmp	L(StrncpyFillTailWithZero)
b1dca6
+# else
b1dca6
+	add	$(VEC_SIZE * 2), %rsi
b1dca6
+	add	$(VEC_SIZE * 2), %rdi
b1dca6
+	jmp	L(CopyVecSizeExit)
b1dca6
+# endif
b1dca6
+
b1dca6
+# ifdef USE_AS_STRNCPY
b1dca6
+#  ifndef USE_AS_STRCAT
b1dca6
+	.p2align 4
b1dca6
+L(CopyVecSizeUnalignedVec6):
b1dca6
+	vmovdqu %ymm6, (%rdi, %rcx)
b1dca6
+	jmp	L(CopyVecSizeVecExit)
b1dca6
+
b1dca6
+	.p2align 4
b1dca6
+L(CopyVecSizeUnalignedVec5):
b1dca6
+	vmovdqu %ymm5, (%rdi, %rcx)
b1dca6
+	jmp	L(CopyVecSizeVecExit)
b1dca6
+
b1dca6
+	.p2align 4
b1dca6
+L(CopyVecSizeUnalignedVec4):
b1dca6
+	vmovdqu %ymm4, (%rdi, %rcx)
b1dca6
+	jmp	L(CopyVecSizeVecExit)
b1dca6
+
b1dca6
+	.p2align 4
b1dca6
+L(CopyVecSizeUnalignedVec3):
b1dca6
+	vmovdqu %ymm3, (%rdi, %rcx)
b1dca6
+	jmp	L(CopyVecSizeVecExit)
b1dca6
+#  endif
b1dca6
+
b1dca6
+/* Case2 */
b1dca6
+
b1dca6
+	.p2align 4
b1dca6
+L(CopyVecSizeCase2):
b1dca6
+	add	$VEC_SIZE, %r8
b1dca6
+	add	%rcx, %rdi
b1dca6
+	add	%rcx, %rsi
b1dca6
+	bsf	%edx, %edx
b1dca6
+	cmp	%r8d, %edx
b1dca6
+	jb	L(CopyVecSizeExit)
b1dca6
+	jmp	L(StrncpyExit)
b1dca6
+
b1dca6
+	.p2align 4
b1dca6
+L(CopyTwoVecSizeCase2):
b1dca6
+	add	%rcx, %rsi
b1dca6
+	bsf	%edx, %edx
b1dca6
+	add	$VEC_SIZE, %edx
b1dca6
+	sub	%ecx, %edx
b1dca6
+	cmp	%r8d, %edx
b1dca6
+	jb	L(CopyVecSizeExit)
b1dca6
+	jmp	L(StrncpyExit)
b1dca6
+
b1dca6
+L(CopyVecSizeTailCase2):
b1dca6
+	add	%rcx, %rsi
b1dca6
+	bsf	%edx, %edx
b1dca6
+	cmp	%r8d, %edx
b1dca6
+	jb	L(CopyVecSizeExit)
b1dca6
+	jmp	L(StrncpyExit)
b1dca6
+
b1dca6
+L(CopyVecSizeTail1Case2):
b1dca6
+	bsf	%edx, %edx
b1dca6
+	cmp	%r8d, %edx
b1dca6
+	jb	L(CopyVecSizeExit)
b1dca6
+	jmp	L(StrncpyExit)
b1dca6
+
b1dca6
+/* Case2 or Case3,  Case3 */
b1dca6
+
b1dca6
+	.p2align 4
b1dca6
+L(CopyVecSizeCase2OrCase3):
b1dca6
+	test	%rdx, %rdx
b1dca6
+	jnz	L(CopyVecSizeCase2)
b1dca6
+L(CopyVecSizeCase3):
b1dca6
+	add	$VEC_SIZE, %r8
b1dca6
+	add	%rcx, %rdi
b1dca6
+	add	%rcx, %rsi
b1dca6
+	jmp	L(StrncpyExit)
b1dca6
+
b1dca6
+	.p2align 4
b1dca6
+L(CopyTwoVecSizeCase2OrCase3):
b1dca6
+	test	%rdx, %rdx
b1dca6
+	jnz	L(CopyTwoVecSizeCase2)
b1dca6
+	add	%rcx, %rsi
b1dca6
+	jmp	L(StrncpyExit)
b1dca6
+
b1dca6
+	.p2align 4
b1dca6
+L(CopyVecSizeTailCase2OrCase3):
b1dca6
+	test	%rdx, %rdx
b1dca6
+	jnz	L(CopyVecSizeTailCase2)
b1dca6
+	add	%rcx, %rsi
b1dca6
+	jmp	L(StrncpyExit)
b1dca6
+
b1dca6
+	.p2align 4
b1dca6
+L(CopyTwoVecSize1Case2OrCase3):
b1dca6
+	add	$VEC_SIZE, %rdi
b1dca6
+	add	$VEC_SIZE, %rsi
b1dca6
+	sub	$VEC_SIZE, %r8
b1dca6
+L(CopyVecSizeTail1Case2OrCase3):
b1dca6
+	test	%rdx, %rdx
b1dca6
+	jnz	L(CopyVecSizeTail1Case2)
b1dca6
+	jmp	L(StrncpyExit)
b1dca6
+# endif
b1dca6
+
b1dca6
+/*------------End labels regarding with copying 1-VEC_SIZE bytes--and 1-(VEC_SIZE*2) bytes----*/
b1dca6
+
b1dca6
+	.p2align 4
b1dca6
+L(Exit1):
b1dca6
+	movzwl	(%rsi), %edx
b1dca6
+	mov	%dx, (%rdi)
b1dca6
+# ifdef USE_AS_STPCPY
b1dca6
+	lea	1(%rdi), %rax
b1dca6
+# endif
b1dca6
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
b1dca6
+	sub	$2, %r8
b1dca6
+	lea	2(%rdi), %rdi
b1dca6
+	jnz	L(StrncpyFillTailWithZero)
b1dca6
+# endif
b1dca6
+	VZEROUPPER
b1dca6
+	ret
b1dca6
+
b1dca6
+	.p2align 4
b1dca6
+L(Exit2):
b1dca6
+	movzwl	(%rsi), %ecx
b1dca6
+	mov	%cx, (%rdi)
b1dca6
+	movb	$0, 2(%rdi)
b1dca6
+# ifdef USE_AS_STPCPY
b1dca6
+	lea	2(%rdi), %rax
b1dca6
+# endif
b1dca6
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
b1dca6
+	sub	$3, %r8
b1dca6
+	lea	3(%rdi), %rdi
b1dca6
+	jnz	L(StrncpyFillTailWithZero)
b1dca6
+# endif
b1dca6
+	VZEROUPPER
b1dca6
+	ret
b1dca6
+
b1dca6
+	.p2align 4
b1dca6
+L(Exit3):
b1dca6
+	mov	(%rsi), %edx
b1dca6
+	mov	%edx, (%rdi)
b1dca6
+# ifdef USE_AS_STPCPY
b1dca6
+	lea	3(%rdi), %rax
b1dca6
+# endif
b1dca6
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
b1dca6
+	sub	$4, %r8
b1dca6
+	lea	4(%rdi), %rdi
b1dca6
+	jnz	L(StrncpyFillTailWithZero)
b1dca6
+# endif
b1dca6
+	VZEROUPPER
b1dca6
+	ret
b1dca6
+
b1dca6
+	.p2align 4
b1dca6
+L(Exit4_7):
b1dca6
+	mov	(%rsi), %ecx
b1dca6
+	mov	%ecx, (%rdi)
b1dca6
+	mov	-3(%rsi, %rdx), %ecx
b1dca6
+	mov	%ecx, -3(%rdi, %rdx)
b1dca6
+# ifdef USE_AS_STPCPY
b1dca6
+	lea	(%rdi, %rdx), %rax
b1dca6
+# endif
b1dca6
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
b1dca6
+	sub	%rdx, %r8
b1dca6
+	sub	$1, %r8
b1dca6
+	lea	1(%rdi, %rdx), %rdi
b1dca6
+	jnz	L(StrncpyFillTailWithZero)
b1dca6
+# endif
b1dca6
+	VZEROUPPER
b1dca6
+	ret
b1dca6
+
b1dca6
+	.p2align 4
b1dca6
+L(Exit8_15):
b1dca6
+	mov	(%rsi), %rcx
b1dca6
+	mov	-7(%rsi, %rdx), %r9
b1dca6
+	mov	%rcx, (%rdi)
b1dca6
+	mov	%r9, -7(%rdi, %rdx)
b1dca6
+# ifdef USE_AS_STPCPY
b1dca6
+	lea	(%rdi, %rdx), %rax
b1dca6
+# endif
b1dca6
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
b1dca6
+	sub	%rdx, %r8
b1dca6
+	sub	$1, %r8
b1dca6
+	lea	1(%rdi, %rdx), %rdi
b1dca6
+	jnz	L(StrncpyFillTailWithZero)
b1dca6
+# endif
b1dca6
+	VZEROUPPER
b1dca6
+	ret
b1dca6
+
b1dca6
+	.p2align 4
b1dca6
+L(Exit16_31):
b1dca6
+	vmovdqu (%rsi), %xmm2
b1dca6
+	vmovdqu -15(%rsi, %rdx), %xmm3
b1dca6
+	vmovdqu %xmm2, (%rdi)
b1dca6
+	vmovdqu %xmm3, -15(%rdi, %rdx)
b1dca6
+# ifdef USE_AS_STPCPY
b1dca6
+	lea	(%rdi, %rdx), %rax
b1dca6
+# endif
b1dca6
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
b1dca6
+	sub %rdx, %r8
b1dca6
+	sub $1, %r8
b1dca6
+	lea 1(%rdi, %rdx), %rdi
b1dca6
+	jnz L(StrncpyFillTailWithZero)
b1dca6
+# endif
b1dca6
+	VZEROUPPER
b1dca6
+	ret
b1dca6
+
b1dca6
+	.p2align 4
b1dca6
+L(Exit32_63):
b1dca6
+	vmovdqu (%rsi), %ymm2
b1dca6
+	vmovdqu -31(%rsi, %rdx), %ymm3
b1dca6
+	vmovdqu %ymm2, (%rdi)
b1dca6
+	vmovdqu %ymm3, -31(%rdi, %rdx)
b1dca6
+# ifdef USE_AS_STPCPY
b1dca6
+	lea	(%rdi, %rdx), %rax
b1dca6
+# endif
b1dca6
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
b1dca6
+	sub	%rdx, %r8
b1dca6
+	sub	$1, %r8
b1dca6
+	lea	1(%rdi, %rdx), %rdi
b1dca6
+	jnz	L(StrncpyFillTailWithZero)
b1dca6
+# endif
b1dca6
+	VZEROUPPER
b1dca6
+	ret
b1dca6
+
b1dca6
+# ifdef USE_AS_STRNCPY
b1dca6
+
b1dca6
+	.p2align 4
b1dca6
+L(StrncpyExit1):
b1dca6
+	movzbl	(%rsi), %edx
b1dca6
+	mov	%dl, (%rdi)
b1dca6
+#  ifdef USE_AS_STPCPY
b1dca6
+	lea	1(%rdi), %rax
b1dca6
+#  endif
b1dca6
+#  ifdef USE_AS_STRCAT
b1dca6
+	movb	$0, 1(%rdi)
b1dca6
+#  endif
b1dca6
+	VZEROUPPER
b1dca6
+	ret
b1dca6
+
b1dca6
+	.p2align 4
b1dca6
+L(StrncpyExit2):
b1dca6
+	movzwl	(%rsi), %edx
b1dca6
+	mov	%dx, (%rdi)
b1dca6
+#  ifdef USE_AS_STPCPY
b1dca6
+	lea	2(%rdi), %rax
b1dca6
+#  endif
b1dca6
+#  ifdef USE_AS_STRCAT
b1dca6
+	movb	$0, 2(%rdi)
b1dca6
+#  endif
b1dca6
+	VZEROUPPER
b1dca6
+	ret
b1dca6
+
b1dca6
+	.p2align 4
b1dca6
+L(StrncpyExit3_4):
b1dca6
+	movzwl	(%rsi), %ecx
b1dca6
+	movzwl	-2(%rsi, %r8), %edx
b1dca6
+	mov	%cx, (%rdi)
b1dca6
+	mov	%dx, -2(%rdi, %r8)
b1dca6
+#  ifdef USE_AS_STPCPY
b1dca6
+	lea	(%rdi, %r8), %rax
b1dca6
+#  endif
b1dca6
+#  ifdef USE_AS_STRCAT
b1dca6
+	movb	$0, (%rdi, %r8)
b1dca6
+#  endif
b1dca6
+	VZEROUPPER
b1dca6
+	ret
b1dca6
+
b1dca6
+	.p2align 4
b1dca6
+L(StrncpyExit5_8):
b1dca6
+	mov	(%rsi), %ecx
b1dca6
+	mov	-4(%rsi, %r8), %edx
b1dca6
+	mov	%ecx, (%rdi)
b1dca6
+	mov	%edx, -4(%rdi, %r8)
b1dca6
+#  ifdef USE_AS_STPCPY
b1dca6
+	lea	(%rdi, %r8), %rax
b1dca6
+#  endif
b1dca6
+#  ifdef USE_AS_STRCAT
b1dca6
+	movb	$0, (%rdi, %r8)
b1dca6
+#  endif
b1dca6
+	VZEROUPPER
b1dca6
+	ret
b1dca6
+
b1dca6
+	.p2align 4
b1dca6
+L(StrncpyExit9_16):
b1dca6
+	mov	(%rsi), %rcx
b1dca6
+	mov	-8(%rsi, %r8), %rdx
b1dca6
+	mov	%rcx, (%rdi)
b1dca6
+	mov	%rdx, -8(%rdi, %r8)
b1dca6
+#  ifdef USE_AS_STPCPY
b1dca6
+	lea	(%rdi, %r8), %rax
b1dca6
+#  endif
b1dca6
+#  ifdef USE_AS_STRCAT
b1dca6
+	movb	$0, (%rdi, %r8)
b1dca6
+#  endif
b1dca6
+	VZEROUPPER
b1dca6
+	ret
b1dca6
+
b1dca6
+	.p2align 4
b1dca6
+L(StrncpyExit17_32):
b1dca6
+	vmovdqu (%rsi), %xmm2
b1dca6
+	vmovdqu -16(%rsi, %r8), %xmm3
b1dca6
+	vmovdqu %xmm2, (%rdi)
b1dca6
+	vmovdqu %xmm3, -16(%rdi, %r8)
b1dca6
+#  ifdef USE_AS_STPCPY
b1dca6
+	lea	(%rdi, %r8), %rax
b1dca6
+#  endif
b1dca6
+#  ifdef USE_AS_STRCAT
b1dca6
+	movb	$0, (%rdi, %r8)
b1dca6
+#  endif
b1dca6
+	VZEROUPPER
b1dca6
+	ret
b1dca6
+
b1dca6
+	.p2align 4
b1dca6
+L(StrncpyExit33_64):
b1dca6
+	/*  0/32, 31/16 */
b1dca6
+	vmovdqu (%rsi), %ymm2
b1dca6
+	vmovdqu -VEC_SIZE(%rsi, %r8), %ymm3
b1dca6
+	vmovdqu %ymm2, (%rdi)
b1dca6
+	vmovdqu %ymm3, -VEC_SIZE(%rdi, %r8)
b1dca6
+#  ifdef USE_AS_STPCPY
b1dca6
+	lea	(%rdi, %r8), %rax
b1dca6
+#  endif
b1dca6
+#  ifdef USE_AS_STRCAT
b1dca6
+	movb	$0, (%rdi, %r8)
b1dca6
+#  endif
b1dca6
+	VZEROUPPER
b1dca6
+	ret
b1dca6
+
b1dca6
+	.p2align 4
b1dca6
+L(StrncpyExit65):
b1dca6
+	/* 0/32, 32/32, 64/1 */
b1dca6
+	vmovdqu (%rsi), %ymm2
b1dca6
+	vmovdqu 32(%rsi), %ymm3
b1dca6
+	mov	64(%rsi), %cl
b1dca6
+	vmovdqu %ymm2, (%rdi)
b1dca6
+	vmovdqu %ymm3, 32(%rdi)
b1dca6
+	mov	%cl, 64(%rdi)
b1dca6
+#  ifdef USE_AS_STPCPY
b1dca6
+	lea	65(%rdi), %rax
b1dca6
+#  endif
b1dca6
+#  ifdef USE_AS_STRCAT
b1dca6
+	movb	$0, 65(%rdi)
b1dca6
+#  endif
b1dca6
+	VZEROUPPER
b1dca6
+	ret
b1dca6
+
b1dca6
+#  ifndef USE_AS_STRCAT
b1dca6
+
b1dca6
+	.p2align 4
b1dca6
+L(Fill1):
b1dca6
+	mov	%dl, (%rdi)
b1dca6
+	VZEROUPPER
b1dca6
+	ret
b1dca6
+
b1dca6
+	.p2align 4
b1dca6
+L(Fill2):
b1dca6
+	mov	%dx, (%rdi)
b1dca6
+	VZEROUPPER
b1dca6
+	ret
b1dca6
+
b1dca6
+	.p2align 4
b1dca6
+L(Fill3_4):
b1dca6
+	mov	%dx, (%rdi)
b1dca6
+	mov     %dx, -2(%rdi, %r8)
b1dca6
+	VZEROUPPER
b1dca6
+	ret
b1dca6
+
b1dca6
+	.p2align 4
b1dca6
+L(Fill5_8):
b1dca6
+	mov	%edx, (%rdi)
b1dca6
+	mov     %edx, -4(%rdi, %r8)
b1dca6
+	VZEROUPPER
b1dca6
+	ret
b1dca6
+
b1dca6
+	.p2align 4
b1dca6
+L(Fill9_16):
b1dca6
+	mov	%rdx, (%rdi)
b1dca6
+	mov	%rdx, -8(%rdi, %r8)
b1dca6
+	VZEROUPPER
b1dca6
+	ret
b1dca6
+
b1dca6
+	.p2align 4
b1dca6
+L(Fill17_32):
b1dca6
+	vmovdqu %xmmZ, (%rdi)
b1dca6
+	vmovdqu %xmmZ, -16(%rdi, %r8)
b1dca6
+	VZEROUPPER
b1dca6
+	ret
b1dca6
+
b1dca6
+	.p2align 4
b1dca6
+L(CopyVecSizeUnalignedVec2):
b1dca6
+	vmovdqu %ymm2, (%rdi, %rcx)
b1dca6
+
b1dca6
+	.p2align 4
b1dca6
+L(CopyVecSizeVecExit):
b1dca6
+	bsf	%edx, %edx
b1dca6
+	add	$(VEC_SIZE - 1), %r8
b1dca6
+	add	%rcx, %rdi
b1dca6
+#   ifdef USE_AS_STPCPY
b1dca6
+	lea	(%rdi, %rdx), %rax
b1dca6
+#   endif
b1dca6
+	sub	%rdx, %r8
b1dca6
+	lea	1(%rdi, %rdx), %rdi
b1dca6
+
b1dca6
+	.p2align 4
b1dca6
+L(StrncpyFillTailWithZero):
b1dca6
+	xor	%edx, %edx
b1dca6
+	sub	$VEC_SIZE, %r8
b1dca6
+	jbe	L(StrncpyFillExit)
b1dca6
+
b1dca6
+	vmovdqu %ymmZ, (%rdi)
b1dca6
+	add	$VEC_SIZE, %rdi
b1dca6
+
b1dca6
+	mov	%rdi, %rsi
b1dca6
+	and	$(VEC_SIZE - 1), %esi
b1dca6
+	sub	%rsi, %rdi
b1dca6
+	add	%rsi, %r8
b1dca6
+	sub	$(VEC_SIZE * 4), %r8
b1dca6
+	jb	L(StrncpyFillLessFourVecSize)
b1dca6
+
b1dca6
+L(StrncpyFillLoopVmovdqa):
b1dca6
+	vmovdqa %ymmZ, (%rdi)
b1dca6
+	vmovdqa %ymmZ, VEC_SIZE(%rdi)
b1dca6
+	vmovdqa %ymmZ, (VEC_SIZE * 2)(%rdi)
b1dca6
+	vmovdqa %ymmZ, (VEC_SIZE * 3)(%rdi)
b1dca6
+	add	$(VEC_SIZE * 4), %rdi
b1dca6
+	sub	$(VEC_SIZE * 4), %r8
b1dca6
+	jae	L(StrncpyFillLoopVmovdqa)
b1dca6
+
b1dca6
+L(StrncpyFillLessFourVecSize):
b1dca6
+	add	$(VEC_SIZE * 2), %r8
b1dca6
+	jl	L(StrncpyFillLessTwoVecSize)
b1dca6
+	vmovdqa %ymmZ, (%rdi)
b1dca6
+	vmovdqa %ymmZ, VEC_SIZE(%rdi)
b1dca6
+	add	$(VEC_SIZE * 2), %rdi
b1dca6
+	sub	$VEC_SIZE, %r8
b1dca6
+	jl	L(StrncpyFillExit)
b1dca6
+	vmovdqa %ymmZ, (%rdi)
b1dca6
+	add	$VEC_SIZE, %rdi
b1dca6
+	jmp	L(Fill)
b1dca6
+
b1dca6
+	.p2align 4
b1dca6
+L(StrncpyFillLessTwoVecSize):
b1dca6
+	add	$VEC_SIZE, %r8
b1dca6
+	jl	L(StrncpyFillExit)
b1dca6
+	vmovdqa %ymmZ, (%rdi)
b1dca6
+	add	$VEC_SIZE, %rdi
b1dca6
+	jmp	L(Fill)
b1dca6
+
b1dca6
+	.p2align 4
b1dca6
+L(StrncpyFillExit):
b1dca6
+	add	$VEC_SIZE, %r8
b1dca6
+L(Fill):
b1dca6
+	cmp	$17, %r8d
b1dca6
+	jae	L(Fill17_32)
b1dca6
+	cmp	$9, %r8d
b1dca6
+	jae	L(Fill9_16)
b1dca6
+	cmp	$5, %r8d
b1dca6
+	jae	L(Fill5_8)
b1dca6
+	cmp	$3, %r8d
b1dca6
+	jae	L(Fill3_4)
b1dca6
+	cmp	$1, %r8d
b1dca6
+	ja	L(Fill2)
b1dca6
+	je	L(Fill1)
b1dca6
+	VZEROUPPER
b1dca6
+	ret
b1dca6
+
b1dca6
+/* end of ifndef USE_AS_STRCAT */
b1dca6
+#  endif
b1dca6
+
b1dca6
+	.p2align 4
b1dca6
+L(UnalignedLeaveCase2OrCase3):
b1dca6
+	test	%rdx, %rdx
b1dca6
+	jnz	L(UnalignedFourVecSizeLeaveCase2)
b1dca6
+L(UnalignedFourVecSizeLeaveCase3):
b1dca6
+	lea	(VEC_SIZE * 4)(%r8), %rcx
b1dca6
+	and	$-VEC_SIZE, %rcx
b1dca6
+	add	$(VEC_SIZE * 3), %r8
b1dca6
+	jl	L(CopyVecSizeCase3)
b1dca6
+	vmovdqu %ymm4, (%rdi)
b1dca6
+	sub	$VEC_SIZE, %r8
b1dca6
+	jb	L(CopyVecSizeCase3)
b1dca6
+	vmovdqu %ymm5, VEC_SIZE(%rdi)
b1dca6
+	sub	$VEC_SIZE, %r8
b1dca6
+	jb	L(CopyVecSizeCase3)
b1dca6
+	vmovdqu %ymm6, (VEC_SIZE * 2)(%rdi)
b1dca6
+	sub	$VEC_SIZE, %r8
b1dca6
+	jb	L(CopyVecSizeCase3)
b1dca6
+	vmovdqu %ymm7, (VEC_SIZE * 3)(%rdi)
b1dca6
+#  ifdef USE_AS_STPCPY
b1dca6
+	lea	(VEC_SIZE * 4)(%rdi), %rax
b1dca6
+#  endif
b1dca6
+#  ifdef USE_AS_STRCAT
b1dca6
+	movb	$0, (VEC_SIZE * 4)(%rdi)
b1dca6
+#  endif
b1dca6
+	VZEROUPPER
b1dca6
+	ret
b1dca6
+
b1dca6
+	.p2align 4
b1dca6
+L(UnalignedFourVecSizeLeaveCase2):
b1dca6
+	xor	%ecx, %ecx
b1dca6
+	vpcmpeqb %ymm4, %ymmZ, %ymmM
b1dca6
+	vpmovmskb %ymmM, %edx
b1dca6
+	add	$(VEC_SIZE * 3), %r8
b1dca6
+	jle	L(CopyVecSizeCase2OrCase3)
b1dca6
+	test	%edx, %edx
b1dca6
+#  ifndef USE_AS_STRCAT
b1dca6
+	jnz	L(CopyVecSizeUnalignedVec4)
b1dca6
+#  else
b1dca6
+	jnz	L(CopyVecSize)
b1dca6
+#  endif
b1dca6
+	vpcmpeqb %ymm5, %ymmZ, %ymmM
b1dca6
+	vpmovmskb %ymmM, %edx
b1dca6
+	vmovdqu %ymm4, (%rdi)
b1dca6
+	add	$VEC_SIZE, %rcx
b1dca6
+	sub	$VEC_SIZE, %r8
b1dca6
+	jbe	L(CopyVecSizeCase2OrCase3)
b1dca6
+	test	%edx, %edx
b1dca6
+#  ifndef USE_AS_STRCAT
b1dca6
+	jnz	L(CopyVecSizeUnalignedVec5)
b1dca6
+#  else
b1dca6
+	jnz	L(CopyVecSize)
b1dca6
+#  endif
b1dca6
+
b1dca6
+	vpcmpeqb %ymm6, %ymmZ, %ymmM
b1dca6
+	vpmovmskb %ymmM, %edx
b1dca6
+	vmovdqu %ymm5, VEC_SIZE(%rdi)
b1dca6
+	add	$VEC_SIZE, %rcx
b1dca6
+	sub	$VEC_SIZE, %r8
b1dca6
+	jbe	L(CopyVecSizeCase2OrCase3)
b1dca6
+	test	%edx, %edx
b1dca6
+#  ifndef USE_AS_STRCAT
b1dca6
+	jnz	L(CopyVecSizeUnalignedVec6)
b1dca6
+#  else
b1dca6
+	jnz	L(CopyVecSize)
b1dca6
+#  endif
b1dca6
+
b1dca6
+	vpcmpeqb %ymm7, %ymmZ, %ymmM
b1dca6
+	vpmovmskb %ymmM, %edx
b1dca6
+	vmovdqu %ymm6, (VEC_SIZE * 2)(%rdi)
b1dca6
+	lea	VEC_SIZE(%rdi, %rcx), %rdi
b1dca6
+	lea	VEC_SIZE(%rsi, %rcx), %rsi
b1dca6
+	bsf	%edx, %edx
b1dca6
+	cmp	%r8d, %edx
b1dca6
+	jb	L(CopyVecSizeExit)
b1dca6
+L(StrncpyExit):
b1dca6
+	cmp	$65, %r8d
b1dca6
+	je	L(StrncpyExit65)
b1dca6
+	cmp	$33, %r8d
b1dca6
+	jae	L(StrncpyExit33_64)
b1dca6
+	cmp	$17, %r8d
b1dca6
+	jae	L(StrncpyExit17_32)
b1dca6
+	cmp	$9, %r8d
b1dca6
+	jae	L(StrncpyExit9_16)
b1dca6
+	cmp	$5, %r8d
b1dca6
+	jae	L(StrncpyExit5_8)
b1dca6
+	cmp	$3, %r8d
b1dca6
+	jae	L(StrncpyExit3_4)
b1dca6
+	cmp	$1, %r8d
b1dca6
+	ja	L(StrncpyExit2)
b1dca6
+	je	L(StrncpyExit1)
b1dca6
+#  ifdef USE_AS_STPCPY
b1dca6
+	mov	%rdi, %rax
b1dca6
+#  endif
b1dca6
+#  ifdef USE_AS_STRCAT
b1dca6
+	movb	$0, (%rdi)
b1dca6
+#  endif
b1dca6
+	VZEROUPPER
b1dca6
+	ret
b1dca6
+
b1dca6
+	.p2align 4
b1dca6
+L(ExitZero):
b1dca6
+#  ifndef USE_AS_STRCAT
b1dca6
+	mov	%rdi, %rax
b1dca6
+#  endif
b1dca6
+	VZEROUPPER
b1dca6
+	ret
b1dca6
+
b1dca6
+# endif
b1dca6
+
b1dca6
+# ifndef USE_AS_STRCAT
b1dca6
+END (STRCPY)
b1dca6
+# else
b1dca6
+END (STRCAT)
b1dca6
+# endif
b1dca6
+#endif
b1dca6
diff --git a/sysdeps/x86_64/multiarch/strcpy.c b/sysdeps/x86_64/multiarch/strcpy.c
index 12e0e3ffe20602c6..ecf90d4b044a1b01 100644
--- a/sysdeps/x86_64/multiarch/strcpy.c
+++ b/sysdeps/x86_64/multiarch/strcpy.c
@@ -24,7 +24,7 @@
 # undef strcpy
 
 # define SYMBOL_NAME strcpy
-# include "ifunc-unaligned-ssse3.h"
+# include "ifunc-strcpy.h"
 
 libc_ifunc_redirected (__redirect_strcpy, strcpy, IFUNC_SELECTOR ());
 
diff --git a/sysdeps/x86_64/multiarch/strncat-avx2.S b/sysdeps/x86_64/multiarch/strncat-avx2.S
new file mode 100644
index 0000000000000000..bfefa659bb6281fa
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strncat-avx2.S
@@ -0,0 +1,3 @@
+#define USE_AS_STRNCAT
+#define STRCAT __strncat_avx2
+#include "strcat-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/strncat.c b/sysdeps/x86_64/multiarch/strncat.c
index 841c165565add132..74f7d028ae23d700 100644
--- a/sysdeps/x86_64/multiarch/strncat.c
+++ b/sysdeps/x86_64/multiarch/strncat.c
@@ -24,7 +24,7 @@
 # undef strncat
 
 # define SYMBOL_NAME strncat
-# include "ifunc-unaligned-ssse3.h"
+# include "ifunc-strcpy.h"
 
 libc_ifunc_redirected (__redirect_strncat, strncat, IFUNC_SELECTOR ());
 strong_alias (strncat, __strncat);
diff --git a/sysdeps/x86_64/multiarch/strncpy-avx2.S b/sysdeps/x86_64/multiarch/strncpy-avx2.S
new file mode 100644
index 0000000000000000..9ef8c87627dc4924
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strncpy-avx2.S
@@ -0,0 +1,3 @@
+#define USE_AS_STRNCPY
+#define STRCPY __strncpy_avx2
+#include "strcpy-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/strncpy.c b/sysdeps/x86_64/multiarch/strncpy.c
index 3c3de8b18ebb177f..93dfb4cfde79467a 100644
--- a/sysdeps/x86_64/multiarch/strncpy.c
+++ b/sysdeps/x86_64/multiarch/strncpy.c
@@ -24,7 +24,7 @@
 # undef strncpy
 
 # define SYMBOL_NAME strncpy
-# include "ifunc-unaligned-ssse3.h"
+# include "ifunc-strcpy.h"
 
 libc_ifunc_redirected (__redirect_strncpy, strncpy, IFUNC_SELECTOR ());