e354a5
commit 1a153e47fcc9401d8ea424ad86569a57ed0f8c52
e354a5
Author: Leonardo Sandoval <leonardo.sandoval.gonzalez@linux.intel.com>
e354a5
Date:   Mon Oct 8 08:59:50 2018 -0500
e354a5
e354a5
    x86-64: Optimize strcat/strncat, strcpy/strncpy and stpcpy/stpncpy with AVX2
e354a5
    
e354a5
    Optimize x86-64 strcat/strncat, strcpy/strncpy and stpcpy/stpncpy with AVX2.
e354a5
    It uses vector comparison as much as possible. In general, the larger the
e354a5
    source string, the greater performance gain observed, reaching speedups of
e354a5
    1.6x compared to SSE2 unaligned routines. Select AVX2 strcat/strncat,
e354a5
    strcpy/strncpy and stpcpy/stpncpy on AVX2 machines where vzeroupper is
e354a5
    preferred and AVX unaligned load is fast.
e354a5
    
e354a5
            * sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Add
e354a5
            strcat-avx2, strncat-avx2, strcpy-avx2, strncpy-avx2,
e354a5
            stpcpy-avx2 and stpncpy-avx2.
e354a5
            * sysdeps/x86_64/multiarch/ifunc-impl-list.c:
e354a5
            (__libc_ifunc_impl_list): Add tests for __strcat_avx2,
e354a5
            __strncat_avx2, __strcpy_avx2, __strncpy_avx2, __stpcpy_avx2
e354a5
            and __stpncpy_avx2.
e354a5
            * sysdeps/x86_64/multiarch/{ifunc-unaligned-ssse3.h =>
e354a5
            ifunc-strcpy.h}: Rename header to a more generic name.
e354a5
            * sysdeps/x86_64/multiarch/ifunc-strcpy.h:
e354a5
            (IFUNC_SELECTOR): Return OPTIMIZE (avx2) on AVX2 machines if
e354a5
            AVX unaligned load is fast and vzeroupper is preferred.
e354a5
            * sysdeps/x86_64/multiarch/stpcpy-avx2.S: New file.
e354a5
            * sysdeps/x86_64/multiarch/stpncpy-avx2.S: Likewise.
e354a5
            * sysdeps/x86_64/multiarch/strcat-avx2.S: Likewise.
e354a5
            * sysdeps/x86_64/multiarch/strcpy-avx2.S: Likewise.
e354a5
            * sysdeps/x86_64/multiarch/strncat-avx2.S: Likewise.
e354a5
            * sysdeps/x86_64/multiarch/strncpy-avx2.S: Likewise.
e354a5
e354a5
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
e354a5
index bb5e97073520ee51..395e432c092ca17c 100644
e354a5
--- a/sysdeps/x86_64/multiarch/Makefile
e354a5
+++ b/sysdeps/x86_64/multiarch/Makefile
e354a5
@@ -24,11 +24,14 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c \
e354a5
 		   strchr-sse2 strchrnul-sse2 strchr-avx2 strchrnul-avx2 \
e354a5
 		   strrchr-sse2 strrchr-avx2 \
e354a5
 		   strlen-sse2 strnlen-sse2 strlen-avx2 strnlen-avx2 \
e354a5
+		   strcat-avx2 strncat-avx2 \
e354a5
 		   strcat-ssse3 strncat-ssse3\
e354a5
+		   strcpy-avx2 strncpy-avx2 \
e354a5
 		   strcpy-sse2 stpcpy-sse2 \
e354a5
 		   strcpy-ssse3 strncpy-ssse3 stpcpy-ssse3 stpncpy-ssse3 \
e354a5
 		   strcpy-sse2-unaligned strncpy-sse2-unaligned \
e354a5
 		   stpcpy-sse2-unaligned stpncpy-sse2-unaligned \
e354a5
+		   stpcpy-avx2 stpncpy-avx2 \
e354a5
 		   strcat-sse2 \
e354a5
 		   strcat-sse2-unaligned strncat-sse2-unaligned \
e354a5
 		   strchr-sse2-no-bsf memcmp-ssse3 strstr-sse2-unaligned \
e354a5
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
e354a5
index 9aaaef7251b8edfe..8b55bb6954000cc2 100644
e354a5
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
e354a5
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
e354a5
@@ -199,6 +199,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
e354a5
   IFUNC_IMPL (i, name, stpncpy,
e354a5
 	      IFUNC_IMPL_ADD (array, i, stpncpy, HAS_CPU_FEATURE (SSSE3),
e354a5
 			      __stpncpy_ssse3)
e354a5
+	      IFUNC_IMPL_ADD (array, i, stpncpy, HAS_ARCH_FEATURE (AVX2_Usable),
e354a5
+			      __stpncpy_avx2)
e354a5
 	      IFUNC_IMPL_ADD (array, i, stpncpy, 1,
e354a5
 			      __stpncpy_sse2_unaligned)
e354a5
 	      IFUNC_IMPL_ADD (array, i, stpncpy, 1, __stpncpy_sse2))
e354a5
@@ -207,6 +209,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
e354a5
   IFUNC_IMPL (i, name, stpcpy,
e354a5
 	      IFUNC_IMPL_ADD (array, i, stpcpy, HAS_CPU_FEATURE (SSSE3),
e354a5
 			      __stpcpy_ssse3)
e354a5
+	      IFUNC_IMPL_ADD (array, i, stpcpy, HAS_ARCH_FEATURE (AVX2_Usable),
e354a5
+			      __stpcpy_avx2)
e354a5
 	      IFUNC_IMPL_ADD (array, i, stpcpy, 1, __stpcpy_sse2_unaligned)
e354a5
 	      IFUNC_IMPL_ADD (array, i, stpcpy, 1, __stpcpy_sse2))
e354a5
 
e354a5
@@ -239,6 +243,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
e354a5
 
e354a5
   /* Support sysdeps/x86_64/multiarch/strcat.c.  */
e354a5
   IFUNC_IMPL (i, name, strcat,
e354a5
+	      IFUNC_IMPL_ADD (array, i, strcat, HAS_ARCH_FEATURE (AVX2_Usable),
e354a5
+			      __strcat_avx2)
e354a5
 	      IFUNC_IMPL_ADD (array, i, strcat, HAS_CPU_FEATURE (SSSE3),
e354a5
 			      __strcat_ssse3)
e354a5
 	      IFUNC_IMPL_ADD (array, i, strcat, 1, __strcat_sse2_unaligned)
e354a5
@@ -280,6 +286,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
e354a5
 
e354a5
   /* Support sysdeps/x86_64/multiarch/strcpy.c.  */
e354a5
   IFUNC_IMPL (i, name, strcpy,
e354a5
+	      IFUNC_IMPL_ADD (array, i, strcpy, HAS_ARCH_FEATURE (AVX2_Usable),
e354a5
+			      __strcpy_avx2)
e354a5
 	      IFUNC_IMPL_ADD (array, i, strcpy, HAS_CPU_FEATURE (SSSE3),
e354a5
 			      __strcpy_ssse3)
e354a5
 	      IFUNC_IMPL_ADD (array, i, strcpy, 1, __strcpy_sse2_unaligned)
e354a5
@@ -321,6 +329,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
e354a5
 
e354a5
   /* Support sysdeps/x86_64/multiarch/strncat.c.  */
e354a5
   IFUNC_IMPL (i, name, strncat,
e354a5
+	      IFUNC_IMPL_ADD (array, i, strncat, HAS_ARCH_FEATURE (AVX2_Usable),
e354a5
+			      __strncat_avx2)
e354a5
 	      IFUNC_IMPL_ADD (array, i, strncat, HAS_CPU_FEATURE (SSSE3),
e354a5
 			      __strncat_ssse3)
e354a5
 	      IFUNC_IMPL_ADD (array, i, strncat, 1,
e354a5
@@ -329,6 +339,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
e354a5
 
e354a5
   /* Support sysdeps/x86_64/multiarch/strncpy.c.  */
e354a5
   IFUNC_IMPL (i, name, strncpy,
e354a5
+	      IFUNC_IMPL_ADD (array, i, strncpy, HAS_ARCH_FEATURE (AVX2_Usable),
e354a5
+			      __strncpy_avx2)
e354a5
 	      IFUNC_IMPL_ADD (array, i, strncpy, HAS_CPU_FEATURE (SSSE3),
e354a5
 			      __strncpy_ssse3)
e354a5
 	      IFUNC_IMPL_ADD (array, i, strncpy, 1,
e354a5
diff --git a/sysdeps/x86_64/multiarch/ifunc-unaligned-ssse3.h b/sysdeps/x86_64/multiarch/ifunc-strcpy.h
e354a5
similarity index 83%
e354a5
rename from sysdeps/x86_64/multiarch/ifunc-unaligned-ssse3.h
e354a5
rename to sysdeps/x86_64/multiarch/ifunc-strcpy.h
e354a5
index 81805f9832345923..4f2286fefccda069 100644
e354a5
--- a/sysdeps/x86_64/multiarch/ifunc-unaligned-ssse3.h
e354a5
+++ b/sysdeps/x86_64/multiarch/ifunc-strcpy.h
e354a5
@@ -24,12 +24,18 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
e354a5
 extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned)
e354a5
   attribute_hidden;
e354a5
 extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
e354a5
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
e354a5
 
e354a5
 static inline void *
e354a5
 IFUNC_SELECTOR (void)
e354a5
 {
e354a5
   const struct cpu_features* cpu_features = __get_cpu_features ();
e354a5
 
e354a5
+  if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)
e354a5
+      && CPU_FEATURES_ARCH_P (cpu_features, AVX2_Usable)
e354a5
+      && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
e354a5
+    return OPTIMIZE (avx2);
e354a5
+
e354a5
   if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Load))
e354a5
     return OPTIMIZE (sse2_unaligned);
e354a5
 
e354a5
diff --git a/sysdeps/x86_64/multiarch/stpcpy-avx2.S b/sysdeps/x86_64/multiarch/stpcpy-avx2.S
e354a5
new file mode 100644
e354a5
index 0000000000000000..f0bd3029fe3047ed
e354a5
--- /dev/null
e354a5
+++ b/sysdeps/x86_64/multiarch/stpcpy-avx2.S
e354a5
@@ -0,0 +1,3 @@
e354a5
+#define USE_AS_STPCPY
e354a5
+#define STRCPY __stpcpy_avx2
e354a5
+#include "strcpy-avx2.S"
e354a5
diff --git a/sysdeps/x86_64/multiarch/stpcpy.c b/sysdeps/x86_64/multiarch/stpcpy.c
e354a5
index 1e340fca991a021c..8ffd13b48c83ca8e 100644
e354a5
--- a/sysdeps/x86_64/multiarch/stpcpy.c
e354a5
+++ b/sysdeps/x86_64/multiarch/stpcpy.c
e354a5
@@ -28,7 +28,7 @@
e354a5
 # undef __stpcpy
e354a5
 
e354a5
 # define SYMBOL_NAME stpcpy
e354a5
-# include "ifunc-unaligned-ssse3.h"
e354a5
+# include "ifunc-strcpy.h"
e354a5
 
e354a5
 libc_ifunc_redirected (__redirect_stpcpy, __stpcpy, IFUNC_SELECTOR ());
e354a5
 
e354a5
diff --git a/sysdeps/x86_64/multiarch/stpncpy-avx2.S b/sysdeps/x86_64/multiarch/stpncpy-avx2.S
e354a5
new file mode 100644
e354a5
index 0000000000000000..032b0407d08c6a9d
e354a5
--- /dev/null
e354a5
+++ b/sysdeps/x86_64/multiarch/stpncpy-avx2.S
e354a5
@@ -0,0 +1,4 @@
e354a5
+#define USE_AS_STPCPY
e354a5
+#define USE_AS_STRNCPY
e354a5
+#define STRCPY __stpncpy_avx2
e354a5
+#include "strcpy-avx2.S"
e354a5
diff --git a/sysdeps/x86_64/multiarch/stpncpy.c b/sysdeps/x86_64/multiarch/stpncpy.c
e354a5
index 28842ece2b0998e3..f3e203f78cca2e61 100644
e354a5
--- a/sysdeps/x86_64/multiarch/stpncpy.c
e354a5
+++ b/sysdeps/x86_64/multiarch/stpncpy.c
e354a5
@@ -26,7 +26,7 @@
e354a5
 # undef __stpncpy
e354a5
 
e354a5
 # define SYMBOL_NAME stpncpy
e354a5
-# include "ifunc-unaligned-ssse3.h"
e354a5
+# include "ifunc-strcpy.h"
e354a5
 
e354a5
 libc_ifunc_redirected (__redirect_stpncpy, __stpncpy, IFUNC_SELECTOR ());
e354a5
 
e354a5
diff --git a/sysdeps/x86_64/multiarch/strcat-avx2.S b/sysdeps/x86_64/multiarch/strcat-avx2.S
e354a5
new file mode 100644
e354a5
index 0000000000000000..b062356427677ca6
e354a5
--- /dev/null
e354a5
+++ b/sysdeps/x86_64/multiarch/strcat-avx2.S
e354a5
@@ -0,0 +1,275 @@
e354a5
+/* strcat with AVX2
e354a5
+   Copyright (C) 2011-2018 Free Software Foundation, Inc.
e354a5
+   Contributed by Intel Corporation.
e354a5
+   This file is part of the GNU C Library.
e354a5
+
e354a5
+   The GNU C Library is free software; you can redistribute it and/or
e354a5
+   modify it under the terms of the GNU Lesser General Public
e354a5
+   License as published by the Free Software Foundation; either
e354a5
+   version 2.1 of the License, or (at your option) any later version.
e354a5
+
e354a5
+   The GNU C Library is distributed in the hope that it will be useful,
e354a5
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
e354a5
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
e354a5
+   Lesser General Public License for more details.
e354a5
+
e354a5
+   You should have received a copy of the GNU Lesser General Public
e354a5
+   License along with the GNU C Library; if not, see
e354a5
+   <http://www.gnu.org/licenses/>.  */
e354a5
+
e354a5
+#if IS_IN (libc)
e354a5
+
e354a5
+# include <sysdep.h>
e354a5
+
e354a5
+# ifndef STRCAT
e354a5
+#  define STRCAT  __strcat_avx2
e354a5
+# endif
e354a5
+
e354a5
+# define USE_AS_STRCAT
e354a5
+
e354a5
+/* Number of bytes in a vector register */
e354a5
+# define VEC_SIZE	32
e354a5
+
e354a5
+	.section .text.avx,"ax",@progbits
e354a5
+ENTRY (STRCAT)
e354a5
+	mov	%rdi, %r9
e354a5
+# ifdef USE_AS_STRNCAT
e354a5
+	mov	%rdx, %r8
e354a5
+# endif
e354a5
+
e354a5
+	xor	%eax, %eax
e354a5
+	mov	%edi, %ecx
e354a5
+	and	$((VEC_SIZE * 4) - 1), %ecx
e354a5
+	vpxor	%xmm6, %xmm6, %xmm6
e354a5
+	cmp	$(VEC_SIZE * 3), %ecx
e354a5
+	ja	L(fourth_vector_boundary)
e354a5
+	vpcmpeqb (%rdi), %ymm6, %ymm0
e354a5
+	vpmovmskb %ymm0, %edx
e354a5
+	test	%edx, %edx
e354a5
+	jnz	L(exit_null_on_first_vector)
e354a5
+	mov	%rdi, %rax
e354a5
+	and	$-VEC_SIZE, %rax
e354a5
+	jmp	L(align_vec_size_start)
e354a5
+L(fourth_vector_boundary):
e354a5
+	mov	%rdi, %rax
e354a5
+	and	$-VEC_SIZE, %rax
e354a5
+	vpcmpeqb	(%rax), %ymm6, %ymm0
e354a5
+	mov	$-1, %r10d
e354a5
+	sub	%rax, %rcx
e354a5
+	shl	%cl, %r10d
e354a5
+	vpmovmskb %ymm0, %edx
e354a5
+	and	%r10d, %edx
e354a5
+	jnz	L(exit)
e354a5
+
e354a5
+L(align_vec_size_start):
e354a5
+	vpcmpeqb VEC_SIZE(%rax), %ymm6, %ymm0
e354a5
+	vpmovmskb %ymm0, %edx
e354a5
+	test	%edx, %edx
e354a5
+	jnz	L(exit_null_on_second_vector)
e354a5
+
e354a5
+	vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1
e354a5
+	vpmovmskb %ymm1, %edx
e354a5
+	test	%edx, %edx
e354a5
+	jnz	L(exit_null_on_third_vector)
e354a5
+
e354a5
+	vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2
e354a5
+	vpmovmskb %ymm2, %edx
e354a5
+	test	%edx, %edx
e354a5
+	jnz	L(exit_null_on_fourth_vector)
e354a5
+
e354a5
+	vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3
e354a5
+	vpmovmskb %ymm3, %edx
e354a5
+	test	%edx, %edx
e354a5
+	jnz	L(exit_null_on_fifth_vector)
e354a5
+
e354a5
+	vpcmpeqb (VEC_SIZE * 5)(%rax), %ymm6, %ymm0
e354a5
+	add	$(VEC_SIZE * 4), %rax
e354a5
+	vpmovmskb %ymm0, %edx
e354a5
+	test	%edx, %edx
e354a5
+	jnz	L(exit_null_on_second_vector)
e354a5
+
e354a5
+	vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1
e354a5
+	vpmovmskb %ymm1, %edx
e354a5
+	test	%edx, %edx
e354a5
+	jnz	L(exit_null_on_third_vector)
e354a5
+
e354a5
+	vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2
e354a5
+	vpmovmskb %ymm2, %edx
e354a5
+	test	%edx, %edx
e354a5
+	jnz	L(exit_null_on_fourth_vector)
e354a5
+
e354a5
+	vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3
e354a5
+	vpmovmskb %ymm3, %edx
e354a5
+	test	%edx, %edx
e354a5
+	jnz	L(exit_null_on_fifth_vector)
e354a5
+
e354a5
+	vpcmpeqb (VEC_SIZE * 5)(%rax), %ymm6, %ymm0
e354a5
+	add	$(VEC_SIZE * 4), %rax
e354a5
+	vpmovmskb %ymm0, %edx
e354a5
+	test	%edx, %edx
e354a5
+	jnz	L(exit_null_on_second_vector)
e354a5
+
e354a5
+	vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1
e354a5
+	vpmovmskb %ymm1, %edx
e354a5
+	test	%edx, %edx
e354a5
+	jnz	L(exit_null_on_third_vector)
e354a5
+
e354a5
+	vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2
e354a5
+	vpmovmskb %ymm2, %edx
e354a5
+	test	%edx, %edx
e354a5
+	jnz	L(exit_null_on_fourth_vector)
e354a5
+
e354a5
+	vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3
e354a5
+	vpmovmskb %ymm3, %edx
e354a5
+	test	%edx, %edx
e354a5
+	jnz	L(exit_null_on_fifth_vector)
e354a5
+
e354a5
+	vpcmpeqb (VEC_SIZE * 5)(%rax), %ymm6, %ymm0
e354a5
+	add	$(VEC_SIZE * 4), %rax
e354a5
+	vpmovmskb %ymm0, %edx
e354a5
+	test	%edx, %edx
e354a5
+	jnz	L(exit_null_on_second_vector)
e354a5
+
e354a5
+	vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1
e354a5
+	vpmovmskb %ymm1, %edx
e354a5
+	test	%edx, %edx
e354a5
+	jnz	L(exit_null_on_third_vector)
e354a5
+
e354a5
+	vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2
e354a5
+	vpmovmskb %ymm2, %edx
e354a5
+	test	%edx, %edx
e354a5
+	jnz	L(exit_null_on_fourth_vector)
e354a5
+
e354a5
+	vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3
e354a5
+	vpmovmskb %ymm3, %edx
e354a5
+	test	%edx, %edx
e354a5
+	jnz	L(exit_null_on_fifth_vector)
e354a5
+
e354a5
+	test	$((VEC_SIZE * 4) - 1), %rax
e354a5
+	jz	L(align_four_vec_loop)
e354a5
+
e354a5
+	vpcmpeqb (VEC_SIZE * 5)(%rax), %ymm6, %ymm0
e354a5
+	add	$(VEC_SIZE * 5), %rax
e354a5
+	vpmovmskb %ymm0, %edx
e354a5
+	test	%edx, %edx
e354a5
+	jnz	L(exit)
e354a5
+
e354a5
+	test	$((VEC_SIZE * 4) - 1), %rax
e354a5
+	jz	L(align_four_vec_loop)
e354a5
+
e354a5
+	vpcmpeqb VEC_SIZE(%rax), %ymm6, %ymm1
e354a5
+	add	$VEC_SIZE, %rax
e354a5
+	vpmovmskb %ymm1, %edx
e354a5
+	test	%edx, %edx
e354a5
+	jnz	L(exit)
e354a5
+
e354a5
+	test	$((VEC_SIZE * 4) - 1), %rax
e354a5
+	jz	L(align_four_vec_loop)
e354a5
+
e354a5
+	vpcmpeqb VEC_SIZE(%rax), %ymm6, %ymm2
e354a5
+	add	$VEC_SIZE, %rax
e354a5
+	vpmovmskb %ymm2, %edx
e354a5
+	test	%edx, %edx
e354a5
+	jnz	L(exit)
e354a5
+
e354a5
+	test	$((VEC_SIZE * 4) - 1), %rax
e354a5
+	jz	L(align_four_vec_loop)
e354a5
+
e354a5
+	vpcmpeqb VEC_SIZE(%rax), %ymm6, %ymm3
e354a5
+	add	$VEC_SIZE, %rax
e354a5
+	vpmovmskb %ymm3, %edx
e354a5
+	test	%edx, %edx
e354a5
+	jnz	L(exit)
e354a5
+
e354a5
+	add	$VEC_SIZE, %rax
e354a5
+
e354a5
+	.p2align 4
e354a5
+L(align_four_vec_loop):
e354a5
+	vmovaps	(%rax),	%ymm4
e354a5
+	vpminub	VEC_SIZE(%rax),	%ymm4, %ymm4
e354a5
+	vmovaps	(VEC_SIZE * 2)(%rax),	%ymm5
e354a5
+	vpminub	(VEC_SIZE * 3)(%rax),	%ymm5, %ymm5
e354a5
+	add	$(VEC_SIZE * 4),	%rax
e354a5
+	vpminub	%ymm4,	%ymm5, %ymm5
e354a5
+	vpcmpeqb %ymm5,	%ymm6, %ymm5
e354a5
+	vpmovmskb %ymm5,	%edx
e354a5
+	test	%edx,	%edx
e354a5
+	jz	L(align_four_vec_loop)
e354a5
+
e354a5
+	vpcmpeqb -(VEC_SIZE * 4)(%rax), %ymm6, %ymm0
e354a5
+	sub	$(VEC_SIZE * 5),	%rax
e354a5
+	vpmovmskb %ymm0, %edx
e354a5
+	test	%edx, %edx
e354a5
+	jnz	L(exit_null_on_second_vector)
e354a5
+
e354a5
+	vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1
e354a5
+	vpmovmskb %ymm1, %edx
e354a5
+	test	%edx, %edx
e354a5
+	jnz	L(exit_null_on_third_vector)
e354a5
+
e354a5
+	vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2
e354a5
+	vpmovmskb %ymm2, %edx
e354a5
+	test	%edx, %edx
e354a5
+	jnz	L(exit_null_on_fourth_vector)
e354a5
+
e354a5
+	vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3
e354a5
+	vpmovmskb %ymm3, %edx
e354a5
+	sub	%rdi, %rax
e354a5
+	bsf	%rdx, %rdx
e354a5
+	add	%rdx, %rax
e354a5
+	add	$(VEC_SIZE * 4), %rax
e354a5
+	jmp	L(StartStrcpyPart)
e354a5
+
e354a5
+	.p2align 4
e354a5
+L(exit):
e354a5
+	sub	%rdi, %rax
e354a5
+L(exit_null_on_first_vector):
e354a5
+	bsf	%rdx, %rdx
e354a5
+	add	%rdx, %rax
e354a5
+	jmp	L(StartStrcpyPart)
e354a5
+
e354a5
+	.p2align 4
e354a5
+L(exit_null_on_second_vector):
e354a5
+	sub	%rdi, %rax
e354a5
+	bsf	%rdx, %rdx
e354a5
+	add	%rdx, %rax
e354a5
+	add	$VEC_SIZE, %rax
e354a5
+	jmp	L(StartStrcpyPart)
e354a5
+
e354a5
+	.p2align 4
e354a5
+L(exit_null_on_third_vector):
e354a5
+	sub	%rdi, %rax
e354a5
+	bsf	%rdx, %rdx
e354a5
+	add	%rdx, %rax
e354a5
+	add	$(VEC_SIZE * 2), %rax
e354a5
+	jmp	L(StartStrcpyPart)
e354a5
+
e354a5
+	.p2align 4
e354a5
+L(exit_null_on_fourth_vector):
e354a5
+	sub	%rdi, %rax
e354a5
+	bsf	%rdx, %rdx
e354a5
+	add	%rdx, %rax
e354a5
+	add	$(VEC_SIZE * 3), %rax
e354a5
+	jmp	L(StartStrcpyPart)
e354a5
+
e354a5
+	.p2align 4
e354a5
+L(exit_null_on_fifth_vector):
e354a5
+	sub	%rdi, %rax
e354a5
+	bsf	%rdx, %rdx
e354a5
+	add	%rdx, %rax
e354a5
+	add	$(VEC_SIZE * 4), %rax
e354a5
+
e354a5
+	.p2align 4
e354a5
+L(StartStrcpyPart):
e354a5
+	lea	(%r9, %rax), %rdi
e354a5
+	mov	%rsi, %rcx
e354a5
+	mov	%r9, %rax      /* save result */
e354a5
+
e354a5
+# ifdef USE_AS_STRNCAT
e354a5
+	test	%r8, %r8
e354a5
+	jz	L(ExitZero)
e354a5
+#  define USE_AS_STRNCPY
e354a5
+# endif
e354a5
+
e354a5
+# include "strcpy-avx2.S"
e354a5
+#endif
e354a5
diff --git a/sysdeps/x86_64/multiarch/strcat.c b/sysdeps/x86_64/multiarch/strcat.c
e354a5
index 1f7f6263f35ba402..694b9b2405827bd4 100644
e354a5
--- a/sysdeps/x86_64/multiarch/strcat.c
e354a5
+++ b/sysdeps/x86_64/multiarch/strcat.c
e354a5
@@ -24,7 +24,7 @@
e354a5
 # undef strcat
e354a5
 
e354a5
 # define SYMBOL_NAME strcat
e354a5
-# include "ifunc-unaligned-ssse3.h"
e354a5
+# include "ifunc-strcpy.h"
e354a5
 
e354a5
 libc_ifunc_redirected (__redirect_strcat, strcat, IFUNC_SELECTOR ());
e354a5
 
e354a5
diff --git a/sysdeps/x86_64/multiarch/strcpy-avx2.S b/sysdeps/x86_64/multiarch/strcpy-avx2.S
e354a5
new file mode 100644
e354a5
index 0000000000000000..81677f9060773a49
e354a5
--- /dev/null
e354a5
+++ b/sysdeps/x86_64/multiarch/strcpy-avx2.S
e354a5
@@ -0,0 +1,1022 @@
e354a5
+/* strcpy with AVX2
e354a5
+   Copyright (C) 2011-2018 Free Software Foundation, Inc.
e354a5
+   Contributed by Intel Corporation.
e354a5
+   This file is part of the GNU C Library.
e354a5
+
e354a5
+   The GNU C Library is free software; you can redistribute it and/or
e354a5
+   modify it under the terms of the GNU Lesser General Public
e354a5
+   License as published by the Free Software Foundation; either
e354a5
+   version 2.1 of the License, or (at your option) any later version.
e354a5
+
e354a5
+   The GNU C Library is distributed in the hope that it will be useful,
e354a5
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
e354a5
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
e354a5
+   Lesser General Public License for more details.
e354a5
+
e354a5
+   You should have received a copy of the GNU Lesser General Public
e354a5
+   License along with the GNU C Library; if not, see
e354a5
+   <http://www.gnu.org/licenses/>.  */
e354a5
+
e354a5
+#if IS_IN (libc)
e354a5
+
e354a5
+# ifndef USE_AS_STRCAT
e354a5
+#  include <sysdep.h>
e354a5
+
e354a5
+#  ifndef STRCPY
e354a5
+#   define STRCPY  __strcpy_avx2
e354a5
+#  endif
e354a5
+
e354a5
+# endif
e354a5
+
e354a5
+/* Number of bytes in a vector register */
e354a5
+# ifndef VEC_SIZE
e354a5
+#  define VEC_SIZE	32
e354a5
+# endif
e354a5
+
e354a5
+# ifndef VZEROUPPER
e354a5
+#  define VZEROUPPER	vzeroupper
e354a5
+# endif
e354a5
+
e354a5
+/* zero register */
e354a5
+#define xmmZ	xmm0
e354a5
+#define ymmZ	ymm0
e354a5
+
e354a5
+/* mask register */
e354a5
+#define ymmM	ymm1
e354a5
+
e354a5
+# ifndef USE_AS_STRCAT
e354a5
+
e354a5
+	.section .text.avx,"ax",@progbits
e354a5
+ENTRY (STRCPY)
e354a5
+#  ifdef USE_AS_STRNCPY
e354a5
+	mov	%rdx, %r8
e354a5
+	test	%r8, %r8
e354a5
+	jz	L(ExitZero)
e354a5
+#  endif
e354a5
+	mov	%rsi, %rcx
e354a5
+#  ifndef USE_AS_STPCPY
e354a5
+	mov	%rdi, %rax      /* save result */
e354a5
+#  endif
e354a5
+
e354a5
+# endif
e354a5
+
e354a5
+	vpxor	%xmmZ, %xmmZ, %xmmZ
e354a5
+
e354a5
+	and	$((VEC_SIZE * 4) - 1), %ecx
e354a5
+	cmp	$(VEC_SIZE * 2), %ecx
e354a5
+	jbe	L(SourceStringAlignmentLessTwoVecSize)
e354a5
+
e354a5
+	and	$-VEC_SIZE, %rsi
e354a5
+	and	$(VEC_SIZE - 1), %ecx
e354a5
+
e354a5
+	vpcmpeqb (%rsi), %ymmZ, %ymmM
e354a5
+	vpmovmskb %ymmM, %edx
e354a5
+	shr	%cl, %rdx
e354a5
+
e354a5
+# ifdef USE_AS_STRNCPY
e354a5
+#  if defined USE_AS_STPCPY || defined USE_AS_STRCAT
e354a5
+	mov	$VEC_SIZE, %r10
e354a5
+	sub	%rcx, %r10
e354a5
+	cmp	%r10, %r8
e354a5
+#  else
e354a5
+	mov	$(VEC_SIZE + 1), %r10
e354a5
+	sub	%rcx, %r10
e354a5
+	cmp	%r10, %r8
e354a5
+#  endif
e354a5
+	jbe	L(CopyVecSizeTailCase2OrCase3)
e354a5
+# endif
e354a5
+	test	%edx, %edx
e354a5
+	jnz	L(CopyVecSizeTail)
e354a5
+
e354a5
+	vpcmpeqb VEC_SIZE(%rsi), %ymmZ, %ymm2
e354a5
+	vpmovmskb %ymm2, %edx
e354a5
+
e354a5
+# ifdef USE_AS_STRNCPY
e354a5
+	add	$VEC_SIZE, %r10
e354a5
+	cmp	%r10, %r8
e354a5
+	jbe	L(CopyTwoVecSizeCase2OrCase3)
e354a5
+# endif
e354a5
+	test	%edx, %edx
e354a5
+	jnz	L(CopyTwoVecSize)
e354a5
+
e354a5
+	vmovdqu (%rsi, %rcx), %ymm2   /* copy VEC_SIZE bytes */
e354a5
+	vmovdqu %ymm2, (%rdi)
e354a5
+
e354a5
+/* Main copy: source is now VEC_SIZE-aligned (aligned loads); destination uses unaligned stores.  */
e354a5
+	.p2align 4
e354a5
+L(UnalignVecSizeBoth):
e354a5
+	sub	%rcx, %rdi
e354a5
+# ifdef USE_AS_STRNCPY
e354a5
+	add	%rcx, %r8
e354a5
+	sbb	%rcx, %rcx
e354a5
+	or	%rcx, %r8
e354a5
+# endif
e354a5
+	mov	$VEC_SIZE, %rcx
e354a5
+	vmovdqa (%rsi, %rcx), %ymm2
e354a5
+	vmovdqu %ymm2, (%rdi, %rcx)
e354a5
+	vmovdqa VEC_SIZE(%rsi, %rcx), %ymm2
e354a5
+	vpcmpeqb %ymm2, %ymmZ, %ymmM
e354a5
+	vpmovmskb %ymmM, %edx
e354a5
+	add	$VEC_SIZE, %rcx
e354a5
+# ifdef USE_AS_STRNCPY
e354a5
+	sub	$(VEC_SIZE * 3), %r8
e354a5
+	jbe	L(CopyVecSizeCase2OrCase3)
e354a5
+# endif
e354a5
+	test	%edx, %edx
e354a5
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
e354a5
+	jnz	L(CopyVecSizeUnalignedVec2)
e354a5
+# else
e354a5
+	jnz	L(CopyVecSize)
e354a5
+# endif
e354a5
+
e354a5
+	vmovdqu %ymm2, (%rdi, %rcx)
e354a5
+	vmovdqa VEC_SIZE(%rsi, %rcx), %ymm3
e354a5
+	vpcmpeqb %ymm3, %ymmZ, %ymmM
e354a5
+	vpmovmskb %ymmM, %edx
e354a5
+	add	$VEC_SIZE, %rcx
e354a5
+# ifdef USE_AS_STRNCPY
e354a5
+	sub	$VEC_SIZE, %r8
e354a5
+	jbe	L(CopyVecSizeCase2OrCase3)
e354a5
+# endif
e354a5
+	test	%edx, %edx
e354a5
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
e354a5
+	jnz	L(CopyVecSizeUnalignedVec3)
e354a5
+# else
e354a5
+	jnz	L(CopyVecSize)
e354a5
+# endif
e354a5
+
e354a5
+	vmovdqu %ymm3, (%rdi, %rcx)
e354a5
+	vmovdqa VEC_SIZE(%rsi, %rcx), %ymm4
e354a5
+	vpcmpeqb %ymm4, %ymmZ, %ymmM
e354a5
+	vpmovmskb %ymmM, %edx
e354a5
+	add	$VEC_SIZE, %rcx
e354a5
+# ifdef USE_AS_STRNCPY
e354a5
+	sub	$VEC_SIZE, %r8
e354a5
+	jbe	L(CopyVecSizeCase2OrCase3)
e354a5
+# endif
e354a5
+	test	%edx, %edx
e354a5
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
e354a5
+	jnz	L(CopyVecSizeUnalignedVec4)
e354a5
+# else
e354a5
+	jnz	L(CopyVecSize)
e354a5
+# endif
e354a5
+
e354a5
+	vmovdqu %ymm4, (%rdi, %rcx)
e354a5
+	vmovdqa VEC_SIZE(%rsi, %rcx), %ymm2
e354a5
+	vpcmpeqb %ymm2, %ymmZ, %ymmM
e354a5
+	vpmovmskb %ymmM, %edx
e354a5
+	add	$VEC_SIZE, %rcx
e354a5
+# ifdef USE_AS_STRNCPY
e354a5
+	sub	$VEC_SIZE, %r8
e354a5
+	jbe	L(CopyVecSizeCase2OrCase3)
e354a5
+# endif
e354a5
+	test	%edx, %edx
e354a5
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
e354a5
+	jnz	L(CopyVecSizeUnalignedVec2)
e354a5
+# else
e354a5
+	jnz	L(CopyVecSize)
e354a5
+# endif
e354a5
+
e354a5
+	vmovdqu %ymm2, (%rdi, %rcx)
e354a5
+	vmovdqa VEC_SIZE(%rsi, %rcx), %ymm2
e354a5
+	vpcmpeqb %ymm2, %ymmZ, %ymmM
e354a5
+	vpmovmskb %ymmM, %edx
e354a5
+	add	$VEC_SIZE, %rcx
e354a5
+# ifdef USE_AS_STRNCPY
e354a5
+	sub	$VEC_SIZE, %r8
e354a5
+	jbe	L(CopyVecSizeCase2OrCase3)
e354a5
+# endif
e354a5
+	test	%edx, %edx
e354a5
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
e354a5
+	jnz	L(CopyVecSizeUnalignedVec2)
e354a5
+# else
e354a5
+	jnz	L(CopyVecSize)
e354a5
+# endif
e354a5
+
e354a5
+	vmovdqa VEC_SIZE(%rsi, %rcx), %ymm3
e354a5
+	vmovdqu %ymm2, (%rdi, %rcx)
e354a5
+	vpcmpeqb %ymm3, %ymmZ, %ymmM
e354a5
+	vpmovmskb %ymmM, %edx
e354a5
+	add	$VEC_SIZE, %rcx
e354a5
+# ifdef USE_AS_STRNCPY
e354a5
+	sub	$VEC_SIZE, %r8
e354a5
+	jbe	L(CopyVecSizeCase2OrCase3)
e354a5
+# endif
e354a5
+	test	%edx, %edx
e354a5
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
e354a5
+	jnz	L(CopyVecSizeUnalignedVec3)
e354a5
+# else
e354a5
+	jnz	L(CopyVecSize)
e354a5
+# endif
e354a5
+
e354a5
+	vmovdqu %ymm3, (%rdi, %rcx)
e354a5
+	mov	%rsi, %rdx
e354a5
+	lea	VEC_SIZE(%rsi, %rcx), %rsi
e354a5
+	and	$-(VEC_SIZE * 4), %rsi
e354a5
+	sub	%rsi, %rdx
e354a5
+	sub	%rdx, %rdi
e354a5
+# ifdef USE_AS_STRNCPY
e354a5
+	lea	(VEC_SIZE * 8)(%r8, %rdx), %r8
e354a5
+# endif
e354a5
+L(UnalignedFourVecSizeLoop):
e354a5
+	vmovdqa (%rsi), %ymm4
e354a5
+	vmovdqa VEC_SIZE(%rsi), %ymm5
e354a5
+	vmovdqa (VEC_SIZE * 2)(%rsi), %ymm6
e354a5
+	vmovdqa (VEC_SIZE * 3)(%rsi), %ymm7
e354a5
+	vpminub %ymm5, %ymm4, %ymm2
e354a5
+	vpminub %ymm7, %ymm6, %ymm3
e354a5
+	vpminub %ymm2, %ymm3, %ymm3
e354a5
+	vpcmpeqb %ymmM, %ymm3, %ymm3
e354a5
+	vpmovmskb %ymm3, %edx
e354a5
+# ifdef USE_AS_STRNCPY
e354a5
+	sub	$(VEC_SIZE * 4), %r8
e354a5
+	jbe	L(UnalignedLeaveCase2OrCase3)
e354a5
+# endif
e354a5
+	test	%edx, %edx
e354a5
+	jnz	L(UnalignedFourVecSizeLeave)
e354a5
+
e354a5
+L(UnalignedFourVecSizeLoop_start):
e354a5
+	add	$(VEC_SIZE * 4), %rdi
e354a5
+	add	$(VEC_SIZE * 4), %rsi
e354a5
+	vmovdqu %ymm4, -(VEC_SIZE * 4)(%rdi)
e354a5
+	vmovdqa (%rsi), %ymm4
e354a5
+	vmovdqu %ymm5, -(VEC_SIZE * 3)(%rdi)
e354a5
+	vmovdqa VEC_SIZE(%rsi), %ymm5
e354a5
+	vpminub %ymm5, %ymm4, %ymm2
e354a5
+	vmovdqu %ymm6, -(VEC_SIZE * 2)(%rdi)
e354a5
+	vmovdqa (VEC_SIZE * 2)(%rsi), %ymm6
e354a5
+	vmovdqu %ymm7, -VEC_SIZE(%rdi)
e354a5
+	vmovdqa (VEC_SIZE * 3)(%rsi), %ymm7
e354a5
+	vpminub %ymm7, %ymm6, %ymm3
e354a5
+	vpminub %ymm2, %ymm3, %ymm3
e354a5
+	vpcmpeqb %ymmM, %ymm3, %ymm3
e354a5
+	vpmovmskb %ymm3, %edx
e354a5
+# ifdef USE_AS_STRNCPY
e354a5
+	sub	$(VEC_SIZE * 4), %r8
e354a5
+	jbe	L(UnalignedLeaveCase2OrCase3)
e354a5
+# endif
e354a5
+	test	%edx, %edx
e354a5
+	jz	L(UnalignedFourVecSizeLoop_start)
e354a5
+
e354a5
+L(UnalignedFourVecSizeLeave):
e354a5
+	vpcmpeqb %ymm4, %ymmZ, %ymmM
e354a5
+	vpmovmskb %ymmM, %edx
e354a5
+	test	%edx, %edx
e354a5
+	jnz	L(CopyVecSizeUnaligned_0)
e354a5
+
e354a5
+	vpcmpeqb %ymm5, %ymmZ, %ymmM
e354a5
+	vpmovmskb %ymmM, %ecx
e354a5
+	test	%ecx, %ecx
e354a5
+	jnz	L(CopyVecSizeUnaligned_16)
e354a5
+
e354a5
+	vpcmpeqb %ymm6, %ymmZ, %ymmM
e354a5
+	vpmovmskb %ymmM, %edx
e354a5
+	test	%edx, %edx
e354a5
+	jnz	L(CopyVecSizeUnaligned_32)
e354a5
+
e354a5
+	vpcmpeqb %ymm7, %ymmZ, %ymmM
e354a5
+	vpmovmskb %ymmM, %ecx
e354a5
+	bsf	%ecx, %edx
e354a5
+	vmovdqu %ymm4, (%rdi)
e354a5
+	vmovdqu %ymm5, VEC_SIZE(%rdi)
e354a5
+	vmovdqu %ymm6, (VEC_SIZE * 2)(%rdi)
e354a5
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
e354a5
+# ifdef USE_AS_STPCPY
e354a5
+	lea	(VEC_SIZE * 3)(%rdi, %rdx), %rax
e354a5
+# endif
e354a5
+	vmovdqu %ymm7, (VEC_SIZE * 3)(%rdi)
e354a5
+	add	$(VEC_SIZE - 1), %r8
e354a5
+	sub	%rdx, %r8
e354a5
+	lea	((VEC_SIZE * 3) + 1)(%rdi, %rdx), %rdi
e354a5
+	jmp	L(StrncpyFillTailWithZero)
e354a5
+# else
e354a5
+	add	$(VEC_SIZE * 3), %rsi
e354a5
+	add	$(VEC_SIZE * 3), %rdi
e354a5
+	jmp	L(CopyVecSizeExit)
e354a5
+# endif
e354a5
+
e354a5
+/* Source is at most VEC_SIZE * 2 bytes into its (VEC_SIZE * 4)-aligned block: the two unaligned loads below stay inside it.  */
e354a5
+
e354a5
+L(SourceStringAlignmentLessTwoVecSize):
e354a5
+	vmovdqu (%rsi), %ymm3
e354a5
+	vmovdqu VEC_SIZE(%rsi), %ymm2
e354a5
+	vpcmpeqb %ymm3, %ymmZ, %ymmM
e354a5
+	vpmovmskb %ymmM, %edx
e354a5
+
e354a5
+# ifdef USE_AS_STRNCPY
e354a5
+#  if defined USE_AS_STPCPY || defined USE_AS_STRCAT
e354a5
+	cmp	$VEC_SIZE, %r8
e354a5
+#  else
e354a5
+	cmp	$(VEC_SIZE + 1), %r8
e354a5
+#  endif
e354a5
+	jbe	L(CopyVecSizeTail1Case2OrCase3)
e354a5
+# endif
e354a5
+	test	%edx, %edx
e354a5
+	jnz	L(CopyVecSizeTail1)
e354a5
+
e354a5
+	vmovdqu %ymm3, (%rdi)
e354a5
+	vpcmpeqb %ymm2, %ymmZ, %ymmM
e354a5
+	vpmovmskb %ymmM, %edx
e354a5
+
e354a5
+# ifdef USE_AS_STRNCPY
e354a5
+#  if defined USE_AS_STPCPY || defined USE_AS_STRCAT
e354a5
+	cmp	$(VEC_SIZE * 2), %r8
e354a5
+#  else
e354a5
+	cmp	$((VEC_SIZE * 2) + 1), %r8
e354a5
+#  endif
e354a5
+	jbe	L(CopyTwoVecSize1Case2OrCase3)
e354a5
+# endif
e354a5
+	test	%edx, %edx
e354a5
+	jnz	L(CopyTwoVecSize1)
e354a5
+
e354a5
+	and	$-VEC_SIZE, %rsi
e354a5
+	and	$(VEC_SIZE - 1), %ecx
e354a5
+	jmp	L(UnalignVecSizeBoth)
e354a5
+
e354a5
+/*------End of main part with loops---------------------*/
e354a5
+
e354a5
+/* Case1 */
e354a5
+
e354a5
+# if (!defined USE_AS_STRNCPY) || (defined USE_AS_STRCAT)
e354a5
+	.p2align 4
e354a5
+L(CopyVecSize):
e354a5
+	add	%rcx, %rdi
e354a5
+# endif
e354a5
+L(CopyVecSizeTail):
e354a5
+	add	%rcx, %rsi
e354a5
+L(CopyVecSizeTail1):
e354a5
+	bsf	%edx, %edx
e354a5
+L(CopyVecSizeExit):
e354a5
+	cmp	$32, %edx
e354a5
+	jae	L(Exit32_63)
e354a5
+	cmp	$16, %edx
e354a5
+	jae	L(Exit16_31)
e354a5
+	cmp	$8, %edx
e354a5
+	jae	L(Exit8_15)
e354a5
+	cmp	$4, %edx
e354a5
+	jae	L(Exit4_7)
e354a5
+	cmp	$3, %edx
e354a5
+	je	L(Exit3)
e354a5
+	cmp	$1, %edx
e354a5
+	ja	L(Exit2)
e354a5
+	je	L(Exit1)
e354a5
+	movb	$0, (%rdi)
e354a5
+# ifdef USE_AS_STPCPY
e354a5
+	lea	(%rdi), %rax
e354a5
+# endif
e354a5
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
e354a5
+	sub	$1, %r8
e354a5
+	lea	1(%rdi), %rdi
e354a5
+	jnz	L(StrncpyFillTailWithZero)
e354a5
+# endif
e354a5
+	VZEROUPPER
e354a5
+	ret
e354a5
+
e354a5
+	.p2align 4
e354a5
+L(CopyTwoVecSize1):
e354a5
+	add	$VEC_SIZE, %rsi
e354a5
+	add	$VEC_SIZE, %rdi
e354a5
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
e354a5
+	sub	$VEC_SIZE, %r8
e354a5
+# endif
e354a5
+	jmp	L(CopyVecSizeTail1)
e354a5
+
e354a5
+	.p2align 4
e354a5
+L(CopyTwoVecSize):
e354a5
+	bsf	%edx, %edx
e354a5
+	add	%rcx, %rsi
e354a5
+	add	$VEC_SIZE, %edx
e354a5
+	sub	%ecx, %edx
e354a5
+	jmp	L(CopyVecSizeExit)
e354a5
+
e354a5
+	.p2align 4
e354a5
+L(CopyVecSizeUnaligned_0):
e354a5
+	bsf	%edx, %edx
e354a5
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
e354a5
+# ifdef USE_AS_STPCPY
e354a5
+	lea	(%rdi, %rdx), %rax
e354a5
+# endif
e354a5
+	vmovdqu %ymm4, (%rdi)
e354a5
+	add	$((VEC_SIZE * 4) - 1), %r8
e354a5
+	sub	%rdx, %r8
e354a5
+	lea	1(%rdi, %rdx), %rdi
e354a5
+	jmp	L(StrncpyFillTailWithZero)
e354a5
+# else
e354a5
+	jmp	L(CopyVecSizeExit)
e354a5
+# endif
e354a5
+
e354a5
+	.p2align 4
e354a5
+L(CopyVecSizeUnaligned_16):
e354a5
+	bsf	%ecx, %edx
e354a5
+	vmovdqu %ymm4, (%rdi)
e354a5
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
e354a5
+# ifdef USE_AS_STPCPY
e354a5
+	lea	VEC_SIZE(%rdi, %rdx), %rax
e354a5
+# endif
e354a5
+	vmovdqu %ymm5, VEC_SIZE(%rdi)
e354a5
+	add	$((VEC_SIZE * 3) - 1), %r8
e354a5
+	sub	%rdx, %r8
e354a5
+	lea	(VEC_SIZE + 1)(%rdi, %rdx), %rdi
e354a5
+	jmp	L(StrncpyFillTailWithZero)
e354a5
+# else
e354a5
+	add	$VEC_SIZE, %rsi
e354a5
+	add	$VEC_SIZE, %rdi
e354a5
+	jmp	L(CopyVecSizeExit)
e354a5
+# endif
e354a5
+
e354a5
+	.p2align 4
e354a5
+L(CopyVecSizeUnaligned_32):	/* Mask hit two vectors in: %ymm4..%ymm6 hold the data, %edx the mask.  */
e354a5
+	bsf	%edx, %edx	/* %edx = index of the lowest set mask bit.  */
e354a5
+	vmovdqu %ymm4, (%rdi)	/* The first two vectors are stored in every build.  */
e354a5
+	vmovdqu %ymm5, VEC_SIZE(%rdi)
e354a5
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
e354a5
+# ifdef USE_AS_STPCPY
e354a5
+	lea	(VEC_SIZE * 2)(%rdi, %rdx), %rax	/* Return value: destination + 2 * VEC_SIZE + index.  */
e354a5
+# endif
e354a5
+	vmovdqu %ymm6, (VEC_SIZE * 2)(%rdi)	/* Store the third vector whole; the fill rewrites its tail.  */
e354a5
+	add	$((VEC_SIZE * 2) - 1), %r8	/* NOTE(review): constant depends on a bias applied to %r8 outside this chunk - confirm.  */
e354a5
+	sub	%rdx, %r8	/* %r8 = count handed to the zero-fill code.  */
e354a5
+	lea	((VEC_SIZE * 2) + 1)(%rdi, %rdx), %rdi	/* %rdi = first byte after the indexed one.  */
e354a5
+	jmp	L(StrncpyFillTailWithZero)
e354a5
+# else
e354a5
+	add	$(VEC_SIZE * 2), %rsi	/* Other builds: skip the two vectors already stored...  */
e354a5
+	add	$(VEC_SIZE * 2), %rdi
e354a5
+	jmp	L(CopyVecSizeExit)	/* ...and let the size dispatcher copy the rest.  */
e354a5
+# endif
e354a5
+
e354a5
+# ifdef USE_AS_STRNCPY
e354a5
+#  ifndef USE_AS_STRCAT
e354a5
+	.p2align 4
e354a5
+L(CopyVecSizeUnalignedVec6):	/* Store %ymm6 at destination + %rcx, then run the shared exit.  */
e354a5
+	vmovdqu %ymm6, (%rdi, %rcx)
e354a5
+	jmp	L(CopyVecSizeVecExit)
e354a5
+
e354a5
+	.p2align 4
e354a5
+L(CopyVecSizeUnalignedVec5):	/* Same as above for %ymm5.  */
e354a5
+	vmovdqu %ymm5, (%rdi, %rcx)
e354a5
+	jmp	L(CopyVecSizeVecExit)
e354a5
+
e354a5
+	.p2align 4
e354a5
+L(CopyVecSizeUnalignedVec4):	/* Same as above for %ymm4.  */
e354a5
+	vmovdqu %ymm4, (%rdi, %rcx)
e354a5
+	jmp	L(CopyVecSizeVecExit)
e354a5
+
e354a5
+	.p2align 4
e354a5
+L(CopyVecSizeUnalignedVec3):	/* Same as above for %ymm3.  */
e354a5
+	vmovdqu %ymm3, (%rdi, %rcx)
e354a5
+	jmp	L(CopyVecSizeVecExit)
e354a5
+#  endif
e354a5
+
e354a5
+/* Case2 */
e354a5
+
e354a5
+	.p2align 4
e354a5
+L(CopyVecSizeCase2):	/* Mask in %edx is nonzero: pick between the mask index and the length limit.  */
e354a5
+	add	$VEC_SIZE, %r8	/* NOTE(review): presumably restores a VEC_SIZE taken off %r8 by the caller - confirm.  */
e354a5
+	add	%rcx, %rdi	/* Move both pointers to the vector at offset %rcx.  */
e354a5
+	add	%rcx, %rsi
e354a5
+	bsf	%edx, %edx	/* %edx = index of the lowest set mask bit.  */
e354a5
+	cmp	%r8d, %edx
e354a5
+	jb	L(CopyVecSizeExit)	/* Index below the limit (unsigned): ordinary index-based exit.  */
e354a5
+	jmp	L(StrncpyExit)	/* Otherwise copy exactly %r8 bytes.  */
e354a5
+
e354a5
+	.p2align 4
e354a5
+L(CopyTwoVecSizeCase2):	/* As CopyTwoVecSize, but the length limit in %r8 may win over the mask index.  */
e354a5
+	add	%rcx, %rsi	/* Advance the source pointer by %rcx; %rdi is left alone here.  */
e354a5
+	bsf	%edx, %edx	/* %edx = index of the lowest set mask bit.  */
e354a5
+	add	$VEC_SIZE, %edx
e354a5
+	sub	%ecx, %edx	/* %edx = index + VEC_SIZE - %ecx.  */
e354a5
+	cmp	%r8d, %edx
e354a5
+	jb	L(CopyVecSizeExit)	/* Index below the limit (unsigned): ordinary index-based exit.  */
e354a5
+	jmp	L(StrncpyExit)	/* Otherwise copy exactly %r8 bytes.  */
e354a5
+
e354a5
+L(CopyVecSizeTailCase2):	/* Advance the source by %rcx, then choose between mask index and length limit.  */
e354a5
+	add	%rcx, %rsi
e354a5
+	bsf	%edx, %edx	/* %edx = index of the lowest set mask bit.  */
e354a5
+	cmp	%r8d, %edx
e354a5
+	jb	L(CopyVecSizeExit)	/* Index below the limit (unsigned): index-based exit.  */
e354a5
+	jmp	L(StrncpyExit)	/* Otherwise copy exactly %r8 bytes.  */
e354a5
+
e354a5
+L(CopyVecSizeTail1Case2):	/* Same choice with the pointers used as they are.  */
e354a5
+	bsf	%edx, %edx	/* %edx = index of the lowest set mask bit.  */
e354a5
+	cmp	%r8d, %edx
e354a5
+	jb	L(CopyVecSizeExit)	/* Index below the limit (unsigned): index-based exit.  */
e354a5
+	jmp	L(StrncpyExit)	/* Otherwise copy exactly %r8 bytes.  */
e354a5
+
e354a5
+/* Case2 or Case3,  Case3 */
e354a5
+
e354a5
+	.p2align 4
e354a5
+L(CopyVecSizeCase2OrCase3):	/* Split on the mask: nonzero is Case2, zero falls through to Case3.  */
e354a5
+	test	%rdx, %rdx
e354a5
+	jnz	L(CopyVecSizeCase2)
e354a5
+L(CopyVecSizeCase3):	/* No mask bit set: only the length limit ends the copy.  */
e354a5
+	add	$VEC_SIZE, %r8	/* NOTE(review): presumably restores a VEC_SIZE taken off %r8 by the caller - confirm.  */
e354a5
+	add	%rcx, %rdi	/* Move both pointers to the vector at offset %rcx.  */
e354a5
+	add	%rcx, %rsi
e354a5
+	jmp	L(StrncpyExit)	/* Copy exactly %r8 bytes.  */
e354a5
+
e354a5
+	.p2align 4
e354a5
+L(CopyTwoVecSizeCase2OrCase3):	/* Split on the mask: nonzero goes to CopyTwoVecSizeCase2.  */
e354a5
+	test	%rdx, %rdx
e354a5
+	jnz	L(CopyTwoVecSizeCase2)
e354a5
+	add	%rcx, %rsi	/* Mask is zero: advance the source by %rcx...  */
e354a5
+	jmp	L(StrncpyExit)	/* ...and copy exactly %r8 bytes.  */
e354a5
+
e354a5
+	.p2align 4
e354a5
+L(CopyVecSizeTailCase2OrCase3):	/* Split on the mask: nonzero goes to CopyVecSizeTailCase2.  */
e354a5
+	test	%rdx, %rdx
e354a5
+	jnz	L(CopyVecSizeTailCase2)
e354a5
+	add	%rcx, %rsi	/* Mask is zero: advance the source by %rcx...  */
e354a5
+	jmp	L(StrncpyExit)	/* ...and copy exactly %r8 bytes.  */
e354a5
+
e354a5
+	.p2align 4
e354a5
+L(CopyTwoVecSize1Case2OrCase3):	/* Step pointers and length past one vector, then fall into the tail check.  */
e354a5
+	add	$VEC_SIZE, %rdi
e354a5
+	add	$VEC_SIZE, %rsi
e354a5
+	sub	$VEC_SIZE, %r8
e354a5
+L(CopyVecSizeTail1Case2OrCase3):	/* Split on the mask: nonzero goes to CopyVecSizeTail1Case2.  */
e354a5
+	test	%rdx, %rdx
e354a5
+	jnz	L(CopyVecSizeTail1Case2)
e354a5
+	jmp	L(StrncpyExit)	/* Mask is zero: copy exactly %r8 bytes.  */
e354a5
+# endif
e354a5
+
e354a5
+/* End of the labels that copy 1..VEC_SIZE bytes and 1..(VEC_SIZE * 2) bytes.  */
e354a5
+
e354a5
+	.p2align 4
e354a5
+L(Exit1):	/* Copy two bytes from source to destination; the dispatcher comes here when the index in %edx is 1.  */
e354a5
+	movzwl	(%rsi), %edx	/* One 16-bit load and store moves both bytes.  */
e354a5
+	mov	%dx, (%rdi)
e354a5
+# ifdef USE_AS_STPCPY
e354a5
+	lea	1(%rdi), %rax	/* Return value: address of the second byte written.  */
e354a5
+# endif
e354a5
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
e354a5
+	sub	$2, %r8	/* Two bytes of the length are used up.  */
e354a5
+	lea	2(%rdi), %rdi	/* lea leaves the flags alone, so jnz still tests the sub above.  */
e354a5
+	jnz	L(StrncpyFillTailWithZero)	/* Length left over: zero-fill it.  */
e354a5
+# endif
e354a5
+	VZEROUPPER
e354a5
+	ret
e354a5
+
e354a5
+	.p2align 4
e354a5
+L(Exit2):	/* Copy two bytes and write a zero byte after them (three bytes in total).  */
e354a5
+	movzwl	(%rsi), %ecx
e354a5
+	mov	%cx, (%rdi)
e354a5
+	movb	$0, 2(%rdi)	/* The third byte is stored as a constant 0 instead of being loaded.  */
e354a5
+# ifdef USE_AS_STPCPY
e354a5
+	lea	2(%rdi), %rax	/* Return value: address of the zero byte.  */
e354a5
+# endif
e354a5
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
e354a5
+	sub	$3, %r8	/* Three bytes of the length are used up.  */
e354a5
+	lea	3(%rdi), %rdi	/* lea leaves the flags alone, so jnz still tests the sub above.  */
e354a5
+	jnz	L(StrncpyFillTailWithZero)	/* Length left over: zero-fill it.  */
e354a5
+# endif
e354a5
+	VZEROUPPER
e354a5
+	ret
e354a5
+
e354a5
+	.p2align 4
e354a5
+L(Exit3):	/* Copy four bytes with a single 32-bit load and store.  */
e354a5
+	mov	(%rsi), %edx
e354a5
+	mov	%edx, (%rdi)
e354a5
+# ifdef USE_AS_STPCPY
e354a5
+	lea	3(%rdi), %rax	/* Return value: address of the fourth byte written.  */
e354a5
+# endif
e354a5
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
e354a5
+	sub	$4, %r8	/* Four bytes of the length are used up.  */
e354a5
+	lea	4(%rdi), %rdi	/* lea leaves the flags alone, so jnz still tests the sub above.  */
e354a5
+	jnz	L(StrncpyFillTailWithZero)	/* Length left over: zero-fill it.  */
e354a5
+# endif
e354a5
+	VZEROUPPER
e354a5
+	ret
e354a5
+
e354a5
+	.p2align 4
e354a5
+L(Exit4_7):	/* Copy %rdx + 1 bytes using two 4-byte moves that may overlap.  */
e354a5
+	mov	(%rsi), %ecx	/* First four bytes.  */
e354a5
+	mov	%ecx, (%rdi)
e354a5
+	mov	-3(%rsi, %rdx), %ecx	/* Last four bytes, ending at offset %rdx inclusive.  */
e354a5
+	mov	%ecx, -3(%rdi, %rdx)
e354a5
+# ifdef USE_AS_STPCPY
e354a5
+	lea	(%rdi, %rdx), %rax	/* Return value: address of the last byte written.  */
e354a5
+# endif
e354a5
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
e354a5
+	sub	%rdx, %r8
e354a5
+	sub	$1, %r8	/* %r8 -= %rdx + 1; this sub sets the flags tested by jnz.  */
e354a5
+	lea	1(%rdi, %rdx), %rdi	/* First byte after the copy; lea does not touch the flags.  */
e354a5
+	jnz	L(StrncpyFillTailWithZero)	/* Length left over: zero-fill it.  */
e354a5
+# endif
e354a5
+	VZEROUPPER
e354a5
+	ret
e354a5
+
e354a5
+	.p2align 4
e354a5
+L(Exit8_15):	/* Copy %rdx + 1 bytes using two 8-byte moves that may overlap.  */
e354a5
+	mov	(%rsi), %rcx	/* First eight bytes.  */
e354a5
+	mov	-7(%rsi, %rdx), %r9	/* Last eight bytes, ending at offset %rdx inclusive.  */
e354a5
+	mov	%rcx, (%rdi)	/* Both loads are done before either store.  */
e354a5
+	mov	%r9, -7(%rdi, %rdx)
e354a5
+# ifdef USE_AS_STPCPY
e354a5
+	lea	(%rdi, %rdx), %rax	/* Return value: address of the last byte written.  */
e354a5
+# endif
e354a5
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
e354a5
+	sub	%rdx, %r8
e354a5
+	sub	$1, %r8	/* %r8 -= %rdx + 1; this sub sets the flags tested by jnz.  */
e354a5
+	lea	1(%rdi, %rdx), %rdi	/* First byte after the copy; lea does not touch the flags.  */
e354a5
+	jnz	L(StrncpyFillTailWithZero)	/* Length left over: zero-fill it.  */
e354a5
+# endif
e354a5
+	VZEROUPPER
e354a5
+	ret
e354a5
+
e354a5
+	.p2align 4
e354a5
+L(Exit16_31):	/* Copy %rdx + 1 bytes using two 16-byte moves that may overlap.  */
e354a5
+	vmovdqu (%rsi), %xmm2	/* First sixteen bytes.  */
e354a5
+	vmovdqu -15(%rsi, %rdx), %xmm3	/* Last sixteen bytes, ending at offset %rdx inclusive.  */
e354a5
+	vmovdqu %xmm2, (%rdi)	/* Both loads are done before either store.  */
e354a5
+	vmovdqu %xmm3, -15(%rdi, %rdx)
e354a5
+# ifdef USE_AS_STPCPY
e354a5
+	lea	(%rdi, %rdx), %rax	/* Return value: address of the last byte written.  */
e354a5
+# endif
e354a5
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
e354a5
+	sub %rdx, %r8
e354a5
+	sub $1, %r8	/* %r8 -= %rdx + 1; this sub sets the flags tested by jnz.  */
e354a5
+	lea 1(%rdi, %rdx), %rdi	/* First byte after the copy; lea does not touch the flags.  */
e354a5
+	jnz L(StrncpyFillTailWithZero)	/* Length left over: zero-fill it.  */
e354a5
+# endif
e354a5
+	VZEROUPPER
e354a5
+	ret
e354a5
+
e354a5
+	.p2align 4
e354a5
+L(Exit32_63):	/* Copy %rdx + 1 bytes using two 32-byte moves that may overlap.  */
e354a5
+	vmovdqu (%rsi), %ymm2	/* First thirty-two bytes.  */
e354a5
+	vmovdqu -31(%rsi, %rdx), %ymm3	/* Last thirty-two bytes, ending at offset %rdx inclusive.  */
e354a5
+	vmovdqu %ymm2, (%rdi)	/* Both loads are done before either store.  */
e354a5
+	vmovdqu %ymm3, -31(%rdi, %rdx)
e354a5
+# ifdef USE_AS_STPCPY
e354a5
+	lea	(%rdi, %rdx), %rax	/* Return value: address of the last byte written.  */
e354a5
+# endif
e354a5
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
e354a5
+	sub	%rdx, %r8
e354a5
+	sub	$1, %r8	/* %r8 -= %rdx + 1; this sub sets the flags tested by jnz.  */
e354a5
+	lea	1(%rdi, %rdx), %rdi	/* First byte after the copy; lea does not touch the flags.  */
e354a5
+	jnz	L(StrncpyFillTailWithZero)	/* Length left over: zero-fill it.  */
e354a5
+# endif
e354a5
+	VZEROUPPER
e354a5
+	ret
e354a5
+
e354a5
+# ifdef USE_AS_STRNCPY
e354a5
+
e354a5
+	.p2align 4
e354a5
+L(StrncpyExit1):	/* Length-limited exit: copy exactly one byte.  */
e354a5
+	movzbl	(%rsi), %edx
e354a5
+	mov	%dl, (%rdi)
e354a5
+#  ifdef USE_AS_STPCPY
e354a5
+	lea	1(%rdi), %rax	/* Return value: one past the byte written.  */
e354a5
+#  endif
e354a5
+#  ifdef USE_AS_STRCAT
e354a5
+	movb	$0, 1(%rdi)	/* strcat builds append a zero byte after the copied data.  */
e354a5
+#  endif
e354a5
+	VZEROUPPER
e354a5
+	ret
e354a5
+
e354a5
+	.p2align 4
e354a5
+L(StrncpyExit2):	/* Length-limited exit: copy exactly two bytes.  */
e354a5
+	movzwl	(%rsi), %edx
e354a5
+	mov	%dx, (%rdi)
e354a5
+#  ifdef USE_AS_STPCPY
e354a5
+	lea	2(%rdi), %rax	/* Return value: one past the bytes written.  */
e354a5
+#  endif
e354a5
+#  ifdef USE_AS_STRCAT
e354a5
+	movb	$0, 2(%rdi)	/* strcat builds append a zero byte after the copied data.  */
e354a5
+#  endif
e354a5
+	VZEROUPPER
e354a5
+	ret
e354a5
+
e354a5
+	.p2align 4
e354a5
+L(StrncpyExit3_4):	/* Length-limited exit: copy %r8 (3 or 4) bytes with two 2-byte moves that may overlap.  */
e354a5
+	movzwl	(%rsi), %ecx	/* First two bytes.  */
e354a5
+	movzwl	-2(%rsi, %r8), %edx	/* Last two bytes of the %r8-byte range.  */
e354a5
+	mov	%cx, (%rdi)
e354a5
+	mov	%dx, -2(%rdi, %r8)
e354a5
+#  ifdef USE_AS_STPCPY
e354a5
+	lea	(%rdi, %r8), %rax	/* Return value: one past the bytes written.  */
e354a5
+#  endif
e354a5
+#  ifdef USE_AS_STRCAT
e354a5
+	movb	$0, (%rdi, %r8)	/* strcat builds append a zero byte after the copied data.  */
e354a5
+#  endif
e354a5
+	VZEROUPPER
e354a5
+	ret
e354a5
+
e354a5
+	.p2align 4
e354a5
+L(StrncpyExit5_8):	/* Length-limited exit: copy %r8 (5..8) bytes with two 4-byte moves that may overlap.  */
e354a5
+	mov	(%rsi), %ecx	/* First four bytes.  */
e354a5
+	mov	-4(%rsi, %r8), %edx	/* Last four bytes of the %r8-byte range.  */
e354a5
+	mov	%ecx, (%rdi)
e354a5
+	mov	%edx, -4(%rdi, %r8)
e354a5
+#  ifdef USE_AS_STPCPY
e354a5
+	lea	(%rdi, %r8), %rax	/* Return value: one past the bytes written.  */
e354a5
+#  endif
e354a5
+#  ifdef USE_AS_STRCAT
e354a5
+	movb	$0, (%rdi, %r8)	/* strcat builds append a zero byte after the copied data.  */
e354a5
+#  endif
e354a5
+	VZEROUPPER
e354a5
+	ret
e354a5
+
e354a5
+	.p2align 4
e354a5
+L(StrncpyExit9_16):	/* Length-limited exit: copy %r8 (9..16) bytes with two 8-byte moves that may overlap.  */
e354a5
+	mov	(%rsi), %rcx	/* First eight bytes.  */
e354a5
+	mov	-8(%rsi, %r8), %rdx	/* Last eight bytes of the %r8-byte range.  */
e354a5
+	mov	%rcx, (%rdi)
e354a5
+	mov	%rdx, -8(%rdi, %r8)
e354a5
+#  ifdef USE_AS_STPCPY
e354a5
+	lea	(%rdi, %r8), %rax	/* Return value: one past the bytes written.  */
e354a5
+#  endif
e354a5
+#  ifdef USE_AS_STRCAT
e354a5
+	movb	$0, (%rdi, %r8)	/* strcat builds append a zero byte after the copied data.  */
e354a5
+#  endif
e354a5
+	VZEROUPPER
e354a5
+	ret
e354a5
+
e354a5
+	.p2align 4
e354a5
+L(StrncpyExit17_32):	/* Length-limited exit: copy %r8 (17..32) bytes with two 16-byte moves that may overlap.  */
e354a5
+	vmovdqu (%rsi), %xmm2	/* First sixteen bytes.  */
e354a5
+	vmovdqu -16(%rsi, %r8), %xmm3	/* Last sixteen bytes of the %r8-byte range.  */
e354a5
+	vmovdqu %xmm2, (%rdi)
e354a5
+	vmovdqu %xmm3, -16(%rdi, %r8)
e354a5
+#  ifdef USE_AS_STPCPY
e354a5
+	lea	(%rdi, %r8), %rax	/* Return value: one past the bytes written.  */
e354a5
+#  endif
e354a5
+#  ifdef USE_AS_STRCAT
e354a5
+	movb	$0, (%rdi, %r8)	/* strcat builds append a zero byte after the copied data.  */
e354a5
+#  endif
e354a5
+	VZEROUPPER
e354a5
+	ret
e354a5
+
e354a5
+	.p2align 4
e354a5
+L(StrncpyExit33_64):	/* Length-limited exit: copy %r8 bytes with two VEC_SIZE moves that may overlap.  */
e354a5
+	/* One vector from the start of the range and one ending at its last byte.  */
e354a5
+	vmovdqu (%rsi), %ymm2
e354a5
+	vmovdqu -VEC_SIZE(%rsi, %r8), %ymm3
e354a5
+	vmovdqu %ymm2, (%rdi)
e354a5
+	vmovdqu %ymm3, -VEC_SIZE(%rdi, %r8)
e354a5
+#  ifdef USE_AS_STPCPY
e354a5
+	lea	(%rdi, %r8), %rax	/* Return value: one past the bytes written.  */
e354a5
+#  endif
e354a5
+#  ifdef USE_AS_STRCAT
e354a5
+	movb	$0, (%rdi, %r8)	/* strcat builds append a zero byte after the copied data.  */
e354a5
+#  endif
e354a5
+	VZEROUPPER
e354a5
+	ret
e354a5
+
e354a5
+	.p2align 4
e354a5
+L(StrncpyExit65):	/* Length-limited exit: copy exactly 65 bytes.  */
e354a5
+	/* Layout as offset/size: 0/32, 32/32, 64/1.  The offsets are literal numbers, not VEC_SIZE.  */
e354a5
+	vmovdqu (%rsi), %ymm2
e354a5
+	vmovdqu 32(%rsi), %ymm3
e354a5
+	mov	64(%rsi), %cl	/* The 65th byte travels through %cl.  */
e354a5
+	vmovdqu %ymm2, (%rdi)
e354a5
+	vmovdqu %ymm3, 32(%rdi)
e354a5
+	mov	%cl, 64(%rdi)
e354a5
+#  ifdef USE_AS_STPCPY
e354a5
+	lea	65(%rdi), %rax	/* Return value: one past the bytes written.  */
e354a5
+#  endif
e354a5
+#  ifdef USE_AS_STRCAT
e354a5
+	movb	$0, 65(%rdi)	/* strcat builds append a zero byte after the copied data.  */
e354a5
+#  endif
e354a5
+	VZEROUPPER
e354a5
+	ret
e354a5
+
e354a5
+#  ifndef USE_AS_STRCAT
e354a5
+
e354a5
+	.p2align 4
e354a5
+L(Fill1):	/* Zero one byte at %rdi; %edx was cleared on entry to StrncpyFillTailWithZero.  */
e354a5
+	mov	%dl, (%rdi)
e354a5
+	VZEROUPPER
e354a5
+	ret
e354a5
+
e354a5
+	.p2align 4
e354a5
+L(Fill2):	/* Zero two bytes at %rdi.  */
e354a5
+	mov	%dx, (%rdi)
e354a5
+	VZEROUPPER
e354a5
+	ret
e354a5
+
e354a5
+	.p2align 4
e354a5
+L(Fill3_4):	/* Zero %r8 (3 or 4) bytes with two 2-byte stores that may overlap.  */
e354a5
+	mov	%dx, (%rdi)
e354a5
+	mov     %dx, -2(%rdi, %r8)	/* Second store ends at the last byte of the range.  */
e354a5
+	VZEROUPPER
e354a5
+	ret
e354a5
+
e354a5
+	.p2align 4
e354a5
+L(Fill5_8):	/* Zero %r8 (5..8) bytes with two 4-byte stores that may overlap.  */
e354a5
+	mov	%edx, (%rdi)
e354a5
+	mov     %edx, -4(%rdi, %r8)	/* Second store ends at the last byte of the range.  */
e354a5
+	VZEROUPPER
e354a5
+	ret
e354a5
+
e354a5
+	.p2align 4
e354a5
+L(Fill9_16):	/* Zero %r8 (9..16) bytes with two 8-byte stores that may overlap.  */
e354a5
+	mov	%rdx, (%rdi)
e354a5
+	mov	%rdx, -8(%rdi, %r8)	/* Second store ends at the last byte of the range.  */
e354a5
+	VZEROUPPER
e354a5
+	ret
e354a5
+
e354a5
+	.p2align 4
e354a5
+L(Fill17_32):	/* Fill %r8 (17 or more) bytes with two 16-byte stores of %xmmZ that may overlap.  */
e354a5
+	vmovdqu %xmmZ, (%rdi)	/* NOTE(review): %xmmZ is defined outside this chunk; presumably all zero - confirm.  */
e354a5
+	vmovdqu %xmmZ, -16(%rdi, %r8)	/* Second store ends at the last byte of the range.  */
e354a5
+	VZEROUPPER
e354a5
+	ret
e354a5
+
e354a5
+	.p2align 4
e354a5
+L(CopyVecSizeUnalignedVec2):	/* Store %ymm2 at destination + %rcx, then fall through to the shared exit.  */
e354a5
+	vmovdqu %ymm2, (%rdi, %rcx)
e354a5
+
e354a5
+	.p2align 4
e354a5
+L(CopyVecSizeVecExit):	/* Set up %rdi and %r8 for the fill that follows; there is no jump, execution runs on into it.  */
e354a5
+	bsf	%edx, %edx	/* %edx = index of the lowest set mask bit.  */
e354a5
+	add	$(VEC_SIZE - 1), %r8	/* NOTE(review): the VEC_SIZE presumably undoes a bias on %r8 from the caller - confirm.  */
e354a5
+	add	%rcx, %rdi	/* Move the destination to the vector just stored.  */
e354a5
+#   ifdef USE_AS_STPCPY
e354a5
+	lea	(%rdi, %rdx), %rax	/* Return value: destination + index.  */
e354a5
+#   endif
e354a5
+	sub	%rdx, %r8	/* %r8 = count handed to the zero-fill code.  */
e354a5
+	lea	1(%rdi, %rdx), %rdi	/* %rdi = first byte after the indexed one.  */
e354a5
+
e354a5
+	.p2align 4
e354a5
+L(StrncpyFillTailWithZero):	/* Fill %r8 bytes starting at %rdi, using %ymmZ for whole vectors.  */
e354a5
+	xor	%edx, %edx	/* %rdx = 0: the scalar zero source used by the Fill1..Fill9_16 exits.  */
e354a5
+	sub	$VEC_SIZE, %r8
e354a5
+	jbe	L(StrncpyFillExit)	/* At most VEC_SIZE bytes: use the small-size exits.  */
e354a5
+
e354a5
+	vmovdqu %ymmZ, (%rdi)	/* One unaligned vector covers the bytes up to the next aligned address.  */
e354a5
+	add	$VEC_SIZE, %rdi
e354a5
+
e354a5
+	mov	%rdi, %rsi	/* %rsi is reused as scratch from here on.  */
e354a5
+	and	$(VEC_SIZE - 1), %esi	/* %rsi = misalignment of %rdi.  */
e354a5
+	sub	%rsi, %rdi	/* Round %rdi down to a VEC_SIZE boundary...  */
e354a5
+	add	%rsi, %r8	/* ...and give the bytes stepped back over to the count again.  */
e354a5
+	sub	$(VEC_SIZE * 4), %r8
e354a5
+	jb	L(StrncpyFillLessFourVecSize)	/* Fewer than four vectors left: skip the loop.  */
e354a5
+
e354a5
+L(StrncpyFillLoopVmovdqa):	/* Four aligned vector stores per iteration.  */
e354a5
+	vmovdqa %ymmZ, (%rdi)
e354a5
+	vmovdqa %ymmZ, VEC_SIZE(%rdi)
e354a5
+	vmovdqa %ymmZ, (VEC_SIZE * 2)(%rdi)
e354a5
+	vmovdqa %ymmZ, (VEC_SIZE * 3)(%rdi)
e354a5
+	add	$(VEC_SIZE * 4), %rdi
e354a5
+	sub	$(VEC_SIZE * 4), %r8
e354a5
+	jae	L(StrncpyFillLoopVmovdqa)	/* Loop while the subtraction does not borrow.  */
e354a5
+
e354a5
+L(StrncpyFillLessFourVecSize):	/* Fewer than four vectors remain; %r8 is still offset by -(VEC_SIZE * 4).  */
e354a5
+	add	$(VEC_SIZE * 2), %r8
e354a5
+	jl	L(StrncpyFillLessTwoVecSize)	/* Fewer than two vectors remain.  */
e354a5
+	vmovdqa %ymmZ, (%rdi)	/* Two aligned vector stores.  */
e354a5
+	vmovdqa %ymmZ, VEC_SIZE(%rdi)
e354a5
+	add	$(VEC_SIZE * 2), %rdi
e354a5
+	sub	$VEC_SIZE, %r8
e354a5
+	jl	L(StrncpyFillExit)	/* Less than one vector left: StrncpyFillExit adds the VEC_SIZE back.  */
e354a5
+	vmovdqa %ymmZ, (%rdi)	/* One more aligned vector store.  */
e354a5
+	add	$VEC_SIZE, %rdi
e354a5
+	jmp	L(Fill)	/* %r8 is already the exact remainder here.  */
e354a5
+
e354a5
+	.p2align 4
e354a5
+L(StrncpyFillLessTwoVecSize):	/* Fewer than two vectors remain; %r8 is offset by -(VEC_SIZE * 2).  */
e354a5
+	add	$VEC_SIZE, %r8
e354a5
+	jl	L(StrncpyFillExit)	/* Less than one vector left: StrncpyFillExit adds the VEC_SIZE back.  */
e354a5
+	vmovdqa %ymmZ, (%rdi)	/* One aligned vector store.  */
e354a5
+	add	$VEC_SIZE, %rdi
e354a5
+	jmp	L(Fill)	/* %r8 is already the exact remainder here.  */
e354a5
+
e354a5
+	.p2align 4
e354a5
+L(StrncpyFillExit):	/* Undo the VEC_SIZE taken off %r8 by the jump site, then dispatch.  */
e354a5
+	add	$VEC_SIZE, %r8
e354a5
+L(Fill):	/* Pick the fill routine for the remaining count in %r8d.  */
e354a5
+	cmp	$17, %r8d
e354a5
+	jae	L(Fill17_32)
e354a5
+	cmp	$9, %r8d
e354a5
+	jae	L(Fill9_16)
e354a5
+	cmp	$5, %r8d
e354a5
+	jae	L(Fill5_8)
e354a5
+	cmp	$3, %r8d
e354a5
+	jae	L(Fill3_4)
e354a5
+	cmp	$1, %r8d	/* One compare serves both branches below.  */
e354a5
+	ja	L(Fill2)	/* Count is 2.  */
e354a5
+	je	L(Fill1)	/* Count is 1.  */
e354a5
+	VZEROUPPER	/* Count is 0: nothing left to write.  */
e354a5
+	ret
e354a5
+
e354a5
+/* end of ifndef USE_AS_STRCAT */
e354a5
+#  endif
e354a5
+
e354a5
+	.p2align 4
e354a5
+L(UnalignedLeaveCase2OrCase3):	/* Split on the mask: nonzero is Case2, zero falls through to Case3.  */
e354a5
+	test	%rdx, %rdx
e354a5
+	jnz	L(UnalignedFourVecSizeLeaveCase2)
e354a5
+L(UnalignedFourVecSizeLeaveCase3):	/* Mask is zero: store %ymm4..%ymm7 for as long as the length in %r8 allows.  */
e354a5
+	lea	(VEC_SIZE * 4)(%r8), %rcx
e354a5
+	and	$-VEC_SIZE, %rcx	/* %rcx = (%r8 + VEC_SIZE * 4) rounded down to a multiple of VEC_SIZE; CopyVecSizeCase3 adds it to both pointers.  */
e354a5
+	add	$(VEC_SIZE * 3), %r8
e354a5
+	jl	L(CopyVecSizeCase3)	/* Signed test: no whole vector fits.  */
e354a5
+	vmovdqu %ymm4, (%rdi)
e354a5
+	sub	$VEC_SIZE, %r8
e354a5
+	jb	L(CopyVecSizeCase3)	/* Borrow: the next vector does not fit whole.  */
e354a5
+	vmovdqu %ymm5, VEC_SIZE(%rdi)
e354a5
+	sub	$VEC_SIZE, %r8
e354a5
+	jb	L(CopyVecSizeCase3)
e354a5
+	vmovdqu %ymm6, (VEC_SIZE * 2)(%rdi)
e354a5
+	sub	$VEC_SIZE, %r8
e354a5
+	jb	L(CopyVecSizeCase3)
e354a5
+	vmovdqu %ymm7, (VEC_SIZE * 3)(%rdi)	/* All four vectors fit.  */
e354a5
+#  ifdef USE_AS_STPCPY
e354a5
+	lea	(VEC_SIZE * 4)(%rdi), %rax	/* Return value: one past the four vectors written.  */
e354a5
+#  endif
e354a5
+#  ifdef USE_AS_STRCAT
e354a5
+	movb	$0, (VEC_SIZE * 4)(%rdi)	/* strcat builds append a zero byte after the copied data.  */
e354a5
+#  endif
e354a5
+	VZEROUPPER
e354a5
+	ret
e354a5
+
e354a5
+	.p2align 4
e354a5
+L(UnalignedFourVecSizeLeaveCase2):	/* Check %ymm4..%ymm7 one at a time against both the mask and the length in %r8.  */
e354a5
+	xor	%ecx, %ecx	/* %rcx = byte offset of the vector being examined.  */
e354a5
+	vpcmpeqb %ymm4, %ymmZ, %ymmM
e354a5
+	vpmovmskb %ymmM, %edx	/* %edx = one bit per byte of %ymm4 that equals the matching byte of %ymmZ.  */
e354a5
+	add	$(VEC_SIZE * 3), %r8
e354a5
+	jle	L(CopyVecSizeCase2OrCase3)	/* Signed test: the length ends within this vector.  */
e354a5
+	test	%edx, %edx
e354a5
+#  ifndef USE_AS_STRCAT
e354a5
+	jnz	L(CopyVecSizeUnalignedVec4)
e354a5
+#  else
e354a5
+	jnz	L(CopyVecSize)
e354a5
+#  endif
e354a5
+	vpcmpeqb %ymm5, %ymmZ, %ymmM
e354a5
+	vpmovmskb %ymmM, %edx	/* Mask for %ymm5.  */
e354a5
+	vmovdqu %ymm4, (%rdi)	/* No hit in %ymm4: store it whole.  */
e354a5
+	add	$VEC_SIZE, %rcx
e354a5
+	sub	$VEC_SIZE, %r8	/* This sub, not the add above, sets the flags for jbe.  */
e354a5
+	jbe	L(CopyVecSizeCase2OrCase3)	/* Unsigned test: the length ends within this vector.  */
e354a5
+	test	%edx, %edx
e354a5
+#  ifndef USE_AS_STRCAT
e354a5
+	jnz	L(CopyVecSizeUnalignedVec5)
e354a5
+#  else
e354a5
+	jnz	L(CopyVecSize)
e354a5
+#  endif
e354a5
+
e354a5
+	vpcmpeqb %ymm6, %ymmZ, %ymmM
e354a5
+	vpmovmskb %ymmM, %edx	/* Mask for %ymm6.  */
e354a5
+	vmovdqu %ymm5, VEC_SIZE(%rdi)	/* No hit in %ymm5: store it whole.  */
e354a5
+	add	$VEC_SIZE, %rcx
e354a5
+	sub	$VEC_SIZE, %r8	/* This sub, not the add above, sets the flags for jbe.  */
e354a5
+	jbe	L(CopyVecSizeCase2OrCase3)	/* Unsigned test: the length ends within this vector.  */
e354a5
+	test	%edx, %edx
e354a5
+#  ifndef USE_AS_STRCAT
e354a5
+	jnz	L(CopyVecSizeUnalignedVec6)
e354a5
+#  else
e354a5
+	jnz	L(CopyVecSize)
e354a5
+#  endif
e354a5
+
e354a5
+	vpcmpeqb %ymm7, %ymmZ, %ymmM
e354a5
+	vpmovmskb %ymmM, %edx	/* Mask for %ymm7.  */
e354a5
+	vmovdqu %ymm6, (VEC_SIZE * 2)(%rdi)	/* No hit in %ymm6: store it whole.  */
e354a5
+	lea	VEC_SIZE(%rdi, %rcx), %rdi	/* Point both registers at the fourth vector.  */
e354a5
+	lea	VEC_SIZE(%rsi, %rcx), %rsi
e354a5
+	bsf	%edx, %edx	/* %edx = index of the lowest set mask bit.  */
e354a5
+	cmp	%r8d, %edx
e354a5
+	jb	L(CopyVecSizeExit)	/* Index below the limit; otherwise execution continues into StrncpyExit.  */
e354a5
+L(StrncpyExit):	/* Copy exactly %r8d bytes: pick the routine for that size.  */
e354a5
+	cmp	$65, %r8d
e354a5
+	je	L(StrncpyExit65)	/* Only the exact value 65 takes this exit.  */
e354a5
+	cmp	$33, %r8d
e354a5
+	jae	L(StrncpyExit33_64)
e354a5
+	cmp	$17, %r8d
e354a5
+	jae	L(StrncpyExit17_32)
e354a5
+	cmp	$9, %r8d
e354a5
+	jae	L(StrncpyExit9_16)
e354a5
+	cmp	$5, %r8d
e354a5
+	jae	L(StrncpyExit5_8)
e354a5
+	cmp	$3, %r8d
e354a5
+	jae	L(StrncpyExit3_4)
e354a5
+	cmp	$1, %r8d	/* One compare serves both branches below.  */
e354a5
+	ja	L(StrncpyExit2)	/* Length is 2.  */
e354a5
+	je	L(StrncpyExit1)	/* Length is 1.  */
e354a5
+#  ifdef USE_AS_STPCPY
e354a5
+	mov	%rdi, %rax	/* Length is 0: the return value is the destination itself.  */
e354a5
+#  endif
e354a5
+#  ifdef USE_AS_STRCAT
e354a5
+	movb	$0, (%rdi)	/* strcat builds still write the terminating zero byte.  */
e354a5
+#  endif
e354a5
+	VZEROUPPER
e354a5
+	ret
e354a5
+
e354a5
+	.p2align 4
e354a5
+L(ExitZero):	/* Return without writing anything.  */
e354a5
+#  ifndef USE_AS_STRCAT
e354a5
+	mov	%rdi, %rax	/* Non-strcat builds return the destination pointer.  */
e354a5
+#  endif
e354a5
+	VZEROUPPER
e354a5
+	ret
e354a5
+
e354a5
+# endif
e354a5
+
e354a5
+# ifndef USE_AS_STRCAT
e354a5
+END (STRCPY)
e354a5
+# else
e354a5
+END (STRCAT)
e354a5
+# endif
e354a5
+#endif
e354a5
diff --git a/sysdeps/x86_64/multiarch/strcpy.c b/sysdeps/x86_64/multiarch/strcpy.c
e354a5
index 12e0e3ffe20602c6..ecf90d4b044a1b01 100644
e354a5
--- a/sysdeps/x86_64/multiarch/strcpy.c
e354a5
+++ b/sysdeps/x86_64/multiarch/strcpy.c
e354a5
@@ -24,7 +24,7 @@
e354a5
 # undef strcpy
e354a5
 
e354a5
 # define SYMBOL_NAME strcpy
e354a5
-# include "ifunc-unaligned-ssse3.h"
e354a5
+# include "ifunc-strcpy.h"
e354a5
 
e354a5
 libc_ifunc_redirected (__redirect_strcpy, strcpy, IFUNC_SELECTOR ());
e354a5
 
e354a5
diff --git a/sysdeps/x86_64/multiarch/strncat-avx2.S b/sysdeps/x86_64/multiarch/strncat-avx2.S
e354a5
new file mode 100644
e354a5
index 0000000000000000..bfefa659bb6281fa
e354a5
--- /dev/null
e354a5
+++ b/sysdeps/x86_64/multiarch/strncat-avx2.S
e354a5
@@ -0,0 +1,3 @@
e354a5
+#define USE_AS_STRNCAT	/* Ask the shared source included below for its strncat variant.  */
e354a5
+#define STRCAT __strncat_avx2	/* Symbol name the shared source should use for this build.  */
e354a5
+#include "strcat-avx2.S"	/* All of the code lives in the shared strcat source.  */
e354a5
diff --git a/sysdeps/x86_64/multiarch/strncat.c b/sysdeps/x86_64/multiarch/strncat.c
e354a5
index 841c165565add132..74f7d028ae23d700 100644
e354a5
--- a/sysdeps/x86_64/multiarch/strncat.c
e354a5
+++ b/sysdeps/x86_64/multiarch/strncat.c
e354a5
@@ -24,7 +24,7 @@
e354a5
 # undef strncat
e354a5
 
e354a5
 # define SYMBOL_NAME strncat
e354a5
-# include "ifunc-unaligned-ssse3.h"
e354a5
+# include "ifunc-strcpy.h"
e354a5
 
e354a5
 libc_ifunc_redirected (__redirect_strncat, strncat, IFUNC_SELECTOR ());
e354a5
 strong_alias (strncat, __strncat);
e354a5
diff --git a/sysdeps/x86_64/multiarch/strncpy-avx2.S b/sysdeps/x86_64/multiarch/strncpy-avx2.S
e354a5
new file mode 100644
e354a5
index 0000000000000000..9ef8c87627dc4924
e354a5
--- /dev/null
e354a5
+++ b/sysdeps/x86_64/multiarch/strncpy-avx2.S
e354a5
@@ -0,0 +1,3 @@
e354a5
+#define USE_AS_STRNCPY	/* Enable the USE_AS_STRNCPY paths (length limit and zero fill) in the shared source.  */
e354a5
+#define STRCPY __strncpy_avx2	/* Symbol name used by END (STRCPY) in the shared source.  */
e354a5
+#include "strcpy-avx2.S"	/* All of the code lives in the shared strcpy source.  */
e354a5
diff --git a/sysdeps/x86_64/multiarch/strncpy.c b/sysdeps/x86_64/multiarch/strncpy.c
e354a5
index 3c3de8b18ebb177f..93dfb4cfde79467a 100644
e354a5
--- a/sysdeps/x86_64/multiarch/strncpy.c
e354a5
+++ b/sysdeps/x86_64/multiarch/strncpy.c
e354a5
@@ -24,7 +24,7 @@
e354a5
 # undef strncpy
e354a5
 
e354a5
 # define SYMBOL_NAME strncpy
e354a5
-# include "ifunc-unaligned-ssse3.h"
e354a5
+# include "ifunc-strcpy.h"
e354a5
 
e354a5
 libc_ifunc_redirected (__redirect_strncpy, strncpy, IFUNC_SELECTOR ());
e354a5