commit 1a153e47fcc9401d8ea424ad86569a57ed0f8c52
Author: Leonardo Sandoval <leonardo.sandoval.gonzalez@linux.intel.com>
Date:   Mon Oct 8 08:59:50 2018 -0500

    x86-64: Optimize strcat/strncat, strcpy/strncpy and stpcpy/stpncpy with AVX2

    Optimize x86-64 strcat/strncat, strcpy/strncpy and stpcpy/stpncpy with
    AVX2.  The implementations use vector comparisons as much as possible.
    In general, the larger the source string, the greater the performance
    gain observed, reaching speedups of 1.6x compared to the SSE2 unaligned
    routines.  Select the AVX2 strcat/strncat, strcpy/strncpy and
    stpcpy/stpncpy on AVX2 machines where vzeroupper is preferred and AVX
    unaligned loads are fast.

            * sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Add
            strcat-avx2, strncat-avx2, strcpy-avx2, strncpy-avx2,
            stpcpy-avx2 and stpncpy-avx2.
            * sysdeps/x86_64/multiarch/ifunc-impl-list.c
            (__libc_ifunc_impl_list): Add tests for __strcat_avx2,
            __strncat_avx2, __strcpy_avx2, __strncpy_avx2, __stpcpy_avx2
            and __stpncpy_avx2.
            * sysdeps/x86_64/multiarch/{ifunc-unaligned-ssse3.h =>
            ifunc-strcpy.h}: Rename header to a more generic name.
            * sysdeps/x86_64/multiarch/ifunc-strcpy.h
            (IFUNC_SELECTOR): Return OPTIMIZE (avx2) on AVX2 machines if
            AVX unaligned load is fast and vzeroupper is preferred.
            * sysdeps/x86_64/multiarch/stpcpy-avx2.S: New file.
            * sysdeps/x86_64/multiarch/stpncpy-avx2.S: Likewise.
            * sysdeps/x86_64/multiarch/strcat-avx2.S: Likewise.
            * sysdeps/x86_64/multiarch/strcpy-avx2.S: Likewise.
            * sysdeps/x86_64/multiarch/strncat-avx2.S: Likewise.
            * sysdeps/x86_64/multiarch/strncpy-avx2.S: Likewise.
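
    Note: the 1.6x figure can be sanity-checked with glibc's benchtests
    under benchtests/; the standalone harness below is only an
    illustrative sketch added to this write-up (the string lengths,
    iteration count, and asm sink are arbitrary choices of this note,
    not part of the patch).  Built against a glibc with and without the
    patch, the gap should widen as the source string grows.

        #include <stdio.h>
        #include <stdlib.h>
        #include <string.h>
        #include <time.h>

        /* Nanoseconds per strcpy call for one source length.  */
        static double
        bench_one (size_t len, size_t iters)
        {
          char *src = malloc (len + 1);
          char *dst = malloc (len + 1);
          if (src == NULL || dst == NULL)
            exit (EXIT_FAILURE);
          memset (src, 'a', len);
          src[len] = '\0';

          struct timespec t0, t1;
          clock_gettime (CLOCK_MONOTONIC, &t0);
          for (size_t i = 0; i < iters; i++)
            {
              strcpy (dst, src);
              /* Keep the call from being optimized away.  */
              __asm__ volatile ("" : : "r" (dst) : "memory");
            }
          clock_gettime (CLOCK_MONOTONIC, &t1);

          double ns = (t1.tv_sec - t0.tv_sec) * 1e9
                      + (t1.tv_nsec - t0.tv_nsec);
          free (src);
          free (dst);
          return ns / (double) iters;
        }

        int
        main (void)
        {
          /* The AVX2 win should grow with the source length.  */
          for (size_t len = 16; len <= 4096; len *= 4)
            printf ("len %4zu: %6.1f ns/call\n",
                    len, bench_one (len, 1000000));
          return 0;
        }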
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index bb5e97073520ee51..395e432c092ca17c 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -24,11 +24,14 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c \
 		   strchr-sse2 strchrnul-sse2 strchr-avx2 strchrnul-avx2 \
 		   strrchr-sse2 strrchr-avx2 \
 		   strlen-sse2 strnlen-sse2 strlen-avx2 strnlen-avx2 \
+		   strcat-avx2 strncat-avx2 \
 		   strcat-ssse3 strncat-ssse3\
+		   strcpy-avx2 strncpy-avx2 \
 		   strcpy-sse2 stpcpy-sse2 \
 		   strcpy-ssse3 strncpy-ssse3 stpcpy-ssse3 stpncpy-ssse3 \
 		   strcpy-sse2-unaligned strncpy-sse2-unaligned \
 		   stpcpy-sse2-unaligned stpncpy-sse2-unaligned \
+		   stpcpy-avx2 stpncpy-avx2 \
 		   strcat-sse2 \
 		   strcat-sse2-unaligned strncat-sse2-unaligned \
 		   strchr-sse2-no-bsf memcmp-ssse3 strstr-sse2-unaligned \
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 9aaaef7251b8edfe..8b55bb6954000cc2 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -199,6 +199,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
   IFUNC_IMPL (i, name, stpncpy,
 	      IFUNC_IMPL_ADD (array, i, stpncpy, HAS_CPU_FEATURE (SSSE3),
 			      __stpncpy_ssse3)
+	      IFUNC_IMPL_ADD (array, i, stpncpy, HAS_ARCH_FEATURE (AVX2_Usable),
+			      __stpncpy_avx2)
 	      IFUNC_IMPL_ADD (array, i, stpncpy, 1,
 			      __stpncpy_sse2_unaligned)
 	      IFUNC_IMPL_ADD (array, i, stpncpy, 1, __stpncpy_sse2))
@@ -207,6 +209,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
   IFUNC_IMPL (i, name, stpcpy,
 	      IFUNC_IMPL_ADD (array, i, stpcpy, HAS_CPU_FEATURE (SSSE3),
 			      __stpcpy_ssse3)
+	      IFUNC_IMPL_ADD (array, i, stpcpy, HAS_ARCH_FEATURE (AVX2_Usable),
+			      __stpcpy_avx2)
 	      IFUNC_IMPL_ADD (array, i, stpcpy, 1, __stpcpy_sse2_unaligned)
 	      IFUNC_IMPL_ADD (array, i, stpcpy, 1, __stpcpy_sse2))
 
@@ -239,6 +243,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/x86_64/multiarch/strcat.c.  */
   IFUNC_IMPL (i, name, strcat,
+	      IFUNC_IMPL_ADD (array, i, strcat, HAS_ARCH_FEATURE (AVX2_Usable),
+			      __strcat_avx2)
 	      IFUNC_IMPL_ADD (array, i, strcat, HAS_CPU_FEATURE (SSSE3),
 			      __strcat_ssse3)
 	      IFUNC_IMPL_ADD (array, i, strcat, 1, __strcat_sse2_unaligned)
@@ -280,6 +286,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/x86_64/multiarch/strcpy.c.  */
   IFUNC_IMPL (i, name, strcpy,
+	      IFUNC_IMPL_ADD (array, i, strcpy, HAS_ARCH_FEATURE (AVX2_Usable),
+			      __strcpy_avx2)
 	      IFUNC_IMPL_ADD (array, i, strcpy, HAS_CPU_FEATURE (SSSE3),
 			      __strcpy_ssse3)
 	      IFUNC_IMPL_ADD (array, i, strcpy, 1, __strcpy_sse2_unaligned)
@@ -321,6 +329,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/x86_64/multiarch/strncat.c.  */
   IFUNC_IMPL (i, name, strncat,
+	      IFUNC_IMPL_ADD (array, i, strncat, HAS_ARCH_FEATURE (AVX2_Usable),
+			      __strncat_avx2)
 	      IFUNC_IMPL_ADD (array, i, strncat, HAS_CPU_FEATURE (SSSE3),
 			      __strncat_ssse3)
 	      IFUNC_IMPL_ADD (array, i, strncat, 1,
@@ -329,6 +339,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/x86_64/multiarch/strncpy.c.  */
   IFUNC_IMPL (i, name, strncpy,
+	      IFUNC_IMPL_ADD (array, i, strncpy, HAS_ARCH_FEATURE (AVX2_Usable),
+			      __strncpy_avx2)
 	      IFUNC_IMPL_ADD (array, i, strncpy, HAS_CPU_FEATURE (SSSE3),
 			      __strncpy_ssse3)
 	      IFUNC_IMPL_ADD (array, i, strncpy, 1,
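
Note: these IFUNC_IMPL_ADD entries do not decide which implementation user
code gets at run time; __libc_ifunc_impl_list only enumerates every
candidate so glibc's string tests and benchmarks can exercise each one.
The runtime choice is made by IFUNC_SELECTOR in the header renamed below.
A future variant would be registered the same way; the entry sketched here
is purely hypothetical:

	      IFUNC_IMPL_ADD (array, i, strcpy,
			      HAS_ARCH_FEATURE (AVX512F_Usable),
			      __strcpy_avx512)	/* hypothetical variant */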
diff --git a/sysdeps/x86_64/multiarch/ifunc-unaligned-ssse3.h b/sysdeps/x86_64/multiarch/ifunc-strcpy.h
similarity index 83%
rename from sysdeps/x86_64/multiarch/ifunc-unaligned-ssse3.h
rename to sysdeps/x86_64/multiarch/ifunc-strcpy.h
index 81805f9832345923..4f2286fefccda069 100644
--- a/sysdeps/x86_64/multiarch/ifunc-unaligned-ssse3.h
+++ b/sysdeps/x86_64/multiarch/ifunc-strcpy.h
@@ -24,12 +24,18 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned)
   attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
 
 static inline void *
 IFUNC_SELECTOR (void)
 {
   const struct cpu_features* cpu_features = __get_cpu_features ();
 
+  if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)
+      && CPU_FEATURES_ARCH_P (cpu_features, AVX2_Usable)
+      && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
+    return OPTIMIZE (avx2);
+
   if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Load))
     return OPTIMIZE (sse2_unaligned);
 
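
Note: every wrapper touched below (strcpy.c, stpcpy.c, strncat.c, ...)
defines SYMBOL_NAME and then includes this one header, so the single
selector above serves all six functions: OPTIMIZE (avx2) token-pastes the
per-function symbol such as __strcpy_avx2 or __stpncpy_avx2.  A simplified
model of that plumbing (the macro bodies here are illustrative, not copied
from glibc's init-arch.h):

        /* Illustrative reimplementation of the OPTIMIZE pattern.  */
        #define PASTE(sym, name)   __##sym##_##name
        #define EXPAND(sym, name)  PASTE (sym, name)
        #define OPTIMIZE(name)     EXPAND (SYMBOL_NAME, name)

        #define SYMBOL_NAME strcpy
        /* OPTIMIZE (avx2) now expands to __strcpy_avx2, the symbol
           the selector returns on AVX2-capable machines.  */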
diff --git a/sysdeps/x86_64/multiarch/stpcpy-avx2.S b/sysdeps/x86_64/multiarch/stpcpy-avx2.S
new file mode 100644
index 0000000000000000..f0bd3029fe3047ed
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/stpcpy-avx2.S
@@ -0,0 +1,3 @@
+#define USE_AS_STPCPY
+#define STRCPY __stpcpy_avx2
+#include "strcpy-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/stpcpy.c b/sysdeps/x86_64/multiarch/stpcpy.c
index 1e340fca991a021c..8ffd13b48c83ca8e 100644
--- a/sysdeps/x86_64/multiarch/stpcpy.c
+++ b/sysdeps/x86_64/multiarch/stpcpy.c
@@ -28,7 +28,7 @@
 # undef __stpcpy
 
 # define SYMBOL_NAME stpcpy
-# include "ifunc-unaligned-ssse3.h"
+# include "ifunc-strcpy.h"
 
 libc_ifunc_redirected (__redirect_stpcpy, __stpcpy, IFUNC_SELECTOR ());
 
diff --git a/sysdeps/x86_64/multiarch/stpncpy-avx2.S b/sysdeps/x86_64/multiarch/stpncpy-avx2.S
new file mode 100644
index 0000000000000000..032b0407d08c6a9d
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/stpncpy-avx2.S
@@ -0,0 +1,4 @@
+#define USE_AS_STPCPY
+#define USE_AS_STRNCPY
+#define STRCPY __stpncpy_avx2
+#include "strcpy-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/stpncpy.c b/sysdeps/x86_64/multiarch/stpncpy.c
index 28842ece2b0998e3..f3e203f78cca2e61 100644
--- a/sysdeps/x86_64/multiarch/stpncpy.c
+++ b/sysdeps/x86_64/multiarch/stpncpy.c
@@ -26,7 +26,7 @@
 # undef __stpncpy
 
 # define SYMBOL_NAME stpncpy
-# include "ifunc-unaligned-ssse3.h"
+# include "ifunc-strcpy.h"
 
 libc_ifunc_redirected (__redirect_stpncpy, __stpncpy, IFUNC_SELECTOR ());
 
diff --git a/sysdeps/x86_64/multiarch/strcat-avx2.S b/sysdeps/x86_64/multiarch/strcat-avx2.S
new file mode 100644
index 0000000000000000..b062356427677ca6
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strcat-avx2.S
@@ -0,0 +1,275 @@
+/* strcat with AVX2
+   Copyright (C) 2011-2018 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+
+# ifndef STRCAT
+#  define STRCAT  __strcat_avx2
+# endif
+
+# define USE_AS_STRCAT
+
+/* Number of bytes in a vector register */
+# define VEC_SIZE	32
+
+	.section .text.avx,"ax",@progbits
+ENTRY (STRCAT)
+	mov	%rdi, %r9
+# ifdef USE_AS_STRNCAT
+	mov	%rdx, %r8
+# endif
+
+	xor	%eax, %eax
+	mov	%edi, %ecx
+	and	$((VEC_SIZE * 4) - 1), %ecx
+	vpxor	%xmm6, %xmm6, %xmm6
+	cmp	$(VEC_SIZE * 3), %ecx
+	ja	L(fourth_vector_boundary)
+	vpcmpeqb (%rdi), %ymm6, %ymm0
+	vpmovmskb %ymm0, %edx
+	test	%edx, %edx
+	jnz	L(exit_null_on_first_vector)
+	mov	%rdi, %rax
+	and	$-VEC_SIZE, %rax
+	jmp	L(align_vec_size_start)
+L(fourth_vector_boundary):
+	mov	%rdi, %rax
+	and	$-VEC_SIZE, %rax
+	vpcmpeqb	(%rax), %ymm6, %ymm0
+	mov	$-1, %r10d
+	sub	%rax, %rcx
+	shl	%cl, %r10d
+	vpmovmskb %ymm0, %edx
+	and	%r10d, %edx
+	jnz	L(exit)
+
+L(align_vec_size_start):
+	vpcmpeqb VEC_SIZE(%rax), %ymm6, %ymm0
+	vpmovmskb %ymm0, %edx
+	test	%edx, %edx
+	jnz	L(exit_null_on_second_vector)
+
+	vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1
+	vpmovmskb %ymm1, %edx
+	test	%edx, %edx
+	jnz	L(exit_null_on_third_vector)
+
+	vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2
+	vpmovmskb %ymm2, %edx
+	test	%edx, %edx
+	jnz	L(exit_null_on_fourth_vector)
+
+	vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3
+	vpmovmskb %ymm3, %edx
+	test	%edx, %edx
+	jnz	L(exit_null_on_fifth_vector)
+
+	vpcmpeqb (VEC_SIZE * 5)(%rax), %ymm6, %ymm0
+	add	$(VEC_SIZE * 4), %rax
+	vpmovmskb %ymm0, %edx
+	test	%edx, %edx
+	jnz	L(exit_null_on_second_vector)
+
+	vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1
+	vpmovmskb %ymm1, %edx
+	test	%edx, %edx
+	jnz	L(exit_null_on_third_vector)
+
+	vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2
+	vpmovmskb %ymm2, %edx
+	test	%edx, %edx
+	jnz	L(exit_null_on_fourth_vector)
+
+	vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3
+	vpmovmskb %ymm3, %edx
+	test	%edx, %edx
+	jnz	L(exit_null_on_fifth_vector)
+
+	vpcmpeqb (VEC_SIZE * 5)(%rax), %ymm6, %ymm0
+	add	$(VEC_SIZE * 4), %rax
+	vpmovmskb %ymm0, %edx
+	test	%edx, %edx
+	jnz	L(exit_null_on_second_vector)
+
+	vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1
+	vpmovmskb %ymm1, %edx
+	test	%edx, %edx
+	jnz	L(exit_null_on_third_vector)
+
+	vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2
+	vpmovmskb %ymm2, %edx
+	test	%edx, %edx
+	jnz	L(exit_null_on_fourth_vector)
+
+	vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3
+	vpmovmskb %ymm3, %edx
+	test	%edx, %edx
+	jnz	L(exit_null_on_fifth_vector)
+
+	vpcmpeqb (VEC_SIZE * 5)(%rax), %ymm6, %ymm0
+	add	$(VEC_SIZE * 4), %rax
+	vpmovmskb %ymm0, %edx
+	test	%edx, %edx
+	jnz	L(exit_null_on_second_vector)
+
+	vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1
+	vpmovmskb %ymm1, %edx
+	test	%edx, %edx
+	jnz	L(exit_null_on_third_vector)
+
+	vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2
+	vpmovmskb %ymm2, %edx
+	test	%edx, %edx
+	jnz	L(exit_null_on_fourth_vector)
+
+	vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3
+	vpmovmskb %ymm3, %edx
+	test	%edx, %edx
+	jnz	L(exit_null_on_fifth_vector)
+
+	test	$((VEC_SIZE * 4) - 1), %rax
+	jz	L(align_four_vec_loop)
+
+	vpcmpeqb (VEC_SIZE * 5)(%rax), %ymm6, %ymm0
+	add	$(VEC_SIZE * 5), %rax
+	vpmovmskb %ymm0, %edx
+	test	%edx, %edx
+	jnz	L(exit)
+
+	test	$((VEC_SIZE * 4) - 1), %rax
+	jz	L(align_four_vec_loop)
+
+	vpcmpeqb VEC_SIZE(%rax), %ymm6, %ymm1
+	add	$VEC_SIZE, %rax
+	vpmovmskb %ymm1, %edx
+	test	%edx, %edx
+	jnz	L(exit)
+
+	test	$((VEC_SIZE * 4) - 1), %rax
+	jz	L(align_four_vec_loop)
+
+	vpcmpeqb VEC_SIZE(%rax), %ymm6, %ymm2
+	add	$VEC_SIZE, %rax
+	vpmovmskb %ymm2, %edx
+	test	%edx, %edx
+	jnz	L(exit)
+
+	test	$((VEC_SIZE * 4) - 1), %rax
+	jz	L(align_four_vec_loop)
+
+	vpcmpeqb VEC_SIZE(%rax), %ymm6, %ymm3
+	add	$VEC_SIZE, %rax
+	vpmovmskb %ymm3, %edx
+	test	%edx, %edx
+	jnz	L(exit)
+
+	add	$VEC_SIZE, %rax
+
+	.p2align 4
+L(align_four_vec_loop):
+	vmovaps	(%rax),	%ymm4
+	vpminub	VEC_SIZE(%rax),	%ymm4, %ymm4
+	vmovaps	(VEC_SIZE * 2)(%rax),	%ymm5
+	vpminub	(VEC_SIZE * 3)(%rax),	%ymm5, %ymm5
+	add	$(VEC_SIZE * 4),	%rax
+	vpminub	%ymm4,	%ymm5, %ymm5
+	vpcmpeqb %ymm5,	%ymm6, %ymm5
+	vpmovmskb %ymm5,	%edx
+	test	%edx,	%edx
+	jz	L(align_four_vec_loop)
+
+	vpcmpeqb -(VEC_SIZE * 4)(%rax), %ymm6, %ymm0
+	sub	$(VEC_SIZE * 5),	%rax
+	vpmovmskb %ymm0, %edx
+	test	%edx, %edx
+	jnz	L(exit_null_on_second_vector)
+
+	vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1
+	vpmovmskb %ymm1, %edx
+	test	%edx, %edx
+	jnz	L(exit_null_on_third_vector)
+
+	vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2
+	vpmovmskb %ymm2, %edx
+	test	%edx, %edx
+	jnz	L(exit_null_on_fourth_vector)
+
+	vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3
+	vpmovmskb %ymm3, %edx
+	sub	%rdi, %rax
+	bsf	%rdx, %rdx
+	add	%rdx, %rax
+	add	$(VEC_SIZE * 4), %rax
+	jmp	L(StartStrcpyPart)
+
+	.p2align 4
+L(exit):
+	sub	%rdi, %rax
+L(exit_null_on_first_vector):
+	bsf	%rdx, %rdx
+	add	%rdx, %rax
+	jmp	L(StartStrcpyPart)
+
+	.p2align 4
+L(exit_null_on_second_vector):
+	sub	%rdi, %rax
+	bsf	%rdx, %rdx
+	add	%rdx, %rax
+	add	$VEC_SIZE, %rax
+	jmp	L(StartStrcpyPart)
+
+	.p2align 4
+L(exit_null_on_third_vector):
+	sub	%rdi, %rax
+	bsf	%rdx, %rdx
+	add	%rdx, %rax
+	add	$(VEC_SIZE * 2), %rax
+	jmp	L(StartStrcpyPart)
+
+	.p2align 4
+L(exit_null_on_fourth_vector):
+	sub	%rdi, %rax
+	bsf	%rdx, %rdx
+	add	%rdx, %rax
+	add	$(VEC_SIZE * 3), %rax
+	jmp	L(StartStrcpyPart)
+
+	.p2align 4
+L(exit_null_on_fifth_vector):
+	sub	%rdi, %rax
+	bsf	%rdx, %rdx
+	add	%rdx, %rax
+	add	$(VEC_SIZE * 4), %rax
+
+	.p2align 4
+L(StartStrcpyPart):
+	lea	(%r9, %rax), %rdi
+	mov	%rsi, %rcx
+	mov	%r9, %rax      /* save result */
+
+# ifdef USE_AS_STRNCAT
+	test	%r8, %r8
+	jz	L(ExitZero)
+#  define USE_AS_STRNCPY
+# endif
+
+# include "strcpy-avx2.S"
+#endif
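
Note: the L(align_four_vec_loop) above checks four vectors (128 bytes) per
iteration with a single branch: vpminub folds the four loads into one
vector whose byte is zero exactly when some input had a zero byte in that
lane, so one vpcmpeqb/vpmovmskb pair tests all 128 bytes at once.  A C
intrinsics sketch of the same zero-detection trick (the function name and
the 32-byte-aligned-pointer assumption are this note's, not the patch's;
build with -mavx2):

        #include <immintrin.h>
        #include <stdint.h>

        /* Nonzero iff the 128 bytes at p (32-byte aligned) contain a 0.  */
        static int
        has_zero_byte_4vec (const uint8_t *p)
        {
          __m256i v0 = _mm256_load_si256 ((const __m256i *) p);
          __m256i v1 = _mm256_load_si256 ((const __m256i *) (p + 32));
          __m256i v2 = _mm256_load_si256 ((const __m256i *) (p + 64));
          __m256i v3 = _mm256_load_si256 ((const __m256i *) (p + 96));
          /* Bytewise unsigned min: a result byte is 0 iff that byte is 0
             in at least one input vector.  */
          __m256i m = _mm256_min_epu8 (_mm256_min_epu8 (v0, v1),
                                       _mm256_min_epu8 (v2, v3));
          /* Compare with zero and collect one bit per byte.  */
          __m256i eq = _mm256_cmpeq_epi8 (m, _mm256_setzero_si256 ());
          return _mm256_movemask_epi8 (eq) != 0;
        }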
diff --git a/sysdeps/x86_64/multiarch/strcat.c b/sysdeps/x86_64/multiarch/strcat.c
index 1f7f6263f35ba402..694b9b2405827bd4 100644
--- a/sysdeps/x86_64/multiarch/strcat.c
+++ b/sysdeps/x86_64/multiarch/strcat.c
@@ -24,7 +24,7 @@
 # undef strcat
 
 # define SYMBOL_NAME strcat
-# include "ifunc-unaligned-ssse3.h"
+# include "ifunc-strcpy.h"
 
 libc_ifunc_redirected (__redirect_strcat, strcat, IFUNC_SELECTOR ());
 
diff --git a/sysdeps/x86_64/multiarch/strcpy-avx2.S b/sysdeps/x86_64/multiarch/strcpy-avx2.S
new file mode 100644
index 0000000000000000..81677f9060773a49
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strcpy-avx2.S
@@ -0,0 +1,1022 @@
+/* strcpy with AVX2
+   Copyright (C) 2011-2018 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc)
+
+# ifndef USE_AS_STRCAT
+#  include <sysdep.h>
+
+#  ifndef STRCPY
+#   define STRCPY  __strcpy_avx2
+#  endif
+
+# endif
+
+/* Number of bytes in a vector register */
+# ifndef VEC_SIZE
+#  define VEC_SIZE	32
+# endif
+
+# ifndef VZEROUPPER
+#  define VZEROUPPER	vzeroupper
+# endif
+
+/* zero register */
+#define xmmZ	xmm0
+#define ymmZ	ymm0
+
+/* mask register */
+#define ymmM	ymm1
+
+# ifndef USE_AS_STRCAT
+
+	.section .text.avx,"ax",@progbits
+ENTRY (STRCPY)
+#  ifdef USE_AS_STRNCPY
+	mov	%rdx, %r8
+	test	%r8, %r8
+	jz	L(ExitZero)
+#  endif
+	mov	%rsi, %rcx
+#  ifndef USE_AS_STPCPY
+	mov	%rdi, %rax      /* save result */
+#  endif
+
+# endif
+
+	vpxor	%xmmZ, %xmmZ, %xmmZ
+
+	and	$((VEC_SIZE * 4) - 1), %ecx
+	cmp	$(VEC_SIZE * 2), %ecx
+	jbe	L(SourceStringAlignmentLessTwoVecSize)
+
+	and	$-VEC_SIZE, %rsi
+	and	$(VEC_SIZE - 1), %ecx
+
+	vpcmpeqb (%rsi), %ymmZ, %ymmM
+	vpmovmskb %ymmM, %edx
+	shr	%cl, %rdx
+
+# ifdef USE_AS_STRNCPY
+#  if defined USE_AS_STPCPY || defined USE_AS_STRCAT
+	mov	$VEC_SIZE, %r10
+	sub	%rcx, %r10
+	cmp	%r10, %r8
+#  else
+	mov	$(VEC_SIZE + 1), %r10
+	sub	%rcx, %r10
+	cmp	%r10, %r8
+#  endif
+	jbe	L(CopyVecSizeTailCase2OrCase3)
+# endif
+	test	%edx, %edx
+	jnz	L(CopyVecSizeTail)
+
+	vpcmpeqb VEC_SIZE(%rsi), %ymmZ, %ymm2
+	vpmovmskb %ymm2, %edx
+
+# ifdef USE_AS_STRNCPY
+	add	$VEC_SIZE, %r10
+	cmp	%r10, %r8
+	jbe	L(CopyTwoVecSizeCase2OrCase3)
+# endif
+	test	%edx, %edx
+	jnz	L(CopyTwoVecSize)
+
+	vmovdqu (%rsi, %rcx), %ymm2   /* copy VEC_SIZE bytes */
+	vmovdqu %ymm2, (%rdi)
+
+/* If source address alignment != destination address alignment */
+	.p2align 4
+L(UnalignVecSizeBoth):
+	sub	%rcx, %rdi
+# ifdef USE_AS_STRNCPY
+	add	%rcx, %r8
+	sbb	%rcx, %rcx
+	or	%rcx, %r8
+# endif
+	mov	$VEC_SIZE, %rcx
+	vmovdqa (%rsi, %rcx), %ymm2
+	vmovdqu %ymm2, (%rdi, %rcx)
+	vmovdqa VEC_SIZE(%rsi, %rcx), %ymm2
+	vpcmpeqb %ymm2, %ymmZ, %ymmM
+	vpmovmskb %ymmM, %edx
+	add	$VEC_SIZE, %rcx
+# ifdef USE_AS_STRNCPY
+	sub	$(VEC_SIZE * 3), %r8
+	jbe	L(CopyVecSizeCase2OrCase3)
+# endif
+	test	%edx, %edx
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	jnz	L(CopyVecSizeUnalignedVec2)
+# else
+	jnz	L(CopyVecSize)
+# endif
+
+	vmovdqu %ymm2, (%rdi, %rcx)
+	vmovdqa VEC_SIZE(%rsi, %rcx), %ymm3
+	vpcmpeqb %ymm3, %ymmZ, %ymmM
+	vpmovmskb %ymmM, %edx
+	add	$VEC_SIZE, %rcx
+# ifdef USE_AS_STRNCPY
+	sub	$VEC_SIZE, %r8
+	jbe	L(CopyVecSizeCase2OrCase3)
+# endif
+	test	%edx, %edx
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	jnz	L(CopyVecSizeUnalignedVec3)
+# else
+	jnz	L(CopyVecSize)
+# endif
+
+	vmovdqu %ymm3, (%rdi, %rcx)
+	vmovdqa VEC_SIZE(%rsi, %rcx), %ymm4
+	vpcmpeqb %ymm4, %ymmZ, %ymmM
+	vpmovmskb %ymmM, %edx
+	add	$VEC_SIZE, %rcx
+# ifdef USE_AS_STRNCPY
+	sub	$VEC_SIZE, %r8
+	jbe	L(CopyVecSizeCase2OrCase3)
+# endif
+	test	%edx, %edx
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	jnz	L(CopyVecSizeUnalignedVec4)
+# else
+	jnz	L(CopyVecSize)
+# endif
+
+	vmovdqu %ymm4, (%rdi, %rcx)
+	vmovdqa VEC_SIZE(%rsi, %rcx), %ymm2
+	vpcmpeqb %ymm2, %ymmZ, %ymmM
+	vpmovmskb %ymmM, %edx
+	add	$VEC_SIZE, %rcx
+# ifdef USE_AS_STRNCPY
+	sub	$VEC_SIZE, %r8
+	jbe	L(CopyVecSizeCase2OrCase3)
+# endif
+	test	%edx, %edx
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	jnz	L(CopyVecSizeUnalignedVec2)
+# else
+	jnz	L(CopyVecSize)
+# endif
+
+	vmovdqu %ymm2, (%rdi, %rcx)
+	vmovdqa VEC_SIZE(%rsi, %rcx), %ymm2
+	vpcmpeqb %ymm2, %ymmZ, %ymmM
+	vpmovmskb %ymmM, %edx
+	add	$VEC_SIZE, %rcx
+# ifdef USE_AS_STRNCPY
+	sub	$VEC_SIZE, %r8
+	jbe	L(CopyVecSizeCase2OrCase3)
+# endif
+	test	%edx, %edx
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	jnz	L(CopyVecSizeUnalignedVec2)
+# else
+	jnz	L(CopyVecSize)
+# endif
+
+	vmovdqa VEC_SIZE(%rsi, %rcx), %ymm3
+	vmovdqu %ymm2, (%rdi, %rcx)
+	vpcmpeqb %ymm3, %ymmZ, %ymmM
+	vpmovmskb %ymmM, %edx
+	add	$VEC_SIZE, %rcx
+# ifdef USE_AS_STRNCPY
+	sub	$VEC_SIZE, %r8
+	jbe	L(CopyVecSizeCase2OrCase3)
+# endif
+	test	%edx, %edx
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	jnz	L(CopyVecSizeUnalignedVec3)
+# else
+	jnz	L(CopyVecSize)
+# endif
+
+	vmovdqu %ymm3, (%rdi, %rcx)
+	mov	%rsi, %rdx
+	lea	VEC_SIZE(%rsi, %rcx), %rsi
+	and	$-(VEC_SIZE * 4), %rsi
+	sub	%rsi, %rdx
+	sub	%rdx, %rdi
+# ifdef USE_AS_STRNCPY
+	lea	(VEC_SIZE * 8)(%r8, %rdx), %r8
+# endif
+L(UnalignedFourVecSizeLoop):
+	vmovdqa (%rsi), %ymm4
+	vmovdqa VEC_SIZE(%rsi), %ymm5
+	vmovdqa (VEC_SIZE * 2)(%rsi), %ymm6
+	vmovdqa (VEC_SIZE * 3)(%rsi), %ymm7
+	vpminub %ymm5, %ymm4, %ymm2
+	vpminub %ymm7, %ymm6, %ymm3
+	vpminub %ymm2, %ymm3, %ymm3
+	vpcmpeqb %ymmM, %ymm3, %ymm3
+	vpmovmskb %ymm3, %edx
+# ifdef USE_AS_STRNCPY
+	sub	$(VEC_SIZE * 4), %r8
+	jbe	L(UnalignedLeaveCase2OrCase3)
+# endif
+	test	%edx, %edx
+	jnz	L(UnalignedFourVecSizeLeave)
+
+L(UnalignedFourVecSizeLoop_start):
+	add	$(VEC_SIZE * 4), %rdi
+	add	$(VEC_SIZE * 4), %rsi
+	vmovdqu %ymm4, -(VEC_SIZE * 4)(%rdi)
+	vmovdqa (%rsi), %ymm4
+	vmovdqu %ymm5, -(VEC_SIZE * 3)(%rdi)
+	vmovdqa VEC_SIZE(%rsi), %ymm5
+	vpminub %ymm5, %ymm4, %ymm2
+	vmovdqu %ymm6, -(VEC_SIZE * 2)(%rdi)
+	vmovdqa (VEC_SIZE * 2)(%rsi), %ymm6
+	vmovdqu %ymm7, -VEC_SIZE(%rdi)
+	vmovdqa (VEC_SIZE * 3)(%rsi), %ymm7
+	vpminub %ymm7, %ymm6, %ymm3
+	vpminub %ymm2, %ymm3, %ymm3
+	vpcmpeqb %ymmM, %ymm3, %ymm3
+	vpmovmskb %ymm3, %edx
+# ifdef USE_AS_STRNCPY
+	sub	$(VEC_SIZE * 4), %r8
+	jbe	L(UnalignedLeaveCase2OrCase3)
+# endif
+	test	%edx, %edx
+	jz	L(UnalignedFourVecSizeLoop_start)
+
+L(UnalignedFourVecSizeLeave):
+	vpcmpeqb %ymm4, %ymmZ, %ymmM
+	vpmovmskb %ymmM, %edx
+	test	%edx, %edx
+	jnz	L(CopyVecSizeUnaligned_0)
+
+	vpcmpeqb %ymm5, %ymmZ, %ymmM
+	vpmovmskb %ymmM, %ecx
+	test	%ecx, %ecx
+	jnz	L(CopyVecSizeUnaligned_16)
+
+	vpcmpeqb %ymm6, %ymmZ, %ymmM
+	vpmovmskb %ymmM, %edx
+	test	%edx, %edx
+	jnz	L(CopyVecSizeUnaligned_32)
+
+	vpcmpeqb %ymm7, %ymmZ, %ymmM
+	vpmovmskb %ymmM, %ecx
+	bsf	%ecx, %edx
+	vmovdqu %ymm4, (%rdi)
+	vmovdqu %ymm5, VEC_SIZE(%rdi)
+	vmovdqu %ymm6, (VEC_SIZE * 2)(%rdi)
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+# ifdef USE_AS_STPCPY
+	lea	(VEC_SIZE * 3)(%rdi, %rdx), %rax
+# endif
+	vmovdqu %ymm7, (VEC_SIZE * 3)(%rdi)
+	add	$(VEC_SIZE - 1), %r8
+	sub	%rdx, %r8
+	lea	((VEC_SIZE * 3) + 1)(%rdi, %rdx), %rdi
+	jmp	L(StrncpyFillTailWithZero)
+# else
+	add	$(VEC_SIZE * 3), %rsi
+	add	$(VEC_SIZE * 3), %rdi
+	jmp	L(CopyVecSizeExit)
+# endif
+
+/* If source address alignment == destination address alignment */
+
+L(SourceStringAlignmentLessTwoVecSize):
+	vmovdqu (%rsi), %ymm3
+	vmovdqu VEC_SIZE(%rsi), %ymm2
+	vpcmpeqb %ymm3, %ymmZ, %ymmM
+	vpmovmskb %ymmM, %edx
+
+# ifdef USE_AS_STRNCPY
+#  if defined USE_AS_STPCPY || defined USE_AS_STRCAT
+	cmp	$VEC_SIZE, %r8
+#  else
+	cmp	$(VEC_SIZE + 1), %r8
+#  endif
+	jbe	L(CopyVecSizeTail1Case2OrCase3)
+# endif
+	test	%edx, %edx
+	jnz	L(CopyVecSizeTail1)
+
+	vmovdqu %ymm3, (%rdi)
+	vpcmpeqb %ymm2, %ymmZ, %ymmM
+	vpmovmskb %ymmM, %edx
+
+# ifdef USE_AS_STRNCPY
+#  if defined USE_AS_STPCPY || defined USE_AS_STRCAT
+	cmp	$(VEC_SIZE * 2), %r8
+#  else
+	cmp	$((VEC_SIZE * 2) + 1), %r8
+#  endif
+	jbe	L(CopyTwoVecSize1Case2OrCase3)
+# endif
+	test	%edx, %edx
+	jnz	L(CopyTwoVecSize1)
+
+	and	$-VEC_SIZE, %rsi
+	and	$(VEC_SIZE - 1), %ecx
+	jmp	L(UnalignVecSizeBoth)
+
+/*------End of main part with loops---------------------*/
+
+/* Case1 */
+
+# if (!defined USE_AS_STRNCPY) || (defined USE_AS_STRCAT)
+	.p2align 4
+L(CopyVecSize):
+	add	%rcx, %rdi
+# endif
+L(CopyVecSizeTail):
+	add	%rcx, %rsi
+L(CopyVecSizeTail1):
+	bsf	%edx, %edx
+L(CopyVecSizeExit):
+	cmp	$32, %edx
+	jae	L(Exit32_63)
+	cmp	$16, %edx
+	jae	L(Exit16_31)
+	cmp	$8, %edx
+	jae	L(Exit8_15)
+	cmp	$4, %edx
+	jae	L(Exit4_7)
+	cmp	$3, %edx
+	je	L(Exit3)
+	cmp	$1, %edx
+	ja	L(Exit2)
+	je	L(Exit1)
+	movb	$0, (%rdi)
+# ifdef USE_AS_STPCPY
+	lea	(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	sub	$1, %r8
+	lea	1(%rdi), %rdi
+	jnz	L(StrncpyFillTailWithZero)
+# endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(CopyTwoVecSize1):
+	add	$VEC_SIZE, %rsi
+	add	$VEC_SIZE, %rdi
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	sub	$VEC_SIZE, %r8
+# endif
+	jmp	L(CopyVecSizeTail1)
+
+	.p2align 4
+L(CopyTwoVecSize):
+	bsf	%edx, %edx
+	add	%rcx, %rsi
+	add	$VEC_SIZE, %edx
+	sub	%ecx, %edx
+	jmp	L(CopyVecSizeExit)
+
+	.p2align 4
+L(CopyVecSizeUnaligned_0):
+	bsf	%edx, %edx
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+# ifdef USE_AS_STPCPY
+	lea	(%rdi, %rdx), %rax
+# endif
+	vmovdqu %ymm4, (%rdi)
+	add	$((VEC_SIZE * 4) - 1), %r8
+	sub	%rdx, %r8
+	lea	1(%rdi, %rdx), %rdi
+	jmp	L(StrncpyFillTailWithZero)
+# else
+	jmp	L(CopyVecSizeExit)
+# endif
+
+	.p2align 4
+L(CopyVecSizeUnaligned_16):
+	bsf	%ecx, %edx
+	vmovdqu %ymm4, (%rdi)
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+# ifdef USE_AS_STPCPY
+	lea	VEC_SIZE(%rdi, %rdx), %rax
+# endif
+	vmovdqu %ymm5, VEC_SIZE(%rdi)
+	add	$((VEC_SIZE * 3) - 1), %r8
+	sub	%rdx, %r8
+	lea	(VEC_SIZE + 1)(%rdi, %rdx), %rdi
+	jmp	L(StrncpyFillTailWithZero)
+# else
+	add	$VEC_SIZE, %rsi
+	add	$VEC_SIZE, %rdi
+	jmp	L(CopyVecSizeExit)
+# endif
+
+	.p2align 4
+L(CopyVecSizeUnaligned_32):
+	bsf	%edx, %edx
+	vmovdqu %ymm4, (%rdi)
+	vmovdqu %ymm5, VEC_SIZE(%rdi)
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+# ifdef USE_AS_STPCPY
+	lea	(VEC_SIZE * 2)(%rdi, %rdx), %rax
+# endif
+	vmovdqu %ymm6, (VEC_SIZE * 2)(%rdi)
+	add	$((VEC_SIZE * 2) - 1), %r8
+	sub	%rdx, %r8
+	lea	((VEC_SIZE * 2) + 1)(%rdi, %rdx), %rdi
+	jmp	L(StrncpyFillTailWithZero)
+# else
+	add	$(VEC_SIZE * 2), %rsi
+	add	$(VEC_SIZE * 2), %rdi
+	jmp	L(CopyVecSizeExit)
+# endif
+
+# ifdef USE_AS_STRNCPY
+#  ifndef USE_AS_STRCAT
+	.p2align 4
+L(CopyVecSizeUnalignedVec6):
+	vmovdqu %ymm6, (%rdi, %rcx)
+	jmp	L(CopyVecSizeVecExit)
+
+	.p2align 4
+L(CopyVecSizeUnalignedVec5):
+	vmovdqu %ymm5, (%rdi, %rcx)
+	jmp	L(CopyVecSizeVecExit)
+
+	.p2align 4
+L(CopyVecSizeUnalignedVec4):
+	vmovdqu %ymm4, (%rdi, %rcx)
+	jmp	L(CopyVecSizeVecExit)
+
+	.p2align 4
+L(CopyVecSizeUnalignedVec3):
+	vmovdqu %ymm3, (%rdi, %rcx)
+	jmp	L(CopyVecSizeVecExit)
+#  endif
+
+/* Case2 */
+
+	.p2align 4
+L(CopyVecSizeCase2):
+	add	$VEC_SIZE, %r8
+	add	%rcx, %rdi
+	add	%rcx, %rsi
+	bsf	%edx, %edx
+	cmp	%r8d, %edx
+	jb	L(CopyVecSizeExit)
+	jmp	L(StrncpyExit)
+
+	.p2align 4
+L(CopyTwoVecSizeCase2):
+	add	%rcx, %rsi
+	bsf	%edx, %edx
+	add	$VEC_SIZE, %edx
+	sub	%ecx, %edx
+	cmp	%r8d, %edx
+	jb	L(CopyVecSizeExit)
+	jmp	L(StrncpyExit)
+
+L(CopyVecSizeTailCase2):
+	add	%rcx, %rsi
+	bsf	%edx, %edx
+	cmp	%r8d, %edx
+	jb	L(CopyVecSizeExit)
+	jmp	L(StrncpyExit)
+
+L(CopyVecSizeTail1Case2):
+	bsf	%edx, %edx
+	cmp	%r8d, %edx
+	jb	L(CopyVecSizeExit)
+	jmp	L(StrncpyExit)
+
+/* Case2 or Case3,  Case3 */
+
+	.p2align 4
+L(CopyVecSizeCase2OrCase3):
+	test	%rdx, %rdx
+	jnz	L(CopyVecSizeCase2)
+L(CopyVecSizeCase3):
+	add	$VEC_SIZE, %r8
+	add	%rcx, %rdi
+	add	%rcx, %rsi
+	jmp	L(StrncpyExit)
+
+	.p2align 4
+L(CopyTwoVecSizeCase2OrCase3):
+	test	%rdx, %rdx
+	jnz	L(CopyTwoVecSizeCase2)
+	add	%rcx, %rsi
+	jmp	L(StrncpyExit)
+
+	.p2align 4
+L(CopyVecSizeTailCase2OrCase3):
+	test	%rdx, %rdx
+	jnz	L(CopyVecSizeTailCase2)
+	add	%rcx, %rsi
+	jmp	L(StrncpyExit)
+
+	.p2align 4
+L(CopyTwoVecSize1Case2OrCase3):
+	add	$VEC_SIZE, %rdi
+	add	$VEC_SIZE, %rsi
+	sub	$VEC_SIZE, %r8
+L(CopyVecSizeTail1Case2OrCase3):
+	test	%rdx, %rdx
+	jnz	L(CopyVecSizeTail1Case2)
+	jmp	L(StrncpyExit)
+# endif
+
+/*------------End labels regarding with copying 1-VEC_SIZE bytes--and 1-(VEC_SIZE*2) bytes----*/
+
+	.p2align 4
+L(Exit1):
+	movzwl	(%rsi), %edx
+	mov	%dx, (%rdi)
+# ifdef USE_AS_STPCPY
+	lea	1(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	sub	$2, %r8
+	lea	2(%rdi), %rdi
+	jnz	L(StrncpyFillTailWithZero)
+# endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(Exit2):
+	movzwl	(%rsi), %ecx
+	mov	%cx, (%rdi)
+	movb	$0, 2(%rdi)
+# ifdef USE_AS_STPCPY
+	lea	2(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	sub	$3, %r8
+	lea	3(%rdi), %rdi
+	jnz	L(StrncpyFillTailWithZero)
+# endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(Exit3):
+	mov	(%rsi), %edx
+	mov	%edx, (%rdi)
+# ifdef USE_AS_STPCPY
+	lea	3(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	sub	$4, %r8
+	lea	4(%rdi), %rdi
+	jnz	L(StrncpyFillTailWithZero)
+# endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(Exit4_7):
+	mov	(%rsi), %ecx
+	mov	%ecx, (%rdi)
+	mov	-3(%rsi, %rdx), %ecx
+	mov	%ecx, -3(%rdi, %rdx)
+# ifdef USE_AS_STPCPY
+	lea	(%rdi, %rdx), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	sub	%rdx, %r8
+	sub	$1, %r8
+	lea	1(%rdi, %rdx), %rdi
+	jnz	L(StrncpyFillTailWithZero)
+# endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(Exit8_15):
+	mov	(%rsi), %rcx
+	mov	-7(%rsi, %rdx), %r9
+	mov	%rcx, (%rdi)
+	mov	%r9, -7(%rdi, %rdx)
+# ifdef USE_AS_STPCPY
+	lea	(%rdi, %rdx), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	sub	%rdx, %r8
+	sub	$1, %r8
+	lea	1(%rdi, %rdx), %rdi
+	jnz	L(StrncpyFillTailWithZero)
+# endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(Exit16_31):
+	vmovdqu (%rsi), %xmm2
+	vmovdqu -15(%rsi, %rdx), %xmm3
+	vmovdqu %xmm2, (%rdi)
+	vmovdqu %xmm3, -15(%rdi, %rdx)
+# ifdef USE_AS_STPCPY
+	lea	(%rdi, %rdx), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	sub %rdx, %r8
+	sub $1, %r8
+	lea 1(%rdi, %rdx), %rdi
+	jnz L(StrncpyFillTailWithZero)
+# endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(Exit32_63):
+	vmovdqu (%rsi), %ymm2
+	vmovdqu -31(%rsi, %rdx), %ymm3
+	vmovdqu %ymm2, (%rdi)
+	vmovdqu %ymm3, -31(%rdi, %rdx)
+# ifdef USE_AS_STPCPY
+	lea	(%rdi, %rdx), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	sub	%rdx, %r8
+	sub	$1, %r8
+	lea	1(%rdi, %rdx), %rdi
+	jnz	L(StrncpyFillTailWithZero)
+# endif
+	VZEROUPPER
+	ret
+
+# ifdef USE_AS_STRNCPY
+
+	.p2align 4
+L(StrncpyExit1):
+	movzbl	(%rsi), %edx
+	mov	%dl, (%rdi)
+#  ifdef USE_AS_STPCPY
+	lea	1(%rdi), %rax
+#  endif
+#  ifdef USE_AS_STRCAT
+	movb	$0, 1(%rdi)
+#  endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(StrncpyExit2):
+	movzwl	(%rsi), %edx
+	mov	%dx, (%rdi)
+#  ifdef USE_AS_STPCPY
+	lea	2(%rdi), %rax
+#  endif
+#  ifdef USE_AS_STRCAT
+	movb	$0, 2(%rdi)
+#  endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(StrncpyExit3_4):
+	movzwl	(%rsi), %ecx
+	movzwl	-2(%rsi, %r8), %edx
+	mov	%cx, (%rdi)
+	mov	%dx, -2(%rdi, %r8)
+#  ifdef USE_AS_STPCPY
+	lea	(%rdi, %r8), %rax
+#  endif
+#  ifdef USE_AS_STRCAT
+	movb	$0, (%rdi, %r8)
+#  endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(StrncpyExit5_8):
+	mov	(%rsi), %ecx
+	mov	-4(%rsi, %r8), %edx
+	mov	%ecx, (%rdi)
+	mov	%edx, -4(%rdi, %r8)
+#  ifdef USE_AS_STPCPY
+	lea	(%rdi, %r8), %rax
+#  endif
+#  ifdef USE_AS_STRCAT
+	movb	$0, (%rdi, %r8)
+#  endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(StrncpyExit9_16):
+	mov	(%rsi), %rcx
+	mov	-8(%rsi, %r8), %rdx
+	mov	%rcx, (%rdi)
+	mov	%rdx, -8(%rdi, %r8)
+#  ifdef USE_AS_STPCPY
+	lea	(%rdi, %r8), %rax
+#  endif
+#  ifdef USE_AS_STRCAT
+	movb	$0, (%rdi, %r8)
+#  endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(StrncpyExit17_32):
+	vmovdqu (%rsi), %xmm2
+	vmovdqu -16(%rsi, %r8), %xmm3
+	vmovdqu %xmm2, (%rdi)
+	vmovdqu %xmm3, -16(%rdi, %r8)
+#  ifdef USE_AS_STPCPY
+	lea	(%rdi, %r8), %rax
+#  endif
+#  ifdef USE_AS_STRCAT
+	movb	$0, (%rdi, %r8)
+#  endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(StrncpyExit33_64):
+	/*  0/32, 31/16 */
+	vmovdqu (%rsi), %ymm2
+	vmovdqu -VEC_SIZE(%rsi, %r8), %ymm3
+	vmovdqu %ymm2, (%rdi)
+	vmovdqu %ymm3, -VEC_SIZE(%rdi, %r8)
+#  ifdef USE_AS_STPCPY
+	lea	(%rdi, %r8), %rax
+#  endif
+#  ifdef USE_AS_STRCAT
+	movb	$0, (%rdi, %r8)
+#  endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(StrncpyExit65):
+	/* 0/32, 32/32, 64/1 */
+	vmovdqu (%rsi), %ymm2
+	vmovdqu 32(%rsi), %ymm3
+	mov	64(%rsi), %cl
+	vmovdqu %ymm2, (%rdi)
+	vmovdqu %ymm3, 32(%rdi)
+	mov	%cl, 64(%rdi)
+#  ifdef USE_AS_STPCPY
+	lea	65(%rdi), %rax
+#  endif
+#  ifdef USE_AS_STRCAT
+	movb	$0, 65(%rdi)
+#  endif
+	VZEROUPPER
+	ret
+
+#  ifndef USE_AS_STRCAT
+
+	.p2align 4
+L(Fill1):
+	mov	%dl, (%rdi)
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(Fill2):
+	mov	%dx, (%rdi)
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(Fill3_4):
+	mov	%dx, (%rdi)
+	mov     %dx, -2(%rdi, %r8)
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(Fill5_8):
+	mov	%edx, (%rdi)
+	mov     %edx, -4(%rdi, %r8)
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(Fill9_16):
+	mov	%rdx, (%rdi)
+	mov	%rdx, -8(%rdi, %r8)
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(Fill17_32):
+	vmovdqu %xmmZ, (%rdi)
+	vmovdqu %xmmZ, -16(%rdi, %r8)
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(CopyVecSizeUnalignedVec2):
+	vmovdqu %ymm2, (%rdi, %rcx)
+
+	.p2align 4
+L(CopyVecSizeVecExit):
+	bsf	%edx, %edx
+	add	$(VEC_SIZE - 1), %r8
+	add	%rcx, %rdi
+#   ifdef USE_AS_STPCPY
+	lea	(%rdi, %rdx), %rax
+#   endif
+	sub	%rdx, %r8
+	lea	1(%rdi, %rdx), %rdi
+
+	.p2align 4
+L(StrncpyFillTailWithZero):
+	xor	%edx, %edx
+	sub	$VEC_SIZE, %r8
+	jbe	L(StrncpyFillExit)
+
+	vmovdqu %ymmZ, (%rdi)
+	add	$VEC_SIZE, %rdi
+
+	mov	%rdi, %rsi
+	and	$(VEC_SIZE - 1), %esi
+	sub	%rsi, %rdi
+	add	%rsi, %r8
+	sub	$(VEC_SIZE * 4), %r8
+	jb	L(StrncpyFillLessFourVecSize)
+
+L(StrncpyFillLoopVmovdqa):
+	vmovdqa %ymmZ, (%rdi)
+	vmovdqa %ymmZ, VEC_SIZE(%rdi)
+	vmovdqa %ymmZ, (VEC_SIZE * 2)(%rdi)
+	vmovdqa %ymmZ, (VEC_SIZE * 3)(%rdi)
+	add	$(VEC_SIZE * 4), %rdi
+	sub	$(VEC_SIZE * 4), %r8
+	jae	L(StrncpyFillLoopVmovdqa)
+
+L(StrncpyFillLessFourVecSize):
+	add	$(VEC_SIZE * 2), %r8
+	jl	L(StrncpyFillLessTwoVecSize)
+	vmovdqa %ymmZ, (%rdi)
+	vmovdqa %ymmZ, VEC_SIZE(%rdi)
+	add	$(VEC_SIZE * 2), %rdi
+	sub	$VEC_SIZE, %r8
+	jl	L(StrncpyFillExit)
+	vmovdqa %ymmZ, (%rdi)
+	add	$VEC_SIZE, %rdi
+	jmp	L(Fill)
+
+	.p2align 4
+L(StrncpyFillLessTwoVecSize):
+	add	$VEC_SIZE, %r8
+	jl	L(StrncpyFillExit)
+	vmovdqa %ymmZ, (%rdi)
+	add	$VEC_SIZE, %rdi
+	jmp	L(Fill)
+
+	.p2align 4
+L(StrncpyFillExit):
+	add	$VEC_SIZE, %r8
+L(Fill):
+	cmp	$17, %r8d
+	jae	L(Fill17_32)
+	cmp	$9, %r8d
+	jae	L(Fill9_16)
+	cmp	$5, %r8d
+	jae	L(Fill5_8)
+	cmp	$3, %r8d
+	jae	L(Fill3_4)
+	cmp	$1, %r8d
+	ja	L(Fill2)
+	je	L(Fill1)
+	VZEROUPPER
+	ret
+
+/* end of ifndef USE_AS_STRCAT */
+#  endif
+
+	.p2align 4
+L(UnalignedLeaveCase2OrCase3):
+	test	%rdx, %rdx
+	jnz	L(UnalignedFourVecSizeLeaveCase2)
+L(UnalignedFourVecSizeLeaveCase3):
+	lea	(VEC_SIZE * 4)(%r8), %rcx
+	and	$-VEC_SIZE, %rcx
+	add	$(VEC_SIZE * 3), %r8
+	jl	L(CopyVecSizeCase3)
+	vmovdqu %ymm4, (%rdi)
+	sub	$VEC_SIZE, %r8
+	jb	L(CopyVecSizeCase3)
+	vmovdqu %ymm5, VEC_SIZE(%rdi)
+	sub	$VEC_SIZE, %r8
+	jb	L(CopyVecSizeCase3)
+	vmovdqu %ymm6, (VEC_SIZE * 2)(%rdi)
+	sub	$VEC_SIZE, %r8
+	jb	L(CopyVecSizeCase3)
+	vmovdqu %ymm7, (VEC_SIZE * 3)(%rdi)
+#  ifdef USE_AS_STPCPY
+	lea	(VEC_SIZE * 4)(%rdi), %rax
+#  endif
+#  ifdef USE_AS_STRCAT
+	movb	$0, (VEC_SIZE * 4)(%rdi)
+#  endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(UnalignedFourVecSizeLeaveCase2):
+	xor	%ecx, %ecx
+	vpcmpeqb %ymm4, %ymmZ, %ymmM
+	vpmovmskb %ymmM, %edx
+	add	$(VEC_SIZE * 3), %r8
+	jle	L(CopyVecSizeCase2OrCase3)
+	test	%edx, %edx
+#  ifndef USE_AS_STRCAT
+	jnz	L(CopyVecSizeUnalignedVec4)
+#  else
+	jnz	L(CopyVecSize)
+#  endif
+	vpcmpeqb %ymm5, %ymmZ, %ymmM
+	vpmovmskb %ymmM, %edx
+	vmovdqu %ymm4, (%rdi)
+	add	$VEC_SIZE, %rcx
+	sub	$VEC_SIZE, %r8
+	jbe	L(CopyVecSizeCase2OrCase3)
+	test	%edx, %edx
+#  ifndef USE_AS_STRCAT
+	jnz	L(CopyVecSizeUnalignedVec5)
+#  else
+	jnz	L(CopyVecSize)
+#  endif
+
+	vpcmpeqb %ymm6, %ymmZ, %ymmM
+	vpmovmskb %ymmM, %edx
+	vmovdqu %ymm5, VEC_SIZE(%rdi)
+	add	$VEC_SIZE, %rcx
+	sub	$VEC_SIZE, %r8
+	jbe	L(CopyVecSizeCase2OrCase3)
+	test	%edx, %edx
+#  ifndef USE_AS_STRCAT
+	jnz	L(CopyVecSizeUnalignedVec6)
+#  else
+	jnz	L(CopyVecSize)
+#  endif
+
+	vpcmpeqb %ymm7, %ymmZ, %ymmM
+	vpmovmskb %ymmM, %edx
+	vmovdqu %ymm6, (VEC_SIZE * 2)(%rdi)
+	lea	VEC_SIZE(%rdi, %rcx), %rdi
+	lea	VEC_SIZE(%rsi, %rcx), %rsi
+	bsf	%edx, %edx
+	cmp	%r8d, %edx
+	jb	L(CopyVecSizeExit)
+L(StrncpyExit):
+	cmp	$65, %r8d
+	je	L(StrncpyExit65)
+	cmp	$33, %r8d
+	jae	L(StrncpyExit33_64)
+	cmp	$17, %r8d
+	jae	L(StrncpyExit17_32)
+	cmp	$9, %r8d
+	jae	L(StrncpyExit9_16)
+	cmp	$5, %r8d
+	jae	L(StrncpyExit5_8)
+	cmp	$3, %r8d
+	jae	L(StrncpyExit3_4)
+	cmp	$1, %r8d
+	ja	L(StrncpyExit2)
+	je	L(StrncpyExit1)
+#  ifdef USE_AS_STPCPY
+	mov	%rdi, %rax
+#  endif
+#  ifdef USE_AS_STRCAT
+	movb	$0, (%rdi)
+#  endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(ExitZero):
+#  ifndef USE_AS_STRCAT
+	mov	%rdi, %rax
+#  endif
+	VZEROUPPER
+	ret
+
+# endif
+
+# ifndef USE_AS_STRCAT
+END (STRCPY)
+# else
+END (STRCAT)
+# endif
+#endif
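
Note: both new .S files read whole 32-byte vectors beyond the bytes they
have verified, but only after rounding the source pointer down to VEC_SIZE
alignment; an aligned 32-byte load can never straddle a page boundary, so
the over-read cannot fault past the terminating null byte.  A C intrinsics
sketch of the same idiom for a strlen-style scan (the function name and
the use of GCC builtins are this note's assumptions; build with -mavx2):

        #include <immintrin.h>
        #include <stddef.h>
        #include <stdint.h>

        /* strlen via aligned 32-byte over-reads.  */
        static size_t
        vec_strlen (const char *s)
        {
          const __m256i zero = _mm256_setzero_si256 ();
          /* Round down to 32-byte alignment so loads never cross a page.  */
          const char *p = (const char *) ((uintptr_t) s & ~(uintptr_t) 31);
          unsigned int off = (unsigned int) (s - p);
          __m256i v = _mm256_load_si256 ((const __m256i *) p);
          uint32_t m
            = (uint32_t) _mm256_movemask_epi8 (_mm256_cmpeq_epi8 (v, zero));
          m >>= off;			/* ignore bytes before s */
          if (m != 0)
            return (size_t) __builtin_ctz (m);
          for (;;)
            {
              p += 32;
              v = _mm256_load_si256 ((const __m256i *) p);
              m = (uint32_t) _mm256_movemask_epi8 (_mm256_cmpeq_epi8 (v, zero));
              if (m != 0)
                return (size_t) (p - s) + __builtin_ctz (m);
            }
        }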
diff --git a/sysdeps/x86_64/multiarch/strcpy.c b/sysdeps/x86_64/multiarch/strcpy.c
index 12e0e3ffe20602c6..ecf90d4b044a1b01 100644
--- a/sysdeps/x86_64/multiarch/strcpy.c
+++ b/sysdeps/x86_64/multiarch/strcpy.c
@@ -24,7 +24,7 @@
 # undef strcpy
 
 # define SYMBOL_NAME strcpy
-# include "ifunc-unaligned-ssse3.h"
+# include "ifunc-strcpy.h"
 
 libc_ifunc_redirected (__redirect_strcpy, strcpy, IFUNC_SELECTOR ());
 
diff --git a/sysdeps/x86_64/multiarch/strncat-avx2.S b/sysdeps/x86_64/multiarch/strncat-avx2.S
new file mode 100644
index 0000000000000000..bfefa659bb6281fa
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strncat-avx2.S
@@ -0,0 +1,3 @@
+#define USE_AS_STRNCAT
+#define STRCAT __strncat_avx2
+#include "strcat-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/strncat.c b/sysdeps/x86_64/multiarch/strncat.c
index 841c165565add132..74f7d028ae23d700 100644
--- a/sysdeps/x86_64/multiarch/strncat.c
+++ b/sysdeps/x86_64/multiarch/strncat.c
@@ -24,7 +24,7 @@
 # undef strncat
 
 # define SYMBOL_NAME strncat
-# include "ifunc-unaligned-ssse3.h"
+# include "ifunc-strcpy.h"
 
 libc_ifunc_redirected (__redirect_strncat, strncat, IFUNC_SELECTOR ());
 strong_alias (strncat, __strncat);
diff --git a/sysdeps/x86_64/multiarch/strncpy-avx2.S b/sysdeps/x86_64/multiarch/strncpy-avx2.S
new file mode 100644
index 0000000000000000..9ef8c87627dc4924
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strncpy-avx2.S
@@ -0,0 +1,3 @@
+#define USE_AS_STRNCPY
+#define STRCPY __strncpy_avx2
+#include "strcpy-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/strncpy.c b/sysdeps/x86_64/multiarch/strncpy.c
index 3c3de8b18ebb177f..93dfb4cfde79467a 100644
--- a/sysdeps/x86_64/multiarch/strncpy.c
+++ b/sysdeps/x86_64/multiarch/strncpy.c
@@ -24,7 +24,7 @@
 # undef strncpy
 
 # define SYMBOL_NAME strncpy
-# include "ifunc-unaligned-ssse3.h"
+# include "ifunc-strcpy.h"
 
 libc_ifunc_redirected (__redirect_strncpy, strncpy, IFUNC_SELECTOR ());