commit 6d18a93dbbde2958001d65dff3080beed7ae675a
Author: Noah Goldstein <goldstein.w.n@gmail.com>
Date:   Mon Sep 20 16:20:15 2021 -0500

    x86: Optimize memset-vec-unaligned-erms.S
    
    No bug.
    
    Optimizations are:
    
    1. Change control flow for L(more_2x_vec) to fall through to the loop
       and jump for L(less_4x_vec) and L(less_8x_vec). This uses less code
       size and saves jumps for length > 4x VEC_SIZE.
    
    2. For EVEX/AVX512, move L(less_vec) closer to the entry.
    
    3. Avoid complex address modes for length > 2x VEC_SIZE (see the sketch
       after this list).
    
    4. Slightly better alignment code for the loop, from the perspective of
       code size and uops.
    
    5. Align targets so they make full use of their fetch block and, if
       possible, their cache line (see the worked example below).
    
    6. Try to reduce the total number of icache lines that will need to be
       pulled in for a given length.
    
    7. Include a "local" version of the stosb target. For AVX2/EVEX/AVX512,
       jumping to the stosb target in the SSE2 code section will almost
       certainly be to a new page. The new version does increase code size
       marginally by duplicating the target, but should get better iTLB
       behavior as a result.
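
    As a minimal sketch of item 3 (not part of the original commit message;
    AVX2-flavored operands with VEC_SIZE == 32 are chosen purely for
    illustration and mirror the register roles used in the patch below):

        /* Before: each tail store pays for a base + index address mode.  */
        vmovdqu	%ymm0, -64(%rdi, %rdx)
        vmovdqu	%ymm0, -32(%rdi, %rdx)

        /* After: fold the length into an end pointer once so the tail
           stores only need a simple base + displacement address.  */
        addq	%rdx, %rdi	/* rdi = dst + len, the END_REG of the patch.  */
        vmovdqu	%ymm0, -64(%rdi)
        vmovdqu	%ymm0, -32(%rdi)

    Item 1 is the related control-flow change: the > 4x VEC_SIZE case now
    falls through into the 4x VEC loop, while the (2x, 4x] and (4x, 8x]
    cases take the jumps.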
    
    test-memset, test-wmemset, and test-bzero are all passing.
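
    A worked instance of the alignment scheme behind items 5 and 6
    (illustrative only; the numbers are the AVX2 values MOV_SIZE = 4 and
    RET_SIZE = 4 defined in the patch below):

        /* SMALL_MEMSET_ALIGN(mov_sz, ret_sz) = 2 * mov_sz + ret_sz + 1 is
           roughly the encoded size of a two-store-plus-return target.  With
           the AVX2 values it evaluates to 2 * 4 + 4 + 1 = 13, so the
           directive below pads to the next 16-byte boundary only when that
           costs at most 13 bytes, i.e. roughly only when the small target
           would otherwise straddle a fetch block.  */
        .p2align 4,, 13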
    
    Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
    Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
    (cherry picked from commit e59ced238482fd71f3e493717f14f6507346741e)

diff --git a/sysdeps/x86_64/memset.S b/sysdeps/x86_64/memset.S
index 7d4a327eba29ecb4..0137eba4cdd9f830 100644
--- a/sysdeps/x86_64/memset.S
+++ b/sysdeps/x86_64/memset.S
@@ -18,13 +18,15 @@
    <https://www.gnu.org/licenses/>.  */
 
 #include <sysdep.h>
+#define USE_WITH_SSE2	1
 
 #define VEC_SIZE	16
+#define MOV_SIZE	3
+#define RET_SIZE	1
+
 #define VEC(i)		xmm##i
-/* Don't use movups and movaps since it will get larger nop paddings for
-   alignment.  */
-#define VMOVU		movdqu
-#define VMOVA		movdqa
+#define VMOVU     movups
+#define VMOVA     movaps
 
 #define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
   movd d, %xmm0; \
diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
index ae0860f36a47d594..1af668af0aeda59e 100644
--- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
@@ -1,8 +1,14 @@
 #if IS_IN (libc)
+# define USE_WITH_AVX2	1
+
 # define VEC_SIZE	32
+# define MOV_SIZE	4
+# define RET_SIZE	4
+
 # define VEC(i)		ymm##i
-# define VMOVU		vmovdqu
-# define VMOVA		vmovdqa
+
+# define VMOVU     vmovdqu
+# define VMOVA     vmovdqa
 
 # define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
   vmovd d, %xmm0; \
diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
index 8ad842fc2f140527..f14d6f8493c21a36 100644
--- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
@@ -1,11 +1,18 @@
 #if IS_IN (libc)
+# define USE_WITH_AVX512	1
+
 # define VEC_SIZE	64
+# define MOV_SIZE	6
+# define RET_SIZE	1
+
 # define XMM0		xmm16
 # define YMM0		ymm16
 # define VEC0		zmm16
 # define VEC(i)		VEC##i
-# define VMOVU		vmovdqu64
-# define VMOVA		vmovdqa64
+
+# define VMOVU     vmovdqu64
+# define VMOVA     vmovdqa64
+
 # define VZEROUPPER
 
 # define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
diff --git a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
index 640f092903302ad0..64b09e77cc20cc42 100644
--- a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
@@ -1,11 +1,18 @@
 #if IS_IN (libc)
+# define USE_WITH_EVEX	1
+
 # define VEC_SIZE	32
+# define MOV_SIZE	6
+# define RET_SIZE	1
+
 # define XMM0		xmm16
 # define YMM0		ymm16
 # define VEC0		ymm16
 # define VEC(i)		VEC##i
-# define VMOVU		vmovdqu64
-# define VMOVA		vmovdqa64
+
+# define VMOVU     vmovdqu64
+# define VMOVA     vmovdqa64
+
 # define VZEROUPPER
 
 # define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
index ff196844a093dc3b..e723413a664c088f 100644
--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
@@ -63,8 +63,27 @@
 # endif
 #endif
 
+#if VEC_SIZE == 64
+# define LOOP_4X_OFFSET	(VEC_SIZE * 4)
+#else
+# define LOOP_4X_OFFSET	(0)
+#endif
+
+#if defined USE_WITH_EVEX || defined USE_WITH_AVX512
+# define END_REG	rcx
+# define LOOP_REG	rdi
+#else
+# define END_REG	rdi
+# define LOOP_REG	rdx
+#endif
+
 #define PAGE_SIZE 4096
 
+/* Macro to calculate size of small memset block for aligning
+   purposes.  */
+#define SMALL_MEMSET_ALIGN(mov_sz,	ret_sz)	(2 * (mov_sz) + (ret_sz) + 1)
+
+
 #ifndef SECTION
 # error SECTION is not defined!
 #endif
@@ -74,6 +93,7 @@
 ENTRY (__bzero)
 	mov	%RDI_LP, %RAX_LP /* Set return value.  */
 	mov	%RSI_LP, %RDX_LP /* Set n.  */
+	xorl	%esi, %esi
 	pxor	%XMM0, %XMM0
 	jmp	L(entry_from_bzero)
 END (__bzero)
@@ -158,7 +178,7 @@ ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
 END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
 # endif
 
-ENTRY (MEMSET_SYMBOL (__memset, unaligned_erms))
+ENTRY_P2ALIGN (MEMSET_SYMBOL (__memset, unaligned_erms), 6)
 	MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
 # ifdef __ILP32__
 	/* Clear the upper 32 bits.  */
@@ -168,75 +188,43 @@ ENTRY (MEMSET_SYMBOL (__memset, unaligned_erms))
 	jb	L(less_vec)
 	cmp	$(VEC_SIZE * 2), %RDX_LP
 	ja	L(stosb_more_2x_vec)
-	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
-	VMOVU	%VEC(0), -VEC_SIZE(%rdi,%rdx)
-	VMOVU	%VEC(0), (%rdi)
+	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.
+	 */
+	VMOVU	%VEC(0), (%rax)
+	VMOVU	%VEC(0), -VEC_SIZE(%rax, %rdx)
 	VZEROUPPER_RETURN
-
-	.p2align 4
-L(stosb_more_2x_vec):
-	cmp	__x86_rep_stosb_threshold(%rip), %RDX_LP
-	ja	L(stosb)
-#else
-	.p2align 4
 #endif
-L(more_2x_vec):
-	/* Stores to first 2x VEC before cmp as any path forward will
-	   require it.  */
-	VMOVU	%VEC(0), (%rdi)
-	VMOVU	%VEC(0), VEC_SIZE(%rdi)
-	cmpq	$(VEC_SIZE * 4), %rdx
-	ja	L(loop_start)
-	VMOVU	%VEC(0), -(VEC_SIZE * 2)(%rdi,%rdx)
-	VMOVU	%VEC(0), -VEC_SIZE(%rdi,%rdx)
-L(return):
-#if VEC_SIZE > 16
-	ZERO_UPPER_VEC_REGISTERS_RETURN
+
+	.p2align 4,, 10
+L(last_2x_vec):
+#ifdef USE_LESS_VEC_MASK_STORE
+	VMOVU	%VEC(0), (VEC_SIZE * 2 + LOOP_4X_OFFSET)(%rcx)
+	VMOVU	%VEC(0), (VEC_SIZE * 3 + LOOP_4X_OFFSET)(%rcx)
 #else
-	ret
+	VMOVU	%VEC(0), (VEC_SIZE * -2)(%rdi)
+	VMOVU	%VEC(0), (VEC_SIZE * -1)(%rdi)
 #endif
+	VZEROUPPER_RETURN
 
-L(loop_start):
-	VMOVU	%VEC(0), (VEC_SIZE * 2)(%rdi)
-	VMOVU	%VEC(0), (VEC_SIZE * 3)(%rdi)
-	cmpq	$(VEC_SIZE * 8), %rdx
-	jbe	L(loop_end)
-	andq	$-(VEC_SIZE * 2), %rdi
-	subq	$-(VEC_SIZE * 4), %rdi
-	leaq	-(VEC_SIZE * 4)(%rax, %rdx), %rcx
-	.p2align 4
-L(loop):
-	VMOVA	%VEC(0), (%rdi)
-	VMOVA	%VEC(0), VEC_SIZE(%rdi)
-	VMOVA	%VEC(0), (VEC_SIZE * 2)(%rdi)
-	VMOVA	%VEC(0), (VEC_SIZE * 3)(%rdi)
-	subq	$-(VEC_SIZE * 4), %rdi
-	cmpq	%rcx, %rdi
-	jb	L(loop)
-L(loop_end):
-	/* NB: rax is set as ptr in MEMSET_VDUP_TO_VEC0_AND_SET_RETURN.
-	       rdx as length is also unchanged.  */
-	VMOVU	%VEC(0), -(VEC_SIZE * 4)(%rax, %rdx)
-	VMOVU	%VEC(0), -(VEC_SIZE * 3)(%rax, %rdx)
-	VMOVU	%VEC(0), -(VEC_SIZE * 2)(%rax, %rdx)
-	VMOVU	%VEC(0), -VEC_SIZE(%rax, %rdx)
-	VZEROUPPER_SHORT_RETURN
-
-	.p2align 4
+	/* If have AVX512 mask instructions put L(less_vec) close to
+	   entry as it doesn't take much space and is likely a hot target.
+	 */
+#ifdef USE_LESS_VEC_MASK_STORE
+	.p2align 4,, 10
 L(less_vec):
 	/* Less than 1 VEC.  */
 # if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
 #  error Unsupported VEC_SIZE!
 # endif
-# ifdef USE_LESS_VEC_MASK_STORE
 	/* Clear high bits from edi. Only keeping bits relevant to page
 	   cross check. Note that we are using rax which is set in
-	   MEMSET_VDUP_TO_VEC0_AND_SET_RETURN as ptr from here on out.
-	 */
+	   MEMSET_VDUP_TO_VEC0_AND_SET_RETURN as ptr from here on out.  */
 	andl	$(PAGE_SIZE - 1), %edi
-	/* Check if VEC_SIZE store cross page. Mask stores suffer serious
-	   performance degradation when it has to fault supress.  */
+	/* Check if VEC_SIZE store cross page. Mask stores suffer
+	   serious performance degradation when it has to fault supress.
+	 */
 	cmpl	$(PAGE_SIZE - VEC_SIZE), %edi
+	/* This is generally considered a cold target.  */
 	ja	L(cross_page)
 # if VEC_SIZE > 32
 	movq	$-1, %rcx
@@ -247,58 +235,185 @@ L(less_vec):
 	bzhil	%edx, %ecx, %ecx
 	kmovd	%ecx, %k1
 # endif
-	vmovdqu8	%VEC(0), (%rax) {%k1}
+	vmovdqu8 %VEC(0), (%rax){%k1}
 	VZEROUPPER_RETURN
 
+# if defined USE_MULTIARCH && IS_IN (libc)
+	/* Include L(stosb_local) here if including L(less_vec) between
+	   L(stosb_more_2x_vec) and ENTRY. This is to cache align the
+	   L(stosb_more_2x_vec) target.  */
+	.p2align 4,, 10
+L(stosb_local):
+	movzbl	%sil, %eax
+	mov	%RDX_LP, %RCX_LP
+	mov	%RDI_LP, %RDX_LP
+	rep	stosb
+	mov	%RDX_LP, %RAX_LP
+	VZEROUPPER_RETURN
+# endif
+#endif
+
+#if defined USE_MULTIARCH && IS_IN (libc)
 	.p2align 4
-L(cross_page):
+L(stosb_more_2x_vec):
+	cmp	__x86_rep_stosb_threshold(%rip), %RDX_LP
+	ja	L(stosb_local)
+#endif
+	/* Fallthrough goes to L(loop_4x_vec). Tests for memset (2x, 4x]
+	   and (4x, 8x] jump to target.  */
+L(more_2x_vec):
+
+	/* Two different methods of setting up pointers / compare. The
+	   two methods are based on the fact that EVEX/AVX512 mov
+	   instructions take more bytes then AVX2/SSE2 mov instructions. As
+	   well that EVEX/AVX512 machines also have fast LEA_BID. Both
+	   setup and END_REG to avoid complex address mode. For EVEX/AVX512
+	   this saves code size and keeps a few targets in one fetch block.
+	   For AVX2/SSE2 this helps prevent AGU bottlenecks.  */
+#if defined USE_WITH_EVEX || defined USE_WITH_AVX512
+	/* If EVEX/AVX512 compute END_REG - (VEC_SIZE * 4 +
+	   LOOP_4X_OFFSET) with LEA_BID.  */
+
+	/* END_REG is rcx for EVEX/AVX512.  */
+	leaq	-(VEC_SIZE * 4 + LOOP_4X_OFFSET)(%rdi, %rdx), %END_REG
+#endif
+
+	/* Stores to first 2x VEC before cmp as any path forward will
+	   require it.  */
+	VMOVU	%VEC(0), (%rax)
+	VMOVU	%VEC(0), VEC_SIZE(%rax)
+
+
+#if !(defined USE_WITH_EVEX || defined USE_WITH_AVX512)
+	/* If AVX2/SSE2 compute END_REG (rdi) with ALU.  */
+	addq	%rdx, %END_REG
+#endif
+
+	cmpq	$(VEC_SIZE * 4), %rdx
+	jbe	L(last_2x_vec)
+
+	/* Store next 2x vec regardless.  */
+	VMOVU	%VEC(0), (VEC_SIZE * 2)(%rax)
+	VMOVU	%VEC(0), (VEC_SIZE * 3)(%rax)
+
+
+#if defined USE_WITH_EVEX || defined USE_WITH_AVX512
+	/* If LOOP_4X_OFFSET don't readjust LOOP_REG (rdi), just add
+	   extra offset to addresses in loop. Used for AVX512 to save space
+	   as no way to get (VEC_SIZE * 4) in imm8.  */
+# if LOOP_4X_OFFSET == 0
+	subq	$-(VEC_SIZE * 4), %LOOP_REG
 # endif
-# if VEC_SIZE > 32
-	cmpb	$32, %dl
-	jae	L(between_32_63)
+	/* Avoid imm32 compare here to save code size.  */
+	cmpq	%rdi, %rcx
+#else
+	addq	$-(VEC_SIZE * 4), %END_REG
+	cmpq	$(VEC_SIZE * 8), %rdx
+#endif
+	jbe	L(last_4x_vec)
+#if !(defined USE_WITH_EVEX || defined USE_WITH_AVX512)
+	/* Set LOOP_REG (rdx).  */
+	leaq	(VEC_SIZE * 4)(%rax), %LOOP_REG
+#endif
+	/* Align dst for loop.  */
+	andq	$(VEC_SIZE * -2), %LOOP_REG
+	.p2align 4
+L(loop):
+	VMOVA	%VEC(0), LOOP_4X_OFFSET(%LOOP_REG)
+	VMOVA	%VEC(0), (VEC_SIZE + LOOP_4X_OFFSET)(%LOOP_REG)
+	VMOVA	%VEC(0), (VEC_SIZE * 2 + LOOP_4X_OFFSET)(%LOOP_REG)
+	VMOVA	%VEC(0), (VEC_SIZE * 3 + LOOP_4X_OFFSET)(%LOOP_REG)
+	subq	$-(VEC_SIZE * 4), %LOOP_REG
+	cmpq	%END_REG, %LOOP_REG
+	jb	L(loop)
+	.p2align 4,, MOV_SIZE
+L(last_4x_vec):
+	VMOVU	%VEC(0), LOOP_4X_OFFSET(%END_REG)
+	VMOVU	%VEC(0), (VEC_SIZE + LOOP_4X_OFFSET)(%END_REG)
+	VMOVU	%VEC(0), (VEC_SIZE * 2 + LOOP_4X_OFFSET)(%END_REG)
+	VMOVU	%VEC(0), (VEC_SIZE * 3 + LOOP_4X_OFFSET)(%END_REG)
+L(return):
+#if VEC_SIZE > 16
+	ZERO_UPPER_VEC_REGISTERS_RETURN
+#else
+	ret
+#endif
+
+	.p2align 4,, 10
+#ifndef USE_LESS_VEC_MASK_STORE
+# if defined USE_MULTIARCH && IS_IN (libc)
+	/* If no USE_LESS_VEC_MASK put L(stosb_local) here. Will be in
+	   range for 2-byte jump encoding.  */
+L(stosb_local):
+	movzbl	%sil, %eax
+	mov	%RDX_LP, %RCX_LP
+	mov	%RDI_LP, %RDX_LP
+	rep	stosb
+	mov	%RDX_LP, %RAX_LP
+	VZEROUPPER_RETURN
 # endif
-# if VEC_SIZE > 16
-	cmpb	$16, %dl
+	/* Define L(less_vec) only if not otherwise defined.  */
+	.p2align 4
+L(less_vec):
+#endif
+L(cross_page):
+#if VEC_SIZE > 32
+	cmpl	$32, %edx
+	jae	L(between_32_63)
+#endif
+#if VEC_SIZE > 16
+	cmpl	$16, %edx
 	jae	L(between_16_31)
-# endif
-	MOVQ	%XMM0, %rcx
-	cmpb	$8, %dl
+#endif
+	MOVQ	%XMM0, %rdi
+	cmpl	$8, %edx
 	jae	L(between_8_15)
-	cmpb	$4, %dl
+	cmpl	$4, %edx
 	jae	L(between_4_7)
-	cmpb	$1, %dl
+	cmpl	$1, %edx
 	ja	L(between_2_3)
-	jb	1f
-	movb	%cl, (%rax)
-1:
+	jb	L(return)
+	movb	%sil, (%rax)
 	VZEROUPPER_RETURN
-# if VEC_SIZE > 32
+
+	/* Align small targets only if not doing so would cross a fetch
+	   line.  */
+#if VEC_SIZE > 32
+	.p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, RET_SIZE)
 	/* From 32 to 63.  No branch when size == 32.  */
 L(between_32_63):
-	VMOVU	%YMM0, -32(%rax,%rdx)
 	VMOVU	%YMM0, (%rax)
+	VMOVU	%YMM0, -32(%rax, %rdx)
 	VZEROUPPER_RETURN
-# endif
-# if VEC_SIZE > 16
-	/* From 16 to 31.  No branch when size == 16.  */
+#endif
+
+#if VEC_SIZE >= 32
+	.p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, RET_SIZE)
 L(between_16_31):
-	VMOVU	%XMM0, -16(%rax,%rdx)
+	/* From 16 to 31.  No branch when size == 16.  */
 	VMOVU	%XMM0, (%rax)
+	VMOVU	%XMM0, -16(%rax, %rdx)
 	VZEROUPPER_RETURN
-# endif
-	/* From 8 to 15.  No branch when size == 8.  */
+#endif
+
+	.p2align 4,, SMALL_MEMSET_ALIGN(3, RET_SIZE)
 L(between_8_15):
-	movq	%rcx, -8(%rax,%rdx)
-	movq	%rcx, (%rax)
+	/* From 8 to 15.  No branch when size == 8.  */
+	movq	%rdi, (%rax)
+	movq	%rdi, -8(%rax, %rdx)
 	VZEROUPPER_RETURN
+
+	.p2align 4,, SMALL_MEMSET_ALIGN(2, RET_SIZE)
 L(between_4_7):
 	/* From 4 to 7.  No branch when size == 4.  */
-	movl	%ecx, -4(%rax,%rdx)
-	movl	%ecx, (%rax)
+	movl	%edi, (%rax)
+	movl	%edi, -4(%rax, %rdx)
 	VZEROUPPER_RETURN
+
+	.p2align 4,, SMALL_MEMSET_ALIGN(3, RET_SIZE)
 L(between_2_3):
 	/* From 2 to 3.  No branch when size == 2.  */
-	movw	%cx, -2(%rax,%rdx)
-	movw	%cx, (%rax)
+	movw	%di, (%rax)
+	movb	%dil, -1(%rax, %rdx)
 	VZEROUPPER_RETURN
 END (MEMSET_SYMBOL (__memset, unaligned_erms))