commit 6d18a93dbbde2958001d65dff3080beed7ae675a
Author: Noah Goldstein <goldstein.w.n@gmail.com>
Date:   Mon Sep 20 16:20:15 2021 -0500

    x86: Optimize memset-vec-unaligned-erms.S

    No bug.

    Optimizations are:

    1. Change control flow for L(more_2x_vec) to fall through to the loop
       and jump for L(less_4x_vec) and L(less_8x_vec). This uses less code
       size and saves jumps for length > 4x VEC_SIZE.

    2. For EVEX/AVX512 move L(less_vec) closer to entry.

    3. Avoid complex address mode for length > 2x VEC_SIZE.

    4. Slightly better aligning code for the loop from the perspective of
       code size and uops.

    5. Align targets so they make full use of their fetch block and, if
       possible, cache line.

    6. Try to reduce the total number of icache lines that will need to be
       pulled in for a given length.

    7. Include a "local" version of the stosb target. For AVX2/EVEX/AVX512,
       jumping to the stosb target in the sse2 code section will almost
       certainly be to a new page. The new version does increase code size
       marginally by duplicating the target but should get better iTLB
       behavior as a result.

    test-memset, test-wmemset, and test-bzero are all passing.

    Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
    Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
    (cherry picked from commit e59ced238482fd71f3e493717f14f6507346741e)
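As an illustration of points 4 and 5, and of the SMALL_MEMSET_ALIGN macro the diff below introduces, here is a minimal sketch (not part of the patch) of how a small two-store target is aligned to a 16-byte fetch block only when that is cheap. GNU as's ".p2align 4,, max" emits no padding if more than max bytes would be required, so a target that already fits before the next boundary is left where it is; the MOV_SIZE/RET_SIZE values are taken from the AVX2 definitions in the diff, while the label name and surrounding fragment are assumptions made up for the example.

/* Illustrative sketch only: align a small target to a 16-byte fetch
   block only if the padding needed is no larger than the target.  */
#define MOV_SIZE	4	/* per the AVX2 definitions in the diff */
#define RET_SIZE	4
#define SMALL_MEMSET_ALIGN(mov_sz, ret_sz)	(2 * (mov_sz) + (ret_sz) + 1)

	.text
	/* Pad to 16 bytes only if at most 13 padding bytes are needed,
	   i.e. only if the target would otherwise straddle the boundary.  */
	.p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, RET_SIZE)
small_two_store_target:
	/* Two overlapping stores plus return: roughly
	   2 * MOV_SIZE + RET_SIZE bytes of code.  */
	vmovdqu	%ymm0, (%rax)
	vmovdqu	%ymm0, -32(%rax, %rdx)
	vzeroupper
	ret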
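Point 7's "local" stosb target is a plain rep-stosb memset duplicated next to the vector code so the ERMS branch does not have to jump into the distant sse2 section. A standalone sketch of that pattern follows (the symbol name is made up for the example): rep stosb stores AL to (%rdi) RCX times and advances RDI, so the original destination is stashed and restored as the return value, here in RDX, which is free once the length has been copied into RCX.

	.text
	.globl	stosb_memset_sketch
	.type	stosb_memset_sketch, @function
/* stosb_memset_sketch(void *dst, int c, size_t n): dst in RDI, c in ESI,
   n in RDX per the SysV ABI; returns dst like memset.  */
stosb_memset_sketch:
	movzbl	%sil, %eax	/* AL = fill byte */
	mov	%rdx, %rcx	/* RCX = byte count for rep stosb */
	mov	%rdi, %rdx	/* stash dst; rep stosb advances RDI */
	rep	stosb
	mov	%rdx, %rax	/* return the original dst */
	ret
	.size	stosb_memset_sketch, .-stosb_memset_sketch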
diff --git a/sysdeps/x86_64/memset.S b/sysdeps/x86_64/memset.S
index 7d4a327eba29ecb4..0137eba4cdd9f830 100644
--- a/sysdeps/x86_64/memset.S
+++ b/sysdeps/x86_64/memset.S
@@ -18,13 +18,15 @@
    <https://www.gnu.org/licenses/>.  */
 
 #include <sysdep.h>
+#define USE_WITH_SSE2	1
 
 #define VEC_SIZE	16
+#define MOV_SIZE	3
+#define RET_SIZE	1
+
 #define VEC(i)		xmm##i
-/* Don't use movups and movaps since it will get larger nop paddings for
-   alignment.  */
-#define VMOVU		movdqu
-#define VMOVA		movdqa
+#define VMOVU     movups
+#define VMOVA     movaps
 
 #define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
   movd d, %xmm0; \
diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
index ae0860f36a47d594..1af668af0aeda59e 100644
--- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
@@ -1,8 +1,14 @@
 #if IS_IN (libc)
+# define USE_WITH_AVX2	1
+
 # define VEC_SIZE	32
+# define MOV_SIZE	4
+# define RET_SIZE	4
+
 # define VEC(i)		ymm##i
-# define VMOVU		vmovdqu
-# define VMOVA		vmovdqa
+
+# define VMOVU     vmovdqu
+# define VMOVA     vmovdqa
 
 # define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
   vmovd d, %xmm0; \
diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
index 8ad842fc2f140527..f14d6f8493c21a36 100644
--- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
@@ -1,11 +1,18 @@
 #if IS_IN (libc)
+# define USE_WITH_AVX512	1
+
 # define VEC_SIZE	64
+# define MOV_SIZE	6
+# define RET_SIZE	1
+
 # define XMM0		xmm16
 # define YMM0		ymm16
 # define VEC0		zmm16
 # define VEC(i)		VEC##i
-# define VMOVU		vmovdqu64
-# define VMOVA		vmovdqa64
+
+# define VMOVU     vmovdqu64
+# define VMOVA     vmovdqa64
+
 # define VZEROUPPER
 
 # define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
diff --git a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
index 640f092903302ad0..64b09e77cc20cc42 100644
--- a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
@@ -1,11 +1,18 @@
 #if IS_IN (libc)
+# define USE_WITH_EVEX	1
+
 # define VEC_SIZE	32
+# define MOV_SIZE	6
+# define RET_SIZE	1
+
 # define XMM0		xmm16
 # define YMM0		ymm16
 # define VEC0		ymm16
 # define VEC(i)		VEC##i
-# define VMOVU		vmovdqu64
-# define VMOVA		vmovdqa64
+
+# define VMOVU     vmovdqu64
+# define VMOVA     vmovdqa64
+
 # define VZEROUPPER
 
 # define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
index ff196844a093dc3b..e723413a664c088f 100644
--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
@@ -63,8 +63,27 @@
 # endif
 #endif
 
+#if VEC_SIZE == 64
+# define LOOP_4X_OFFSET	(VEC_SIZE * 4)
+#else
+# define LOOP_4X_OFFSET	(0)
+#endif
+
+#if defined USE_WITH_EVEX || defined USE_WITH_AVX512
+# define END_REG	rcx
+# define LOOP_REG	rdi
+#else
+# define END_REG	rdi
+# define LOOP_REG	rdx
+#endif
+
 #define PAGE_SIZE 4096
 
+/* Macro to calculate size of small memset block for aligning
+   purposes.  */
+#define SMALL_MEMSET_ALIGN(mov_sz,	ret_sz)	(2 * (mov_sz) + (ret_sz) + 1)
+
+
 #ifndef SECTION
 # error SECTION is not defined!
 #endif
@@ -74,6 +93,7 @@
 ENTRY (__bzero)
 	mov	%RDI_LP, %RAX_LP /* Set return value.  */
 	mov	%RSI_LP, %RDX_LP /* Set n.  */
+	xorl	%esi, %esi
 	pxor	%XMM0, %XMM0
 	jmp	L(entry_from_bzero)
 END (__bzero)
@@ -158,7 +178,7 @@ ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
 END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
 # endif
 
-ENTRY (MEMSET_SYMBOL (__memset, unaligned_erms))
+ENTRY_P2ALIGN (MEMSET_SYMBOL (__memset, unaligned_erms), 6)
 	MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
 # ifdef __ILP32__
 	/* Clear the upper 32 bits.  */
@@ -168,75 +188,43 @@ ENTRY (MEMSET_SYMBOL (__memset, unaligned_erms))
 	jb	L(less_vec)
 	cmp	$(VEC_SIZE * 2), %RDX_LP
 	ja	L(stosb_more_2x_vec)
-	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
-	VMOVU	%VEC(0), -VEC_SIZE(%rdi,%rdx)
-	VMOVU	%VEC(0), (%rdi)
+	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.
+	 */
+	VMOVU	%VEC(0), (%rax)
+	VMOVU	%VEC(0), -VEC_SIZE(%rax, %rdx)
 	VZEROUPPER_RETURN
-
-	.p2align 4
-L(stosb_more_2x_vec):
-	cmp	__x86_rep_stosb_threshold(%rip), %RDX_LP
-	ja	L(stosb)
-#else
-	.p2align 4
 #endif
-L(more_2x_vec):
-	/* Stores to first 2x VEC before cmp as any path forward will
-	   require it.  */
-	VMOVU	%VEC(0), (%rdi)
-	VMOVU	%VEC(0), VEC_SIZE(%rdi)
-	cmpq	$(VEC_SIZE * 4), %rdx
-	ja	L(loop_start)
-	VMOVU	%VEC(0), -(VEC_SIZE * 2)(%rdi,%rdx)
-	VMOVU	%VEC(0), -VEC_SIZE(%rdi,%rdx)
-L(return):
-#if VEC_SIZE > 16
-	ZERO_UPPER_VEC_REGISTERS_RETURN
+
+	.p2align 4,, 10
+L(last_2x_vec):
+#ifdef USE_LESS_VEC_MASK_STORE
+	VMOVU	%VEC(0), (VEC_SIZE * 2 + LOOP_4X_OFFSET)(%rcx)
+	VMOVU	%VEC(0), (VEC_SIZE * 3 + LOOP_4X_OFFSET)(%rcx)
 #else
-	ret
+	VMOVU	%VEC(0), (VEC_SIZE * -2)(%rdi)
+	VMOVU	%VEC(0), (VEC_SIZE * -1)(%rdi)
 #endif
+	VZEROUPPER_RETURN
 
-L(loop_start):
-	VMOVU	%VEC(0), (VEC_SIZE * 2)(%rdi)
-	VMOVU	%VEC(0), (VEC_SIZE * 3)(%rdi)
-	cmpq	$(VEC_SIZE * 8), %rdx
-	jbe	L(loop_end)
-	andq	$-(VEC_SIZE * 2), %rdi
-	subq	$-(VEC_SIZE * 4), %rdi
-	leaq	-(VEC_SIZE * 4)(%rax, %rdx), %rcx
-	.p2align 4
-L(loop):
-	VMOVA	%VEC(0), (%rdi)
-	VMOVA	%VEC(0), VEC_SIZE(%rdi)
-	VMOVA	%VEC(0), (VEC_SIZE * 2)(%rdi)
-	VMOVA	%VEC(0), (VEC_SIZE * 3)(%rdi)
-	subq	$-(VEC_SIZE * 4), %rdi
-	cmpq	%rcx, %rdi
-	jb	L(loop)
-L(loop_end):
-	/* NB: rax is set as ptr in MEMSET_VDUP_TO_VEC0_AND_SET_RETURN.
-	       rdx as length is also unchanged.  */
-	VMOVU	%VEC(0), -(VEC_SIZE * 4)(%rax, %rdx)
-	VMOVU	%VEC(0), -(VEC_SIZE * 3)(%rax, %rdx)
-	VMOVU	%VEC(0), -(VEC_SIZE * 2)(%rax, %rdx)
-	VMOVU	%VEC(0), -VEC_SIZE(%rax, %rdx)
-	VZEROUPPER_SHORT_RETURN
-
-	.p2align 4
+	/* If have AVX512 mask instructions put L(less_vec) close to
+	   entry as it doesn't take much space and is likely a hot target.
+	 */
+#ifdef USE_LESS_VEC_MASK_STORE
+	.p2align 4,, 10
 L(less_vec):
 	/* Less than 1 VEC.  */
 # if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
 #  error Unsupported VEC_SIZE!
 # endif
-# ifdef USE_LESS_VEC_MASK_STORE
 	/* Clear high bits from edi. Only keeping bits relevant to page
 	   cross check. Note that we are using rax which is set in
-	   MEMSET_VDUP_TO_VEC0_AND_SET_RETURN as ptr from here on out.
-	 */
+	   MEMSET_VDUP_TO_VEC0_AND_SET_RETURN as ptr from here on out.  */
 	andl	$(PAGE_SIZE - 1), %edi
-	/* Check if VEC_SIZE store cross page. Mask stores suffer serious
-	   performance degradation when it has to fault supress.  */
+	/* Check if VEC_SIZE store cross page. Mask stores suffer
+	   serious performance degradation when it has to fault supress.
+	 */
 	cmpl	$(PAGE_SIZE - VEC_SIZE), %edi
+	/* This is generally considered a cold target.  */
 	ja	L(cross_page)
 # if VEC_SIZE > 32
 	movq	$-1, %rcx
@@ -247,58 +235,185 @@ L(less_vec):
 	bzhil	%edx, %ecx, %ecx
 	kmovd	%ecx, %k1
 # endif
-	vmovdqu8	%VEC(0), (%rax) {%k1}
+	vmovdqu8 %VEC(0), (%rax){%k1}
 	VZEROUPPER_RETURN
 
+# if defined USE_MULTIARCH && IS_IN (libc)
+	/* Include L(stosb_local) here if including L(less_vec) between
+	   L(stosb_more_2x_vec) and ENTRY. This is to cache align the
+	   L(stosb_more_2x_vec) target.  */
+	.p2align 4,, 10
+L(stosb_local):
+	movzbl	%sil, %eax
+	mov	%RDX_LP, %RCX_LP
+	mov	%RDI_LP, %RDX_LP
+	rep	stosb
+	mov	%RDX_LP, %RAX_LP
+	VZEROUPPER_RETURN
+# endif
+#endif
+
+#if defined USE_MULTIARCH && IS_IN (libc)
 	.p2align 4
-L(cross_page):
+L(stosb_more_2x_vec):
+	cmp	__x86_rep_stosb_threshold(%rip), %RDX_LP
+	ja	L(stosb_local)
+#endif
+	/* Fallthrough goes to L(loop_4x_vec). Tests for memset (2x, 4x]
+	   and (4x, 8x] jump to target.  */
+L(more_2x_vec):
+
+	/* Two different methods of setting up pointers / compare. The
+	   two methods are based on the fact that EVEX/AVX512 mov
+	   instructions take more bytes then AVX2/SSE2 mov instructions. As
+	   well that EVEX/AVX512 machines also have fast LEA_BID. Both
+	   setup and END_REG to avoid complex address mode. For EVEX/AVX512
+	   this saves code size and keeps a few targets in one fetch block.
+	   For AVX2/SSE2 this helps prevent AGU bottlenecks.  */
+#if defined USE_WITH_EVEX || defined USE_WITH_AVX512
+	/* If EVEX/AVX512 compute END_REG - (VEC_SIZE * 4 +
+	   LOOP_4X_OFFSET) with LEA_BID.  */
+
+	/* END_REG is rcx for EVEX/AVX512.  */
+	leaq	-(VEC_SIZE * 4 + LOOP_4X_OFFSET)(%rdi, %rdx), %END_REG
+#endif
+
+	/* Stores to first 2x VEC before cmp as any path forward will
+	   require it.  */
+	VMOVU	%VEC(0), (%rax)
+	VMOVU	%VEC(0), VEC_SIZE(%rax)
+
+
+#if !(defined USE_WITH_EVEX || defined USE_WITH_AVX512)
+	/* If AVX2/SSE2 compute END_REG (rdi) with ALU.  */
+	addq	%rdx, %END_REG
+#endif
+
+	cmpq	$(VEC_SIZE * 4), %rdx
+	jbe	L(last_2x_vec)
+
+	/* Store next 2x vec regardless.  */
+	VMOVU	%VEC(0), (VEC_SIZE * 2)(%rax)
+	VMOVU	%VEC(0), (VEC_SIZE * 3)(%rax)
+
+
+#if defined USE_WITH_EVEX || defined USE_WITH_AVX512
+	/* If LOOP_4X_OFFSET don't readjust LOOP_REG (rdi), just add
+	   extra offset to addresses in loop. Used for AVX512 to save space
+	   as no way to get (VEC_SIZE * 4) in imm8.  */
+# if LOOP_4X_OFFSET == 0
+	subq	$-(VEC_SIZE * 4), %LOOP_REG
 # endif
-# if VEC_SIZE > 32
-	cmpb	$32, %dl
-	jae	L(between_32_63)
+	/* Avoid imm32 compare here to save code size.  */
+	cmpq	%rdi, %rcx
+#else
+	addq	$-(VEC_SIZE * 4), %END_REG
+	cmpq	$(VEC_SIZE * 8), %rdx
+#endif
+	jbe	L(last_4x_vec)
+#if !(defined USE_WITH_EVEX || defined USE_WITH_AVX512)
+	/* Set LOOP_REG (rdx).  */
+	leaq	(VEC_SIZE * 4)(%rax), %LOOP_REG
+#endif
+	/* Align dst for loop.  */
+	andq	$(VEC_SIZE * -2), %LOOP_REG
+	.p2align 4
+L(loop):
+	VMOVA	%VEC(0), LOOP_4X_OFFSET(%LOOP_REG)
+	VMOVA	%VEC(0), (VEC_SIZE + LOOP_4X_OFFSET)(%LOOP_REG)
+	VMOVA	%VEC(0), (VEC_SIZE * 2 + LOOP_4X_OFFSET)(%LOOP_REG)
+	VMOVA	%VEC(0), (VEC_SIZE * 3 + LOOP_4X_OFFSET)(%LOOP_REG)
+	subq	$-(VEC_SIZE * 4), %LOOP_REG
+	cmpq	%END_REG, %LOOP_REG
+	jb	L(loop)
+	.p2align 4,, MOV_SIZE
+L(last_4x_vec):
+	VMOVU	%VEC(0), LOOP_4X_OFFSET(%END_REG)
+	VMOVU	%VEC(0), (VEC_SIZE + LOOP_4X_OFFSET)(%END_REG)
+	VMOVU	%VEC(0), (VEC_SIZE * 2 + LOOP_4X_OFFSET)(%END_REG)
+	VMOVU	%VEC(0), (VEC_SIZE * 3 + LOOP_4X_OFFSET)(%END_REG)
+L(return):
+#if VEC_SIZE > 16
+	ZERO_UPPER_VEC_REGISTERS_RETURN
+#else
+	ret
+#endif
+
+	.p2align 4,, 10
+#ifndef USE_LESS_VEC_MASK_STORE
+# if defined USE_MULTIARCH && IS_IN (libc)
+	/* If no USE_LESS_VEC_MASK put L(stosb_local) here. Will be in
+	   range for 2-byte jump encoding.  */
+L(stosb_local):
+	movzbl	%sil, %eax
+	mov	%RDX_LP, %RCX_LP
+	mov	%RDI_LP, %RDX_LP
+	rep	stosb
+	mov	%RDX_LP, %RAX_LP
+	VZEROUPPER_RETURN
 # endif
-# if VEC_SIZE > 16
-	cmpb	$16, %dl
+	/* Define L(less_vec) only if not otherwise defined.  */
+	.p2align 4
+L(less_vec):
+#endif
+L(cross_page):
+#if VEC_SIZE > 32
+	cmpl	$32, %edx
+	jae	L(between_32_63)
+#endif
+#if VEC_SIZE > 16
+	cmpl	$16, %edx
 	jae	L(between_16_31)
-# endif
-	MOVQ	%XMM0, %rcx
-	cmpb	$8, %dl
+#endif
+	MOVQ	%XMM0, %rdi
+	cmpl	$8, %edx
 	jae	L(between_8_15)
-	cmpb	$4, %dl
+	cmpl	$4, %edx
 	jae	L(between_4_7)
-	cmpb	$1, %dl
+	cmpl	$1, %edx
 	ja	L(between_2_3)
-	jb	1f
-	movb	%cl, (%rax)
-1:
+	jb	L(return)
+	movb	%sil, (%rax)
 	VZEROUPPER_RETURN
-# if VEC_SIZE > 32
+
+	/* Align small targets only if not doing so would cross a fetch
+	   line.  */
+#if VEC_SIZE > 32
+	.p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, RET_SIZE)
 	/* From 32 to 63.  No branch when size == 32.  */
 L(between_32_63):
-	VMOVU	%YMM0, -32(%rax,%rdx)
 	VMOVU	%YMM0, (%rax)
+	VMOVU	%YMM0, -32(%rax, %rdx)
 	VZEROUPPER_RETURN
-# endif
-# if VEC_SIZE > 16
-	/* From 16 to 31.  No branch when size == 16.  */
+#endif
+
+#if VEC_SIZE >= 32
+	.p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, RET_SIZE)
 L(between_16_31):
-	VMOVU	%XMM0, -16(%rax,%rdx)
+	/* From 16 to 31.  No branch when size == 16.  */
 	VMOVU	%XMM0, (%rax)
+	VMOVU	%XMM0, -16(%rax, %rdx)
 	VZEROUPPER_RETURN
-# endif
-	/* From 8 to 15.  No branch when size == 8.  */
+#endif
+
+	.p2align 4,, SMALL_MEMSET_ALIGN(3, RET_SIZE)
 L(between_8_15):
-	movq	%rcx, -8(%rax,%rdx)
-	movq	%rcx, (%rax)
+	/* From 8 to 15.  No branch when size == 8.  */
+	movq	%rdi, (%rax)
+	movq	%rdi, -8(%rax, %rdx)
 	VZEROUPPER_RETURN
+
+	.p2align 4,, SMALL_MEMSET_ALIGN(2, RET_SIZE)
 L(between_4_7):
 	/* From 4 to 7.  No branch when size == 4.  */
-	movl	%ecx, -4(%rax,%rdx)
-	movl	%ecx, (%rax)
+	movl	%edi, (%rax)
+	movl	%edi, -4(%rax, %rdx)
 	VZEROUPPER_RETURN
+
+	.p2align 4,, SMALL_MEMSET_ALIGN(3, RET_SIZE)
 L(between_2_3):
 	/* From 2 to 3.  No branch when size == 2.  */
-	movw	%cx, -2(%rax,%rdx)
-	movw	%cx, (%rax)
+	movw	%di, (%rax)
+	movb	%dil, -1(%rax, %rdx)
 	VZEROUPPER_RETURN
 END (MEMSET_SYMBOL (__memset, unaligned_erms))