commit ea19c490a3f5628d55ded271cbb753e66b2f05e8
Author: Noah Goldstein <goldstein.w.n@gmail.com>
Date:   Sun Feb 6 00:54:18 2022 -0600

    x86: Improve vec generation in memset-vec-unaligned-erms.S

    No bug.

    Split vec generation into multiple steps. This allows the
    broadcast in AVX2 to use 'xmm' registers for the L(less_vec)
    case. This saves an expensive lane-cross instruction and removes
    the need for 'vzeroupper'.

    For SSE2 replace 2x 'punpck' instructions with zero-idiom 'pxor' for
    byte broadcast.

    Results for memset-avx2 small (geomean of N = 20 benchset runs).

    size, New Time, Old Time, New / Old
       0,    4.100,    3.831,     0.934
       1,    5.074,    4.399,     0.867
       2,    4.433,    4.411,     0.995
       4,    4.487,    4.415,     0.984
       8,    4.454,    4.396,     0.987
      16,    4.502,    4.443,     0.987

    All relevant string/wcsmbs tests are passing.
    Reviewed-by: H.J. Lu <hjl.tools@gmail.com>

    (cherry picked from commit b62ace2740a106222e124cc86956448fa07abf4d)
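
    The AVX2 change above splits the broadcast so that sizes below
    VEC_SIZE never touch a ymm register. A rough before/after sketch of
    that small-size path (not part of the patch; instruction and
    register choices follow the memset-avx2-unaligned-erms.S hunk
    below):

        /* Before: one ymm broadcast for every size, so even tiny
           memsets wrote a ymm register and needed vzeroupper on
           return.  */
        vmovd        %esi, %xmm0
        vpbroadcastb %xmm0, %ymm0
        ...
        vzeroupper
        ret

        /* After: L(less_vec) broadcasts within xmm only
           (MEMSET_VDUP_TO_VEC0_LOW); the ymm broadcast
           (MEMSET_VDUP_TO_VEC0_HIGH) runs only once size >= VEC_SIZE,
           so the small path can use a plain ret.  */
        vmovd        %esi, %xmm0
        vpbroadcastb %xmm0, %xmm0
        ...
        ret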
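
    For the SSE2 byte broadcast, the same value ends up splatted across
    xmm0 either way; a condensed view of the two sequences from the
    sysdeps/x86_64/memset.S hunk below (operands written out for the
    memset case, where the macro argument d is %esi):

        /* Before: widen the byte in three steps.  */
        movd      %esi, %xmm0
        punpcklbw %xmm0, %xmm0       /* byte  -> word  */
        punpcklwd %xmm0, %xmm0       /* word  -> dword */
        pshufd    $0, %xmm0, %xmm0   /* splat dword 0  */

        /* After: pxor is a zero idiom, so xmm1 becomes an all-zero
           shuffle control and pshufb broadcasts byte 0 of xmm0 to all
           16 bytes.  */
        movd      %esi, %xmm0
        pxor      %xmm1, %xmm1
        pshufb    %xmm1, %xmm0
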
diff --git a/sysdeps/x86_64/memset.S b/sysdeps/x86_64/memset.S
index 0137eba4cdd9f830..34ee0bfdcb81fb39 100644
--- a/sysdeps/x86_64/memset.S
+++ b/sysdeps/x86_64/memset.S
@@ -28,17 +28,22 @@
 #define VMOVU     movups
 #define VMOVA     movaps
 
-#define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
   movd d, %xmm0; \
-  movq r, %rax; \
-  punpcklbw %xmm0, %xmm0; \
-  punpcklwd %xmm0, %xmm0; \
-  pshufd $0, %xmm0, %xmm0
+  pxor %xmm1, %xmm1; \
+  pshufb %xmm1, %xmm0; \
+  movq r, %rax
 
-#define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
   movd d, %xmm0; \
-  movq r, %rax; \
-  pshufd $0, %xmm0, %xmm0
+  pshufd $0, %xmm0, %xmm0; \
+  movq r, %rax
+
+# define MEMSET_VDUP_TO_VEC0_HIGH()
+# define MEMSET_VDUP_TO_VEC0_LOW()
+
+# define WMEMSET_VDUP_TO_VEC0_HIGH()
+# define WMEMSET_VDUP_TO_VEC0_LOW()
 
 #define SECTION(p)		p
 
diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
index 1af668af0aeda59e..c0bf2875d03d51ab 100644
--- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
@@ -10,15 +10,18 @@
 # define VMOVU     vmovdqu
 # define VMOVA     vmovdqa
 
-# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
   vmovd d, %xmm0; \
-  movq r, %rax; \
-  vpbroadcastb %xmm0, %ymm0
+  movq r, %rax;
 
-# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
-  vmovd d, %xmm0; \
-  movq r, %rax; \
-  vpbroadcastd %xmm0, %ymm0
+# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
+  MEMSET_SET_VEC0_AND_SET_RETURN(d, r)
+
+# define MEMSET_VDUP_TO_VEC0_HIGH() vpbroadcastb %xmm0, %ymm0
+# define MEMSET_VDUP_TO_VEC0_LOW() vpbroadcastb %xmm0, %xmm0
+
+# define WMEMSET_VDUP_TO_VEC0_HIGH() vpbroadcastd %xmm0, %ymm0
+# define WMEMSET_VDUP_TO_VEC0_LOW() vpbroadcastd %xmm0, %xmm0
 
 # ifndef SECTION
 #  define SECTION(p)		p##.avx
@@ -30,5 +33,6 @@
 #  define WMEMSET_SYMBOL(p,s)	p##_avx2_##s
 # endif
 
+# define USE_XMM_LESS_VEC
 # include "memset-vec-unaligned-erms.S"
 #endif
diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
index f14d6f8493c21a36..5241216a77bf72b7 100644
--- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
@@ -15,13 +15,19 @@
 
 # define VZEROUPPER
 
-# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
-  movq r, %rax; \
-  vpbroadcastb d, %VEC0
+# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
+  vpbroadcastb d, %VEC0; \
+  movq r, %rax
 
-# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
-  movq r, %rax; \
-  vpbroadcastd d, %VEC0
+# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
+  vpbroadcastd d, %VEC0; \
+  movq r, %rax
+
+# define MEMSET_VDUP_TO_VEC0_HIGH()
+# define MEMSET_VDUP_TO_VEC0_LOW()
+
+# define WMEMSET_VDUP_TO_VEC0_HIGH()
+# define WMEMSET_VDUP_TO_VEC0_LOW()
 
 # define SECTION(p)		p##.evex512
 # define MEMSET_SYMBOL(p,s)	p##_avx512_##s
diff --git a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
index 64b09e77cc20cc42..637002150659123c 100644
--- a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
@@ -15,13 +15,19 @@
 
 # define VZEROUPPER
 
-# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
-  movq r, %rax; \
-  vpbroadcastb d, %VEC0
+# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
+  vpbroadcastb d, %VEC0; \
+  movq r, %rax
 
-# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
-  movq r, %rax; \
-  vpbroadcastd d, %VEC0
+# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
+  vpbroadcastd d, %VEC0; \
+  movq r, %rax
+
+# define MEMSET_VDUP_TO_VEC0_HIGH()
+# define MEMSET_VDUP_TO_VEC0_LOW()
+
+# define WMEMSET_VDUP_TO_VEC0_HIGH()
+# define WMEMSET_VDUP_TO_VEC0_LOW()
 
 # define SECTION(p)		p##.evex
 # define MEMSET_SYMBOL(p,s)	p##_evex_##s
diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
index e723413a664c088f..c8db87dcbf69f0d8 100644
--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
@@ -58,8 +58,10 @@
 #ifndef MOVQ
 # if VEC_SIZE > 16
 #  define MOVQ				vmovq
+#  define MOVD				vmovd
 # else
 #  define MOVQ				movq
+#  define MOVD				movd
 # endif
 #endif
 
@@ -72,9 +74,17 @@
 #if defined USE_WITH_EVEX || defined USE_WITH_AVX512
 # define END_REG	rcx
 # define LOOP_REG	rdi
+# define LESS_VEC_REG	rax
 #else
 # define END_REG	rdi
 # define LOOP_REG	rdx
+# define LESS_VEC_REG	rdi
+#endif
+
+#ifdef USE_XMM_LESS_VEC
+# define XMM_SMALL	1
+#else
+# define XMM_SMALL	0
 #endif
 
 #define PAGE_SIZE 4096
@@ -110,8 +120,12 @@ END_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned))
 
 ENTRY (WMEMSET_SYMBOL (__wmemset, unaligned))
 	shl	$2, %RDX_LP
-	WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
-	jmp	L(entry_from_bzero)
+	WMEMSET_SET_VEC0_AND_SET_RETURN (%esi, %rdi)
+	WMEMSET_VDUP_TO_VEC0_LOW()
+	cmpq	$VEC_SIZE, %rdx
+	jb	L(less_vec_no_vdup)
+	WMEMSET_VDUP_TO_VEC0_HIGH()
+	jmp	L(entry_from_wmemset)
 END (WMEMSET_SYMBOL (__wmemset, unaligned))
 #endif
 
@@ -123,7 +137,7 @@ END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned))
 #endif
 
 ENTRY (MEMSET_SYMBOL (__memset, unaligned))
-	MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
+	MEMSET_SET_VEC0_AND_SET_RETURN (%esi, %rdi)
 # ifdef __ILP32__
 	/* Clear the upper 32 bits.  */
 	mov	%edx, %edx
@@ -131,6 +145,8 @@ ENTRY (MEMSET_SYMBOL (__memset, unaligned))
 L(entry_from_bzero):
 	cmpq	$VEC_SIZE, %rdx
 	jb	L(less_vec)
+	MEMSET_VDUP_TO_VEC0_HIGH()
+L(entry_from_wmemset):
 	cmpq	$(VEC_SIZE * 2), %rdx
 	ja	L(more_2x_vec)
 	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
@@ -179,27 +195,27 @@ END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
 # endif
 
 ENTRY_P2ALIGN (MEMSET_SYMBOL (__memset, unaligned_erms), 6)
-	MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
+	MEMSET_SET_VEC0_AND_SET_RETURN (%esi, %rdi)
 # ifdef __ILP32__
 	/* Clear the upper 32 bits.  */
 	mov	%edx, %edx
 # endif
 	cmp	$VEC_SIZE, %RDX_LP
 	jb	L(less_vec)
+	MEMSET_VDUP_TO_VEC0_HIGH ()
 	cmp	$(VEC_SIZE * 2), %RDX_LP
 	ja	L(stosb_more_2x_vec)
-	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.
-	 */
-	VMOVU	%VEC(0), (%rax)
-	VMOVU	%VEC(0), -VEC_SIZE(%rax, %rdx)
+	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
+	VMOVU	%VEC(0), (%rdi)
+	VMOVU	%VEC(0), (VEC_SIZE * -1)(%rdi, %rdx)
 	VZEROUPPER_RETURN
 #endif
 
-	.p2align 4,, 10
+	.p2align 4,, 4
 L(last_2x_vec):
 #ifdef USE_LESS_VEC_MASK_STORE
-	VMOVU	%VEC(0), (VEC_SIZE * 2 + LOOP_4X_OFFSET)(%rcx)
-	VMOVU	%VEC(0), (VEC_SIZE * 3 + LOOP_4X_OFFSET)(%rcx)
+	VMOVU	%VEC(0), (VEC_SIZE * -2)(%rdi, %rdx)
+	VMOVU	%VEC(0), (VEC_SIZE * -1)(%rdi, %rdx)
 #else
 	VMOVU	%VEC(0), (VEC_SIZE * -2)(%rdi)
 	VMOVU	%VEC(0), (VEC_SIZE * -1)(%rdi)
@@ -212,6 +228,7 @@ L(last_2x_vec):
 #ifdef USE_LESS_VEC_MASK_STORE
 	.p2align 4,, 10
 L(less_vec):
+L(less_vec_no_vdup):
 	/* Less than 1 VEC.  */
 # if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
 #  error Unsupported VEC_SIZE!
@@ -262,28 +279,18 @@ L(stosb_more_2x_vec):
 	/* Fallthrough goes to L(loop_4x_vec). Tests for memset (2x, 4x]
 	   and (4x, 8x] jump to target.  */
 L(more_2x_vec):
-
-	/* Two different methods of setting up pointers / compare. The
-	   two methods are based on the fact that EVEX/AVX512 mov
-	   instructions take more bytes then AVX2/SSE2 mov instructions. As
-	   well that EVEX/AVX512 machines also have fast LEA_BID. Both
-	   setup and END_REG to avoid complex address mode. For EVEX/AVX512
-	   this saves code size and keeps a few targets in one fetch block.
-	   For AVX2/SSE2 this helps prevent AGU bottlenecks.  */
-#if defined USE_WITH_EVEX || defined USE_WITH_AVX512
-	/* If EVEX/AVX512 compute END_REG - (VEC_SIZE * 4 +
-	   LOOP_4X_OFFSET) with LEA_BID.  */
-
-	/* END_REG is rcx for EVEX/AVX512.  */
-	leaq	-(VEC_SIZE * 4 + LOOP_4X_OFFSET)(%rdi, %rdx), %END_REG
-#endif
-
-	/* Stores to first 2x VEC before cmp as any path forward will
-	   require it.  */
-	VMOVU	%VEC(0), (%rax)
-	VMOVU	%VEC(0), VEC_SIZE(%rax)
+	/* Store next 2x vec regardless.  */
+	VMOVU	%VEC(0), (%rdi)
+	VMOVU	%VEC(0), (VEC_SIZE * 1)(%rdi)
 
 
+	/* Two different methods of setting up pointers / compare. The two
+	   methods are based on the fact that EVEX/AVX512 mov instructions take
+	   more bytes then AVX2/SSE2 mov instructions. As well that EVEX/AVX512
+	   machines also have fast LEA_BID. Both setup and END_REG to avoid complex
+	   address mode. For EVEX/AVX512 this saves code size and keeps a few
+	   targets in one fetch block. For AVX2/SSE2 this helps prevent AGU
+	   bottlenecks.  */
 #if !(defined USE_WITH_EVEX || defined USE_WITH_AVX512)
 	/* If AVX2/SSE2 compute END_REG (rdi) with ALU.  */
 	addq	%rdx, %END_REG
@@ -292,6 +299,15 @@ L(more_2x_vec):
 	cmpq	$(VEC_SIZE * 4), %rdx
 	jbe	L(last_2x_vec)
 
+
+#if defined USE_WITH_EVEX || defined USE_WITH_AVX512
+	/* If EVEX/AVX512 compute END_REG - (VEC_SIZE * 4 + LOOP_4X_OFFSET) with
+	   LEA_BID.  */
+
+	/* END_REG is rcx for EVEX/AVX512.  */
+	leaq	-(VEC_SIZE * 4 + LOOP_4X_OFFSET)(%rdi, %rdx), %END_REG
+#endif
+
 	/* Store next 2x vec regardless.  */
 	VMOVU	%VEC(0), (VEC_SIZE * 2)(%rax)
 	VMOVU	%VEC(0), (VEC_SIZE * 3)(%rax)
@@ -355,65 +371,93 @@ L(stosb_local):
 	/* Define L(less_vec) only if not otherwise defined.  */
 	.p2align 4
 L(less_vec):
+	/* Broadcast esi to partial register (i.e VEC_SIZE == 32 broadcast to
+	   xmm). This is only does anything for AVX2.  */
+	MEMSET_VDUP_TO_VEC0_LOW ()
+L(less_vec_no_vdup):
 #endif
 L(cross_page):
 #if VEC_SIZE > 32
 	cmpl	$32, %edx
-	jae	L(between_32_63)
+	jge	L(between_32_63)
 #endif
 #if VEC_SIZE > 16
 	cmpl	$16, %edx
-	jae	L(between_16_31)
+	jge	L(between_16_31)
+#endif
+#ifndef USE_XMM_LESS_VEC
+	MOVQ	%XMM0, %rcx
 #endif
-	MOVQ	%XMM0, %rdi
 	cmpl	$8, %edx
-	jae	L(between_8_15)
+	jge	L(between_8_15)
 	cmpl	$4, %edx
-	jae	L(between_4_7)
+	jge	L(between_4_7)
 	cmpl	$1, %edx
-	ja	L(between_2_3)
-	jb	L(return)
-	movb	%sil, (%rax)
-	VZEROUPPER_RETURN
+	jg	L(between_2_3)
+	jl	L(between_0_0)
+	movb	%sil, (%LESS_VEC_REG)
+L(between_0_0):
+	ret
 
-	/* Align small targets only if not doing so would cross a fetch
-	   line.  */
+	/* Align small targets only if not doing so would cross a fetch line.
+	 */
 #if VEC_SIZE > 32
 	.p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, RET_SIZE)
 	/* From 32 to 63.  No branch when size == 32.  */
 L(between_32_63):
-	VMOVU	%YMM0, (%rax)
-	VMOVU	%YMM0, -32(%rax, %rdx)
+	VMOVU	%YMM0, (%LESS_VEC_REG)
+	VMOVU	%YMM0, -32(%LESS_VEC_REG, %rdx)
 	VZEROUPPER_RETURN
 #endif
 
 #if VEC_SIZE >= 32
-	.p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, RET_SIZE)
+	.p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, 1)
 L(between_16_31):
 	/* From 16 to 31.  No branch when size == 16.  */
-	VMOVU	%XMM0, (%rax)
-	VMOVU	%XMM0, -16(%rax, %rdx)
-	VZEROUPPER_RETURN
+	VMOVU	%XMM0, (%LESS_VEC_REG)
+	VMOVU	%XMM0, -16(%LESS_VEC_REG, %rdx)
+	ret
 #endif
 
-	.p2align 4,, SMALL_MEMSET_ALIGN(3, RET_SIZE)
+	/* Move size is 3 for SSE2, EVEX, and AVX512. Move size is 4 for AVX2.
+	 */
+	.p2align 4,, SMALL_MEMSET_ALIGN(3 + XMM_SMALL, 1)
 L(between_8_15):
 	/* From 8 to 15.  No branch when size == 8.  */
-	movq	%rdi, (%rax)
-	movq	%rdi, -8(%rax, %rdx)
-	VZEROUPPER_RETURN
+#ifdef USE_XMM_LESS_VEC
+	MOVQ	%XMM0, (%rdi)
+	MOVQ	%XMM0, -8(%rdi, %rdx)
+#else
+	movq	%rcx, (%LESS_VEC_REG)
+	movq	%rcx, -8(%LESS_VEC_REG, %rdx)
+#endif
+	ret
 
-	.p2align 4,, SMALL_MEMSET_ALIGN(2, RET_SIZE)
+	/* Move size is 2 for SSE2, EVEX, and AVX512. Move size is 4 for AVX2.
+	 */
+	.p2align 4,, SMALL_MEMSET_ALIGN(2 << XMM_SMALL, 1)
 L(between_4_7):
 	/* From 4 to 7.  No branch when size == 4.  */
-	movl	%edi, (%rax)
-	movl	%edi, -4(%rax, %rdx)
-	VZEROUPPER_RETURN
+#ifdef USE_XMM_LESS_VEC
+	MOVD	%XMM0, (%rdi)
+	MOVD	%XMM0, -4(%rdi, %rdx)
+#else
+	movl	%ecx, (%LESS_VEC_REG)
+	movl	%ecx, -4(%LESS_VEC_REG, %rdx)
+#endif
+	ret
 
-	.p2align 4,, SMALL_MEMSET_ALIGN(3, RET_SIZE)
+	/* 4 * XMM_SMALL for the third mov for AVX2.  */
+	.p2align 4,, 4 * XMM_SMALL + SMALL_MEMSET_ALIGN(3, 1)
 L(between_2_3):
 	/* From 2 to 3.  No branch when size == 2.  */
-	movw	%di, (%rax)
-	movb	%dil, -1(%rax, %rdx)
-	VZEROUPPER_RETURN
+#ifdef USE_XMM_LESS_VEC
+	movb	%sil, (%rdi)
+	movb	%sil, 1(%rdi)
+	movb	%sil, -1(%rdi, %rdx)
+#else
+	movw	%cx, (%LESS_VEC_REG)
+	movb	%sil, -1(%LESS_VEC_REG, %rdx)
+#endif
+	ret
 END (MEMSET_SYMBOL (__memset, unaligned_erms))
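
For the pointer-setup comment in the last hunk ("Two different methods of
setting up pointers / compare"), the two methods reduce to roughly the
following (a condensed sketch, not part of the patch; VEC_SIZE and
LOOP_4X_OFFSET are as defined elsewhere in memset-vec-unaligned-erms.S):

    /* EVEX/AVX512: END_REG is rcx, computed with a single LEA_BID,
       which saves code size and keeps nearby targets in one fetch
       block.  */
    leaq	-(VEC_SIZE * 4 + LOOP_4X_OFFSET)(%rdi, %rdx), %rcx

    /* AVX2/SSE2: END_REG is rdi, advanced with a plain ALU add so the
       later stores avoid a complex address mode that could bottleneck
       the AGUs.  */
    addq	%rdx, %rdi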