commit ea19c490a3f5628d55ded271cbb753e66b2f05e8
Author: Noah Goldstein <goldstein.w.n@gmail.com>
Date:   Sun Feb 6 00:54:18 2022 -0600
    x86: Improve vec generation in memset-vec-unaligned-erms.S
    
    No bug.
    
    Split vec generation into multiple steps. This allows the
    broadcast in AVX2 to use 'xmm' registers for the L(less_vec)
    case. This saves an expensive lane-crossing instruction and
    removes the need for 'vzeroupper'.
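
    Roughly, the shape of the new AVX2 path (the macro names are the
    ones introduced below; size checks, wmemset and ILP32 handling are
    simplified, so this is a sketch rather than the literal code):

        vmovd   %esi, %xmm0              # MEMSET_SET_VEC0_AND_SET_RETURN
        movq    %rdi, %rax
        cmpq    $VEC_SIZE, %rdx
        jb      L(less_vec)
        vpbroadcastb %xmm0, %ymm0        # MEMSET_VDUP_TO_VEC0_HIGH
        ...
    L(less_vec):
        vpbroadcastb %xmm0, %xmm0        # MEMSET_VDUP_TO_VEC0_LOW: stays in
        ...                              # the low 128 bits and leaves the
                                         # upper ymm bits zeroed, so the
                                         # small-size stores can end in a
                                         # plain 'ret' instead of
                                         # VZEROUPPER_RETURN.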
    
    For SSE2 replace 2x 'punpck' instructions with zero-idiom 'pxor' for
    byte broadcast.
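
    For reference, the before/after byte-broadcast sequences from the
    memset.S hunk below, with comments added here for illustration:

        # Before: widen the byte step by step.
        movd    %esi, %xmm0
        punpcklbw %xmm0, %xmm0           # byte c -> word cc
        punpcklwd %xmm0, %xmm0           # word cc -> dword cccc
        pshufd  $0, %xmm0, %xmm0         # broadcast the low dword

        # After: pxor is a dependency-free zeroing idiom, and pshufb
        # with an all-zero mask replicates byte 0 of xmm0 into every
        # byte of the register.
        movd    %esi, %xmm0
        pxor    %xmm1, %xmm1
        pshufb  %xmm1, %xmm0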
    
    Results for memset-avx2 small (geomean of N = 20 benchset runs).
    
    size, Old Time, New Time, New / Old
       0,    4.100,    3.831,     0.934
       1,    5.074,    4.399,     0.867
       2,    4.433,    4.411,     0.995
       4,    4.487,    4.415,     0.984
       8,    4.454,    4.396,     0.987
      16,    4.502,    4.443,     0.987
    
    All relevant string/wcsmbs tests are passing.
    Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
    
    (cherry picked from commit b62ace2740a106222e124cc86956448fa07abf4d)
diff --git a/sysdeps/x86_64/memset.S b/sysdeps/x86_64/memset.S
index 0137eba4cdd9f830..34ee0bfdcb81fb39 100644
--- a/sysdeps/x86_64/memset.S
+++ b/sysdeps/x86_64/memset.S
@@ -28,17 +28,22 @@
 #define VMOVU     movups
 #define VMOVA     movaps
 
-#define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
   movd d, %xmm0; \
-  movq r, %rax; \
-  punpcklbw %xmm0, %xmm0; \
-  punpcklwd %xmm0, %xmm0; \
-  pshufd $0, %xmm0, %xmm0
+  pxor %xmm1, %xmm1; \
+  pshufb %xmm1, %xmm0; \
+  movq r, %rax
 
-#define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
   movd d, %xmm0; \
-  movq r, %rax; \
-  pshufd $0, %xmm0, %xmm0
+  pshufd $0, %xmm0, %xmm0; \
+  movq r, %rax
+
+# define MEMSET_VDUP_TO_VEC0_HIGH()
+# define MEMSET_VDUP_TO_VEC0_LOW()
+
+# define WMEMSET_VDUP_TO_VEC0_HIGH()
+# define WMEMSET_VDUP_TO_VEC0_LOW()
 
 #define SECTION(p)		p
 
diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
index 1af668af0aeda59e..c0bf2875d03d51ab 100644
--- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
@@ -10,15 +10,18 @@
 # define VMOVU     vmovdqu
 # define VMOVA     vmovdqa
 
-# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
   vmovd d, %xmm0; \
-  movq r, %rax; \
-  vpbroadcastb %xmm0, %ymm0
+  movq r, %rax;
 
-# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
-  vmovd d, %xmm0; \
-  movq r, %rax; \
-  vpbroadcastd %xmm0, %ymm0
+# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
+  MEMSET_SET_VEC0_AND_SET_RETURN(d, r)
+
+# define MEMSET_VDUP_TO_VEC0_HIGH() vpbroadcastb %xmm0, %ymm0
+# define MEMSET_VDUP_TO_VEC0_LOW() vpbroadcastb %xmm0, %xmm0
+
+# define WMEMSET_VDUP_TO_VEC0_HIGH() vpbroadcastd %xmm0, %ymm0
+# define WMEMSET_VDUP_TO_VEC0_LOW() vpbroadcastd %xmm0, %xmm0
 
 # ifndef SECTION
 #  define SECTION(p)		p##.avx
@@ -30,5 +33,6 @@
 #  define WMEMSET_SYMBOL(p,s)	p##_avx2_##s
 # endif
 
+# define USE_XMM_LESS_VEC
 # include "memset-vec-unaligned-erms.S"
 #endif
diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
index f14d6f8493c21a36..5241216a77bf72b7 100644
--- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
@@ -15,13 +15,19 @@
 
 # define VZEROUPPER
 
-# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
-  movq r, %rax; \
-  vpbroadcastb d, %VEC0
+# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
+  vpbroadcastb d, %VEC0; \
+  movq r, %rax
 
-# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
-  movq r, %rax; \
-  vpbroadcastd d, %VEC0
+# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
+  vpbroadcastd d, %VEC0; \
+  movq r, %rax
+
+# define MEMSET_VDUP_TO_VEC0_HIGH()
+# define MEMSET_VDUP_TO_VEC0_LOW()
+
+# define WMEMSET_VDUP_TO_VEC0_HIGH()
+# define WMEMSET_VDUP_TO_VEC0_LOW()
 
 # define SECTION(p)		p##.evex512
 # define MEMSET_SYMBOL(p,s)	p##_avx512_##s
diff --git a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
index 64b09e77cc20cc42..637002150659123c 100644
--- a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
@@ -15,13 +15,19 @@
 
 # define VZEROUPPER
 
-# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
-  movq r, %rax; \
-  vpbroadcastb d, %VEC0
+# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
+  vpbroadcastb d, %VEC0; \
+  movq r, %rax
 
-# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
-  movq r, %rax; \
-  vpbroadcastd d, %VEC0
+# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
+  vpbroadcastd d, %VEC0; \
+  movq r, %rax
+
+# define MEMSET_VDUP_TO_VEC0_HIGH()
+# define MEMSET_VDUP_TO_VEC0_LOW()
+
+# define WMEMSET_VDUP_TO_VEC0_HIGH()
+# define WMEMSET_VDUP_TO_VEC0_LOW()
 
 # define SECTION(p)		p##.evex
 # define MEMSET_SYMBOL(p,s)	p##_evex_##s
diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
index e723413a664c088f..c8db87dcbf69f0d8 100644
--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
@@ -58,8 +58,10 @@
 #ifndef MOVQ
 # if VEC_SIZE > 16
 #  define MOVQ				vmovq
+#  define MOVD				vmovd
 # else
 #  define MOVQ				movq
+#  define MOVD				movd
 # endif
 #endif
 
@@ -72,9 +74,17 @@
 #if defined USE_WITH_EVEX || defined USE_WITH_AVX512
 # define END_REG	rcx
 # define LOOP_REG	rdi
+# define LESS_VEC_REG	rax
 #else
 # define END_REG	rdi
 # define LOOP_REG	rdx
+# define LESS_VEC_REG	rdi
+#endif
+
+#ifdef USE_XMM_LESS_VEC
+# define XMM_SMALL	1
+#else
+# define XMM_SMALL	0
 #endif
 
 #define PAGE_SIZE 4096
@@ -110,8 +120,12 @@ END_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned))
 
 ENTRY (WMEMSET_SYMBOL (__wmemset, unaligned))
 	shl	$2, %RDX_LP
-	WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
-	jmp	L(entry_from_bzero)
+	WMEMSET_SET_VEC0_AND_SET_RETURN (%esi, %rdi)
+	WMEMSET_VDUP_TO_VEC0_LOW()
+	cmpq	$VEC_SIZE, %rdx
+	jb	L(less_vec_no_vdup)
+	WMEMSET_VDUP_TO_VEC0_HIGH()
+	jmp	L(entry_from_wmemset)
 END (WMEMSET_SYMBOL (__wmemset, unaligned))
 #endif
 
@@ -123,7 +137,7 @@ END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned))
 #endif
 
 ENTRY (MEMSET_SYMBOL (__memset, unaligned))
-	MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
+	MEMSET_SET_VEC0_AND_SET_RETURN (%esi, %rdi)
 # ifdef __ILP32__
 	/* Clear the upper 32 bits.  */
 	mov	%edx, %edx
@@ -131,6 +145,8 @@ ENTRY (MEMSET_SYMBOL (__memset, unaligned))
 L(entry_from_bzero):
 	cmpq	$VEC_SIZE, %rdx
 	jb	L(less_vec)
+	MEMSET_VDUP_TO_VEC0_HIGH()
+L(entry_from_wmemset):
 	cmpq	$(VEC_SIZE * 2), %rdx
 	ja	L(more_2x_vec)
 	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
@@ -179,27 +195,27 @@ END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
 # endif
 
 ENTRY_P2ALIGN (MEMSET_SYMBOL (__memset, unaligned_erms), 6)
-	MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
+	MEMSET_SET_VEC0_AND_SET_RETURN (%esi, %rdi)
 # ifdef __ILP32__
 	/* Clear the upper 32 bits.  */
 	mov	%edx, %edx
 # endif
 	cmp	$VEC_SIZE, %RDX_LP
 	jb	L(less_vec)
+	MEMSET_VDUP_TO_VEC0_HIGH ()
 	cmp	$(VEC_SIZE * 2), %RDX_LP
 	ja	L(stosb_more_2x_vec)
-	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.
-	 */
-	VMOVU	%VEC(0), (%rax)
-	VMOVU	%VEC(0), -VEC_SIZE(%rax, %rdx)
+	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
+	VMOVU	%VEC(0), (%rdi)
+	VMOVU	%VEC(0), (VEC_SIZE * -1)(%rdi, %rdx)
 	VZEROUPPER_RETURN
 #endif
 
-	.p2align 4,, 10
+	.p2align 4,, 4
 L(last_2x_vec):
 #ifdef USE_LESS_VEC_MASK_STORE
-	VMOVU	%VEC(0), (VEC_SIZE * 2 + LOOP_4X_OFFSET)(%rcx)
-	VMOVU	%VEC(0), (VEC_SIZE * 3 + LOOP_4X_OFFSET)(%rcx)
+	VMOVU	%VEC(0), (VEC_SIZE * -2)(%rdi, %rdx)
+	VMOVU	%VEC(0), (VEC_SIZE * -1)(%rdi, %rdx)
 #else
 	VMOVU	%VEC(0), (VEC_SIZE * -2)(%rdi)
 	VMOVU	%VEC(0), (VEC_SIZE * -1)(%rdi)
@@ -212,6 +228,7 @@ L(last_2x_vec):
 #ifdef USE_LESS_VEC_MASK_STORE
 	.p2align 4,, 10
 L(less_vec):
+L(less_vec_no_vdup):
 	/* Less than 1 VEC.  */
 # if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
 #  error Unsupported VEC_SIZE!
@@ -262,28 +279,18 @@ L(stosb_more_2x_vec):
 	/* Fallthrough goes to L(loop_4x_vec). Tests for memset (2x, 4x]
 	   and (4x, 8x] jump to target.  */
 L(more_2x_vec):
-
-	/* Two different methods of setting up pointers / compare. The
-	   two methods are based on the fact that EVEX/AVX512 mov
-	   instructions take more bytes then AVX2/SSE2 mov instructions. As
-	   well that EVEX/AVX512 machines also have fast LEA_BID. Both
-	   setup and END_REG to avoid complex address mode. For EVEX/AVX512
-	   this saves code size and keeps a few targets in one fetch block.
-	   For AVX2/SSE2 this helps prevent AGU bottlenecks.  */
-#if defined USE_WITH_EVEX || defined USE_WITH_AVX512
-	/* If EVEX/AVX512 compute END_REG - (VEC_SIZE * 4 +
-	   LOOP_4X_OFFSET) with LEA_BID.  */
-
-	/* END_REG is rcx for EVEX/AVX512.  */
-	leaq	-(VEC_SIZE * 4 + LOOP_4X_OFFSET)(%rdi, %rdx), %END_REG
-#endif
-
-	/* Stores to first 2x VEC before cmp as any path forward will
-	   require it.  */
-	VMOVU	%VEC(0), (%rax)
-	VMOVU	%VEC(0), VEC_SIZE(%rax)
+	/* Store next 2x vec regardless.  */
+	VMOVU	%VEC(0), (%rdi)
+	VMOVU	%VEC(0), (VEC_SIZE * 1)(%rdi)
 
 
+	/* Two different methods of setting up pointers / compare. The two
+	   methods are based on the fact that EVEX/AVX512 mov instructions take
+	   more bytes than AVX2/SSE2 mov instructions. As well that EVEX/AVX512
+	   machines also have fast LEA_BID. Both setup and END_REG to avoid complex
+	   address mode. For EVEX/AVX512 this saves code size and keeps a few
+	   targets in one fetch block. For AVX2/SSE2 this helps prevent AGU
+	   bottlenecks.  */
 #if !(defined USE_WITH_EVEX || defined USE_WITH_AVX512)
 	/* If AVX2/SSE2 compute END_REG (rdi) with ALU.  */
 	addq	%rdx, %END_REG
@@ -292,6 +299,15 @@ L(more_2x_vec):
 	cmpq	$(VEC_SIZE * 4), %rdx
 	jbe	L(last_2x_vec)
 
+
+#if defined USE_WITH_EVEX || defined USE_WITH_AVX512
+	/* If EVEX/AVX512 compute END_REG - (VEC_SIZE * 4 + LOOP_4X_OFFSET) with
+	   LEA_BID.  */
+
+	/* END_REG is rcx for EVEX/AVX512.  */
+	leaq	-(VEC_SIZE * 4 + LOOP_4X_OFFSET)(%rdi, %rdx), %END_REG
+#endif
+
 	/* Store next 2x vec regardless.  */
 	VMOVU	%VEC(0), (VEC_SIZE * 2)(%rax)
 	VMOVU	%VEC(0), (VEC_SIZE * 3)(%rax)
@@ -355,65 +371,93 @@ L(stosb_local):
 	/* Define L(less_vec) only if not otherwise defined.  */
 	.p2align 4
 L(less_vec):
+	/* Broadcast esi to partial register (i.e. VEC_SIZE == 32 broadcast to
+	   xmm). This only does anything for AVX2.  */
+	MEMSET_VDUP_TO_VEC0_LOW ()
+L(less_vec_no_vdup):
 #endif
 L(cross_page):
 #if VEC_SIZE > 32
 	cmpl	$32, %edx
-	jae	L(between_32_63)
+	jge	L(between_32_63)
 #endif
 #if VEC_SIZE > 16
 	cmpl	$16, %edx
-	jae	L(between_16_31)
+	jge	L(between_16_31)
+#endif
+#ifndef USE_XMM_LESS_VEC
+	MOVQ	%XMM0, %rcx
 #endif
-	MOVQ	%XMM0, %rdi
 	cmpl	$8, %edx
-	jae	L(between_8_15)
+	jge	L(between_8_15)
 	cmpl	$4, %edx
-	jae	L(between_4_7)
+	jge	L(between_4_7)
 	cmpl	$1, %edx
-	ja	L(between_2_3)
-	jb	L(return)
-	movb	%sil, (%rax)
-	VZEROUPPER_RETURN
+	jg	L(between_2_3)
+	jl	L(between_0_0)
+	movb	%sil, (%LESS_VEC_REG)
+L(between_0_0):
+	ret
 
-	/* Align small targets only if not doing so would cross a fetch
-	   line.  */
+	/* Align small targets only if not doing so would cross a fetch line.
+	 */
 #if VEC_SIZE > 32
 	.p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, RET_SIZE)
 	/* From 32 to 63.  No branch when size == 32.  */
 L(between_32_63):
-	VMOVU	%YMM0, (%rax)
-	VMOVU	%YMM0, -32(%rax, %rdx)
+	VMOVU	%YMM0, (%LESS_VEC_REG)
+	VMOVU	%YMM0, -32(%LESS_VEC_REG, %rdx)
 	VZEROUPPER_RETURN
 #endif
 
 #if VEC_SIZE >= 32
-	.p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, RET_SIZE)
+	.p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, 1)
 L(between_16_31):
 	/* From 16 to 31.  No branch when size == 16.  */
-	VMOVU	%XMM0, (%rax)
-	VMOVU	%XMM0, -16(%rax, %rdx)
-	VZEROUPPER_RETURN
+	VMOVU	%XMM0, (%LESS_VEC_REG)
+	VMOVU	%XMM0, -16(%LESS_VEC_REG, %rdx)
+	ret
 #endif
 
-	.p2align 4,, SMALL_MEMSET_ALIGN(3, RET_SIZE)
+	/* Move size is 3 for SSE2, EVEX, and AVX512. Move size is 4 for AVX2.
+	 */
+	.p2align 4,, SMALL_MEMSET_ALIGN(3 + XMM_SMALL, 1)
 L(between_8_15):
 	/* From 8 to 15.  No branch when size == 8.  */
-	movq	%rdi, (%rax)
-	movq	%rdi, -8(%rax, %rdx)
-	VZEROUPPER_RETURN
+#ifdef USE_XMM_LESS_VEC
+	MOVQ	%XMM0, (%rdi)
+	MOVQ	%XMM0, -8(%rdi, %rdx)
+#else
+	movq	%rcx, (%LESS_VEC_REG)
+	movq	%rcx, -8(%LESS_VEC_REG, %rdx)
+#endif
+	ret
 
-	.p2align 4,, SMALL_MEMSET_ALIGN(2, RET_SIZE)
+	/* Move size is 2 for SSE2, EVEX, and AVX512. Move size is 4 for AVX2.
+	 */
+	.p2align 4,, SMALL_MEMSET_ALIGN(2 << XMM_SMALL, 1)
 L(between_4_7):
 	/* From 4 to 7.  No branch when size == 4.  */
-	movl	%edi, (%rax)
-	movl	%edi, -4(%rax, %rdx)
-	VZEROUPPER_RETURN
+#ifdef USE_XMM_LESS_VEC
+	MOVD	%XMM0, (%rdi)
+	MOVD	%XMM0, -4(%rdi, %rdx)
+#else
+	movl	%ecx, (%LESS_VEC_REG)
+	movl	%ecx, -4(%LESS_VEC_REG, %rdx)
+#endif
+	ret
 
-	.p2align 4,, SMALL_MEMSET_ALIGN(3, RET_SIZE)
+	/* 4 * XMM_SMALL for the third mov for AVX2.  */
+	.p2align 4,, 4 * XMM_SMALL + SMALL_MEMSET_ALIGN(3, 1)
 L(between_2_3):
 	/* From 2 to 3.  No branch when size == 2.  */
-	movw	%di, (%rax)
-	movb	%dil, -1(%rax, %rdx)
-	VZEROUPPER_RETURN
+#ifdef USE_XMM_LESS_VEC
+	movb	%sil, (%rdi)
+	movb	%sil, 1(%rdi)
+	movb	%sil, -1(%rdi, %rdx)
+#else
+	movw	%cx, (%LESS_VEC_REG)
+	movb	%sil, -1(%LESS_VEC_REG, %rdx)
+#endif
+	ret
 END (MEMSET_SYMBOL (__memset, unaligned_erms))