From 413e4abc92aeb12fb4c188aa53f0425ceac0ef15 Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Sun, 6 Feb 2022 00:54:18 -0600
Subject: [PATCH] x86: Improve vec generation in memset-vec-unaligned-erms.S

No bug.

Split vec generation into multiple steps. This allows the
broadcast in AVX2 to use 'xmm' registers for the L(less_vec)
case. This saves an expensive lane-cross instruction and removes
the need for 'vzeroupper'.
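
As a standalone sketch of that split for the AVX2 case (illustration
only, not the glibc code: the symbol names below are invented and
VEC_SIZE is hard-coded to 32), the value and return-value setup happen
once, and the broadcast is deferred until the size check has picked a
path, so the short path never writes a ymm register:

	.text
	.globl	vec_gen_sketch
	.type	vec_gen_sketch, @function
	/* %esi = byte to set, %rdx = length, %rdi = destination;
	   returns the destination in %rax, as memset does.  */
vec_gen_sketch:
	vmovd	%esi, %xmm0		/* Step 1: value into the low lane.  */
	movq	%rdi, %rax		/* Set up memset's return value.  */
	cmpq	$32, %rdx
	jb	.Lless_vec
	vpbroadcastb %xmm0, %ymm0	/* Step 2a: full-width, lane-crossing.  */
	/* ...full-VEC stores with %ymm0 would go here...  */
	vzeroupper			/* The wide path still pays for this.  */
	ret
.Lless_vec:
	vpbroadcastb %xmm0, %xmm0	/* Step 2b: in-lane broadcast only.  */
	/* ...sub-VEC stores with %xmm0 / GPRs would go here...  */
	ret				/* No lane cross, no vzeroupper.  */
	.size	vec_gen_sketch, .-vec_gen_sketch

In the real code the two broadcast steps are the new
MEMSET_VDUP_TO_VEC0_HIGH / MEMSET_VDUP_TO_VEC0_LOW macros; the SSE2,
EVEX and AVX512 variants define them as empty because their initial
broadcast already produces the full vector.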

For SSE2 replace 2x 'punpck' instructions with zero-idiom 'pxor' for
byte broadcast.
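
Side by side, the two baseline byte-broadcast sequences look roughly
like this (illustration only; the function names are invented and both
routines simply leave the splatted byte in %xmm0):

	.text
	.globl	byte_bcast_old
	.type	byte_bcast_old, @function
byte_bcast_old:				/* Old: movd + 2x punpck + pshufd.  */
	movd	%esi, %xmm0
	punpcklbw %xmm0, %xmm0		/* Byte 0 -> bytes 0-1.  */
	punpcklwd %xmm0, %xmm0		/* Low word -> bytes 0-3.  */
	pshufd	$0, %xmm0, %xmm0	/* Dword 0 -> all 16 bytes.  */
	ret
	.size	byte_bcast_old, .-byte_bcast_old

	.globl	byte_bcast_new
	.type	byte_bcast_new, @function
byte_bcast_new:				/* New: movd + pxor + pshufb.  */
	movd	%esi, %xmm0
	pxor	%xmm1, %xmm1		/* Zero idiom: no real dependency.  */
	pshufb	%xmm1, %xmm0		/* All-zero control broadcasts byte 0.  */
	ret
	.size	byte_bcast_new, .-byte_bcast_new

The dependency chain after the movd shrinks from three shuffles to one,
since the pxor that produces the all-zero control is recognized as a
zero idiom and stays off the critical path.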

Results for memset-avx2 small (geomean of N = 20 benchset runs).

size, New Time, Old Time, New / Old
   0,    4.100,    3.831,     0.934
   1,    5.074,    4.399,     0.867
   2,    4.433,    4.411,     0.995
   4,    4.487,    4.415,     0.984
   8,    4.454,    4.396,     0.987
  16,    4.502,    4.443,     0.987

All relevant string/wcsmbs tests are passing.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>

(cherry picked from commit b62ace2740a106222e124cc86956448fa07abf4d)
---
 sysdeps/x86_64/memset.S                       |  21 ++-
 .../multiarch/memset-avx2-unaligned-erms.S    |  18 +-
 .../multiarch/memset-avx512-unaligned-erms.S  |  18 +-
 .../multiarch/memset-evex-unaligned-erms.S    |  18 +-
 .../multiarch/memset-vec-unaligned-erms.S     | 164 +++++++++++-------
 5 files changed, 152 insertions(+), 87 deletions(-)

diff --git a/sysdeps/x86_64/memset.S b/sysdeps/x86_64/memset.S
index 8672b030..27debd2b 100644
--- a/sysdeps/x86_64/memset.S
+++ b/sysdeps/x86_64/memset.S
@@ -28,17 +28,22 @@
 #define VMOVU     movups
 #define VMOVA     movaps
 
-#define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
   movd d, %xmm0; \
-  movq r, %rax; \
-  punpcklbw %xmm0, %xmm0; \
-  punpcklwd %xmm0, %xmm0; \
-  pshufd $0, %xmm0, %xmm0
+  pxor %xmm1, %xmm1; \
+  pshufb %xmm1, %xmm0; \
+  movq r, %rax
 
-#define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
   movd d, %xmm0; \
-  movq r, %rax; \
-  pshufd $0, %xmm0, %xmm0
+  pshufd $0, %xmm0, %xmm0; \
+  movq r, %rax
+
+# define MEMSET_VDUP_TO_VEC0_HIGH()
+# define MEMSET_VDUP_TO_VEC0_LOW()
+
+# define WMEMSET_VDUP_TO_VEC0_HIGH()
+# define WMEMSET_VDUP_TO_VEC0_LOW()
 
 #define SECTION(p)		p
 
diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
index 1af668af..c0bf2875 100644
--- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
@@ -10,15 +10,18 @@
 # define VMOVU     vmovdqu
 # define VMOVA     vmovdqa
 
-# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
   vmovd d, %xmm0; \
-  movq r, %rax; \
-  vpbroadcastb %xmm0, %ymm0
+  movq r, %rax;
 
-# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
-  vmovd d, %xmm0; \
-  movq r, %rax; \
-  vpbroadcastd %xmm0, %ymm0
+# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
+  MEMSET_SET_VEC0_AND_SET_RETURN(d, r)
+
+# define MEMSET_VDUP_TO_VEC0_HIGH() vpbroadcastb %xmm0, %ymm0
+# define MEMSET_VDUP_TO_VEC0_LOW() vpbroadcastb %xmm0, %xmm0
+
+# define WMEMSET_VDUP_TO_VEC0_HIGH() vpbroadcastd %xmm0, %ymm0
+# define WMEMSET_VDUP_TO_VEC0_LOW() vpbroadcastd %xmm0, %xmm0
 
 # ifndef SECTION
 #  define SECTION(p)		p##.avx
@@ -30,5 +33,6 @@
 #  define WMEMSET_SYMBOL(p,s)	p##_avx2_##s
 # endif
 
+# define USE_XMM_LESS_VEC
 # include "memset-vec-unaligned-erms.S"
 #endif
diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
index f14d6f84..5241216a 100644
--- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
@@ -15,13 +15,19 @@
 
 # define VZEROUPPER
 
-# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
-  movq r, %rax; \
-  vpbroadcastb d, %VEC0
+# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
+  vpbroadcastb d, %VEC0; \
+  movq r, %rax
 
-# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
-  movq r, %rax; \
-  vpbroadcastd d, %VEC0
+# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
+  vpbroadcastd d, %VEC0; \
+  movq r, %rax
+
+# define MEMSET_VDUP_TO_VEC0_HIGH()
+# define MEMSET_VDUP_TO_VEC0_LOW()
+
+# define WMEMSET_VDUP_TO_VEC0_HIGH()
+# define WMEMSET_VDUP_TO_VEC0_LOW()
 
 # define SECTION(p)		p##.evex512
 # define MEMSET_SYMBOL(p,s)	p##_avx512_##s
diff --git a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
index 64b09e77..63700215 100644
--- a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
@@ -15,13 +15,19 @@
 
 # define VZEROUPPER
 
-# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
-  movq r, %rax; \
-  vpbroadcastb d, %VEC0
+# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
+  vpbroadcastb d, %VEC0; \
+  movq r, %rax
 
-# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
-  movq r, %rax; \
-  vpbroadcastd d, %VEC0
+# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
+  vpbroadcastd d, %VEC0; \
+  movq r, %rax
+
+# define MEMSET_VDUP_TO_VEC0_HIGH()
+# define MEMSET_VDUP_TO_VEC0_LOW()
+
+# define WMEMSET_VDUP_TO_VEC0_HIGH()
+# define WMEMSET_VDUP_TO_VEC0_LOW()
 
 # define SECTION(p)		p##.evex
 # define MEMSET_SYMBOL(p,s)	p##_evex_##s
diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
index f08b7323..a67f9833 100644
--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
@@ -58,8 +58,10 @@
 #ifndef MOVQ
 # if VEC_SIZE > 16
 #  define MOVQ				vmovq
+#  define MOVD				vmovd
 # else
 #  define MOVQ				movq
+#  define MOVD				movd
 # endif
 #endif
 
@@ -72,9 +74,17 @@
 #if defined USE_WITH_EVEX || defined USE_WITH_AVX512
 # define END_REG	rcx
 # define LOOP_REG	rdi
+# define LESS_VEC_REG	rax
 #else
 # define END_REG	rdi
 # define LOOP_REG	rdx
+# define LESS_VEC_REG	rdi
+#endif
+
+#ifdef USE_XMM_LESS_VEC
+# define XMM_SMALL	1
+#else
+# define XMM_SMALL	0
 #endif
 
 #define PAGE_SIZE 4096
@@ -110,8 +120,12 @@ END_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned))
 
 ENTRY (WMEMSET_SYMBOL (__wmemset, unaligned))
 	shl	$2, %RDX_LP
-	WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
-	jmp	L(entry_from_bzero)
+	WMEMSET_SET_VEC0_AND_SET_RETURN (%esi, %rdi)
+	WMEMSET_VDUP_TO_VEC0_LOW()
+	cmpq	$VEC_SIZE, %rdx
+	jb	L(less_vec_no_vdup)
+	WMEMSET_VDUP_TO_VEC0_HIGH()
+	jmp	L(entry_from_wmemset)
 END (WMEMSET_SYMBOL (__wmemset, unaligned))
 #endif
 
@@ -123,7 +137,7 @@ END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned))
 #endif
 
 ENTRY (MEMSET_SYMBOL (__memset, unaligned))
-	MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
+	MEMSET_SET_VEC0_AND_SET_RETURN (%esi, %rdi)
 # ifdef __ILP32__
 	/* Clear the upper 32 bits.  */
 	mov	%edx, %edx
@@ -131,6 +145,8 @@ ENTRY (MEMSET_SYMBOL (__memset, unaligned))
 L(entry_from_bzero):
 	cmpq	$VEC_SIZE, %rdx
 	jb	L(less_vec)
+	MEMSET_VDUP_TO_VEC0_HIGH()
+L(entry_from_wmemset):
 	cmpq	$(VEC_SIZE * 2), %rdx
 	ja	L(more_2x_vec)
 	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
@@ -179,27 +195,27 @@ END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
 # endif
 
 ENTRY_P2ALIGN (MEMSET_SYMBOL (__memset, unaligned_erms), 6)
-	MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
+	MEMSET_SET_VEC0_AND_SET_RETURN (%esi, %rdi)
 # ifdef __ILP32__
 	/* Clear the upper 32 bits.  */
 	mov	%edx, %edx
 # endif
 	cmp	$VEC_SIZE, %RDX_LP
 	jb	L(less_vec)
+	MEMSET_VDUP_TO_VEC0_HIGH ()
 	cmp	$(VEC_SIZE * 2), %RDX_LP
 	ja	L(stosb_more_2x_vec)
-	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.
-	 */
-	VMOVU	%VEC(0), (%rax)
-	VMOVU	%VEC(0), -VEC_SIZE(%rax, %rdx)
+	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
+	VMOVU	%VEC(0), (%rdi)
+	VMOVU	%VEC(0), (VEC_SIZE * -1)(%rdi, %rdx)
 	VZEROUPPER_RETURN
 #endif
 
-	.p2align 4,, 10
+	.p2align 4,, 4
 L(last_2x_vec):
 #ifdef USE_LESS_VEC_MASK_STORE
-	VMOVU	%VEC(0), (VEC_SIZE * 2 + LOOP_4X_OFFSET)(%rcx)
-	VMOVU	%VEC(0), (VEC_SIZE * 3 + LOOP_4X_OFFSET)(%rcx)
+	VMOVU	%VEC(0), (VEC_SIZE * -2)(%rdi, %rdx)
+	VMOVU	%VEC(0), (VEC_SIZE * -1)(%rdi, %rdx)
 #else
 	VMOVU	%VEC(0), (VEC_SIZE * -2)(%rdi)
 	VMOVU	%VEC(0), (VEC_SIZE * -1)(%rdi)
@@ -212,6 +228,7 @@ L(last_2x_vec):
 #ifdef USE_LESS_VEC_MASK_STORE
 	.p2align 4,, 10
 L(less_vec):
+L(less_vec_no_vdup):
 	/* Less than 1 VEC.  */
 # if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
 #  error Unsupported VEC_SIZE!
@@ -262,28 +279,18 @@ L(stosb_more_2x_vec):
 	/* Fallthrough goes to L(loop_4x_vec). Tests for memset (2x, 4x]
 	   and (4x, 8x] jump to target.  */
 L(more_2x_vec):
-
-	/* Two different methods of setting up pointers / compare. The
-	   two methods are based on the fact that EVEX/AVX512 mov
-	   instructions take more bytes then AVX2/SSE2 mov instructions. As
-	   well that EVEX/AVX512 machines also have fast LEA_BID. Both
-	   setup and END_REG to avoid complex address mode. For EVEX/AVX512
-	   this saves code size and keeps a few targets in one fetch block.
-	   For AVX2/SSE2 this helps prevent AGU bottlenecks.  */
-#if defined USE_WITH_EVEX || defined USE_WITH_AVX512
-	/* If EVEX/AVX512 compute END_REG - (VEC_SIZE * 4 +
-	   LOOP_4X_OFFSET) with LEA_BID.  */
-
-	/* END_REG is rcx for EVEX/AVX512.  */
-	leaq	-(VEC_SIZE * 4 + LOOP_4X_OFFSET)(%rdi, %rdx), %END_REG
-#endif
-
-	/* Stores to first 2x VEC before cmp as any path forward will
-	   require it.  */
-	VMOVU	%VEC(0), (%rax)
-	VMOVU	%VEC(0), VEC_SIZE(%rax)
+	/* Store next 2x vec regardless.  */
+	VMOVU	%VEC(0), (%rdi)
+	VMOVU	%VEC(0), (VEC_SIZE * 1)(%rdi)
 
 
+	/* Two different methods of setting up pointers / compare. The two
+	   methods are based on the fact that EVEX/AVX512 mov instructions take
+	   more bytes then AVX2/SSE2 mov instructions. As well that EVEX/AVX512
+	   machines also have fast LEA_BID. Both setup and END_REG to avoid complex
+	   address mode. For EVEX/AVX512 this saves code size and keeps a few
+	   targets in one fetch block. For AVX2/SSE2 this helps prevent AGU
+	   bottlenecks.  */
 #if !(defined USE_WITH_EVEX || defined USE_WITH_AVX512)
 	/* If AVX2/SSE2 compute END_REG (rdi) with ALU.  */
 	addq	%rdx, %END_REG
@@ -292,6 +299,15 @@ L(more_2x_vec):
 	cmpq	$(VEC_SIZE * 4), %rdx
 	jbe	L(last_2x_vec)
 
+
+#if defined USE_WITH_EVEX || defined USE_WITH_AVX512
+	/* If EVEX/AVX512 compute END_REG - (VEC_SIZE * 4 + LOOP_4X_OFFSET) with
+	   LEA_BID.  */
+
+	/* END_REG is rcx for EVEX/AVX512.  */
+	leaq	-(VEC_SIZE * 4 + LOOP_4X_OFFSET)(%rdi, %rdx), %END_REG
+#endif
+
 	/* Store next 2x vec regardless.  */
 	VMOVU	%VEC(0), (VEC_SIZE * 2)(%rax)
 	VMOVU	%VEC(0), (VEC_SIZE * 3)(%rax)
@@ -355,65 +371,93 @@ L(stosb_local):
 	/* Define L(less_vec) only if not otherwise defined.  */
 	.p2align 4
 L(less_vec):
+	/* Broadcast esi to partial register (i.e VEC_SIZE == 32 broadcast to
+	   xmm). This is only does anything for AVX2.  */
+	MEMSET_VDUP_TO_VEC0_LOW ()
+L(less_vec_no_vdup):
 #endif
 L(cross_page):
 #if VEC_SIZE > 32
 	cmpl	$32, %edx
-	jae	L(between_32_63)
+	jge	L(between_32_63)
 #endif
 #if VEC_SIZE > 16
 	cmpl	$16, %edx
-	jae	L(between_16_31)
+	jge	L(between_16_31)
+#endif
+#ifndef USE_XMM_LESS_VEC
+	MOVQ	%XMM0, %rcx
 #endif
-	MOVQ	%XMM0, %rdi
 	cmpl	$8, %edx
-	jae	L(between_8_15)
+	jge	L(between_8_15)
 	cmpl	$4, %edx
-	jae	L(between_4_7)
+	jge	L(between_4_7)
 	cmpl	$1, %edx
-	ja	L(between_2_3)
-	jb	L(return)
-	movb	%sil, (%rax)
-	VZEROUPPER_RETURN
+	jg	L(between_2_3)
+	jl	L(between_0_0)
+	movb	%sil, (%LESS_VEC_REG)
+L(between_0_0):
+	ret
 
-	/* Align small targets only if not doing so would cross a fetch
-	   line.  */
+	/* Align small targets only if not doing so would cross a fetch line.
+	 */
 #if VEC_SIZE > 32
 	.p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, RET_SIZE)
 	/* From 32 to 63.  No branch when size == 32.  */
 L(between_32_63):
-	VMOVU	%YMM0, (%rax)
-	VMOVU	%YMM0, -32(%rax, %rdx)
+	VMOVU	%YMM0, (%LESS_VEC_REG)
+	VMOVU	%YMM0, -32(%LESS_VEC_REG, %rdx)
 	VZEROUPPER_RETURN
 #endif
 
 #if VEC_SIZE >= 32
-	.p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, RET_SIZE)
+	.p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, 1)
 L(between_16_31):
 	/* From 16 to 31.  No branch when size == 16.  */
-	VMOVU	%XMM0, (%rax)
-	VMOVU	%XMM0, -16(%rax, %rdx)
-	VZEROUPPER_RETURN
+	VMOVU	%XMM0, (%LESS_VEC_REG)
+	VMOVU	%XMM0, -16(%LESS_VEC_REG, %rdx)
+	ret
 #endif
 
-	.p2align 4,, SMALL_MEMSET_ALIGN(3, RET_SIZE)
+	/* Move size is 3 for SSE2, EVEX, and AVX512. Move size is 4 for AVX2.
+	 */
+	.p2align 4,, SMALL_MEMSET_ALIGN(3 + XMM_SMALL, 1)
 L(between_8_15):
 	/* From 8 to 15.  No branch when size == 8.  */
-	movq	%rdi, (%rax)
-	movq	%rdi, -8(%rax, %rdx)
-	VZEROUPPER_RETURN
+#ifdef USE_XMM_LESS_VEC
+	MOVQ	%XMM0, (%rdi)
+	MOVQ	%XMM0, -8(%rdi, %rdx)
+#else
+	movq	%rcx, (%LESS_VEC_REG)
+	movq	%rcx, -8(%LESS_VEC_REG, %rdx)
+#endif
+	ret
 
-	.p2align 4,, SMALL_MEMSET_ALIGN(2, RET_SIZE)
+	/* Move size is 2 for SSE2, EVEX, and AVX512. Move size is 4 for AVX2.
+	 */
+	.p2align 4,, SMALL_MEMSET_ALIGN(2 << XMM_SMALL, 1)
 L(between_4_7):
 	/* From 4 to 7.  No branch when size == 4.  */
-	movl	%edi, (%rax)
-	movl	%edi, -4(%rax, %rdx)
-	VZEROUPPER_RETURN
+#ifdef USE_XMM_LESS_VEC
+	MOVD	%XMM0, (%rdi)
+	MOVD	%XMM0, -4(%rdi, %rdx)
+#else
+	movl	%ecx, (%LESS_VEC_REG)
+	movl	%ecx, -4(%LESS_VEC_REG, %rdx)
+#endif
+	ret
 
-	.p2align 4,, SMALL_MEMSET_ALIGN(3, RET_SIZE)
+	/* 4 * XMM_SMALL for the third mov for AVX2.  */
+	.p2align 4,, 4 * XMM_SMALL + SMALL_MEMSET_ALIGN(3, 1)
 L(between_2_3):
 	/* From 2 to 3.  No branch when size == 2.  */
-	movw	%di, (%rax)
-	movb	%dil, -1(%rax, %rdx)
-	VZEROUPPER_RETURN
+#ifdef USE_XMM_LESS_VEC
+	movb	%sil, (%rdi)
+	movb	%sil, 1(%rdi)
+	movb	%sil, -1(%rdi, %rdx)
+#else
+	movw	%cx, (%LESS_VEC_REG)
+	movb	%sil, -1(%LESS_VEC_REG, %rdx)
+#endif
+	ret
 END (MEMSET_SYMBOL (__memset, unaligned_erms))
-- 
GitLab