From 5deda2b73383bf16788cc83c8ea6262d89608263 Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Mon, 20 Sep 2021 16:20:15 -0500
Subject: [PATCH] x86: Optimize memset-vec-unaligned-erms.S

No bug.

The optimizations are:

1. Change control flow for L(more_2x_vec) to fall through to the loop
   and jump for L(less_4x_vec) and L(less_8x_vec). This uses less code
   size and saves jumps for length > 4x VEC_SIZE. (A C sketch of the
   resulting size dispatch follows this list.)

2. For EVEX/AVX512, move L(less_vec) closer to the entry.

3. Avoid complex address modes for length > 2x VEC_SIZE.

4. Slightly better alignment code for the loop, from the perspective of
   code size and uops.

5. Align targets so they make full use of their fetch block and, if
   possible, their cache line.

6. Try to reduce the total number of icache lines that will need to be
   pulled in for a given length.

7. Include a "local" version of the stosb target. For AVX2/EVEX/AVX512,
   jumping to the stosb target in the sse2 code section will almost
   certainly be to a new page. The new version does increase code size
   marginally by duplicating the target, but should get better iTLB
   behavior as a result.

test-memset, test-wmemset, and test-bzero are all passing.
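
A rough C sketch of the size dispatch that the rewritten code implements.
It is illustrative only: VEC_SIZE, the rep-stosb cutoff, and the helper
names below are placeholders, not glibc interfaces, and the EVEX/AVX512
LOOP_4X_OFFSET addressing detail is omitted.

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    /* Placeholders: VEC_SIZE is 16/32/32/64 for the SSE2/AVX2/EVEX/AVX512
       builds and the stosb cutoff comes from __x86_rep_stosb_threshold at
       run time.  */
    enum { VEC_SIZE = 32, REP_STOSB_THRESHOLD = 2048 };

    /* Stand-in for one VEC-wide store (VMOVU/VMOVA %VEC(0), (dst)).  */
    static void vec_store (unsigned char *dst, int c)
    {
      memset (dst, c, VEC_SIZE);
    }

    void *
    memset_dispatch_sketch (void *dstp, int c, size_t len)
    {
      unsigned char *dst = dstp;

      if (len < VEC_SIZE)
        {
          /* L(less_vec): masked store on EVEX/AVX512, otherwise the
             byte/word/dword/qword/xmm buckets.  */
          memset (dst, c, len);
          return dstp;
        }
      if (len <= 2 * VEC_SIZE)
        {
          /* First VEC plus a possibly overlapping last VEC.  */
          vec_store (dst, c);
          vec_store (dst + len - VEC_SIZE, c);
          return dstp;
        }
      if (len > REP_STOSB_THRESHOLD)
        {
          /* L(stosb_local): rep stosb, now duplicated per variant.  */
          memset (dst, c, len);
          return dstp;
        }

      /* L(more_2x_vec): always store the first 2x VEC.  */
      vec_store (dst, c);
      vec_store (dst + VEC_SIZE, c);
      if (len <= 4 * VEC_SIZE)
        goto last_2x;                       /* (2x, 4x]: jump out.  */

      vec_store (dst + 2 * VEC_SIZE, c);
      vec_store (dst + 3 * VEC_SIZE, c);
      if (len <= 8 * VEC_SIZE)
        goto last_4x;                       /* (4x, 8x]: jump out.  */

      /* > 8x VEC: fall through into the aligned 4x VEC loop.  */
      {
        unsigned char *p = (unsigned char *)
          ((uintptr_t) (dst + 4 * VEC_SIZE) & ~(uintptr_t) (2 * VEC_SIZE - 1));
        unsigned char *end = dst + len - 4 * VEC_SIZE;
        for (; p < end; p += 4 * VEC_SIZE)
          {
            vec_store (p, c);
            vec_store (p + VEC_SIZE, c);
            vec_store (p + 2 * VEC_SIZE, c);
            vec_store (p + 3 * VEC_SIZE, c);
          }
      }

     last_4x:
      /* L(last_4x_vec): four stores ending exactly at dst + len (the
         first two here, the last two shared with L(last_2x_vec)).  */
      vec_store (dst + len - 4 * VEC_SIZE, c);
      vec_store (dst + len - 3 * VEC_SIZE, c);
     last_2x:
      /* L(last_2x_vec): two stores ending exactly at dst + len.  */
      vec_store (dst + len - 2 * VEC_SIZE, c);
      vec_store (dst + len - VEC_SIZE, c);
      return dstp;
    }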

Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
(cherry picked from commit e59ced238482fd71f3e493717f14f6507346741e)
---
 sysdeps/x86_64/memset.S                       |  10 +-
 .../multiarch/memset-avx2-unaligned-erms.S    |  10 +-
 .../multiarch/memset-avx512-unaligned-erms.S  |  11 +-
 .../multiarch/memset-evex-unaligned-erms.S    |  11 +-
 .../multiarch/memset-vec-unaligned-erms.S     | 285 ++++++++++++------
 5 files changed, 232 insertions(+), 95 deletions(-)

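The small- and mid-size paths in the last hunk below all use the same
overlapping-store idiom: one store at the start of the buffer and one
ending exactly at dst + len, so a whole length bucket is handled with two
stores and no branches. A minimal C illustration for the 4-to-7 byte
bucket, mirroring L(between_4_7); the function name and the use of memcpy
for the 4-byte stores are purely illustrative:

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    /* Two 4-byte stores: one at dst, one ending at dst + len.  For
       len == 4 they coincide; for len == 5..7 they overlap, which is
       harmless since both write the same replicated byte.  */
    static void
    memset_4_to_7 (void *dstp, int c, size_t len)
    {
      uint32_t pattern = 0x01010101u * (unsigned char) c;
      unsigned char *dst = dstp;

      memcpy (dst, &pattern, 4);            /* movl %edi, (%rax)          */
      memcpy (dst + len - 4, &pattern, 4);  /* movl %edi, -4(%rax, %rdx)  */
    }

The 8-15, 16-31, 32-63, and VEC-to-2x-VEC cases below are the same pattern
at 8-, 16-, 32-, and VEC-byte granularity.
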
diff --git a/sysdeps/x86_64/memset.S b/sysdeps/x86_64/memset.S
index b3426795..8672b030 100644
--- a/sysdeps/x86_64/memset.S
+++ b/sysdeps/x86_64/memset.S
@@ -18,13 +18,15 @@
    <http://www.gnu.org/licenses/>.  */
 
 #include <sysdep.h>
+#define USE_WITH_SSE2	1
 
 #define VEC_SIZE	16
+#define MOV_SIZE	3
+#define RET_SIZE	1
+
 #define VEC(i)		xmm##i
-/* Don't use movups and movaps since it will get larger nop paddings for
-   alignment.  */
-#define VMOVU		movdqu
-#define VMOVA		movdqa
+#define VMOVU     movups
+#define VMOVA     movaps
 
 #define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
   movd d, %xmm0; \
diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
index ae0860f3..1af668af 100644
--- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
@@ -1,8 +1,14 @@
 #if IS_IN (libc)
+# define USE_WITH_AVX2	1
+
 # define VEC_SIZE	32
+# define MOV_SIZE	4
+# define RET_SIZE	4
+
 # define VEC(i)		ymm##i
-# define VMOVU		vmovdqu
-# define VMOVA		vmovdqa
+
+# define VMOVU     vmovdqu
+# define VMOVA     vmovdqa
 
 # define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
   vmovd d, %xmm0; \
diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
index 8ad842fc..f14d6f84 100644
--- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
@@ -1,11 +1,18 @@
 #if IS_IN (libc)
+# define USE_WITH_AVX512	1
+
 # define VEC_SIZE	64
+# define MOV_SIZE	6
+# define RET_SIZE	1
+
 # define XMM0		xmm16
 # define YMM0		ymm16
 # define VEC0		zmm16
 # define VEC(i)		VEC##i
-# define VMOVU		vmovdqu64
-# define VMOVA		vmovdqa64
+
+# define VMOVU     vmovdqu64
+# define VMOVA     vmovdqa64
+
 # define VZEROUPPER
 
 # define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
diff --git a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
index 640f0929..64b09e77 100644
--- a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
@@ -1,11 +1,18 @@
 #if IS_IN (libc)
+# define USE_WITH_EVEX	1
+
 # define VEC_SIZE	32
+# define MOV_SIZE	6
+# define RET_SIZE	1
+
 # define XMM0		xmm16
 # define YMM0		ymm16
 # define VEC0		ymm16
 # define VEC(i)		VEC##i
-# define VMOVU		vmovdqu64
-# define VMOVA		vmovdqa64
+
+# define VMOVU     vmovdqu64
+# define VMOVA     vmovdqa64
+
 # define VZEROUPPER
 
 # define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
index 909c33f6..f08b7323 100644
--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
@@ -63,8 +63,27 @@
 # endif
 #endif
 
+#if VEC_SIZE == 64
+# define LOOP_4X_OFFSET	(VEC_SIZE * 4)
+#else
+# define LOOP_4X_OFFSET	(0)
+#endif
+
+#if defined USE_WITH_EVEX || defined USE_WITH_AVX512
+# define END_REG	rcx
+# define LOOP_REG	rdi
+#else
+# define END_REG	rdi
+# define LOOP_REG	rdx
+#endif
+
 #define PAGE_SIZE 4096
 
+/* Macro to calculate size of small memset block for aligning
+   purposes.  */
+#define SMALL_MEMSET_ALIGN(mov_sz,	ret_sz)	(2 * (mov_sz) + (ret_sz) + 1)
+
+
 #ifndef SECTION
 # error SECTION is not defined!
 #endif
@@ -74,6 +93,7 @@
 ENTRY (__bzero)
 	mov	%RDI_LP, %RAX_LP /* Set return value.  */
 	mov	%RSI_LP, %RDX_LP /* Set n.  */
+	xorl	%esi, %esi
 	pxor	%XMM0, %XMM0
 	jmp	L(entry_from_bzero)
 END (__bzero)
@@ -158,7 +178,7 @@ ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
 END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
 # endif
 
-ENTRY (MEMSET_SYMBOL (__memset, unaligned_erms))
+ENTRY_P2ALIGN (MEMSET_SYMBOL (__memset, unaligned_erms), 6)
 	MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
 # ifdef __ILP32__
 	/* Clear the upper 32 bits.  */
@@ -168,75 +188,43 @@ ENTRY (MEMSET_SYMBOL (__memset, unaligned_erms))
 	jb	L(less_vec)
 	cmp	$(VEC_SIZE * 2), %RDX_LP
 	ja	L(stosb_more_2x_vec)
-	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
-	VMOVU	%VEC(0), -VEC_SIZE(%rdi,%rdx)
-	VMOVU	%VEC(0), (%rdi)
+	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.
+	 */
+	VMOVU	%VEC(0), (%rax)
+	VMOVU	%VEC(0), -VEC_SIZE(%rax, %rdx)
 	VZEROUPPER_RETURN
-
-	.p2align 4
-L(stosb_more_2x_vec):
-	cmp	__x86_rep_stosb_threshold(%rip), %RDX_LP
-	ja	L(stosb)
-#else
-	.p2align 4
 #endif
-L(more_2x_vec):
-	/* Stores to first 2x VEC before cmp as any path forward will
-	   require it.  */
-	VMOVU	%VEC(0), (%rdi)
-	VMOVU	%VEC(0), VEC_SIZE(%rdi)
-	cmpq	$(VEC_SIZE * 4), %rdx
-	ja	L(loop_start)
-	VMOVU	%VEC(0), -(VEC_SIZE * 2)(%rdi,%rdx)
-	VMOVU	%VEC(0), -VEC_SIZE(%rdi,%rdx)
-L(return):
-#if VEC_SIZE > 16
-	ZERO_UPPER_VEC_REGISTERS_RETURN
+
+	.p2align 4,, 10
+L(last_2x_vec):
+#ifdef USE_LESS_VEC_MASK_STORE
+	VMOVU	%VEC(0), (VEC_SIZE * 2 + LOOP_4X_OFFSET)(%rcx)
+	VMOVU	%VEC(0), (VEC_SIZE * 3 + LOOP_4X_OFFSET)(%rcx)
 #else
-	ret
+	VMOVU	%VEC(0), (VEC_SIZE * -2)(%rdi)
+	VMOVU	%VEC(0), (VEC_SIZE * -1)(%rdi)
 #endif
+	VZEROUPPER_RETURN
 
-L(loop_start):
-	VMOVU	%VEC(0), (VEC_SIZE * 2)(%rdi)
-	VMOVU	%VEC(0), (VEC_SIZE * 3)(%rdi)
-	cmpq	$(VEC_SIZE * 8), %rdx
-	jbe	L(loop_end)
-	andq	$-(VEC_SIZE * 2), %rdi
-	subq	$-(VEC_SIZE * 4), %rdi
-	leaq	-(VEC_SIZE * 4)(%rax, %rdx), %rcx
-	.p2align 4
-L(loop):
-	VMOVA	%VEC(0), (%rdi)
-	VMOVA	%VEC(0), VEC_SIZE(%rdi)
-	VMOVA	%VEC(0), (VEC_SIZE * 2)(%rdi)
-	VMOVA	%VEC(0), (VEC_SIZE * 3)(%rdi)
-	subq	$-(VEC_SIZE * 4), %rdi
-	cmpq	%rcx, %rdi
-	jb	L(loop)
-L(loop_end):
-	/* NB: rax is set as ptr in MEMSET_VDUP_TO_VEC0_AND_SET_RETURN.
-	       rdx as length is also unchanged.  */
-	VMOVU	%VEC(0), -(VEC_SIZE * 4)(%rax, %rdx)
-	VMOVU	%VEC(0), -(VEC_SIZE * 3)(%rax, %rdx)
-	VMOVU	%VEC(0), -(VEC_SIZE * 2)(%rax, %rdx)
-	VMOVU	%VEC(0), -VEC_SIZE(%rax, %rdx)
-	VZEROUPPER_SHORT_RETURN
-
-	.p2align 4
+	/* If have AVX512 mask instructions put L(less_vec) close to
+	   entry as it doesn't take much space and is likely a hot target.
+	 */
+#ifdef USE_LESS_VEC_MASK_STORE
+	.p2align 4,, 10
 L(less_vec):
 	/* Less than 1 VEC.  */
 # if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
 #  error Unsupported VEC_SIZE!
 # endif
-# ifdef USE_LESS_VEC_MASK_STORE
 	/* Clear high bits from edi. Only keeping bits relevant to page
 	   cross check. Note that we are using rax which is set in
-	   MEMSET_VDUP_TO_VEC0_AND_SET_RETURN as ptr from here on out.
-	 */
+	   MEMSET_VDUP_TO_VEC0_AND_SET_RETURN as ptr from here on out.  */
 	andl	$(PAGE_SIZE - 1), %edi
-	/* Check if VEC_SIZE store cross page. Mask stores suffer serious
-	   performance degradation when it has to fault supress.  */
+	/* Check if VEC_SIZE store cross page. Mask stores suffer
+	   serious performance degradation when it has to fault supress.
+	 */
 	cmpl	$(PAGE_SIZE - VEC_SIZE), %edi
+	/* This is generally considered a cold target.  */
 	ja	L(cross_page)
 # if VEC_SIZE > 32
 	movq	$-1, %rcx
@@ -247,58 +235,185 @@ L(less_vec):
 	bzhil	%edx, %ecx, %ecx
 	kmovd	%ecx, %k1
 # endif
-	vmovdqu8	%VEC(0), (%rax) {%k1}
+	vmovdqu8 %VEC(0), (%rax){%k1}
 	VZEROUPPER_RETURN
 
+# if defined USE_MULTIARCH && IS_IN (libc)
+	/* Include L(stosb_local) here if including L(less_vec) between
+	   L(stosb_more_2x_vec) and ENTRY. This is to cache align the
+	   L(stosb_more_2x_vec) target.  */
+	.p2align 4,, 10
+L(stosb_local):
+	movzbl	%sil, %eax
+	mov	%RDX_LP, %RCX_LP
+	mov	%RDI_LP, %RDX_LP
+	rep	stosb
+	mov	%RDX_LP, %RAX_LP
+	VZEROUPPER_RETURN
+# endif
+#endif
+
+#if defined USE_MULTIARCH && IS_IN (libc)
 	.p2align 4
-L(cross_page):
+L(stosb_more_2x_vec):
+	cmp	__x86_rep_stosb_threshold(%rip), %RDX_LP
+	ja	L(stosb_local)
+#endif
+	/* Fallthrough goes to L(loop_4x_vec). Tests for memset (2x, 4x]
+	   and (4x, 8x] jump to target.  */
+L(more_2x_vec):
+
+	/* Two different methods of setting up pointers / compare. The
+	   two methods are based on the fact that EVEX/AVX512 mov
+	   instructions take more bytes then AVX2/SSE2 mov instructions. As
+	   well that EVEX/AVX512 machines also have fast LEA_BID. Both
+	   setup and END_REG to avoid complex address mode. For EVEX/AVX512
+	   this saves code size and keeps a few targets in one fetch block.
+	   For AVX2/SSE2 this helps prevent AGU bottlenecks.  */
+#if defined USE_WITH_EVEX || defined USE_WITH_AVX512
+	/* If EVEX/AVX512 compute END_REG - (VEC_SIZE * 4 +
+	   LOOP_4X_OFFSET) with LEA_BID.  */
+
+	/* END_REG is rcx for EVEX/AVX512.  */
+	leaq	-(VEC_SIZE * 4 + LOOP_4X_OFFSET)(%rdi, %rdx), %END_REG
+#endif
+
+	/* Stores to first 2x VEC before cmp as any path forward will
+	   require it.  */
+	VMOVU	%VEC(0), (%rax)
+	VMOVU	%VEC(0), VEC_SIZE(%rax)
+
+
+#if !(defined USE_WITH_EVEX || defined USE_WITH_AVX512)
+	/* If AVX2/SSE2 compute END_REG (rdi) with ALU.  */
+	addq	%rdx, %END_REG
+#endif
+
+	cmpq	$(VEC_SIZE * 4), %rdx
+	jbe	L(last_2x_vec)
+
+	/* Store next 2x vec regardless.  */
+	VMOVU	%VEC(0), (VEC_SIZE * 2)(%rax)
+	VMOVU	%VEC(0), (VEC_SIZE * 3)(%rax)
+
+
+#if defined USE_WITH_EVEX || defined USE_WITH_AVX512
+	/* If LOOP_4X_OFFSET don't readjust LOOP_REG (rdi), just add
+	   extra offset to addresses in loop. Used for AVX512 to save space
+	   as no way to get (VEC_SIZE * 4) in imm8.  */
+# if LOOP_4X_OFFSET == 0
+	subq	$-(VEC_SIZE * 4), %LOOP_REG
 # endif
-# if VEC_SIZE > 32
-	cmpb	$32, %dl
-	jae	L(between_32_63)
+	/* Avoid imm32 compare here to save code size.  */
+	cmpq	%rdi, %rcx
+#else
+	addq	$-(VEC_SIZE * 4), %END_REG
+	cmpq	$(VEC_SIZE * 8), %rdx
+#endif
+	jbe	L(last_4x_vec)
+#if !(defined USE_WITH_EVEX || defined USE_WITH_AVX512)
+	/* Set LOOP_REG (rdx).  */
+	leaq	(VEC_SIZE * 4)(%rax), %LOOP_REG
+#endif
+	/* Align dst for loop.  */
+	andq	$(VEC_SIZE * -2), %LOOP_REG
+	.p2align 4
+L(loop):
+	VMOVA	%VEC(0), LOOP_4X_OFFSET(%LOOP_REG)
+	VMOVA	%VEC(0), (VEC_SIZE + LOOP_4X_OFFSET)(%LOOP_REG)
+	VMOVA	%VEC(0), (VEC_SIZE * 2 + LOOP_4X_OFFSET)(%LOOP_REG)
+	VMOVA	%VEC(0), (VEC_SIZE * 3 + LOOP_4X_OFFSET)(%LOOP_REG)
+	subq	$-(VEC_SIZE * 4), %LOOP_REG
+	cmpq	%END_REG, %LOOP_REG
+	jb	L(loop)
+	.p2align 4,, MOV_SIZE
+L(last_4x_vec):
+	VMOVU	%VEC(0), LOOP_4X_OFFSET(%END_REG)
+	VMOVU	%VEC(0), (VEC_SIZE + LOOP_4X_OFFSET)(%END_REG)
+	VMOVU	%VEC(0), (VEC_SIZE * 2 + LOOP_4X_OFFSET)(%END_REG)
+	VMOVU	%VEC(0), (VEC_SIZE * 3 + LOOP_4X_OFFSET)(%END_REG)
+L(return):
+#if VEC_SIZE > 16
+	ZERO_UPPER_VEC_REGISTERS_RETURN
+#else
+	ret
+#endif
+
+	.p2align 4,, 10
+#ifndef USE_LESS_VEC_MASK_STORE
+# if defined USE_MULTIARCH && IS_IN (libc)
+	/* If no USE_LESS_VEC_MASK put L(stosb_local) here. Will be in
+	   range for 2-byte jump encoding.  */
+L(stosb_local):
+	movzbl	%sil, %eax
+	mov	%RDX_LP, %RCX_LP
+	mov	%RDI_LP, %RDX_LP
+	rep	stosb
+	mov	%RDX_LP, %RAX_LP
+	VZEROUPPER_RETURN
 # endif
-# if VEC_SIZE > 16
-	cmpb	$16, %dl
+	/* Define L(less_vec) only if not otherwise defined.  */
+	.p2align 4
+L(less_vec):
+#endif
+L(cross_page):
+#if VEC_SIZE > 32
+	cmpl	$32, %edx
+	jae	L(between_32_63)
+#endif
+#if VEC_SIZE > 16
+	cmpl	$16, %edx
 	jae	L(between_16_31)
-# endif
-	MOVQ	%XMM0, %rcx
-	cmpb	$8, %dl
+#endif
+	MOVQ	%XMM0, %rdi
+	cmpl	$8, %edx
 	jae	L(between_8_15)
-	cmpb	$4, %dl
+	cmpl	$4, %edx
 	jae	L(between_4_7)
-	cmpb	$1, %dl
+	cmpl	$1, %edx
 	ja	L(between_2_3)
-	jb	1f
-	movb	%cl, (%rax)
-1:
+	jb	L(return)
+	movb	%sil, (%rax)
 	VZEROUPPER_RETURN
-# if VEC_SIZE > 32
+
+	/* Align small targets only if not doing so would cross a fetch
+	   line.  */
+#if VEC_SIZE > 32
+	.p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, RET_SIZE)
 	/* From 32 to 63.  No branch when size == 32.  */
 L(between_32_63):
-	VMOVU	%YMM0, -32(%rax,%rdx)
 	VMOVU	%YMM0, (%rax)
+	VMOVU	%YMM0, -32(%rax, %rdx)
 	VZEROUPPER_RETURN
-# endif
-# if VEC_SIZE > 16
-	/* From 16 to 31.  No branch when size == 16.  */
+#endif
+
+#if VEC_SIZE >= 32
+	.p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, RET_SIZE)
 L(between_16_31):
-	VMOVU	%XMM0, -16(%rax,%rdx)
+	/* From 16 to 31.  No branch when size == 16.  */
 	VMOVU	%XMM0, (%rax)
+	VMOVU	%XMM0, -16(%rax, %rdx)
 	VZEROUPPER_RETURN
-# endif
-	/* From 8 to 15.  No branch when size == 8.  */
+#endif
+
+	.p2align 4,, SMALL_MEMSET_ALIGN(3, RET_SIZE)
 L(between_8_15):
-	movq	%rcx, -8(%rax,%rdx)
-	movq	%rcx, (%rax)
+	/* From 8 to 15.  No branch when size == 8.  */
+	movq	%rdi, (%rax)
+	movq	%rdi, -8(%rax, %rdx)
 	VZEROUPPER_RETURN
+
+	.p2align 4,, SMALL_MEMSET_ALIGN(2, RET_SIZE)
 L(between_4_7):
 	/* From 4 to 7.  No branch when size == 4.  */
-	movl	%ecx, -4(%rax,%rdx)
-	movl	%ecx, (%rax)
+	movl	%edi, (%rax)
+	movl	%edi, -4(%rax, %rdx)
 	VZEROUPPER_RETURN
+
+	.p2align 4,, SMALL_MEMSET_ALIGN(3, RET_SIZE)
 L(between_2_3):
 	/* From 2 to 3.  No branch when size == 2.  */
-	movw	%cx, -2(%rax,%rdx)
-	movw	%cx, (%rax)
+	movw	%di, (%rax)
+	movb	%dil, -1(%rax, %rdx)
 	VZEROUPPER_RETURN
 END (MEMSET_SYMBOL (__memset, unaligned_erms))
-- 
GitLab
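
A note on the new SMALL_MEMSET_ALIGN(mov_sz, ret_sz) macro used in the
last hunk: it is passed as the max-skip argument of .p2align 4, so a small
target is padded to a 16-byte boundary only when the padding would not
exceed the target's own size (two stores, a return, plus one spare byte).
A quick check of the bound with the MOV_SIZE/RET_SIZE values this patch
defines; the values come from the hunks above, the program itself is just
illustrative:

    #include <stdio.h>

    /* Mirrors SMALL_MEMSET_ALIGN from memset-vec-unaligned-erms.S.  */
    #define SMALL_MEMSET_ALIGN(mov_sz, ret_sz) (2 * (mov_sz) + (ret_sz) + 1)

    int
    main (void)
    {
      /* MOV_SIZE / RET_SIZE per variant, as defined in this patch.  */
      printf ("sse2:        %d\n", SMALL_MEMSET_ALIGN (3, 1));  /* 8  */
      printf ("avx2:        %d\n", SMALL_MEMSET_ALIGN (4, 4));  /* 13 */
      printf ("evex/avx512: %d\n", SMALL_MEMSET_ALIGN (6, 1));  /* 14 */
      return 0;
    }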