190885
From 91272636c23028e55554be4e677bf40ac22b1adc Mon Sep 17 00:00:00 2001
190885
From: Noah Goldstein <goldstein.w.n@gmail.com>
190885
Date: Thu, 20 May 2021 13:13:51 -0400
190885
Subject: [PATCH] x86: Improve memset-vec-unaligned-erms.S
190885
190885
No bug. This commit makes a few small improvements to
190885
memset-vec-unaligned-erms.S. The changes are 1) only aligning to 64
190885
bytes instead of 128. Either alignment will perform equally well in a loop
190885
and 128 just increases the odds of having to do an extra iteration
190885
which can be significant overhead for small values. 2) Align some
190885
targets and the loop. 3) Remove an ALU instruction from the alignment process. 4)
190885
Reorder the last 4x VEC so that they are stored after the loop. 5)
190885
Move the check for sizes <= 8x VEC to before the alignment
190885
process. test-memset and test-wmemset are both passing.
190885
190885
Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
190885
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
190885
(cherry picked from commit 6abf27980a947f9b6e514d6b33b83059d39566ae)
190885
---
190885
 .../multiarch/memset-vec-unaligned-erms.S     | 50 +++++++++++--------
190885
 1 file changed, 28 insertions(+), 22 deletions(-)
190885
190885
diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
190885
index f877ac9d..909c33f6 100644
190885
--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
190885
+++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
190885
@@ -173,17 +173,22 @@ ENTRY (MEMSET_SYMBOL (__memset, unaligned_erms))
190885
 	VMOVU	%VEC(0), (%rdi)
190885
 	VZEROUPPER_RETURN
190885
 
190885
+	.p2align 4
190885
 L(stosb_more_2x_vec):
190885
 	cmp	__x86_rep_stosb_threshold(%rip), %RDX_LP
190885
 	ja	L(stosb)
190885
+#else
190885
+	.p2align 4
190885
 #endif
190885
 L(more_2x_vec):
190885
-	cmpq  $(VEC_SIZE * 4), %rdx
190885
-	ja	L(loop_start)
190885
+	/* Stores to first 2x VEC before cmp as any path forward will
190885
+	   require it.  */
190885
 	VMOVU	%VEC(0), (%rdi)
190885
 	VMOVU	%VEC(0), VEC_SIZE(%rdi)
190885
-	VMOVU	%VEC(0), -VEC_SIZE(%rdi,%rdx)
190885
+	cmpq	$(VEC_SIZE * 4), %rdx
190885
+	ja	L(loop_start)
190885
 	VMOVU	%VEC(0), -(VEC_SIZE * 2)(%rdi,%rdx)
190885
+	VMOVU	%VEC(0), -VEC_SIZE(%rdi,%rdx)
190885
 L(return):
190885
 #if VEC_SIZE > 16
190885
 	ZERO_UPPER_VEC_REGISTERS_RETURN
190885
@@ -192,28 +197,29 @@ L(return):
190885
 #endif
190885
 
190885
 L(loop_start):
190885
-	leaq	(VEC_SIZE * 4)(%rdi), %rcx
190885
-	VMOVU	%VEC(0), (%rdi)
190885
-	andq	$-(VEC_SIZE * 4), %rcx
190885
-	VMOVU	%VEC(0), -VEC_SIZE(%rdi,%rdx)
190885
-	VMOVU	%VEC(0), VEC_SIZE(%rdi)
190885
-	VMOVU	%VEC(0), -(VEC_SIZE * 2)(%rdi,%rdx)
190885
 	VMOVU	%VEC(0), (VEC_SIZE * 2)(%rdi)
190885
-	VMOVU	%VEC(0), -(VEC_SIZE * 3)(%rdi,%rdx)
190885
 	VMOVU	%VEC(0), (VEC_SIZE * 3)(%rdi)
190885
-	VMOVU	%VEC(0), -(VEC_SIZE * 4)(%rdi,%rdx)
190885
-	addq	%rdi, %rdx
190885
-	andq	$-(VEC_SIZE * 4), %rdx
190885
-	cmpq	%rdx, %rcx
190885
-	je	L(return)
190885
+	cmpq	$(VEC_SIZE * 8), %rdx
190885
+	jbe	L(loop_end)
190885
+	andq	$-(VEC_SIZE * 2), %rdi
190885
+	subq	$-(VEC_SIZE * 4), %rdi
190885
+	leaq	-(VEC_SIZE * 4)(%rax, %rdx), %rcx
190885
+	.p2align 4
190885
 L(loop):
190885
-	VMOVA	%VEC(0), (%rcx)
190885
-	VMOVA	%VEC(0), VEC_SIZE(%rcx)
190885
-	VMOVA	%VEC(0), (VEC_SIZE * 2)(%rcx)
190885
-	VMOVA	%VEC(0), (VEC_SIZE * 3)(%rcx)
190885
-	addq	$(VEC_SIZE * 4), %rcx
190885
-	cmpq	%rcx, %rdx
190885
-	jne	L(loop)
190885
+	VMOVA	%VEC(0), (%rdi)
190885
+	VMOVA	%VEC(0), VEC_SIZE(%rdi)
190885
+	VMOVA	%VEC(0), (VEC_SIZE * 2)(%rdi)
190885
+	VMOVA	%VEC(0), (VEC_SIZE * 3)(%rdi)
190885
+	subq	$-(VEC_SIZE * 4), %rdi
190885
+	cmpq	%rcx, %rdi
190885
+	jb	L(loop)
190885
+L(loop_end):
190885
+	/* NB: rax is set as ptr in MEMSET_VDUP_TO_VEC0_AND_SET_RETURN.
190885
+	       rdx as length is also unchanged.  */
190885
+	VMOVU	%VEC(0), -(VEC_SIZE * 4)(%rax, %rdx)
190885
+	VMOVU	%VEC(0), -(VEC_SIZE * 3)(%rax, %rdx)
190885
+	VMOVU	%VEC(0), -(VEC_SIZE * 2)(%rax, %rdx)
190885
+	VMOVU	%VEC(0), -VEC_SIZE(%rax, %rdx)
190885
 	VZEROUPPER_SHORT_RETURN
190885
 
190885
 	.p2align 4
190885
-- 
190885
GitLab
190885