From 91272636c23028e55554be4e677bf40ac22b1adc Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Thu, 20 May 2021 13:13:51 -0400
Subject: [PATCH] x86: Improve memset-vec-unaligned-erms.S

No bug. This commit makes a few small improvements to
memset-vec-unaligned-erms.S:

1) Only align to 64 instead of 128. Either alignment performs equally
   well in the loop, and 128 just increases the odds of needing an
   extra iteration, which can be significant overhead for small
   values.
2) Align some branch targets and the loop.
3) Remove an ALU instruction from the alignment process.
4) Reorder the last 4x VEC so that they are stored after the loop.
5) Move the check for length <= 8x VEC to before the alignment
   process.

test-memset and test-wmemset are both passing.
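
For reference, the control flow this leaves behind can be sketched in C
roughly as follows. This is an illustrative model only, not glibc code:
VEC_SIZE is shown as 32 (the AVX2 case), and vec_store()/more_2x_vec()
are made-up helpers standing in for the VMOVU/VMOVA stores of the
VEC_SIZE-byte splat held in %VEC(0).

    #include <stddef.h>
    #include <stdint.h>

    #define VEC_SIZE 32   /* illustrative: AVX2 path; 16/64 elsewhere.  */

    /* Stand-in for one VMOVU/VMOVA of the splatted byte.  */
    static void vec_store (unsigned char *p, int c)
    {
      for (size_t i = 0; i < VEC_SIZE; i++)
        p[i] = (unsigned char) c;
    }

    /* Models L(more_2x_vec) and below; caller guarantees
       len > 2 * VEC_SIZE.  */
    static void more_2x_vec (unsigned char *ptr, int c, size_t len)
    {
      /* First 2x VEC stored before the compare: every path needs them.  */
      vec_store (ptr, c);
      vec_store (ptr + VEC_SIZE, c);
      if (len <= 4 * VEC_SIZE)
        {
          /* L(return): the tail stores may overlap the head stores.  */
          vec_store (ptr + len - 2 * VEC_SIZE, c);
          vec_store (ptr + len - VEC_SIZE, c);
          return;
        }
      /* L(loop_start): VEC 3 and 4, then the <= 8x VEC check comes
         before any alignment work is paid for.  */
      vec_store (ptr + 2 * VEC_SIZE, c);
      vec_store (ptr + 3 * VEC_SIZE, c);
      if (len > 8 * VEC_SIZE)
        {
          /* Align down to 64 (2x VEC), not 128, then step past the
             4x VEC already written.  */
          unsigned char *dst = (unsigned char *)
            ((uintptr_t) ptr & -(uintptr_t) (2 * VEC_SIZE)) + 4 * VEC_SIZE;
          unsigned char *end = ptr + len - 4 * VEC_SIZE;
          while (dst < end)
            {
              /* 4x aligned VEC stores per iteration (VMOVA in the
                 real code).  */
              vec_store (dst, c);
              vec_store (dst + VEC_SIZE, c);
              vec_store (dst + 2 * VEC_SIZE, c);
              vec_store (dst + 3 * VEC_SIZE, c);
              dst += 4 * VEC_SIZE;
            }
        }
      /* L(loop_end): the last 4x VEC are stored after the loop,
         addressed from ptr + len, so they may overlap the loop's
         final iteration.  */
      vec_store (ptr + len - 4 * VEC_SIZE, c);
      vec_store (ptr + len - 3 * VEC_SIZE, c);
      vec_store (ptr + len - 2 * VEC_SIZE, c);
      vec_store (ptr + len - VEC_SIZE, c);
    }

Because the trailing 4x VEC stores overlap whatever the loop last
wrote, the loop exit can stay a simple lower-bound compare (jb against
rcx = ptr + len - 4 * VEC_SIZE in the assembly) with no remainder
handling.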

Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
(cherry picked from commit 6abf27980a947f9b6e514d6b33b83059d39566ae)
---
 .../multiarch/memset-vec-unaligned-erms.S | 50 +++++++++++--------
 1 file changed, 28 insertions(+), 22 deletions(-)

diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
index f877ac9d..909c33f6 100644
--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
@@ -173,17 +173,22 @@ ENTRY (MEMSET_SYMBOL (__memset, unaligned_erms))
 	VMOVU	%VEC(0), (%rdi)
 	VZEROUPPER_RETURN
 
+	.p2align 4
 L(stosb_more_2x_vec):
 	cmp	__x86_rep_stosb_threshold(%rip), %RDX_LP
 	ja	L(stosb)
+#else
+	.p2align 4
 #endif
 L(more_2x_vec):
-	cmpq	$(VEC_SIZE * 4), %rdx
-	ja	L(loop_start)
+	/* Stores to first 2x VEC before cmp as any path forward will
+	   require it.  */
 	VMOVU	%VEC(0), (%rdi)
 	VMOVU	%VEC(0), VEC_SIZE(%rdi)
-	VMOVU	%VEC(0), -VEC_SIZE(%rdi,%rdx)
+	cmpq	$(VEC_SIZE * 4), %rdx
+	ja	L(loop_start)
 	VMOVU	%VEC(0), -(VEC_SIZE * 2)(%rdi,%rdx)
+	VMOVU	%VEC(0), -VEC_SIZE(%rdi,%rdx)
 L(return):
 #if VEC_SIZE > 16
 	ZERO_UPPER_VEC_REGISTERS_RETURN
@@ -192,28 +197,29 @@ L(return):
 #endif
 
 L(loop_start):
-	leaq	(VEC_SIZE * 4)(%rdi), %rcx
-	VMOVU	%VEC(0), (%rdi)
-	andq	$-(VEC_SIZE * 4), %rcx
-	VMOVU	%VEC(0), -VEC_SIZE(%rdi,%rdx)
-	VMOVU	%VEC(0), VEC_SIZE(%rdi)
-	VMOVU	%VEC(0), -(VEC_SIZE * 2)(%rdi,%rdx)
 	VMOVU	%VEC(0), (VEC_SIZE * 2)(%rdi)
-	VMOVU	%VEC(0), -(VEC_SIZE * 3)(%rdi,%rdx)
 	VMOVU	%VEC(0), (VEC_SIZE * 3)(%rdi)
-	VMOVU	%VEC(0), -(VEC_SIZE * 4)(%rdi,%rdx)
-	addq	%rdi, %rdx
-	andq	$-(VEC_SIZE * 4), %rdx
-	cmpq	%rdx, %rcx
-	je	L(return)
+	cmpq	$(VEC_SIZE * 8), %rdx
+	jbe	L(loop_end)
+	andq	$-(VEC_SIZE * 2), %rdi
+	subq	$-(VEC_SIZE * 4), %rdi
+	leaq	-(VEC_SIZE * 4)(%rax, %rdx), %rcx
+	.p2align 4
 L(loop):
-	VMOVA	%VEC(0), (%rcx)
-	VMOVA	%VEC(0), VEC_SIZE(%rcx)
-	VMOVA	%VEC(0), (VEC_SIZE * 2)(%rcx)
-	VMOVA	%VEC(0), (VEC_SIZE * 3)(%rcx)
-	addq	$(VEC_SIZE * 4), %rcx
-	cmpq	%rcx, %rdx
-	jne	L(loop)
+	VMOVA	%VEC(0), (%rdi)
+	VMOVA	%VEC(0), VEC_SIZE(%rdi)
+	VMOVA	%VEC(0), (VEC_SIZE * 2)(%rdi)
+	VMOVA	%VEC(0), (VEC_SIZE * 3)(%rdi)
+	subq	$-(VEC_SIZE * 4), %rdi
+	cmpq	%rcx, %rdi
+	jb	L(loop)
+L(loop_end):
+	/* NB: rax is set as ptr in MEMSET_VDUP_TO_VEC0_AND_SET_RETURN.
+	       rdx as length is also unchanged.  */
+	VMOVU	%VEC(0), -(VEC_SIZE * 4)(%rax, %rdx)
+	VMOVU	%VEC(0), -(VEC_SIZE * 3)(%rax, %rdx)
+	VMOVU	%VEC(0), -(VEC_SIZE * 2)(%rax, %rdx)
+	VMOVU	%VEC(0), -VEC_SIZE(%rax, %rdx)
 	VZEROUPPER_SHORT_RETURN
 
 	.p2align 4
--
GitLab
