From 91272636c23028e55554be4e677bf40ac22b1adc Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Thu, 20 May 2021 13:13:51 -0400
Subject: [PATCH] x86: Improve memset-vec-unaligned-erms.S

No bug. This commit makes a few small improvements to
memset-vec-unaligned-erms.S. The changes are 1) only aligning to 64
instead of 128. Either alignment will perform equally well in a loop
and 128 just increases the odds of having to do an extra iteration
which can be significant overhead for small values. 2) Align some
targets and the loop. 3) Remove an ALU from the alignment process. 4)
Reorder the last 4x VEC so that they are stored after the loop. 5)
Move the condition for leq 8x VEC to before the alignment
process. test-memset and test-wmemset are both passing.

Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
(cherry picked from commit 6abf27980a947f9b6e514d6b33b83059d39566ae)
---
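Note: a rough C sketch of the pointer arithmetic behind points 1) and 3)
above. This is illustrative only, not code from glibc; VEC_SIZE is taken
as 32 (the AVX2 instantiation of this file), and the helper name and
parameters are made up for the example.

#include <stddef.h>
#include <stdint.h>

#define VEC_SIZE 32  /* assumed: AVX2 build of memset-vec-unaligned-erms.S */

/* Sketch of the bounds the new prologue computes before L(loop).
   dst plays the role of %rax (the saved return pointer) and len of
   %rdx; the patch's own comment notes both are left unchanged.  */
static void
loop_bounds (unsigned char *dst, size_t len,
             unsigned char **start, unsigned char **end)
{
  uintptr_t p = (uintptr_t) dst;

  /* andq $-(VEC_SIZE * 2), %rdi: round down to a 64-byte boundary
     (the old code rounded to 128 with andq $-(VEC_SIZE * 4)).  */
  p &= ~(uintptr_t) (VEC_SIZE * 2 - 1);

  /* subq $-(VEC_SIZE * 4), %rdi: step past the 4x VEC already stored
     unaligned; subtracting -128 keeps the immediate in imm8 range,
     which adding +128 would not.  */
  p += VEC_SIZE * 4;
  *start = (unsigned char *) p;

  /* leaq -(VEC_SIZE * 4)(%rax, %rdx), %rcx: the loop runs while
     %rdi < %rcx, and the last 4x VEC are stored unaligned from the
     end of the buffer after the loop (point 4 above).  */
  *end = dst + len - VEC_SIZE * 4;
}

The old setup used leaq/andq for the start pointer plus addq/andq/cmpq/je
for the end; the new one is a single andq/subq/leaq, with the small-size
check (cmpq $(VEC_SIZE * 8)) hoisted before the alignment work (point 5).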
 .../multiarch/memset-vec-unaligned-erms.S    | 50 +++++++++++--------
 1 file changed, 28 insertions(+), 22 deletions(-)

diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
index f877ac9d..909c33f6 100644
--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
@@ -173,17 +173,22 @@ ENTRY (MEMSET_SYMBOL (__memset, unaligned_erms))
 	VMOVU	%VEC(0), (%rdi)
 	VZEROUPPER_RETURN
 
+	.p2align 4
 L(stosb_more_2x_vec):
 	cmp	__x86_rep_stosb_threshold(%rip), %RDX_LP
 	ja	L(stosb)
+#else
+	.p2align 4
 #endif
 L(more_2x_vec):
-	cmpq  $(VEC_SIZE * 4), %rdx
-	ja	L(loop_start)
+	/* Stores to first 2x VEC before cmp as any path forward will
+	   require it.  */
 	VMOVU	%VEC(0), (%rdi)
 	VMOVU	%VEC(0), VEC_SIZE(%rdi)
-	VMOVU	%VEC(0), -VEC_SIZE(%rdi,%rdx)
+	cmpq	$(VEC_SIZE * 4), %rdx
+	ja	L(loop_start)
 	VMOVU	%VEC(0), -(VEC_SIZE * 2)(%rdi,%rdx)
+	VMOVU	%VEC(0), -VEC_SIZE(%rdi,%rdx)
 L(return):
 #if VEC_SIZE > 16
 	ZERO_UPPER_VEC_REGISTERS_RETURN
@@ -192,28 +197,29 @@ L(return):
 #endif
 
 L(loop_start):
-	leaq	(VEC_SIZE * 4)(%rdi), %rcx
-	VMOVU	%VEC(0), (%rdi)
-	andq	$-(VEC_SIZE * 4), %rcx
-	VMOVU	%VEC(0), -VEC_SIZE(%rdi,%rdx)
-	VMOVU	%VEC(0), VEC_SIZE(%rdi)
-	VMOVU	%VEC(0), -(VEC_SIZE * 2)(%rdi,%rdx)
 	VMOVU	%VEC(0), (VEC_SIZE * 2)(%rdi)
-	VMOVU	%VEC(0), -(VEC_SIZE * 3)(%rdi,%rdx)
 	VMOVU	%VEC(0), (VEC_SIZE * 3)(%rdi)
-	VMOVU	%VEC(0), -(VEC_SIZE * 4)(%rdi,%rdx)
-	addq	%rdi, %rdx
-	andq	$-(VEC_SIZE * 4), %rdx
-	cmpq	%rdx, %rcx
-	je	L(return)
+	cmpq	$(VEC_SIZE * 8), %rdx
+	jbe	L(loop_end)
+	andq	$-(VEC_SIZE * 2), %rdi
+	subq	$-(VEC_SIZE * 4), %rdi
+	leaq	-(VEC_SIZE * 4)(%rax, %rdx), %rcx
+	.p2align 4
 L(loop):
-	VMOVA	%VEC(0), (%rcx)
-	VMOVA	%VEC(0), VEC_SIZE(%rcx)
-	VMOVA	%VEC(0), (VEC_SIZE * 2)(%rcx)
-	VMOVA	%VEC(0), (VEC_SIZE * 3)(%rcx)
-	addq	$(VEC_SIZE * 4), %rcx
-	cmpq	%rcx, %rdx
-	jne	L(loop)
+	VMOVA	%VEC(0), (%rdi)
+	VMOVA	%VEC(0), VEC_SIZE(%rdi)
+	VMOVA	%VEC(0), (VEC_SIZE * 2)(%rdi)
+	VMOVA	%VEC(0), (VEC_SIZE * 3)(%rdi)
+	subq	$-(VEC_SIZE * 4), %rdi
+	cmpq	%rcx, %rdi
+	jb	L(loop)
+L(loop_end):
+	/* NB: rax is set as ptr in MEMSET_VDUP_TO_VEC0_AND_SET_RETURN.
+	       rdx as length is also unchanged.  */
+	VMOVU	%VEC(0), -(VEC_SIZE * 4)(%rax, %rdx)
+	VMOVU	%VEC(0), -(VEC_SIZE * 3)(%rax, %rdx)
+	VMOVU	%VEC(0), -(VEC_SIZE * 2)(%rax, %rdx)
+	VMOVU	%VEC(0), -VEC_SIZE(%rax, %rdx)
 	VZEROUPPER_SHORT_RETURN
 
 	.p2align 4
-- 
GitLab