|
|
513694 |
From 4619b6dbf13c17a13be2d2a0bdc9fcc2640b0f86 Mon Sep 17 00:00:00 2001
|
|
|
513694 |
From: Noah Goldstein <goldstein.w.n@gmail.com>
|
|
|
513694 |
Date: Fri, 15 Apr 2022 12:28:01 -0500
|
|
|
513694 |
Subject: [PATCH] x86: Cleanup page cross code in memcmp-avx2-movbe.S
|
|
|
513694 |
|
|
|
513694 |
Old code was both inefficient and wasted code size. New code (-62
|
|
|
513694 |
bytes) has comparable or better performance in the page cross case.
|
|
|
513694 |
|
|
|
513694 |
geometric_mean(N=20) of page cross cases New / Original: 0.960
|
|
|
513694 |
|
|
|
513694 |
size, align0, align1, ret, New Time/Old Time
|
|
|
513694 |
1, 4095, 0, 0, 1.001
|
|
|
513694 |
1, 4095, 0, 1, 0.999
|
|
|
513694 |
1, 4095, 0, -1, 1.0
|
|
|
513694 |
2, 4094, 0, 0, 1.0
|
|
|
513694 |
2, 4094, 0, 1, 1.0
|
|
|
513694 |
2, 4094, 0, -1, 1.0
|
|
|
513694 |
3, 4093, 0, 0, 1.0
|
|
|
513694 |
3, 4093, 0, 1, 1.0
|
|
|
513694 |
3, 4093, 0, -1, 1.0
|
|
|
513694 |
4, 4092, 0, 0, 0.987
|
|
|
513694 |
4, 4092, 0, 1, 1.0
|
|
|
513694 |
4, 4092, 0, -1, 1.0
|
|
|
513694 |
5, 4091, 0, 0, 0.984
|
|
|
513694 |
5, 4091, 0, 1, 1.002
|
|
|
513694 |
5, 4091, 0, -1, 1.005
|
|
|
513694 |
6, 4090, 0, 0, 0.993
|
|
|
513694 |
6, 4090, 0, 1, 1.001
|
|
|
513694 |
6, 4090, 0, -1, 1.003
|
|
|
513694 |
7, 4089, 0, 0, 0.991
|
|
|
513694 |
7, 4089, 0, 1, 1.0
|
|
|
513694 |
7, 4089, 0, -1, 1.001
|
|
|
513694 |
8, 4088, 0, 0, 0.875
|
|
|
513694 |
8, 4088, 0, 1, 0.881
|
|
|
513694 |
8, 4088, 0, -1, 0.888
|
|
|
513694 |
9, 4087, 0, 0, 0.872
|
|
|
513694 |
9, 4087, 0, 1, 0.879
|
|
|
513694 |
9, 4087, 0, -1, 0.883
|
|
|
513694 |
10, 4086, 0, 0, 0.878
|
|
|
513694 |
10, 4086, 0, 1, 0.886
|
|
|
513694 |
10, 4086, 0, -1, 0.873
|
|
|
513694 |
11, 4085, 0, 0, 0.878
|
|
|
513694 |
11, 4085, 0, 1, 0.881
|
|
|
513694 |
11, 4085, 0, -1, 0.879
|
|
|
513694 |
12, 4084, 0, 0, 0.873
|
|
|
513694 |
12, 4084, 0, 1, 0.889
|
|
|
513694 |
12, 4084, 0, -1, 0.875
|
|
|
513694 |
13, 4083, 0, 0, 0.873
|
|
|
513694 |
13, 4083, 0, 1, 0.863
|
|
|
513694 |
13, 4083, 0, -1, 0.863
|
|
|
513694 |
14, 4082, 0, 0, 0.838
|
|
|
513694 |
14, 4082, 0, 1, 0.869
|
|
|
513694 |
14, 4082, 0, -1, 0.877
|
|
|
513694 |
15, 4081, 0, 0, 0.841
|
|
|
513694 |
15, 4081, 0, 1, 0.869
|
|
|
513694 |
15, 4081, 0, -1, 0.876
|
|
|
513694 |
16, 4080, 0, 0, 0.988
|
|
|
513694 |
16, 4080, 0, 1, 0.99
|
|
|
513694 |
16, 4080, 0, -1, 0.989
|
|
|
513694 |
17, 4079, 0, 0, 0.978
|
|
|
513694 |
17, 4079, 0, 1, 0.981
|
|
|
513694 |
17, 4079, 0, -1, 0.98
|
|
|
513694 |
18, 4078, 0, 0, 0.981
|
|
|
513694 |
18, 4078, 0, 1, 0.98
|
|
|
513694 |
18, 4078, 0, -1, 0.985
|
|
|
513694 |
19, 4077, 0, 0, 0.977
|
|
|
513694 |
19, 4077, 0, 1, 0.979
|
|
|
513694 |
19, 4077, 0, -1, 0.986
|
|
|
513694 |
20, 4076, 0, 0, 0.977
|
|
|
513694 |
20, 4076, 0, 1, 0.986
|
|
|
513694 |
20, 4076, 0, -1, 0.984
|
|
|
513694 |
21, 4075, 0, 0, 0.977
|
|
|
513694 |
21, 4075, 0, 1, 0.983
|
|
|
513694 |
21, 4075, 0, -1, 0.988
|
|
|
513694 |
22, 4074, 0, 0, 0.983
|
|
|
513694 |
22, 4074, 0, 1, 0.994
|
|
|
513694 |
22, 4074, 0, -1, 0.993
|
|
|
513694 |
23, 4073, 0, 0, 0.98
|
|
|
513694 |
23, 4073, 0, 1, 0.992
|
|
|
513694 |
23, 4073, 0, -1, 0.995
|
|
|
513694 |
24, 4072, 0, 0, 0.989
|
|
|
513694 |
24, 4072, 0, 1, 0.989
|
|
|
513694 |
24, 4072, 0, -1, 0.991
|
|
|
513694 |
25, 4071, 0, 0, 0.99
|
|
|
513694 |
25, 4071, 0, 1, 0.999
|
|
|
513694 |
25, 4071, 0, -1, 0.996
|
|
|
513694 |
26, 4070, 0, 0, 0.993
|
|
|
513694 |
26, 4070, 0, 1, 0.995
|
|
|
513694 |
26, 4070, 0, -1, 0.998
|
|
|
513694 |
27, 4069, 0, 0, 0.993
|
|
|
513694 |
27, 4069, 0, 1, 0.999
|
|
|
513694 |
27, 4069, 0, -1, 1.0
|
|
|
513694 |
28, 4068, 0, 0, 0.997
|
|
|
513694 |
28, 4068, 0, 1, 1.0
|
|
|
513694 |
28, 4068, 0, -1, 0.999
|
|
|
513694 |
29, 4067, 0, 0, 0.996
|
|
|
513694 |
29, 4067, 0, 1, 0.999
|
|
|
513694 |
29, 4067, 0, -1, 0.999
|
|
|
513694 |
30, 4066, 0, 0, 0.991
|
|
|
513694 |
30, 4066, 0, 1, 1.001
|
|
|
513694 |
30, 4066, 0, -1, 0.999
|
|
|
513694 |
31, 4065, 0, 0, 0.988
|
|
|
513694 |
31, 4065, 0, 1, 0.998
|
|
|
513694 |
31, 4065, 0, -1, 0.998
|
|
|
513694 |
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
|
|
|
513694 |
|
|
|
513694 |
(cherry picked from commit 23102686ec67b856a2d4fd25ddaa1c0b8d175c4f)
|
|
|
513694 |
---
|
|
|
513694 |
sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S | 98 ++++++++++++--------
|
|
|
513694 |
1 file changed, 61 insertions(+), 37 deletions(-)
|
|
|
513694 |
|
|
|
513694 |
diff --git a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
|
|
|
513694 |
index 16fc673e..99258cf5 100644
|
|
|
513694 |
--- a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
|
|
|
513694 |
+++ b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
|
|
|
513694 |
@@ -429,22 +429,21 @@ L(page_cross_less_vec):
|
|
|
513694 |
# ifndef USE_AS_WMEMCMP
|
|
|
513694 |
cmpl $8, %edx
|
|
|
513694 |
jae L(between_8_15)
|
|
|
513694 |
+ /* Fall through for [4, 7]. */
|
|
|
513694 |
cmpl $4, %edx
|
|
|
513694 |
- jae L(between_4_7)
|
|
|
513694 |
+ jb L(between_2_3)
|
|
|
513694 |
|
|
|
513694 |
- /* Load as big endian to avoid branches. */
|
|
|
513694 |
- movzwl (%rdi), %eax
|
|
|
513694 |
- movzwl (%rsi), %ecx
|
|
|
513694 |
- shll $8, %eax
|
|
|
513694 |
- shll $8, %ecx
|
|
|
513694 |
- bswap %eax
|
|
|
513694 |
- bswap %ecx
|
|
|
513694 |
- movzbl -1(%rdi, %rdx), %edi
|
|
|
513694 |
- movzbl -1(%rsi, %rdx), %esi
|
|
|
513694 |
- orl %edi, %eax
|
|
|
513694 |
- orl %esi, %ecx
|
|
|
513694 |
- /* Subtraction is okay because the upper 8 bits are zero. */
|
|
|
513694 |
- subl %ecx, %eax
|
|
|
513694 |
+ movbe (%rdi), %eax
|
|
|
513694 |
+ movbe (%rsi), %ecx
|
|
|
513694 |
+ shlq $32, %rax
|
|
|
513694 |
+ shlq $32, %rcx
|
|
|
513694 |
+ movbe -4(%rdi, %rdx), %edi
|
|
|
513694 |
+ movbe -4(%rsi, %rdx), %esi
|
|
|
513694 |
+ orq %rdi, %rax
|
|
|
513694 |
+ orq %rsi, %rcx
|
|
|
513694 |
+ subq %rcx, %rax
|
|
|
513694 |
+ /* Fast path for return zero. */
|
|
|
513694 |
+ jnz L(ret_nonzero)
|
|
|
513694 |
/* No ymm register was touched. */
|
|
|
513694 |
ret
|
|
|
513694 |
|
|
|
513694 |
@@ -457,9 +456,33 @@ L(one_or_less):
|
|
|
513694 |
/* No ymm register was touched. */
|
|
|
513694 |
ret
|
|
|
513694 |
|
|
|
513694 |
+ .p2align 4,, 5
|
|
|
513694 |
+L(ret_nonzero):
|
|
|
513694 |
+ sbbl %eax, %eax
|
|
|
513694 |
+ orl $1, %eax
|
|
|
513694 |
+ /* No ymm register was touched. */
|
|
|
513694 |
+ ret
|
|
|
513694 |
+
|
|
|
513694 |
+ .p2align 4,, 2
|
|
|
513694 |
+L(zero):
|
|
|
513694 |
+ xorl %eax, %eax
|
|
|
513694 |
+ /* No ymm register was touched. */
|
|
|
513694 |
+ ret
|
|
|
513694 |
+
|
|
|
513694 |
.p2align 4
|
|
|
513694 |
L(between_8_15):
|
|
|
513694 |
-# endif
|
|
|
513694 |
+ movbe (%rdi), %rax
|
|
|
513694 |
+ movbe (%rsi), %rcx
|
|
|
513694 |
+ subq %rcx, %rax
|
|
|
513694 |
+ jnz L(ret_nonzero)
|
|
|
513694 |
+ movbe -8(%rdi, %rdx), %rax
|
|
|
513694 |
+ movbe -8(%rsi, %rdx), %rcx
|
|
|
513694 |
+ subq %rcx, %rax
|
|
|
513694 |
+ /* Fast path for return zero. */
|
|
|
513694 |
+ jnz L(ret_nonzero)
|
|
|
513694 |
+ /* No ymm register was touched. */
|
|
|
513694 |
+ ret
|
|
|
513694 |
+# else
|
|
|
513694 |
/* If USE_AS_WMEMCMP fall through into 8-15 byte case. */
|
|
|
513694 |
vmovq (%rdi), %xmm1
|
|
|
513694 |
vmovq (%rsi), %xmm2
|
|
|
513694 |
@@ -475,16 +498,13 @@ L(between_8_15):
|
|
|
513694 |
VPCMPEQ %xmm1, %xmm2, %xmm2
|
|
|
513694 |
vpmovmskb %xmm2, %eax
|
|
|
513694 |
subl $0xffff, %eax
|
|
|
513694 |
+ /* Fast path for return zero. */
|
|
|
513694 |
jnz L(return_vec_0)
|
|
|
513694 |
/* No ymm register was touched. */
|
|
|
513694 |
ret
|
|
|
513694 |
+# endif
|
|
|
513694 |
|
|
|
513694 |
- .p2align 4
|
|
|
513694 |
-L(zero):
|
|
|
513694 |
- xorl %eax, %eax
|
|
|
513694 |
- ret
|
|
|
513694 |
-
|
|
|
513694 |
- .p2align 4
|
|
|
513694 |
+ .p2align 4,, 10
|
|
|
513694 |
L(between_16_31):
|
|
|
513694 |
/* From 16 to 31 bytes. No branch when size == 16. */
|
|
|
513694 |
vmovdqu (%rsi), %xmm2
|
|
|
513694 |
@@ -501,11 +521,17 @@ L(between_16_31):
|
|
|
513694 |
VPCMPEQ (%rdi), %xmm2, %xmm2
|
|
|
513694 |
vpmovmskb %xmm2, %eax
|
|
|
513694 |
subl $0xffff, %eax
|
|
|
513694 |
+ /* Fast path for return zero. */
|
|
|
513694 |
jnz L(return_vec_0)
|
|
|
513694 |
/* No ymm register was touched. */
|
|
|
513694 |
ret
|
|
|
513694 |
|
|
|
513694 |
# ifdef USE_AS_WMEMCMP
|
|
|
513694 |
+ .p2align 4,, 2
|
|
|
513694 |
+L(zero):
|
|
|
513694 |
+ xorl %eax, %eax
|
|
|
513694 |
+ ret
|
|
|
513694 |
+
|
|
|
513694 |
.p2align 4
|
|
|
513694 |
L(one_or_less):
|
|
|
513694 |
jb L(zero)
|
|
|
513694 |
@@ -520,22 +546,20 @@ L(one_or_less):
|
|
|
513694 |
# else
|
|
|
513694 |
|
|
|
513694 |
.p2align 4
|
|
|
513694 |
-L(between_4_7):
|
|
|
513694 |
- /* Load as big endian with overlapping movbe to avoid branches.
|
|
|
513694 |
- */
|
|
|
513694 |
- movbe (%rdi), %eax
|
|
|
513694 |
- movbe (%rsi), %ecx
|
|
|
513694 |
- shlq $32, %rax
|
|
|
513694 |
- shlq $32, %rcx
|
|
|
513694 |
- movbe -4(%rdi, %rdx), %edi
|
|
|
513694 |
- movbe -4(%rsi, %rdx), %esi
|
|
|
513694 |
- orq %rdi, %rax
|
|
|
513694 |
- orq %rsi, %rcx
|
|
|
513694 |
- subq %rcx, %rax
|
|
|
513694 |
- jz L(zero_4_7)
|
|
|
513694 |
- sbbl %eax, %eax
|
|
|
513694 |
- orl $1, %eax
|
|
|
513694 |
-L(zero_4_7):
|
|
|
513694 |
+L(between_2_3):
|
|
|
513694 |
+ /* Load as big endian to avoid branches. */
|
|
|
513694 |
+ movzwl (%rdi), %eax
|
|
|
513694 |
+ movzwl (%rsi), %ecx
|
|
|
513694 |
+ bswap %eax
|
|
|
513694 |
+ bswap %ecx
|
|
|
513694 |
+ shrl %eax
|
|
|
513694 |
+ shrl %ecx
|
|
|
513694 |
+ movzbl -1(%rdi, %rdx), %edi
|
|
|
513694 |
+ movzbl -1(%rsi, %rdx), %esi
|
|
|
513694 |
+ orl %edi, %eax
|
|
|
513694 |
+ orl %esi, %ecx
|
|
|
513694 |
+ /* Subtraction is okay because the upper bit is zero. */
|
|
|
513694 |
+ subl %ecx, %eax
|
|
|
513694 |
/* No ymm register was touched. */
|
|
|
513694 |
ret
|
|
|
513694 |
# endif
|
|
|
513694 |
--
|
|
|
513694 |
GitLab
|
|
|
513694 |
|