commit df5de87260dba479873b2850bbe5c0b81c2376f6
Author: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Fri Apr 15 12:28:01 2022 -0500
x86: Cleanup page cross code in memcmp-avx2-movbe.S
Old code was both inefficient and wasted code size. New code (-62
bytes) and comparable or better performance in the page cross case.
geometric_mean(N=20) of page cross cases New / Original: 0.960
size, align0, align1, ret, New Time/Old Time
1, 4095, 0, 0, 1.001
1, 4095, 0, 1, 0.999
1, 4095, 0, -1, 1.0
2, 4094, 0, 0, 1.0
2, 4094, 0, 1, 1.0
2, 4094, 0, -1, 1.0
3, 4093, 0, 0, 1.0
3, 4093, 0, 1, 1.0
3, 4093, 0, -1, 1.0
4, 4092, 0, 0, 0.987
4, 4092, 0, 1, 1.0
4, 4092, 0, -1, 1.0
5, 4091, 0, 0, 0.984
5, 4091, 0, 1, 1.002
5, 4091, 0, -1, 1.005
6, 4090, 0, 0, 0.993
6, 4090, 0, 1, 1.001
6, 4090, 0, -1, 1.003
7, 4089, 0, 0, 0.991
7, 4089, 0, 1, 1.0
7, 4089, 0, -1, 1.001
8, 4088, 0, 0, 0.875
8, 4088, 0, 1, 0.881
8, 4088, 0, -1, 0.888
9, 4087, 0, 0, 0.872
9, 4087, 0, 1, 0.879
9, 4087, 0, -1, 0.883
10, 4086, 0, 0, 0.878
10, 4086, 0, 1, 0.886
10, 4086, 0, -1, 0.873
11, 4085, 0, 0, 0.878
11, 4085, 0, 1, 0.881
11, 4085, 0, -1, 0.879
12, 4084, 0, 0, 0.873
12, 4084, 0, 1, 0.889
12, 4084, 0, -1, 0.875
13, 4083, 0, 0, 0.873
13, 4083, 0, 1, 0.863
13, 4083, 0, -1, 0.863
14, 4082, 0, 0, 0.838
14, 4082, 0, 1, 0.869
14, 4082, 0, -1, 0.877
15, 4081, 0, 0, 0.841
15, 4081, 0, 1, 0.869
15, 4081, 0, -1, 0.876
16, 4080, 0, 0, 0.988
16, 4080, 0, 1, 0.99
16, 4080, 0, -1, 0.989
17, 4079, 0, 0, 0.978
17, 4079, 0, 1, 0.981
17, 4079, 0, -1, 0.98
18, 4078, 0, 0, 0.981
18, 4078, 0, 1, 0.98
18, 4078, 0, -1, 0.985
19, 4077, 0, 0, 0.977
19, 4077, 0, 1, 0.979
19, 4077, 0, -1, 0.986
20, 4076, 0, 0, 0.977
20, 4076, 0, 1, 0.986
20, 4076, 0, -1, 0.984
21, 4075, 0, 0, 0.977
21, 4075, 0, 1, 0.983
21, 4075, 0, -1, 0.988
22, 4074, 0, 0, 0.983
22, 4074, 0, 1, 0.994
22, 4074, 0, -1, 0.993
23, 4073, 0, 0, 0.98
23, 4073, 0, 1, 0.992
23, 4073, 0, -1, 0.995
24, 4072, 0, 0, 0.989
24, 4072, 0, 1, 0.989
24, 4072, 0, -1, 0.991
25, 4071, 0, 0, 0.99
25, 4071, 0, 1, 0.999
25, 4071, 0, -1, 0.996
26, 4070, 0, 0, 0.993
26, 4070, 0, 1, 0.995
26, 4070, 0, -1, 0.998
27, 4069, 0, 0, 0.993
27, 4069, 0, 1, 0.999
27, 4069, 0, -1, 1.0
28, 4068, 0, 0, 0.997
28, 4068, 0, 1, 1.0
28, 4068, 0, -1, 0.999
29, 4067, 0, 0, 0.996
29, 4067, 0, 1, 0.999
29, 4067, 0, -1, 0.999
30, 4066, 0, 0, 0.991
30, 4066, 0, 1, 1.001
30, 4066, 0, -1, 0.999
31, 4065, 0, 0, 0.988
31, 4065, 0, 1, 0.998
31, 4065, 0, -1, 0.998
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
(cherry picked from commit 23102686ec67b856a2d4fd25ddaa1c0b8d175c4f)
diff --git a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
index 2621ec907aedb781..ec9cf0852edf216d 100644
--- a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
+++ b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
@@ -429,22 +429,21 @@ L(page_cross_less_vec):
# ifndef USE_AS_WMEMCMP
cmpl $8, %edx
jae L(between_8_15)
+ /* Fall through for [4, 7]. */
cmpl $4, %edx
- jae L(between_4_7)
+ jb L(between_2_3)
- /* Load as big endian to avoid branches. */
- movzwl (%rdi), %eax
- movzwl (%rsi), %ecx
- shll $8, %eax
- shll $8, %ecx
- bswap %eax
- bswap %ecx
- movzbl -1(%rdi, %rdx), %edi
- movzbl -1(%rsi, %rdx), %esi
- orl %edi, %eax
- orl %esi, %ecx
- /* Subtraction is okay because the upper 8 bits are zero. */
- subl %ecx, %eax
+ movbe (%rdi), %eax
+ movbe (%rsi), %ecx
+ shlq $32, %rax
+ shlq $32, %rcx
+ movbe -4(%rdi, %rdx), %edi
+ movbe -4(%rsi, %rdx), %esi
+ orq %rdi, %rax
+ orq %rsi, %rcx
+ subq %rcx, %rax
+ /* Fast path for return zero. */
+ jnz L(ret_nonzero)
/* No ymm register was touched. */
ret
@@ -457,9 +456,33 @@ L(one_or_less):
/* No ymm register was touched. */
ret
+ .p2align 4,, 5
+L(ret_nonzero):
+ sbbl %eax, %eax
+ orl $1, %eax
+ /* No ymm register was touched. */
+ ret
+
+ .p2align 4,, 2
+L(zero):
+ xorl %eax, %eax
+ /* No ymm register was touched. */
+ ret
+
.p2align 4
L(between_8_15):
-# endif
+ movbe (%rdi), %rax
+ movbe (%rsi), %rcx
+ subq %rcx, %rax
+ jnz L(ret_nonzero)
+ movbe -8(%rdi, %rdx), %rax
+ movbe -8(%rsi, %rdx), %rcx
+ subq %rcx, %rax
+ /* Fast path for return zero. */
+ jnz L(ret_nonzero)
+ /* No ymm register was touched. */
+ ret
+# else
/* If USE_AS_WMEMCMP fall through into 8-15 byte case. */
vmovq (%rdi), %xmm1
vmovq (%rsi), %xmm2
@@ -475,16 +498,13 @@ L(between_8_15):
VPCMPEQ %xmm1, %xmm2, %xmm2
vpmovmskb %xmm2, %eax
subl $0xffff, %eax
+ /* Fast path for return zero. */
jnz L(return_vec_0)
/* No ymm register was touched. */
ret
+# endif
- .p2align 4
-L(zero):
- xorl %eax, %eax
- ret
-
- .p2align 4
+ .p2align 4,, 10
L(between_16_31):
/* From 16 to 31 bytes. No branch when size == 16. */
vmovdqu (%rsi), %xmm2
@@ -501,11 +521,17 @@ L(between_16_31):
VPCMPEQ (%rdi), %xmm2, %xmm2
vpmovmskb %xmm2, %eax
subl $0xffff, %eax
+ /* Fast path for return zero. */
jnz L(return_vec_0)
/* No ymm register was touched. */
ret
# ifdef USE_AS_WMEMCMP
+ .p2align 4,, 2
+L(zero):
+ xorl %eax, %eax
+ ret
+
.p2align 4
L(one_or_less):
jb L(zero)
@@ -520,22 +546,20 @@ L(one_or_less):
# else
.p2align 4
-L(between_4_7):
- /* Load as big endian with overlapping movbe to avoid branches.
- */
- movbe (%rdi), %eax
- movbe (%rsi), %ecx
- shlq $32, %rax
- shlq $32, %rcx
- movbe -4(%rdi, %rdx), %edi
- movbe -4(%rsi, %rdx), %esi
- orq %rdi, %rax
- orq %rsi, %rcx
- subq %rcx, %rax
- jz L(zero_4_7)
- sbbl %eax, %eax
- orl $1, %eax
-L(zero_4_7):
+L(between_2_3):
+ /* Load as big endian to avoid branches. */
+ movzwl (%rdi), %eax
+ movzwl (%rsi), %ecx
+ bswap %eax
+ bswap %ecx
+ shrl %eax
+ shrl %ecx
+ movzbl -1(%rdi, %rdx), %edi
+ movzbl -1(%rsi, %rdx), %esi
+ orl %edi, %eax
+ orl %esi, %ecx
+ /* Subtraction is okay because the upper bit is zero. */
+ subl %ecx, %eax
/* No ymm register was touched. */
ret
# endif