From 4619b6dbf13c17a13be2d2a0bdc9fcc2640b0f86 Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Fri, 15 Apr 2022 12:28:01 -0500
Subject: [PATCH] x86: Cleanup page cross code in memcmp-avx2-movbe.S

The old code was inefficient and wasted code size.  The new code is
smaller (-62 bytes) and has comparable or better performance in the
page cross case.
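
A minimal C sketch of the branchless trick the new code uses for the
[4, 7] byte case (illustrative only, not part of the patch): two
overlapping big-endian loads are packed into one 64-bit value per
buffer, so a single subtraction orders the buffers.  The helper name
memcmp_4_7 is hypothetical, __builtin_bswap32 is the GCC/Clang
builtin, and a little-endian host is assumed (where the byte swap
matches what movbe does).

    #include <stdint.h>
    #include <string.h>

    /* Hypothetical sketch: compare 4 <= n <= 7 bytes without
       branching on the exact length.  The head and tail loads
       overlap; overlapping bytes appear identically in both halves,
       so the first differing byte still decides the result.  */
    static int
    memcmp_4_7 (const unsigned char *s1, const unsigned char *s2,
                size_t n)
    {
      uint32_t h1, h2, t1, t2;
      memcpy (&h1, s1, 4);              /* head: first 4 bytes.  */
      memcpy (&h2, s2, 4);
      memcpy (&t1, s1 + n - 4, 4);      /* tail: last 4 bytes.  */
      memcpy (&t2, s2 + n - 4, 4);
      /* The byte swap emulates movbe: memory order becomes integer
         order, so an unsigned compare matches memcmp semantics.  */
      uint64_t v1 = ((uint64_t) __builtin_bswap32 (h1) << 32)
                    | __builtin_bswap32 (t1);
      uint64_t v2 = ((uint64_t) __builtin_bswap32 (h2) << 32)
                    | __builtin_bswap32 (t2);
      return v1 < v2 ? -1 : v1 > v2;
    }
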
geometric_mean(N=20) of page cross cases New / Original: 0.960

size, align0, align1, ret, New Time/Old Time
   1,   4095,      0,   0,             1.001
   1,   4095,      0,   1,             0.999
   1,   4095,      0,  -1,               1.0
   2,   4094,      0,   0,               1.0
   2,   4094,      0,   1,               1.0
   2,   4094,      0,  -1,               1.0
   3,   4093,      0,   0,               1.0
   3,   4093,      0,   1,               1.0
   3,   4093,      0,  -1,               1.0
   4,   4092,      0,   0,             0.987
   4,   4092,      0,   1,               1.0
   4,   4092,      0,  -1,               1.0
   5,   4091,      0,   0,             0.984
   5,   4091,      0,   1,             1.002
   5,   4091,      0,  -1,             1.005
   6,   4090,      0,   0,             0.993
   6,   4090,      0,   1,             1.001
   6,   4090,      0,  -1,             1.003
   7,   4089,      0,   0,             0.991
   7,   4089,      0,   1,               1.0
   7,   4089,      0,  -1,             1.001
   8,   4088,      0,   0,             0.875
   8,   4088,      0,   1,             0.881
   8,   4088,      0,  -1,             0.888
   9,   4087,      0,   0,             0.872
   9,   4087,      0,   1,             0.879
   9,   4087,      0,  -1,             0.883
  10,   4086,      0,   0,             0.878
  10,   4086,      0,   1,             0.886
  10,   4086,      0,  -1,             0.873
  11,   4085,      0,   0,             0.878
  11,   4085,      0,   1,             0.881
  11,   4085,      0,  -1,             0.879
  12,   4084,      0,   0,             0.873
  12,   4084,      0,   1,             0.889
  12,   4084,      0,  -1,             0.875
  13,   4083,      0,   0,             0.873
  13,   4083,      0,   1,             0.863
  13,   4083,      0,  -1,             0.863
  14,   4082,      0,   0,             0.838
  14,   4082,      0,   1,             0.869
  14,   4082,      0,  -1,             0.877
  15,   4081,      0,   0,             0.841
  15,   4081,      0,   1,             0.869
  15,   4081,      0,  -1,             0.876
  16,   4080,      0,   0,             0.988
  16,   4080,      0,   1,              0.99
  16,   4080,      0,  -1,             0.989
  17,   4079,      0,   0,             0.978
  17,   4079,      0,   1,             0.981
  17,   4079,      0,  -1,              0.98
  18,   4078,      0,   0,             0.981
  18,   4078,      0,   1,              0.98
  18,   4078,      0,  -1,             0.985
  19,   4077,      0,   0,             0.977
  19,   4077,      0,   1,             0.979
  19,   4077,      0,  -1,             0.986
  20,   4076,      0,   0,             0.977
  20,   4076,      0,   1,             0.986
  20,   4076,      0,  -1,             0.984
  21,   4075,      0,   0,             0.977
  21,   4075,      0,   1,             0.983
  21,   4075,      0,  -1,             0.988
  22,   4074,      0,   0,             0.983
  22,   4074,      0,   1,             0.994
  22,   4074,      0,  -1,             0.993
  23,   4073,      0,   0,              0.98
  23,   4073,      0,   1,             0.992
  23,   4073,      0,  -1,             0.995
  24,   4072,      0,   0,             0.989
  24,   4072,      0,   1,             0.989
  24,   4072,      0,  -1,             0.991
  25,   4071,      0,   0,              0.99
  25,   4071,      0,   1,             0.999
  25,   4071,      0,  -1,             0.996
  26,   4070,      0,   0,             0.993
  26,   4070,      0,   1,             0.995
  26,   4070,      0,  -1,             0.998
  27,   4069,      0,   0,             0.993
  27,   4069,      0,   1,             0.999
  27,   4069,      0,  -1,               1.0
  28,   4068,      0,   0,             0.997
  28,   4068,      0,   1,               1.0
  28,   4068,      0,  -1,             0.999
  29,   4067,      0,   0,             0.996
  29,   4067,      0,   1,             0.999
  29,   4067,      0,  -1,             0.999
  30,   4066,      0,   0,             0.991
  30,   4066,      0,   1,             1.001
  30,   4066,      0,  -1,             0.999
  31,   4065,      0,   0,             0.988
  31,   4065,      0,   1,             0.998
  31,   4065,      0,  -1,             0.998
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>

(cherry picked from commit 23102686ec67b856a2d4fd25ddaa1c0b8d175c4f)
---
 sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S | 98 ++++++++++++--------
 1 file changed, 61 insertions(+), 37 deletions(-)

diff --git a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
index 16fc673e..99258cf5 100644
--- a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
+++ b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
@@ -429,22 +429,21 @@ L(page_cross_less_vec):
 # ifndef USE_AS_WMEMCMP
 	cmpl	$8, %edx
 	jae	L(between_8_15)
+	/* Fall through for [4, 7].  */
 	cmpl	$4, %edx
-	jae	L(between_4_7)
+	jb	L(between_2_3)
 
-	/* Load as big endian to avoid branches.  */
-	movzwl	(%rdi), %eax
-	movzwl	(%rsi), %ecx
-	shll	$8, %eax
-	shll	$8, %ecx
-	bswap	%eax
-	bswap	%ecx
-	movzbl	-1(%rdi, %rdx), %edi
-	movzbl	-1(%rsi, %rdx), %esi
-	orl	%edi, %eax
-	orl	%esi, %ecx
-	/* Subtraction is okay because the upper 8 bits are zero.  */
-	subl	%ecx, %eax
+	movbe	(%rdi), %eax
+	movbe	(%rsi), %ecx
+	shlq	$32, %rax
+	shlq	$32, %rcx
+	movbe	-4(%rdi, %rdx), %edi
+	movbe	-4(%rsi, %rdx), %esi
+	orq	%rdi, %rax
+	orq	%rsi, %rcx
+	subq	%rcx, %rax
+	/* Fast path for return zero.  */
+	jnz	L(ret_nonzero)
 	/* No ymm register was touched.  */
 	ret
 
@@ -457,9 +456,33 @@ L(one_or_less):
 	/* No ymm register was touched.  */
 	ret
 
+	.p2align 4,, 5
+L(ret_nonzero):
+	sbbl	%eax, %eax
+	orl	$1, %eax
+	/* No ymm register was touched.  */
+	ret
+
+	.p2align 4,, 2
+L(zero):
+	xorl	%eax, %eax
+	/* No ymm register was touched.  */
+	ret
+
 	.p2align 4
 L(between_8_15):
-# endif
+	movbe	(%rdi), %rax
+	movbe	(%rsi), %rcx
+	subq	%rcx, %rax
+	jnz	L(ret_nonzero)
+	movbe	-8(%rdi, %rdx), %rax
+	movbe	-8(%rsi, %rdx), %rcx
+	subq	%rcx, %rax
+	/* Fast path for return zero.  */
+	jnz	L(ret_nonzero)
+	/* No ymm register was touched.  */
+	ret
+# else
 	/* If USE_AS_WMEMCMP fall through into 8-15 byte case.  */
 	vmovq	(%rdi), %xmm1
 	vmovq	(%rsi), %xmm2
@@ -475,16 +498,13 @@ L(between_8_15):
 	VPCMPEQ	%xmm1, %xmm2, %xmm2
 	vpmovmskb %xmm2, %eax
 	subl	$0xffff, %eax
+	/* Fast path for return zero.  */
 	jnz	L(return_vec_0)
 	/* No ymm register was touched.  */
 	ret
+# endif
 
-	.p2align 4
-L(zero):
-	xorl	%eax, %eax
-	ret
-
-	.p2align 4
+	.p2align 4,, 10
 L(between_16_31):
 	/* From 16 to 31 bytes.  No branch when size == 16.  */
 	vmovdqu	(%rsi), %xmm2
@@ -501,11 +521,17 @@ L(between_16_31):
 	VPCMPEQ	(%rdi), %xmm2, %xmm2
 	vpmovmskb %xmm2, %eax
 	subl	$0xffff, %eax
+	/* Fast path for return zero.  */
 	jnz	L(return_vec_0)
 	/* No ymm register was touched.  */
 	ret
 
 # ifdef USE_AS_WMEMCMP
+	.p2align 4,, 2
+L(zero):
+	xorl	%eax, %eax
+	ret
+
 	.p2align 4
 L(one_or_less):
 	jb	L(zero)
@@ -520,22 +546,20 @@ L(one_or_less):
 # else
 
 	.p2align 4
-L(between_4_7):
-	/* Load as big endian with overlapping movbe to avoid branches.
-	 */
-	movbe	(%rdi), %eax
-	movbe	(%rsi), %ecx
-	shlq	$32, %rax
-	shlq	$32, %rcx
-	movbe	-4(%rdi, %rdx), %edi
-	movbe	-4(%rsi, %rdx), %esi
-	orq	%rdi, %rax
-	orq	%rsi, %rcx
-	subq	%rcx, %rax
-	jz	L(zero_4_7)
-	sbbl	%eax, %eax
-	orl	$1, %eax
-L(zero_4_7):
+L(between_2_3):
+	/* Load as big endian to avoid branches.  */
+	movzwl	(%rdi), %eax
+	movzwl	(%rsi), %ecx
+	bswap	%eax
+	bswap	%ecx
+	shrl	%eax
+	shrl	%ecx
+	movzbl	-1(%rdi, %rdx), %edi
+	movzbl	-1(%rsi, %rdx), %esi
+	orl	%edi, %eax
+	orl	%esi, %ecx
+	/* Subtraction is okay because the upper bit is zero.  */
+	subl	%ecx, %eax
 	/* No ymm register was touched.  */
 	ret
 # endif
-- 
GitLab