From 93b1c47bd092f8e1444a10b5d6ec20e44d66459a Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Fri, 23 Apr 2021 15:56:24 -0400
Subject: [PATCH] x86: Optimize strchr-avx2.S

No bug. This commit optimizes strchr-avx2.S. The optimizations are all
small things such as saving an ALU in the alignment process, saving a
few instructions in the loop return, saving some bytes in the main
loop, and increasing the ILP in the return cases. test-strchr,
test-strchrnul, test-wcschr, and test-wcschrnul are all passing.

Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
(cherry picked from commit ccabe7971f508709d034b63b8672f6f751a3d356)
---
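
Note (editorial, not part of the commit message or the diff): the old
return paths computed the candidate pointer and then used cmovne to
overwrite it with a pre-zeroed %rdx whenever the byte found was not
CHAR, i.e. the null terminator came first and strchr must return NULL.
The patch compares first and takes a jne to a shared L(zero) tail
instead, which shortens the common "found CHAR" path and removes the
need to zero %rdx at entry. A minimal C sketch of the shared return
logic; strchrnul_mode stands in for the USE_AS_STRCHRNUL compile-time
switch and is not a name from the patch:

  #include <stddef.h>

  /* base: start of the vector that produced the match mask.
     mask: nonzero bitmask, bit i set iff base[i] is CHAR or NUL.  */
  static const char *return_match(const char *base, unsigned mask,
                                  char c, int strchrnul_mode)
  {
      const char *p = base + __builtin_ctz(mask); /* first CHAR or NUL */
      if (!strchrnul_mode && *p != c)
          return NULL; /* terminator came first: strchr returns NULL */
      return p;
  }
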
 sysdeps/x86_64/multiarch/strchr-avx2.S | 290 +++++++++++++++----------
 1 file changed, 170 insertions(+), 120 deletions(-)
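
Note (editorial): the entry sequence only takes the slow path when a
full-vector load from the start of the string would touch the next
page; that is what makes the unaligned first load safe. A minimal C
sketch of the check, assuming 4 KiB pages and VEC_SIZE == 32 as in
this file:

  #include <stdint.h>

  #define PAGE_SIZE 4096
  #define VEC_SIZE  32

  /* A VEC_SIZE-byte load starting at s stays inside s's page iff
     the page offset of s is at most PAGE_SIZE - VEC_SIZE.  */
  static int load_crosses_page(const char *s)
  {
      return ((uintptr_t)s & (PAGE_SIZE - 1)) > PAGE_SIZE - VEC_SIZE;
  }

This mirrors the andl $(PAGE_SIZE - 1) / cmpl $(PAGE_SIZE - VEC_SIZE)
pair at the top of ENTRY (STRCHR); the check itself is unchanged, the
patch only drops the xorl %edx, %edx that the old cmovne-based return
paths required.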
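
Note (editorial): the main loop relies on an unsigned byte-minimum
trick: xoring a data vector with the broadcast CHAR gives a zero byte
exactly where CHAR matched, and taking the unsigned min with the
original data also forces a zero wherever the data byte was the null
terminator. One compare of the combined minimum against zero then
tests all four vectors at once. A C intrinsics sketch of the idea,
assuming VEC_SIZE == 32 (the real loop also keeps the intermediate
vectors so the return paths can locate the exact match):

  #include <immintrin.h>
  #include <stdint.h>

  /* Returns a nonzero bitmask iff CHAR or NUL occurs in p[0..127];
     p must be 32-byte aligned.  */
  static int chunk_has_char_or_nul(const uint8_t *p, __m256i vchar)
  {
      __m256i v0 = _mm256_load_si256((const __m256i *)(p + 0));
      __m256i v1 = _mm256_load_si256((const __m256i *)(p + 32));
      __m256i v2 = _mm256_load_si256((const __m256i *)(p + 64));
      __m256i v3 = _mm256_load_si256((const __m256i *)(p + 96));

      /* min(v ^ vchar, v) has a zero byte iff v has CHAR or NUL.  */
      __m256i m0 = _mm256_min_epu8(_mm256_xor_si256(v0, vchar), v0);
      __m256i m1 = _mm256_min_epu8(_mm256_xor_si256(v1, vchar), v1);
      __m256i m2 = _mm256_min_epu8(_mm256_xor_si256(v2, vchar), v2);
      __m256i m3 = _mm256_min_epu8(_mm256_xor_si256(v3, vchar), v3);

      __m256i m = _mm256_min_epu8(_mm256_min_epu8(m0, m1),
                                  _mm256_min_epu8(m2, m3));
      return _mm256_movemask_epi8(
          _mm256_cmpeq_epi8(m, _mm256_setzero_si256()));
  }

In the loop itself the patch keeps the combined mask in %ecx and
advances the pointer with subq $-(VEC_SIZE * 4) rather than
addq $(VEC_SIZE * 4): -128 fits in a sign-extended 8-bit immediate
while +128 does not, which is part of the "saving some bytes in the
main loop". On exit, the combined mask is shifted into the high half
of %rcx and or'ed with the third vector's mask, so the fourth vector
needs no VPCMPEQ/vpmovmskb of its own.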
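
Note (editorial): several of the alignment savings come from the orq
idiom. orq $(VEC_SIZE - 1), %rdi parks the pointer on the last byte
of its vector, so the four probe loads use displacements 1,
VEC_SIZE + 1, VEC_SIZE * 2 + 1 and VEC_SIZE * 3 + 1 (at most 97),
each fitting a signed 8-bit displacement, whereas rounding down would
put the fourth probe at VEC_SIZE * 4 = 128 and force a 4-byte
displacement; that is the "saves 4 bytes of code on x4 check" in the
patch's own comment. A scalar sketch of the two roundings, assuming
VEC_SIZE == 32:

  #include <stdint.h>

  #define VEC_SIZE 32

  /* Old: round down; later vectors sit at +VEC_SIZE, +VEC_SIZE*2, ...  */
  static uintptr_t align_down(uintptr_t p)
  {
      return p & ~(uintptr_t)(VEC_SIZE - 1);
  }

  /* New: point at the last byte of the current vector; later vectors
     sit at +1, +VEC_SIZE + 1, ... with smaller displacements.  */
  static uintptr_t align_to_last_byte(uintptr_t p)
  {
      return p | (VEC_SIZE - 1);
  }

The cross-page path uses the same orq idiom and then exploits the
fact that sarx consumes only the low bits of its shift count (count
mod 32 for a 32-bit operation), so the unmasked pointer copy in %edx
can serve directly as the count that strips matches preceding the
start of the string.
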
diff --git a/sysdeps/x86_64/multiarch/strchr-avx2.S b/sysdeps/x86_64/multiarch/strchr-avx2.S
index 919d256c..5884726b 100644
--- a/sysdeps/x86_64/multiarch/strchr-avx2.S
+++ b/sysdeps/x86_64/multiarch/strchr-avx2.S
@@ -49,133 +49,144 @@
 
 	.section SECTION(.text),"ax",@progbits
 ENTRY (STRCHR)
-	movl	%edi, %ecx
-# ifndef USE_AS_STRCHRNUL
-	xorl	%edx, %edx
-# endif
-
 	/* Broadcast CHAR to YMM0.	*/
 	vmovd	%esi, %xmm0
+	movl	%edi, %eax
+	andl	$(PAGE_SIZE - 1), %eax
+	VPBROADCAST	%xmm0, %ymm0
 	vpxor	%xmm9, %xmm9, %xmm9
-	VPBROADCAST %xmm0, %ymm0
 
 	/* Check if we cross page boundary with one vector load.  */
-	andl	$(PAGE_SIZE - 1), %ecx
-	cmpl	$(PAGE_SIZE - VEC_SIZE), %ecx
-	ja  L(cross_page_boundary)
+	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
+	ja	L(cross_page_boundary)
 
 	/* Check the first VEC_SIZE bytes.	Search for both CHAR and the
 	   null byte.  */
 	vmovdqu	(%rdi), %ymm8
-	VPCMPEQ %ymm8, %ymm0, %ymm1
-	VPCMPEQ %ymm8, %ymm9, %ymm2
+	VPCMPEQ	%ymm8, %ymm0, %ymm1
+	VPCMPEQ	%ymm8, %ymm9, %ymm2
 	vpor	%ymm1, %ymm2, %ymm1
 	vpmovmskb %ymm1, %eax
 	testl	%eax, %eax
-	jz	L(more_vecs)
+	jz	L(aligned_more)
 	tzcntl	%eax, %eax
+# ifndef USE_AS_STRCHRNUL
 	/* Found CHAR or the null byte.	 */
+	cmp	(%rdi, %rax), %CHAR_REG
+	jne	L(zero)
+# endif
 	addq	%rdi, %rax
+	VZEROUPPER_RETURN
+
+	/* .p2align 5 helps keep performance more consistent if ENTRY()
+	   alignment % 32 was either 16 or 0. As well this makes the
+	   alignment % 32 of the loop_4x_vec fixed which makes tuning it
+	   easier.  */
+	.p2align 5
+L(first_vec_x4):
+	tzcntl	%eax, %eax
+	addq	$(VEC_SIZE * 3 + 1), %rdi
 # ifndef USE_AS_STRCHRNUL
-	cmp (%rax), %CHAR_REG
-	cmovne	%rdx, %rax
+	/* Found CHAR or the null byte.	 */
+	cmp	(%rdi, %rax), %CHAR_REG
+	jne	L(zero)
 # endif
-L(return_vzeroupper):
-	ZERO_UPPER_VEC_REGISTERS_RETURN
-
-	.p2align 4
-L(more_vecs):
-	/* Align data for aligned loads in the loop.  */
-	andq	$-VEC_SIZE, %rdi
-L(aligned_more):
-
-	/* Check the next 4 * VEC_SIZE.	 Only one VEC_SIZE at a time
-	   since data is only aligned to VEC_SIZE.	*/
-	vmovdqa	VEC_SIZE(%rdi), %ymm8
-	addq	$VEC_SIZE, %rdi
-	VPCMPEQ %ymm8, %ymm0, %ymm1
-	VPCMPEQ %ymm8, %ymm9, %ymm2
-	vpor	%ymm1, %ymm2, %ymm1
-	vpmovmskb %ymm1, %eax
-	testl	%eax, %eax
-	jnz	L(first_vec_x0)
-
-	vmovdqa	VEC_SIZE(%rdi), %ymm8
-	VPCMPEQ %ymm8, %ymm0, %ymm1
-	VPCMPEQ %ymm8, %ymm9, %ymm2
-	vpor	%ymm1, %ymm2, %ymm1
-	vpmovmskb %ymm1, %eax
-	testl	%eax, %eax
-	jnz	L(first_vec_x1)
-
-	vmovdqa	(VEC_SIZE * 2)(%rdi), %ymm8
-	VPCMPEQ %ymm8, %ymm0, %ymm1
-	VPCMPEQ %ymm8, %ymm9, %ymm2
-	vpor	%ymm1, %ymm2, %ymm1
-	vpmovmskb %ymm1, %eax
-	testl	%eax, %eax
-	jnz	L(first_vec_x2)
-
-	vmovdqa	(VEC_SIZE * 3)(%rdi), %ymm8
-	VPCMPEQ %ymm8, %ymm0, %ymm1
-	VPCMPEQ %ymm8, %ymm9, %ymm2
-	vpor	%ymm1, %ymm2, %ymm1
-	vpmovmskb %ymm1, %eax
-	testl	%eax, %eax
-	jz	L(prep_loop_4x)
+	addq	%rdi, %rax
+	VZEROUPPER_RETURN
 
-	tzcntl	%eax, %eax
-	leaq	(VEC_SIZE * 3)(%rdi, %rax), %rax
 # ifndef USE_AS_STRCHRNUL
-	cmp (%rax), %CHAR_REG
-	cmovne	%rdx, %rax
+L(zero):
+	xorl	%eax, %eax
+	VZEROUPPER_RETURN
 # endif
-	VZEROUPPER
-	ret
+
 
 	.p2align 4
-L(first_vec_x0):
+L(first_vec_x1):
 	tzcntl	%eax, %eax
-	/* Found CHAR or the null byte.	 */
-	addq	%rdi, %rax
+	incq	%rdi
 # ifndef USE_AS_STRCHRNUL
-	cmp (%rax), %CHAR_REG
-	cmovne	%rdx, %rax
+	/* Found CHAR or the null byte.	 */
+	cmp	(%rdi, %rax), %CHAR_REG
+	jne	L(zero)
 # endif
+	addq	%rdi, %rax
 	VZEROUPPER_RETURN
 
 	.p2align 4
-L(first_vec_x1):
+L(first_vec_x2):
 	tzcntl	%eax, %eax
-	leaq	VEC_SIZE(%rdi, %rax), %rax
+	addq	$(VEC_SIZE + 1), %rdi
 # ifndef USE_AS_STRCHRNUL
-	cmp (%rax), %CHAR_REG
-	cmovne	%rdx, %rax
+	/* Found CHAR or the null byte.	 */
+	cmp	(%rdi, %rax), %CHAR_REG
+	jne	L(zero)
 # endif
+	addq	%rdi, %rax
 	VZEROUPPER_RETURN
 
 	.p2align 4
-L(first_vec_x2):
+L(first_vec_x3):
 	tzcntl	%eax, %eax
-	/* Found CHAR or the null byte.	 */
-	leaq	(VEC_SIZE * 2)(%rdi, %rax), %rax
+	addq	$(VEC_SIZE * 2 + 1), %rdi
 # ifndef USE_AS_STRCHRNUL
-	cmp (%rax), %CHAR_REG
-	cmovne	%rdx, %rax
+	/* Found CHAR or the null byte.	 */
+	cmp	(%rdi, %rax), %CHAR_REG
+	jne	L(zero)
 # endif
+	addq	%rdi, %rax
 	VZEROUPPER_RETURN
 
-L(prep_loop_4x):
-	/* Align data to 4 * VEC_SIZE.	*/
-	andq	$-(VEC_SIZE * 4), %rdi
+	.p2align 4
+L(aligned_more):
+	/* Align data to VEC_SIZE - 1. This is the same number of
+	   instructions as using andq -VEC_SIZE but saves 4 bytes of code
+	   on x4 check.  */
+	orq	$(VEC_SIZE - 1), %rdi
+L(cross_page_continue):
+	/* Check the next 4 * VEC_SIZE.  Only one VEC_SIZE at a time
+	   since data is only aligned to VEC_SIZE.  */
+	vmovdqa	1(%rdi), %ymm8
+	VPCMPEQ	%ymm8, %ymm0, %ymm1
+	VPCMPEQ	%ymm8, %ymm9, %ymm2
+	vpor	%ymm1, %ymm2, %ymm1
+	vpmovmskb %ymm1, %eax
+	testl	%eax, %eax
+	jnz	L(first_vec_x1)
+
+	vmovdqa	(VEC_SIZE + 1)(%rdi), %ymm8
+	VPCMPEQ	%ymm8, %ymm0, %ymm1
+	VPCMPEQ	%ymm8, %ymm9, %ymm2
+	vpor	%ymm1, %ymm2, %ymm1
+	vpmovmskb %ymm1, %eax
+	testl	%eax, %eax
+	jnz	L(first_vec_x2)
+
+	vmovdqa	(VEC_SIZE * 2 + 1)(%rdi), %ymm8
+	VPCMPEQ	%ymm8, %ymm0, %ymm1
+	VPCMPEQ	%ymm8, %ymm9, %ymm2
+	vpor	%ymm1, %ymm2, %ymm1
+	vpmovmskb %ymm1, %eax
+	testl	%eax, %eax
+	jnz	L(first_vec_x3)
 
+	vmovdqa	(VEC_SIZE * 3 + 1)(%rdi), %ymm8
+	VPCMPEQ	%ymm8, %ymm0, %ymm1
+	VPCMPEQ	%ymm8, %ymm9, %ymm2
+	vpor	%ymm1, %ymm2, %ymm1
+	vpmovmskb %ymm1, %eax
+	testl	%eax, %eax
+	jnz	L(first_vec_x4)
+	/* Align data to VEC_SIZE * 4 - 1.	*/
+	addq	$(VEC_SIZE * 4 + 1), %rdi
+	andq	$-(VEC_SIZE * 4), %rdi
 	.p2align 4
 L(loop_4x_vec):
 	/* Compare 4 * VEC at a time forward.  */
-	vmovdqa	(VEC_SIZE * 4)(%rdi), %ymm5
-	vmovdqa	(VEC_SIZE * 5)(%rdi), %ymm6
-	vmovdqa	(VEC_SIZE * 6)(%rdi), %ymm7
-	vmovdqa	(VEC_SIZE * 7)(%rdi), %ymm8
+	vmovdqa	(%rdi), %ymm5
+	vmovdqa	(VEC_SIZE)(%rdi), %ymm6
+	vmovdqa	(VEC_SIZE * 2)(%rdi), %ymm7
+	vmovdqa	(VEC_SIZE * 3)(%rdi), %ymm8
 
 	/* Leaves only CHARS matching esi as 0.	 */
 	vpxor	%ymm5, %ymm0, %ymm1
@@ -191,63 +202,102 @@ L(loop_4x_vec):
 	VPMINU	%ymm1, %ymm2, %ymm5
 	VPMINU	%ymm3, %ymm4, %ymm6
 
-	VPMINU	%ymm5, %ymm6, %ymm5
+	VPMINU	%ymm5, %ymm6, %ymm6
 
-	VPCMPEQ %ymm5, %ymm9, %ymm5
-	vpmovmskb %ymm5, %eax
+	VPCMPEQ	%ymm6, %ymm9, %ymm6
+	vpmovmskb %ymm6, %ecx
+	subq	$-(VEC_SIZE * 4), %rdi
+	testl	%ecx, %ecx
+	jz	L(loop_4x_vec)
 
-	addq	$(VEC_SIZE * 4), %rdi
-	testl	%eax, %eax
-	jz  L(loop_4x_vec)
 
-	VPCMPEQ %ymm1, %ymm9, %ymm1
+	VPCMPEQ	%ymm1, %ymm9, %ymm1
 	vpmovmskb %ymm1, %eax
 	testl	%eax, %eax
-	jnz	L(first_vec_x0)
+	jnz	L(last_vec_x0)
+
 
-	VPCMPEQ %ymm2, %ymm9, %ymm2
+	VPCMPEQ	%ymm5, %ymm9, %ymm2
 	vpmovmskb %ymm2, %eax
 	testl	%eax, %eax
-	jnz	L(first_vec_x1)
+	jnz	L(last_vec_x1)
+
+	VPCMPEQ	%ymm3, %ymm9, %ymm3
+	vpmovmskb %ymm3, %eax
+	/* rcx has combined result from all 4 VEC. It will only be used
+	   if the first 3 other VEC all did not contain a match.  */
+	salq	$32, %rcx
+	orq	%rcx, %rax
+	tzcntq	%rax, %rax
+	subq	$(VEC_SIZE * 2), %rdi
+# ifndef USE_AS_STRCHRNUL
+	/* Found CHAR or the null byte.	 */
+	cmp	(%rdi, %rax), %CHAR_REG
+	jne	L(zero_end)
+# endif
+	addq	%rdi, %rax
+	VZEROUPPER_RETURN
+
+
+	.p2align 4
+L(last_vec_x0):
+	tzcntl	%eax, %eax
+	addq	$-(VEC_SIZE * 4), %rdi
+# ifndef USE_AS_STRCHRNUL
+	/* Found CHAR or the null byte.	 */
+	cmp	(%rdi, %rax), %CHAR_REG
+	jne	L(zero_end)
+# endif
+	addq	%rdi, %rax
+	VZEROUPPER_RETURN
 
-	VPCMPEQ %ymm3, %ymm9, %ymm3
-	VPCMPEQ %ymm4, %ymm9, %ymm4
-	vpmovmskb %ymm3, %ecx
-	vpmovmskb %ymm4, %eax
-	salq	$32, %rax
-	orq %rcx, %rax
-	tzcntq  %rax, %rax
-	leaq	(VEC_SIZE * 2)(%rdi, %rax), %rax
 # ifndef USE_AS_STRCHRNUL
-	cmp (%rax), %CHAR_REG
-	cmovne	%rdx, %rax
+L(zero_end):
+	xorl	%eax, %eax
+	VZEROUPPER_RETURN
 # endif
-	VZEROUPPER
-	ret
+
+	.p2align 4
+L(last_vec_x1):
+	tzcntl	%eax, %eax
+	subq	$(VEC_SIZE * 3), %rdi
+# ifndef USE_AS_STRCHRNUL
+	/* Found CHAR or the null byte.	 */
+	cmp	(%rdi, %rax), %CHAR_REG
+	jne	L(zero_end)
+# endif
+	addq	%rdi, %rax
+	VZEROUPPER_RETURN
+
 
 	/* Cold case for crossing page with first load.	 */
 	.p2align 4
 L(cross_page_boundary):
-	andq	$-VEC_SIZE, %rdi
-	andl	$(VEC_SIZE - 1), %ecx
-
-	vmovdqa	(%rdi), %ymm8
-	VPCMPEQ %ymm8, %ymm0, %ymm1
-	VPCMPEQ %ymm8, %ymm9, %ymm2
+	movq	%rdi, %rdx
+	/* Align rdi to VEC_SIZE - 1.  */
+	orq	$(VEC_SIZE - 1), %rdi
+	vmovdqa	-(VEC_SIZE - 1)(%rdi), %ymm8
+	VPCMPEQ	%ymm8, %ymm0, %ymm1
+	VPCMPEQ	%ymm8, %ymm9, %ymm2
 	vpor	%ymm1, %ymm2, %ymm1
 	vpmovmskb %ymm1, %eax
-	/* Remove the leading bits.	 */
-	sarxl	%ecx, %eax, %eax
+	/* Remove the leading bytes. sarxl only uses bits [5:0] of COUNT
+	   so no need to manually mod edx.  */
+	sarxl	%edx, %eax, %eax
 	testl	%eax, %eax
-	jz	L(aligned_more)
+	jz	L(cross_page_continue)
 	tzcntl	%eax, %eax
-	addq	%rcx, %rdi
-	addq	%rdi, %rax
 # ifndef USE_AS_STRCHRNUL
-	cmp (%rax), %CHAR_REG
-	cmovne	%rdx, %rax
+	xorl	%ecx, %ecx
+	/* Found CHAR or the null byte.	 */
+	cmp	(%rdx, %rax), %CHAR_REG
+	leaq	(%rdx, %rax), %rax
+	cmovne	%rcx, %rax
+# else
+	addq	%rdx, %rax
 # endif
-	VZEROUPPER_RETURN
+L(return_vzeroupper):
+	ZERO_UPPER_VEC_REGISTERS_RETURN
 
 END (STRCHR)
 # endif
-- 
GitLab