From 0566d7c3c34685183e4f17f209651b0fba646df8 Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Thu, 21 Apr 2022 20:52:29 -0500
Subject: [PATCH] x86: Optimize {str|wcs}rchr-avx2

The new code unrolls the main loop slightly without adding too much
overhead and minimizes the comparisons for the search CHAR.

Geometric Mean of all benchmarks New / Old: 0.832
See email for all results.

Full xcheck passes on x86_64 with and without multiarch enabled.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>

(cherry picked from commit df7e295d18ffa34f629578c0017a9881af7620f6)
---
 sysdeps/x86_64/multiarch/strrchr-avx2.S | 426 +++++++++++++++---------
 1 file changed, 269 insertions(+), 157 deletions(-)

diff --git a/sysdeps/x86_64/multiarch/strrchr-avx2.S b/sysdeps/x86_64/multiarch/strrchr-avx2.S
index c949410b..3d26fad4 100644
--- a/sysdeps/x86_64/multiarch/strrchr-avx2.S
+++ b/sysdeps/x86_64/multiarch/strrchr-avx2.S
@@ -27,9 +27,13 @@
 # ifdef USE_AS_WCSRCHR
 #  define VPBROADCAST	vpbroadcastd
 #  define VPCMPEQ	vpcmpeqd
+#  define VPMIN	vpminud
+#  define CHAR_SIZE	4
 # else
 #  define VPBROADCAST	vpbroadcastb
 #  define VPCMPEQ	vpcmpeqb
+#  define VPMIN	vpminub
+#  define CHAR_SIZE	1
 # endif
 
 # ifndef VZEROUPPER
@@ -41,196 +45,304 @@
 # endif
 
 # define VEC_SIZE	32
+# define PAGE_SIZE	4096
 
-	.section SECTION(.text),"ax",@progbits
-ENTRY (STRRCHR)
-	movd	%esi, %xmm4
-	movl	%edi, %ecx
+	.section SECTION(.text), "ax", @progbits
+ENTRY(STRRCHR)
+	movd	%esi, %xmm7
+	movl	%edi, %eax
 	/* Broadcast CHAR to YMM4.  */
-	VPBROADCAST %xmm4, %ymm4
+	VPBROADCAST %xmm7, %ymm7
 	vpxor	%xmm0, %xmm0, %xmm0
 
-	/* Check if we may cross page boundary with one vector load.  */
-	andl	$(2 * VEC_SIZE - 1), %ecx
-	cmpl	$VEC_SIZE, %ecx
-	ja	L(cros_page_boundary)
+	/* Shift here instead of `andl` to save code size (saves a fetch
+	   block).  */
+	sall	$20, %eax
+	cmpl	$((PAGE_SIZE - VEC_SIZE) << 20), %eax
+	ja	L(cross_page)
 
+L(page_cross_continue):
 	vmovdqu	(%rdi), %ymm1
-	VPCMPEQ	%ymm1, %ymm0, %ymm2
-	VPCMPEQ	%ymm1, %ymm4, %ymm3
-	vpmovmskb %ymm2, %ecx
-	vpmovmskb %ymm3, %eax
-	addq	$VEC_SIZE, %rdi
+	/* Check end of string match.  */
+	VPCMPEQ	%ymm1, %ymm0, %ymm6
+	vpmovmskb %ymm6, %ecx
+	testl	%ecx, %ecx
+	jz	L(aligned_more)
+
+	/* Only check match with search CHAR if needed.  */
+	VPCMPEQ	%ymm1, %ymm7, %ymm1
+	vpmovmskb %ymm1, %eax
+	/* Check if match before first zero.  */
+	blsmskl	%ecx, %ecx
+	andl	%ecx, %eax
+	jz	L(ret0)
+	bsrl	%eax, %eax
+	addq	%rdi, %rax
+	/* We are off by 3 for wcsrchr if search CHAR is non-zero. If
+	   search CHAR is zero we are correct. Either way `andq
+	   -CHAR_SIZE, %rax` gets the correct result.  */
+# ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+# endif
+L(ret0):
+L(return_vzeroupper):
+	ZERO_UPPER_VEC_REGISTERS_RETURN
+
+	/* Returns for first vec x1/x2 have hard coded backward search
+	   path for earlier matches.  */
+	.p2align 4,, 10
+L(first_vec_x1):
+	VPCMPEQ	%ymm2, %ymm7, %ymm6
+	vpmovmskb %ymm6, %eax
+	blsmskl	%ecx, %ecx
+	andl	%ecx, %eax
+	jnz	L(first_vec_x1_return)
+
+	.p2align 4,, 4
+L(first_vec_x0_test):
+	VPCMPEQ	%ymm1, %ymm7, %ymm6
+	vpmovmskb %ymm6, %eax
+	testl	%eax, %eax
+	jz	L(ret1)
+	bsrl	%eax, %eax
+	addq	%r8, %rax
+# ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+# endif
+L(ret1):
+	VZEROUPPER_RETURN
 
+	.p2align 4,, 10
+L(first_vec_x0_x1_test):
+	VPCMPEQ	%ymm2, %ymm7, %ymm6
+	vpmovmskb %ymm6, %eax
+	/* Check ymm2 for search CHAR match. If no match then check ymm1
+	   before returning.  */
 	testl	%eax, %eax
-	jnz	L(first_vec)
+	jz	L(first_vec_x0_test)
+	.p2align 4,, 4
+L(first_vec_x1_return):
+	bsrl	%eax, %eax
+	leaq	1(%rdi, %rax), %rax
+# ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+# endif
+	VZEROUPPER_RETURN
 
-	testl	%ecx, %ecx
-	jnz	L(return_null)
 
-	andq	$-VEC_SIZE, %rdi
-	xorl	%edx, %edx
-	jmp	L(aligned_loop)
+	.p2align 4,, 10
+L(first_vec_x2):
+	VPCMPEQ	%ymm3, %ymm7, %ymm6
+	vpmovmskb %ymm6, %eax
+	blsmskl	%ecx, %ecx
+	/* If no in-range search CHAR match in ymm3 then need to check
+	   ymm1/ymm2 for an earlier match (we delay checking search
+	   CHAR matches until needed).  */
+	andl	%ecx, %eax
+	jz	L(first_vec_x0_x1_test)
+	bsrl	%eax, %eax
+	leaq	(VEC_SIZE + 1)(%rdi, %rax), %rax
+# ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+# endif
+	VZEROUPPER_RETURN
+
 
 	.p2align 4
-L(first_vec):
-	/* Check if there is a nul CHAR.  */
+L(aligned_more):
+	/* Save original pointer if match was in VEC 0.  */
+	movq	%rdi, %r8
+
+	/* Align src.  */
+	orq	$(VEC_SIZE - 1), %rdi
+	vmovdqu	1(%rdi), %ymm2
+	VPCMPEQ	%ymm2, %ymm0, %ymm6
+	vpmovmskb %ymm6, %ecx
 	testl	%ecx, %ecx
-	jnz	L(char_and_nul_in_first_vec)
+	jnz	L(first_vec_x1)
 
-	/* Remember the match and keep searching.  */
-	movl	%eax, %edx
-	movq	%rdi, %rsi
-	andq	$-VEC_SIZE, %rdi
-	jmp	L(aligned_loop)
+	vmovdqu	(VEC_SIZE + 1)(%rdi), %ymm3
+	VPCMPEQ	%ymm3, %ymm0, %ymm6
+	vpmovmskb %ymm6, %ecx
+	testl	%ecx, %ecx
+	jnz	L(first_vec_x2)
 
+	/* Save pointer again before realigning.  */
+	movq	%rdi, %rsi
+	addq	$(VEC_SIZE + 1), %rdi
+	andq	$-(VEC_SIZE * 2), %rdi
 	.p2align 4
-L(cros_page_boundary):
-	andl	$(VEC_SIZE - 1), %ecx
-	andq	$-VEC_SIZE, %rdi
-	vmovdqa	(%rdi), %ymm1
-	VPCMPEQ	%ymm1, %ymm0, %ymm2
-	VPCMPEQ	%ymm1, %ymm4, %ymm3
-	vpmovmskb %ymm2, %edx
-	vpmovmskb %ymm3, %eax
-	shrl	%cl, %edx
-	shrl	%cl, %eax
-	addq	$VEC_SIZE, %rdi
-
-	/* Check if there is a CHAR.  */
+L(first_aligned_loop):
+	/* Do 2x VEC at a time. Any more and the cost of finding the
+	   match outweights loop benefit.  */
+	vmovdqa	(VEC_SIZE * 0)(%rdi), %ymm4
+	vmovdqa	(VEC_SIZE * 1)(%rdi), %ymm5
+
+	VPCMPEQ	%ymm4, %ymm7, %ymm6
+	VPMIN	%ymm4, %ymm5, %ymm8
+	VPCMPEQ	%ymm5, %ymm7, %ymm10
+	vpor	%ymm6, %ymm10, %ymm5
+	VPCMPEQ	%ymm8, %ymm0, %ymm8
+	vpor	%ymm5, %ymm8, %ymm9
+
+	vpmovmskb %ymm9, %eax
+	addq	$(VEC_SIZE * 2), %rdi
+	/* No zero or search CHAR.  */
 	testl	%eax, %eax
-	jnz	L(found_char)
-
-	testl	%edx, %edx
-	jnz	L(return_null)
+	jz	L(first_aligned_loop)
 
-	jmp	L(aligned_loop)
-
-	.p2align 4
-L(found_char):
-	testl	%edx, %edx
-	jnz	L(char_and_nul)
+	/* If no zero CHAR then go to second loop (this allows us to
+	   throw away all prior work).  */
+	vpmovmskb %ymm8, %ecx
+	testl	%ecx, %ecx
+	jz	L(second_aligned_loop_prep)
 
-	/* Remember the match and keep searching.  */
-	movl	%eax, %edx
-	leaq	(%rdi, %rcx), %rsi
+	/* Search char could be zero so we need to get the true match.
+	 */
+	vpmovmskb %ymm5, %eax
+	testl	%eax, %eax
+	jnz	L(first_aligned_loop_return)
 
-	.p2align 4
-L(aligned_loop):
-	vmovdqa	(%rdi), %ymm1
-	VPCMPEQ	%ymm1, %ymm0, %ymm2
-	addq	$VEC_SIZE, %rdi
-	VPCMPEQ	%ymm1, %ymm4, %ymm3
-	vpmovmskb %ymm2, %ecx
-	vpmovmskb %ymm3, %eax
-	orl	%eax, %ecx
-	jnz	L(char_nor_null)
-
-	vmovdqa	(%rdi), %ymm1
-	VPCMPEQ	%ymm1, %ymm0, %ymm2
-	add	$VEC_SIZE, %rdi
-	VPCMPEQ	%ymm1, %ymm4, %ymm3
-	vpmovmskb %ymm2, %ecx
+	.p2align 4,, 4
+L(first_vec_x1_or_x2):
+	VPCMPEQ	%ymm3, %ymm7, %ymm3
+	VPCMPEQ	%ymm2, %ymm7, %ymm2
 	vpmovmskb %ymm3, %eax
-	orl	%eax, %ecx
-	jnz	L(char_nor_null)
-
-	vmovdqa	(%rdi), %ymm1
-	VPCMPEQ	%ymm1, %ymm0, %ymm2
-	addq	$VEC_SIZE, %rdi
-	VPCMPEQ	%ymm1, %ymm4, %ymm3
-	vpmovmskb %ymm2, %ecx
-	vpmovmskb %ymm3, %eax
-	orl	%eax, %ecx
-	jnz	L(char_nor_null)
-
-	vmovdqa	(%rdi), %ymm1
-	VPCMPEQ	%ymm1, %ymm0, %ymm2
-	addq	$VEC_SIZE, %rdi
-	VPCMPEQ	%ymm1, %ymm4, %ymm3
-	vpmovmskb %ymm2, %ecx
-	vpmovmskb %ymm3, %eax
-	orl	%eax, %ecx
-	jz	L(aligned_loop)
-
-	.p2align 4
-L(char_nor_null):
-	/* Find a CHAR or a nul CHAR in a loop.  */
-	testl	%eax, %eax
-	jnz	L(match)
-L(return_value):
-	testl	%edx, %edx
-	jz	L(return_null)
-	movl	%edx, %eax
-	movq	%rsi, %rdi
+	vpmovmskb %ymm2, %edx
+	/* Use add for macro-fusion.  */
+	addq	%rax, %rdx
+	jz	L(first_vec_x0_test)
+	/* NB: We could move this shift to before the branch and save a
+	   bit of code size / performance on the fall through. The
+	   branch leads to the null case which generally seems hotter
+	   than char in first 3x VEC.  */
+	salq	$32, %rax
+	addq	%rdx, %rax
+	bsrq	%rax, %rax
+	leaq	1(%rsi, %rax), %rax
+# ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+# endif
+	VZEROUPPER_RETURN
 
+	.p2align 4,, 8
+L(first_aligned_loop_return):
+	VPCMPEQ	%ymm4, %ymm0, %ymm4
+	vpmovmskb %ymm4, %edx
+	salq	$32, %rcx
+	orq	%rdx, %rcx
+
+	vpmovmskb %ymm10, %eax
+	vpmovmskb %ymm6, %edx
+	salq	$32, %rax
+	orq	%rdx, %rax
+	blsmskq	%rcx, %rcx
+	andq	%rcx, %rax
+	jz	L(first_vec_x1_or_x2)
+
+	bsrq	%rax, %rax
+	leaq	-(VEC_SIZE * 2)(%rdi, %rax), %rax
 # ifdef USE_AS_WCSRCHR
-	/* Keep the first bit for each matching CHAR for bsr.  */
-	andl	$0x11111111, %eax
+	andq	$-CHAR_SIZE, %rax
 # endif
-	bsrl	%eax, %eax
-	leaq	-VEC_SIZE(%rdi, %rax), %rax
-L(return_vzeroupper):
-	ZERO_UPPER_VEC_REGISTERS_RETURN
+	VZEROUPPER_RETURN
 
+	/* Search char cannot be zero.  */
 	.p2align 4
-L(match):
-	/* Find a CHAR.  Check if there is a nul CHAR.  */
-	vpmovmskb %ymm2, %ecx
-	testl	%ecx, %ecx
-	jnz	L(find_nul)
-
-	/* Remember the match and keep searching.  */
-	movl	%eax, %edx
+L(second_aligned_loop_set_furthest_match):
+	/* Save VEC and pointer from most recent match.  */
+L(second_aligned_loop_prep):
 	movq	%rdi, %rsi
-	jmp	L(aligned_loop)
+	vmovdqu	%ymm6, %ymm2
+	vmovdqu	%ymm10, %ymm3
 
 	.p2align 4
-L(find_nul):
-# ifdef USE_AS_WCSRCHR
-	/* Keep the first bit for each matching CHAR for bsr.  */
-	andl	$0x11111111, %ecx
-	andl	$0x11111111, %eax
-# endif
-	/* Mask out any matching bits after the nul CHAR.  */
-	movl	%ecx, %r8d
-	subl	$1, %r8d
-	xorl	%ecx, %r8d
-	andl	%r8d, %eax
+L(second_aligned_loop):
+	/* Search 2x at at time.  */
+	vmovdqa	(VEC_SIZE * 0)(%rdi), %ymm4
+	vmovdqa	(VEC_SIZE * 1)(%rdi), %ymm5
+
+	VPCMPEQ	%ymm4, %ymm7, %ymm6
+	VPMIN	%ymm4, %ymm5, %ymm1
+	VPCMPEQ	%ymm5, %ymm7, %ymm10
+	vpor	%ymm6, %ymm10, %ymm5
+	VPCMPEQ	%ymm1, %ymm0, %ymm1
+	vpor	%ymm5, %ymm1, %ymm9
+
+	vpmovmskb %ymm9, %eax
+	addq	$(VEC_SIZE * 2), %rdi
 	testl	%eax, %eax
-	/* If there is no CHAR here, return the remembered one.  */
-	jz	L(return_value)
-	bsrl	%eax, %eax
-	leaq	-VEC_SIZE(%rdi, %rax), %rax
-	VZEROUPPER_RETURN
-
-	.p2align 4
-L(char_and_nul):
-	/* Find both a CHAR and a nul CHAR.  */
-	addq	%rcx, %rdi
-	movl	%edx, %ecx
-L(char_and_nul_in_first_vec):
-# ifdef USE_AS_WCSRCHR
-	/* Keep the first bit for each matching CHAR for bsr.  */
-	andl	$0x11111111, %ecx
-	andl	$0x11111111, %eax
-# endif
-	/* Mask out any matching bits after the nul CHAR.  */
-	movl	%ecx, %r8d
-	subl	$1, %r8d
-	xorl	%ecx, %r8d
-	andl	%r8d, %eax
+	jz	L(second_aligned_loop)
+	vpmovmskb %ymm1, %ecx
+	testl	%ecx, %ecx
+	jz	L(second_aligned_loop_set_furthest_match)
+	vpmovmskb %ymm5, %eax
 	testl	%eax, %eax
-	/* Return null pointer if the nul CHAR comes first.  */
-	jz	L(return_null)
-	bsrl	%eax, %eax
-	leaq	-VEC_SIZE(%rdi, %rax), %rax
+	jnz	L(return_new_match)
+
+	/* This is the hot patch. We know CHAR is inbounds and that
+	   ymm3/ymm2 have latest match.  */
+	.p2align 4,, 4
+L(return_old_match):
+	vpmovmskb %ymm3, %eax
+	vpmovmskb %ymm2, %edx
+	salq	$32, %rax
+	orq	%rdx, %rax
+	bsrq	%rax, %rax
+	/* Search char cannot be zero so safe to just use lea for
+	   wcsrchr.  */
+	leaq	(VEC_SIZE * -2 -(CHAR_SIZE - 1))(%rsi, %rax), %rax
 	VZEROUPPER_RETURN
 
-	.p2align 4
-L(return_null):
-	xorl	%eax, %eax
+	/* Last iteration also potentially has a match.  */
+	.p2align 4,, 8
+L(return_new_match):
+	VPCMPEQ	%ymm4, %ymm0, %ymm4
+	vpmovmskb %ymm4, %edx
+	salq	$32, %rcx
+	orq	%rdx, %rcx
+
+	vpmovmskb %ymm10, %eax
+	vpmovmskb %ymm6, %edx
+	salq	$32, %rax
+	orq	%rdx, %rax
+	blsmskq	%rcx, %rcx
+	andq	%rcx, %rax
+	jz	L(return_old_match)
+	bsrq	%rax, %rax
+	/* Search char cannot be zero so safe to just use lea for
+	   wcsrchr.  */
+	leaq	(VEC_SIZE * -2 -(CHAR_SIZE - 1))(%rdi, %rax), %rax
 	VZEROUPPER_RETURN
 
-END (STRRCHR)
+	.p2align 4,, 4
+L(cross_page):
+	movq	%rdi, %rsi
+	andq	$-VEC_SIZE, %rsi
+	vmovdqu	(%rsi), %ymm1
+	VPCMPEQ	%ymm1, %ymm0, %ymm6
+	vpmovmskb %ymm6, %ecx
+	/* Shift out zero CHAR matches that are before the begining of
+	   src (rdi).  */
+	shrxl	%edi, %ecx, %ecx
+	testl	%ecx, %ecx
+	jz	L(page_cross_continue)
+	VPCMPEQ	%ymm1, %ymm7, %ymm1
+	vpmovmskb %ymm1, %eax
+
+	/* Shift out search CHAR matches that are before the begining of
+	   src (rdi).  */
+	shrxl	%edi, %eax, %eax
+	blsmskl	%ecx, %ecx
+	/* Check if any search CHAR match in range.  */
+	andl	%ecx, %eax
+	jz	L(ret2)
+	bsrl	%eax, %eax
+	addq	%rdi, %rax
+# ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+# endif
+L(ret2):
+	VZEROUPPER_RETURN
+END(STRRCHR)
 #endif
-- 
GitLab
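
Note (not part of the patch above): the new first-vector return path finds the
last occurrence of the search CHAR that is not past the terminating zero using
only the two `vpmovmskb` masks plus `blsmskl`/`andl`/`bsrl`. Below is a minimal
C sketch of that mask arithmetic for one 32-byte vector; the function and
parameter names are hypothetical, and the caller is assumed to have already
checked that the zero mask is non-zero (the `testl %ecx, %ecx; jz
L(aligned_more)` guard in the patch).

#include <stddef.h>
#include <stdint.h>

/* Hypothetical illustration of the first-vector return path.
   zmask = vpmovmskb of (byte == 0), cmask = vpmovmskb of (byte == CHAR),
   both taken over the 32 bytes starting at `base`.  The caller has
   already verified zmask != 0, i.e. the terminator is in this vector.  */
static const char *
last_match_before_nul (const char *base, uint32_t zmask, uint32_t cmask)
{
  /* blsmskl %ecx, %ecx: set every bit up to and including the lowest
     set bit of zmask, i.e. all byte positions at or before the first
     zero byte.  */
  uint32_t in_bounds = zmask ^ (zmask - 1);
  /* andl %ecx, %eax: keep only CHAR matches at or before the zero.  */
  uint32_t valid = cmask & in_bounds;
  if (valid == 0)
    return NULL;                        /* jz L(ret0): no match at all.  */
  /* bsrl %eax, %eax: highest surviving bit, i.e. the last occurrence.  */
  unsigned last = 31 - (unsigned) __builtin_clz (valid);
  return base + last;                   /* addq %rdi, %rax  */
}

For wcsrchr the assembly additionally rounds the result down with
`andq $-CHAR_SIZE, %rax`, since the byte-granular `bsrl` can point into the
middle of a 4-byte character (see the comment in the patch).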