commit 00f09a14d2818f438959e764834abb3913f2b20a
Author: Noah Goldstein <goldstein.w.n@gmail.com>
Date:   Thu Apr 21 20:52:29 2022 -0500

    x86: Optimize {str|wcs}rchr-avx2

    The new code unrolls the main loop slightly without adding too much
    overhead and minimizes the comparisons for the search CHAR.

    Geometric Mean of all benchmarks New / Old: 0.832
    See email for all results.

    Full xcheck passes on x86_64 with and without multiarch enabled.
    Reviewed-by: H.J. Lu <hjl.tools@gmail.com>

    (cherry picked from commit df7e295d18ffa34f629578c0017a9881af7620f6)
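
The idea, as a rough scalar C sketch (illustration only: eq_mask,
strrchr_sketch and BLK are invented names, not glibc code). The real routine
works on 32-byte AVX2 vectors, keeps its loads aligned, unrolls the main loop
2x and uses VPMIN so a single test covers the zero check for both vectors, and
only extracts the search-CHAR mask when it is actually needed:

    #include <stddef.h>
    #include <stdint.h>

    #define BLK 32	/* models VEC_SIZE */

    /* Bitmask of positions i in p[0..BLK-1] where p[i] == c
       (models VPCMPEQ + vpmovmskb).  */
    static uint32_t
    eq_mask (const unsigned char *p, unsigned char c)
    {
      uint32_t m = 0;
      for (int i = 0; i < BLK; i++)
        m |= (uint32_t) (p[i] == c) << i;
      return m;
    }

    /* Sketch of strrchr.  Assumes reading BLK bytes at a time is safe;
       the real code guarantees that by aligning its loads and
       special-casing page crossings.  */
    static char *
    strrchr_sketch (const char *s, int ch)
    {
      const unsigned char *p = (const unsigned char *) s;
      unsigned char c = (unsigned char) ch;
      const unsigned char *last = NULL;

      for (;; p += BLK)
        {
          uint32_t zmask = eq_mask (p, 0);	/* terminator positions */
          if (zmask == 0)
            {
              /* No terminator in this block: remember the latest match
                 and keep going.  */
              uint32_t cmask = eq_mask (p, c);
              if (cmask != 0)
                last = p + (31 - __builtin_clz (cmask));	/* bsr */
              continue;
            }
          /* Terminator present: only matches at or before it count.
             zmask ^ (zmask - 1) keeps the bits up to and including the
             lowest set bit (the blsmsk trick used in the patch).  */
          uint32_t cmask = eq_mask (p, c) & (zmask ^ (zmask - 1));
          if (cmask != 0)
            return (char *) (p + (31 - __builtin_clz (cmask)));
          return (char *) last;
        }
    }
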
diff --git a/sysdeps/x86_64/multiarch/strrchr-avx2.S b/sysdeps/x86_64/multiarch/strrchr-avx2.S
index 0deba97114d3b83d..b8dec737d5213b25 100644
--- a/sysdeps/x86_64/multiarch/strrchr-avx2.S
+++ b/sysdeps/x86_64/multiarch/strrchr-avx2.S
@@ -27,9 +27,13 @@
 # ifdef USE_AS_WCSRCHR
 #  define VPBROADCAST	vpbroadcastd
 #  define VPCMPEQ	vpcmpeqd
+#  define VPMIN	vpminud
+#  define CHAR_SIZE	4
 # else
 #  define VPBROADCAST	vpbroadcastb
 #  define VPCMPEQ	vpcmpeqb
+#  define VPMIN	vpminub
+#  define CHAR_SIZE	1
 # endif
 
 # ifndef VZEROUPPER
@@ -41,196 +45,304 @@
 # endif
 
 # define VEC_SIZE	32
+# define PAGE_SIZE	4096
 
-	.section SECTION(.text),"ax",@progbits
-ENTRY (STRRCHR)
-	movd	%esi, %xmm4
-	movl	%edi, %ecx
+	.section SECTION(.text), "ax", @progbits
+ENTRY(STRRCHR)
+	movd	%esi, %xmm7
+	movl	%edi, %eax
 	/* Broadcast CHAR to YMM4.  */
-	VPBROADCAST %xmm4, %ymm4
+	VPBROADCAST %xmm7, %ymm7
 	vpxor	%xmm0, %xmm0, %xmm0
 
-	/* Check if we may cross page boundary with one vector load.  */
-	andl	$(2 * VEC_SIZE - 1), %ecx
-	cmpl	$VEC_SIZE, %ecx
-	ja	L(cros_page_boundary)
+	/* Shift here instead of `andl` to save code size (saves a fetch
+	   block).  */
+	sall	$20, %eax
+	cmpl	$((PAGE_SIZE - VEC_SIZE) << 20), %eax
+	ja	L(cross_page)
 
+L(page_cross_continue):
 	vmovdqu	(%rdi), %ymm1
-	VPCMPEQ	%ymm1, %ymm0, %ymm2
-	VPCMPEQ	%ymm1, %ymm4, %ymm3
-	vpmovmskb %ymm2, %ecx
-	vpmovmskb %ymm3, %eax
-	addq	$VEC_SIZE, %rdi
+	/* Check end of string match.  */
+	VPCMPEQ	%ymm1, %ymm0, %ymm6
+	vpmovmskb %ymm6, %ecx
+	testl	%ecx, %ecx
+	jz	L(aligned_more)
+
+	/* Only check match with search CHAR if needed.  */
+	VPCMPEQ	%ymm1, %ymm7, %ymm1
+	vpmovmskb %ymm1, %eax
+	/* Check if match before first zero.  */
+	blsmskl	%ecx, %ecx
+	andl	%ecx, %eax
+	jz	L(ret0)
+	bsrl	%eax, %eax
+	addq	%rdi, %rax
+	/* We are off by 3 for wcsrchr if search CHAR is non-zero. If
+	   search CHAR is zero we are correct. Either way `andq
+	   -CHAR_SIZE, %rax` gets the correct result.  */
+# ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+# endif
+L(ret0):
+L(return_vzeroupper):
+	ZERO_UPPER_VEC_REGISTERS_RETURN
+
+	/* Returns for first vec x1/x2 have hard coded backward search
+	   path for earlier matches.  */
+	.p2align 4,, 10
+L(first_vec_x1):
+	VPCMPEQ	%ymm2, %ymm7, %ymm6
+	vpmovmskb %ymm6, %eax
+	blsmskl	%ecx, %ecx
+	andl	%ecx, %eax
+	jnz	L(first_vec_x1_return)
+
+	.p2align 4,, 4
+L(first_vec_x0_test):
+	VPCMPEQ	%ymm1, %ymm7, %ymm6
+	vpmovmskb %ymm6, %eax
+	testl	%eax, %eax
+	jz	L(ret1)
+	bsrl	%eax, %eax
+	addq	%r8, %rax
+# ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+# endif
+L(ret1):
+	VZEROUPPER_RETURN
 
+	.p2align 4,, 10
+L(first_vec_x0_x1_test):
+	VPCMPEQ	%ymm2, %ymm7, %ymm6
+	vpmovmskb %ymm6, %eax
+	/* Check ymm2 for search CHAR match. If no match then check ymm1
+	   before returning.  */
 	testl	%eax, %eax
-	jnz	L(first_vec)
+	jz	L(first_vec_x0_test)
+	.p2align 4,, 4
+L(first_vec_x1_return):
+	bsrl	%eax, %eax
+	leaq	1(%rdi, %rax), %rax
+# ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+# endif
+	VZEROUPPER_RETURN
 
-	testl	%ecx, %ecx
-	jnz	L(return_null)
 
-	andq	$-VEC_SIZE, %rdi
-	xorl	%edx, %edx
-	jmp	L(aligned_loop)
+	.p2align 4,, 10
+L(first_vec_x2):
+	VPCMPEQ	%ymm3, %ymm7, %ymm6
+	vpmovmskb %ymm6, %eax
+	blsmskl	%ecx, %ecx
+	/* If no in-range search CHAR match in ymm3 then need to check
+	   ymm1/ymm2 for an earlier match (we delay checking search
+	   CHAR matches until needed).  */
+	andl	%ecx, %eax
+	jz	L(first_vec_x0_x1_test)
+	bsrl	%eax, %eax
+	leaq	(VEC_SIZE + 1)(%rdi, %rax), %rax
+# ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+# endif
+	VZEROUPPER_RETURN
+
 
 	.p2align 4
-L(first_vec):
-	/* Check if there is a nul CHAR.  */
+L(aligned_more):
+	/* Save original pointer if match was in VEC 0.  */
+	movq	%rdi, %r8
+
+	/* Align src.  */
+	orq	$(VEC_SIZE - 1), %rdi
+	vmovdqu	1(%rdi), %ymm2
+	VPCMPEQ	%ymm2, %ymm0, %ymm6
+	vpmovmskb %ymm6, %ecx
 	testl	%ecx, %ecx
-	jnz	L(char_and_nul_in_first_vec)
+	jnz	L(first_vec_x1)
 
-	/* Remember the match and keep searching.  */
-	movl	%eax, %edx
-	movq	%rdi, %rsi
-	andq	$-VEC_SIZE, %rdi
-	jmp	L(aligned_loop)
+	vmovdqu	(VEC_SIZE + 1)(%rdi), %ymm3
+	VPCMPEQ	%ymm3, %ymm0, %ymm6
+	vpmovmskb %ymm6, %ecx
+	testl	%ecx, %ecx
+	jnz	L(first_vec_x2)
 
+	/* Save pointer again before realigning.  */
+	movq	%rdi, %rsi
+	addq	$(VEC_SIZE + 1), %rdi
+	andq	$-(VEC_SIZE * 2), %rdi
 	.p2align 4
-L(cros_page_boundary):
-	andl	$(VEC_SIZE - 1), %ecx
-	andq	$-VEC_SIZE, %rdi
-	vmovdqa	(%rdi), %ymm1
-	VPCMPEQ	%ymm1, %ymm0, %ymm2
-	VPCMPEQ	%ymm1, %ymm4, %ymm3
-	vpmovmskb %ymm2, %edx
-	vpmovmskb %ymm3, %eax
-	shrl	%cl, %edx
-	shrl	%cl, %eax
-	addq	$VEC_SIZE, %rdi
-
-	/* Check if there is a CHAR.  */
+L(first_aligned_loop):
+	/* Do 2x VEC at a time. Any more and the cost of finding the
+	   match outweighs loop benefit.  */
+	vmovdqa	(VEC_SIZE * 0)(%rdi), %ymm4
+	vmovdqa	(VEC_SIZE * 1)(%rdi), %ymm5
+
+	VPCMPEQ	%ymm4, %ymm7, %ymm6
+	VPMIN	%ymm4, %ymm5, %ymm8
+	VPCMPEQ	%ymm5, %ymm7, %ymm10
+	vpor	%ymm6, %ymm10, %ymm5
+	VPCMPEQ	%ymm8, %ymm0, %ymm8
+	vpor	%ymm5, %ymm8, %ymm9
+
+	vpmovmskb %ymm9, %eax
+	addq	$(VEC_SIZE * 2), %rdi
+	/* No zero or search CHAR.  */
 	testl	%eax, %eax
-	jnz	L(found_char)
-
-	testl	%edx, %edx
-	jnz	L(return_null)
+	jz	L(first_aligned_loop)
 
-	jmp	L(aligned_loop)
-
-	.p2align 4
-L(found_char):
-	testl	%edx, %edx
-	jnz	L(char_and_nul)
+	/* If no zero CHAR then go to second loop (this allows us to
+	   throw away all prior work).  */
+	vpmovmskb %ymm8, %ecx
+	testl	%ecx, %ecx
+	jz	L(second_aligned_loop_prep)
 
-	/* Remember the match and keep searching.  */
-	movl	%eax, %edx
-	leaq	(%rdi, %rcx), %rsi
+	/* Search char could be zero so we need to get the true match.
+	 */
+	vpmovmskb %ymm5, %eax
+	testl	%eax, %eax
+	jnz	L(first_aligned_loop_return)
 
-	.p2align 4
-L(aligned_loop):
-	vmovdqa	(%rdi), %ymm1
-	VPCMPEQ	%ymm1, %ymm0, %ymm2
-	addq	$VEC_SIZE, %rdi
-	VPCMPEQ	%ymm1, %ymm4, %ymm3
-	vpmovmskb %ymm2, %ecx
-	vpmovmskb %ymm3, %eax
-	orl	%eax, %ecx
-	jnz	L(char_nor_null)
-
-	vmovdqa	(%rdi), %ymm1
-	VPCMPEQ	%ymm1, %ymm0, %ymm2
-	add	$VEC_SIZE, %rdi
-	VPCMPEQ	%ymm1, %ymm4, %ymm3
-	vpmovmskb %ymm2, %ecx
+	.p2align 4,, 4
+L(first_vec_x1_or_x2):
+	VPCMPEQ	%ymm3, %ymm7, %ymm3
+	VPCMPEQ	%ymm2, %ymm7, %ymm2
 	vpmovmskb %ymm3, %eax
-	orl	%eax, %ecx
-	jnz	L(char_nor_null)
-
-	vmovdqa	(%rdi), %ymm1
-	VPCMPEQ	%ymm1, %ymm0, %ymm2
-	addq	$VEC_SIZE, %rdi
-	VPCMPEQ	%ymm1, %ymm4, %ymm3
-	vpmovmskb %ymm2, %ecx
-	vpmovmskb %ymm3, %eax
-	orl	%eax, %ecx
-	jnz	L(char_nor_null)
-
-	vmovdqa	(%rdi), %ymm1
-	VPCMPEQ	%ymm1, %ymm0, %ymm2
-	addq	$VEC_SIZE, %rdi
-	VPCMPEQ	%ymm1, %ymm4, %ymm3
-	vpmovmskb %ymm2, %ecx
-	vpmovmskb %ymm3, %eax
-	orl	%eax, %ecx
-	jz	L(aligned_loop)
-
-	.p2align 4
-L(char_nor_null):
-	/* Find a CHAR or a nul CHAR in a loop.  */
-	testl	%eax, %eax
-	jnz	L(match)
-L(return_value):
-	testl	%edx, %edx
-	jz	L(return_null)
-	movl	%edx, %eax
-	movq	%rsi, %rdi
+	vpmovmskb %ymm2, %edx
+	/* Use add for macro-fusion.  */
+	addq	%rax, %rdx
+	jz	L(first_vec_x0_test)
+	/* NB: We could move this shift to before the branch and save a
+	   bit of code size / performance on the fall through. The
+	   branch leads to the null case which generally seems hotter
+	   than char in first 3x VEC.  */
+	salq	$32, %rax
+	addq	%rdx, %rax
+	bsrq	%rax, %rax
+	leaq	1(%rsi, %rax), %rax
+# ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+# endif
+	VZEROUPPER_RETURN
 
+	.p2align 4,, 8
+L(first_aligned_loop_return):
+	VPCMPEQ	%ymm4, %ymm0, %ymm4
+	vpmovmskb %ymm4, %edx
+	salq	$32, %rcx
+	orq	%rdx, %rcx
+
+	vpmovmskb %ymm10, %eax
+	vpmovmskb %ymm6, %edx
+	salq	$32, %rax
+	orq	%rdx, %rax
+	blsmskq	%rcx, %rcx
+	andq	%rcx, %rax
+	jz	L(first_vec_x1_or_x2)
+
+	bsrq	%rax, %rax
+	leaq	-(VEC_SIZE * 2)(%rdi, %rax), %rax
 # ifdef USE_AS_WCSRCHR
-	/* Keep the first bit for each matching CHAR for bsr.  */
-	andl	$0x11111111, %eax
+	andq	$-CHAR_SIZE, %rax
 # endif
-	bsrl	%eax, %eax
-	leaq	-VEC_SIZE(%rdi, %rax), %rax
-L(return_vzeroupper):
-	ZERO_UPPER_VEC_REGISTERS_RETURN
+	VZEROUPPER_RETURN
 
+	/* Search char cannot be zero.  */
 	.p2align 4
-L(match):
-	/* Find a CHAR.  Check if there is a nul CHAR.  */
-	vpmovmskb %ymm2, %ecx
-	testl	%ecx, %ecx
-	jnz	L(find_nul)
-
-	/* Remember the match and keep searching.  */
-	movl	%eax, %edx
+L(second_aligned_loop_set_furthest_match):
+	/* Save VEC and pointer from most recent match.  */
+L(second_aligned_loop_prep):
 	movq	%rdi, %rsi
-	jmp	L(aligned_loop)
+	vmovdqu	%ymm6, %ymm2
+	vmovdqu	%ymm10, %ymm3
 
 	.p2align 4
-L(find_nul):
-# ifdef USE_AS_WCSRCHR
-	/* Keep the first bit for each matching CHAR for bsr.  */
-	andl	$0x11111111, %ecx
-	andl	$0x11111111, %eax
-# endif
-	/* Mask out any matching bits after the nul CHAR.  */
-	movl	%ecx, %r8d
-	subl	$1, %r8d
-	xorl	%ecx, %r8d
-	andl	%r8d, %eax
+L(second_aligned_loop):
+	/* Search 2x at a time.  */
+	vmovdqa	(VEC_SIZE * 0)(%rdi), %ymm4
+	vmovdqa	(VEC_SIZE * 1)(%rdi), %ymm5
+
+	VPCMPEQ	%ymm4, %ymm7, %ymm6
+	VPMIN	%ymm4, %ymm5, %ymm1
+	VPCMPEQ	%ymm5, %ymm7, %ymm10
+	vpor	%ymm6, %ymm10, %ymm5
+	VPCMPEQ	%ymm1, %ymm0, %ymm1
+	vpor	%ymm5, %ymm1, %ymm9
+
+	vpmovmskb %ymm9, %eax
+	addq	$(VEC_SIZE * 2), %rdi
 	testl	%eax, %eax
-	/* If there is no CHAR here, return the remembered one.  */
-	jz	L(return_value)
-	bsrl	%eax, %eax
-	leaq	-VEC_SIZE(%rdi, %rax), %rax
-	VZEROUPPER_RETURN
-
-	.p2align 4
-L(char_and_nul):
-	/* Find both a CHAR and a nul CHAR.  */
-	addq	%rcx, %rdi
-	movl	%edx, %ecx
-L(char_and_nul_in_first_vec):
-# ifdef USE_AS_WCSRCHR
-	/* Keep the first bit for each matching CHAR for bsr.  */
-	andl	$0x11111111, %ecx
-	andl	$0x11111111, %eax
-# endif
-	/* Mask out any matching bits after the nul CHAR.  */
-	movl	%ecx, %r8d
-	subl	$1, %r8d
-	xorl	%ecx, %r8d
-	andl	%r8d, %eax
+	jz	L(second_aligned_loop)
+	vpmovmskb %ymm1, %ecx
+	testl	%ecx, %ecx
+	jz	L(second_aligned_loop_set_furthest_match)
+	vpmovmskb %ymm5, %eax
 	testl	%eax, %eax
-	/* Return null pointer if the nul CHAR comes first.  */
-	jz	L(return_null)
-	bsrl	%eax, %eax
-	leaq	-VEC_SIZE(%rdi, %rax), %rax
+	jnz	L(return_new_match)
+
+	/* This is the hot path. We know CHAR is inbounds and that
+	   ymm3/ymm2 have latest match.  */
+	.p2align 4,, 4
+L(return_old_match):
+	vpmovmskb %ymm3, %eax
+	vpmovmskb %ymm2, %edx
+	salq	$32, %rax
+	orq	%rdx, %rax
+	bsrq	%rax, %rax
+	/* Search char cannot be zero so safe to just use lea for
+	   wcsrchr.  */
+	leaq	(VEC_SIZE * -2 -(CHAR_SIZE - 1))(%rsi, %rax), %rax
 	VZEROUPPER_RETURN
 
-	.p2align 4
-L(return_null):
-	xorl	%eax, %eax
+	/* Last iteration also potentially has a match.  */
+	.p2align 4,, 8
+L(return_new_match):
+	VPCMPEQ	%ymm4, %ymm0, %ymm4
+	vpmovmskb %ymm4, %edx
+	salq	$32, %rcx
+	orq	%rdx, %rcx
+
+	vpmovmskb %ymm10, %eax
+	vpmovmskb %ymm6, %edx
+	salq	$32, %rax
+	orq	%rdx, %rax
+	blsmskq	%rcx, %rcx
+	andq	%rcx, %rax
+	jz	L(return_old_match)
+	bsrq	%rax, %rax
+	/* Search char cannot be zero so safe to just use lea for
+	   wcsrchr.  */
+	leaq	(VEC_SIZE * -2 -(CHAR_SIZE - 1))(%rdi, %rax), %rax
 	VZEROUPPER_RETURN
 
-END (STRRCHR)
+	.p2align 4,, 4
+L(cross_page):
+	movq	%rdi, %rsi
+	andq	$-VEC_SIZE, %rsi
+	vmovdqu	(%rsi), %ymm1
+	VPCMPEQ	%ymm1, %ymm0, %ymm6
+	vpmovmskb %ymm6, %ecx
+	/* Shift out zero CHAR matches that are before the beginning of
+	   src (rdi).  */
+	shrxl	%edi, %ecx, %ecx
+	testl	%ecx, %ecx
+	jz	L(page_cross_continue)
+	VPCMPEQ	%ymm1, %ymm7, %ymm1
+	vpmovmskb %ymm1, %eax
+
+	/* Shift out search CHAR matches that are before the beginning of
+	   src (rdi).  */
+	shrxl	%edi, %eax, %eax
+	blsmskl	%ecx, %ecx
+	/* Check if any search CHAR match in range.  */
+	andl	%ecx, %eax
+	jz	L(ret2)
+	bsrl	%eax, %eax
+	addq	%rdi, %rax
+# ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+# endif
+L(ret2):
+	VZEROUPPER_RETURN
+END(STRRCHR)
 #endif
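
A note on the page-cross check at function entry (the `sall $20; cmpl; ja`
sequence): an unaligned VEC_SIZE-byte load from a pointer stays within one
page exactly when the pointer's page offset is at most PAGE_SIZE - VEC_SIZE,
and shifting left by 20 moves the 12 page-offset bits to the top of the
register. A minimal C model of that test, assuming 4 KiB pages
(load_crosses_page is an invented name used only for illustration):

    #include <stdbool.h>
    #include <stdint.h>

    #define PAGE_SIZE 4096
    #define VEC_SIZE 32

    /* True if an unaligned VEC_SIZE-byte load from P would cross a page
       boundary.  Equivalent to
       ((uintptr_t) p & (PAGE_SIZE - 1)) > PAGE_SIZE - VEC_SIZE, but
       shifting the offset bits to the top of a 32-bit register lets a
       single unsigned compare do the test, as in the assembly above.  */
    static bool
    load_crosses_page (const void *p)
    {
      uint32_t lo = (uint32_t) (uintptr_t) p;
      return (lo << 20) > ((uint32_t) (PAGE_SIZE - VEC_SIZE) << 20);
    }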