commit 00f09a14d2818f438959e764834abb3913f2b20a
Author: Noah Goldstein <goldstein.w.n@gmail.com>
Date:   Thu Apr 21 20:52:29 2022 -0500

    x86: Optimize {str|wcs}rchr-avx2

    The new code unrolls the main loop slightly without adding too much
    overhead and minimizes the comparisons for the search CHAR.

    Geometric Mean of all benchmarks New / Old: 0.832
    See email for all results.

    Full xcheck passes on x86_64 with and without multiarch enabled.
    Reviewed-by: H.J. Lu <hjl.tools@gmail.com>

    (cherry picked from commit df7e295d18ffa34f629578c0017a9881af7620f6)
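
    To illustrate "minimizes the comparisons for the search CHAR": the
    rewritten routine checks each block for the terminating zero up
    front and defers pinpointing the last match until a block is known
    to contain the terminator (or a newer candidate match).  A rough
    scalar model of that blocked strategy is sketched below;
    my_strrchr_sketch and BLOCK are invented names, and the real code
    does the per-block tests with VPCMPEQ/vpmovmskb bit masks rather
    than byte loops.

        /* Illustrative sketch only -- not the glibc implementation.  */
        #include <stddef.h>

        enum { BLOCK = 32 };	/* stands in for VEC_SIZE */

        static const char *
        my_strrchr_sketch (const char *s, char c)
        {
          const char *last_block_with_c = NULL;

          for (;; s += BLOCK)
            {
              int zero_at = -1;
              int c_in_block = 0;

              /* Cheap per-block scan: find the terminator and note
                 whether the search character occurs before it.  */
              for (int i = 0; i < BLOCK; i++)
                {
                  if (s[i] == c)
                    c_in_block = 1;
                  if (s[i] == '\0')
                    {
                      zero_at = i;
                      break;
                    }
                }

              if (c_in_block)
                last_block_with_c = s;
              if (zero_at < 0)
                continue;	/* no terminator yet, keep scanning */

              /* Terminator found: only now locate the exact match,
                 first in the final (partial) block, then in the newest
                 earlier block recorded above.  */
              for (int i = zero_at; i >= 0; i--)
                if (s[i] == c)
                  return s + i;
              if (last_block_with_c != NULL)
                for (int i = BLOCK - 1; i >= 0; i--)
                  if (last_block_with_c[i] == c)
                    return last_block_with_c + i;
              return NULL;
            }
        }
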
diff --git a/sysdeps/x86_64/multiarch/strrchr-avx2.S b/sysdeps/x86_64/multiarch/strrchr-avx2.S
index 0deba97114d3b83d..b8dec737d5213b25 100644
--- a/sysdeps/x86_64/multiarch/strrchr-avx2.S
+++ b/sysdeps/x86_64/multiarch/strrchr-avx2.S
@@ -27,9 +27,13 @@
 # ifdef USE_AS_WCSRCHR
 #  define VPBROADCAST	vpbroadcastd
 #  define VPCMPEQ	vpcmpeqd
+#  define VPMIN	vpminud
+#  define CHAR_SIZE	4
 # else
 #  define VPBROADCAST	vpbroadcastb
 #  define VPCMPEQ	vpcmpeqb
+#  define VPMIN	vpminub
+#  define CHAR_SIZE	1
 # endif
 
 # ifndef VZEROUPPER
@@ -41,196 +45,304 @@
 # endif
 
 # define VEC_SIZE	32
+# define PAGE_SIZE	4096
 
-	.section SECTION(.text),"ax",@progbits
-ENTRY (STRRCHR)
-	movd	%esi, %xmm4
-	movl	%edi, %ecx
+	.section SECTION(.text), "ax", @progbits
+ENTRY(STRRCHR)
+	movd	%esi, %xmm7
+	movl	%edi, %eax
 	/* Broadcast CHAR to YMM4.  */
-	VPBROADCAST %xmm4, %ymm4
+	VPBROADCAST %xmm7, %ymm7
 	vpxor	%xmm0, %xmm0, %xmm0
 
-	/* Check if we may cross page boundary with one vector load.  */
-	andl	$(2 * VEC_SIZE - 1), %ecx
-	cmpl	$VEC_SIZE, %ecx
-	ja	L(cros_page_boundary)
+	/* Shift here instead of `andl` to save code size (saves a fetch
+	   block).  */
+	sall	$20, %eax
+	cmpl	$((PAGE_SIZE - VEC_SIZE) << 20), %eax
+	ja	L(cross_page)
 
+L(page_cross_continue):
 	vmovdqu	(%rdi), %ymm1
-	VPCMPEQ	%ymm1, %ymm0, %ymm2
-	VPCMPEQ	%ymm1, %ymm4, %ymm3
-	vpmovmskb %ymm2, %ecx
-	vpmovmskb %ymm3, %eax
-	addq	$VEC_SIZE, %rdi
+	/* Check end of string match.  */
+	VPCMPEQ	%ymm1, %ymm0, %ymm6
+	vpmovmskb %ymm6, %ecx
+	testl	%ecx, %ecx
+	jz	L(aligned_more)
+
+	/* Only check match with search CHAR if needed.  */
+	VPCMPEQ	%ymm1, %ymm7, %ymm1
+	vpmovmskb %ymm1, %eax
+	/* Check if match before first zero.  */
+	blsmskl	%ecx, %ecx
+	andl	%ecx, %eax
+	jz	L(ret0)
+	bsrl	%eax, %eax
+	addq	%rdi, %rax
+	/* We are off by 3 for wcsrchr if search CHAR is non-zero. If
+	   search CHAR is zero we are correct. Either way `andq
+	   -CHAR_SIZE, %rax` gets the correct result.  */
+# ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+# endif
+L(ret0):
+L(return_vzeroupper):
+	ZERO_UPPER_VEC_REGISTERS_RETURN
+
+	/* Returns for first vec x1/x2 have hard coded backward search
+	   path for earlier matches.  */
+	.p2align 4,, 10
+L(first_vec_x1):
+	VPCMPEQ	%ymm2, %ymm7, %ymm6
+	vpmovmskb %ymm6, %eax
+	blsmskl	%ecx, %ecx
+	andl	%ecx, %eax
+	jnz	L(first_vec_x1_return)
+
+	.p2align 4,, 4
+L(first_vec_x0_test):
+	VPCMPEQ	%ymm1, %ymm7, %ymm6
+	vpmovmskb %ymm6, %eax
+	testl	%eax, %eax
+	jz	L(ret1)
+	bsrl	%eax, %eax
+	addq	%r8, %rax
+# ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+# endif
+L(ret1):
+	VZEROUPPER_RETURN
 
+	.p2align 4,, 10
+L(first_vec_x0_x1_test):
+	VPCMPEQ	%ymm2, %ymm7, %ymm6
+	vpmovmskb %ymm6, %eax
+	/* Check ymm2 for search CHAR match. If no match then check ymm1
+	   before returning.  */
 	testl	%eax, %eax
-	jnz	L(first_vec)
+	jz	L(first_vec_x0_test)
+	.p2align 4,, 4
+L(first_vec_x1_return):
+	bsrl	%eax, %eax
+	leaq	1(%rdi, %rax), %rax
+# ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+# endif
+	VZEROUPPER_RETURN
 
-	testl	%ecx, %ecx
-	jnz	L(return_null)
 
-	andq	$-VEC_SIZE, %rdi
-	xorl	%edx, %edx
-	jmp	L(aligned_loop)
+	.p2align 4,, 10
+L(first_vec_x2):
+	VPCMPEQ	%ymm3, %ymm7, %ymm6
+	vpmovmskb %ymm6, %eax
+	blsmskl	%ecx, %ecx
+	/* If no in-range search CHAR match in ymm3 then need to check
+	   ymm1/ymm2 for an earlier match (we delay checking search
+	   CHAR matches until needed).  */
+	andl	%ecx, %eax
+	jz	L(first_vec_x0_x1_test)
+	bsrl	%eax, %eax
+	leaq	(VEC_SIZE + 1)(%rdi, %rax), %rax
+# ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+# endif
+	VZEROUPPER_RETURN
+
 
 	.p2align 4
-L(first_vec):
-	/* Check if there is a nul CHAR.  */
+L(aligned_more):
+	/* Save original pointer if match was in VEC 0.  */
+	movq	%rdi, %r8
+
+	/* Align src.  */
+	orq	$(VEC_SIZE - 1), %rdi
+	vmovdqu	1(%rdi), %ymm2
+	VPCMPEQ	%ymm2, %ymm0, %ymm6
+	vpmovmskb %ymm6, %ecx
 	testl	%ecx, %ecx
-	jnz	L(char_and_nul_in_first_vec)
+	jnz	L(first_vec_x1)
 
-	/* Remember the match and keep searching.  */
-	movl	%eax, %edx
-	movq	%rdi, %rsi
-	andq	$-VEC_SIZE, %rdi
-	jmp	L(aligned_loop)
+	vmovdqu	(VEC_SIZE + 1)(%rdi), %ymm3
+	VPCMPEQ	%ymm3, %ymm0, %ymm6
+	vpmovmskb %ymm6, %ecx
+	testl	%ecx, %ecx
+	jnz	L(first_vec_x2)
 
+	/* Save pointer again before realigning.  */
+	movq	%rdi, %rsi
+	addq	$(VEC_SIZE + 1), %rdi
+	andq	$-(VEC_SIZE * 2), %rdi
 	.p2align 4
-L(cros_page_boundary):
-	andl	$(VEC_SIZE - 1), %ecx
-	andq	$-VEC_SIZE, %rdi
-	vmovdqa	(%rdi), %ymm1
-	VPCMPEQ	%ymm1, %ymm0, %ymm2
-	VPCMPEQ	%ymm1, %ymm4, %ymm3
-	vpmovmskb %ymm2, %edx
-	vpmovmskb %ymm3, %eax
-	shrl	%cl, %edx
-	shrl	%cl, %eax
-	addq	$VEC_SIZE, %rdi
-
-	/* Check if there is a CHAR.  */
+L(first_aligned_loop):
+	/* Do 2x VEC at a time. Any more and the cost of finding the
+	   match outweights loop benefit.  */
+	vmovdqa	(VEC_SIZE * 0)(%rdi), %ymm4
+	vmovdqa	(VEC_SIZE * 1)(%rdi), %ymm5
+
+	VPCMPEQ	%ymm4, %ymm7, %ymm6
+	VPMIN	%ymm4, %ymm5, %ymm8
+	VPCMPEQ	%ymm5, %ymm7, %ymm10
+	vpor	%ymm6, %ymm10, %ymm5
+	VPCMPEQ	%ymm8, %ymm0, %ymm8
+	vpor	%ymm5, %ymm8, %ymm9
+
+	vpmovmskb %ymm9, %eax
+	addq	$(VEC_SIZE * 2), %rdi
+	/* No zero or search CHAR.  */
 	testl	%eax, %eax
-	jnz	L(found_char)
-
-	testl	%edx, %edx
-	jnz	L(return_null)
+	jz	L(first_aligned_loop)
 
-	jmp	L(aligned_loop)
-
-	.p2align 4
-L(found_char):
-	testl	%edx, %edx
-	jnz	L(char_and_nul)
+	/* If no zero CHAR then go to second loop (this allows us to
+	   throw away all prior work).  */
+	vpmovmskb %ymm8, %ecx
+	testl	%ecx, %ecx
+	jz	L(second_aligned_loop_prep)
 
-	/* Remember the match and keep searching.  */
-	movl	%eax, %edx
-	leaq	(%rdi, %rcx), %rsi
+	/* Search char could be zero so we need to get the true match.
+	 */
+	vpmovmskb %ymm5, %eax
+	testl	%eax, %eax
+	jnz	L(first_aligned_loop_return)
 
-	.p2align 4
-L(aligned_loop):
-	vmovdqa	(%rdi), %ymm1
-	VPCMPEQ	%ymm1, %ymm0, %ymm2
-	addq	$VEC_SIZE, %rdi
-	VPCMPEQ	%ymm1, %ymm4, %ymm3
-	vpmovmskb %ymm2, %ecx
-	vpmovmskb %ymm3, %eax
-	orl	%eax, %ecx
-	jnz	L(char_nor_null)
-
-	vmovdqa	(%rdi), %ymm1
-	VPCMPEQ	%ymm1, %ymm0, %ymm2
-	add	$VEC_SIZE, %rdi
-	VPCMPEQ	%ymm1, %ymm4, %ymm3
-	vpmovmskb %ymm2, %ecx
+	.p2align 4,, 4
+L(first_vec_x1_or_x2):
+	VPCMPEQ	%ymm3, %ymm7, %ymm3
+	VPCMPEQ	%ymm2, %ymm7, %ymm2
 	vpmovmskb %ymm3, %eax
-	orl	%eax, %ecx
-	jnz	L(char_nor_null)
-
-	vmovdqa	(%rdi), %ymm1
-	VPCMPEQ	%ymm1, %ymm0, %ymm2
-	addq	$VEC_SIZE, %rdi
-	VPCMPEQ	%ymm1, %ymm4, %ymm3
-	vpmovmskb %ymm2, %ecx
-	vpmovmskb %ymm3, %eax
-	orl	%eax, %ecx
-	jnz	L(char_nor_null)
-
-	vmovdqa	(%rdi), %ymm1
-	VPCMPEQ	%ymm1, %ymm0, %ymm2
-	addq	$VEC_SIZE, %rdi
-	VPCMPEQ	%ymm1, %ymm4, %ymm3
-	vpmovmskb %ymm2, %ecx
-	vpmovmskb %ymm3, %eax
-	orl	%eax, %ecx
-	jz	L(aligned_loop)
-
-	.p2align 4
-L(char_nor_null):
-	/* Find a CHAR or a nul CHAR in a loop.  */
-	testl	%eax, %eax
-	jnz	L(match)
-L(return_value):
-	testl	%edx, %edx
-	jz	L(return_null)
-	movl	%edx, %eax
-	movq	%rsi, %rdi
+	vpmovmskb %ymm2, %edx
+	/* Use add for macro-fusion.  */
+	addq	%rax, %rdx
+	jz	L(first_vec_x0_test)
+	/* NB: We could move this shift to before the branch and save a
+	   bit of code size / performance on the fall through. The
+	   branch leads to the null case which generally seems hotter
+	   than char in first 3x VEC.  */
+	salq	$32, %rax
+	addq	%rdx, %rax
+	bsrq	%rax, %rax
+	leaq	1(%rsi, %rax), %rax
+# ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+# endif
+	VZEROUPPER_RETURN
 
+	.p2align 4,, 8
+L(first_aligned_loop_return):
+	VPCMPEQ	%ymm4, %ymm0, %ymm4
+	vpmovmskb %ymm4, %edx
+	salq	$32, %rcx
+	orq	%rdx, %rcx
+
+	vpmovmskb %ymm10, %eax
+	vpmovmskb %ymm6, %edx
+	salq	$32, %rax
+	orq	%rdx, %rax
+	blsmskq	%rcx, %rcx
+	andq	%rcx, %rax
+	jz	L(first_vec_x1_or_x2)
+
+	bsrq	%rax, %rax
+	leaq	-(VEC_SIZE * 2)(%rdi, %rax), %rax
 # ifdef USE_AS_WCSRCHR
-	/* Keep the first bit for each matching CHAR for bsr.  */
-	andl	$0x11111111, %eax
+	andq	$-CHAR_SIZE, %rax
 # endif
-	bsrl	%eax, %eax
-	leaq	-VEC_SIZE(%rdi, %rax), %rax
-L(return_vzeroupper):
-	ZERO_UPPER_VEC_REGISTERS_RETURN
+	VZEROUPPER_RETURN
 
+	/* Search char cannot be zero.  */
 	.p2align 4
-L(match):
-	/* Find a CHAR.  Check if there is a nul CHAR.  */
-	vpmovmskb %ymm2, %ecx
-	testl	%ecx, %ecx
-	jnz	L(find_nul)
-
-	/* Remember the match and keep searching.  */
-	movl	%eax, %edx
+L(second_aligned_loop_set_furthest_match):
+	/* Save VEC and pointer from most recent match.  */
+L(second_aligned_loop_prep):
 	movq	%rdi, %rsi
-	jmp	L(aligned_loop)
+	vmovdqu	%ymm6, %ymm2
+	vmovdqu	%ymm10, %ymm3
 
 	.p2align 4
-L(find_nul):
-# ifdef USE_AS_WCSRCHR
-	/* Keep the first bit for each matching CHAR for bsr.  */
-	andl	$0x11111111, %ecx
-	andl	$0x11111111, %eax
-# endif
-	/* Mask out any matching bits after the nul CHAR.  */
-	movl	%ecx, %r8d
-	subl	$1, %r8d
-	xorl	%ecx, %r8d
-	andl	%r8d, %eax
+L(second_aligned_loop):
+	/* Search 2x at at time.  */
+	vmovdqa	(VEC_SIZE * 0)(%rdi), %ymm4
+	vmovdqa	(VEC_SIZE * 1)(%rdi), %ymm5
+
+	VPCMPEQ	%ymm4, %ymm7, %ymm6
+	VPMIN	%ymm4, %ymm5, %ymm1
+	VPCMPEQ	%ymm5, %ymm7, %ymm10
+	vpor	%ymm6, %ymm10, %ymm5
+	VPCMPEQ	%ymm1, %ymm0, %ymm1
+	vpor	%ymm5, %ymm1, %ymm9
+
+	vpmovmskb %ymm9, %eax
+	addq	$(VEC_SIZE * 2), %rdi
 	testl	%eax, %eax
-	/* If there is no CHAR here, return the remembered one.  */
-	jz	L(return_value)
-	bsrl	%eax, %eax
-	leaq	-VEC_SIZE(%rdi, %rax), %rax
-	VZEROUPPER_RETURN
-
-	.p2align 4
-L(char_and_nul):
-	/* Find both a CHAR and a nul CHAR.  */
-	addq	%rcx, %rdi
-	movl	%edx, %ecx
-L(char_and_nul_in_first_vec):
-# ifdef USE_AS_WCSRCHR
-	/* Keep the first bit for each matching CHAR for bsr.  */
-	andl	$0x11111111, %ecx
-	andl	$0x11111111, %eax
-# endif
-	/* Mask out any matching bits after the nul CHAR.  */
-	movl	%ecx, %r8d
-	subl	$1, %r8d
-	xorl	%ecx, %r8d
-	andl	%r8d, %eax
+	jz	L(second_aligned_loop)
+	vpmovmskb %ymm1, %ecx
+	testl	%ecx, %ecx
+	jz	L(second_aligned_loop_set_furthest_match)
+	vpmovmskb %ymm5, %eax
 	testl	%eax, %eax
-	/* Return null pointer if the nul CHAR comes first.  */
-	jz	L(return_null)
-	bsrl	%eax, %eax
-	leaq	-VEC_SIZE(%rdi, %rax), %rax
+	jnz	L(return_new_match)
+
+	/* This is the hot patch. We know CHAR is inbounds and that
+	   ymm3/ymm2 have latest match.  */
+	.p2align 4,, 4
+L(return_old_match):
+	vpmovmskb %ymm3, %eax
+	vpmovmskb %ymm2, %edx
+	salq	$32, %rax
+	orq	%rdx, %rax
+	bsrq	%rax, %rax
+	/* Search char cannot be zero so safe to just use lea for
+	   wcsrchr.  */
+	leaq	(VEC_SIZE * -2 -(CHAR_SIZE - 1))(%rsi, %rax), %rax
 	VZEROUPPER_RETURN
 
-	.p2align 4
-L(return_null):
-	xorl	%eax, %eax
+	/* Last iteration also potentially has a match.  */
+	.p2align 4,, 8
+L(return_new_match):
+	VPCMPEQ	%ymm4, %ymm0, %ymm4
+	vpmovmskb %ymm4, %edx
+	salq	$32, %rcx
+	orq	%rdx, %rcx
+
+	vpmovmskb %ymm10, %eax
+	vpmovmskb %ymm6, %edx
+	salq	$32, %rax
+	orq	%rdx, %rax
+	blsmskq	%rcx, %rcx
+	andq	%rcx, %rax
+	jz	L(return_old_match)
+	bsrq	%rax, %rax
+	/* Search char cannot be zero so safe to just use lea for
+	   wcsrchr.  */
+	leaq	(VEC_SIZE * -2 -(CHAR_SIZE - 1))(%rdi, %rax), %rax
 	VZEROUPPER_RETURN
 
-END (STRRCHR)
+	.p2align 4,, 4
+L(cross_page):
+	movq	%rdi, %rsi
+	andq	$-VEC_SIZE, %rsi
+	vmovdqu	(%rsi), %ymm1
+	VPCMPEQ	%ymm1, %ymm0, %ymm6
+	vpmovmskb %ymm6, %ecx
+	/* Shift out zero CHAR matches that are before the begining of
+	   src (rdi).  */
+	shrxl	%edi, %ecx, %ecx
+	testl	%ecx, %ecx
+	jz	L(page_cross_continue)
+	VPCMPEQ	%ymm1, %ymm7, %ymm1
+	vpmovmskb %ymm1, %eax
+
+	/* Shift out search CHAR matches that are before the begining of
+	   src (rdi).  */
+	shrxl	%edi, %eax, %eax
+	blsmskl	%ecx, %ecx
+	/* Check if any search CHAR match in range.  */
+	andl	%ecx, %eax
+	jz	L(ret2)
+	bsrl	%eax, %eax
+	addq	%rdi, %rax
+# ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+# endif
+L(ret2):
+	VZEROUPPER_RETURN
+END(STRRCHR)
 #endif
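
For reference, the `sall $20; cmpl $((PAGE_SIZE - VEC_SIZE) << 20)` sequence
at the new entry point is just a page-cross test: with PAGE_SIZE = 4096 and
VEC_SIZE = 32, a 32-byte load starting at %rdi stays within one page iff the
page offset of %rdi is at most PAGE_SIZE - VEC_SIZE.  A minimal C equivalent
of that check (page_cross is a hypothetical helper name, not part of the
patch):

    #include <stdint.h>

    /* Same unsigned comparison the entry code performs, written with an
       explicit mask instead of the shift trick.  */
    static int
    page_cross (const void *p)
    {
      return ((uintptr_t) p & (4096 - 1)) > 4096 - 32;
    }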