commit 596c9a32cc5d5eb82587e92d1e66c9ecb7668456
Author: Noah Goldstein <goldstein.w.n@gmail.com>
Date:   Thu Apr 21 20:52:30 2022 -0500

    x86: Optimize {str|wcs}rchr-evex

    The new code unrolls the main loop slightly without adding too much
    overhead and minimizes the comparisons for the search CHAR.

    Geometric Mean of all benchmarks New / Old: 0.755
    See email for all results.

    Full xcheck passes on x86_64 with and without multiarch enabled.
    Reviewed-by: H.J. Lu <hjl.tools@gmail.com>

    (cherry picked from commit c966099cdc3e0fdf92f63eac09b22fa7e5f5f02d)

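The strategy of the new code, in rough terms: scan one vector of CHARs at a time, remember the most recent vector known to contain the search CHAR, and only do the precise backwards selection once the null terminator has been found. Below is a minimal scalar C sketch of that idea (illustrative only; sketch_strrchr and BLOCK are made-up names, and the real code works on 32-byte vectors and k-mask registers rather than scalar bitmask loops):

#include <stddef.h>

enum { BLOCK = 32 };	/* Stands in for one VEC_SIZE worth of CHARs.  */

static char *
sketch_strrchr (const char *s, int c_in)
{
  char c = (char) c_in;
  const char *saved_base = NULL;	/* Base of last block with a match.  */
  unsigned int saved_mask = 0;		/* Its match bitmask.  */

  for (;; s += BLOCK)
    {
      unsigned int match = 0, zero = 0;
      for (unsigned int i = 0; i < BLOCK; i++)
	{
	  char ch = s[i];		/* The asm loads a whole vector here.  */
	  if (ch == c)
	    match |= 1u << i;
	  if (ch == '\0')
	    {
	      zero |= 1u << i;
	      break;			/* The sketch stops at the terminator.  */
	    }
	}

      if (zero == 0)
	{
	  /* No terminator yet: remember the newest block with a match
	     (the "furthest match" bookkeeping of the second loop below)
	     and keep scanning.  */
	  if (match != 0)
	    {
	      saved_base = s;
	      saved_mask = match;
	    }
	  continue;
	}

      /* Terminator found: keep only matches up to it (blsmsk + and in
	 the asm), then pick the highest remaining bit (bsr).  */
      match &= zero ^ (zero - 1);
      if (match != 0)
	return (char *) s + (31 - __builtin_clz (match));
      if (saved_mask != 0)
	return (char *) saved_base + (31 - __builtin_clz (saved_mask));
      return NULL;
    }
}

In the vectorized version the inner per-character loop collapses into a handful of EVEX compares whose results land in mask registers, so the unrolled loops below can test two vectors for either a match or a null with a single kortest per iteration; that is the "minimizes the comparisons" part of the commit message.
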
diff --git a/sysdeps/x86_64/multiarch/strrchr-evex.S b/sysdeps/x86_64/multiarch/strrchr-evex.S
index f920b5a584edd293..f5b6d755ceb85ae2 100644
--- a/sysdeps/x86_64/multiarch/strrchr-evex.S
+++ b/sysdeps/x86_64/multiarch/strrchr-evex.S
@@ -24,242 +24,351 @@
 #  define STRRCHR	__strrchr_evex
 # endif
 
-# define VMOVU		vmovdqu64
-# define VMOVA		vmovdqa64
+# define VMOVU	vmovdqu64
+# define VMOVA	vmovdqa64
 
 # ifdef USE_AS_WCSRCHR
+#  define SHIFT_REG	esi
+
+#  define kunpck	kunpckbw
+#  define kmov_2x	kmovd
+#  define maskz_2x	ecx
+#  define maskm_2x	eax
+#  define CHAR_SIZE	4
+#  define VPMIN	vpminud
+#  define VPTESTN	vptestnmd
 #  define VPBROADCAST	vpbroadcastd
-#  define VPCMP		vpcmpd
-#  define SHIFT_REG	r8d
+#  define VPCMP	vpcmpd
 # else
+#  define SHIFT_REG	edi
+
+#  define kunpck	kunpckdq
+#  define kmov_2x	kmovq
+#  define maskz_2x	rcx
+#  define maskm_2x	rax
+
+#  define CHAR_SIZE	1
+#  define VPMIN	vpminub
+#  define VPTESTN	vptestnmb
 #  define VPBROADCAST	vpbroadcastb
-#  define VPCMP		vpcmpb
-#  define SHIFT_REG	ecx
+#  define VPCMP	vpcmpb
 # endif
 
 # define XMMZERO	xmm16
 # define YMMZERO	ymm16
 # define YMMMATCH	ymm17
-# define YMM1		ymm18
+# define YMMSAVE	ymm18
+
+# define YMM1	ymm19
+# define YMM2	ymm20
+# define YMM3	ymm21
+# define YMM4	ymm22
+# define YMM5	ymm23
+# define YMM6	ymm24
+# define YMM7	ymm25
+# define YMM8	ymm26
 
-# define VEC_SIZE	32
 
-	.section .text.evex,"ax",@progbits
-ENTRY (STRRCHR)
-	movl	%edi, %ecx
+# define VEC_SIZE	32
+# define PAGE_SIZE	4096
+	.section .text.evex, "ax", @progbits
+ENTRY(STRRCHR)
+	movl	%edi, %eax
 	/* Broadcast CHAR to YMMMATCH.  */
 	VPBROADCAST %esi, %YMMMATCH
 
-	vpxorq	%XMMZERO, %XMMZERO, %XMMZERO
-
-	/* Check if we may cross page boundary with one vector load.  */
-	andl	$(2 * VEC_SIZE - 1), %ecx
-	cmpl	$VEC_SIZE, %ecx
-	ja	L(cros_page_boundary)
+	andl	$(PAGE_SIZE - 1), %eax
+	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
+	jg	L(cross_page_boundary)
 
+L(page_cross_continue):
 	VMOVU	(%rdi), %YMM1
-
-	/* Each bit in K0 represents a null byte in YMM1.  */
-	VPCMP	$0, %YMMZERO, %YMM1, %k0
-	/* Each bit in K1 represents a CHAR in YMM1.  */
-	VPCMP	$0, %YMMMATCH, %YMM1, %k1
+	/* k0 has a 1 for each zero CHAR in YMM1.  */
+	VPTESTN	%YMM1, %YMM1, %k0
 	kmovd	%k0, %ecx
-	kmovd	%k1, %eax
-
-	addq	$VEC_SIZE, %rdi
-
-	testl	%eax, %eax
-	jnz	L(first_vec)
-
 	testl	%ecx, %ecx
-	jnz	L(return_null)
-
-	andq	$-VEC_SIZE, %rdi
-	xorl	%edx, %edx
-	jmp	L(aligned_loop)
-
-	.p2align 4
-L(first_vec):
-	/* Check if there is a null byte.  */
-	testl	%ecx, %ecx
-	jnz	L(char_and_nul_in_first_vec)
-
-	/* Remember the match and keep searching.  */
-	movl	%eax, %edx
-	movq	%rdi, %rsi
-	andq	$-VEC_SIZE, %rdi
-	jmp	L(aligned_loop)
-
-	.p2align 4
-L(cros_page_boundary):
-	andl	$(VEC_SIZE - 1), %ecx
-	andq	$-VEC_SIZE, %rdi
+	jz	L(aligned_more)
+	/* fallthrough: zero CHAR in first VEC.  */
 
+	/* K1 has a 1 for each search CHAR match in YMM1.  */
+	VPCMP	$0, %YMMMATCH, %YMM1, %k1
+	kmovd	%k1, %eax
+	/* Build mask up until first zero CHAR (used to mask off
+	   potential search CHAR matches past the end of the string).
+	 */
+	blsmskl	%ecx, %ecx
+	andl	%ecx, %eax
+	jz	L(ret0)
+	/* Get last match (the `andl` removed any out of bounds
+	   matches).  */
+	bsrl	%eax, %eax
 # ifdef USE_AS_WCSRCHR
-	/* NB: Divide shift count by 4 since each bit in K1 represent 4
-	   bytes.  */
-	movl	%ecx, %SHIFT_REG
-	sarl	$2, %SHIFT_REG
+	leaq	(%rdi, %rax, CHAR_SIZE), %rax
+# else
+	addq	%rdi, %rax
 # endif
+L(ret0):
+	ret
 
-	VMOVA	(%rdi), %YMM1
-
-	/* Each bit in K0 represents a null byte in YMM1.  */
-	VPCMP	$0, %YMMZERO, %YMM1, %k0
-	/* Each bit in K1 represents a CHAR in YMM1.  */
+	/* Returns for first vec x1/x2/x3 have hard coded backward
+	   search path for earlier matches.  */
+	.p2align 4,, 6
+L(first_vec_x1):
+	VPCMP	$0, %YMMMATCH, %YMM2, %k1
+	kmovd	%k1, %eax
+	blsmskl	%ecx, %ecx
+	/* eax non-zero if search CHAR in range.  */
+	andl	%ecx, %eax
+	jnz	L(first_vec_x1_return)
+
+	/* fallthrough: no match in YMM2 then need to check for earlier
+	   matches (in YMM1).  */
+	.p2align 4,, 4
+L(first_vec_x0_test):
 	VPCMP	$0, %YMMMATCH, %YMM1, %k1
-	kmovd	%k0, %edx
 	kmovd	%k1, %eax
-
-	shrxl	%SHIFT_REG, %edx, %edx
-	shrxl	%SHIFT_REG, %eax, %eax
-	addq	$VEC_SIZE, %rdi
-
-	/* Check if there is a CHAR.  */
 	testl	%eax, %eax
-	jnz	L(found_char)
-
-	testl	%edx, %edx
-	jnz	L(return_null)
-
-	jmp	L(aligned_loop)
-
-	.p2align 4
-L(found_char):
-	testl	%edx, %edx
-	jnz	L(char_and_nul)
-
-	/* Remember the match and keep searching.  */
-	movl	%eax, %edx
-	leaq	(%rdi, %rcx), %rsi
+	jz	L(ret1)
+	bsrl	%eax, %eax
+# ifdef USE_AS_WCSRCHR
+	leaq	(%rsi, %rax, CHAR_SIZE), %rax
+# else
+	addq	%rsi, %rax
+# endif
+L(ret1):
+	ret
 
-	.p2align 4
-L(aligned_loop):
-	VMOVA	(%rdi), %YMM1
-	addq	$VEC_SIZE, %rdi
+	.p2align 4,, 10
+L(first_vec_x1_or_x2):
+	VPCMP	$0, %YMM3, %YMMMATCH, %k3
+	VPCMP	$0, %YMM2, %YMMMATCH, %k2
+	/* K2 and K3 have 1 for any search CHAR match. Test if any
+	   matches between either of them. Otherwise check YMM1.  */
+	kortestd %k2, %k3
+	jz	L(first_vec_x0_test)
+
+	/* Guaranteed that YMM2 and YMM3 are within range so merge the
+	   two bitmasks then get last result.  */
+	kunpck	%k2, %k3, %k3
+	kmovq	%k3, %rax
+	bsrq	%rax, %rax
+	leaq	(VEC_SIZE)(%r8, %rax, CHAR_SIZE), %rax
+	ret
 
-	/* Each bit in K0 represents a null byte in YMM1.  */
-	VPCMP	$0, %YMMZERO, %YMM1, %k0
-	/* Each bit in K1 represents a CHAR in YMM1.  */
-	VPCMP	$0, %YMMMATCH, %YMM1, %k1
-	kmovd	%k0, %ecx
+	.p2align 4,, 6
+L(first_vec_x3):
+	VPCMP	$0, %YMMMATCH, %YMM4, %k1
 	kmovd	%k1, %eax
-	orl	%eax, %ecx
-	jnz	L(char_nor_null)
+	blsmskl	%ecx, %ecx
+	/* If no search CHAR match in range check YMM1/YMM2/YMM3.  */
+	andl	%ecx, %eax
+	jz	L(first_vec_x1_or_x2)
+	bsrl	%eax, %eax
+	leaq	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
+	ret
 
-	VMOVA	(%rdi), %YMM1
-	add	$VEC_SIZE, %rdi
+	.p2align 4,, 6
+L(first_vec_x0_x1_test):
+	VPCMP	$0, %YMMMATCH, %YMM2, %k1
+	kmovd	%k1, %eax
+	/* Check YMM2 for last match first. If no match try YMM1.  */
+	testl	%eax, %eax
+	jz	L(first_vec_x0_test)
+	.p2align 4,, 4
+L(first_vec_x1_return):
+	bsrl	%eax, %eax
+	leaq	(VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax
+	ret
 
-	/* Each bit in K0 represents a null byte in YMM1.  */
-	VPCMP	$0, %YMMZERO, %YMM1, %k0
-	/* Each bit in K1 represents a CHAR in YMM1.  */
-	VPCMP	$0, %YMMMATCH, %YMM1, %k1
-	kmovd	%k0, %ecx
+	.p2align 4,, 10
+L(first_vec_x2):
+	VPCMP	$0, %YMMMATCH, %YMM3, %k1
 	kmovd	%k1, %eax
-	orl	%eax, %ecx
-	jnz	L(char_nor_null)
+	blsmskl	%ecx, %ecx
+	/* Check YMM3 for last match first. If no match try YMM2/YMM1.
+	 */
+	andl	%ecx, %eax
+	jz	L(first_vec_x0_x1_test)
+	bsrl	%eax, %eax
+	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
+	ret
 
-	VMOVA	(%rdi), %YMM1
-	addq	$VEC_SIZE, %rdi
 
-	/* Each bit in K0 represents a null byte in YMM1.  */
-	VPCMP	$0, %YMMZERO, %YMM1, %k0
-	/* Each bit in K1 represents a CHAR in YMM1.  */
-	VPCMP	$0, %YMMMATCH, %YMM1, %k1
+	.p2align 4
+L(aligned_more):
+	/* Need to keep original pointer in case YMM1 has last match.  */
+	movq	%rdi, %rsi
+	andq	$-VEC_SIZE, %rdi
+	VMOVU	VEC_SIZE(%rdi), %YMM2
+	VPTESTN	%YMM2, %YMM2, %k0
 	kmovd	%k0, %ecx
-	kmovd	%k1, %eax
-	orl	%eax, %ecx
-	jnz	L(char_nor_null)
+	testl	%ecx, %ecx
+	jnz	L(first_vec_x1)
 
-	VMOVA	(%rdi), %YMM1
-	addq	$VEC_SIZE, %rdi
+	VMOVU	(VEC_SIZE * 2)(%rdi), %YMM3
+	VPTESTN	%YMM3, %YMM3, %k0
+	kmovd	%k0, %ecx
+	testl	%ecx, %ecx
+	jnz	L(first_vec_x2)
 
-	/* Each bit in K0 represents a null byte in YMM1.  */
-	VPCMP	$0, %YMMZERO, %YMM1, %k0
-	/* Each bit in K1 represents a CHAR in YMM1.  */
-	VPCMP	$0, %YMMMATCH, %YMM1, %k1
+	VMOVU	(VEC_SIZE * 3)(%rdi), %YMM4
+	VPTESTN	%YMM4, %YMM4, %k0
 	kmovd	%k0, %ecx
-	kmovd	%k1, %eax
-	orl	%eax, %ecx
-	jz	L(aligned_loop)
+	movq	%rdi, %r8
+	testl	%ecx, %ecx
+	jnz	L(first_vec_x3)
 
+	andq	$-(VEC_SIZE * 2), %rdi
 	.p2align 4
-L(char_nor_null):
-	/* Find a CHAR or a null byte in a loop.  */
+L(first_aligned_loop):
+	/* Preserve YMM1, YMM2, YMM3, and YMM4 until we can guarantee
+	   they don't store a match.  */
+	VMOVA	(VEC_SIZE * 4)(%rdi), %YMM5
+	VMOVA	(VEC_SIZE * 5)(%rdi), %YMM6
+
+	VPCMP	$0, %YMM5, %YMMMATCH, %k2
+	vpxord	%YMM6, %YMMMATCH, %YMM7
+
+	VPMIN	%YMM5, %YMM6, %YMM8
+	VPMIN	%YMM8, %YMM7, %YMM7
+
+	VPTESTN	%YMM7, %YMM7, %k1
+	subq	$(VEC_SIZE * -2), %rdi
+	kortestd %k1, %k2
+	jz	L(first_aligned_loop)
+
+	VPCMP	$0, %YMM6, %YMMMATCH, %k3
+	VPTESTN	%YMM8, %YMM8, %k1
+	ktestd	%k1, %k1
+	jz	L(second_aligned_loop_prep)
+
+	kortestd %k2, %k3
+	jnz	L(return_first_aligned_loop)
+
+	.p2align 4,, 6
+L(first_vec_x1_or_x2_or_x3):
+	VPCMP	$0, %YMM4, %YMMMATCH, %k4
+	kmovd	%k4, %eax
 	testl	%eax, %eax
-	jnz	L(match)
-L(return_value):
-	testl	%edx, %edx
-	jz	L(return_null)
-	movl	%edx, %eax
-	movq	%rsi, %rdi
+	jz	L(first_vec_x1_or_x2)
 	bsrl	%eax, %eax
-# ifdef USE_AS_WCSRCHR
-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
-	leaq	-VEC_SIZE(%rdi, %rax, 4), %rax
-# else
-	leaq	-VEC_SIZE(%rdi, %rax), %rax
-# endif
+	leaq	(VEC_SIZE * 3)(%r8, %rax, CHAR_SIZE), %rax
 	ret
 
-	.p2align 4
-L(match):
-	/* Find a CHAR.  Check if there is a null byte.  */
-	kmovd	%k0, %ecx
-	testl	%ecx, %ecx
-	jnz	L(find_nul)
+	.p2align 4,, 8
+L(return_first_aligned_loop):
+	VPTESTN	%YMM5, %YMM5, %k0
+	kunpck	%k0, %k1, %k0
+	kmov_2x	%k0, %maskz_2x
+
+	blsmsk	%maskz_2x, %maskz_2x
+	kunpck	%k2, %k3, %k3
+	kmov_2x	%k3, %maskm_2x
+	and	%maskz_2x, %maskm_2x
+	jz	L(first_vec_x1_or_x2_or_x3)
 
-	/* Remember the match and keep searching.  */
-	movl	%eax, %edx
+	bsr	%maskm_2x, %maskm_2x
+	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
+	ret
+
+	.p2align 4
+	/* We can throw away the work done for the first 4x checks here
+	   as we have a later match. This is the 'fast' path, so to speak.
+	 */
+L(second_aligned_loop_prep):
+L(second_aligned_loop_set_furthest_match):
 	movq	%rdi, %rsi
-	jmp	L(aligned_loop)
+	kunpck	%k2, %k3, %k4
 
 	.p2align 4
-L(find_nul):
-	/* Mask out any matching bits after the null byte.  */
-	movl	%ecx, %r8d
-	subl	$1, %r8d
-	xorl	%ecx, %r8d
-	andl	%r8d, %eax
-	testl	%eax, %eax
-	/* If there is no CHAR here, return the remembered one.  */
-	jz	L(return_value)
-	bsrl	%eax, %eax
+L(second_aligned_loop):
+	VMOVU	(VEC_SIZE * 4)(%rdi), %YMM1
+	VMOVU	(VEC_SIZE * 5)(%rdi), %YMM2
+
+	VPCMP	$0, %YMM1, %YMMMATCH, %k2
+	vpxord	%YMM2, %YMMMATCH, %YMM3
+
+	VPMIN	%YMM1, %YMM2, %YMM4
+	VPMIN	%YMM3, %YMM4, %YMM3
+
+	VPTESTN	%YMM3, %YMM3, %k1
+	subq	$(VEC_SIZE * -2), %rdi
+	kortestd %k1, %k2
+	jz	L(second_aligned_loop)
+
+	VPCMP	$0, %YMM2, %YMMMATCH, %k3
+	VPTESTN	%YMM4, %YMM4, %k1
+	ktestd	%k1, %k1
+	jz	L(second_aligned_loop_set_furthest_match)
+
+	kortestd %k2, %k3
+	/* branch here because there is a significant advantage in terms
+	   of the output dependency chain in using edx.  */
+	jnz	L(return_new_match)
+L(return_old_match):
+	kmovq	%k4, %rax
+	bsrq	%rax, %rax
+	leaq	(VEC_SIZE * 2)(%rsi, %rax, CHAR_SIZE), %rax
+	ret
+
+L(return_new_match):
+	VPTESTN	%YMM1, %YMM1, %k0
+	kunpck	%k0, %k1, %k0
+	kmov_2x	%k0, %maskz_2x
+
+	blsmsk	%maskz_2x, %maskz_2x
+	kunpck	%k2, %k3, %k3
+	kmov_2x	%k3, %maskm_2x
+	and	%maskz_2x, %maskm_2x
+	jz	L(return_old_match)
+
+	bsr	%maskm_2x, %maskm_2x
+	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
+	ret
+
+L(cross_page_boundary):
+	/* eax contains all the page offset bits of src (rdi). `xor rdi,
+	   rax` sets pointer with all page offset bits cleared so
+	   offset of (PAGE_SIZE - VEC_SIZE) will get last aligned VEC
+	   before page cross (guaranteed to be safe to read). Doing this
+	   as opposed to `movq %rdi, %rax; andq $-VEC_SIZE, %rax` saves
+	   a bit of code size.  */
+	xorq	%rdi, %rax
+	VMOVU	(PAGE_SIZE - VEC_SIZE)(%rax), %YMM1
+	VPTESTN	%YMM1, %YMM1, %k0
+	kmovd	%k0, %ecx
+
+	/* Shift out zero CHAR matches that are before the beginning of
+	   src (rdi).  */
 # ifdef USE_AS_WCSRCHR
-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
-	leaq	-VEC_SIZE(%rdi, %rax, 4), %rax
-# else
-	leaq	-VEC_SIZE(%rdi, %rax), %rax
+	movl	%edi, %esi
+	andl	$(VEC_SIZE - 1), %esi
+	shrl	$2, %esi
 # endif
-	ret
+	shrxl	%SHIFT_REG, %ecx, %ecx
 
-	.p2align 4
-L(char_and_nul):
-	/* Find both a CHAR and a null byte.  */
-	addq	%rcx, %rdi
-	movl	%edx, %ecx
-L(char_and_nul_in_first_vec):
-	/* Mask out any matching bits after the null byte.  */
-	movl	%ecx, %r8d
-	subl	$1, %r8d
-	xorl	%ecx, %r8d
-	andl	%r8d, %eax
-	testl	%eax, %eax
-	/* Return null pointer if the null byte comes first.  */
-	jz	L(return_null)
+	testl	%ecx, %ecx
+	jz	L(page_cross_continue)
+
+	/* Found zero CHAR so need to test for search CHAR.  */
+	VPCMP	$0, %YMMMATCH, %YMM1, %k1
+	kmovd	%k1, %eax
+	/* Shift out search CHAR matches that are before the beginning of
+	   src (rdi).  */
+	shrxl	%SHIFT_REG, %eax, %eax
+
+	/* Check if any search CHAR match in range.  */
+	blsmskl	%ecx, %ecx
+	andl	%ecx, %eax
+	jz	L(ret3)
 	bsrl	%eax, %eax
 # ifdef USE_AS_WCSRCHR
-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
-	leaq	-VEC_SIZE(%rdi, %rax, 4), %rax
+	leaq	(%rdi, %rax, CHAR_SIZE), %rax
 # else
-	leaq	-VEC_SIZE(%rdi, %rax), %rax
+	addq	%rdi, %rax
 # endif
+L(ret3):
 	ret
 
-	.p2align 4
-L(return_null):
-	xorl	%eax, %eax
-	ret
-
-END (STRRCHR)
+END(STRRCHR)
 #endif
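
A side note on the L(cross_page_boundary) comment above: the address computation it describes can be written in C roughly as follows (the helper name last_safe_vec is illustrative only; PAGE_SIZE and VEC_SIZE match the defines added by the patch):

#include <stdint.h>

#define PAGE_SIZE 4096	/* Same value as the PAGE_SIZE define in the patch.  */
#define VEC_SIZE  32	/* Same value as the VEC_SIZE define in the patch.  */

/* eax holds src & (PAGE_SIZE - 1), so src ^ eax is the page base, and
   page_base + PAGE_SIZE - VEC_SIZE is the last VEC_SIZE-aligned load in
   the page, which therefore cannot cross into the next page.  */
static inline const void *
last_safe_vec (const void *src)
{
  uintptr_t p = (uintptr_t) src;
  uintptr_t page_off = p & (PAGE_SIZE - 1);	/* what %eax holds */
  uintptr_t page_base = p ^ page_off;		/* xorq %rdi, %rax */
  return (const void *) (page_base + PAGE_SIZE - VEC_SIZE);
}

Because the load address is VEC_SIZE-aligned and ends exactly at the page boundary, it cannot fault past the page containing src, which is what makes the over-read in L(cross_page_boundary) safe.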