commit 596c9a32cc5d5eb82587e92d1e66c9ecb7668456
Author: Noah Goldstein <goldstein.w.n@gmail.com>
Date:   Thu Apr 21 20:52:30 2022 -0500
    x86: Optimize {str|wcs}rchr-evex
    
    The new code unrolls the main loop slightly without adding too much
    overhead and minimizes the comparisons for the search CHAR.
    
    Geometric Mean of all benchmarks New / Old: 0.755
    See email for all results.
    
    Full xcheck passes on x86_64 with and without multiarch enabled.
    Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
    
    (cherry picked from commit c966099cdc3e0fdf92f63eac09b22fa7e5f5f02d)
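As a rough illustration of the strategy described in the commit message, the block-wise control flow can be modeled in plain C as below.  This is only a sketch, not the actual implementation: zero_mask, char_mask and strrchr_model are hypothetical names, the two helpers stand in for the VPTESTN/VPCMP mask extraction on one 32-byte vector, the model ignores the alignment and page-cross handling the assembly does explicitly, and the real main loop further folds the per-iteration search-CHAR test into one combined "zero or match" check across two unrolled vectors, resolving exact match positions only after the loop exits.

    #include <stdint.h>
    #include <stddef.h>

    /* Hypothetical stand-ins for the EVEX mask operations on one
       32-byte vector: bit i of the result corresponds to p[i].  */
    static uint32_t
    zero_mask (const char *p)
    {
      uint32_t m = 0;
      for (int i = 0; i < 32; i++)
        if (p[i] == '\0')
          m |= (uint32_t) 1 << i;
      return m;
    }

    static uint32_t
    char_mask (const char *p, char c)
    {
      uint32_t m = 0;
      for (int i = 0; i < 32; i++)
        if (p[i] == c)
          m |= (uint32_t) 1 << i;
      return m;
    }

    /* Illustrative model only: it may read up to 31 bytes past the
       terminator and across a page boundary, which the real code
       avoids via aligned loads and the page-cross entry path.  */
    static char *
    strrchr_model (const char *s, int c)
    {
      const char *last = NULL;  /* furthest match seen so far */
      for (;; s += 32)
        {
          uint32_t z = zero_mask (s);
          uint32_t m = char_mask (s, (char) c);
          if (z == 0)
            {
              /* No terminator in this block: remember its last match
                 (bsr) and keep scanning forward.  */
              if (m != 0)
                last = s + (31 - __builtin_clz (m));
              continue;
            }
          /* Terminator found: keep only matches up to and including
             it (blsmsk-style mask), then take the highest one.  */
          m &= z ^ (z - 1);
          if (m != 0)
            return (char *) (s + (31 - __builtin_clz (m)));
          return (char *) last;
        }
    }
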
diff --git a/sysdeps/x86_64/multiarch/strrchr-evex.S b/sysdeps/x86_64/multiarch/strrchr-evex.S
index f920b5a584edd293..f5b6d755ceb85ae2 100644
--- a/sysdeps/x86_64/multiarch/strrchr-evex.S
+++ b/sysdeps/x86_64/multiarch/strrchr-evex.S
@@ -24,242 +24,351 @@
 #  define STRRCHR	__strrchr_evex
 # endif
 
-# define VMOVU		vmovdqu64
-# define VMOVA		vmovdqa64
+# define VMOVU	vmovdqu64
+# define VMOVA	vmovdqa64
 
 # ifdef USE_AS_WCSRCHR
+#  define SHIFT_REG	esi
+
+#  define kunpck	kunpckbw
+#  define kmov_2x	kmovd
+#  define maskz_2x	ecx
+#  define maskm_2x	eax
+#  define CHAR_SIZE	4
+#  define VPMIN	vpminud
+#  define VPTESTN	vptestnmd
 #  define VPBROADCAST	vpbroadcastd
-#  define VPCMP		vpcmpd
-#  define SHIFT_REG	r8d
+#  define VPCMP	vpcmpd
 # else
+#  define SHIFT_REG	edi
+
+#  define kunpck	kunpckdq
+#  define kmov_2x	kmovq
+#  define maskz_2x	rcx
+#  define maskm_2x	rax
+
+#  define CHAR_SIZE	1
+#  define VPMIN	vpminub
+#  define VPTESTN	vptestnmb
 #  define VPBROADCAST	vpbroadcastb
-#  define VPCMP		vpcmpb
-#  define SHIFT_REG	ecx
+#  define VPCMP	vpcmpb
 # endif
 
 # define XMMZERO	xmm16
 # define YMMZERO	ymm16
 # define YMMMATCH	ymm17
-# define YMM1		ymm18
+# define YMMSAVE	ymm18
+
+# define YMM1	ymm19
+# define YMM2	ymm20
+# define YMM3	ymm21
+# define YMM4	ymm22
+# define YMM5	ymm23
+# define YMM6	ymm24
+# define YMM7	ymm25
+# define YMM8	ymm26
 
-# define VEC_SIZE	32
 
-	.section .text.evex,"ax",@progbits
-ENTRY (STRRCHR)
-	movl	%edi, %ecx
+# define VEC_SIZE	32
+# define PAGE_SIZE	4096
+	.section .text.evex, "ax", @progbits
+ENTRY(STRRCHR)
+	movl	%edi, %eax
 	/* Broadcast CHAR to YMMMATCH.  */
 	VPBROADCAST %esi, %YMMMATCH
 
-	vpxorq	%XMMZERO, %XMMZERO, %XMMZERO
-
-	/* Check if we may cross page boundary with one vector load.  */
-	andl	$(2 * VEC_SIZE - 1), %ecx
-	cmpl	$VEC_SIZE, %ecx
-	ja	L(cros_page_boundary)
+	andl	$(PAGE_SIZE - 1), %eax
+	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
+	jg	L(cross_page_boundary)
 
+L(page_cross_continue):
 	VMOVU	(%rdi), %YMM1
-
-	/* Each bit in K0 represents a null byte in YMM1.  */
-	VPCMP	$0, %YMMZERO, %YMM1, %k0
-	/* Each bit in K1 represents a CHAR in YMM1.  */
-	VPCMP	$0, %YMMMATCH, %YMM1, %k1
+	/* k0 has a 1 for each zero CHAR in YMM1.  */
+	VPTESTN	%YMM1, %YMM1, %k0
 	kmovd	%k0, %ecx
-	kmovd	%k1, %eax
-
-	addq	$VEC_SIZE, %rdi
-
-	testl	%eax, %eax
-	jnz	L(first_vec)
-
 	testl	%ecx, %ecx
-	jnz	L(return_null)
-
-	andq	$-VEC_SIZE, %rdi
-	xorl	%edx, %edx
-	jmp	L(aligned_loop)
-
-	.p2align 4
-L(first_vec):
-	/* Check if there is a null byte.  */
-	testl	%ecx, %ecx
-	jnz	L(char_and_nul_in_first_vec)
-
-	/* Remember the match and keep searching.  */
-	movl	%eax, %edx
-	movq	%rdi, %rsi
-	andq	$-VEC_SIZE, %rdi
-	jmp	L(aligned_loop)
-
-	.p2align 4
-L(cros_page_boundary):
-	andl	$(VEC_SIZE - 1), %ecx
-	andq	$-VEC_SIZE, %rdi
+	jz	L(aligned_more)
+	/* fallthrough: zero CHAR in first VEC.  */
 
+	/* K1 has a 1 for each search CHAR match in YMM1.  */
+	VPCMP	$0, %YMMMATCH, %YMM1, %k1
+	kmovd	%k1, %eax
+	/* Build mask up until first zero CHAR (used to mask of
+	   potential search CHAR matches past the end of the string).
+	 */
+	blsmskl	%ecx, %ecx
+	andl	%ecx, %eax
+	jz	L(ret0)
+	/* Get last match (the `andl` removed any out of bounds
+	   matches).  */
+	bsrl	%eax, %eax
 # ifdef USE_AS_WCSRCHR
-	/* NB: Divide shift count by 4 since each bit in K1 represent 4
-	   bytes.  */
-	movl	%ecx, %SHIFT_REG
-	sarl	$2, %SHIFT_REG
+	leaq	(%rdi, %rax, CHAR_SIZE), %rax
+# else
+	addq	%rdi, %rax
 # endif
+L(ret0):
+	ret
 
-	VMOVA	(%rdi), %YMM1
-
-	/* Each bit in K0 represents a null byte in YMM1.  */
-	VPCMP	$0, %YMMZERO, %YMM1, %k0
-	/* Each bit in K1 represents a CHAR in YMM1.  */
+	/* Returns for first vec x1/x2/x3 have hard coded backward
+	   search path for earlier matches.  */
+	.p2align 4,, 6
+L(first_vec_x1):
+	VPCMP	$0, %YMMMATCH, %YMM2, %k1
+	kmovd	%k1, %eax
+	blsmskl	%ecx, %ecx
+	/* eax non-zero if search CHAR in range.  */
+	andl	%ecx, %eax
+	jnz	L(first_vec_x1_return)
+
+	/* fallthrough: no match in YMM2 then need to check for earlier
+	   matches (in YMM1).  */
+	.p2align 4,, 4
+L(first_vec_x0_test):
 	VPCMP	$0, %YMMMATCH, %YMM1, %k1
-	kmovd	%k0, %edx
 	kmovd	%k1, %eax
-
-	shrxl	%SHIFT_REG, %edx, %edx
-	shrxl	%SHIFT_REG, %eax, %eax
-	addq	$VEC_SIZE, %rdi
-
-	/* Check if there is a CHAR.  */
 	testl	%eax, %eax
-	jnz	L(found_char)
-
-	testl	%edx, %edx
-	jnz	L(return_null)
-
-	jmp	L(aligned_loop)
-
-	.p2align 4
-L(found_char):
-	testl	%edx, %edx
-	jnz	L(char_and_nul)
-
-	/* Remember the match and keep searching.  */
-	movl	%eax, %edx
-	leaq	(%rdi, %rcx), %rsi
+	jz	L(ret1)
+	bsrl	%eax, %eax
+# ifdef USE_AS_WCSRCHR
+	leaq	(%rsi, %rax, CHAR_SIZE), %rax
+# else
+	addq	%rsi, %rax
+# endif
+L(ret1):
+	ret
 
-	.p2align 4
-L(aligned_loop):
-	VMOVA	(%rdi), %YMM1
-	addq	$VEC_SIZE, %rdi
+	.p2align 4,, 10
+L(first_vec_x1_or_x2):
+	VPCMP	$0, %YMM3, %YMMMATCH, %k3
+	VPCMP	$0, %YMM2, %YMMMATCH, %k2
+	/* K2 and K3 have 1 for any search CHAR match. Test if any
+	   matches between either of them. Otherwise check YMM1.  */
+	kortestd %k2, %k3
+	jz	L(first_vec_x0_test)
+
+	/* Guranteed that YMM2 and YMM3 are within range so merge the
+	   two bitmasks then get last result.  */
+	kunpck	%k2, %k3, %k3
+	kmovq	%k3, %rax
+	bsrq	%rax, %rax
+	leaq	(VEC_SIZE)(%r8, %rax, CHAR_SIZE), %rax
+	ret
 
-	/* Each bit in K0 represents a null byte in YMM1.  */
-	VPCMP	$0, %YMMZERO, %YMM1, %k0
-	/* Each bit in K1 represents a CHAR in YMM1.  */
-	VPCMP	$0, %YMMMATCH, %YMM1, %k1
-	kmovd	%k0, %ecx
+	.p2align 4,, 6
+L(first_vec_x3):
+	VPCMP	$0, %YMMMATCH, %YMM4, %k1
 	kmovd	%k1, %eax
-	orl	%eax, %ecx
-	jnz	L(char_nor_null)
+	blsmskl	%ecx, %ecx
+	/* If no search CHAR match in range check YMM1/YMM2/YMM3.  */
+	andl	%ecx, %eax
+	jz	L(first_vec_x1_or_x2)
+	bsrl	%eax, %eax
+	leaq	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
+	ret
 
-	VMOVA	(%rdi), %YMM1
-	add	$VEC_SIZE, %rdi
+	.p2align 4,, 6
+L(first_vec_x0_x1_test):
+	VPCMP	$0, %YMMMATCH, %YMM2, %k1
+	kmovd	%k1, %eax
+	/* Check YMM2 for last match first. If no match try YMM1.  */
+	testl	%eax, %eax
+	jz	L(first_vec_x0_test)
+	.p2align 4,, 4
+L(first_vec_x1_return):
+	bsrl	%eax, %eax
+	leaq	(VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax
+	ret
 
-	/* Each bit in K0 represents a null byte in YMM1.  */
-	VPCMP	$0, %YMMZERO, %YMM1, %k0
-	/* Each bit in K1 represents a CHAR in YMM1.  */
-	VPCMP	$0, %YMMMATCH, %YMM1, %k1
-	kmovd	%k0, %ecx
+	.p2align 4,, 10
+L(first_vec_x2):
+	VPCMP	$0, %YMMMATCH, %YMM3, %k1
 	kmovd	%k1, %eax
-	orl	%eax, %ecx
-	jnz	L(char_nor_null)
+	blsmskl	%ecx, %ecx
+	/* Check YMM3 for last match first. If no match try YMM2/YMM1.
+	 */
+	andl	%ecx, %eax
+	jz	L(first_vec_x0_x1_test)
+	bsrl	%eax, %eax
+	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
+	ret
 
-	VMOVA	(%rdi), %YMM1
-	addq	$VEC_SIZE, %rdi
 
-	/* Each bit in K0 represents a null byte in YMM1.  */
-	VPCMP	$0, %YMMZERO, %YMM1, %k0
-	/* Each bit in K1 represents a CHAR in YMM1.  */
-	VPCMP	$0, %YMMMATCH, %YMM1, %k1
+	.p2align 4
+L(aligned_more):
+	/* Need to keep original pointer incase YMM1 has last match.  */
+	movq	%rdi, %rsi
+	andq	$-VEC_SIZE, %rdi
+	VMOVU	VEC_SIZE(%rdi), %YMM2
+	VPTESTN	%YMM2, %YMM2, %k0
 	kmovd	%k0, %ecx
-	kmovd	%k1, %eax
-	orl	%eax, %ecx
-	jnz	L(char_nor_null)
+	testl	%ecx, %ecx
+	jnz	L(first_vec_x1)
 
-	VMOVA	(%rdi), %YMM1
-	addq	$VEC_SIZE, %rdi
+	VMOVU	(VEC_SIZE * 2)(%rdi), %YMM3
+	VPTESTN	%YMM3, %YMM3, %k0
+	kmovd	%k0, %ecx
+	testl	%ecx, %ecx
+	jnz	L(first_vec_x2)
 
-	/* Each bit in K0 represents a null byte in YMM1.  */
-	VPCMP	$0, %YMMZERO, %YMM1, %k0
-	/* Each bit in K1 represents a CHAR in YMM1.  */
-	VPCMP	$0, %YMMMATCH, %YMM1, %k1
+	VMOVU	(VEC_SIZE * 3)(%rdi), %YMM4
+	VPTESTN	%YMM4, %YMM4, %k0
 	kmovd	%k0, %ecx
-	kmovd	%k1, %eax
-	orl	%eax, %ecx
-	jz	L(aligned_loop)
+	movq	%rdi, %r8
+	testl	%ecx, %ecx
+	jnz	L(first_vec_x3)
 
+	andq	$-(VEC_SIZE * 2), %rdi
 	.p2align 4
-L(char_nor_null):
-	/* Find a CHAR or a null byte in a loop.  */
+L(first_aligned_loop):
+	/* Preserve YMM1, YMM2, YMM3, and YMM4 until we can gurantee
+	   they don't store a match.  */
+	VMOVA	(VEC_SIZE * 4)(%rdi), %YMM5
+	VMOVA	(VEC_SIZE * 5)(%rdi), %YMM6
+
+	VPCMP	$0, %YMM5, %YMMMATCH, %k2
+	vpxord	%YMM6, %YMMMATCH, %YMM7
+
+	VPMIN	%YMM5, %YMM6, %YMM8
+	VPMIN	%YMM8, %YMM7, %YMM7
+
+	VPTESTN	%YMM7, %YMM7, %k1
+	subq	$(VEC_SIZE * -2), %rdi
+	kortestd %k1, %k2
+	jz	L(first_aligned_loop)
+
+	VPCMP	$0, %YMM6, %YMMMATCH, %k3
+	VPTESTN	%YMM8, %YMM8, %k1
+	ktestd	%k1, %k1
+	jz	L(second_aligned_loop_prep)
+
+	kortestd %k2, %k3
+	jnz	L(return_first_aligned_loop)
+
+	.p2align 4,, 6
+L(first_vec_x1_or_x2_or_x3):
+	VPCMP	$0, %YMM4, %YMMMATCH, %k4
+	kmovd	%k4, %eax
 	testl	%eax, %eax
-	jnz	L(match)
-L(return_value):
-	testl	%edx, %edx
-	jz	L(return_null)
-	movl	%edx, %eax
-	movq	%rsi, %rdi
+	jz	L(first_vec_x1_or_x2)
 	bsrl	%eax, %eax
-# ifdef USE_AS_WCSRCHR
-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
-	leaq	-VEC_SIZE(%rdi, %rax, 4), %rax
-# else
-	leaq	-VEC_SIZE(%rdi, %rax), %rax
-# endif
+	leaq	(VEC_SIZE * 3)(%r8, %rax, CHAR_SIZE), %rax
 	ret
 
-	.p2align 4
-L(match):
-	/* Find a CHAR.  Check if there is a null byte.  */
-	kmovd	%k0, %ecx
-	testl	%ecx, %ecx
-	jnz	L(find_nul)
+	.p2align 4,, 8
+L(return_first_aligned_loop):
+	VPTESTN	%YMM5, %YMM5, %k0
+	kunpck	%k0, %k1, %k0
+	kmov_2x	%k0, %maskz_2x
+
+	blsmsk	%maskz_2x, %maskz_2x
+	kunpck	%k2, %k3, %k3
+	kmov_2x	%k3, %maskm_2x
+	and	%maskz_2x, %maskm_2x
+	jz	L(first_vec_x1_or_x2_or_x3)
 
-	/* Remember the match and keep searching.  */
-	movl	%eax, %edx
+	bsr	%maskm_2x, %maskm_2x
+	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
+	ret
+
+	.p2align 4
+	/* We can throw away the work done for the first 4x checks here
+	   as we have a later match. This is the 'fast' path persay.
+	 */
+L(second_aligned_loop_prep):
+L(second_aligned_loop_set_furthest_match):
 	movq	%rdi, %rsi
-	jmp	L(aligned_loop)
+	kunpck	%k2, %k3, %k4
 
 	.p2align 4
-L(find_nul):
-	/* Mask out any matching bits after the null byte.  */
-	movl	%ecx, %r8d
-	subl	$1, %r8d
-	xorl	%ecx, %r8d
-	andl	%r8d, %eax
-	testl	%eax, %eax
-	/* If there is no CHAR here, return the remembered one.  */
-	jz	L(return_value)
-	bsrl	%eax, %eax
+L(second_aligned_loop):
+	VMOVU	(VEC_SIZE * 4)(%rdi), %YMM1
+	VMOVU	(VEC_SIZE * 5)(%rdi), %YMM2
+
+	VPCMP	$0, %YMM1, %YMMMATCH, %k2
+	vpxord	%YMM2, %YMMMATCH, %YMM3
+
+	VPMIN	%YMM1, %YMM2, %YMM4
+	VPMIN	%YMM3, %YMM4, %YMM3
+
+	VPTESTN	%YMM3, %YMM3, %k1
+	subq	$(VEC_SIZE * -2), %rdi
+	kortestd %k1, %k2
+	jz	L(second_aligned_loop)
+
+	VPCMP	$0, %YMM2, %YMMMATCH, %k3
+	VPTESTN	%YMM4, %YMM4, %k1
+	ktestd	%k1, %k1
+	jz	L(second_aligned_loop_set_furthest_match)
+
+	kortestd %k2, %k3
+	/* branch here because there is a significant advantage interms
+	   of output dependency chance in using edx.  */
+	jnz	L(return_new_match)
+L(return_old_match):
+	kmovq	%k4, %rax
+	bsrq	%rax, %rax
+	leaq	(VEC_SIZE * 2)(%rsi, %rax, CHAR_SIZE), %rax
+	ret
+
+L(return_new_match):
+	VPTESTN	%YMM1, %YMM1, %k0
+	kunpck	%k0, %k1, %k0
+	kmov_2x	%k0, %maskz_2x
+
+	blsmsk	%maskz_2x, %maskz_2x
+	kunpck	%k2, %k3, %k3
+	kmov_2x	%k3, %maskm_2x
+	and	%maskz_2x, %maskm_2x
+	jz	L(return_old_match)
+
+	bsr	%maskm_2x, %maskm_2x
+	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
+	ret
+
+L(cross_page_boundary):
+	/* eax contains all the page offset bits of src (rdi). `xor rdi,
+	   rax` sets pointer will all page offset bits cleared so
+	   offset of (PAGE_SIZE - VEC_SIZE) will get last aligned VEC
+	   before page cross (guranteed to be safe to read). Doing this
+	   as opposed to `movq %rdi, %rax; andq $-VEC_SIZE, %rax` saves
+	   a bit of code size.  */
+	xorq	%rdi, %rax
+	VMOVU	(PAGE_SIZE - VEC_SIZE)(%rax), %YMM1
+	VPTESTN	%YMM1, %YMM1, %k0
+	kmovd	%k0, %ecx
+
+	/* Shift out zero CHAR matches that are before the begining of
+	   src (rdi).  */
 # ifdef USE_AS_WCSRCHR
-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
-	leaq	-VEC_SIZE(%rdi, %rax, 4), %rax
-# else
-	leaq	-VEC_SIZE(%rdi, %rax), %rax
+	movl	%edi, %esi
+	andl	$(VEC_SIZE - 1), %esi
+	shrl	$2, %esi
 # endif
-	ret
+	shrxl	%SHIFT_REG, %ecx, %ecx
 
-	.p2align 4
-L(char_and_nul):
-	/* Find both a CHAR and a null byte.  */
-	addq	%rcx, %rdi
-	movl	%edx, %ecx
-L(char_and_nul_in_first_vec):
-	/* Mask out any matching bits after the null byte.  */
-	movl	%ecx, %r8d
-	subl	$1, %r8d
-	xorl	%ecx, %r8d
-	andl	%r8d, %eax
-	testl	%eax, %eax
-	/* Return null pointer if the null byte comes first.  */
-	jz	L(return_null)
+	testl	%ecx, %ecx
+	jz	L(page_cross_continue)
+
+	/* Found zero CHAR so need to test for search CHAR.  */
+	VPCMP	$0, %YMMMATCH, %YMM1, %k1
+	kmovd	%k1, %eax
+	/* Shift out search CHAR matches that are before the begining of
+	   src (rdi).  */
+	shrxl	%SHIFT_REG, %eax, %eax
+
+	/* Check if any search CHAR match in range.  */
+	blsmskl	%ecx, %ecx
+	andl	%ecx, %eax
+	jz	L(ret3)
 	bsrl	%eax, %eax
 # ifdef USE_AS_WCSRCHR
-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
-	leaq	-VEC_SIZE(%rdi, %rax, 4), %rax
+	leaq	(%rdi, %rax, CHAR_SIZE), %rax
 # else
-	leaq	-VEC_SIZE(%rdi, %rax), %rax
+	addq	%rdi, %rax
 # endif
+L(ret3):
 	ret
 
-	.p2align 4
-L(return_null):
-	xorl	%eax, %eax
-	ret
-
-END (STRRCHR)
+END(STRRCHR)
 #endif
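
The pointer trick documented in L(cross_page_boundary) above can be sanity-checked with ordinary integer arithmetic.  The sketch below is illustrative only (the helper name is made up): since %eax holds rdi & (PAGE_SIZE - 1), xor-ing it back into the pointer clears exactly the page-offset bits, so the VEC_SIZE load at offset PAGE_SIZE - VEC_SIZE covers the last VEC_SIZE bytes of the source's page and never touches the next page.

    #include <stdint.h>
    #include <assert.h>

    #define PAGE_SIZE 4096
    #define VEC_SIZE  32

    /* Model of the L(cross_page_boundary) address computation.  */
    static uintptr_t
    cross_page_load_addr (uintptr_t rdi)
    {
      uintptr_t rax = rdi & (PAGE_SIZE - 1);  /* andl $(PAGE_SIZE - 1), %eax */
      rax ^= rdi;                             /* xorq %rdi, %rax: page base  */
      return rax + (PAGE_SIZE - VEC_SIZE);    /* (PAGE_SIZE - VEC_SIZE)(%rax) */
    }

    int
    main (void)
    {
      for (uintptr_t p = 0x7000; p < 0x8000; p++)
        {
          uintptr_t addr = cross_page_load_addr (p);
          uintptr_t page_end = p | (PAGE_SIZE - 1);
          /* The VEC_SIZE-byte load ends exactly at the page boundary,
             so it is safe to issue for any p on that page.  */
          assert (addr + VEC_SIZE - 1 == page_end);
          assert (addr >= (p & ~(uintptr_t) (PAGE_SIZE - 1)));
        }
      return 0;
    }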