From 9ef733cbe224b1cc12e4c8acac09627ccb3a00d8 Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Thu, 21 Apr 2022 20:52:30 -0500
Subject: [PATCH] x86: Optimize {str|wcs}rchr-evex

The new code unrolls the main loop slightly without adding too much
overhead and minimizes the comparisons for the search CHAR.
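
For reference, a minimal scalar sketch of the semantics the vectorized
code has to preserve (illustration only; the name strrchr_scalar is not
part of this patch): remember the most recent match and keep scanning
until the terminating NUL.

    #include <stddef.h>  /* NULL */

    /* Scalar reference for strrchr semantics (not the EVEX code):
       remember the most recent match, stop only at the NUL.  */
    static const char *
    strrchr_scalar (const char *s, int c)
    {
      const char *last = NULL;
      do
        {
          if (*s == (char) c)
            last = s;            /* Most recent match so far.  */
        }
      while (*s++ != '\0');
      return last;               /* NULL if CHAR was never seen.  */
    }

The EVEX version keeps that same remember-the-last-match structure, but
processes two vectors per unrolled iteration and folds the zero-CHAR and
search-CHAR checks for the second vector into one VPMIN/VPTESTN test, so
the explicit match masks are only built once a hit is detected.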

Geometric Mean of all benchmarks New / Old: 0.755
See email for all results.

Full xcheck passes on x86_64 with and without multiarch enabled.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>

(cherry picked from commit c966099cdc3e0fdf92f63eac09b22fa7e5f5f02d)
---
 sysdeps/x86_64/multiarch/strrchr-evex.S | 471 +++++++++++++++---------
 1 file changed, 290 insertions(+), 181 deletions(-)

diff --git a/sysdeps/x86_64/multiarch/strrchr-evex.S b/sysdeps/x86_64/multiarch/strrchr-evex.S
index f920b5a5..f5b6d755 100644
--- a/sysdeps/x86_64/multiarch/strrchr-evex.S
+++ b/sysdeps/x86_64/multiarch/strrchr-evex.S
@@ -24,242 +24,351 @@
 #  define STRRCHR	__strrchr_evex
 # endif
 
-# define VMOVU		vmovdqu64
-# define VMOVA		vmovdqa64
+# define VMOVU	vmovdqu64
+# define VMOVA	vmovdqa64
 
 # ifdef USE_AS_WCSRCHR
+#  define SHIFT_REG	esi
+
+#  define kunpck	kunpckbw
+#  define kmov_2x	kmovd
+#  define maskz_2x	ecx
+#  define maskm_2x	eax
+#  define CHAR_SIZE	4
+#  define VPMIN	vpminud
+#  define VPTESTN	vptestnmd
 #  define VPBROADCAST	vpbroadcastd
-#  define VPCMP		vpcmpd
-#  define SHIFT_REG	r8d
+#  define VPCMP	vpcmpd
 # else
+#  define SHIFT_REG	edi
+
+#  define kunpck	kunpckdq
+#  define kmov_2x	kmovq
+#  define maskz_2x	rcx
+#  define maskm_2x	rax
+
+#  define CHAR_SIZE	1
+#  define VPMIN	vpminub
+#  define VPTESTN	vptestnmb
 #  define VPBROADCAST	vpbroadcastb
-#  define VPCMP		vpcmpb
-#  define SHIFT_REG	ecx
+#  define VPCMP	vpcmpb
 # endif
 
 # define XMMZERO	xmm16
 # define YMMZERO	ymm16
 # define YMMMATCH	ymm17
-# define YMM1		ymm18
+# define YMMSAVE	ymm18
+
+# define YMM1	ymm19
+# define YMM2	ymm20
+# define YMM3	ymm21
+# define YMM4	ymm22
+# define YMM5	ymm23
+# define YMM6	ymm24
+# define YMM7	ymm25
+# define YMM8	ymm26
 
-# define VEC_SIZE	32
 
-	.section .text.evex,"ax",@progbits
-ENTRY (STRRCHR)
-	movl	%edi, %ecx
+# define VEC_SIZE	32
+# define PAGE_SIZE	4096
+	.section .text.evex, "ax", @progbits
+ENTRY(STRRCHR)
+	movl	%edi, %eax
 	/* Broadcast CHAR to YMMMATCH.  */
 	VPBROADCAST %esi, %YMMMATCH
 
-	vpxorq	%XMMZERO, %XMMZERO, %XMMZERO
-
-	/* Check if we may cross page boundary with one vector load.  */
-	andl	$(2 * VEC_SIZE - 1), %ecx
-	cmpl	$VEC_SIZE, %ecx
-	ja	L(cros_page_boundary)
+	andl	$(PAGE_SIZE - 1), %eax
+	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
+	jg	L(cross_page_boundary)
 
+L(page_cross_continue):
 	VMOVU	(%rdi), %YMM1
-
-	/* Each bit in K0 represents a null byte in YMM1.  */
-	VPCMP	$0, %YMMZERO, %YMM1, %k0
-	/* Each bit in K1 represents a CHAR in YMM1.  */
-	VPCMP	$0, %YMMMATCH, %YMM1, %k1
+	/* k0 has a 1 for each zero CHAR in YMM1.  */
+	VPTESTN	%YMM1, %YMM1, %k0
 	kmovd	%k0, %ecx
-	kmovd	%k1, %eax
-
-	addq	$VEC_SIZE, %rdi
-
-	testl	%eax, %eax
-	jnz	L(first_vec)
-
 	testl	%ecx, %ecx
-	jnz	L(return_null)
-
-	andq	$-VEC_SIZE, %rdi
-	xorl	%edx, %edx
-	jmp	L(aligned_loop)
-
-	.p2align 4
-L(first_vec):
-	/* Check if there is a null byte.  */
-	testl	%ecx, %ecx
-	jnz	L(char_and_nul_in_first_vec)
-
-	/* Remember the match and keep searching.  */
-	movl	%eax, %edx
-	movq	%rdi, %rsi
-	andq	$-VEC_SIZE, %rdi
-	jmp	L(aligned_loop)
-
-	.p2align 4
-L(cros_page_boundary):
-	andl	$(VEC_SIZE - 1), %ecx
-	andq	$-VEC_SIZE, %rdi
+	jz	L(aligned_more)
+	/* fallthrough: zero CHAR in first VEC.  */
 
+	/* K1 has a 1 for each search CHAR match in YMM1.  */
+	VPCMP	$0, %YMMMATCH, %YMM1, %k1
+	kmovd	%k1, %eax
+	/* Build mask up until first zero CHAR (used to mask off
+	   potential search CHAR matches past the end of the string).
+	 */
+	blsmskl	%ecx, %ecx
+	andl	%ecx, %eax
+	jz	L(ret0)
+	/* Get last match (the `andl` removed any out of bounds
+	   matches).  */
+	bsrl	%eax, %eax
 # ifdef USE_AS_WCSRCHR
-	/* NB: Divide shift count by 4 since each bit in K1 represent 4
-	   bytes.  */
-	movl	%ecx, %SHIFT_REG
-	sarl	$2, %SHIFT_REG
+	leaq	(%rdi, %rax, CHAR_SIZE), %rax
+# else
+	addq	%rdi, %rax
 # endif
+L(ret0):
+	ret
 
-	VMOVA	(%rdi), %YMM1
-
-	/* Each bit in K0 represents a null byte in YMM1.  */
-	VPCMP	$0, %YMMZERO, %YMM1, %k0
-	/* Each bit in K1 represents a CHAR in YMM1.  */
+	/* Returns for first vec x1/x2/x3 have hard coded backward
+	   search path for earlier matches.  */
+	.p2align 4,, 6
+L(first_vec_x1):
+	VPCMP	$0, %YMMMATCH, %YMM2, %k1
+	kmovd	%k1, %eax
+	blsmskl	%ecx, %ecx
+	/* eax non-zero if search CHAR in range.  */
+	andl	%ecx, %eax
+	jnz	L(first_vec_x1_return)
+
+	/* fallthrough: no match in YMM2 then need to check for earlier
+	   matches (in YMM1).  */
+	.p2align 4,, 4
+L(first_vec_x0_test):
 	VPCMP	$0, %YMMMATCH, %YMM1, %k1
-	kmovd	%k0, %edx
 	kmovd	%k1, %eax
-
-	shrxl	%SHIFT_REG, %edx, %edx
-	shrxl	%SHIFT_REG, %eax, %eax
-	addq	$VEC_SIZE, %rdi
-
-	/* Check if there is a CHAR.  */
 	testl	%eax, %eax
-	jnz	L(found_char)
-
-	testl	%edx, %edx
-	jnz	L(return_null)
-
-	jmp	L(aligned_loop)
-
-	.p2align 4
-L(found_char):
-	testl	%edx, %edx
-	jnz	L(char_and_nul)
-
-	/* Remember the match and keep searching.  */
-	movl	%eax, %edx
-	leaq	(%rdi, %rcx), %rsi
+	jz	L(ret1)
+	bsrl	%eax, %eax
+# ifdef USE_AS_WCSRCHR
+	leaq	(%rsi, %rax, CHAR_SIZE), %rax
+# else
+	addq	%rsi, %rax
+# endif
+L(ret1):
+	ret
 
-	.p2align 4
-L(aligned_loop):
-	VMOVA	(%rdi), %YMM1
-	addq	$VEC_SIZE, %rdi
+	.p2align 4,, 10
+L(first_vec_x1_or_x2):
+	VPCMP	$0, %YMM3, %YMMMATCH, %k3
+	VPCMP	$0, %YMM2, %YMMMATCH, %k2
+	/* K2 and K3 have 1 for any search CHAR match. Test if any
+	   matches between either of them. Otherwise check YMM1.  */
+	kortestd %k2, %k3
+	jz	L(first_vec_x0_test)
+
+	/* Guaranteed that YMM2 and YMM3 are within range so merge the
+	   two bitmasks then get last result.  */
+	kunpck	%k2, %k3, %k3
+	kmovq	%k3, %rax
+	bsrq	%rax, %rax
+	leaq	(VEC_SIZE)(%r8, %rax, CHAR_SIZE), %rax
+	ret
 
-	/* Each bit in K0 represents a null byte in YMM1.  */
-	VPCMP	$0, %YMMZERO, %YMM1, %k0
-	/* Each bit in K1 represents a CHAR in YMM1.  */
-	VPCMP	$0, %YMMMATCH, %YMM1, %k1
-	kmovd	%k0, %ecx
+	.p2align 4,, 6
+L(first_vec_x3):
+	VPCMP	$0, %YMMMATCH, %YMM4, %k1
 	kmovd	%k1, %eax
-	orl	%eax, %ecx
-	jnz	L(char_nor_null)
+	blsmskl	%ecx, %ecx
+	/* If no search CHAR match in range check YMM1/YMM2/YMM3.  */
+	andl	%ecx, %eax
+	jz	L(first_vec_x1_or_x2)
+	bsrl	%eax, %eax
+	leaq	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
+	ret
 
-	VMOVA	(%rdi), %YMM1
-	add	$VEC_SIZE, %rdi
+	.p2align 4,, 6
+L(first_vec_x0_x1_test):
+	VPCMP	$0, %YMMMATCH, %YMM2, %k1
+	kmovd	%k1, %eax
+	/* Check YMM2 for last match first. If no match try YMM1.  */
+	testl	%eax, %eax
+	jz	L(first_vec_x0_test)
+	.p2align 4,, 4
+L(first_vec_x1_return):
+	bsrl	%eax, %eax
+	leaq	(VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax
+	ret
 
-	/* Each bit in K0 represents a null byte in YMM1.  */
-	VPCMP	$0, %YMMZERO, %YMM1, %k0
-	/* Each bit in K1 represents a CHAR in YMM1.  */
-	VPCMP	$0, %YMMMATCH, %YMM1, %k1
-	kmovd	%k0, %ecx
+	.p2align 4,, 10
+L(first_vec_x2):
+	VPCMP	$0, %YMMMATCH, %YMM3, %k1
 	kmovd	%k1, %eax
-	orl	%eax, %ecx
-	jnz	L(char_nor_null)
+	blsmskl	%ecx, %ecx
+	/* Check YMM3 for last match first. If no match try YMM2/YMM1.
+	 */
+	andl	%ecx, %eax
+	jz	L(first_vec_x0_x1_test)
+	bsrl	%eax, %eax
+	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
+	ret
 
-	VMOVA	(%rdi), %YMM1
-	addq	$VEC_SIZE, %rdi
 
-	/* Each bit in K0 represents a null byte in YMM1.  */
-	VPCMP	$0, %YMMZERO, %YMM1, %k0
-	/* Each bit in K1 represents a CHAR in YMM1.  */
-	VPCMP	$0, %YMMMATCH, %YMM1, %k1
+	.p2align 4
+L(aligned_more):
+	/* Need to keep original pointer in case YMM1 has last match.  */
+	movq	%rdi, %rsi
+	andq	$-VEC_SIZE, %rdi
+	VMOVU	VEC_SIZE(%rdi), %YMM2
+	VPTESTN	%YMM2, %YMM2, %k0
 	kmovd	%k0, %ecx
-	kmovd	%k1, %eax
-	orl	%eax, %ecx
-	jnz	L(char_nor_null)
+	testl	%ecx, %ecx
+	jnz	L(first_vec_x1)
 
-	VMOVA	(%rdi), %YMM1
-	addq	$VEC_SIZE, %rdi
+	VMOVU	(VEC_SIZE * 2)(%rdi), %YMM3
+	VPTESTN	%YMM3, %YMM3, %k0
+	kmovd	%k0, %ecx
+	testl	%ecx, %ecx
+	jnz	L(first_vec_x2)
 
-	/* Each bit in K0 represents a null byte in YMM1.  */
-	VPCMP	$0, %YMMZERO, %YMM1, %k0
-	/* Each bit in K1 represents a CHAR in YMM1.  */
-	VPCMP	$0, %YMMMATCH, %YMM1, %k1
+	VMOVU	(VEC_SIZE * 3)(%rdi), %YMM4
+	VPTESTN	%YMM4, %YMM4, %k0
 	kmovd	%k0, %ecx
-	kmovd	%k1, %eax
-	orl	%eax, %ecx
-	jz	L(aligned_loop)
+	movq	%rdi, %r8
+	testl	%ecx, %ecx
+	jnz	L(first_vec_x3)
 
+	andq	$-(VEC_SIZE * 2), %rdi
 	.p2align 4
-L(char_nor_null):
-	/* Find a CHAR or a null byte in a loop.  */
+L(first_aligned_loop):
+	/* Preserve YMM1, YMM2, YMM3, and YMM4 until we can guarantee
+	   they don't store a match.  */
+	VMOVA	(VEC_SIZE * 4)(%rdi), %YMM5
+	VMOVA	(VEC_SIZE * 5)(%rdi), %YMM6
+
+	VPCMP	$0, %YMM5, %YMMMATCH, %k2
+	vpxord	%YMM6, %YMMMATCH, %YMM7
+
+	VPMIN	%YMM5, %YMM6, %YMM8
+	VPMIN	%YMM8, %YMM7, %YMM7
+
+	VPTESTN	%YMM7, %YMM7, %k1
+	subq	$(VEC_SIZE * -2), %rdi
+	kortestd %k1, %k2
+	jz	L(first_aligned_loop)
+
+	VPCMP	$0, %YMM6, %YMMMATCH, %k3
+	VPTESTN	%YMM8, %YMM8, %k1
+	ktestd	%k1, %k1
+	jz	L(second_aligned_loop_prep)
+
+	kortestd %k2, %k3
+	jnz	L(return_first_aligned_loop)
+
+	.p2align 4,, 6
+L(first_vec_x1_or_x2_or_x3):
+	VPCMP	$0, %YMM4, %YMMMATCH, %k4
+	kmovd	%k4, %eax
 	testl	%eax, %eax
-	jnz	L(match)
-L(return_value):
-	testl	%edx, %edx
-	jz	L(return_null)
-	movl	%edx, %eax
-	movq	%rsi, %rdi
+	jz	L(first_vec_x1_or_x2)
 	bsrl	%eax, %eax
-# ifdef USE_AS_WCSRCHR
-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
-	leaq	-VEC_SIZE(%rdi, %rax, 4), %rax
-# else
-	leaq	-VEC_SIZE(%rdi, %rax), %rax
-# endif
+	leaq	(VEC_SIZE * 3)(%r8, %rax, CHAR_SIZE), %rax
 	ret
 
-	.p2align 4
-L(match):
-	/* Find a CHAR.  Check if there is a null byte.  */
-	kmovd	%k0, %ecx
-	testl	%ecx, %ecx
-	jnz	L(find_nul)
+	.p2align 4,, 8
+L(return_first_aligned_loop):
+	VPTESTN	%YMM5, %YMM5, %k0
+	kunpck	%k0, %k1, %k0
+	kmov_2x	%k0, %maskz_2x
+
+	blsmsk	%maskz_2x, %maskz_2x
+	kunpck	%k2, %k3, %k3
+	kmov_2x	%k3, %maskm_2x
+	and	%maskz_2x, %maskm_2x
+	jz	L(first_vec_x1_or_x2_or_x3)
 
-	/* Remember the match and keep searching.  */
-	movl	%eax, %edx
+	bsr	%maskm_2x, %maskm_2x
+	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
+	ret
+
+	.p2align 4
+	/* We can throw away the work done for the first 4x checks here
+	   as we have a later match. This is the 'fast' path per se.
+	 */
+L(second_aligned_loop_prep):
+L(second_aligned_loop_set_furthest_match):
 	movq	%rdi, %rsi
-	jmp	L(aligned_loop)
+	kunpck	%k2, %k3, %k4
 
 	.p2align 4
-L(find_nul):
-	/* Mask out any matching bits after the null byte.  */
-	movl	%ecx, %r8d
-	subl	$1, %r8d
-	xorl	%ecx, %r8d
-	andl	%r8d, %eax
-	testl	%eax, %eax
-	/* If there is no CHAR here, return the remembered one.  */
-	jz	L(return_value)
-	bsrl	%eax, %eax
+L(second_aligned_loop):
+	VMOVU	(VEC_SIZE * 4)(%rdi), %YMM1
+	VMOVU	(VEC_SIZE * 5)(%rdi), %YMM2
+
+	VPCMP	$0, %YMM1, %YMMMATCH, %k2
+	vpxord	%YMM2, %YMMMATCH, %YMM3
+
+	VPMIN	%YMM1, %YMM2, %YMM4
+	VPMIN	%YMM3, %YMM4, %YMM3
+
+	VPTESTN	%YMM3, %YMM3, %k1
+	subq	$(VEC_SIZE * -2), %rdi
+	kortestd %k1, %k2
+	jz	L(second_aligned_loop)
+
+	VPCMP	$0, %YMM2, %YMMMATCH, %k3
+	VPTESTN	%YMM4, %YMM4, %k1
+	ktestd	%k1, %k1
+	jz	L(second_aligned_loop_set_furthest_match)
+
+	kortestd %k2, %k3
+	/* branch here because there is a significant advantage in terms
+	   of the output dependency chain in using edx.  */
+	jnz	L(return_new_match)
+L(return_old_match):
+	kmovq	%k4, %rax
+	bsrq	%rax, %rax
+	leaq	(VEC_SIZE * 2)(%rsi, %rax, CHAR_SIZE), %rax
+	ret
+
+L(return_new_match):
+	VPTESTN	%YMM1, %YMM1, %k0
+	kunpck	%k0, %k1, %k0
+	kmov_2x	%k0, %maskz_2x
+
+	blsmsk	%maskz_2x, %maskz_2x
+	kunpck	%k2, %k3, %k3
+	kmov_2x	%k3, %maskm_2x
+	and	%maskz_2x, %maskm_2x
+	jz	L(return_old_match)
+
+	bsr	%maskm_2x, %maskm_2x
+	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
+	ret
+
+L(cross_page_boundary):
+	/* eax contains all the page offset bits of src (rdi). `xor rdi,
+	   rax` sets pointer with all page offset bits cleared so
+	   offset of (PAGE_SIZE - VEC_SIZE) will get last aligned VEC
+	   before page cross (guaranteed to be safe to read). Doing this
+	   as opposed to `movq %rdi, %rax; andq $-VEC_SIZE, %rax` saves
+	   a bit of code size.  */
+	xorq	%rdi, %rax
+	VMOVU	(PAGE_SIZE - VEC_SIZE)(%rax), %YMM1
+	VPTESTN	%YMM1, %YMM1, %k0
+	kmovd	%k0, %ecx
+
+	/* Shift out zero CHAR matches that are before the beginning of
+	   src (rdi).  */
 # ifdef USE_AS_WCSRCHR
-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
-	leaq	-VEC_SIZE(%rdi, %rax, 4), %rax
-# else
-	leaq	-VEC_SIZE(%rdi, %rax), %rax
+	movl	%edi, %esi
+	andl	$(VEC_SIZE - 1), %esi
+	shrl	$2, %esi
 # endif
-	ret
+	shrxl	%SHIFT_REG, %ecx, %ecx
 
-	.p2align 4
-L(char_and_nul):
-	/* Find both a CHAR and a null byte.  */
-	addq	%rcx, %rdi
-	movl	%edx, %ecx
-L(char_and_nul_in_first_vec):
-	/* Mask out any matching bits after the null byte.  */
-	movl	%ecx, %r8d
-	subl	$1, %r8d
-	xorl	%ecx, %r8d
-	andl	%r8d, %eax
-	testl	%eax, %eax
-	/* Return null pointer if the null byte comes first.  */
-	jz	L(return_null)
+	testl	%ecx, %ecx
+	jz	L(page_cross_continue)
+
+	/* Found zero CHAR so need to test for search CHAR.  */
+	VPCMP	$0, %YMMMATCH, %YMM1, %k1
+	kmovd	%k1, %eax
+	/* Shift out search CHAR matches that are before the beginning of
+	   src (rdi).  */
+	shrxl	%SHIFT_REG, %eax, %eax
+
+	/* Check if any search CHAR match in range.  */
+	blsmskl	%ecx, %ecx
+	andl	%ecx, %eax
+	jz	L(ret3)
 	bsrl	%eax, %eax
 # ifdef USE_AS_WCSRCHR
-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
-	leaq	-VEC_SIZE(%rdi, %rax, 4), %rax
+	leaq	(%rdi, %rax, CHAR_SIZE), %rax
 # else
-	leaq	-VEC_SIZE(%rdi, %rax), %rax
+	addq	%rdi, %rax
 # endif
+L(ret3):
 	ret
 
-	.p2align 4
-L(return_null):
-	xorl	%eax, %eax
-	ret
-
-END (STRRCHR)
+END(STRRCHR)
 #endif
-- 
GitLab