| commit 0a11305416e287d85c64f04337cfd64b6b350e0c |
| Author: Noah Goldstein <goldstein.w.n@gmail.com> |
| Date: Thu Apr 21 20:52:28 2022 -0500 |
| |
| x86: Optimize {str|wcs}rchr-sse2 |
| |
| The new code unrolls the main loop slightly without adding too much |
| overhead and minimizes the comparisons for the search CHAR. |
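| |
| As an illustration only (not part of the patch), here is a minimal |
| C sketch of the loop strategy, using SSE2 intrinsics; the names are |
| invented, and the assembly's header and page-cross handling is |
| replaced by an alignment assumption: |
| |
| #include <emmintrin.h> |
| #include <stddef.h> |
| |
| char * |
| strrchr_sketch (const char *s, int c) |
| { |
|   /* Assumes S is 32-byte aligned so the aligned loads below never |
|      cross a page; the real code reaches this state via its header |
|      and page-cross paths.  */ |
|   const __m128i zero = _mm_setzero_si128 (); |
|   const __m128i ch = _mm_set1_epi8 ((char) c); |
|   const char *last_base = NULL; |
|   unsigned int last_mask = 0; |
| |
|   for (;; s += 32) |
|     { |
|       __m128i v0 = _mm_load_si128 ((const __m128i *) s); |
|       __m128i v1 = _mm_load_si128 ((const __m128i *) (s + 16)); |
|       /* A byte of pminub (v0, v1) is zero iff either input byte is |
|          zero, so one compare against zero covers both vectors.  */ |
|       unsigned int z = _mm_movemask_epi8 |
|         (_mm_cmpeq_epi8 (_mm_min_epu8 (v0, v1), zero)); |
|       unsigned int m0 = _mm_movemask_epi8 (_mm_cmpeq_epi8 (v0, ch)); |
|       unsigned int m1 = _mm_movemask_epi8 (_mm_cmpeq_epi8 (v1, ch)); |
|       unsigned int m = m0 | (m1 << 16); |
|       if (z == 0) |
|         { |
|           /* No null term: just remember the most recent block with |
|              a match; no result work happens in the hot loop.  */ |
|           if (m != 0) |
|             { |
|               last_base = s; |
|               last_mask = m; |
|             } |
|           continue; |
|         } |
|       /* Null term is in this block: drop matches past the first |
|          null, then the highest surviving bit is the answer.  */ |
|       unsigned int z0 = _mm_movemask_epi8 (_mm_cmpeq_epi8 (v0, zero)); |
|       unsigned int z1 = _mm_movemask_epi8 (_mm_cmpeq_epi8 (v1, zero)); |
|       unsigned int zz = z0 | (z1 << 16); |
|       m &= zz ^ (zz - 1);  /* Bits up to/including first null.  */ |
|       if (m != 0) |
|         return (char *) (s + (31 - __builtin_clz (m))); |
|       if (last_mask != 0) |
|         return (char *) (last_base + (31 - __builtin_clz (last_mask))); |
|       return NULL; |
|     } |
| } |
| |
| The assembly keeps the same loop state (a saved pointer plus raw |
| match masks) and likewise defers all bsr/result work until a null |
| is seen. |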
| |
| Geometric Mean of all benchmarks New / Old: 0.741 |
| See email for all results. |
| |
| Full xcheck passes on x86_64 with and without multiarch enabled. |
| Reviewed-by: H.J. Lu <hjl.tools@gmail.com> |
| |
| (cherry picked from commit 5307aa9c1800f36a64c183c091c9af392c1fa75c) |
| |
| diff --git a/sysdeps/x86_64/multiarch/strrchr-sse2.S b/sysdeps/x86_64/multiarch/strrchr-sse2.S |
| index 67c30d0260cef8a3..a56300bc1830dedd 100644 |
| |
| |
| @@ -17,7 +17,7 @@ |
| <https://www.gnu.org/licenses/>. */ |
| |
| #if IS_IN (libc) |
| -# define strrchr __strrchr_sse2 |
| +# define STRRCHR __strrchr_sse2 |
| |
| # undef weak_alias |
| # define weak_alias(strrchr, rindex) |
| diff --git a/sysdeps/x86_64/multiarch/wcsrchr-sse2.S b/sysdeps/x86_64/multiarch/wcsrchr-sse2.S |
| index a36034b40afe8d3d..00f69f2be77a43a0 100644 |
| |
| |
| @@ -17,7 +17,6 @@ |
| <https://www.gnu.org/licenses/>. */ |
| |
| #if IS_IN (libc) |
| -# define wcsrchr __wcsrchr_sse2 |
| +# define STRRCHR __wcsrchr_sse2 |
| #endif |
| - |
| #include "../wcsrchr.S" |
| diff --git a/sysdeps/x86_64/strrchr.S b/sysdeps/x86_64/strrchr.S |
| index dfd09fe9508cb5bc..fc1598bb11417fd5 100644 |
| |
| |
| @@ -19,210 +19,360 @@ |
| |
| #include <sysdep.h> |
| |
| +#ifndef STRRCHR |
| +# define STRRCHR strrchr |
| +#endif |
| + |
| +#ifdef USE_AS_WCSRCHR |
| +# define PCMPEQ pcmpeqd |
| +# define CHAR_SIZE 4 |
| +# define PMINU pminud |
| +#else |
| +# define PCMPEQ pcmpeqb |
| +# define CHAR_SIZE 1 |
| +# define PMINU pminub |
| +#endif |
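| + |
| +/* wcsrchr.S reuses this source to build wcsrchr: it defines |
| + USE_AS_WCSRCHR and NO_PMINU before including this file. */ |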
| + |
| +#define PAGE_SIZE 4096 |
| +#define VEC_SIZE 16 |
| + |
| .text |
| -ENTRY (strrchr) |
| - movd %esi, %xmm1 |
| +ENTRY(STRRCHR) |
| + movd %esi, %xmm0 |
| movq %rdi, %rax |
| - andl $4095, %eax |
| - punpcklbw %xmm1, %xmm1 |
| - cmpq $4032, %rax |
| - punpcklwd %xmm1, %xmm1 |
| - pshufd $0, %xmm1, %xmm1 |
| + andl $(PAGE_SIZE - 1), %eax |
| +#ifndef USE_AS_WCSRCHR |
| + punpcklbw %xmm0, %xmm0 |
| + punpcklwd %xmm0, %xmm0 |
| +#endif |
| + pshufd $0, %xmm0, %xmm0 |
| + cmpl $(PAGE_SIZE - VEC_SIZE), %eax |
| ja L(cross_page) |
| - movdqu (%rdi), %xmm0 |
| + |
| +L(cross_page_continue): |
| + movups (%rdi), %xmm1 |
| pxor %xmm2, %xmm2 |
| - movdqa %xmm0, %xmm3 |
| - pcmpeqb %xmm1, %xmm0 |
| - pcmpeqb %xmm2, %xmm3 |
| - pmovmskb %xmm0, %ecx |
| - pmovmskb %xmm3, %edx |
| - testq %rdx, %rdx |
| - je L(next_48_bytes) |
| - leaq -1(%rdx), %rax |
| - xorq %rdx, %rax |
| - andq %rcx, %rax |
| - je L(exit) |
| - bsrq %rax, %rax |
| + PCMPEQ %xmm1, %xmm2 |
| + pmovmskb %xmm2, %ecx |
| + testl %ecx, %ecx |
| + jz L(aligned_more) |
| + |
| + PCMPEQ %xmm0, %xmm1 |
| + pmovmskb %xmm1, %eax |
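| + /* ecx ^ (ecx - 1) keeps only the bits up to and including the |
| + first null-match bit, so the following `andl` drops CHAR |
| + matches past the end of the string. */ |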
| + leal -1(%rcx), %edx |
| + xorl %edx, %ecx |
| + andl %ecx, %eax |
| + jz L(ret0) |
| + bsrl %eax, %eax |
| addq %rdi, %rax |
| + /* We are off by 3 for wcsrchr if search CHAR is non-zero. If |
| + search CHAR is zero we are correct. Either way `andq |
| + $-CHAR_SIZE, %rax` gets the correct result. */ |
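| + /* (For wcsrchr, pcmpeqd + pmovmskb set four mask bits per |
| + matching wchar, so bsrl lands on the wchar's last byte, 3 |
| + past its start.) */ |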
| +#ifdef USE_AS_WCSRCHR |
| + andq $-CHAR_SIZE, %rax |
| +#endif |
| +L(ret0): |
| ret |
| |
| + /* The return paths for the first VEC x1/x2 have a hard-coded |
| + backward search for earlier matches. */ |
| .p2align 4 |
| -L(next_48_bytes): |
| - movdqu 16(%rdi), %xmm4 |
| - movdqa %xmm4, %xmm5 |
| - movdqu 32(%rdi), %xmm3 |
| - pcmpeqb %xmm1, %xmm4 |
| - pcmpeqb %xmm2, %xmm5 |
| - movdqu 48(%rdi), %xmm0 |
| - pmovmskb %xmm5, %edx |
| - movdqa %xmm3, %xmm5 |
| - pcmpeqb %xmm1, %xmm3 |
| - pcmpeqb %xmm2, %xmm5 |
| - pcmpeqb %xmm0, %xmm2 |
| - salq $16, %rdx |
| - pmovmskb %xmm3, %r8d |
| - pmovmskb %xmm5, %eax |
| - pmovmskb %xmm2, %esi |
| - salq $32, %r8 |
| - salq $32, %rax |
| - pcmpeqb %xmm1, %xmm0 |
| - orq %rdx, %rax |
| - movq %rsi, %rdx |
| - pmovmskb %xmm4, %esi |
| - salq $48, %rdx |
| - salq $16, %rsi |
| - orq %r8, %rsi |
| - orq %rcx, %rsi |
| - pmovmskb %xmm0, %ecx |
| - salq $48, %rcx |
| - orq %rcx, %rsi |
| - orq %rdx, %rax |
| - je L(loop_header2) |
| - leaq -1(%rax), %rcx |
| - xorq %rax, %rcx |
| - andq %rcx, %rsi |
| - je L(exit) |
| - bsrq %rsi, %rsi |
| - leaq (%rdi,%rsi), %rax |
| +L(first_vec_x0_test): |
| + PCMPEQ %xmm0, %xmm1 |
| + pmovmskb %xmm1, %eax |
| + testl %eax, %eax |
| + jz L(ret0) |
| + bsrl %eax, %eax |
| + addq %r8, %rax |
| +#ifdef USE_AS_WCSRCHR |
| + andq $-CHAR_SIZE, %rax |
| +#endif |
| ret |
| |
| .p2align 4 |
| -L(loop_header2): |
| - testq %rsi, %rsi |
| - movq %rdi, %rcx |
| - je L(no_c_found) |
| -L(loop_header): |
| - addq $64, %rdi |
| - pxor %xmm7, %xmm7 |
| - andq $-64, %rdi |
| - jmp L(loop_entry) |
| +L(first_vec_x1): |
| + PCMPEQ %xmm0, %xmm2 |
| + pmovmskb %xmm2, %eax |
| + leal -1(%rcx), %edx |
| + xorl %edx, %ecx |
| + andl %ecx, %eax |
| + jz L(first_vec_x0_test) |
| + bsrl %eax, %eax |
| + leaq (VEC_SIZE)(%rdi, %rax), %rax |
| +#ifdef USE_AS_WCSRCHR |
| + andq $-CHAR_SIZE, %rax |
| +#endif |
| + ret |
| |
| .p2align 4 |
| -L(loop64): |
| - testq %rdx, %rdx |
| - cmovne %rdx, %rsi |
| - cmovne %rdi, %rcx |
| - addq $64, %rdi |
| -L(loop_entry): |
| - movdqa 32(%rdi), %xmm3 |
| - pxor %xmm6, %xmm6 |
| - movdqa 48(%rdi), %xmm2 |
| - movdqa %xmm3, %xmm0 |
| - movdqa 16(%rdi), %xmm4 |
| - pminub %xmm2, %xmm0 |
| - movdqa (%rdi), %xmm5 |
| - pminub %xmm4, %xmm0 |
| - pminub %xmm5, %xmm0 |
| - pcmpeqb %xmm7, %xmm0 |
| - pmovmskb %xmm0, %eax |
| - movdqa %xmm5, %xmm0 |
| - pcmpeqb %xmm1, %xmm0 |
| - pmovmskb %xmm0, %r9d |
| - movdqa %xmm4, %xmm0 |
| - pcmpeqb %xmm1, %xmm0 |
| - pmovmskb %xmm0, %edx |
| - movdqa %xmm3, %xmm0 |
| - pcmpeqb %xmm1, %xmm0 |
| - salq $16, %rdx |
| - pmovmskb %xmm0, %r10d |
| - movdqa %xmm2, %xmm0 |
| - pcmpeqb %xmm1, %xmm0 |
| - salq $32, %r10 |
| - orq %r10, %rdx |
| - pmovmskb %xmm0, %r8d |
| - orq %r9, %rdx |
| - salq $48, %r8 |
| - orq %r8, %rdx |
| +L(first_vec_x1_test): |
| + PCMPEQ %xmm0, %xmm2 |
| + pmovmskb %xmm2, %eax |
| testl %eax, %eax |
| - je L(loop64) |
| - pcmpeqb %xmm6, %xmm4 |
| - pcmpeqb %xmm6, %xmm3 |
| - pcmpeqb %xmm6, %xmm5 |
| - pmovmskb %xmm4, %eax |
| - pmovmskb %xmm3, %r10d |
| - pcmpeqb %xmm6, %xmm2 |
| - pmovmskb %xmm5, %r9d |
| - salq $32, %r10 |
| - salq $16, %rax |
| - pmovmskb %xmm2, %r8d |
| - orq %r10, %rax |
| - orq %r9, %rax |
| - salq $48, %r8 |
| - orq %r8, %rax |
| - leaq -1(%rax), %r8 |
| - xorq %rax, %r8 |
| - andq %r8, %rdx |
| - cmovne %rdi, %rcx |
| - cmovne %rdx, %rsi |
| - bsrq %rsi, %rsi |
| - leaq (%rcx,%rsi), %rax |
| + jz L(first_vec_x0_test) |
| + bsrl %eax, %eax |
| + leaq (VEC_SIZE)(%rdi, %rax), %rax |
| +#ifdef USE_AS_WCSRCHR |
| + andq $-CHAR_SIZE, %rax |
| +#endif |
| + ret |
| + |
| + .p2align 4 |
| +L(first_vec_x2): |
| + PCMPEQ %xmm0, %xmm3 |
| + pmovmskb %xmm3, %eax |
| + leal -1(%rcx), %edx |
| + xorl %edx, %ecx |
| + andl %ecx, %eax |
| + jz L(first_vec_x1_test) |
| + bsrl %eax, %eax |
| + leaq (VEC_SIZE * 2)(%rdi, %rax), %rax |
| +#ifdef USE_AS_WCSRCHR |
| + andq $-CHAR_SIZE, %rax |
| +#endif |
| + ret |
| + |
| + .p2align 4 |
| +L(aligned_more): |
| + /* Save original pointer if match was in VEC 0. */ |
| + movq %rdi, %r8 |
| + andq $-VEC_SIZE, %rdi |
| + |
| + movaps VEC_SIZE(%rdi), %xmm2 |
| + pxor %xmm3, %xmm3 |
| + PCMPEQ %xmm2, %xmm3 |
| + pmovmskb %xmm3, %ecx |
| + testl %ecx, %ecx |
| + jnz L(first_vec_x1) |
| + |
| + movaps (VEC_SIZE * 2)(%rdi), %xmm3 |
| + pxor %xmm4, %xmm4 |
| + PCMPEQ %xmm3, %xmm4 |
| + pmovmskb %xmm4, %ecx |
| + testl %ecx, %ecx |
| + jnz L(first_vec_x2) |
| + |
| + addq $VEC_SIZE, %rdi |
| + /* Save pointer again before realigning. */ |
| + movq %rdi, %rsi |
| + andq $-(VEC_SIZE * 2), %rdi |
| + .p2align 4 |
| +L(first_loop): |
| + /* Do 2x VEC at a time. */ |
| + movaps (VEC_SIZE * 2)(%rdi), %xmm4 |
| + movaps (VEC_SIZE * 3)(%rdi), %xmm5 |
| + /* Since SSE2 has no pminud, wcsrchr needs separate logic for |
| + detecting zero. Note: if this is found to be a bottleneck it |
| + may be worth adding an SSE4.1 wcsrchr implementation. */ |
| +#ifdef NO_PMINU |
| + movaps %xmm5, %xmm6 |
| + pxor %xmm8, %xmm8 |
| + |
| + PCMPEQ %xmm8, %xmm5 |
| + PCMPEQ %xmm4, %xmm8 |
| + por %xmm5, %xmm8 |
| +#else |
| + movaps %xmm5, %xmm6 |
| + PMINU %xmm4, %xmm5 |
| +#endif |
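| + /* For strrchr, a byte of PMINU(%xmm4, %xmm5) is zero iff the |
| + corresponding byte of either vector is zero, so the single |
| + PCMPEQ against zero below tests both vectors at once. */ |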
| + |
| + movaps %xmm4, %xmm9 |
| + PCMPEQ %xmm0, %xmm4 |
| + PCMPEQ %xmm0, %xmm6 |
| + movaps %xmm6, %xmm7 |
| + por %xmm4, %xmm6 |
| +#ifndef NO_PMINU |
| + pxor %xmm8, %xmm8 |
| + PCMPEQ %xmm5, %xmm8 |
| +#endif |
| + pmovmskb %xmm8, %ecx |
| + pmovmskb %xmm6, %eax |
| + |
| + addq $(VEC_SIZE * 2), %rdi |
| + /* Use `addl` so that 1) we can undo it with `subl` and 2) it |
| + can macro-fuse with `jz`. The sum is zero iff both 16-bit |
| + masks are zero, so a single flag test covers the null and |
| + CHAR checks. */ |
| + addl %ecx, %eax |
| + jz L(first_loop) |
| + |
| + /* Check if there is a zero match. */ |
| + testl %ecx, %ecx |
| + jz L(second_loop_match) |
| + |
| + /* Check if there was a match in the last iteration. */ |
| + subl %ecx, %eax |
| + jnz L(new_match) |
| + |
| +L(first_loop_old_match): |
| + PCMPEQ %xmm0, %xmm2 |
| + PCMPEQ %xmm0, %xmm3 |
| + pmovmskb %xmm2, %ecx |
| + pmovmskb %xmm3, %eax |
| + addl %eax, %ecx |
| + jz L(first_vec_x0_test) |
| + /* NB: We could move this shift to before the branch and save a |
| + bit of code size / performance on the fall through. The |
| + branch leads to the null case, which generally seems hotter |
| + than a CHAR match in the first 3x VEC. */ |
| + sall $16, %eax |
| + orl %ecx, %eax |
| + |
| + bsrl %eax, %eax |
| + addq %rsi, %rax |
| +#ifdef USE_AS_WCSRCHR |
| + andq $-CHAR_SIZE, %rax |
| +#endif |
| + ret |
| + |
| + .p2align 4 |
| +L(new_match): |
| + pxor %xmm6, %xmm6 |
| + PCMPEQ %xmm9, %xmm6 |
| + pmovmskb %xmm6, %eax |
| + sall $16, %ecx |
| + orl %eax, %ecx |
| + |
| + /* We can't reuse either of the old comparisons: since we mask |
| + off zeros after the first zero (instead of using the full |
| + comparison) we can't guarantee no interference between a |
| + match after the end of the string and a valid match. */ |
| + pmovmskb %xmm4, %eax |
| + pmovmskb %xmm7, %edx |
| + sall $16, %edx |
| + orl %edx, %eax |
| + |
| + leal -1(%ecx), %edx |
| + xorl %edx, %ecx |
| + andl %ecx, %eax |
| + jz L(first_loop_old_match) |
| + bsrl %eax, %eax |
| + addq %rdi, %rax |
| +#ifdef USE_AS_WCSRCHR |
| + andq $-CHAR_SIZE, %rax |
| +#endif |
| ret |
| |
| + /* Save minimum state for getting most recent match. We can |
| + throw out all previous work. */ |
| .p2align 4 |
| -L(no_c_found): |
| - movl $1, %esi |
| - xorl %ecx, %ecx |
| - jmp L(loop_header) |
| +L(second_loop_match): |
| + movq %rdi, %rsi |
| + movaps %xmm4, %xmm2 |
| + movaps %xmm7, %xmm3 |
| |
| .p2align 4 |
| -L(exit): |
| - xorl %eax, %eax |
| +L(second_loop): |
| + movaps (VEC_SIZE * 2)(%rdi), %xmm4 |
| + movaps (VEC_SIZE * 3)(%rdi), %xmm5 |
| + /* Since SSE2 has no pminud, wcsrchr needs separate logic for |
| + detecting zero. Note: if this is found to be a bottleneck it |
| + may be worth adding an SSE4.1 wcsrchr implementation. */ |
| +#ifdef NO_PMINU |
| + movaps %xmm5, %xmm6 |
| + pxor %xmm8, %xmm8 |
| + |
| + PCMPEQ %xmm8, %xmm5 |
| + PCMPEQ %xmm4, %xmm8 |
| + por %xmm5, %xmm8 |
| +#else |
| + movaps %xmm5, %xmm6 |
| + PMINU %xmm4, %xmm5 |
| +#endif |
| + |
| + movaps %xmm4, %xmm9 |
| + PCMPEQ %xmm0, %xmm4 |
| + PCMPEQ %xmm0, %xmm6 |
| + movaps %xmm6, %xmm7 |
| + por %xmm4, %xmm6 |
| +#ifndef NO_PMINU |
| + pxor %xmm8, %xmm8 |
| + PCMPEQ %xmm5, %xmm8 |
| +#endif |
| + |
| + pmovmskb %xmm8, %ecx |
| + pmovmskb %xmm6, %eax |
| + |
| + addq $(VEC_SIZE * 2), %rdi |
| + /* Either null term or new occurrence of CHAR. */ |
| + addl %ecx, %eax |
| + jz L(second_loop) |
| + |
| + /* No null term, so it must be a new occurrence of CHAR. */ |
| + testl %ecx, %ecx |
| + jz L(second_loop_match) |
| + |
| + |
| + subl %ecx, %eax |
| + jnz L(second_loop_new_match) |
| + |
| +L(second_loop_old_match): |
| + pmovmskb %xmm2, %ecx |
| + pmovmskb %xmm3, %eax |
| + sall $16, %eax |
| + orl %ecx, %eax |
| + bsrl %eax, %eax |
| + addq %rsi, %rax |
| +#ifdef USE_AS_WCSRCHR |
| + andq $-CHAR_SIZE, %rax |
| +#endif |
| ret |
| |
| .p2align 4 |
| +L(second_loop_new_match): |
| + pxor %xmm6, %xmm6 |
| + PCMPEQ %xmm9, %xmm6 |
| + pmovmskb %xmm6, %eax |
| + sall $16, %ecx |
| + orl %eax, %ecx |
| + |
| + /* We can't reuse either of the old comparisons: since we mask |
| + off zeros after the first zero (instead of using the full |
| + comparison) we can't guarantee no interference between a |
| + match after the end of the string and a valid match. */ |
| + pmovmskb %xmm4, %eax |
| + pmovmskb %xmm7, %edx |
| + sall $16, %edx |
| + orl %edx, %eax |
| + |
| + leal -1(%ecx), %edx |
| + xorl %edx, %ecx |
| + andl %ecx, %eax |
| + jz L(second_loop_old_match) |
| + bsrl %eax, %eax |
| + addq %rdi, %rax |
| +#ifdef USE_AS_WCSRCHR |
| + andq $-CHAR_SIZE, %rax |
| +#endif |
| + ret |
| + |
| + .p2align 4,, 4 |
| L(cross_page): |
| - movq %rdi, %rax |
| - pxor %xmm0, %xmm0 |
| - andq $-64, %rax |
| - movdqu (%rax), %xmm5 |
| - movdqa %xmm5, %xmm6 |
| - movdqu 16(%rax), %xmm4 |
| - pcmpeqb %xmm1, %xmm5 |
| - pcmpeqb %xmm0, %xmm6 |
| - movdqu 32(%rax), %xmm3 |
| - pmovmskb %xmm6, %esi |
| - movdqa %xmm4, %xmm6 |
| - movdqu 48(%rax), %xmm2 |
| - pcmpeqb %xmm1, %xmm4 |
| - pcmpeqb %xmm0, %xmm6 |
| - pmovmskb %xmm6, %edx |
| - movdqa %xmm3, %xmm6 |
| - pcmpeqb %xmm1, %xmm3 |
| - pcmpeqb %xmm0, %xmm6 |
| - pcmpeqb %xmm2, %xmm0 |
| - salq $16, %rdx |
| - pmovmskb %xmm3, %r9d |
| - pmovmskb %xmm6, %r8d |
| - pmovmskb %xmm0, %ecx |
| - salq $32, %r9 |
| - salq $32, %r8 |
| - pcmpeqb %xmm1, %xmm2 |
| - orq %r8, %rdx |
| - salq $48, %rcx |
| - pmovmskb %xmm5, %r8d |
| - orq %rsi, %rdx |
| - pmovmskb %xmm4, %esi |
| - orq %rcx, %rdx |
| - pmovmskb %xmm2, %ecx |
| - salq $16, %rsi |
| - salq $48, %rcx |
| - orq %r9, %rsi |
| - orq %r8, %rsi |
| - orq %rcx, %rsi |
| + movq %rdi, %rsi |
| + andq $-VEC_SIZE, %rsi |
| + movaps (%rsi), %xmm1 |
| + pxor %xmm2, %xmm2 |
| + PCMPEQ %xmm1, %xmm2 |
| + pmovmskb %xmm2, %edx |
| movl %edi, %ecx |
| - subl %eax, %ecx |
| - shrq %cl, %rdx |
| - shrq %cl, %rsi |
| - testq %rdx, %rdx |
| - je L(loop_header2) |
| - leaq -1(%rdx), %rax |
| - xorq %rdx, %rax |
| - andq %rax, %rsi |
| - je L(exit) |
| - bsrq %rsi, %rax |
| + andl $(VEC_SIZE - 1), %ecx |
| + sarl %cl, %edx |
| + jz L(cross_page_continue) |
| + PCMPEQ %xmm0, %xmm1 |
| + pmovmskb %xmm1, %eax |
| + sarl %cl, %eax |
| + leal -1(%rdx), %ecx |
| + xorl %edx, %ecx |
| + andl %ecx, %eax |
| + jz L(ret1) |
| + bsrl %eax, %eax |
| addq %rdi, %rax |
| +#ifdef USE_AS_WCSRCHR |
| + andq $-CHAR_SIZE, %rax |
| +#endif |
| +L(ret1): |
| ret |
| -END (strrchr) |
| +END(STRRCHR) |
| |
| -weak_alias (strrchr, rindex) |
| -libc_hidden_builtin_def (strrchr) |
| +#ifndef USE_AS_WCSRCHR |
| + weak_alias (STRRCHR, rindex) |
| + libc_hidden_builtin_def (STRRCHR) |
| +#endif |
| diff --git a/sysdeps/x86_64/wcsrchr.S b/sysdeps/x86_64/wcsrchr.S |
| index 6b318d3f29de9a9e..9006f2220963d76c 100644 |
| |
| |
| @@ -17,266 +17,12 @@ |
| License along with the GNU C Library; if not, see |
| <https://www.gnu.org/licenses/>. */ |
| |
| -#include <sysdep.h> |
| |
| - .text |
| -ENTRY (wcsrchr) |
| +#define USE_AS_WCSRCHR 1 |
| +#define NO_PMINU 1 |
| |
| - movd %rsi, %xmm1 |
| - mov %rdi, %rcx |
| - punpckldq %xmm1, %xmm1 |
| - pxor %xmm2, %xmm2 |
| - punpckldq %xmm1, %xmm1 |
| - and $63, %rcx |
| - cmp $48, %rcx |
| - ja L(crosscache) |
| +#ifndef STRRCHR |
| +# define STRRCHR wcsrchr |
| +#endif |
| |
| - movdqu (%rdi), %xmm0 |
| - pcmpeqd %xmm0, %xmm2 |
| - pcmpeqd %xmm1, %xmm0 |
| - pmovmskb %xmm2, %rcx |
| - pmovmskb %xmm0, %rax |
| - add $16, %rdi |
| - |
| - test %rax, %rax |
| - jnz L(unaligned_match1) |
| - |
| - test %rcx, %rcx |
| - jnz L(return_null) |
| - |
| - and $-16, %rdi |
| - xor %r8, %r8 |
| - jmp L(loop) |
| - |
| - .p2align 4 |
| -L(unaligned_match1): |
| - test %rcx, %rcx |
| - jnz L(prolog_find_zero_1) |
| - |
| - mov %rax, %r8 |
| - mov %rdi, %rsi |
| - and $-16, %rdi |
| - jmp L(loop) |
| - |
| - .p2align 4 |
| -L(crosscache): |
| - and $15, %rcx |
| - and $-16, %rdi |
| - pxor %xmm3, %xmm3 |
| - movdqa (%rdi), %xmm0 |
| - pcmpeqd %xmm0, %xmm3 |
| - pcmpeqd %xmm1, %xmm0 |
| - pmovmskb %xmm3, %rdx |
| - pmovmskb %xmm0, %rax |
| - shr %cl, %rdx |
| - shr %cl, %rax |
| - add $16, %rdi |
| - |
| - test %rax, %rax |
| - jnz L(unaligned_match) |
| - |
| - test %rdx, %rdx |
| - jnz L(return_null) |
| - |
| - xor %r8, %r8 |
| - jmp L(loop) |
| - |
| - .p2align 4 |
| -L(unaligned_match): |
| - test %rdx, %rdx |
| - jnz L(prolog_find_zero) |
| - |
| - mov %rax, %r8 |
| - lea (%rdi, %rcx), %rsi |
| - |
| -/* Loop start on aligned string. */ |
| - .p2align 4 |
| -L(loop): |
| - movdqa (%rdi), %xmm0 |
| - pcmpeqd %xmm0, %xmm2 |
| - add $16, %rdi |
| - pcmpeqd %xmm1, %xmm0 |
| - pmovmskb %xmm2, %rcx |
| - pmovmskb %xmm0, %rax |
| - or %rax, %rcx |
| - jnz L(matches) |
| - |
| - movdqa (%rdi), %xmm3 |
| - pcmpeqd %xmm3, %xmm2 |
| - add $16, %rdi |
| - pcmpeqd %xmm1, %xmm3 |
| - pmovmskb %xmm2, %rcx |
| - pmovmskb %xmm3, %rax |
| - or %rax, %rcx |
| - jnz L(matches) |
| - |
| - movdqa (%rdi), %xmm4 |
| - pcmpeqd %xmm4, %xmm2 |
| - add $16, %rdi |
| - pcmpeqd %xmm1, %xmm4 |
| - pmovmskb %xmm2, %rcx |
| - pmovmskb %xmm4, %rax |
| - or %rax, %rcx |
| - jnz L(matches) |
| - |
| - movdqa (%rdi), %xmm5 |
| - pcmpeqd %xmm5, %xmm2 |
| - add $16, %rdi |
| - pcmpeqd %xmm1, %xmm5 |
| - pmovmskb %xmm2, %rcx |
| - pmovmskb %xmm5, %rax |
| - or %rax, %rcx |
| - jz L(loop) |
| - |
| - .p2align 4 |
| -L(matches): |
| - test %rax, %rax |
| - jnz L(match) |
| -L(return_value): |
| - test %r8, %r8 |
| - jz L(return_null) |
| - mov %r8, %rax |
| - mov %rsi, %rdi |
| - |
| - test $15 << 4, %ah |
| - jnz L(match_fourth_wchar) |
| - test %ah, %ah |
| - jnz L(match_third_wchar) |
| - test $15 << 4, %al |
| - jnz L(match_second_wchar) |
| - lea -16(%rdi), %rax |
| - ret |
| - |
| - .p2align 4 |
| -L(match): |
| - pmovmskb %xmm2, %rcx |
| - test %rcx, %rcx |
| - jnz L(find_zero) |
| - mov %rax, %r8 |
| - mov %rdi, %rsi |
| - jmp L(loop) |
| - |
| - .p2align 4 |
| -L(find_zero): |
| - test $15, %cl |
| - jnz L(find_zero_in_first_wchar) |
| - test %cl, %cl |
| - jnz L(find_zero_in_second_wchar) |
| - test $15, %ch |
| - jnz L(find_zero_in_third_wchar) |
| - |
| - and $1 << 13 - 1, %rax |
| - jz L(return_value) |
| - |
| - test $15 << 4, %ah |
| - jnz L(match_fourth_wchar) |
| - test %ah, %ah |
| - jnz L(match_third_wchar) |
| - test $15 << 4, %al |
| - jnz L(match_second_wchar) |
| - lea -16(%rdi), %rax |
| - ret |
| - |
| - .p2align 4 |
| -L(find_zero_in_first_wchar): |
| - test $1, %rax |
| - jz L(return_value) |
| - lea -16(%rdi), %rax |
| - ret |
| - |
| - .p2align 4 |
| -L(find_zero_in_second_wchar): |
| - and $1 << 5 - 1, %rax |
| - jz L(return_value) |
| - |
| - test $15 << 4, %al |
| - jnz L(match_second_wchar) |
| - lea -16(%rdi), %rax |
| - ret |
| - |
| - .p2align 4 |
| -L(find_zero_in_third_wchar): |
| - and $1 << 9 - 1, %rax |
| - jz L(return_value) |
| - |
| - test %ah, %ah |
| - jnz L(match_third_wchar) |
| - test $15 << 4, %al |
| - jnz L(match_second_wchar) |
| - lea -16(%rdi), %rax |
| - ret |
| - |
| - .p2align 4 |
| -L(prolog_find_zero): |
| - add %rcx, %rdi |
| - mov %rdx, %rcx |
| -L(prolog_find_zero_1): |
| - test $15, %cl |
| - jnz L(prolog_find_zero_in_first_wchar) |
| - test %cl, %cl |
| - jnz L(prolog_find_zero_in_second_wchar) |
| - test $15, %ch |
| - jnz L(prolog_find_zero_in_third_wchar) |
| - |
| - and $1 << 13 - 1, %rax |
| - jz L(return_null) |
| - |
| - test $15 << 4, %ah |
| - jnz L(match_fourth_wchar) |
| - test %ah, %ah |
| - jnz L(match_third_wchar) |
| - test $15 << 4, %al |
| - jnz L(match_second_wchar) |
| - lea -16(%rdi), %rax |
| - ret |
| - |
| - .p2align 4 |
| -L(prolog_find_zero_in_first_wchar): |
| - test $1, %rax |
| - jz L(return_null) |
| - lea -16(%rdi), %rax |
| - ret |
| - |
| - .p2align 4 |
| -L(prolog_find_zero_in_second_wchar): |
| - and $1 << 5 - 1, %rax |
| - jz L(return_null) |
| - |
| - test $15 << 4, %al |
| - jnz L(match_second_wchar) |
| - lea -16(%rdi), %rax |
| - ret |
| - |
| - .p2align 4 |
| -L(prolog_find_zero_in_third_wchar): |
| - and $1 << 9 - 1, %rax |
| - jz L(return_null) |
| - |
| - test %ah, %ah |
| - jnz L(match_third_wchar) |
| - test $15 << 4, %al |
| - jnz L(match_second_wchar) |
| - lea -16(%rdi), %rax |
| - ret |
| - |
| - .p2align 4 |
| -L(match_second_wchar): |
| - lea -12(%rdi), %rax |
| - ret |
| - |
| - .p2align 4 |
| -L(match_third_wchar): |
| - lea -8(%rdi), %rax |
| - ret |
| - |
| - .p2align 4 |
| -L(match_fourth_wchar): |
| - lea -4(%rdi), %rax |
| - ret |
| - |
| - .p2align 4 |
| -L(return_null): |
| - xor %rax, %rax |
| - ret |
| - |
| -END (wcsrchr) |
| +#include "../strrchr.S" |