commit 0a11305416e287d85c64f04337cfd64b6b350e0c
Author: Noah Goldstein <goldstein.w.n@gmail.com>
Date:   Thu Apr 21 20:52:28 2022 -0500

    x86: Optimize {str|wcs}rchr-sse2

    The new code unrolls the main loop slightly without adding too much
    overhead and minimizes the comparisons for the search CHAR.

    Geometric Mean of all benchmarks New / Old: 0.741
    See email for all results.

    Full xcheck passes on x86_64 with and without multiarch enabled.
    Reviewed-by: H.J. Lu <hjl.tools@gmail.com>

    (cherry picked from commit 5307aa9c1800f36a64c183c091c9af392c1fa75c)

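For reference, both routines implement the usual strrchr/wcsrchr contract: return a pointer to the last occurrence of the search CHAR, with the null terminator counting as part of the string. A minimal scalar model of what the SIMD code below computes (an illustration only, not the glibc implementation):

	/* Scalar model of strrchr: remember the most recent match and
	   report it once the terminator is reached.  Note that when
	   C is '\0' the terminator itself is the match.  */
	#include <stddef.h>

	static const char *
	strrchr_model (const char *s, int c)
	{
	  const char *last = NULL;
	  for (;; ++s)
	    {
	      if (*s == (char) c)
		last = s;
	      if (*s == '\0')
		return last;
	    }
	}

The patch vectorizes this 16 bytes (one VEC_SIZE) at a time: each step compares a vector against zero and against the broadcast CHAR, and the unrolled main loop only resolves the exact match position once a terminator is seen, which is how it "minimizes the comparisons for the search CHAR."
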
diff --git a/sysdeps/x86_64/multiarch/strrchr-sse2.S b/sysdeps/x86_64/multiarch/strrchr-sse2.S
index 67c30d0260cef8a3..a56300bc1830dedd 100644
--- a/sysdeps/x86_64/multiarch/strrchr-sse2.S
+++ b/sysdeps/x86_64/multiarch/strrchr-sse2.S
@@ -17,7 +17,7 @@
    <https://www.gnu.org/licenses/>.  */
 
 #if IS_IN (libc)
-# define strrchr __strrchr_sse2
+# define STRRCHR __strrchr_sse2
 
 # undef weak_alias
 # define weak_alias(strrchr, rindex)
diff --git a/sysdeps/x86_64/multiarch/wcsrchr-sse2.S b/sysdeps/x86_64/multiarch/wcsrchr-sse2.S
index a36034b40afe8d3d..00f69f2be77a43a0 100644
--- a/sysdeps/x86_64/multiarch/wcsrchr-sse2.S
+++ b/sysdeps/x86_64/multiarch/wcsrchr-sse2.S
@@ -17,7 +17,6 @@
    <https://www.gnu.org/licenses/>.  */
 
 #if IS_IN (libc)
-# define wcsrchr __wcsrchr_sse2
+# define STRRCHR	__wcsrchr_sse2
 #endif
-
 #include "../wcsrchr.S"
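The rewritten sysdeps/x86_64/strrchr.S below operates on 16-byte vectors: compare a block against the broadcast CHAR and against zero, convert both to bit masks with pmovmskb, and keep only CHAR matches at or below the first zero. A sketch of the single-vector step in SSE2 intrinsics (illustrative helper name; byte-sized CHARs assumed):

	#include <emmintrin.h>	/* SSE2 intrinsics.  */
	#include <stdint.h>

	/* Bitmask of CHAR matches at or before the first null byte in
	   one 16-byte block.  */
	static uint32_t
	block_matches (const void *p, __m128i vchar)
	{
	  __m128i v = _mm_loadu_si128 ((const __m128i *) p);
	  uint32_t zmask
	    = _mm_movemask_epi8 (_mm_cmpeq_epi8 (v, _mm_setzero_si128 ()));
	  uint32_t cmask = _mm_movemask_epi8 (_mm_cmpeq_epi8 (v, vchar));
	  if (zmask == 0)
	    return cmask;	/* No terminator in this block.  */
	  /* zmask ^ (zmask - 1) keeps bits up to and including the
	     first set bit of zmask.  */
	  return cmask & (zmask ^ (zmask - 1));
	}

The recurring `leal -1(%rcx), %edx; xorl %edx, %ecx; andl %ecx, %eax` sequences in the new code compute exactly this mask, and `bsrl` then selects the highest surviving bit, i.e. the last match.
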
diff --git a/sysdeps/x86_64/strrchr.S b/sysdeps/x86_64/strrchr.S
index dfd09fe9508cb5bc..fc1598bb11417fd5 100644
--- a/sysdeps/x86_64/strrchr.S
+++ b/sysdeps/x86_64/strrchr.S
@@ -19,210 +19,360 @@
 
 #include <sysdep.h>
 
+#ifndef STRRCHR
+# define STRRCHR	strrchr
+#endif
+
+#ifdef USE_AS_WCSRCHR
+# define PCMPEQ	pcmpeqd
+# define CHAR_SIZE	4
+# define PMINU	pminud
+#else
+# define PCMPEQ	pcmpeqb
+# define CHAR_SIZE	1
+# define PMINU	pminub
+#endif
+
+#define PAGE_SIZE	4096
+#define VEC_SIZE	16
+
 	.text
-ENTRY (strrchr)
-	movd	%esi, %xmm1
+ENTRY(STRRCHR)
+	movd	%esi, %xmm0
 	movq	%rdi, %rax
-	andl	$4095, %eax
-	punpcklbw	%xmm1, %xmm1
-	cmpq	$4032, %rax
-	punpcklwd	%xmm1, %xmm1
-	pshufd	$0, %xmm1, %xmm1
+	andl	$(PAGE_SIZE - 1), %eax
+#ifndef USE_AS_WCSRCHR
+	punpcklbw %xmm0, %xmm0
+	punpcklwd %xmm0, %xmm0
+#endif
+	pshufd	$0, %xmm0, %xmm0
+	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
 	ja	L(cross_page)
-	movdqu	(%rdi), %xmm0
+
+L(cross_page_continue):
+	movups	(%rdi), %xmm1
 	pxor	%xmm2, %xmm2
-	movdqa	%xmm0, %xmm3
-	pcmpeqb	%xmm1, %xmm0
-	pcmpeqb	%xmm2, %xmm3
-	pmovmskb	%xmm0, %ecx
-	pmovmskb	%xmm3, %edx
-	testq	%rdx, %rdx
-	je	L(next_48_bytes)
-	leaq	-1(%rdx), %rax
-	xorq	%rdx, %rax
-	andq	%rcx, %rax
-	je	L(exit)
-	bsrq	%rax, %rax
+	PCMPEQ	%xmm1, %xmm2
+	pmovmskb %xmm2, %ecx
+	testl	%ecx, %ecx
+	jz	L(aligned_more)
+
+	PCMPEQ	%xmm0, %xmm1
+	pmovmskb %xmm1, %eax
+	leal	-1(%rcx), %edx
+	xorl	%edx, %ecx
+	andl	%ecx, %eax
+	jz	L(ret0)
+	bsrl	%eax, %eax
 	addq	%rdi, %rax
+	/* We are off by 3 for wcsrchr if search CHAR is non-zero. If
+	   search CHAR is zero, we are correct. Either way `andq
+	   $-CHAR_SIZE, %rax` gets the correct result.  */
+#ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+#endif
+L(ret0):
 	ret
 
+	/* The return paths for the first vec x1/x2 have a hard-coded
+	   backward search for earlier matches.  */
 	.p2align 4
-L(next_48_bytes):
-	movdqu	16(%rdi), %xmm4
-	movdqa	%xmm4, %xmm5
-	movdqu	32(%rdi), %xmm3
-	pcmpeqb	%xmm1, %xmm4
-	pcmpeqb	%xmm2, %xmm5
-	movdqu	48(%rdi), %xmm0
-	pmovmskb	%xmm5, %edx
-	movdqa	%xmm3, %xmm5
-	pcmpeqb	%xmm1, %xmm3
-	pcmpeqb	%xmm2, %xmm5
-	pcmpeqb	%xmm0, %xmm2
-	salq	$16, %rdx
-	pmovmskb	%xmm3, %r8d
-	pmovmskb	%xmm5, %eax
-	pmovmskb	%xmm2, %esi
-	salq	$32, %r8
-	salq	$32, %rax
-	pcmpeqb	%xmm1, %xmm0
-	orq	%rdx, %rax
-	movq	%rsi, %rdx
-	pmovmskb	%xmm4, %esi
-	salq	$48, %rdx
-	salq	$16, %rsi
-	orq	%r8, %rsi
-	orq	%rcx, %rsi
-	pmovmskb	%xmm0, %ecx
-	salq	$48, %rcx
-	orq	%rcx, %rsi
-	orq	%rdx, %rax
-	je	L(loop_header2)
-	leaq	-1(%rax), %rcx
-	xorq	%rax, %rcx
-	andq	%rcx, %rsi
-	je	L(exit)
-	bsrq	%rsi, %rsi
-	leaq	(%rdi,%rsi), %rax
+L(first_vec_x0_test):
+	PCMPEQ	%xmm0, %xmm1
+	pmovmskb %xmm1, %eax
+	testl	%eax, %eax
+	jz	L(ret0)
+	bsrl	%eax, %eax
+	addq	%r8, %rax
+#ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+#endif
 	ret
 
 	.p2align 4
-L(loop_header2):
-	testq	%rsi, %rsi
-	movq	%rdi, %rcx
-	je	L(no_c_found)
-L(loop_header):
-	addq	$64, %rdi
-	pxor	%xmm7, %xmm7
-	andq	$-64, %rdi
-	jmp	L(loop_entry)
+L(first_vec_x1):
+	PCMPEQ	%xmm0, %xmm2
+	pmovmskb %xmm2, %eax
+	leal	-1(%rcx), %edx
+	xorl	%edx, %ecx
+	andl	%ecx, %eax
+	jz	L(first_vec_x0_test)
+	bsrl	%eax, %eax
+	leaq	(VEC_SIZE)(%rdi, %rax), %rax
+#ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+#endif
+	ret
 
 	.p2align 4
-L(loop64):
-	testq	%rdx, %rdx
-	cmovne	%rdx, %rsi
-	cmovne	%rdi, %rcx
-	addq	$64, %rdi
-L(loop_entry):
-	movdqa	32(%rdi), %xmm3
-	pxor	%xmm6, %xmm6
-	movdqa	48(%rdi), %xmm2
-	movdqa	%xmm3, %xmm0
-	movdqa	16(%rdi), %xmm4
-	pminub	%xmm2, %xmm0
-	movdqa	(%rdi), %xmm5
-	pminub	%xmm4, %xmm0
-	pminub	%xmm5, %xmm0
-	pcmpeqb	%xmm7, %xmm0
-	pmovmskb	%xmm0, %eax
-	movdqa	%xmm5, %xmm0
-	pcmpeqb	%xmm1, %xmm0
-	pmovmskb	%xmm0, %r9d
-	movdqa	%xmm4, %xmm0
-	pcmpeqb	%xmm1, %xmm0
-	pmovmskb	%xmm0, %edx
-	movdqa	%xmm3, %xmm0
-	pcmpeqb	%xmm1, %xmm0
-	salq	$16, %rdx
-	pmovmskb	%xmm0, %r10d
-	movdqa	%xmm2, %xmm0
-	pcmpeqb	%xmm1, %xmm0
-	salq	$32, %r10
-	orq	%r10, %rdx
-	pmovmskb	%xmm0, %r8d
-	orq	%r9, %rdx
-	salq	$48, %r8
-	orq	%r8, %rdx
+L(first_vec_x1_test):
+	PCMPEQ	%xmm0, %xmm2
+	pmovmskb %xmm2, %eax
 	testl	%eax, %eax
-	je	L(loop64)
-	pcmpeqb	%xmm6, %xmm4
-	pcmpeqb	%xmm6, %xmm3
-	pcmpeqb	%xmm6, %xmm5
-	pmovmskb	%xmm4, %eax
-	pmovmskb	%xmm3, %r10d
-	pcmpeqb	%xmm6, %xmm2
-	pmovmskb	%xmm5, %r9d
-	salq	$32, %r10
-	salq	$16, %rax
-	pmovmskb	%xmm2, %r8d
-	orq	%r10, %rax
-	orq	%r9, %rax
-	salq	$48, %r8
-	orq	%r8, %rax
-	leaq	-1(%rax), %r8
-	xorq	%rax, %r8
-	andq	%r8, %rdx
-	cmovne	%rdi, %rcx
-	cmovne	%rdx, %rsi
-	bsrq	%rsi, %rsi
-	leaq	(%rcx,%rsi), %rax
+	jz	L(first_vec_x0_test)
+	bsrl	%eax, %eax
+	leaq	(VEC_SIZE)(%rdi, %rax), %rax
+#ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+#endif
+	ret
+
+	.p2align 4
+L(first_vec_x2):
+	PCMPEQ	%xmm0, %xmm3
+	pmovmskb %xmm3, %eax
+	leal	-1(%rcx), %edx
+	xorl	%edx, %ecx
+	andl	%ecx, %eax
+	jz	L(first_vec_x1_test)
+	bsrl	%eax, %eax
+	leaq	(VEC_SIZE * 2)(%rdi, %rax), %rax
+#ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+#endif
+	ret
+
+	.p2align 4
+L(aligned_more):
+	/* Save the original pointer in case the match is in VEC 0.  */
+	movq	%rdi, %r8
+	andq	$-VEC_SIZE, %rdi
+
+	movaps	VEC_SIZE(%rdi), %xmm2
+	pxor	%xmm3, %xmm3
+	PCMPEQ	%xmm2, %xmm3
+	pmovmskb %xmm3, %ecx
+	testl	%ecx, %ecx
+	jnz	L(first_vec_x1)
+
+	movaps	(VEC_SIZE * 2)(%rdi), %xmm3
+	pxor	%xmm4, %xmm4
+	PCMPEQ	%xmm3, %xmm4
+	pmovmskb %xmm4, %ecx
+	testl	%ecx, %ecx
+	jnz	L(first_vec_x2)
+
+	addq	$VEC_SIZE, %rdi
+	/* Save pointer again before realigning.  */
+	movq	%rdi, %rsi
+	andq	$-(VEC_SIZE * 2), %rdi
+	.p2align 4
+L(first_loop):
+	/* Do 2x VEC at a time.  */
+	movaps	(VEC_SIZE * 2)(%rdi), %xmm4
+	movaps	(VEC_SIZE * 3)(%rdi), %xmm5
+	/* Since SSE2 has no pminud, wcsrchr needs separate logic for
+	   detecting zero. Note: if this is found to be a bottleneck,
+	   it may be worth adding an SSE4.1 wcsrchr implementation.  */
+#ifdef USE_AS_WCSRCHR
+	movaps	%xmm5, %xmm6
+	pxor	%xmm8, %xmm8
+
+	PCMPEQ	%xmm8, %xmm5
+	PCMPEQ	%xmm4, %xmm8
+	por	%xmm5, %xmm8
+#else
+	movaps	%xmm5, %xmm6
+	PMINU	%xmm4, %xmm5
+#endif
+
+	movaps	%xmm4, %xmm9
+	PCMPEQ	%xmm0, %xmm4
+	PCMPEQ	%xmm0, %xmm6
+	movaps	%xmm6, %xmm7
+	por	%xmm4, %xmm6
+#ifndef USE_AS_WCSRCHR
+	pxor	%xmm8, %xmm8
+	PCMPEQ	%xmm5, %xmm8
+#endif
+	pmovmskb %xmm8, %ecx
+	pmovmskb %xmm6, %eax
+
+	addq	$(VEC_SIZE * 2), %rdi
+	/* Use `addl` 1) so we can undo it with `subl` and 2) it can
+	   macro-fuse with `jz`.  */
+	addl	%ecx, %eax
+	jz	L(first_loop)
+
+	/* Check if there is a zero (end-of-string) match.  */
+	testl	%ecx, %ecx
+	jz	L(second_loop_match)
+
+	/* Check if there was a match in the last iteration.  */
+	subl	%ecx, %eax
+	jnz	L(new_match)
+
+L(first_loop_old_match):
+	PCMPEQ	%xmm0, %xmm2
+	PCMPEQ	%xmm0, %xmm3
+	pmovmskb %xmm2, %ecx
+	pmovmskb %xmm3, %eax
+	addl	%eax, %ecx
+	jz	L(first_vec_x0_test)
+	/* NB: We could move this shift to before the branch and save a
+	   bit of code size / performance on the fall-through path. The
+	   branch leads to the null case, which generally seems hotter
+	   than CHAR in the first 3x VEC.  */
+	sall	$16, %eax
+	orl	%ecx, %eax
+
+	bsrl	%eax, %eax
+	addq	%rsi, %rax
+#ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+#endif
+	ret
+
+	.p2align 4
+L(new_match):
+	pxor	%xmm6, %xmm6
+	PCMPEQ	%xmm9, %xmm6
+	pmovmskb %xmm6, %eax
+	sall	$16, %ecx
+	orl	%eax, %ecx
+
+	/* We can't reuse either of the old comparisons: since we mask
+	   off zeros after the first zero (instead of using the full
+	   comparison), we can't guarantee there is no interference
+	   between a match past the end of the string and a valid match.  */
+	pmovmskb %xmm4, %eax
+	pmovmskb %xmm7, %edx
+	sall	$16, %edx
+	orl	%edx, %eax
+
+	leal	-1(%ecx), %edx
+	xorl	%edx, %ecx
+	andl	%ecx, %eax
+	jz	L(first_loop_old_match)
+	bsrl	%eax, %eax
+	addq	%rdi, %rax
+#ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+#endif
 	ret
 
+	/* Save the minimum state needed to recover the most recent
+	   match. We can throw out all previous work.  */
 	.p2align 4
-L(no_c_found):
-	movl	$1, %esi
-	xorl	%ecx, %ecx
-	jmp	L(loop_header)
+L(second_loop_match):
+	movq	%rdi, %rsi
+	movaps	%xmm4, %xmm2
+	movaps	%xmm7, %xmm3
 
 	.p2align 4
-L(exit):
-	xorl	%eax, %eax
+L(second_loop):
+	movaps	(VEC_SIZE * 2)(%rdi), %xmm4
+	movaps	(VEC_SIZE * 3)(%rdi), %xmm5
+	/* Since SSE2 has no pminud, wcsrchr needs separate logic for
+	   detecting zero. Note: if this is found to be a bottleneck,
+	   it may be worth adding an SSE4.1 wcsrchr implementation.  */
+#ifdef USE_AS_WCSRCHR
+	movaps	%xmm5, %xmm6
+	pxor	%xmm8, %xmm8
+
+	PCMPEQ	%xmm8, %xmm5
+	PCMPEQ	%xmm4, %xmm8
+	por	%xmm5, %xmm8
+#else
+	movaps	%xmm5, %xmm6
+	PMINU	%xmm4, %xmm5
+#endif
+
+	movaps	%xmm4, %xmm9
+	PCMPEQ	%xmm0, %xmm4
+	PCMPEQ	%xmm0, %xmm6
+	movaps	%xmm6, %xmm7
+	por	%xmm4, %xmm6
+#ifndef USE_AS_WCSRCHR
+	pxor	%xmm8, %xmm8
+	PCMPEQ	%xmm5, %xmm8
+#endif
+
+	pmovmskb %xmm8, %ecx
+	pmovmskb %xmm6, %eax
+
+	addq	$(VEC_SIZE * 2), %rdi
+	/* Either null term or a new occurrence of CHAR.  */
+	addl	%ecx, %eax
+	jz	L(second_loop)
+
+	/* No null term, so it must be a new occurrence of CHAR.  */
+	testl	%ecx, %ecx
+	jz	L(second_loop_match)
+
+
+	subl	%ecx, %eax
+	jnz	L(second_loop_new_match)
+
+L(second_loop_old_match):
+	pmovmskb %xmm2, %ecx
+	pmovmskb %xmm3, %eax
+	sall	$16, %eax
+	orl	%ecx, %eax
+	bsrl	%eax, %eax
+	addq	%rsi, %rax
+#ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+#endif
 	ret
 
 	.p2align 4
+L(second_loop_new_match):
+	pxor	%xmm6, %xmm6
+	PCMPEQ	%xmm9, %xmm6
+	pmovmskb %xmm6, %eax
+	sall	$16, %ecx
+	orl	%eax, %ecx
+
+	/* We can't reuse either of the old comparisons: since we mask
+	   off zeros after the first zero (instead of using the full
+	   comparison), we can't guarantee there is no interference
+	   between a match past the end of the string and a valid match.  */
+	pmovmskb %xmm4, %eax
+	pmovmskb %xmm7, %edx
+	sall	$16, %edx
+	orl	%edx, %eax
+
+	leal	-1(%ecx), %edx
+	xorl	%edx, %ecx
+	andl	%ecx, %eax
+	jz	L(second_loop_old_match)
+	bsrl	%eax, %eax
+	addq	%rdi, %rax
+#ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+#endif
+	ret
+
+	.p2align 4,, 4
 L(cross_page):
-	movq	%rdi, %rax
-	pxor	%xmm0, %xmm0
-	andq	$-64, %rax
-	movdqu	(%rax), %xmm5
-	movdqa	%xmm5, %xmm6
-	movdqu	16(%rax), %xmm4
-	pcmpeqb	%xmm1, %xmm5
-	pcmpeqb	%xmm0, %xmm6
-	movdqu	32(%rax), %xmm3
-	pmovmskb	%xmm6, %esi
-	movdqa	%xmm4, %xmm6
-	movdqu	48(%rax), %xmm2
-	pcmpeqb	%xmm1, %xmm4
-	pcmpeqb	%xmm0, %xmm6
-	pmovmskb	%xmm6, %edx
-	movdqa	%xmm3, %xmm6
-	pcmpeqb	%xmm1, %xmm3
-	pcmpeqb	%xmm0, %xmm6
-	pcmpeqb	%xmm2, %xmm0
-	salq	$16, %rdx
-	pmovmskb	%xmm3, %r9d
-	pmovmskb	%xmm6, %r8d
-	pmovmskb	%xmm0, %ecx
-	salq	$32, %r9
-	salq	$32, %r8
-	pcmpeqb	%xmm1, %xmm2
-	orq	%r8, %rdx
-	salq	$48, %rcx
-	pmovmskb	%xmm5, %r8d
-	orq	%rsi, %rdx
-	pmovmskb	%xmm4, %esi
-	orq	%rcx, %rdx
-	pmovmskb	%xmm2, %ecx
-	salq	$16, %rsi
-	salq	$48, %rcx
-	orq	%r9, %rsi
-	orq	%r8, %rsi
-	orq	%rcx, %rsi
+	movq	%rdi, %rsi
+	andq	$-VEC_SIZE, %rsi
+	movaps	(%rsi), %xmm1
+	pxor	%xmm2, %xmm2
+	PCMPEQ	%xmm1, %xmm2
+	pmovmskb %xmm2, %edx
 	movl	%edi, %ecx
-	subl	%eax, %ecx
-	shrq	%cl, %rdx
-	shrq	%cl, %rsi
-	testq	%rdx, %rdx
-	je	L(loop_header2)
-	leaq	-1(%rdx), %rax
-	xorq	%rdx, %rax
-	andq	%rax, %rsi
-	je	L(exit)
-	bsrq	%rsi, %rax
+	andl	$(VEC_SIZE - 1), %ecx
+	sarl	%cl, %edx
+	jz	L(cross_page_continue)
+	PCMPEQ	%xmm0, %xmm1
+	pmovmskb %xmm1, %eax
+	sarl	%cl, %eax
+	leal	-1(%rdx), %ecx
+	xorl	%edx, %ecx
+	andl	%ecx, %eax
+	jz	L(ret1)
+	bsrl	%eax, %eax
 	addq	%rdi, %rax
+#ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+#endif
+L(ret1):
 	ret
-END (strrchr)
+END(STRRCHR)
 
-weak_alias (strrchr, rindex)
-libc_hidden_builtin_def (strrchr)
+#ifndef USE_AS_WCSRCHR
+	weak_alias (STRRCHR, rindex)
+	libc_hidden_builtin_def (STRRCHR)
+#endif
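A detail worth spelling out from the comments above: pmovmskb produces byte-granularity masks even for wcsrchr, so a matching 4-byte wchar_t sets four mask bits and `bsr` lands on the match's last byte, 3 past the character's start. Rounding the address down to CHAR_SIZE repairs this, and is valid because wchar_t strings are 4-byte aligned. A sketch of the fix-up (hypothetical helper; assumes a non-zero mask; GCC's __builtin_clz stands in for bsr):

	#include <stdint.h>

	/* Mirrors `bsrl %eax, %eax; addq %rdi, %rax;
	   andq $-CHAR_SIZE, %rax` for CHAR_SIZE == 4.  E.g. a wchar
	   match starting at byte offset 8 sets mask bits 8..11, so
	   bsr gives 11; rounding down recovers offset 8.  */
	static uintptr_t
	wchar_match_addr (uintptr_t vec_base, uint32_t byte_mask)
	{
	  unsigned int bsr = 31 - (unsigned int) __builtin_clz (byte_mask);
	  return (vec_base + bsr) & ~(uintptr_t) 3;
	}
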
diff --git a/sysdeps/x86_64/wcsrchr.S b/sysdeps/x86_64/wcsrchr.S
index 6b318d3f29de9a9e..9006f2220963d76c 100644
--- a/sysdeps/x86_64/wcsrchr.S
+++ b/sysdeps/x86_64/wcsrchr.S
@@ -17,266 +17,12 @@
    License along with the GNU C Library; if not, see
    <https://www.gnu.org/licenses/>.  */
 
-#include <sysdep.h>
 
-	.text
-ENTRY (wcsrchr)
+#define USE_AS_WCSRCHR	1
+#define NO_PMINU	1
 
-	movd	%rsi, %xmm1
-	mov	%rdi, %rcx
-	punpckldq %xmm1, %xmm1
-	pxor	%xmm2, %xmm2
-	punpckldq %xmm1, %xmm1
-	and	$63, %rcx
-	cmp	$48, %rcx
-	ja	L(crosscache)
+#ifndef STRRCHR
+# define STRRCHR	wcsrchr
+#endif
 
-	movdqu	(%rdi), %xmm0
-	pcmpeqd	%xmm0, %xmm2
-	pcmpeqd	%xmm1, %xmm0
-	pmovmskb %xmm2, %rcx
-	pmovmskb %xmm0, %rax
-	add	$16, %rdi
-
-	test	%rax, %rax
-	jnz	L(unaligned_match1)
-
-	test	%rcx, %rcx
-	jnz	L(return_null)
-
-	and	$-16, %rdi
-	xor	%r8, %r8
-	jmp	L(loop)
-
-	.p2align 4
-L(unaligned_match1):
-	test	%rcx, %rcx
-	jnz	L(prolog_find_zero_1)
-
-	mov	%rax, %r8
-	mov	%rdi, %rsi
-	and	$-16, %rdi
-	jmp	L(loop)
-
-	.p2align 4
-L(crosscache):
-	and	$15, %rcx
-	and	$-16, %rdi
-	pxor	%xmm3, %xmm3
-	movdqa	(%rdi), %xmm0
-	pcmpeqd	%xmm0, %xmm3
-	pcmpeqd	%xmm1, %xmm0
-	pmovmskb %xmm3, %rdx
-	pmovmskb %xmm0, %rax
-	shr	%cl, %rdx
-	shr	%cl, %rax
-	add	$16, %rdi
-
-	test	%rax, %rax
-	jnz	L(unaligned_match)
-
-	test	%rdx, %rdx
-	jnz	L(return_null)
-
-	xor	%r8, %r8
-	jmp	L(loop)
-
-	.p2align 4
-L(unaligned_match):
-	test	%rdx, %rdx
-	jnz	L(prolog_find_zero)
-
-	mov	%rax, %r8
-	lea	(%rdi, %rcx), %rsi
-
-/* Loop start on aligned string.  */
-	.p2align 4
-L(loop):
-	movdqa	(%rdi), %xmm0
-	pcmpeqd	%xmm0, %xmm2
-	add	$16, %rdi
-	pcmpeqd	%xmm1, %xmm0
-	pmovmskb %xmm2, %rcx
-	pmovmskb %xmm0, %rax
-	or	%rax, %rcx
-	jnz	L(matches)
-
-	movdqa	(%rdi), %xmm3
-	pcmpeqd	%xmm3, %xmm2
-	add	$16, %rdi
-	pcmpeqd	%xmm1, %xmm3
-	pmovmskb %xmm2, %rcx
-	pmovmskb %xmm3, %rax
-	or	%rax, %rcx
-	jnz	L(matches)
-
-	movdqa	(%rdi), %xmm4
-	pcmpeqd	%xmm4, %xmm2
-	add	$16, %rdi
-	pcmpeqd	%xmm1, %xmm4
-	pmovmskb %xmm2, %rcx
-	pmovmskb %xmm4, %rax
-	or	%rax, %rcx
-	jnz	L(matches)
-
-	movdqa	(%rdi), %xmm5
-	pcmpeqd	%xmm5, %xmm2
-	add	$16, %rdi
-	pcmpeqd	%xmm1, %xmm5
-	pmovmskb %xmm2, %rcx
-	pmovmskb %xmm5, %rax
-	or	%rax, %rcx
-	jz	L(loop)
-
-	.p2align 4
-L(matches):
-	test	%rax, %rax
-	jnz	L(match)
-L(return_value):
-	test	%r8, %r8
-	jz	L(return_null)
-	mov	%r8, %rax
-	mov	%rsi, %rdi
-
-	test	$15 << 4, %ah
-	jnz	L(match_fourth_wchar)
-	test	%ah, %ah
-	jnz	L(match_third_wchar)
-	test	$15 << 4, %al
-	jnz	L(match_second_wchar)
-	lea	-16(%rdi), %rax
-	ret
-
-	.p2align 4
-L(match):
-	pmovmskb %xmm2, %rcx
-	test	%rcx, %rcx
-	jnz	L(find_zero)
-	mov	%rax, %r8
-	mov	%rdi, %rsi
-	jmp	L(loop)
-
-	.p2align 4
-L(find_zero):
-	test	$15, %cl
-	jnz	L(find_zero_in_first_wchar)
-	test	%cl, %cl
-	jnz	L(find_zero_in_second_wchar)
-	test	$15, %ch
-	jnz	L(find_zero_in_third_wchar)
-
-	and	$1 << 13 - 1, %rax
-	jz	L(return_value)
-
-	test	$15 << 4, %ah
-	jnz	L(match_fourth_wchar)
-	test	%ah, %ah
-	jnz	L(match_third_wchar)
-	test	$15 << 4, %al
-	jnz	L(match_second_wchar)
-	lea	-16(%rdi), %rax
-	ret
-
-	.p2align 4
-L(find_zero_in_first_wchar):
-	test	$1, %rax
-	jz	L(return_value)
-	lea	-16(%rdi), %rax
-	ret
-
-	.p2align 4
-L(find_zero_in_second_wchar):
-	and	$1 << 5 - 1, %rax
-	jz	L(return_value)
-
-	test	$15 << 4, %al
-	jnz	L(match_second_wchar)
-	lea	-16(%rdi), %rax
-	ret
-
-	.p2align 4
-L(find_zero_in_third_wchar):
-	and	$1 << 9 - 1, %rax
-	jz	L(return_value)
-
-	test	%ah, %ah
-	jnz	L(match_third_wchar)
-	test	$15 << 4, %al
-	jnz	L(match_second_wchar)
-	lea	-16(%rdi), %rax
-	ret
-
-	.p2align 4
-L(prolog_find_zero):
-	add	%rcx, %rdi
-	mov     %rdx, %rcx
-L(prolog_find_zero_1):
-	test	$15, %cl
-	jnz	L(prolog_find_zero_in_first_wchar)
-	test	%cl, %cl
-	jnz	L(prolog_find_zero_in_second_wchar)
-	test	$15, %ch
-	jnz	L(prolog_find_zero_in_third_wchar)
-
-	and	$1 << 13 - 1, %rax
-	jz	L(return_null)
-
-	test	$15 << 4, %ah
-	jnz	L(match_fourth_wchar)
-	test	%ah, %ah
-	jnz	L(match_third_wchar)
-	test	$15 << 4, %al
-	jnz	L(match_second_wchar)
-	lea	-16(%rdi), %rax
-	ret
-
-	.p2align 4
-L(prolog_find_zero_in_first_wchar):
-	test	$1, %rax
-	jz	L(return_null)
-	lea	-16(%rdi), %rax
-	ret
-
-	.p2align 4
-L(prolog_find_zero_in_second_wchar):
-	and	$1 << 5 - 1, %rax
-	jz	L(return_null)
-
-	test	$15 << 4, %al
-	jnz	L(match_second_wchar)
-	lea	-16(%rdi), %rax
-	ret
-
-	.p2align 4
-L(prolog_find_zero_in_third_wchar):
-	and	$1 << 9 - 1, %rax
-	jz	L(return_null)
-
-	test	%ah, %ah
-	jnz	L(match_third_wchar)
-	test	$15 << 4, %al
-	jnz	L(match_second_wchar)
-	lea	-16(%rdi), %rax
-	ret
-
-	.p2align 4
-L(match_second_wchar):
-	lea	-12(%rdi), %rax
-	ret
-
-	.p2align 4
-L(match_third_wchar):
-	lea	-8(%rdi), %rax
-	ret
-
-	.p2align 4
-L(match_fourth_wchar):
-	lea	-4(%rdi), %rax
-	ret
-
-	.p2align 4
-L(return_null):
-	xor	%rax, %rax
-	ret
-
-END (wcsrchr)
+#include "../strrchr.S"
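"Full xcheck passes" above refers to glibc's own test matrix. A minimal standalone sanity check in the same spirit (a sketch, not glibc's harness) is to cross-check the routine against a naive loop across assorted alignments, lengths and match positions:

	#include <assert.h>
	#include <string.h>

	/* Naive reference implementation for cross-checking.  */
	static char *
	ref_strrchr (const char *s, int c)
	{
	  char *last = (c == 0) ? (char *) s + strlen (s) : NULL;
	  for (; *s != '\0'; ++s)
	    if (*s == (char) c)
	      last = (char *) s;
	  return last;
	}

	int
	main (void)
	{
	  char buf[128];
	  for (int off = 0; off < 16; ++off)	/* Vary alignment.  */
	    for (int len = 0; len < 64; ++len)	/* Vary length.  */
	      {
		char *s = buf + off;
		memset (s, 'a', len);
		if (len > 4)
		  s[len / 2] = 'b';	/* Plant a match mid-string.  */
		s[len] = '\0';
		assert (strrchr (s, 'b') == ref_strrchr (s, 'b'));
		assert (strrchr (s, '\0') == s + len);
	      }
	  return 0;
	}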