SOURCES/ia-opt-str-wcs_rchr-sse2.patch

From 70016c060a99e8534469cdeb847eabe60bff2b54 Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Thu, 21 Apr 2022 20:52:28 -0500
Subject: [PATCH] x86: Optimize {str|wcs}rchr-sse2

The new code unrolls the main loop slightly without adding too much
overhead and minimizes the comparisons for the search CHAR.

Geometric Mean of all benchmarks New / Old: 0.741
See email for all results.

Full xcheck passes on x86_64 with and without multiarch enabled.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>

(cherry picked from commit 5307aa9c1800f36a64c183c091c9af392c1fa75c)
---
 sysdeps/x86_64/multiarch/strrchr-sse2.S |   2 +-
 sysdeps/x86_64/multiarch/wcsrchr-sse2.S |   3 +-
 sysdeps/x86_64/strrchr.S                | 510 +++++++++++++++---------
 sysdeps/x86_64/wcsrchr.S                | 266 +-----------
 4 files changed, 338 insertions(+), 443 deletions(-)

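Note (editor's illustration, not part of the patch): the strategy of the
rewritten routine, in portable C. CHUNK stands in for one 16-byte SSE2
vector and strrchr_sketch is a made-up name. The real code only reads
whole aligned vectors, so scanning a full chunk past the terminator is
safe there; this scalar sketch assumes the buffer is readable up to a
CHUNK boundary and uses the GCC/Clang __builtin_clz.

#include <stddef.h>
#include <stdint.h>

#define CHUNK 16

static const char *
strrchr_sketch (const char *s, char c)
{
  const char *last_base = NULL;	/* Chunk holding the latest match.  */
  uint32_t last_mask = 0;	/* CHAR-match bits within that chunk.  */

  for (;; s += CHUNK)
    {
      uint32_t eq = 0, zero = 0;
      for (int i = 0; i < CHUNK; i++)
	{
	  eq |= (uint32_t) (s[i] == c) << i;
	  zero |= (uint32_t) (s[i] == '\0') << i;
	}
      if (zero != 0)
	/* Drop matches past the first zero: (zero - 1) ^ zero keeps
	   the bits up to and including its lowest set bit.  */
	eq &= (zero - 1) ^ zero;
      if (eq != 0)
	{
	  /* The most recent chunk containing a match wins.  */
	  last_base = s;
	  last_mask = eq;
	}
      if (zero != 0)
	{
	  if (last_base == NULL)
	    return NULL;
	  /* Highest surviving bit is the rightmost occurrence.  */
	  return last_base + (31 - __builtin_clz (last_mask));
	}
    }
}

The assembly implements this shape twice: L(first_loop) runs while no
match has been seen yet, and L(second_loop), entered via
L(second_loop_match) once a match is in hand, only has to remember the
most recent match vectors and their base pointer.
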
diff --git a/sysdeps/x86_64/multiarch/strrchr-sse2.S b/sysdeps/x86_64/multiarch/strrchr-sse2.S
index 0ec76fe9..6bb1284b 100644
--- a/sysdeps/x86_64/multiarch/strrchr-sse2.S
+++ b/sysdeps/x86_64/multiarch/strrchr-sse2.S
@@ -17,7 +17,7 @@
    <http://www.gnu.org/licenses/>.  */
 
 #if IS_IN (libc)
-# define strrchr __strrchr_sse2
+# define STRRCHR __strrchr_sse2
 
 # undef weak_alias
 # define weak_alias(strrchr, rindex)
diff --git a/sysdeps/x86_64/multiarch/wcsrchr-sse2.S b/sysdeps/x86_64/multiarch/wcsrchr-sse2.S
index d015e953..f26d53b5 100644
--- a/sysdeps/x86_64/multiarch/wcsrchr-sse2.S
+++ b/sysdeps/x86_64/multiarch/wcsrchr-sse2.S
@@ -17,7 +17,6 @@
    <http://www.gnu.org/licenses/>.  */
 
 #if IS_IN (libc)
-# define wcsrchr __wcsrchr_sse2
+# define STRRCHR	__wcsrchr_sse2
 #endif
-
 #include "../wcsrchr.S"
diff --git a/sysdeps/x86_64/strrchr.S b/sysdeps/x86_64/strrchr.S
index aca98e7e..a58cc220 100644
--- a/sysdeps/x86_64/strrchr.S
+++ b/sysdeps/x86_64/strrchr.S
@@ -19,210 +19,360 @@
 
 #include <sysdep.h>
 
+#ifndef STRRCHR
+# define STRRCHR	strrchr
+#endif
+
+#ifdef USE_AS_WCSRCHR
+# define PCMPEQ	pcmpeqd
+# define CHAR_SIZE	4
+# define PMINU	pminud
+#else
+# define PCMPEQ	pcmpeqb
+# define CHAR_SIZE	1
+# define PMINU	pminub
+#endif
+
+#define PAGE_SIZE	4096
+#define VEC_SIZE	16
+
 	.text
-ENTRY (strrchr)
-	movd	%esi, %xmm1
+ENTRY(STRRCHR)
+	movd	%esi, %xmm0
 	movq	%rdi, %rax
-	andl	$4095, %eax
-	punpcklbw	%xmm1, %xmm1
-	cmpq	$4032, %rax
-	punpcklwd	%xmm1, %xmm1
-	pshufd	$0, %xmm1, %xmm1
+	andl	$(PAGE_SIZE - 1), %eax
+#ifndef USE_AS_WCSRCHR
+	punpcklbw %xmm0, %xmm0
+	punpcklwd %xmm0, %xmm0
+#endif
+	pshufd	$0, %xmm0, %xmm0
+	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
 	ja	L(cross_page)
-	movdqu	(%rdi), %xmm0
+
+L(cross_page_continue):
+	movups	(%rdi), %xmm1
 	pxor	%xmm2, %xmm2
-	movdqa	%xmm0, %xmm3
-	pcmpeqb	%xmm1, %xmm0
-	pcmpeqb	%xmm2, %xmm3
-	pmovmskb	%xmm0, %ecx
-	pmovmskb	%xmm3, %edx
-	testq	%rdx, %rdx
-	je	L(next_48_bytes)
-	leaq	-1(%rdx), %rax
-	xorq	%rdx, %rax
-	andq	%rcx, %rax
-	je	L(exit)
-	bsrq	%rax, %rax
+	PCMPEQ	%xmm1, %xmm2
+	pmovmskb %xmm2, %ecx
+	testl	%ecx, %ecx
+	jz	L(aligned_more)
+
+	PCMPEQ	%xmm0, %xmm1
+	pmovmskb %xmm1, %eax
+	leal	-1(%rcx), %edx
+	xorl	%edx, %ecx
+	andl	%ecx, %eax
+	jz	L(ret0)
+	bsrl	%eax, %eax
 	addq	%rdi, %rax
+	/* We are off by 3 for wcsrchr if search CHAR is non-zero. If
+	   search CHAR is zero we are correct. Either way `andq
+	   -CHAR_SIZE, %rax` gets the correct result.  */
+#ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+#endif
+L(ret0):
 	ret
 
+	/* Returns for first vec x1/x2 have a hard-coded backward
+	   search path for earlier matches.  */
 	.p2align 4
-L(next_48_bytes):
-	movdqu	16(%rdi), %xmm4
-	movdqa	%xmm4, %xmm5
-	movdqu	32(%rdi), %xmm3
-	pcmpeqb	%xmm1, %xmm4
-	pcmpeqb	%xmm2, %xmm5
-	movdqu	48(%rdi), %xmm0
-	pmovmskb	%xmm5, %edx
-	movdqa	%xmm3, %xmm5
-	pcmpeqb	%xmm1, %xmm3
-	pcmpeqb	%xmm2, %xmm5
-	pcmpeqb	%xmm0, %xmm2
-	salq	$16, %rdx
-	pmovmskb	%xmm3, %r8d
-	pmovmskb	%xmm5, %eax
-	pmovmskb	%xmm2, %esi
-	salq	$32, %r8
-	salq	$32, %rax
-	pcmpeqb	%xmm1, %xmm0
-	orq	%rdx, %rax
-	movq	%rsi, %rdx
-	pmovmskb	%xmm4, %esi
-	salq	$48, %rdx
-	salq	$16, %rsi
-	orq	%r8, %rsi
-	orq	%rcx, %rsi
-	pmovmskb	%xmm0, %ecx
-	salq	$48, %rcx
-	orq	%rcx, %rsi
-	orq	%rdx, %rax
-	je	L(loop_header2)
-	leaq	-1(%rax), %rcx
-	xorq	%rax, %rcx
-	andq	%rcx, %rsi
-	je	L(exit)
-	bsrq	%rsi, %rsi
-	leaq	(%rdi,%rsi), %rax
+L(first_vec_x0_test):
+	PCMPEQ	%xmm0, %xmm1
+	pmovmskb %xmm1, %eax
+	testl	%eax, %eax
+	jz	L(ret0)
+	bsrl	%eax, %eax
+	addq	%r8, %rax
+#ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+#endif
 	ret
 
 	.p2align 4
-L(loop_header2):
-	testq	%rsi, %rsi
-	movq	%rdi, %rcx
-	je	L(no_c_found)
-L(loop_header):
-	addq	$64, %rdi
-	pxor	%xmm7, %xmm7
-	andq	$-64, %rdi
-	jmp	L(loop_entry)
+L(first_vec_x1):
+	PCMPEQ	%xmm0, %xmm2
+	pmovmskb %xmm2, %eax
+	leal	-1(%rcx), %edx
+	xorl	%edx, %ecx
+	andl	%ecx, %eax
+	jz	L(first_vec_x0_test)
+	bsrl	%eax, %eax
+	leaq	(VEC_SIZE)(%rdi, %rax), %rax
+#ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+#endif
+	ret
 
 	.p2align 4
-L(loop64):
-	testq	%rdx, %rdx
-	cmovne	%rdx, %rsi
-	cmovne	%rdi, %rcx
-	addq	$64, %rdi
-L(loop_entry):
-	movdqa	32(%rdi), %xmm3
-	pxor	%xmm6, %xmm6
-	movdqa	48(%rdi), %xmm2
-	movdqa	%xmm3, %xmm0
-	movdqa	16(%rdi), %xmm4
-	pminub	%xmm2, %xmm0
-	movdqa	(%rdi), %xmm5
-	pminub	%xmm4, %xmm0
-	pminub	%xmm5, %xmm0
-	pcmpeqb	%xmm7, %xmm0
-	pmovmskb	%xmm0, %eax
-	movdqa	%xmm5, %xmm0
-	pcmpeqb	%xmm1, %xmm0
-	pmovmskb	%xmm0, %r9d
-	movdqa	%xmm4, %xmm0
-	pcmpeqb	%xmm1, %xmm0
-	pmovmskb	%xmm0, %edx
-	movdqa	%xmm3, %xmm0
-	pcmpeqb	%xmm1, %xmm0
-	salq	$16, %rdx
-	pmovmskb	%xmm0, %r10d
-	movdqa	%xmm2, %xmm0
-	pcmpeqb	%xmm1, %xmm0
-	salq	$32, %r10
-	orq	%r10, %rdx
-	pmovmskb	%xmm0, %r8d
-	orq	%r9, %rdx
-	salq	$48, %r8
-	orq	%r8, %rdx
+L(first_vec_x1_test):
+	PCMPEQ	%xmm0, %xmm2
+	pmovmskb %xmm2, %eax
 	testl	%eax, %eax
-	je	L(loop64)
-	pcmpeqb	%xmm6, %xmm4
-	pcmpeqb	%xmm6, %xmm3
-	pcmpeqb	%xmm6, %xmm5
-	pmovmskb	%xmm4, %eax
-	pmovmskb	%xmm3, %r10d
-	pcmpeqb	%xmm6, %xmm2
-	pmovmskb	%xmm5, %r9d
-	salq	$32, %r10
-	salq	$16, %rax
-	pmovmskb	%xmm2, %r8d
-	orq	%r10, %rax
-	orq	%r9, %rax
-	salq	$48, %r8
-	orq	%r8, %rax
-	leaq	-1(%rax), %r8
-	xorq	%rax, %r8
-	andq	%r8, %rdx
-	cmovne	%rdi, %rcx
-	cmovne	%rdx, %rsi
-	bsrq	%rsi, %rsi
-	leaq	(%rcx,%rsi), %rax
+	jz	L(first_vec_x0_test)
+	bsrl	%eax, %eax
+	leaq	(VEC_SIZE)(%rdi, %rax), %rax
+#ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+#endif
+	ret
+
+	.p2align 4
+L(first_vec_x2):
+	PCMPEQ	%xmm0, %xmm3
+	pmovmskb %xmm3, %eax
+	leal	-1(%rcx), %edx
+	xorl	%edx, %ecx
+	andl	%ecx, %eax
+	jz	L(first_vec_x1_test)
+	bsrl	%eax, %eax
+	leaq	(VEC_SIZE * 2)(%rdi, %rax), %rax
+#ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+#endif
+	ret
+
+	.p2align 4
+L(aligned_more):
+	/* Save original pointer if match was in VEC 0.  */
+	movq	%rdi, %r8
+	andq	$-VEC_SIZE, %rdi
+
+	movaps	VEC_SIZE(%rdi), %xmm2
+	pxor	%xmm3, %xmm3
+	PCMPEQ	%xmm2, %xmm3
+	pmovmskb %xmm3, %ecx
+	testl	%ecx, %ecx
+	jnz	L(first_vec_x1)
+
+	movaps	(VEC_SIZE * 2)(%rdi), %xmm3
+	pxor	%xmm4, %xmm4
+	PCMPEQ	%xmm3, %xmm4
+	pmovmskb %xmm4, %ecx
+	testl	%ecx, %ecx
+	jnz	L(first_vec_x2)
+
+	addq	$VEC_SIZE, %rdi
+	/* Save pointer again before realigning.  */
+	movq	%rdi, %rsi
+	andq	$-(VEC_SIZE * 2), %rdi
+	.p2align 4
+L(first_loop):
+	/* Do 2x VEC at a time.  */
+	movaps	(VEC_SIZE * 2)(%rdi), %xmm4
+	movaps	(VEC_SIZE * 3)(%rdi), %xmm5
+	/* Since SSE2 has no pminud, wcsrchr needs separate logic for
+	   detecting zero. Note: if this is found to be a bottleneck it
+	   may be worth adding an SSE4.1 wcsrchr implementation.  */
+#ifdef USE_AS_WCSRCHR
+	movaps	%xmm5, %xmm6
+	pxor	%xmm8, %xmm8
+
+	PCMPEQ	%xmm8, %xmm5
+	PCMPEQ	%xmm4, %xmm8
+	por	%xmm5, %xmm8
+#else
+	movaps	%xmm5, %xmm6
+	PMINU	%xmm4, %xmm5
+#endif
+
+	movaps	%xmm4, %xmm9
+	PCMPEQ	%xmm0, %xmm4
+	PCMPEQ	%xmm0, %xmm6
+	movaps	%xmm6, %xmm7
+	por	%xmm4, %xmm6
+#ifndef USE_AS_WCSRCHR
+	pxor	%xmm8, %xmm8
+	PCMPEQ	%xmm5, %xmm8
+#endif
+	pmovmskb %xmm8, %ecx
+	pmovmskb %xmm6, %eax
+
+	addq	$(VEC_SIZE * 2), %rdi
+	/* Use `addl` 1) so we can undo it with `subl` and 2) it can
+	   macro-fuse with `jz`.  */
+	addl	%ecx, %eax
+	jz	L(first_loop)
+
+	/* Check if there is a zero match.  */
+	testl	%ecx, %ecx
+	jz	L(second_loop_match)
+
+	/* Check if there was a match in the last iteration.  */
+	subl	%ecx, %eax
+	jnz	L(new_match)
+
+L(first_loop_old_match):
+	PCMPEQ	%xmm0, %xmm2
+	PCMPEQ	%xmm0, %xmm3
+	pmovmskb %xmm2, %ecx
+	pmovmskb %xmm3, %eax
+	addl	%eax, %ecx
+	jz	L(first_vec_x0_test)
+	/* NB: We could move this shift to before the branch and save a
+	   bit of code size / performance on the fall through. The
+	   branch leads to the null case which generally seems hotter
+	   than CHAR in the first 3x VEC.  */
+	sall	$16, %eax
+	orl	%ecx, %eax
+
+	bsrl	%eax, %eax
+	addq	%rsi, %rax
+#ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+#endif
+	ret
+
+	.p2align 4
+L(new_match):
+	pxor	%xmm6, %xmm6
+	PCMPEQ	%xmm9, %xmm6
+	pmovmskb %xmm6, %eax
+	sall	$16, %ecx
+	orl	%eax, %ecx
+
+	/* We can't reuse either of the old comparisons: since we mask
+	   off zeros after the first zero (instead of using the full
+	   comparison) we can't guarantee no interference between a
+	   match after the end of the string and a valid match.  */
+	pmovmskb %xmm4, %eax
+	pmovmskb %xmm7, %edx
+	sall	$16, %edx
+	orl	%edx, %eax
+
+	leal	-1(%ecx), %edx
+	xorl	%edx, %ecx
+	andl	%ecx, %eax
+	jz	L(first_loop_old_match)
+	bsrl	%eax, %eax
+	addq	%rdi, %rax
+#ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+#endif
 	ret
 
+	/* Save the minimum state needed to get the most recent match.
+	   We can throw out all previous work.  */
 	.p2align 4
-L(no_c_found):
-	movl	$1, %esi
-	xorl	%ecx, %ecx
-	jmp	L(loop_header)
+L(second_loop_match):
+	movq	%rdi, %rsi
+	movaps	%xmm4, %xmm2
+	movaps	%xmm7, %xmm3
 
 	.p2align 4
-L(exit):
-	xorl	%eax, %eax
+L(second_loop):
+	movaps	(VEC_SIZE * 2)(%rdi), %xmm4
+	movaps	(VEC_SIZE * 3)(%rdi), %xmm5
+	/* Since SSE2 has no pminud, wcsrchr needs separate logic for
+	   detecting zero. Note: if this is found to be a bottleneck it
+	   may be worth adding an SSE4.1 wcsrchr implementation.  */
+#ifdef USE_AS_WCSRCHR
+	movaps	%xmm5, %xmm6
+	pxor	%xmm8, %xmm8
+
+	PCMPEQ	%xmm8, %xmm5
+	PCMPEQ	%xmm4, %xmm8
+	por	%xmm5, %xmm8
+#else
+	movaps	%xmm5, %xmm6
+	PMINU	%xmm4, %xmm5
+#endif
+
+	movaps	%xmm4, %xmm9
+	PCMPEQ	%xmm0, %xmm4
+	PCMPEQ	%xmm0, %xmm6
+	movaps	%xmm6, %xmm7
+	por	%xmm4, %xmm6
+#ifndef USE_AS_WCSRCHR
+	pxor	%xmm8, %xmm8
+	PCMPEQ	%xmm5, %xmm8
+#endif
+
+	pmovmskb %xmm8, %ecx
+	pmovmskb %xmm6, %eax
+
+	addq	$(VEC_SIZE * 2), %rdi
+	/* Either null term or new occurrence of CHAR.  */
+	addl	%ecx, %eax
+	jz	L(second_loop)
+
+	/* No null term, so it must be a new occurrence of CHAR.  */
+	testl	%ecx, %ecx
+	jz	L(second_loop_match)
+
+
+	subl	%ecx, %eax
+	jnz	L(second_loop_new_match)
+
+L(second_loop_old_match):
+	pmovmskb %xmm2, %ecx
+	pmovmskb %xmm3, %eax
+	sall	$16, %eax
+	orl	%ecx, %eax
+	bsrl	%eax, %eax
+	addq	%rsi, %rax
+#ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+#endif
 	ret
 
 	.p2align 4
+L(second_loop_new_match):
+	pxor	%xmm6, %xmm6
+	PCMPEQ	%xmm9, %xmm6
+	pmovmskb %xmm6, %eax
+	sall	$16, %ecx
+	orl	%eax, %ecx
+
+	/* We can't reuse either of the old comparisons: since we mask
+	   off zeros after the first zero (instead of using the full
+	   comparison) we can't guarantee no interference between a
+	   match after the end of the string and a valid match.  */
+	pmovmskb %xmm4, %eax
+	pmovmskb %xmm7, %edx
+	sall	$16, %edx
+	orl	%edx, %eax
+
+	leal	-1(%ecx), %edx
+	xorl	%edx, %ecx
+	andl	%ecx, %eax
+	jz	L(second_loop_old_match)
+	bsrl	%eax, %eax
+	addq	%rdi, %rax
+#ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+#endif
+	ret
+
+	.p2align 4,, 4
 L(cross_page):
-	movq	%rdi, %rax
-	pxor	%xmm0, %xmm0
-	andq	$-64, %rax
-	movdqu	(%rax), %xmm5
-	movdqa	%xmm5, %xmm6
-	movdqu	16(%rax), %xmm4
-	pcmpeqb	%xmm1, %xmm5
-	pcmpeqb	%xmm0, %xmm6
-	movdqu	32(%rax), %xmm3
-	pmovmskb	%xmm6, %esi
-	movdqa	%xmm4, %xmm6
-	movdqu	48(%rax), %xmm2
-	pcmpeqb	%xmm1, %xmm4
-	pcmpeqb	%xmm0, %xmm6
-	pmovmskb	%xmm6, %edx
-	movdqa	%xmm3, %xmm6
-	pcmpeqb	%xmm1, %xmm3
-	pcmpeqb	%xmm0, %xmm6
-	pcmpeqb	%xmm2, %xmm0
-	salq	$16, %rdx
-	pmovmskb	%xmm3, %r9d
-	pmovmskb	%xmm6, %r8d
-	pmovmskb	%xmm0, %ecx
-	salq	$32, %r9
-	salq	$32, %r8
-	pcmpeqb	%xmm1, %xmm2
-	orq	%r8, %rdx
-	salq	$48, %rcx
-	pmovmskb	%xmm5, %r8d
-	orq	%rsi, %rdx
-	pmovmskb	%xmm4, %esi
-	orq	%rcx, %rdx
-	pmovmskb	%xmm2, %ecx
-	salq	$16, %rsi
-	salq	$48, %rcx
-	orq	%r9, %rsi
-	orq	%r8, %rsi
-	orq	%rcx, %rsi
+	movq	%rdi, %rsi
+	andq	$-VEC_SIZE, %rsi
+	movaps	(%rsi), %xmm1
+	pxor	%xmm2, %xmm2
+	PCMPEQ	%xmm1, %xmm2
+	pmovmskb %xmm2, %edx
 	movl	%edi, %ecx
-	subl	%eax, %ecx
-	shrq	%cl, %rdx
-	shrq	%cl, %rsi
-	testq	%rdx, %rdx
-	je	L(loop_header2)
-	leaq	-1(%rdx), %rax
-	xorq	%rdx, %rax
-	andq	%rax, %rsi
-	je	L(exit)
-	bsrq	%rsi, %rax
+	andl	$(VEC_SIZE - 1), %ecx
+	sarl	%cl, %edx
+	jz	L(cross_page_continue)
+	PCMPEQ	%xmm0, %xmm1
+	pmovmskb %xmm1, %eax
+	sarl	%cl, %eax
+	leal	-1(%rdx), %ecx
+	xorl	%edx, %ecx
+	andl	%ecx, %eax
+	jz	L(ret1)
+	bsrl	%eax, %eax
 	addq	%rdi, %rax
+#ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+#endif
+L(ret1):
 	ret
-END (strrchr)
+END(STRRCHR)
 
-weak_alias (strrchr, rindex)
-libc_hidden_builtin_def (strrchr)
+#ifndef USE_AS_WCSRCHR
+	weak_alias (STRRCHR, rindex)
+	libc_hidden_builtin_def (STRRCHR)
+#endif
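Note (editor's illustration, not part of the patch): the
three-instruction pattern that recurs in the return paths above,

	leal	-1(%rcx), %edx
	xorl	%edx, %ecx
	andl	%ecx, %eax

computes eax &= (ecx - 1) ^ ecx, keeping only the bits up to and
including the lowest set bit of ecx (the first null terminator), so
CHAR matches past the end of the string are discarded. BMI1 later
provides this mask as a single blsmsk instruction. A standalone C
check of the identity:

#include <assert.h>
#include <stdint.h>

static uint32_t
keep_matches_through_first_zero (uint32_t char_mask, uint32_t zero_mask)
{
  /* (x - 1) ^ x sets every bit up to and including the lowest set
     bit of x.  */
  return char_mask & ((zero_mask - 1) ^ zero_mask);
}

int
main (void)
{
  /* Terminator at bit 4; CHAR matches at bits 2 and 6. The match at
     bit 6 lies past the terminator and must be dropped.  */
  assert (keep_matches_through_first_zero (0x44, 0x10) == 0x04);
  /* Searching for '\0' itself: the match at the terminator bit
     survives, as strrchr (s, '\0') requires.  */
  assert (keep_matches_through_first_zero (0x10, 0x10) == 0x10);
  return 0;
}
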
diff --git a/sysdeps/x86_64/wcsrchr.S b/sysdeps/x86_64/wcsrchr.S
index 2f388537..ae3cfa7d 100644
--- a/sysdeps/x86_64/wcsrchr.S
+++ b/sysdeps/x86_64/wcsrchr.S
@@ -17,266 +17,12 @@
    License along with the GNU C Library; if not, see
    <http://www.gnu.org/licenses/>.  */
 
-#include <sysdep.h>
 
-	.text
-ENTRY (wcsrchr)
+#define USE_AS_WCSRCHR	1
+#define NO_PMINU	1
 
-	movd	%rsi, %xmm1
-	mov	%rdi, %rcx
-	punpckldq %xmm1, %xmm1
-	pxor	%xmm2, %xmm2
-	punpckldq %xmm1, %xmm1
-	and	$63, %rcx
-	cmp	$48, %rcx
-	ja	L(crosscache)
+#ifndef STRRCHR
+# define STRRCHR	wcsrchr
+#endif
 
-	movdqu	(%rdi), %xmm0
-	pcmpeqd	%xmm0, %xmm2
-	pcmpeqd	%xmm1, %xmm0
-	pmovmskb %xmm2, %rcx
-	pmovmskb %xmm0, %rax
-	add	$16, %rdi
-
-	test	%rax, %rax
-	jnz	L(unaligned_match1)
-
-	test	%rcx, %rcx
-	jnz	L(return_null)
-
-	and	$-16, %rdi
-	xor	%r8, %r8
-	jmp	L(loop)
-
-	.p2align 4
-L(unaligned_match1):
-	test	%rcx, %rcx
-	jnz	L(prolog_find_zero_1)
-
-	mov	%rax, %r8
-	mov	%rdi, %rsi
-	and	$-16, %rdi
-	jmp	L(loop)
-
-	.p2align 4
-L(crosscache):
-	and	$15, %rcx
-	and	$-16, %rdi
-	pxor	%xmm3, %xmm3
-	movdqa	(%rdi), %xmm0
-	pcmpeqd	%xmm0, %xmm3
-	pcmpeqd	%xmm1, %xmm0
-	pmovmskb %xmm3, %rdx
-	pmovmskb %xmm0, %rax
-	shr	%cl, %rdx
-	shr	%cl, %rax
-	add	$16, %rdi
-
-	test	%rax, %rax
-	jnz	L(unaligned_match)
-
-	test	%rdx, %rdx
-	jnz	L(return_null)
-
-	xor	%r8, %r8
-	jmp	L(loop)
-
-	.p2align 4
-L(unaligned_match):
-	test	%rdx, %rdx
-	jnz	L(prolog_find_zero)
-
-	mov	%rax, %r8
-	lea	(%rdi, %rcx), %rsi
-
-/* Loop start on aligned string.  */
-	.p2align 4
-L(loop):
-	movdqa	(%rdi), %xmm0
-	pcmpeqd	%xmm0, %xmm2
-	add	$16, %rdi
-	pcmpeqd	%xmm1, %xmm0
-	pmovmskb %xmm2, %rcx
-	pmovmskb %xmm0, %rax
-	or	%rax, %rcx
-	jnz	L(matches)
-
-	movdqa	(%rdi), %xmm3
-	pcmpeqd	%xmm3, %xmm2
-	add	$16, %rdi
-	pcmpeqd	%xmm1, %xmm3
-	pmovmskb %xmm2, %rcx
-	pmovmskb %xmm3, %rax
-	or	%rax, %rcx
-	jnz	L(matches)
-
-	movdqa	(%rdi), %xmm4
-	pcmpeqd	%xmm4, %xmm2
-	add	$16, %rdi
-	pcmpeqd	%xmm1, %xmm4
-	pmovmskb %xmm2, %rcx
-	pmovmskb %xmm4, %rax
-	or	%rax, %rcx
-	jnz	L(matches)
-
-	movdqa	(%rdi), %xmm5
-	pcmpeqd	%xmm5, %xmm2
-	add	$16, %rdi
-	pcmpeqd	%xmm1, %xmm5
-	pmovmskb %xmm2, %rcx
-	pmovmskb %xmm5, %rax
-	or	%rax, %rcx
-	jz	L(loop)
-
-	.p2align 4
-L(matches):
-	test	%rax, %rax
-	jnz	L(match)
-L(return_value):
-	test	%r8, %r8
-	jz	L(return_null)
-	mov	%r8, %rax
-	mov	%rsi, %rdi
-
-	test	$15 << 4, %ah
-	jnz	L(match_fourth_wchar)
-	test	%ah, %ah
-	jnz	L(match_third_wchar)
-	test	$15 << 4, %al
-	jnz	L(match_second_wchar)
-	lea	-16(%rdi), %rax
-	ret
-
-	.p2align 4
-L(match):
-	pmovmskb %xmm2, %rcx
-	test	%rcx, %rcx
-	jnz	L(find_zero)
-	mov	%rax, %r8
-	mov	%rdi, %rsi
-	jmp	L(loop)
-
-	.p2align 4
-L(find_zero):
-	test	$15, %cl
-	jnz	L(find_zero_in_first_wchar)
-	test	%cl, %cl
-	jnz	L(find_zero_in_second_wchar)
-	test	$15, %ch
-	jnz	L(find_zero_in_third_wchar)
-
-	and	$1 << 13 - 1, %rax
-	jz	L(return_value)
-
-	test	$15 << 4, %ah
-	jnz	L(match_fourth_wchar)
-	test	%ah, %ah
-	jnz	L(match_third_wchar)
-	test	$15 << 4, %al
-	jnz	L(match_second_wchar)
-	lea	-16(%rdi), %rax
-	ret
-
-	.p2align 4
-L(find_zero_in_first_wchar):
-	test	$1, %rax
-	jz	L(return_value)
-	lea	-16(%rdi), %rax
-	ret
-
-	.p2align 4
-L(find_zero_in_second_wchar):
-	and	$1 << 5 - 1, %rax
-	jz	L(return_value)
-
-	test	$15 << 4, %al
-	jnz	L(match_second_wchar)
-	lea	-16(%rdi), %rax
-	ret
-
-	.p2align 4
-L(find_zero_in_third_wchar):
-	and	$1 << 9 - 1, %rax
-	jz	L(return_value)
-
-	test	%ah, %ah
-	jnz	L(match_third_wchar)
-	test	$15 << 4, %al
-	jnz	L(match_second_wchar)
-	lea	-16(%rdi), %rax
-	ret
-
-	.p2align 4
-L(prolog_find_zero):
-	add	%rcx, %rdi
-	mov     %rdx, %rcx
-L(prolog_find_zero_1):
-	test	$15, %cl
-	jnz	L(prolog_find_zero_in_first_wchar)
-	test	%cl, %cl
-	jnz	L(prolog_find_zero_in_second_wchar)
-	test	$15, %ch
-	jnz	L(prolog_find_zero_in_third_wchar)
-
-	and	$1 << 13 - 1, %rax
-	jz	L(return_null)
-
-	test	$15 << 4, %ah
-	jnz	L(match_fourth_wchar)
-	test	%ah, %ah
-	jnz	L(match_third_wchar)
-	test	$15 << 4, %al
-	jnz	L(match_second_wchar)
-	lea	-16(%rdi), %rax
-	ret
-
-	.p2align 4
-L(prolog_find_zero_in_first_wchar):
-	test	$1, %rax
-	jz	L(return_null)
-	lea	-16(%rdi), %rax
-	ret
-
-	.p2align 4
-L(prolog_find_zero_in_second_wchar):
-	and	$1 << 5 - 1, %rax
-	jz	L(return_null)
-
-	test	$15 << 4, %al
-	jnz	L(match_second_wchar)
-	lea	-16(%rdi), %rax
-	ret
-
-	.p2align 4
-L(prolog_find_zero_in_third_wchar):
-	and	$1 << 9 - 1, %rax
-	jz	L(return_null)
-
-	test	%ah, %ah
-	jnz	L(match_third_wchar)
-	test	$15 << 4, %al
-	jnz	L(match_second_wchar)
-	lea	-16(%rdi), %rax
-	ret
-
-	.p2align 4
-L(match_second_wchar):
-	lea	-12(%rdi), %rax
-	ret
-
-	.p2align 4
-L(match_third_wchar):
-	lea	-8(%rdi), %rax
-	ret
-
-	.p2align 4
-L(match_fourth_wchar):
-	lea	-4(%rdi), %rax
-	ret
-
-	.p2align 4
-L(return_null):
-	xor	%rax, %rax
-	ret
-
-END (wcsrchr)
+#include "../strrchr.S"
-- 
GitLab
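
Note (editor's illustration, not part of the patch): on the zero
detection that wcsrchr.S now inherits from strrchr.S. The wide variant
builds with 4-byte elements, and SSE2 has pminub for unsigned byte
minimum but no unsigned dword minimum (pminud is SSE4.1), so the
USE_AS_WCSRCHR branches in the loops compare each of the two loop
vectors against zero and OR the results, where the byte build folds
both vectors with PMINU first and does a single compare. The two tests
agree element-wise; a standalone scalar check:

#include <assert.h>
#include <stdint.h>

int
main (void)
{
  for (uint32_t a = 0; a < 4; a++)
    for (uint32_t b = 0; b < 4; b++)
      {
	/* strrchr path: PMINU, then one PCMPEQ against zero.  */
	int via_min = ((a < b ? a : b) == 0);
	/* wcsrchr path: two PCMPEQs against zero, then POR.  */
	int via_or = (a == 0) | (b == 0);
	assert (via_min == via_or);
      }
  return 0;
}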