commit 4901009dad8b3ab141ac6e0caebe99e03a67f5eb
Author: Noah Goldstein <goldstein.w.n@gmail.com>
Date:   Mon Jun 6 21:11:30 2022 -0700

    x86: Optimize memrchr-sse2.S

    The new code:
        1. prioritizes smaller lengths more.
        2. optimizes target placement more carefully.
        3. reuses logic more.
        4. fixes up various inefficiencies in the logic.

    The total code size saving is: 394 bytes
    Geometric Mean of all benchmarks New / Old: 0.874

    Regressions:
        1. The page cross case is now colder, especially re-entry from the
           page cross case if a match is not found in the first VEC
           (roughly 50%). My general opinion with this patch is this is
           acceptable given the "coldness" of this case (less than 4%) and
           the general performance improvement in the other far more common
           cases.

        2. There are some regressions of 5-15% for medium/large user-arg
           lengths that have a match in the first VEC. This is because the
           logic was rewritten to optimize finds in the first VEC if the
           user-arg length is shorter (where we see roughly 20-50%
           performance improvements). It is not always the case that this
           is a regression. My intuition is that some frontend quirk is
           partially explaining the data, although I haven't been able to
           find the root cause.

    Full xcheck passes on x86_64.
    Reviewed-by: H.J. Lu <hjl.tools@gmail.com>

    (cherry picked from commit 731feee3869550e93177e604604c1765d81de571)

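For readers following the patch, the overall strategy of the rewritten routine
can be sketched in C with SSE2 intrinsics roughly as follows. This is an
illustrative sketch only, not the glibc implementation; the names
memrchr_sse2_sketch and highest_set_bit are invented for the example,
__builtin_clz assumes GCC/Clang, and the alignment and page-cross handling of
the real assembly are deliberately omitted.

/* Sketch: broadcast the search byte, scan 16-byte chunks backwards from
   the end pointer, and use the compare mask plus a bsr-style scan to
   find the highest matching byte.  */
#include <emmintrin.h>
#include <stddef.h>

static inline int
highest_set_bit (unsigned int mask)
{
  /* Equivalent of the `bsr` instruction for a non-zero mask.  */
  return 31 - __builtin_clz (mask);
}

void *
memrchr_sse2_sketch (const void *s, int c, size_t n)
{
  const unsigned char *end = (const unsigned char *) s + n;
  const __m128i vc = _mm_set1_epi8 ((char) c);	/* broadcast CHAR */

  /* Whole 16-byte chunks, walking backwards from the end pointer.  */
  while (n >= 16)
    {
      end -= 16;
      n -= 16;
      __m128i chunk = _mm_loadu_si128 ((const __m128i *) end);
      unsigned int mask = _mm_movemask_epi8 (_mm_cmpeq_epi8 (chunk, vc));
      if (mask != 0)
	return (void *) (end + highest_set_bit (mask));
    }

  /* Scalar tail for the remaining bytes (the real code instead bounds-checks
     vector compare masks and handles page boundaries carefully).  */
  while (n--)
    if (((const unsigned char *) s)[n] == (unsigned char) c)
      return (void *) ((const unsigned char *) s + n);
  return NULL;
}

The assembly below implements the same idea, but works from an end pointer
kept in rcx, keeps the hot len <= VEC_SIZE case on a single cache line, and
handles the partial first/last vector by bounds-checking the bsr result
against the remaining length instead of using a scalar loop.
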
diff --git a/sysdeps/x86_64/memrchr.S b/sysdeps/x86_64/memrchr.S
index cc2001167d77c83c..c2a5902bf9385c67 100644
--- a/sysdeps/x86_64/memrchr.S
+++ b/sysdeps/x86_64/memrchr.S
@@ -19,362 +19,333 @@
    <https://www.gnu.org/licenses/>.  */
 
 #include <sysdep.h>
+#define VEC_SIZE			16
+#define PAGE_SIZE			4096
 
 	.text
-ENTRY (__memrchr)
-	movd	%esi, %xmm1
-
-	sub	$16, %RDX_LP
-	jbe	L(length_less16)
-
-	punpcklbw	%xmm1, %xmm1
-	punpcklbw	%xmm1, %xmm1
-
-	add	%RDX_LP, %RDI_LP
-	pshufd	$0, %xmm1, %xmm1
-
-	movdqu	(%rdi), %xmm0
-	pcmpeqb	%xmm1, %xmm0
-
-/* Check if there is a match.  */
-	pmovmskb	%xmm0, %eax
-	test	%eax, %eax
-	jnz	L(matches0)
-
-	sub	$64, %rdi
-	mov	%edi, %ecx
-	and	$15, %ecx
-	jz	L(loop_prolog)
-
-	add	$16, %rdi
-	add	$16, %rdx
-	and	$-16, %rdi
-	sub	%rcx, %rdx
-
-	.p2align 4
-L(loop_prolog):
-	sub	$64, %rdx
-	jbe	L(exit_loop)
-
-	movdqa	48(%rdi), %xmm0
-	pcmpeqb	%xmm1, %xmm0
-	pmovmskb	%xmm0, %eax
-	test	%eax, %eax
-	jnz	L(matches48)
-
-	movdqa	32(%rdi), %xmm2
-	pcmpeqb	%xmm1, %xmm2
-	pmovmskb	%xmm2, %eax
-	test	%eax, %eax
-	jnz	L(matches32)
-
-	movdqa	16(%rdi), %xmm3
-	pcmpeqb	%xmm1, %xmm3
-	pmovmskb	%xmm3, %eax
-	test	%eax, %eax
-	jnz	L(matches16)
-
-	movdqa	(%rdi), %xmm4
-	pcmpeqb	%xmm1, %xmm4
-	pmovmskb	%xmm4, %eax
-	test	%eax, %eax
-	jnz	L(matches0)
-
-	sub	$64, %rdi
-	sub	$64, %rdx
-	jbe	L(exit_loop)
-
-	movdqa	48(%rdi), %xmm0
-	pcmpeqb	%xmm1, %xmm0
-	pmovmskb	%xmm0, %eax
-	test	%eax, %eax
-	jnz	L(matches48)
-
-	movdqa	32(%rdi), %xmm2
-	pcmpeqb	%xmm1, %xmm2
-	pmovmskb	%xmm2, %eax
-	test	%eax, %eax
-	jnz	L(matches32)
-
-	movdqa	16(%rdi), %xmm3
-	pcmpeqb	%xmm1, %xmm3
-	pmovmskb	%xmm3, %eax
-	test	%eax, %eax
-	jnz	L(matches16)
-
-	movdqa	(%rdi), %xmm3
-	pcmpeqb	%xmm1, %xmm3
-	pmovmskb	%xmm3, %eax
-	test	%eax, %eax
-	jnz	L(matches0)
-
-	mov	%edi, %ecx
-	and	$63, %ecx
-	jz	L(align64_loop)
-
-	add	$64, %rdi
-	add	$64, %rdx
-	and	$-64, %rdi
-	sub	%rcx, %rdx
-
-	.p2align 4
-L(align64_loop):
-	sub	$64, %rdi
-	sub	$64, %rdx
-	jbe	L(exit_loop)
-
-	movdqa	(%rdi), %xmm0
-	movdqa	16(%rdi), %xmm2
-	movdqa	32(%rdi), %xmm3
-	movdqa	48(%rdi), %xmm4
-
-	pcmpeqb	%xmm1, %xmm0
-	pcmpeqb	%xmm1, %xmm2
-	pcmpeqb	%xmm1, %xmm3
-	pcmpeqb	%xmm1, %xmm4
-
-	pmaxub	%xmm3, %xmm0
-	pmaxub	%xmm4, %xmm2
-	pmaxub	%xmm0, %xmm2
-	pmovmskb	%xmm2, %eax
-
-	test	%eax, %eax
-	jz	L(align64_loop)
-
-	pmovmskb	%xmm4, %eax
-	test	%eax, %eax
-	jnz	L(matches48)
-
-	pmovmskb	%xmm3, %eax
-	test	%eax, %eax
-	jnz	L(matches32)
-
-	movdqa	16(%rdi), %xmm2
-
-	pcmpeqb	%xmm1, %xmm2
-	pcmpeqb	(%rdi), %xmm1
-
-	pmovmskb	%xmm2, %eax
-	test	%eax, %eax
-	jnz	L(matches16)
-
-	pmovmskb	%xmm1, %eax
-	bsr	%eax, %eax
-
-	add	%rdi, %rax
+ENTRY_P2ALIGN(__memrchr, 6)
+#ifdef __ILP32__
+	/* Clear upper bits.  */
+	mov	%RDX_LP, %RDX_LP
+#endif
+	movd	%esi, %xmm0
+
+	/* Get end pointer.  */
+	leaq	(%rdx, %rdi), %rcx
+
+	punpcklbw %xmm0, %xmm0
+	punpcklwd %xmm0, %xmm0
+	pshufd	$0, %xmm0, %xmm0
+
+	/* Check if we can load 1x VEC without crossing a page.  */
+	testl	$(PAGE_SIZE - VEC_SIZE), %ecx
+	jz	L(page_cross)
+
+	/* NB: This load happens regardless of whether rdx (len) is zero. Since
+	   it doesn't cross a page and the standard guarantees that any pointer
+	   has at least one valid byte, this load must be safe. For the entire
+	   history of the x86 memrchr implementation this has been possible, so
+	   no code "should" be relying on a zero-length check before this load.
+	   The zero-length check is moved to the page cross case because it is
+	   pretty cold, and including it here would push the hot case
+	   (len <= VEC_SIZE) into 2 cache lines.  */
+	movups	-(VEC_SIZE)(%rcx), %xmm1
+	pcmpeqb	%xmm0, %xmm1
+	pmovmskb %xmm1, %eax
+
+	subq	$VEC_SIZE, %rdx
+	ja	L(more_1x_vec)
+L(ret_vec_x0_test):
+	/* Zero-flag set if eax (src) is zero. Destination unchanged if src is
+	   zero.  */
+	bsrl	%eax, %eax
+	jz	L(ret_0)
+	/* Check if the CHAR match is in bounds. Need to truly zero `eax` here
+	   if out of bounds.  */
+	addl	%edx, %eax
+	jl	L(zero_0)
+	/* Since we subtracted VEC_SIZE from rdx earlier we can just add to base
+	   ptr.  */
+	addq	%rdi, %rax
+L(ret_0):
 	ret
 
-	.p2align 4
-L(exit_loop):
-	add	$64, %edx
-	cmp	$32, %edx
-	jbe	L(exit_loop_32)
-
-	movdqa	48(%rdi), %xmm0
-	pcmpeqb	%xmm1, %xmm0
-	pmovmskb	%xmm0, %eax
-	test	%eax, %eax
-	jnz	L(matches48)
-
-	movdqa	32(%rdi), %xmm2
-	pcmpeqb	%xmm1, %xmm2
-	pmovmskb	%xmm2, %eax
-	test	%eax, %eax
-	jnz	L(matches32)
-
-	movdqa	16(%rdi), %xmm3
-	pcmpeqb	%xmm1, %xmm3
-	pmovmskb	%xmm3, %eax
-	test	%eax, %eax
-	jnz	L(matches16_1)
-	cmp	$48, %edx
-	jbe	L(return_null)
-
-	pcmpeqb	(%rdi), %xmm1
-	pmovmskb	%xmm1, %eax
-	test	%eax, %eax
-	jnz	L(matches0_1)
-	xor	%eax, %eax
+	.p2align 4,, 5
+L(ret_vec_x0):
+	bsrl	%eax, %eax
+	leaq	-(VEC_SIZE)(%rcx, %rax), %rax
 	ret
 
-	.p2align 4
-L(exit_loop_32):
-	movdqa	48(%rdi), %xmm0
-	pcmpeqb	%xmm1, %xmm0
-	pmovmskb	%xmm0, %eax
-	test	%eax, %eax
-	jnz	L(matches48_1)
-	cmp	$16, %edx
-	jbe	L(return_null)
-
-	pcmpeqb	32(%rdi), %xmm1
-	pmovmskb	%xmm1, %eax
-	test	%eax, %eax
-	jnz	L(matches32_1)
-	xor	%eax, %eax
+	.p2align 4,, 2
+L(zero_0):
+	xorl	%eax, %eax
 	ret
 
-	.p2align 4
-L(matches0):
-	bsr	%eax, %eax
-	add	%rdi, %rax
-	ret
-
-	.p2align 4
-L(matches16):
-	bsr	%eax, %eax
-	lea	16(%rax, %rdi), %rax
-	ret
 
-	.p2align 4
-L(matches32):
-	bsr	%eax, %eax
-	lea	32(%rax, %rdi), %rax
+	.p2align 4,, 8
+L(more_1x_vec):
+	testl	%eax, %eax
+	jnz	L(ret_vec_x0)
+
+	/* Align rcx (pointer to string).  */
+	decq	%rcx
+	andq	$-VEC_SIZE, %rcx
+
+	movq	%rcx, %rdx
+	/* NB: We could consistently save 1 byte in this pattern with `movaps
+	   %xmm0, %xmm1; pcmpeq IMM8(r), %xmm1; ...`. The reason against it is
+	   that it adds more frontend uops (even if the moves can be eliminated)
+	   and some percentage of the time actual backend uops.  */
+	movaps	-(VEC_SIZE)(%rcx), %xmm1
+	pcmpeqb	%xmm0, %xmm1
+	subq	%rdi, %rdx
+	pmovmskb %xmm1, %eax
+
+	cmpq	$(VEC_SIZE * 2), %rdx
+	ja	L(more_2x_vec)
+L(last_2x_vec):
+	subl	$VEC_SIZE, %edx
+	jbe	L(ret_vec_x0_test)
+
+	testl	%eax, %eax
+	jnz	L(ret_vec_x0)
+
+	movaps	-(VEC_SIZE * 2)(%rcx), %xmm1
+	pcmpeqb	%xmm0, %xmm1
+	pmovmskb %xmm1, %eax
+
+	subl	$VEC_SIZE, %edx
+	bsrl	%eax, %eax
+	jz	L(ret_1)
+	addl	%edx, %eax
+	jl	L(zero_0)
+	addq	%rdi, %rax
+L(ret_1):
 	ret
 
-	.p2align 4
-L(matches48):
-	bsr	%eax, %eax
-	lea	48(%rax, %rdi), %rax
+	/* Don't align. Otherwise we lose the 2-byte encoding of the jump to
+	   L(page_cross), which causes the hot path (length <= VEC_SIZE) to span
+	   multiple cache lines.  Naturally aligned % 16 to 8 bytes.  */
+L(page_cross):
+	/* Zero length check.  */
+	testq	%rdx, %rdx
+	jz	L(zero_0)
+
+	leaq	-1(%rcx), %r8
+	andq	$-(VEC_SIZE), %r8
+
+	movaps	(%r8), %xmm1
+	pcmpeqb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	/* Shift out negative alignment (because we are starting from endptr and
+	   working backwards).  */
+	negl	%ecx
+	/* 32-bit shift but VEC_SIZE=16 so need to mask the shift count
+	   explicitly.  */
+	andl	$(VEC_SIZE - 1), %ecx
+	shl	%cl, %esi
+	movzwl	%si, %eax
+	leaq	(%rdi, %rdx), %rcx
+	cmpq	%rdi, %r8
+	ja	L(more_1x_vec)
+	subl	$VEC_SIZE, %edx
+	bsrl	%eax, %eax
+	jz	L(ret_2)
+	addl	%edx, %eax
+	jl	L(zero_1)
+	addq	%rdi, %rax
+L(ret_2):
 	ret
 
-	.p2align 4
-L(matches0_1):
-	bsr	%eax, %eax
-	sub	$64, %rdx
-	add	%rax, %rdx
-	jl	L(return_null)
-	add	%rdi, %rax
+	/* Fits in aligning bytes.  */
+L(zero_1):
+	xorl	%eax, %eax
 	ret
 
-	.p2align 4
-L(matches16_1):
-	bsr	%eax, %eax
-	sub	$48, %rdx
-	add	%rax, %rdx
-	jl	L(return_null)
-	lea	16(%rdi, %rax), %rax
+	.p2align 4,, 5
+L(ret_vec_x1):
+	bsrl	%eax, %eax
+	leaq	-(VEC_SIZE * 2)(%rcx, %rax), %rax
 	ret
 
-	.p2align 4
-L(matches32_1):
-	bsr	%eax, %eax
-	sub	$32, %rdx
-	add	%rax, %rdx
-	jl	L(return_null)
-	lea	32(%rdi, %rax), %rax
-	ret
+	.p2align 4,, 8
+L(more_2x_vec):
+	testl	%eax, %eax
+	jnz	L(ret_vec_x0)
 
-	.p2align 4
-L(matches48_1):
-	bsr	%eax, %eax
-	sub	$16, %rdx
-	add	%rax, %rdx
-	jl	L(return_null)
-	lea	48(%rdi, %rax), %rax
-	ret
+	movaps	-(VEC_SIZE * 2)(%rcx), %xmm1
+	pcmpeqb	%xmm0, %xmm1
+	pmovmskb %xmm1, %eax
+	testl	%eax, %eax
+	jnz	L(ret_vec_x1)
 
-	.p2align 4
-L(return_null):
-	xor	%eax, %eax
-	ret
 
-	.p2align 4
-L(length_less16_offset0):
-	test	%edx, %edx
-	jz	L(return_null)
+	movaps	-(VEC_SIZE * 3)(%rcx), %xmm1
+	pcmpeqb	%xmm0, %xmm1
+	pmovmskb %xmm1, %eax
 
-	mov	%dl, %cl
-	pcmpeqb	(%rdi), %xmm1
+	subq	$(VEC_SIZE * 4), %rdx
+	ja	L(more_4x_vec)
 
-	mov	$1, %edx
-	sal	%cl, %edx
-	sub	$1, %edx
+	addl	$(VEC_SIZE), %edx
+	jle	L(ret_vec_x2_test)
 
-	pmovmskb	%xmm1, %eax
+L(last_vec):
+	testl	%eax, %eax
+	jnz	L(ret_vec_x2)
 
-	and	%edx, %eax
-	test	%eax, %eax
-	jz	L(return_null)
+	movaps	-(VEC_SIZE * 4)(%rcx), %xmm1
+	pcmpeqb	%xmm0, %xmm1
+	pmovmskb %xmm1, %eax
 
-	bsr	%eax, %eax
-	add	%rdi, %rax
+	subl	$(VEC_SIZE), %edx
+	bsrl	%eax, %eax
+	jz	L(ret_3)
+	addl	%edx, %eax
+	jl	L(zero_2)
+	addq	%rdi, %rax
+L(ret_3):
 	ret
 
-	.p2align 4
-L(length_less16):
-	punpcklbw	%xmm1, %xmm1
-	punpcklbw	%xmm1, %xmm1
-
-	add	$16, %edx
-
-	pshufd	$0, %xmm1, %xmm1
-
-	mov	%edi, %ecx
-	and	$15, %ecx
-	jz	L(length_less16_offset0)
-
-	mov	%cl, %dh
-	mov	%ecx, %esi
-	add	%dl, %dh
-	and	$-16, %rdi
-
-	sub	$16, %dh
-	ja	L(length_less16_part2)
-
-	pcmpeqb	(%rdi), %xmm1
-	pmovmskb	%xmm1, %eax
-
-	sar	%cl, %eax
-	mov	%dl, %cl
-
-	mov	$1, %edx
-	sal	%cl, %edx
-	sub	$1, %edx
-
-	and	%edx, %eax
-	test	%eax, %eax
-	jz	L(return_null)
-
-	bsr	%eax, %eax
-	add	%rdi, %rax
-	add	%rsi, %rax
+	.p2align 4,, 6
+L(ret_vec_x2_test):
+	bsrl	%eax, %eax
+	jz	L(zero_2)
+	addl	%edx, %eax
+	jl	L(zero_2)
+	addq	%rdi, %rax
 	ret
 
-	.p2align 4
-L(length_less16_part2):
-	movdqa	16(%rdi), %xmm2
-	pcmpeqb	%xmm1, %xmm2
-	pmovmskb	%xmm2, %eax
-
-	mov	%dh, %cl
-	mov	$1, %edx
-	sal	%cl, %edx
-	sub	$1, %edx
-
-	and	%edx, %eax
+L(zero_2):
+	xorl	%eax, %eax
+	ret
 
-	test	%eax, %eax
-	jnz	L(length_less16_part2_return)
 
-	pcmpeqb	(%rdi), %xmm1
-	pmovmskb	%xmm1, %eax
+	.p2align 4,, 5
+L(ret_vec_x2):
+	bsrl	%eax, %eax
+	leaq	-(VEC_SIZE * 3)(%rcx, %rax), %rax
+	ret
 
-	mov	%esi, %ecx
-	sar	%cl, %eax
-	test	%eax, %eax
-	jz	L(return_null)
+	.p2align 4,, 5
+L(ret_vec_x3):
+	bsrl	%eax, %eax
+	leaq	-(VEC_SIZE * 4)(%rcx, %rax), %rax
+	ret
 
-	bsr	%eax, %eax
-	add	%rdi, %rax
-	add	%rsi, %rax
+	.p2align 4,, 8
+L(more_4x_vec):
+	testl	%eax, %eax
+	jnz	L(ret_vec_x2)
+
+	movaps	-(VEC_SIZE * 4)(%rcx), %xmm1
+	pcmpeqb	%xmm0, %xmm1
+	pmovmskb %xmm1, %eax
+
+	testl	%eax, %eax
+	jnz	L(ret_vec_x3)
+
+	addq	$-(VEC_SIZE * 4), %rcx
+	cmpq	$(VEC_SIZE * 4), %rdx
+	jbe	L(last_4x_vec)
+
+	/* Offset everything by 4x VEC_SIZE here to save a few bytes at the end,
+	   keeping the code from spilling to the next cache line.  */
+	addq	$(VEC_SIZE * 4 - 1), %rcx
+	andq	$-(VEC_SIZE * 4), %rcx
+	leaq	(VEC_SIZE * 4)(%rdi), %rdx
+	andq	$-(VEC_SIZE * 4), %rdx
+
+	.p2align 4,, 11
+L(loop_4x_vec):
+	movaps	(VEC_SIZE * -1)(%rcx), %xmm1
+	movaps	(VEC_SIZE * -2)(%rcx), %xmm2
+	movaps	(VEC_SIZE * -3)(%rcx), %xmm3
+	movaps	(VEC_SIZE * -4)(%rcx), %xmm4
+	pcmpeqb	%xmm0, %xmm1
+	pcmpeqb	%xmm0, %xmm2
+	pcmpeqb	%xmm0, %xmm3
+	pcmpeqb	%xmm0, %xmm4
+
+	por	%xmm1, %xmm2
+	por	%xmm3, %xmm4
+	por	%xmm2, %xmm4
+
+	pmovmskb %xmm4, %esi
+	testl	%esi, %esi
+	jnz	L(loop_end)
+
+	addq	$-(VEC_SIZE * 4), %rcx
+	cmpq	%rdx, %rcx
+	jne	L(loop_4x_vec)
+
+	subl	%edi, %edx
+
+	/* Ends up being 1-byte nop.  */
+	.p2align 4,, 2
+L(last_4x_vec):
+	movaps	-(VEC_SIZE)(%rcx), %xmm1
+	pcmpeqb	%xmm0, %xmm1
+	pmovmskb %xmm1, %eax
+
+	cmpl	$(VEC_SIZE * 2), %edx
+	jbe	L(last_2x_vec)
+
+	testl	%eax, %eax
+	jnz	L(ret_vec_x0)
+
+
+	movaps	-(VEC_SIZE * 2)(%rcx), %xmm1
+	pcmpeqb	%xmm0, %xmm1
+	pmovmskb %xmm1, %eax
+
+	testl	%eax, %eax
+	jnz	L(ret_vec_end)
+
+	movaps	-(VEC_SIZE * 3)(%rcx), %xmm1
+	pcmpeqb	%xmm0, %xmm1
+	pmovmskb %xmm1, %eax
+
+	subl	$(VEC_SIZE * 3), %edx
+	ja	L(last_vec)
+	bsrl	%eax, %eax
+	jz	L(ret_4)
+	addl	%edx, %eax
+	jl	L(zero_3)
+	addq	%rdi, %rax
+L(ret_4):
 	ret
 
-	.p2align 4
-L(length_less16_part2_return):
-	bsr	%eax, %eax
-	lea	16(%rax, %rdi), %rax
+	/* Ends up being 1-byte nop.  */
+	.p2align 4,, 3
+L(loop_end):
+	pmovmskb %xmm1, %eax
+	sall	$16, %eax
+	jnz	L(ret_vec_end)
+
+	pmovmskb %xmm2, %eax
+	testl	%eax, %eax
+	jnz	L(ret_vec_end)
+
+	pmovmskb %xmm3, %eax
+	/* Combine last 2 VEC matches. If eax (VEC3) is zero (no CHAR in VEC3)
+	   then it won't affect the result in esi (VEC4). If eax is non-zero
+	   then there is a CHAR in VEC3 and bsrl will use that position.  */
+	sall	$16, %eax
+	orl	%esi, %eax
+	bsrl	%eax, %eax
+	leaq	-(VEC_SIZE * 4)(%rcx, %rax), %rax
 	ret
 
-END (__memrchr)
+L(ret_vec_end):
+	bsrl	%eax, %eax
+	leaq	(VEC_SIZE * -2)(%rax, %rcx), %rax
+	ret
+	/* Used in L(last_4x_vec). In the same cache line. These are just spare
+	   aligning bytes.  */
+L(zero_3):
+	xorl	%eax, %eax
+	ret
+	/* 2-bytes from next cache line.  */
+END(__memrchr)
 weak_alias (__memrchr, memrchr)
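
For reference, memrchr itself is a GNU extension declared in <string.h> when
_GNU_SOURCE is defined: it returns a pointer to the last occurrence of the
byte c within the first n bytes of s, or NULL if there is none. A minimal
usage example of the interface the routine above implements:

/* Build with e.g.: gcc -O2 example.c */
#define _GNU_SOURCE
#include <string.h>
#include <stdio.h>

int
main (void)
{
  const char buf[] = "abcabcabc";
  /* Search the 9 data bytes of buf; the last 'b' is at offset 7.  */
  const char *p = memrchr (buf, 'b', sizeof (buf) - 1);
  printf ("last 'b' at offset %td\n", p ? p - buf : -1);
  return 0;
}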