commit 4901009dad8b3ab141ac6e0caebe99e03a67f5eb
Author: Noah Goldstein <goldstein.w.n@gmail.com>
Date:   Mon Jun 6 21:11:30 2022 -0700

    x86: Optimize memrchr-sse2.S

    The new code:
        1. prioritizes smaller lengths more.
        2. optimizes target placement more carefully.
        3. reuses logic more.
        4. fixes up various inefficiencies in the logic.

    The total code size saving is 394 bytes.
    Geometric Mean of all benchmarks New / Old: 0.874

    Regressions:
        1. The page cross case is now colder, especially re-entry from the
           page cross case if a match is not found in the first VEC
           (roughly 50%). My general opinion is that this is acceptable
           given the "coldness" of this case (less than 4%) and the general
           performance improvement in the other, far more common cases.

        2. There are regressions of 5-15% for medium/large user-arg lengths
           that have a match in the first VEC. This is because the logic
           was rewritten to optimize finds in the first VEC when the
           user-arg length is shorter (where we see roughly 20-50%
           performance improvements). It is not always the case that this
           is a regression; my intuition is that some frontend quirk
           partially explains the data, although I haven't been able to
           find the root cause.

    Full xcheck passes on x86_64.
    Reviewed-by: H.J. Lu <hjl.tools@gmail.com>

    (cherry picked from commit 731feee3869550e93177e604604c1765d81de571)
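For orientation, the scan strategy described above boils down to "check the end of the buffer first, then walk backwards". A rough C sketch is shown below; it is only an illustration of that ordering (the routine itself works on 16-byte vectors with pcmpeqb/pmovmskb/bsr and handles alignment and page crossing separately), and the function name is made up for this note.

    #include <stddef.h>

    /* Illustrative only: the assembly checks the final VEC_SIZE (16) bytes
       first -- the hot path for short lengths -- then steps backwards one
       vector at a time, unrolling 4x for long remaining lengths.  Byte-wise
       the result is equivalent to this backwards scan.  */
    static void *
    memrchr_sketch (const void *s, int c, size_t n)
    {
      const unsigned char *base = s;
      const unsigned char *p = base + n;   /* end pointer, kept in %rcx */
      while (p > base)
        if (*--p == (unsigned char) c)
          return (void *) p;
      return NULL;
    }
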
diff --git a/sysdeps/x86_64/memrchr.S b/sysdeps/x86_64/memrchr.S
index cc2001167d77c83c..c2a5902bf9385c67 100644
--- a/sysdeps/x86_64/memrchr.S
+++ b/sysdeps/x86_64/memrchr.S
@@ -19,362 +19,333 @@
    <https://www.gnu.org/licenses/>.  */
 
 #include <sysdep.h>
+#define VEC_SIZE			16
+#define PAGE_SIZE			4096
 
 	.text
-ENTRY (__memrchr)
-	movd	%esi, %xmm1
-
-	sub	$16, %RDX_LP
-	jbe	L(length_less16)
-
-	punpcklbw	%xmm1, %xmm1
-	punpcklbw	%xmm1, %xmm1
-
-	add	%RDX_LP, %RDI_LP
-	pshufd	$0, %xmm1, %xmm1
-
-	movdqu	(%rdi), %xmm0
-	pcmpeqb	%xmm1, %xmm0
-
-/* Check if there is a match.  */
-	pmovmskb	%xmm0, %eax
-	test	%eax, %eax
-	jnz	L(matches0)
-
-	sub	$64, %rdi
-	mov	%edi, %ecx
-	and	$15, %ecx
-	jz	L(loop_prolog)
-
-	add	$16, %rdi
-	add	$16, %rdx
-	and	$-16, %rdi
-	sub	%rcx, %rdx
-
-	.p2align 4
-L(loop_prolog):
-	sub	$64, %rdx
-	jbe	L(exit_loop)
-
-	movdqa	48(%rdi), %xmm0
-	pcmpeqb	%xmm1, %xmm0
-	pmovmskb	%xmm0, %eax
-	test	%eax, %eax
-	jnz	L(matches48)
-
-	movdqa	32(%rdi), %xmm2
-	pcmpeqb	%xmm1, %xmm2
-	pmovmskb	%xmm2, %eax
-	test	%eax, %eax
-	jnz	L(matches32)
-
-	movdqa	16(%rdi), %xmm3
-	pcmpeqb	%xmm1, %xmm3
-	pmovmskb	%xmm3, %eax
-	test	%eax, %eax
-	jnz	L(matches16)
-
-	movdqa	(%rdi), %xmm4
-	pcmpeqb	%xmm1, %xmm4
-	pmovmskb	%xmm4, %eax
-	test	%eax, %eax
-	jnz	L(matches0)
-
-	sub	$64, %rdi
-	sub	$64, %rdx
-	jbe	L(exit_loop)
-
-	movdqa	48(%rdi), %xmm0
-	pcmpeqb	%xmm1, %xmm0
-	pmovmskb	%xmm0, %eax
-	test	%eax, %eax
-	jnz	L(matches48)
-
-	movdqa	32(%rdi), %xmm2
-	pcmpeqb	%xmm1, %xmm2
-	pmovmskb	%xmm2, %eax
-	test	%eax, %eax
-	jnz	L(matches32)
-
-	movdqa	16(%rdi), %xmm3
-	pcmpeqb	%xmm1, %xmm3
-	pmovmskb	%xmm3, %eax
-	test	%eax, %eax
-	jnz	L(matches16)
-
-	movdqa	(%rdi), %xmm3
-	pcmpeqb	%xmm1, %xmm3
-	pmovmskb	%xmm3, %eax
-	test	%eax, %eax
-	jnz	L(matches0)
-
-	mov	%edi, %ecx
-	and	$63, %ecx
-	jz	L(align64_loop)
-
-	add	$64, %rdi
-	add	$64, %rdx
-	and	$-64, %rdi
-	sub	%rcx, %rdx
-
-	.p2align 4
-L(align64_loop):
-	sub	$64, %rdi
-	sub	$64, %rdx
-	jbe	L(exit_loop)
-
-	movdqa	(%rdi), %xmm0
-	movdqa	16(%rdi), %xmm2
-	movdqa	32(%rdi), %xmm3
-	movdqa	48(%rdi), %xmm4
-
-	pcmpeqb	%xmm1, %xmm0
-	pcmpeqb	%xmm1, %xmm2
-	pcmpeqb	%xmm1, %xmm3
-	pcmpeqb	%xmm1, %xmm4
-
-	pmaxub	%xmm3, %xmm0
-	pmaxub	%xmm4, %xmm2
-	pmaxub	%xmm0, %xmm2
-	pmovmskb	%xmm2, %eax
-
-	test	%eax, %eax
-	jz	L(align64_loop)
-
-	pmovmskb	%xmm4, %eax
-	test	%eax, %eax
-	jnz	L(matches48)
-
-	pmovmskb	%xmm3, %eax
-	test	%eax, %eax
-	jnz	L(matches32)
-
-	movdqa	16(%rdi), %xmm2
-
-	pcmpeqb	%xmm1, %xmm2
-	pcmpeqb	(%rdi), %xmm1
-
-	pmovmskb	%xmm2, %eax
-	test	%eax, %eax
-	jnz	L(matches16)
-
-	pmovmskb	%xmm1, %eax
-	bsr	%eax, %eax
-
-	add	%rdi, %rax
+ENTRY_P2ALIGN(__memrchr, 6)
+#ifdef __ILP32__
+	/* Clear upper bits.  */
+	mov	%RDX_LP, %RDX_LP
+#endif
+	movd	%esi, %xmm0
+
+	/* Get end pointer.  */
+	leaq	(%rdx, %rdi), %rcx
+
+	punpcklbw %xmm0, %xmm0
+	punpcklwd %xmm0, %xmm0
+	pshufd	$0, %xmm0, %xmm0
+
+	/* Check if we can load 1x VEC without crossing a page.  */
+	testl	$(PAGE_SIZE - VEC_SIZE), %ecx
+	jz	L(page_cross)
+
+	/* NB: This load happens regardless of whether rdx (len) is zero. Since
+	   it doesn't cross a page and the standard guarantees any pointer has
+	   at least one valid byte, this load must be safe. For the entire
+	   history of the x86 memrchr implementation this has been possible, so
+	   no code "should" be relying on a zero-length check before this load.
+	   The zero-length check is moved to the page cross case because it is
+	   pretty cold and including it would push the hot case (len <= VEC_SIZE)
+	   across 2 cache lines.  */
+	movups	-(VEC_SIZE)(%rcx), %xmm1
+	pcmpeqb	%xmm0, %xmm1
+	pmovmskb %xmm1, %eax
+
+	subq	$VEC_SIZE, %rdx
+	ja	L(more_1x_vec)
+L(ret_vec_x0_test):
+	/* Zero-flag set if eax (src) is zero. Destination unchanged if src is
+	   zero.  */
+	bsrl	%eax, %eax
+	jz	L(ret_0)
+	/* Check if the CHAR match is in bounds. Need to truly zero `eax` here
+	   if out of bounds.  */
+	addl	%edx, %eax
+	jl	L(zero_0)
+	/* Since we subtracted VEC_SIZE from rdx earlier we can just add to base
+	   ptr.  */
+	addq	%rdi, %rax
+L(ret_0):
 	ret
 
-	.p2align 4
-L(exit_loop):
-	add	$64, %edx
-	cmp	$32, %edx
-	jbe	L(exit_loop_32)
-
-	movdqa	48(%rdi), %xmm0
-	pcmpeqb	%xmm1, %xmm0
-	pmovmskb	%xmm0, %eax
-	test	%eax, %eax
-	jnz	L(matches48)
-
-	movdqa	32(%rdi), %xmm2
-	pcmpeqb	%xmm1, %xmm2
-	pmovmskb	%xmm2, %eax
-	test	%eax, %eax
-	jnz	L(matches32)
-
-	movdqa	16(%rdi), %xmm3
-	pcmpeqb	%xmm1, %xmm3
-	pmovmskb	%xmm3, %eax
-	test	%eax, %eax
-	jnz	L(matches16_1)
-	cmp	$48, %edx
-	jbe	L(return_null)
-
-	pcmpeqb	(%rdi), %xmm1
-	pmovmskb	%xmm1, %eax
-	test	%eax, %eax
-	jnz	L(matches0_1)
-	xor	%eax, %eax
+	.p2align 4,, 5
+L(ret_vec_x0):
+	bsrl	%eax, %eax
+	leaq	-(VEC_SIZE)(%rcx, %rax), %rax
 	ret
 
-	.p2align 4
-L(exit_loop_32):
-	movdqa	48(%rdi), %xmm0
-	pcmpeqb	%xmm1, %xmm0
-	pmovmskb	%xmm0, %eax
-	test	%eax, %eax
-	jnz	L(matches48_1)
-	cmp	$16, %edx
-	jbe	L(return_null)
-
-	pcmpeqb	32(%rdi), %xmm1
-	pmovmskb	%xmm1, %eax
-	test	%eax, %eax
-	jnz	L(matches32_1)
-	xor	%eax, %eax
+	.p2align 4,, 2
+L(zero_0):
+	xorl	%eax, %eax
 	ret
 
-	.p2align 4
-L(matches0):
-	bsr	%eax, %eax
-	add	%rdi, %rax
-	ret
-
-	.p2align 4
-L(matches16):
-	bsr	%eax, %eax
-	lea	16(%rax, %rdi), %rax
-	ret
 
-	.p2align 4
-L(matches32):
-	bsr	%eax, %eax
-	lea	32(%rax, %rdi), %rax
+	.p2align 4,, 8
+L(more_1x_vec):
+	testl	%eax, %eax
+	jnz	L(ret_vec_x0)
+
+	/* Align rcx (pointer to string).  */
+	decq	%rcx
+	andq	$-VEC_SIZE, %rcx
+
+	movq	%rcx, %rdx
+	/* NB: We could consistently save 1 byte in this pattern with `movaps
+	   %xmm0, %xmm1; pcmpeq IMM8(r), %xmm1; ...`. The reason against it is
+	   that it adds more frontend uops (even if the moves can be eliminated)
+	   and, some percentage of the time, actual backend uops.  */
+	movaps	-(VEC_SIZE)(%rcx), %xmm1
+	pcmpeqb	%xmm0, %xmm1
+	subq	%rdi, %rdx
+	pmovmskb %xmm1, %eax
+
+	cmpq	$(VEC_SIZE * 2), %rdx
+	ja	L(more_2x_vec)
+L(last_2x_vec):
+	subl	$VEC_SIZE, %edx
+	jbe	L(ret_vec_x0_test)
+
+	testl	%eax, %eax
+	jnz	L(ret_vec_x0)
+
+	movaps	-(VEC_SIZE * 2)(%rcx), %xmm1
+	pcmpeqb	%xmm0, %xmm1
+	pmovmskb %xmm1, %eax
+
+	subl	$VEC_SIZE, %edx
+	bsrl	%eax, %eax
+	jz	L(ret_1)
+	addl	%edx, %eax
+	jl	L(zero_0)
+	addq	%rdi, %rax
+L(ret_1):
 	ret
 
-	.p2align 4
-L(matches48):
-	bsr	%eax, %eax
-	lea	48(%rax, %rdi), %rax
+	/* Don't align. Otherwise losing the 2-byte encoding of the jump to
+	   L(page_cross) causes the hot path (length <= VEC_SIZE) to span
+	   multiple cache lines.  Naturally aligned % 16 to 8 bytes.  */
+L(page_cross):
+	/* Zero length check.  */
+	testq	%rdx, %rdx
+	jz	L(zero_0)
+
+	leaq	-1(%rcx), %r8
+	andq	$-(VEC_SIZE), %r8
+
+	movaps	(%r8), %xmm1
+	pcmpeqb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	/* Shift out negative alignment (because we are starting from endptr and
+	   working backwards).  */
+	negl	%ecx
+	/* 32-bit shift but VEC_SIZE=16 so need to mask the shift count
+	   explicitly.  */
+	andl	$(VEC_SIZE - 1), %ecx
+	shl	%cl, %esi
+	movzwl	%si, %eax
+	leaq	(%rdi, %rdx), %rcx
+	cmpq	%rdi, %r8
+	ja	L(more_1x_vec)
+	subl	$VEC_SIZE, %edx
+	bsrl	%eax, %eax
+	jz	L(ret_2)
+	addl	%edx, %eax
+	jl	L(zero_1)
+	addq	%rdi, %rax
+L(ret_2):
 	ret
 
-	.p2align 4
-L(matches0_1):
-	bsr	%eax, %eax
-	sub	$64, %rdx
-	add	%rax, %rdx
-	jl	L(return_null)
-	add	%rdi, %rax
+	/* Fits in the aligning bytes.  */
+L(zero_1):
+	xorl	%eax, %eax
 	ret
 
-	.p2align 4
-L(matches16_1):
-	bsr	%eax, %eax
-	sub	$48, %rdx
-	add	%rax, %rdx
-	jl	L(return_null)
-	lea	16(%rdi, %rax), %rax
+	.p2align 4,, 5
+L(ret_vec_x1):
+	bsrl	%eax, %eax
+	leaq	-(VEC_SIZE * 2)(%rcx, %rax), %rax
 	ret
 
-	.p2align 4
-L(matches32_1):
-	bsr	%eax, %eax
-	sub	$32, %rdx
-	add	%rax, %rdx
-	jl	L(return_null)
-	lea	32(%rdi, %rax), %rax
-	ret
+	.p2align 4,, 8
+L(more_2x_vec):
+	testl	%eax, %eax
+	jnz	L(ret_vec_x0)
 
-	.p2align 4
-L(matches48_1):
-	bsr	%eax, %eax
-	sub	$16, %rdx
-	add	%rax, %rdx
-	jl	L(return_null)
-	lea	48(%rdi, %rax), %rax
-	ret
+	movaps	-(VEC_SIZE * 2)(%rcx), %xmm1
+	pcmpeqb	%xmm0, %xmm1
+	pmovmskb %xmm1, %eax
+	testl	%eax, %eax
+	jnz	L(ret_vec_x1)
 
-	.p2align 4
-L(return_null):
-	xor	%eax, %eax
-	ret
 
-	.p2align 4
-L(length_less16_offset0):
-	test	%edx, %edx
-	jz	L(return_null)
+	movaps	-(VEC_SIZE * 3)(%rcx), %xmm1
+	pcmpeqb	%xmm0, %xmm1
+	pmovmskb %xmm1, %eax
 
-	mov	%dl, %cl
-	pcmpeqb	(%rdi), %xmm1
+	subq	$(VEC_SIZE * 4), %rdx
+	ja	L(more_4x_vec)
 
-	mov	$1, %edx
-	sal	%cl, %edx
-	sub	$1, %edx
+	addl	$(VEC_SIZE), %edx
+	jle	L(ret_vec_x2_test)
 
-	pmovmskb	%xmm1, %eax
+L(last_vec):
+	testl	%eax, %eax
+	jnz	L(ret_vec_x2)
 
-	and	%edx, %eax
-	test	%eax, %eax
-	jz	L(return_null)
+	movaps	-(VEC_SIZE * 4)(%rcx), %xmm1
+	pcmpeqb	%xmm0, %xmm1
+	pmovmskb %xmm1, %eax
 
-	bsr	%eax, %eax
-	add	%rdi, %rax
+	subl	$(VEC_SIZE), %edx
+	bsrl	%eax, %eax
+	jz	L(ret_3)
+	addl	%edx, %eax
+	jl	L(zero_2)
+	addq	%rdi, %rax
+L(ret_3):
 	ret
 
-	.p2align 4
-L(length_less16):
-	punpcklbw	%xmm1, %xmm1
-	punpcklbw	%xmm1, %xmm1
-
-	add	$16, %edx
-
-	pshufd	$0, %xmm1, %xmm1
-
-	mov	%edi, %ecx
-	and	$15, %ecx
-	jz	L(length_less16_offset0)
-
-	mov	%cl, %dh
-	mov	%ecx, %esi
-	add	%dl, %dh
-	and	$-16, %rdi
-
-	sub	$16, %dh
-	ja	L(length_less16_part2)
-
-	pcmpeqb	(%rdi), %xmm1
-	pmovmskb	%xmm1, %eax
-
-	sar	%cl, %eax
-	mov	%dl, %cl
-
-	mov	$1, %edx
-	sal	%cl, %edx
-	sub	$1, %edx
-
-	and	%edx, %eax
-	test	%eax, %eax
-	jz	L(return_null)
-
-	bsr	%eax, %eax
-	add	%rdi, %rax
-	add	%rsi, %rax
+	.p2align 4,, 6
+L(ret_vec_x2_test):
+	bsrl	%eax, %eax
+	jz	L(zero_2)
+	addl	%edx, %eax
+	jl	L(zero_2)
+	addq	%rdi, %rax
 	ret
 
-	.p2align 4
-L(length_less16_part2):
-	movdqa	16(%rdi), %xmm2
-	pcmpeqb	%xmm1, %xmm2
-	pmovmskb	%xmm2, %eax
-
-	mov	%dh, %cl
-	mov	$1, %edx
-	sal	%cl, %edx
-	sub	$1, %edx
-
-	and	%edx, %eax
+L(zero_2):
+	xorl	%eax, %eax
+	ret
 
-	test	%eax, %eax
-	jnz	L(length_less16_part2_return)
 
-	pcmpeqb	(%rdi), %xmm1
-	pmovmskb	%xmm1, %eax
+	.p2align 4,, 5
+L(ret_vec_x2):
+	bsrl	%eax, %eax
+	leaq	-(VEC_SIZE * 3)(%rcx, %rax), %rax
+	ret
 
-	mov	%esi, %ecx
-	sar	%cl, %eax
-	test	%eax, %eax
-	jz	L(return_null)
+	.p2align 4,, 5
+L(ret_vec_x3):
+	bsrl	%eax, %eax
+	leaq	-(VEC_SIZE * 4)(%rcx, %rax), %rax
+	ret
 
-	bsr	%eax, %eax
-	add	%rdi, %rax
-	add	%rsi, %rax
+	.p2align 4,, 8
+L(more_4x_vec):
+	testl	%eax, %eax
+	jnz	L(ret_vec_x2)
+
+	movaps	-(VEC_SIZE * 4)(%rcx), %xmm1
+	pcmpeqb	%xmm0, %xmm1
+	pmovmskb %xmm1, %eax
+
+	testl	%eax, %eax
+	jnz	L(ret_vec_x3)
+
+	addq	$-(VEC_SIZE * 4), %rcx
+	cmpq	$(VEC_SIZE * 4), %rdx
+	jbe	L(last_4x_vec)
+
+	/* Offset everything by 4x VEC_SIZE here to save a few bytes at the end
+	   keeping the code from spilling to the next cache line.  */
+	addq	$(VEC_SIZE * 4 - 1), %rcx
+	andq	$-(VEC_SIZE * 4), %rcx
+	leaq	(VEC_SIZE * 4)(%rdi), %rdx
+	andq	$-(VEC_SIZE * 4), %rdx
+
+	.p2align 4,, 11
+L(loop_4x_vec):
+	movaps	(VEC_SIZE * -1)(%rcx), %xmm1
+	movaps	(VEC_SIZE * -2)(%rcx), %xmm2
+	movaps	(VEC_SIZE * -3)(%rcx), %xmm3
+	movaps	(VEC_SIZE * -4)(%rcx), %xmm4
+	pcmpeqb	%xmm0, %xmm1
+	pcmpeqb	%xmm0, %xmm2
+	pcmpeqb	%xmm0, %xmm3
+	pcmpeqb	%xmm0, %xmm4
+
+	por	%xmm1, %xmm2
+	por	%xmm3, %xmm4
+	por	%xmm2, %xmm4
+
+	pmovmskb %xmm4, %esi
+	testl	%esi, %esi
+	jnz	L(loop_end)
+
+	addq	$-(VEC_SIZE * 4), %rcx
+	cmpq	%rdx, %rcx
+	jne	L(loop_4x_vec)
+
+	subl	%edi, %edx
+
+	/* Ends up being 1-byte nop.  */
+	.p2align 4,, 2
+L(last_4x_vec):
+	movaps	-(VEC_SIZE)(%rcx), %xmm1
+	pcmpeqb	%xmm0, %xmm1
+	pmovmskb %xmm1, %eax
+
+	cmpl	$(VEC_SIZE * 2), %edx
+	jbe	L(last_2x_vec)
+
+	testl	%eax, %eax
+	jnz	L(ret_vec_x0)
+
+
+	movaps	-(VEC_SIZE * 2)(%rcx), %xmm1
+	pcmpeqb	%xmm0, %xmm1
+	pmovmskb %xmm1, %eax
+
+	testl	%eax, %eax
+	jnz	L(ret_vec_end)
+
+	movaps	-(VEC_SIZE * 3)(%rcx), %xmm1
+	pcmpeqb	%xmm0, %xmm1
+	pmovmskb %xmm1, %eax
+
+	subl	$(VEC_SIZE * 3), %edx
+	ja	L(last_vec)
+	bsrl	%eax, %eax
+	jz	L(ret_4)
+	addl	%edx, %eax
+	jl	L(zero_3)
+	addq	%rdi, %rax
+L(ret_4):
 	ret
 
-	.p2align 4
-L(length_less16_part2_return):
-	bsr	%eax, %eax
-	lea	16(%rax, %rdi), %rax
+	/* Ends up being 1-byte nop.  */
+	.p2align 4,, 3
+L(loop_end):
+	pmovmskb %xmm1, %eax
+	sall	$16, %eax
+	jnz	L(ret_vec_end)
+
+	pmovmskb %xmm2, %eax
+	testl	%eax, %eax
+	jnz	L(ret_vec_end)
+
+	pmovmskb %xmm3, %eax
+	/* Combine last 2 VEC matches. If eax (VEC3) is zero (no CHAR in VEC3)
+	   then it won't affect the result in esi (VEC4). If eax is non-zero
+	   then there is a CHAR in VEC3 and bsrl will use that position.  */
+	sall	$16, %eax
+	orl	%esi, %eax
+	bsrl	%eax, %eax
+	leaq	-(VEC_SIZE * 4)(%rcx, %rax), %rax
 	ret
 
-END (__memrchr)
+L(ret_vec_end):
+	bsrl	%eax, %eax
+	leaq	(VEC_SIZE * -2)(%rax, %rcx), %rax
+	ret
+	/* Used in L(last_4x_vec). In the same cache line. This is just spare
+	   aligning bytes.  */
+L(zero_3):
+	xorl	%eax, %eax
+	ret
+	/* 2 bytes from the next cache line.  */
+END(__memrchr)
 weak_alias (__memrchr, memrchr)
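
The masking done at the L(page_cross) entry above (negl %ecx; andl $(VEC_SIZE - 1), %ecx; shl %cl, %esi; movzwl %si, %eax) can be read as the C sketch below. It is a reading aid only; `mask16`, `end` and the function name are invented here to stand for the values the assembly keeps in %esi and %rcx.

    #include <stdint.h>

    /* mask16: pmovmskb result for the 16 aligned bytes at
       base = (end - 1) & ~(uintptr_t) 15, bit i corresponding to base[i].
       end:    the exclusive end pointer of the buffer.  */
    static unsigned int
    page_cross_mask (unsigned int mask16, uintptr_t end)
    {
      unsigned int shift = (unsigned int) -end & 15;   /* negl; andl */
      /* Shifting left by `shift` moves the bits for base[16 - shift .. 15]
         (the bytes at or past `end`) above bit 15; truncating to 16 bits
         (movzwl) drops them.  Bit 15 of the result then corresponds to
         end[-1], the last in-bounds byte.  */
      return (unsigned short) (mask16 << shift);       /* shl %cl; movzwl */
    }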