commit b05bd59823bcedee281d3fd5bd4928698ea9d69d
Author: Noah Goldstein <goldstein.w.n@gmail.com>
Date:   Mon Jun 6 21:11:32 2022 -0700

    x86: Optimize memrchr-avx2.S

    The new code:
        1. prioritizes smaller user-arg lengths more.
        2. optimizes target placement more carefully.
        3. reuses logic more.
        4. fixes up various inefficiencies in the logic. The biggest
           case here is the `lzcnt` logic for checking returns, which
           saves either a branch or multiple instructions (see the
           sketch after this list).
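
    A minimal C sketch of that lzcnt return trick (not part of the
    original commit; hypothetical helper name), assuming `mask` is the
    vpmovmskb result for the 32 bytes ending at the last byte of the
    buffer (bit 31 = last byte) and len <= 32:

        static inline char *
        last_vec_lzcnt (char *end_minus_1, unsigned int mask, unsigned int len)
        {
          /* lzcnt semantics: result is 32 when mask is zero (no match).  */
          unsigned int lz = mask ? (unsigned int) __builtin_clz (mask) : 32;
          /* One compare covers both "no match" (lz == 32 >= len) and
             "match lies before the start of the buffer" (lz >= len), so
             no separate bounds branch is needed.  */
          if (len <= lz)
            return NULL;
          return end_minus_1 - lz;
        }

    With bsr the match position instead has to be added back to a base
    pointer and bounds-checked separately, which is where the extra
    branch or instructions come from.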
    
    The total code size saving is: 306 bytes
    Geometric Mean of all benchmarks New / Old: 0.760

    Regressions:
    There are some regressions, particularly where the length (user arg
    length) is large but the position of the match char is near the
    beginning of the string (in the first VEC). This case has roughly a
    10-20% regression.

    This is because the new logic gives the hot path for immediate matches
    to shorter lengths (the more common input); that case sees roughly a
    15-45% speedup.

    Full xcheck passes on x86_64.
    Reviewed-by: H.J. Lu <hjl.tools@gmail.com>

    (cherry picked from commit af5306a735eb0966fdc2f8ccdafa8888e2df0c87)

diff --git a/sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S
index cea2d2a72db7406a..5e9beeeef2677c9f 100644
--- a/sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S
+++ b/sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S
@@ -2,6 +2,7 @@
 # define MEMRCHR __memrchr_avx2_rtm
 #endif
 
+#define COND_VZEROUPPER	COND_VZEROUPPER_XTEST
 #define ZERO_UPPER_VEC_REGISTERS_RETURN \
   ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
 
diff --git a/sysdeps/x86_64/multiarch/memrchr-avx2.S b/sysdeps/x86_64/multiarch/memrchr-avx2.S
index ac7370cb06e9a0fd..5f8e0be18cfe4fad 100644
--- a/sysdeps/x86_64/multiarch/memrchr-avx2.S
+++ b/sysdeps/x86_64/multiarch/memrchr-avx2.S
@@ -21,340 +21,318 @@
 # include <sysdep.h>
 
 # ifndef MEMRCHR
-#  define MEMRCHR	__memrchr_avx2
+#  define MEMRCHR				__memrchr_avx2
 # endif
 
 # ifndef VZEROUPPER
-#  define VZEROUPPER	vzeroupper
+#  define VZEROUPPER			vzeroupper
 # endif
 
 # ifndef SECTION
 #  define SECTION(p)	p##.avx
 # endif
 
-# define VEC_SIZE 32
+# define VEC_SIZE			32
+# define PAGE_SIZE			4096
+	.section SECTION(.text), "ax", @progbits
+ENTRY(MEMRCHR)
+# ifdef __ILP32__
+	/* Clear upper bits.  */
+	and	%RDX_LP, %RDX_LP
+# else
+	test	%RDX_LP, %RDX_LP
+# endif
+	jz	L(zero_0)
 
-	.section SECTION(.text),"ax",@progbits
-ENTRY (MEMRCHR)
-	/* Broadcast CHAR to YMM0.  */
 	vmovd	%esi, %xmm0
-	vpbroadcastb %xmm0, %ymm0
-
-	sub	$VEC_SIZE, %RDX_LP
-	jbe	L(last_vec_or_less)
-
-	add	%RDX_LP, %RDI_LP
-
-	/* Check the last VEC_SIZE bytes.  */
-	vpcmpeqb (%rdi), %ymm0, %ymm1
-	vpmovmskb %ymm1, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x0)
+	/* Get end pointer. Minus one for two reasons. 1) It is necessary for a
+	   correct page cross check and 2) it correctly sets up end ptr to be
+	   subtract by lzcnt aligned.  */
+	leaq	-1(%rdx, %rdi), %rax
 
-	subq	$(VEC_SIZE * 4), %rdi
-	movl	%edi, %ecx
-	andl	$(VEC_SIZE - 1), %ecx
-	jz	L(aligned_more)
+	vpbroadcastb %xmm0, %ymm0
 
-	/* Align data for aligned loads in the loop.  */
-	addq	$VEC_SIZE, %rdi
-	addq	$VEC_SIZE, %rdx
-	andq	$-VEC_SIZE, %rdi
-	subq	%rcx, %rdx
+	/* Check if we can load 1x VEC without cross a page.  */
+	testl	$(PAGE_SIZE - VEC_SIZE), %eax
+	jz	L(page_cross)
+
+	vpcmpeqb -(VEC_SIZE - 1)(%rax), %ymm0, %ymm1
+	vpmovmskb %ymm1, %ecx
+	cmpq	$VEC_SIZE, %rdx
+	ja	L(more_1x_vec)
+
+L(ret_vec_x0_test):
+	/* If ecx is zero (no matches) lzcnt will set it 32 (VEC_SIZE) which
+	   will gurantee edx (len) is less than it.  */
+	lzcntl	%ecx, %ecx
+
+	/* Hoist vzeroupper (not great for RTM) to save code size. This allows
+	   all logic for edx (len) <= VEC_SIZE to fit in first cache line.  */
+	COND_VZEROUPPER
+	cmpl	%ecx, %edx
+	jle	L(zero_0)
+	subq	%rcx, %rax
+	ret
 
-	.p2align 4
-L(aligned_more):
-	subq	$(VEC_SIZE * 4), %rdx
-	jbe	L(last_4x_vec_or_less)
-
-	/* Check the last 4 * VEC_SIZE.  Only one VEC_SIZE at a time
-	   since data is only aligned to VEC_SIZE.  */
-	vpcmpeqb (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
-	vpmovmskb %ymm1, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x3)
-
-	vpcmpeqb (VEC_SIZE * 2)(%rdi), %ymm0, %ymm2
-	vpmovmskb %ymm2, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x2)
-
-	vpcmpeqb VEC_SIZE(%rdi), %ymm0, %ymm3
-	vpmovmskb %ymm3, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x1)
-
-	vpcmpeqb (%rdi), %ymm0, %ymm4
-	vpmovmskb %ymm4, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x0)
-
-	/* Align data to 4 * VEC_SIZE for loop with fewer branches.
-	   There are some overlaps with above if data isn't aligned
-	   to 4 * VEC_SIZE.  */
-	movl	%edi, %ecx
-	andl	$(VEC_SIZE * 4 - 1), %ecx
-	jz	L(loop_4x_vec)
-
-	addq	$(VEC_SIZE * 4), %rdi
-	addq	$(VEC_SIZE * 4), %rdx
-	andq	$-(VEC_SIZE * 4), %rdi
-	subq	%rcx, %rdx
+	/* Fits in aligning bytes of first cache line.  */
+L(zero_0):
+	xorl	%eax, %eax
+	ret
 
-	.p2align 4
-L(loop_4x_vec):
-	/* Compare 4 * VEC at a time forward.  */
-	subq	$(VEC_SIZE * 4), %rdi
-	subq	$(VEC_SIZE * 4), %rdx
-	jbe	L(last_4x_vec_or_less)
-
-	vmovdqa	(%rdi), %ymm1
-	vmovdqa	VEC_SIZE(%rdi), %ymm2
-	vmovdqa	(VEC_SIZE * 2)(%rdi), %ymm3
-	vmovdqa	(VEC_SIZE * 3)(%rdi), %ymm4
-
-	vpcmpeqb %ymm1, %ymm0, %ymm1
-	vpcmpeqb %ymm2, %ymm0, %ymm2
-	vpcmpeqb %ymm3, %ymm0, %ymm3
-	vpcmpeqb %ymm4, %ymm0, %ymm4
-
-	vpor	%ymm1, %ymm2, %ymm5
-	vpor	%ymm3, %ymm4, %ymm6
-	vpor	%ymm5, %ymm6, %ymm5
-
-	vpmovmskb %ymm5, %eax
-	testl	%eax, %eax
-	jz	L(loop_4x_vec)
-
-	/* There is a match.  */
-	vpmovmskb %ymm4, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x3)
-
-	vpmovmskb %ymm3, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x2)
-
-	vpmovmskb %ymm2, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x1)
-
-	vpmovmskb %ymm1, %eax
-	bsrl	%eax, %eax
-	addq	%rdi, %rax
+	.p2align 4,, 9
+L(ret_vec_x0):
+	lzcntl	%ecx, %ecx
+	subq	%rcx, %rax
 L(return_vzeroupper):
 	ZERO_UPPER_VEC_REGISTERS_RETURN
 
-	.p2align 4
-L(last_4x_vec_or_less):
-	addl	$(VEC_SIZE * 4), %edx
-	cmpl	$(VEC_SIZE * 2), %edx
-	jbe	L(last_2x_vec)
-
-	vpcmpeqb (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
-	vpmovmskb %ymm1, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x3)
-
-	vpcmpeqb (VEC_SIZE * 2)(%rdi), %ymm0, %ymm2
-	vpmovmskb %ymm2, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x2)
-
-	vpcmpeqb VEC_SIZE(%rdi), %ymm0, %ymm3
-	vpmovmskb %ymm3, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x1_check)
-	cmpl	$(VEC_SIZE * 3), %edx
-	jbe	L(zero)
-
-	vpcmpeqb (%rdi), %ymm0, %ymm4
-	vpmovmskb %ymm4, %eax
-	testl	%eax, %eax
-	jz	L(zero)
-	bsrl	%eax, %eax
-	subq	$(VEC_SIZE * 4), %rdx
-	addq	%rax, %rdx
-	jl	L(zero)
-	addq	%rdi, %rax
-	VZEROUPPER_RETURN
-
-	.p2align 4
+	.p2align 4,, 10
+L(more_1x_vec):
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x0)
+
+	/* Align rax (string pointer).  */
+	andq	$-VEC_SIZE, %rax
+
+	/* Recompute remaining length after aligning.  */
+	movq	%rax, %rdx
+	/* Need this comparison next no matter what.  */
+	vpcmpeqb -(VEC_SIZE)(%rax), %ymm0, %ymm1
+	subq	%rdi, %rdx
+	decq	%rax
+	vpmovmskb %ymm1, %ecx
+	/* Fall through for short (hotter than length).  */
+	cmpq	$(VEC_SIZE * 2), %rdx
+	ja	L(more_2x_vec)
 L(last_2x_vec):
-	vpcmpeqb (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
-	vpmovmskb %ymm1, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x3_check)
 	cmpl	$VEC_SIZE, %edx
-	jbe	L(zero)
-
-	vpcmpeqb (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
-	vpmovmskb %ymm1, %eax
-	testl	%eax, %eax
-	jz	L(zero)
-	bsrl	%eax, %eax
-	subq	$(VEC_SIZE * 2), %rdx
-	addq	%rax, %rdx
-	jl	L(zero)
-	addl	$(VEC_SIZE * 2), %eax
-	addq	%rdi, %rax
-	VZEROUPPER_RETURN
-
-	.p2align 4
-L(last_vec_x0):
-	bsrl	%eax, %eax
-	addq	%rdi, %rax
-	VZEROUPPER_RETURN
+	jbe	L(ret_vec_x0_test)
+
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x0)
+
+	vpcmpeqb -(VEC_SIZE * 2 - 1)(%rax), %ymm0, %ymm1
+	vpmovmskb %ymm1, %ecx
+	/* 64-bit lzcnt. This will naturally add 32 to position.  */
+	lzcntq	%rcx, %rcx
+	COND_VZEROUPPER
+	cmpl	%ecx, %edx
+	jle	L(zero_0)
+	subq	%rcx, %rax
+	ret
 
-	.p2align 4
-L(last_vec_x1):
-	bsrl	%eax, %eax
-	addl	$VEC_SIZE, %eax
-	addq	%rdi, %rax
-	VZEROUPPER_RETURN
 
-	.p2align 4
-L(last_vec_x2):
-	bsrl	%eax, %eax
-	addl	$(VEC_SIZE * 2), %eax
-	addq	%rdi, %rax
+	/* Inexpensive place to put this regarding code size / target alignments
+	   / ICache NLP. Necessary for 2-byte encoding of jump to page cross
+	   case which in turn is necessary for hot path (len <= VEC_SIZE) to fit
+	   in first cache line.  */
+L(page_cross):
+	movq	%rax, %rsi
+	andq	$-VEC_SIZE, %rsi
+	vpcmpeqb (%rsi), %ymm0, %ymm1
+	vpmovmskb %ymm1, %ecx
+	/* Shift out negative alignment (because we are starting from endptr and
+	   working backwards).  */
+	movl	%eax, %r8d
+	/* notl because eax already has endptr - 1.  (-x = ~(x - 1)).  */
+	notl	%r8d
+	shlxl	%r8d, %ecx, %ecx
+	cmpq	%rdi, %rsi
+	ja	L(more_1x_vec)
+	lzcntl	%ecx, %ecx
+	COND_VZEROUPPER
+	cmpl	%ecx, %edx
+	jle	L(zero_0)
+	subq	%rcx, %rax
+	ret
+	.p2align 4,, 11
+L(ret_vec_x1):
+	/* This will naturally add 32 to position.  */
+	lzcntq	%rcx, %rcx
+	subq	%rcx, %rax
 	VZEROUPPER_RETURN
+	.p2align 4,, 10
+L(more_2x_vec):
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x0)
 
-	.p2align 4
-L(last_vec_x3):
-	bsrl	%eax, %eax
-	addl	$(VEC_SIZE * 3), %eax
-	addq	%rdi, %rax
-	ret
+	vpcmpeqb -(VEC_SIZE * 2 - 1)(%rax), %ymm0, %ymm1
+	vpmovmskb %ymm1, %ecx
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x1)
 
-	.p2align 4
-L(last_vec_x1_check):
-	bsrl	%eax, %eax
-	subq	$(VEC_SIZE * 3), %rdx
-	addq	%rax, %rdx
-	jl	L(zero)
-	addl	$VEC_SIZE, %eax
-	addq	%rdi, %rax
-	VZEROUPPER_RETURN
 
-	.p2align 4
-L(last_vec_x3_check):
-	bsrl	%eax, %eax
-	subq	$VEC_SIZE, %rdx
-	addq	%rax, %rdx
-	jl	L(zero)
-	addl	$(VEC_SIZE * 3), %eax
-	addq	%rdi, %rax
-	VZEROUPPER_RETURN
+	/* Needed no matter what.  */
+	vpcmpeqb -(VEC_SIZE * 3 - 1)(%rax), %ymm0, %ymm1
+	vpmovmskb %ymm1, %ecx
 
-	.p2align 4
-L(zero):
-	xorl	%eax, %eax
-	VZEROUPPER_RETURN
+	subq	$(VEC_SIZE * 4), %rdx
+	ja	L(more_4x_vec)
+
+	cmpl	$(VEC_SIZE * -1), %edx
+	jle	L(ret_vec_x2_test)
+
+L(last_vec):
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x2)
+
+	/* Needed no matter what.  */
+	vpcmpeqb -(VEC_SIZE * 4 - 1)(%rax), %ymm0, %ymm1
+	vpmovmskb %ymm1, %ecx
+	lzcntl	%ecx, %ecx
+	subq	$(VEC_SIZE * 3), %rax
+	COND_VZEROUPPER
+	subq	%rcx, %rax
+	cmpq	%rax, %rdi
+	ja	L(zero_2)
+	ret
 
-	.p2align 4
-L(null):
+	/* First in aligning bytes.  */
+L(zero_2):
 	xorl	%eax, %eax
 	ret
 
-	.p2align 4
-L(last_vec_or_less_aligned):
-	movl	%edx, %ecx
+	.p2align 4,, 4
+L(ret_vec_x2_test):
+	lzcntl	%ecx, %ecx
+	subq	$(VEC_SIZE * 2), %rax
+	COND_VZEROUPPER
+	subq	%rcx, %rax
+	cmpq	%rax, %rdi
+	ja	L(zero_2)
+	ret
 
-	vpcmpeqb (%rdi), %ymm0, %ymm1
 
-	movl	$1, %edx
-	/* Support rdx << 32.  */
-	salq	%cl, %rdx
-	subq	$1, %rdx
+	.p2align 4,, 11
+L(ret_vec_x2):
+	/* ecx must be non-zero.  */
+	bsrl	%ecx, %ecx
+	leaq	(VEC_SIZE * -3 + 1)(%rcx, %rax), %rax
+	VZEROUPPER_RETURN
 
-	vpmovmskb %ymm1, %eax
+	.p2align 4,, 14
+L(ret_vec_x3):
+	/* ecx must be non-zero.  */
+	bsrl	%ecx, %ecx
+	leaq	(VEC_SIZE * -4 + 1)(%rcx, %rax), %rax
+	VZEROUPPER_RETURN
 
-	/* Remove the trailing bytes.  */
-	andl	%edx, %eax
-	testl	%eax, %eax
-	jz	L(zero)
 
-	bsrl	%eax, %eax
-	addq	%rdi, %rax
-	VZEROUPPER_RETURN
 
 	.p2align 4
-L(last_vec_or_less):
-	addl	$VEC_SIZE, %edx
+L(more_4x_vec):
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x2)
 
-	/* Check for zero length.  */
-	testl	%edx, %edx
-	jz	L(null)
+	vpcmpeqb -(VEC_SIZE * 4 - 1)(%rax), %ymm0, %ymm1
+	vpmovmskb %ymm1, %ecx
 
-	movl	%edi, %ecx
-	andl	$(VEC_SIZE - 1), %ecx
-	jz	L(last_vec_or_less_aligned)
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x3)
 
-	movl	%ecx, %esi
-	movl	%ecx, %r8d
-	addl	%edx, %esi
-	andq	$-VEC_SIZE, %rdi
+	/* Check if near end before re-aligning (otherwise might do an
+	   unnecissary loop iteration).  */
+	addq	$-(VEC_SIZE * 4), %rax
+	cmpq	$(VEC_SIZE * 4), %rdx
+	jbe	L(last_4x_vec)
 
-	subl	$VEC_SIZE, %esi
-	ja	L(last_vec_2x_aligned)
+	/* Align rax to (VEC_SIZE - 1).  */
+	orq	$(VEC_SIZE * 4 - 1), %rax
+	movq	%rdi, %rdx
+	/* Get endptr for loop in rdx. NB: Can't just do while rax > rdi because
+	   lengths that overflow can be valid and break the comparison.  */
+	orq	$(VEC_SIZE * 4 - 1), %rdx
 
-	/* Check the last VEC.  */
-	vpcmpeqb (%rdi), %ymm0, %ymm1
-	vpmovmskb %ymm1, %eax
-
-	/* Remove the leading and trailing bytes.  */
-	sarl	%cl, %eax
-	movl	%edx, %ecx
+	.p2align 4
+L(loop_4x_vec):
+	/* Need this comparison next no matter what.  */
+	vpcmpeqb -(VEC_SIZE * 1 - 1)(%rax), %ymm0, %ymm1
+	vpcmpeqb -(VEC_SIZE * 2 - 1)(%rax), %ymm0, %ymm2
+	vpcmpeqb -(VEC_SIZE * 3 - 1)(%rax), %ymm0, %ymm3
+	vpcmpeqb -(VEC_SIZE * 4 - 1)(%rax), %ymm0, %ymm4
 
-	movl	$1, %edx
-	sall	%cl, %edx
-	subl	$1, %edx
+	vpor	%ymm1, %ymm2, %ymm2
+	vpor	%ymm3, %ymm4, %ymm4
+	vpor	%ymm2, %ymm4, %ymm4
+	vpmovmskb %ymm4, %esi
 
-	andl	%edx, %eax
-	testl	%eax, %eax
-	jz	L(zero)
+	testl	%esi, %esi
+	jnz	L(loop_end)
 
-	bsrl	%eax, %eax
-	addq	%rdi, %rax
-	addq	%r8, %rax
-	VZEROUPPER_RETURN
+	addq	$(VEC_SIZE * -4), %rax
+	cmpq	%rdx, %rax
+	jne	L(loop_4x_vec)
 
-	.p2align 4
-L(last_vec_2x_aligned):
-	movl	%esi, %ecx
+	subl	%edi, %edx
+	incl	%edx
 
-	/* Check the last VEC.  */
-	vpcmpeqb VEC_SIZE(%rdi), %ymm0, %ymm1
+L(last_4x_vec):
+	/* Used no matter what.  */
+	vpcmpeqb -(VEC_SIZE * 1 - 1)(%rax), %ymm0, %ymm1
+	vpmovmskb %ymm1, %ecx
 
-	movl	$1, %edx
-	sall	%cl, %edx
-	subl	$1, %edx
+	cmpl	$(VEC_SIZE * 2), %edx
+	jbe	L(last_2x_vec)
 
-	vpmovmskb %ymm1, %eax
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x0_end)
 
-	/* Remove the trailing bytes.  */
-	andl	%edx, %eax
+	vpcmpeqb -(VEC_SIZE * 2 - 1)(%rax), %ymm0, %ymm1
+	vpmovmskb %ymm1, %ecx
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x1_end)
 
-	testl	%eax, %eax
-	jnz	L(last_vec_x1)
+	/* Used no matter what.  */
+	vpcmpeqb -(VEC_SIZE * 3 - 1)(%rax), %ymm0, %ymm1
+	vpmovmskb %ymm1, %ecx
 
-	/* Check the second last VEC.  */
-	vpcmpeqb (%rdi), %ymm0, %ymm1
+	cmpl	$(VEC_SIZE * 3), %edx
+	ja	L(last_vec)
+
+	lzcntl	%ecx, %ecx
+	subq	$(VEC_SIZE * 2), %rax
+	COND_VZEROUPPER
+	subq	%rcx, %rax
+	cmpq	%rax, %rdi
+	jbe	L(ret0)
+	xorl	%eax, %eax
+L(ret0):
+	ret
 
-	movl	%r8d, %ecx
 
-	vpmovmskb %ymm1, %eax
+	.p2align 4
+L(loop_end):
+	vpmovmskb %ymm1, %ecx
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x0_end)
+
+	vpmovmskb %ymm2, %ecx
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x1_end)
+
+	vpmovmskb %ymm3, %ecx
+	/* Combine last 2 VEC matches. If ecx (VEC3) is zero (no CHAR in VEC3)
+	   then it won't affect the result in esi (VEC4). If ecx is non-zero
+	   then CHAR in VEC3 and bsrq will use that position.  */
+	salq	$32, %rcx
+	orq	%rsi, %rcx
+	bsrq	%rcx, %rcx
+	leaq	(VEC_SIZE * -4 + 1)(%rcx, %rax), %rax
+	VZEROUPPER_RETURN
 
-	/* Remove the leading bytes.  Must use unsigned right shift for
-	   bsrl below.  */
-	shrl	%cl, %eax
-	testl	%eax, %eax
-	jz	L(zero)
+	.p2align 4,, 4
+L(ret_vec_x1_end):
+	/* 64-bit version will automatically add 32 (VEC_SIZE).  */
+	lzcntq	%rcx, %rcx
+	subq	%rcx, %rax
+	VZEROUPPER_RETURN
 
-	bsrl	%eax, %eax
-	addq	%rdi, %rax
-	addq	%r8, %rax
+	.p2align 4,, 4
+L(ret_vec_x0_end):
+	lzcntl	%ecx, %ecx
+	subq	%rcx, %rax
 	VZEROUPPER_RETURN
-END (MEMRCHR)
+
+	/* 2 bytes until next cache line.  */
+END(MEMRCHR)
 #endif
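
A minimal C sketch (not part of the patch; hypothetical helper name) of the
mask-combining trick described in the L(loop_end) comment above, where the
VEC3 mask is shifted into the high half so a single bsrq picks the right byte:

    #include <stdint.h>

    /* mask_hi: match bits for the higher-addressed 32 bytes (VEC3).
       mask_lo: match bits for the lower-addressed 32 bytes (VEC4).  The
       real code reuses the OR of all four vector masks for the low half,
       which is equivalent once VEC1/VEC2 are known empty, because any
       VEC3 bit also lands in the high half and wins the bsr.  */
    static inline int
    last_match_bit (uint32_t mask_hi, uint32_t mask_lo)
    {
      uint64_t both = ((uint64_t) mask_hi << 32) | mask_lo;
      /* Caller guarantees both != 0 (the loop only exits on a match).
         A result >= 32 selects a byte in the higher 32 bytes (bit - 32),
         otherwise one in the lower 32 bytes; the asm folds this into one
         leaq relative to the end of the 4-VEC block.  */
      return 63 - __builtin_clzll (both);
    }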