commit b05bd59823bcedee281d3fd5bd4928698ea9d69d
Author: Noah Goldstein <goldstein.w.n@gmail.com>
Date:   Mon Jun 6 21:11:32 2022 -0700

    x86: Optimize memrchr-avx2.S

    The new code:
        1. prioritizes smaller user-arg lengths more.
        2. optimizes target placement more carefully.
        3. reuses logic more.
        4. fixes up various inefficiencies in the logic. The biggest
           win here is the `lzcnt` logic for checking returns, which
           saves either a branch or multiple instructions (see the
           sketch after this message).

    The total code size saving is: 306 bytes
    Geometric Mean of all benchmarks New / Old: 0.760

    Regressions:
    There are some regressions, particularly where the length (user-arg
    length) is large but the position of the match char is near the
    beginning of the string (in the first VEC). This case has roughly a
    10-20% regression.

    This is because the new logic gives the hot path for immediate matches
    to shorter lengths (the more common input); that case has roughly
    a 15-45% speedup.

    Full xcheck passes on x86_64.
    Reviewed-by: H.J. Lu <hjl.tools@gmail.com>

    (cherry picked from commit af5306a735eb0966fdc2f8ccdafa8888e2df0c87)
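
For illustration, a minimal C sketch of the `lzcnt` return trick from item 4
above (hypothetical helper, not code from the patch; it assumes this path only
handles len <= 32, as the VEC_SIZE fast path at L(ret_vec_x0_test) does):

    #include <stddef.h>
    #include <stdint.h>

    /* 'end' points at the last byte of the range, 'mask' is a 32-bit match
       mask with bit 31 = byte at 'end', bit 30 = 'end - 1', and so on;
       'len' is the user length (<= 32 on this path).  */
    static void *
    ret_from_mask (unsigned char *end, uint32_t mask, size_t len)
    {
      /* The leading-zero count of the mask is the distance from 'end' back
         to the last match.  A zero mask gives 32, which is >= any len
         handled here, so a single compare rejects both "no match" and
         "match before the start of the buffer" without a separate
         test/branch.  */
      unsigned int dist = mask ? (unsigned int) __builtin_clz (mask) : 32;
      if (len <= dist)
        return NULL;
      return end - dist;
    }

The hardware lzcnt instruction itself returns 32 for a zero input, so the
assembly needs neither the ternary nor a separate zero test.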

diff --git a/sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S
index cea2d2a72db7406a..5e9beeeef2677c9f 100644
--- a/sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S
+++ b/sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S
@@ -2,6 +2,7 @@
 # define MEMRCHR __memrchr_avx2_rtm
 #endif
 
+#define COND_VZEROUPPER	COND_VZEROUPPER_XTEST
 #define ZERO_UPPER_VEC_REGISTERS_RETURN \
   ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
 
diff --git a/sysdeps/x86_64/multiarch/memrchr-avx2.S b/sysdeps/x86_64/multiarch/memrchr-avx2.S
index ac7370cb06e9a0fd..5f8e0be18cfe4fad 100644
--- a/sysdeps/x86_64/multiarch/memrchr-avx2.S
+++ b/sysdeps/x86_64/multiarch/memrchr-avx2.S
@@ -21,340 +21,318 @@
 # include <sysdep.h>
 
 # ifndef MEMRCHR
-#  define MEMRCHR	__memrchr_avx2
+#  define MEMRCHR				__memrchr_avx2
 # endif
 
 # ifndef VZEROUPPER
-#  define VZEROUPPER	vzeroupper
+#  define VZEROUPPER			vzeroupper
 # endif
 
 # ifndef SECTION
 #  define SECTION(p)	p##.avx
 # endif
 
-# define VEC_SIZE 32
+# define VEC_SIZE			32
+# define PAGE_SIZE			4096
+	.section SECTION(.text), "ax", @progbits
+ENTRY(MEMRCHR)
+# ifdef __ILP32__
+	/* Clear upper bits.  */
+	and	%RDX_LP, %RDX_LP
+# else
+	test	%RDX_LP, %RDX_LP
+# endif
+	jz	L(zero_0)
 
-	.section SECTION(.text),"ax",@progbits
-ENTRY (MEMRCHR)
-	/* Broadcast CHAR to YMM0.  */
 	vmovd	%esi, %xmm0
-	vpbroadcastb %xmm0, %ymm0
-
-	sub	$VEC_SIZE, %RDX_LP
-	jbe	L(last_vec_or_less)
-
-	add	%RDX_LP, %RDI_LP
-
-	/* Check the last VEC_SIZE bytes.  */
-	vpcmpeqb (%rdi), %ymm0, %ymm1
-	vpmovmskb %ymm1, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x0)
+	/* Get end pointer. Minus one for two reasons. 1) It is necessary for a
+	   correct page cross check and 2) it correctly sets up the end ptr so
+	   it can be subtracted by the lzcnt result.  */
+	leaq	-1(%rdx, %rdi), %rax
 
-	subq	$(VEC_SIZE * 4), %rdi
-	movl	%edi, %ecx
-	andl	$(VEC_SIZE - 1), %ecx
-	jz	L(aligned_more)
+	vpbroadcastb %xmm0, %ymm0
 
-	/* Align data for aligned loads in the loop.  */
-	addq	$VEC_SIZE, %rdi
-	addq	$VEC_SIZE, %rdx
-	andq	$-VEC_SIZE, %rdi
-	subq	%rcx, %rdx
+	/* Check if we can load 1x VEC without crossing a page.  */
+	testl	$(PAGE_SIZE - VEC_SIZE), %eax
+	jz	L(page_cross)
+
+	vpcmpeqb -(VEC_SIZE - 1)(%rax), %ymm0, %ymm1
+	vpmovmskb %ymm1, %ecx
+	cmpq	$VEC_SIZE, %rdx
+	ja	L(more_1x_vec)
+
+L(ret_vec_x0_test):
+	/* If ecx is zero (no matches) lzcnt will set it to 32 (VEC_SIZE),
+	   which will guarantee edx (len) is less than or equal to it.  */
+	lzcntl	%ecx, %ecx
+
+	/* Hoist vzeroupper (not great for RTM) to save code size. This allows
+	   all logic for edx (len) <= VEC_SIZE to fit in first cache line.  */
+	COND_VZEROUPPER
+	cmpl	%ecx, %edx
+	jle	L(zero_0)
+	subq	%rcx, %rax
+	ret
 
-	.p2align 4
-L(aligned_more):
-	subq	$(VEC_SIZE * 4), %rdx
-	jbe	L(last_4x_vec_or_less)
-
-	/* Check the last 4 * VEC_SIZE.  Only one VEC_SIZE at a time
-	   since data is only aligned to VEC_SIZE.  */
-	vpcmpeqb (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
-	vpmovmskb %ymm1, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x3)
-
-	vpcmpeqb (VEC_SIZE * 2)(%rdi), %ymm0, %ymm2
-	vpmovmskb %ymm2, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x2)
-
-	vpcmpeqb VEC_SIZE(%rdi), %ymm0, %ymm3
-	vpmovmskb %ymm3, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x1)
-
-	vpcmpeqb (%rdi), %ymm0, %ymm4
-	vpmovmskb %ymm4, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x0)
-
-	/* Align data to 4 * VEC_SIZE for loop with fewer branches.
-	   There are some overlaps with above if data isn't aligned
-	   to 4 * VEC_SIZE.  */
-	movl	%edi, %ecx
-	andl	$(VEC_SIZE * 4 - 1), %ecx
-	jz	L(loop_4x_vec)
-
-	addq	$(VEC_SIZE * 4), %rdi
-	addq	$(VEC_SIZE * 4), %rdx
-	andq	$-(VEC_SIZE * 4), %rdi
-	subq	%rcx, %rdx
+	/* Fits in aligning bytes of first cache line.  */
+L(zero_0):
+	xorl	%eax, %eax
+	ret
 
-	.p2align 4
-L(loop_4x_vec):
-	/* Compare 4 * VEC at a time forward.  */
-	subq	$(VEC_SIZE * 4), %rdi
-	subq	$(VEC_SIZE * 4), %rdx
-	jbe	L(last_4x_vec_or_less)
-
-	vmovdqa	(%rdi), %ymm1
-	vmovdqa	VEC_SIZE(%rdi), %ymm2
-	vmovdqa	(VEC_SIZE * 2)(%rdi), %ymm3
-	vmovdqa	(VEC_SIZE * 3)(%rdi), %ymm4
-
-	vpcmpeqb %ymm1, %ymm0, %ymm1
-	vpcmpeqb %ymm2, %ymm0, %ymm2
-	vpcmpeqb %ymm3, %ymm0, %ymm3
-	vpcmpeqb %ymm4, %ymm0, %ymm4
-
-	vpor	%ymm1, %ymm2, %ymm5
-	vpor	%ymm3, %ymm4, %ymm6
-	vpor	%ymm5, %ymm6, %ymm5
-
-	vpmovmskb %ymm5, %eax
-	testl	%eax, %eax
-	jz	L(loop_4x_vec)
-
-	/* There is a match.  */
-	vpmovmskb %ymm4, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x3)
-
-	vpmovmskb %ymm3, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x2)
-
-	vpmovmskb %ymm2, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x1)
-
-	vpmovmskb %ymm1, %eax
-	bsrl	%eax, %eax
-	addq	%rdi, %rax
+	.p2align 4,, 9
+L(ret_vec_x0):
+	lzcntl	%ecx, %ecx
+	subq	%rcx, %rax
 L(return_vzeroupper):
 	ZERO_UPPER_VEC_REGISTERS_RETURN
 
-	.p2align 4
-L(last_4x_vec_or_less):
-	addl	$(VEC_SIZE * 4), %edx
-	cmpl	$(VEC_SIZE * 2), %edx
-	jbe	L(last_2x_vec)
-
-	vpcmpeqb (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
-	vpmovmskb %ymm1, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x3)
-
-	vpcmpeqb (VEC_SIZE * 2)(%rdi), %ymm0, %ymm2
-	vpmovmskb %ymm2, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x2)
-
-	vpcmpeqb VEC_SIZE(%rdi), %ymm0, %ymm3
-	vpmovmskb %ymm3, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x1_check)
-	cmpl	$(VEC_SIZE * 3), %edx
-	jbe	L(zero)
-
-	vpcmpeqb (%rdi), %ymm0, %ymm4
-	vpmovmskb %ymm4, %eax
-	testl	%eax, %eax
-	jz	L(zero)
-	bsrl	%eax, %eax
-	subq	$(VEC_SIZE * 4), %rdx
-	addq	%rax, %rdx
-	jl	L(zero)
-	addq	%rdi, %rax
-	VZEROUPPER_RETURN
-
-	.p2align 4
+	.p2align 4,, 10
+L(more_1x_vec):
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x0)
+
+	/* Align rax (string pointer).  */
+	andq	$-VEC_SIZE, %rax
+
+	/* Recompute remaining length after aligning.  */
+	movq	%rax, %rdx
+	/* Need this comparison next no matter what.  */
+	vpcmpeqb -(VEC_SIZE)(%rax), %ymm0, %ymm1
+	subq	%rdi, %rdx
+	decq	%rax
+	vpmovmskb %ymm1, %ecx
+	/* Fall through for short (hotter than length).  */
+	cmpq	$(VEC_SIZE * 2), %rdx
+	ja	L(more_2x_vec)
 L(last_2x_vec):
-	vpcmpeqb (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
-	vpmovmskb %ymm1, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x3_check)
 	cmpl	$VEC_SIZE, %edx
-	jbe	L(zero)
-
-	vpcmpeqb (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
-	vpmovmskb %ymm1, %eax
-	testl	%eax, %eax
-	jz	L(zero)
-	bsrl	%eax, %eax
-	subq	$(VEC_SIZE * 2), %rdx
-	addq	%rax, %rdx
-	jl	L(zero)
-	addl	$(VEC_SIZE * 2), %eax
-	addq	%rdi, %rax
-	VZEROUPPER_RETURN
-
-	.p2align 4
-L(last_vec_x0):
-	bsrl	%eax, %eax
-	addq	%rdi, %rax
-	VZEROUPPER_RETURN
+	jbe	L(ret_vec_x0_test)
+
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x0)
+
+	vpcmpeqb -(VEC_SIZE * 2 - 1)(%rax), %ymm0, %ymm1
+	vpmovmskb %ymm1, %ecx
+	/* 64-bit lzcnt. This will naturally add 32 to position.  */
+	lzcntq	%rcx, %rcx
+	COND_VZEROUPPER
+	cmpl	%ecx, %edx
+	jle	L(zero_0)
+	subq	%rcx, %rax
+	ret
 
-	.p2align 4
-L(last_vec_x1):
-	bsrl	%eax, %eax
-	addl	$VEC_SIZE, %eax
-	addq	%rdi, %rax
-	VZEROUPPER_RETURN
 
-	.p2align 4
-L(last_vec_x2):
-	bsrl	%eax, %eax
-	addl	$(VEC_SIZE * 2), %eax
-	addq	%rdi, %rax
+	/* Inexpensive place to put this regarding code size / target alignments
+	   / ICache NLP. Necessary for 2-byte encoding of jump to page cross
+	   case which in turn is necessary for hot path (len <= VEC_SIZE) to fit
+	   in first cache line.  */
+L(page_cross):
+	movq	%rax, %rsi
+	andq	$-VEC_SIZE, %rsi
+	vpcmpeqb (%rsi), %ymm0, %ymm1
+	vpmovmskb %ymm1, %ecx
+	/* Shift out negative alignment (because we are starting from endptr and
+	   working backwards).  */
+	movl	%eax, %r8d
+	/* notl because eax already has endptr - 1.  (-x = ~(x - 1)).  */
+	notl	%r8d
+	shlxl	%r8d, %ecx, %ecx
+	cmpq	%rdi, %rsi
+	ja	L(more_1x_vec)
+	lzcntl	%ecx, %ecx
+	COND_VZEROUPPER
+	cmpl	%ecx, %edx
+	jle	L(zero_0)
+	subq	%rcx, %rax
+	ret
+	.p2align 4,, 11
+L(ret_vec_x1):
+	/* This will naturally add 32 to position.  */
+	lzcntq	%rcx, %rcx
+	subq	%rcx, %rax
 	VZEROUPPER_RETURN
+	.p2align 4,, 10
+L(more_2x_vec):
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x0)
 
-	.p2align 4
-L(last_vec_x3):
-	bsrl	%eax, %eax
-	addl	$(VEC_SIZE * 3), %eax
-	addq	%rdi, %rax
-	ret
+	vpcmpeqb -(VEC_SIZE * 2 - 1)(%rax), %ymm0, %ymm1
+	vpmovmskb %ymm1, %ecx
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x1)
 
-	.p2align 4
-L(last_vec_x1_check):
-	bsrl	%eax, %eax
-	subq	$(VEC_SIZE * 3), %rdx
-	addq	%rax, %rdx
-	jl	L(zero)
-	addl	$VEC_SIZE, %eax
-	addq	%rdi, %rax
-	VZEROUPPER_RETURN
 
-	.p2align 4
-L(last_vec_x3_check):
-	bsrl	%eax, %eax
-	subq	$VEC_SIZE, %rdx
-	addq	%rax, %rdx
-	jl	L(zero)
-	addl	$(VEC_SIZE * 3), %eax
-	addq	%rdi, %rax
-	VZEROUPPER_RETURN
+	/* Needed no matter what.  */
+	vpcmpeqb -(VEC_SIZE * 3 - 1)(%rax), %ymm0, %ymm1
+	vpmovmskb %ymm1, %ecx
 
-	.p2align 4
-L(zero):
-	xorl	%eax, %eax
-	VZEROUPPER_RETURN
+	subq	$(VEC_SIZE * 4), %rdx
+	ja	L(more_4x_vec)
+
+	cmpl	$(VEC_SIZE * -1), %edx
+	jle	L(ret_vec_x2_test)
+
+L(last_vec):
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x2)
+
+	/* Needed no matter what.  */
+	vpcmpeqb -(VEC_SIZE * 4 - 1)(%rax), %ymm0, %ymm1
+	vpmovmskb %ymm1, %ecx
+	lzcntl	%ecx, %ecx
+	subq	$(VEC_SIZE * 3), %rax
+	COND_VZEROUPPER
+	subq	%rcx, %rax
+	cmpq	%rax, %rdi
+	ja	L(zero_2)
+	ret
 
-	.p2align 4
-L(null):
+	/* First in aligning bytes.  */
+L(zero_2):
 	xorl	%eax, %eax
 	ret
 
-	.p2align 4
-L(last_vec_or_less_aligned):
-	movl	%edx, %ecx
+	.p2align 4,, 4
+L(ret_vec_x2_test):
+	lzcntl	%ecx, %ecx
+	subq	$(VEC_SIZE * 2), %rax
+	COND_VZEROUPPER
+	subq	%rcx, %rax
+	cmpq	%rax, %rdi
+	ja	L(zero_2)
+	ret
 
-	vpcmpeqb (%rdi), %ymm0, %ymm1
 
-	movl	$1, %edx
-	/* Support rdx << 32.  */
-	salq	%cl, %rdx
-	subq	$1, %rdx
+	.p2align 4,, 11
+L(ret_vec_x2):
+	/* ecx must be non-zero.  */
+	bsrl	%ecx, %ecx
+	leaq	(VEC_SIZE * -3 + 1)(%rcx, %rax), %rax
+	VZEROUPPER_RETURN
 
-	vpmovmskb %ymm1, %eax
+	.p2align 4,, 14
+L(ret_vec_x3):
+	/* ecx must be non-zero.  */
+	bsrl	%ecx, %ecx
+	leaq	(VEC_SIZE * -4 + 1)(%rcx, %rax), %rax
+	VZEROUPPER_RETURN
 
-	/* Remove the trailing bytes.  */
-	andl	%edx, %eax
-	testl	%eax, %eax
-	jz	L(zero)
 
-	bsrl	%eax, %eax
-	addq	%rdi, %rax
-	VZEROUPPER_RETURN
 
 	.p2align 4
-L(last_vec_or_less):
-	addl	$VEC_SIZE, %edx
+L(more_4x_vec):
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x2)
 
-	/* Check for zero length.  */
-	testl	%edx, %edx
-	jz	L(null)
+	vpcmpeqb -(VEC_SIZE * 4 - 1)(%rax), %ymm0, %ymm1
+	vpmovmskb %ymm1, %ecx
 
-	movl	%edi, %ecx
-	andl	$(VEC_SIZE - 1), %ecx
-	jz	L(last_vec_or_less_aligned)
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x3)
 
-	movl	%ecx, %esi
-	movl	%ecx, %r8d
-	addl	%edx, %esi
-	andq	$-VEC_SIZE, %rdi
+	/* Check if near end before re-aligning (otherwise might do an
+	   unnecessary loop iteration).  */
+	addq	$-(VEC_SIZE * 4), %rax
+	cmpq	$(VEC_SIZE * 4), %rdx
+	jbe	L(last_4x_vec)
 
-	subl	$VEC_SIZE, %esi
-	ja	L(last_vec_2x_aligned)
+	/* Align rax to (VEC_SIZE - 1).  */
+	orq	$(VEC_SIZE * 4 - 1), %rax
+	movq	%rdi, %rdx
+	/* Get endptr for loop in rdx. NB: Can't just do while rax > rdi because
+	   lengths that overflow can be valid and break the comparison.  */
+	orq	$(VEC_SIZE * 4 - 1), %rdx
 
-	/* Check the last VEC.  */
-	vpcmpeqb (%rdi), %ymm0, %ymm1
-	vpmovmskb %ymm1, %eax
-
-	/* Remove the leading and trailing bytes.  */
-	sarl	%cl, %eax
-	movl	%edx, %ecx
+	.p2align 4
+L(loop_4x_vec):
+	/* Need this comparison next no matter what.  */
+	vpcmpeqb -(VEC_SIZE * 1 - 1)(%rax), %ymm0, %ymm1
+	vpcmpeqb -(VEC_SIZE * 2 - 1)(%rax), %ymm0, %ymm2
+	vpcmpeqb -(VEC_SIZE * 3 - 1)(%rax), %ymm0, %ymm3
+	vpcmpeqb -(VEC_SIZE * 4 - 1)(%rax), %ymm0, %ymm4
 
-	movl	$1, %edx
-	sall	%cl, %edx
-	subl	$1, %edx
+	vpor	%ymm1, %ymm2, %ymm2
+	vpor	%ymm3, %ymm4, %ymm4
+	vpor	%ymm2, %ymm4, %ymm4
+	vpmovmskb %ymm4, %esi
 
-	andl	%edx, %eax
-	testl	%eax, %eax
-	jz	L(zero)
+	testl	%esi, %esi
+	jnz	L(loop_end)
 
-	bsrl	%eax, %eax
-	addq	%rdi, %rax
-	addq	%r8, %rax
-	VZEROUPPER_RETURN
+	addq	$(VEC_SIZE * -4), %rax
+	cmpq	%rdx, %rax
+	jne	L(loop_4x_vec)
 
-	.p2align 4
-L(last_vec_2x_aligned):
-	movl	%esi, %ecx
+	subl	%edi, %edx
+	incl	%edx
 
-	/* Check the last VEC.  */
-	vpcmpeqb VEC_SIZE(%rdi), %ymm0, %ymm1
+L(last_4x_vec):
+	/* Used no matter what.  */
+	vpcmpeqb -(VEC_SIZE * 1 - 1)(%rax), %ymm0, %ymm1
+	vpmovmskb %ymm1, %ecx
 
-	movl	$1, %edx
-	sall	%cl, %edx
-	subl	$1, %edx
+	cmpl	$(VEC_SIZE * 2), %edx
+	jbe	L(last_2x_vec)
 
-	vpmovmskb %ymm1, %eax
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x0_end)
 
-	/* Remove the trailing bytes.  */
-	andl	%edx, %eax
+	vpcmpeqb -(VEC_SIZE * 2 - 1)(%rax), %ymm0, %ymm1
+	vpmovmskb %ymm1, %ecx
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x1_end)
 
-	testl	%eax, %eax
-	jnz	L(last_vec_x1)
+	/* Used no matter what.  */
+	vpcmpeqb -(VEC_SIZE * 3 - 1)(%rax), %ymm0, %ymm1
+	vpmovmskb %ymm1, %ecx
 
-	/* Check the second last VEC.  */
-	vpcmpeqb (%rdi), %ymm0, %ymm1
+	cmpl	$(VEC_SIZE * 3), %edx
+	ja	L(last_vec)
+
+	lzcntl	%ecx, %ecx
+	subq	$(VEC_SIZE * 2), %rax
+	COND_VZEROUPPER
+	subq	%rcx, %rax
+	cmpq	%rax, %rdi
+	jbe	L(ret0)
+	xorl	%eax, %eax
+L(ret0):
+	ret
 
-	movl	%r8d, %ecx
 
-	vpmovmskb %ymm1, %eax
+	.p2align 4
+L(loop_end):
+	vpmovmskb %ymm1, %ecx
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x0_end)
+
+	vpmovmskb %ymm2, %ecx
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x1_end)
+
+	vpmovmskb %ymm3, %ecx
+	/* Combine last 2 VEC matches. If ecx (VEC3) is zero (no CHAR in VEC3)
+	   then it won't affect the result in esi (VEC4). If ecx is non-zero
+	   then CHAR in VEC3 and bsrq will use that position.  */
+	salq	$32, %rcx
+	orq	%rsi, %rcx
+	bsrq	%rcx, %rcx
+	leaq	(VEC_SIZE * -4 + 1)(%rcx, %rax), %rax
+	VZEROUPPER_RETURN
 
-	/* Remove the leading bytes.  Must use unsigned right shift for
-	   bsrl below.  */
-	shrl	%cl, %eax
-	testl	%eax, %eax
-	jz	L(zero)
+	.p2align 4,, 4
+L(ret_vec_x1_end):
+	/* 64-bit version will automatically add 32 (VEC_SIZE).  */
+	lzcntq	%rcx, %rcx
+	subq	%rcx, %rax
+	VZEROUPPER_RETURN
 
-	bsrl	%eax, %eax
-	addq	%rdi, %rax
-	addq	%r8, %rax
+	.p2align 4,, 4
+L(ret_vec_x0_end):
+	lzcntl	%ecx, %ecx
+	subq	%rcx, %rax
 	VZEROUPPER_RETURN
-END (MEMRCHR)
+
+	/* 2 bytes until next cache line.  */
+END(MEMRCHR)
 #endif
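
For illustration, a C sketch of the masking step in the L(page_cross) path of
the new memrchr-avx2.S above (hypothetical helper, not code from the patch).
The aligned load there may pick up bytes above end - 1; shifting the match
mask left by the end pointer's negative alignment discards those bits, after
which the usual lzcnt/compare sequence applies:

    #include <stdint.h>

    /* 'mask' is the 32-bit match mask of the aligned 32-byte block holding
       the last byte (bit i = byte base + i, with base = endm1 & -32);
       'endm1' is end - 1, which the patch keeps in rax.  */
    static uint32_t
    drop_bytes_past_end (uint32_t mask, uintptr_t endm1)
    {
      /* ~endm1 == -(endm1 + 1) == -end.  Shift counts act mod 32, so this
         shifts left by 31 - (endm1 % 32): the bit for the byte at endm1
         lands in bit 31 and bits for bytes past it fall off, mirroring the
         notl + shlxl pair in L(page_cross).  */
      unsigned int shift = (unsigned int) ~endm1 & 31;
      return mask << shift;
    }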