commit 83a986e9fbc301e6056dbc9d9ec6888621b60f67
Author: Noah Goldstein <goldstein.w.n@gmail.com>
Date:   Mon Jun 6 21:11:31 2022 -0700

    x86: Optimize memrchr-evex.S

    The new code:
        1. prioritizes smaller user-arg lengths more.
        2. optimizes target placement more carefully.
        3. reuses logic more.
        4. fixes up various inefficiencies in the logic. The biggest
           case here is the `lzcnt` logic for checking returns, which
           saves either a branch or multiple instructions (sketched
           below).
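
    As an illustrative sketch of that `lzcnt` idea (not the committed
    code verbatim; register roles assumed: %rax = end pointer,
    %eax/%ecx = compare mask, %edx = length), the old code had to
    branch before `bsr` because `bsr` is undefined for a zero mask:

        testl   %eax, %eax
        jz      L(zero)           /* separate no-match branch */
        bsrl    %eax, %eax        /* index of highest match bit */
        addq    %rdi, %rax        /* pointer to the match */

    whereas `lzcnt` of a zero 32-bit mask is architecturally defined to
    be 32 (VEC_SIZE), so a single length compare rejects both "no
    match" and "match before the start of the buffer" at once:

        lzcntl  %ecx, %ecx        /* 32 if ecx == 0 */
        cmpl    %ecx, %edx
        jle     L(zero_0)         /* no match, or out of range */
        subq    %rcx, %rax        /* end ptr minus leading-zero count */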

    The total code size saving is 263 bytes.
    Geometric Mean of all benchmarks New / Old: 0.755

    Regressions:
    There are some regressions, particularly where the length (user-arg
    length) is large but the position of the match char is near the
    beginning of the string (in the first VEC). This case has roughly a
    20% regression.

    This is because the new logic gives the hot path for immediate
    matches to shorter lengths (the more common input). That
    short-length case sees roughly a 35% speedup.

    Full xcheck passes on x86_64.
    Reviewed-by: H.J. Lu <hjl.tools@gmail.com>

    (cherry picked from commit b4209615a06b01c974f47b4998b00e4c7b1aa5d9)
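
    Note on the end-pointer setup the new code relies on throughout (a
    worked sketch of the arithmetic, assuming VEC_SIZE == 32; the
    instructions are taken from the first hunk below). With bit i of
    the mask corresponding to the byte at buf + len - 32 + i:

        leaq    -1(%rdi, %rdx), %rax    /* rax = &buf[len - 1] */
        vpcmpb  $0, -(VEC_SIZE)(%rdi, %rdx), %VECMATCH, %k0
        kmovd   %k0, %ecx
        lzcntl  %ecx, %ecx              /* 31 - i for highest set bit i */
        subq    %rcx, %rax              /* len - 1 - (31 - i) = len - 32 + i */

    i.e. the minus-one in the lea is exactly what makes the lzcnt
    result directly subtractable.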
diff --git a/sysdeps/x86_64/multiarch/memrchr-evex.S b/sysdeps/x86_64/multiarch/memrchr-evex.S
index 16bf8e02b1e80c84..bddc89c3754894ed 100644
--- a/sysdeps/x86_64/multiarch/memrchr-evex.S
+++ b/sysdeps/x86_64/multiarch/memrchr-evex.S
@@ -19,319 +19,316 @@
 #if IS_IN (libc)
 
 # include <sysdep.h>
+# include "evex256-vecs.h"
+# if VEC_SIZE != 32
+#  error "VEC_SIZE != 32 unimplemented"
+# endif
+
+# ifndef MEMRCHR
+#  define MEMRCHR				__memrchr_evex
+# endif
+
+# define PAGE_SIZE			4096
+# define VECMATCH			VEC(0)
+
+	.section SECTION(.text), "ax", @progbits
+ENTRY_P2ALIGN(MEMRCHR, 6)
+# ifdef __ILP32__
+	/* Clear upper bits.  */
+	and	%RDX_LP, %RDX_LP
+# else
+	test	%RDX_LP, %RDX_LP
+# endif
+	jz	L(zero_0)
+
+	/* Get end pointer. Minus one for two reasons. 1) It is necessary for a
+	   correct page cross check and 2) it correctly sets up the end ptr so
+	   that the lzcnt result can be subtracted from it directly.  */
+	leaq	-1(%rdi, %rdx), %rax
+	vpbroadcastb %esi, %VECMATCH
+
+	/* Check if we can load 1x VEC without crossing a page.  */
+	testl	$(PAGE_SIZE - VEC_SIZE), %eax
+	jz	L(page_cross)
+
+	/* Don't use rax for pointer here because EVEX has better encoding with
+	   offset % VEC_SIZE == 0.  */
+	vpcmpb	$0, -(VEC_SIZE)(%rdi, %rdx), %VECMATCH, %k0
+	kmovd	%k0, %ecx
+
+	/* Fall through for rdx (len) <= VEC_SIZE (expect small sizes).  */
+	cmpq	$VEC_SIZE, %rdx
+	ja	L(more_1x_vec)
+L(ret_vec_x0_test):
+
+	/* If ecx is zero (no matches) lzcnt will set it to 32 (VEC_SIZE),
+	   which guarantees edx (len) is less than or equal to it.  */
+	lzcntl	%ecx, %ecx
+	cmpl	%ecx, %edx
+	jle	L(zero_0)
+	subq	%rcx, %rax
+	ret
 
-# define VMOVA		vmovdqa64
-
-# define YMMMATCH	ymm16
-
-# define VEC_SIZE 32
-
-	.section .text.evex,"ax",@progbits
-ENTRY (__memrchr_evex)
-	/* Broadcast CHAR to YMMMATCH.  */
-	vpbroadcastb %esi, %YMMMATCH
-
-	sub	$VEC_SIZE, %RDX_LP
-	jbe	L(last_vec_or_less)
-
-	add	%RDX_LP, %RDI_LP
-
-	/* Check the last VEC_SIZE bytes.  */
-	vpcmpb	$0, (%rdi), %YMMMATCH, %k1
-	kmovd	%k1, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x0)
-
-	subq	$(VEC_SIZE * 4), %rdi
-	movl	%edi, %ecx
-	andl	$(VEC_SIZE - 1), %ecx
-	jz	L(aligned_more)
-
-	/* Align data for aligned loads in the loop.  */
-	addq	$VEC_SIZE, %rdi
-	addq	$VEC_SIZE, %rdx
-	andq	$-VEC_SIZE, %rdi
-	subq	%rcx, %rdx
-
-	.p2align 4
-L(aligned_more):
-	subq	$(VEC_SIZE * 4), %rdx
-	jbe	L(last_4x_vec_or_less)
-
-	/* Check the last 4 * VEC_SIZE.  Only one VEC_SIZE at a time
-	   since data is only aligned to VEC_SIZE.  */
-	vpcmpb	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1
-	kmovd	%k1, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x3)
-
-	vpcmpb	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k2
-	kmovd	%k2, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x2)
-
-	vpcmpb	$0, VEC_SIZE(%rdi), %YMMMATCH, %k3
-	kmovd	%k3, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x1)
-
-	vpcmpb	$0, (%rdi), %YMMMATCH, %k4
-	kmovd	%k4, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x0)
-
-	/* Align data to 4 * VEC_SIZE for loop with fewer branches.
-	   There are some overlaps with above if data isn't aligned
-	   to 4 * VEC_SIZE.  */
-	movl	%edi, %ecx
-	andl	$(VEC_SIZE * 4 - 1), %ecx
-	jz	L(loop_4x_vec)
-
-	addq	$(VEC_SIZE * 4), %rdi
-	addq	$(VEC_SIZE * 4), %rdx
-	andq	$-(VEC_SIZE * 4), %rdi
-	subq	%rcx, %rdx
+	/* Fits in aligning bytes of first cache line.  */
+L(zero_0):
+	xorl	%eax, %eax
+	ret
 
-	.p2align 4
-L(loop_4x_vec):
-	/* Compare 4 * VEC at a time forward.  */
-	subq	$(VEC_SIZE * 4), %rdi
-	subq	$(VEC_SIZE * 4), %rdx
-	jbe	L(last_4x_vec_or_less)
-
-	vpcmpb	$0, (%rdi), %YMMMATCH, %k1
-	vpcmpb	$0, VEC_SIZE(%rdi), %YMMMATCH, %k2
-	kord	%k1, %k2, %k5
-	vpcmpb	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k3
-	vpcmpb	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k4
-
-	kord	%k3, %k4, %k6
-	kortestd %k5, %k6
-	jz	L(loop_4x_vec)
-
-	/* There is a match.  */
-	kmovd	%k4, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x3)
-
-	kmovd	%k3, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x2)
-
-	kmovd	%k2, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x1)
-
-	kmovd	%k1, %eax
-	bsrl	%eax, %eax
-	addq	%rdi, %rax
+	.p2align 4,, 9
+L(ret_vec_x0_dec):
+	decq	%rax
+L(ret_vec_x0):
+	lzcntl	%ecx, %ecx
+	subq	%rcx, %rax
 	ret
 
-	.p2align 4
-L(last_4x_vec_or_less):
-	addl	$(VEC_SIZE * 4), %edx
-	cmpl	$(VEC_SIZE * 2), %edx
-	jbe	L(last_2x_vec)
+	.p2align 4,, 10
+L(more_1x_vec):
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x0)
 
-	vpcmpb	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1
-	kmovd	%k1, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x3)
+	/* Align rax (pointer to string).  */
+	andq	$-VEC_SIZE, %rax
 
-	vpcmpb	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k2
-	kmovd	%k2, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x2)
+	/* Recompute length after aligning.  */
+	movq	%rax, %rdx
 
-	vpcmpb	$0, VEC_SIZE(%rdi), %YMMMATCH, %k3
-	kmovd	%k3, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x1_check)
-	cmpl	$(VEC_SIZE * 3), %edx
-	jbe	L(zero)
+	/* Need no matter what.  */
+	vpcmpb	$0, -(VEC_SIZE)(%rax), %VECMATCH, %k0
+	kmovd	%k0, %ecx
 
-	vpcmpb	$0, (%rdi), %YMMMATCH, %k4
-	kmovd	%k4, %eax
-	testl	%eax, %eax
-	jz	L(zero)
-	bsrl	%eax, %eax
-	subq	$(VEC_SIZE * 4), %rdx
-	addq	%rax, %rdx
-	jl	L(zero)
-	addq	%rdi, %rax
-	ret
+	subq	%rdi, %rdx
 
-	.p2align 4
+	cmpq	$(VEC_SIZE * 2), %rdx
+	ja	L(more_2x_vec)
 L(last_2x_vec):
-	vpcmpb	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1
-	kmovd	%k1, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x3_check)
+
+	/* Must dec rax because L(ret_vec_x0_test) expects it.  */
+	decq	%rax
 	cmpl	$VEC_SIZE, %edx
-	jbe	L(zero)
-
-	vpcmpb	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k1
-	kmovd	%k1, %eax
-	testl	%eax, %eax
-	jz	L(zero)
-	bsrl	%eax, %eax
-	subq	$(VEC_SIZE * 2), %rdx
-	addq	%rax, %rdx
-	jl	L(zero)
-	addl	$(VEC_SIZE * 2), %eax
-	addq	%rdi, %rax
+	jbe	L(ret_vec_x0_test)
+
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x0)
+
+	/* Don't use rax for pointer here because EVEX has better encoding with
+	   offset % VEC_SIZE == 0.  */
+	vpcmpb	$0, -(VEC_SIZE * 2)(%rdi, %rdx), %VECMATCH, %k0
+	kmovd	%k0, %ecx
+	/* NB: 64-bit lzcnt. This will naturally add 32 to position.  */
+	lzcntq	%rcx, %rcx
+	cmpl	%ecx, %edx
+	jle	L(zero_0)
+	subq	%rcx, %rax
 	ret
 
-	.p2align 4
-L(last_vec_x0):
-	bsrl	%eax, %eax
-	addq	%rdi, %rax
+	/* Inexpensive place to put this regarding code size / target alignments
+	   / ICache NLP. Necessary for 2-byte encoding of jump to page cross
+	   case which in turn is necessary for hot path (len <= VEC_SIZE) to fit
+	   in first cache line.  */
+L(page_cross):
+	movq	%rax, %rsi
+	andq	$-VEC_SIZE, %rsi
+	vpcmpb	$0, (%rsi), %VECMATCH, %k0
+	kmovd	%k0, %r8d
+	/* Shift out negative alignment (because we are starting from endptr and
+	   working backwards).  */
+	movl	%eax, %ecx
+	/* notl because eax already has endptr - 1.  (-x = ~(x - 1)).  */
+	notl	%ecx
+	shlxl	%ecx, %r8d, %ecx
+	cmpq	%rdi, %rsi
+	ja	L(more_1x_vec)
+	lzcntl	%ecx, %ecx
+	cmpl	%ecx, %edx
+	jle	L(zero_1)
+	subq	%rcx, %rax
 	ret
 
-	.p2align 4
-L(last_vec_x1):
-	bsrl	%eax, %eax
-	addl	$VEC_SIZE, %eax
-	addq	%rdi, %rax
+	/* Continue creating zero labels that fit in aligning bytes and get
+	   2-byte encoding / are in the same cache line as condition.  */
+L(zero_1):
+	xorl	%eax, %eax
 	ret
 
-	.p2align 4
-L(last_vec_x2):
-	bsrl	%eax, %eax
-	addl	$(VEC_SIZE * 2), %eax
-	addq	%rdi, %rax
+	.p2align 4,, 8
+L(ret_vec_x1):
+	/* This will naturally add 32 to position.  */
+	bsrl	%ecx, %ecx
+	leaq	-(VEC_SIZE * 2)(%rcx, %rax), %rax
 	ret
 
-	.p2align 4
-L(last_vec_x3):
-	bsrl	%eax, %eax
-	addl	$(VEC_SIZE * 3), %eax
-	addq	%rdi, %rax
-	ret
+	.p2align 4,, 8
+L(more_2x_vec):
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x0_dec)
 
-	.p2align 4
-L(last_vec_x1_check):
-	bsrl	%eax, %eax
-	subq	$(VEC_SIZE * 3), %rdx
-	addq	%rax, %rdx
-	jl	L(zero)
-	addl	$VEC_SIZE, %eax
-	addq	%rdi, %rax
-	ret
+	vpcmpb	$0, -(VEC_SIZE * 2)(%rax), %VECMATCH, %k0
+	kmovd	%k0, %ecx
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x1)
 
-	.p2align 4
-L(last_vec_x3_check):
-	bsrl	%eax, %eax
-	subq	$VEC_SIZE, %rdx
-	addq	%rax, %rdx
-	jl	L(zero)
-	addl	$(VEC_SIZE * 3), %eax
-	addq	%rdi, %rax
-	ret
+	/* Need no matter what.  */
+	vpcmpb	$0, -(VEC_SIZE * 3)(%rax), %VECMATCH, %k0
+	kmovd	%k0, %ecx
 
-	.p2align 4
-L(zero):
-	xorl	%eax, %eax
+	subq	$(VEC_SIZE * 4), %rdx
+	ja	L(more_4x_vec)
+
+	cmpl	$(VEC_SIZE * -1), %edx
+	jle	L(ret_vec_x2_test)
+L(last_vec):
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x2)
+
+
+	/* Need no matter what.  */
+	vpcmpb	$0, -(VEC_SIZE * 4)(%rax), %VECMATCH, %k0
+	kmovd	%k0, %ecx
+	lzcntl	%ecx, %ecx
+	subq	$(VEC_SIZE * 3 + 1), %rax
+	subq	%rcx, %rax
+	cmpq	%rax, %rdi
+	ja	L(zero_1)
 	ret
 
-	.p2align 4
-L(last_vec_or_less_aligned):
-	movl	%edx, %ecx
-
-	vpcmpb	$0, (%rdi), %YMMMATCH, %k1
-
-	movl	$1, %edx
-	/* Support rdx << 32.  */
-	salq	%cl, %rdx
-	subq	$1, %rdx
-
-	kmovd	%k1, %eax
-
-	/* Remove the trailing bytes.  */
-	andl	%edx, %eax
-	testl	%eax, %eax
-	jz	L(zero)
-
-	bsrl	%eax, %eax
-	addq	%rdi, %rax
+	.p2align 4,, 8
+L(ret_vec_x2_test):
+	lzcntl	%ecx, %ecx
+	subq	$(VEC_SIZE * 2 + 1), %rax
+	subq	%rcx, %rax
+	cmpq	%rax, %rdi
+	ja	L(zero_1)
 	ret
 
-	.p2align 4
-L(last_vec_or_less):
-	addl	$VEC_SIZE, %edx
-
-	/* Check for zero length.  */
-	testl	%edx, %edx
-	jz	L(zero)
-
-	movl	%edi, %ecx
-	andl	$(VEC_SIZE - 1), %ecx
-	jz	L(last_vec_or_less_aligned)
-
-	movl	%ecx, %esi
-	movl	%ecx, %r8d
-	addl	%edx, %esi
-	andq	$-VEC_SIZE, %rdi
+	.p2align 4,, 8
+L(ret_vec_x2):
+	bsrl	%ecx, %ecx
+	leaq	-(VEC_SIZE * 3)(%rcx, %rax), %rax
+	ret
 
-	subl	$VEC_SIZE, %esi
-	ja	L(last_vec_2x_aligned)
+	.p2align 4,, 8
+L(ret_vec_x3):
+	bsrl	%ecx, %ecx
+	leaq	-(VEC_SIZE * 4)(%rcx, %rax), %rax
+	ret
 
-	/* Check the last VEC.  */
-	vpcmpb	$0, (%rdi), %YMMMATCH, %k1
-	kmovd	%k1, %eax
+	.p2align 4,, 8
+L(more_4x_vec):
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x2)
 
-	/* Remove the leading and trailing bytes.  */
-	sarl	%cl, %eax
-	movl	%edx, %ecx
+	vpcmpb	$0, -(VEC_SIZE * 4)(%rax), %VECMATCH, %k0
+	kmovd	%k0, %ecx
 
-	movl	$1, %edx
-	sall	%cl, %edx
-	subl	$1, %edx
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x3)
 
-	andl	%edx, %eax
-	testl	%eax, %eax
-	jz	L(zero)
+	/* Check if near end before re-aligning (otherwise might do an
+	   unnecessary loop iteration).  */
+	addq	$-(VEC_SIZE * 4), %rax
+	cmpq	$(VEC_SIZE * 4), %rdx
+	jbe	L(last_4x_vec)
 
-	bsrl	%eax, %eax
-	addq	%rdi, %rax
-	addq	%r8, %rax
-	ret
+	decq	%rax
+	andq	$-(VEC_SIZE * 4), %rax
+	movq	%rdi, %rdx
+	/* Get endptr for loop in rdx. NB: Can't just do while rax > rdi because
+	   lengths that overflow can be valid and break the comparison.  */
+	andq	$-(VEC_SIZE * 4), %rdx
 
 	.p2align 4
-L(last_vec_2x_aligned):
-	movl	%esi, %ecx
-
-	/* Check the last VEC.  */
-	vpcmpb	$0, VEC_SIZE(%rdi), %YMMMATCH, %k1
+L(loop_4x_vec):
+	/* Store 1 where not-equals and 0 where equals in k1 (used to mask later
+	   on).  */
+	vpcmpb	$4, (VEC_SIZE * 3)(%rax), %VECMATCH, %k1
+
+	/* VEC(2/3) will have a zero byte where we found a CHAR.  */
+	vpxorq	(VEC_SIZE * 2)(%rax), %VECMATCH, %VEC(2)
+	vpxorq	(VEC_SIZE * 1)(%rax), %VECMATCH, %VEC(3)
+	vpcmpb	$0, (VEC_SIZE * 0)(%rax), %VECMATCH, %k4
+
+	/* Combine VEC(2/3) with min and maskz with k1 (k1 has a zero bit where
+	   CHAR is found and VEC(2/3) have a zero byte where CHAR is found).  */
+	vpminub	%VEC(2), %VEC(3), %VEC(3){%k1}{z}
+	vptestnmb %VEC(3), %VEC(3), %k2
+
+	/* Any 1s and we found CHAR.  */
+	kortestd %k2, %k4
+	jnz	L(loop_end)
+
+	addq	$-(VEC_SIZE * 4), %rax
+	cmpq	%rdx, %rax
+	jne	L(loop_4x_vec)
+
+	/* Need to re-adjust rdx / rax for L(last_4x_vec).  */
+	subq	$-(VEC_SIZE * 4), %rdx
+	movq	%rdx, %rax
+	subl	%edi, %edx
+L(last_4x_vec):
+
+	/* Used no matter what.  */
+	vpcmpb	$0, (VEC_SIZE * -1)(%rax), %VECMATCH, %k0
+	kmovd	%k0, %ecx
 
-	movl	$1, %edx
-	sall	%cl, %edx
-	subl	$1, %edx
+	cmpl	$(VEC_SIZE * 2), %edx
+	jbe	L(last_2x_vec)
 
-	kmovd	%k1, %eax
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x0_dec)
 
-	/* Remove the trailing bytes.  */
-	andl	%edx, %eax
 
-	testl	%eax, %eax
-	jnz	L(last_vec_x1)
+	vpcmpb	$0, (VEC_SIZE * -2)(%rax), %VECMATCH, %k0
+	kmovd	%k0, %ecx
 
-	/* Check the second last VEC.  */
-	vpcmpb	$0, (%rdi), %YMMMATCH, %k1
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x1)
 
-	movl	%r8d, %ecx
+	/* Used no matter what.  */
+	vpcmpb	$0, (VEC_SIZE * -3)(%rax), %VECMATCH, %k0
+	kmovd	%k0, %ecx
 
-	kmovd	%k1, %eax
+	cmpl	$(VEC_SIZE * 3), %edx
+	ja	L(last_vec)
 
-	/* Remove the leading bytes.  Must use unsigned right shift for
-	   bsrl below.  */
-	shrl	%cl, %eax
-	testl	%eax, %eax
-	jz	L(zero)
+	lzcntl	%ecx, %ecx
+	subq	$(VEC_SIZE * 2 + 1), %rax
+	subq	%rcx, %rax
+	cmpq	%rax, %rdi
+	jbe	L(ret_1)
+	xorl	%eax, %eax
+L(ret_1):
+	ret
 
-	bsrl	%eax, %eax
-	addq	%rdi, %rax
-	addq	%r8, %rax
+	.p2align 4,, 6
+L(loop_end):
+	kmovd	%k1, %ecx
+	notl	%ecx
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x0_end)
+
+	vptestnmb %VEC(2), %VEC(2), %k0
+	kmovd	%k0, %ecx
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x1_end)
+
+	kmovd	%k2, %ecx
+	kmovd	%k4, %esi
+	/* Combine last 2 VEC matches. If ecx (VEC3) is zero (no CHAR in VEC3)
+	   then it won't affect the result in esi (VEC4). If ecx is non-zero
+	   then CHAR is in VEC3 and bsrq will use that position.  */
+	salq	$32, %rcx
+	orq	%rsi, %rcx
+	bsrq	%rcx, %rcx
+	addq	%rcx, %rax
+	ret
+	.p2align 4,, 4
+L(ret_vec_x0_end):
+	addq	$(VEC_SIZE), %rax
+L(ret_vec_x1_end):
+	bsrl	%ecx, %ecx
+	leaq	(VEC_SIZE * 2)(%rax, %rcx), %rax
 	ret
-END (__memrchr_evex)
+
+END(MEMRCHR)
 #endif
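
A note on the L(loop_end) tail above (a sketch, not additional committed
code; assumed roles: %ecx holds the match mask of the higher-addressed
VEC, %esi the mask of the lower-addressed VEC, and %rax the base of that
2-VEC window):

	salq	$32, %rcx	/* higher VEC's mask into bits 32..63 */
	orq	%rsi, %rcx	/* lower VEC's mask into bits 0..31 */
	bsrq	%rcx, %rcx	/* bit index 0..63 of the last match */
	addq	%rcx, %rax	/* window base + byte offset of match */

Stacking the two 32-bit masks into one 64-bit word lets a single bsrq
pick the highest match across both vectors. The earlier kortestd (plus
the two returns taken before this point) guarantees at least one bit is
set in the combined word, so bsrq never sees a zero input.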