commit 83a986e9fbc301e6056dbc9d9ec6888621b60f67
Author: Noah Goldstein <goldstein.w.n@gmail.com>
Date:   Mon Jun 6 21:11:31 2022 -0700

    x86: Optimize memrchr-evex.S

    The new code:
        1. prioritizes smaller user-arg lengths more.
        2. optimizes target placement more carefully.
        3. reuses logic more.
        4. fixes up various inefficiencies in the logic. The biggest
           case here is the `lzcnt` logic for checking returns, which
           saves either a branch or multiple instructions (see the
           sketch below).
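
    As a quick illustration of point 4, the lzcnt-based return path
    (condensed from L(ret_vec_x0_test) in the patch below; match mask in
    ecx, length in edx, end pointer minus one in rax) handles the
    no-match case and computes the return value with a single lzcnt,
    where the old code needed a bsrl plus separate zero/bounds checks:

        lzcntl  %ecx, %ecx      /* 32 if there is no match in the VEC.  */
        cmpl    %ecx, %edx      /* len <= lzcnt means no match in range.  */
        jle     L(zero_0)
        subq    %rcx, %rax      /* rax holds endptr - 1.  */
        ret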

    The total code size saving is 263 bytes.
    Geometric mean of all benchmarks, New / Old: 0.755

    Regressions:
    There are some regressions, particularly where the length (user-arg
    length) is large but the position of the match char is near the
    beginning of the string (in the first VEC). This case has roughly a
    20% regression.

    This is because the new logic gives the hot path for immediate matches
    to shorter lengths (the more common input); that shorter-length case
    has roughly a 35% speedup.
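
    Condensed from the start of the patch below, the new entry does the
    first-VEC compare unconditionally and then falls through for
    len <= VEC_SIZE, so short lengths pay no taken branch, while longer
    lengths branch to L(more_1x_vec) and only then test the same mask:

        vpcmpb  $0, -(VEC_SIZE)(%rdi, %rdx), %VECMATCH, %k0
        kmovd   %k0, %ecx
        cmpq    $VEC_SIZE, %rdx
        ja      L(more_1x_vec)  /* len > VEC_SIZE takes this branch.  */
        /* len <= VEC_SIZE: fall through to the lzcnt return.  */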

    Full xcheck passes on x86_64.
    Reviewed-by: H.J. Lu <hjl.tools@gmail.com>

    (cherry picked from commit b4209615a06b01c974f47b4998b00e4c7b1aa5d9)

diff --git a/sysdeps/x86_64/multiarch/memrchr-evex.S b/sysdeps/x86_64/multiarch/memrchr-evex.S
index 16bf8e02b1e80c84..bddc89c3754894ed 100644
--- a/sysdeps/x86_64/multiarch/memrchr-evex.S
+++ b/sysdeps/x86_64/multiarch/memrchr-evex.S
@@ -19,319 +19,316 @@
 #if IS_IN (libc)
 
 # include <sysdep.h>
+# include "evex256-vecs.h"
+# if VEC_SIZE != 32
+#  error "VEC_SIZE != 32 unimplemented"
+# endif
+
+# ifndef MEMRCHR
+#  define MEMRCHR				__memrchr_evex
+# endif
+
+# define PAGE_SIZE			4096
+# define VECMATCH			VEC(0)
+
+	.section SECTION(.text), "ax", @progbits
+ENTRY_P2ALIGN(MEMRCHR, 6)
+# ifdef __ILP32__
+	/* Clear upper bits.  */
+	and	%RDX_LP, %RDX_LP
+# else
+	test	%RDX_LP, %RDX_LP
+# endif
+	jz	L(zero_0)
+
+	/* Get end pointer. Minus one for two reasons. 1) It is necessary for a
+	   correct page cross check and 2) it correctly sets up end ptr to be
+	   subtract by lzcnt aligned.  */
+	leaq	-1(%rdi, %rdx), %rax
+	vpbroadcastb %esi, %VECMATCH
+
+	/* Check if we can load 1x VEC without cross a page.  */
+	testl	$(PAGE_SIZE - VEC_SIZE), %eax
+	jz	L(page_cross)
+
+	/* Don't use rax for pointer here because EVEX has better encoding with
+	   offset % VEC_SIZE == 0.  */
+	vpcmpb	$0, -(VEC_SIZE)(%rdi, %rdx), %VECMATCH, %k0
+	kmovd	%k0, %ecx
+
+	/* Fall through for rdx (len) <= VEC_SIZE (expect small sizes).  */
+	cmpq	$VEC_SIZE, %rdx
+	ja	L(more_1x_vec)
+L(ret_vec_x0_test):
+
+	/* If ecx is zero (no matches) lzcnt will set it 32 (VEC_SIZE) which
+	   will guarantee edx (len) is less than it.  */
+	lzcntl	%ecx, %ecx
+	cmpl	%ecx, %edx
+	jle	L(zero_0)
+	subq	%rcx, %rax
+	ret
 
-# define VMOVA		vmovdqa64
-
-# define YMMMATCH	ymm16
-
-# define VEC_SIZE 32
-
-	.section .text.evex,"ax",@progbits
-ENTRY (__memrchr_evex)
-	/* Broadcast CHAR to YMMMATCH.  */
-	vpbroadcastb %esi, %YMMMATCH
-
-	sub	$VEC_SIZE, %RDX_LP
-	jbe	L(last_vec_or_less)
-
-	add	%RDX_LP, %RDI_LP
-
-	/* Check the last VEC_SIZE bytes.  */
-	vpcmpb	$0, (%rdi), %YMMMATCH, %k1
-	kmovd	%k1, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x0)
-
-	subq	$(VEC_SIZE * 4), %rdi
-	movl	%edi, %ecx
-	andl	$(VEC_SIZE - 1), %ecx
-	jz	L(aligned_more)
-
-	/* Align data for aligned loads in the loop.  */
-	addq	$VEC_SIZE, %rdi
-	addq	$VEC_SIZE, %rdx
-	andq	$-VEC_SIZE, %rdi
-	subq	%rcx, %rdx
-
-	.p2align 4
-L(aligned_more):
-	subq	$(VEC_SIZE * 4), %rdx
-	jbe	L(last_4x_vec_or_less)
-
-	/* Check the last 4 * VEC_SIZE.  Only one VEC_SIZE at a time
-	   since data is only aligned to VEC_SIZE.  */
-	vpcmpb	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1
-	kmovd	%k1, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x3)
-
-	vpcmpb	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k2
-	kmovd	%k2, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x2)
-
-	vpcmpb	$0, VEC_SIZE(%rdi), %YMMMATCH, %k3
-	kmovd	%k3, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x1)
-
-	vpcmpb	$0, (%rdi), %YMMMATCH, %k4
-	kmovd	%k4, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x0)
-
-	/* Align data to 4 * VEC_SIZE for loop with fewer branches.
-	   There are some overlaps with above if data isn't aligned
-	   to 4 * VEC_SIZE.  */
-	movl	%edi, %ecx
-	andl	$(VEC_SIZE * 4 - 1), %ecx
-	jz	L(loop_4x_vec)
-
-	addq	$(VEC_SIZE * 4), %rdi
-	addq	$(VEC_SIZE * 4), %rdx
-	andq	$-(VEC_SIZE * 4), %rdi
-	subq	%rcx, %rdx
+	/* Fits in aligning bytes of first cache line.  */
+L(zero_0):
+	xorl	%eax, %eax
+	ret
 
-	.p2align 4
-L(loop_4x_vec):
-	/* Compare 4 * VEC at a time forward.  */
-	subq	$(VEC_SIZE * 4), %rdi
-	subq	$(VEC_SIZE * 4), %rdx
-	jbe	L(last_4x_vec_or_less)
-
-	vpcmpb	$0, (%rdi), %YMMMATCH, %k1
-	vpcmpb	$0, VEC_SIZE(%rdi), %YMMMATCH, %k2
-	kord	%k1, %k2, %k5
-	vpcmpb	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k3
-	vpcmpb	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k4
-
-	kord	%k3, %k4, %k6
-	kortestd %k5, %k6
-	jz	L(loop_4x_vec)
-
-	/* There is a match.  */
-	kmovd	%k4, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x3)
-
-	kmovd	%k3, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x2)
-
-	kmovd	%k2, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x1)
-
-	kmovd	%k1, %eax
-	bsrl	%eax, %eax
-	addq	%rdi, %rax
+	.p2align 4,, 9
+L(ret_vec_x0_dec):
+	decq	%rax
+L(ret_vec_x0):
+	lzcntl	%ecx, %ecx
+	subq	%rcx, %rax
 	ret
 
-	.p2align 4
-L(last_4x_vec_or_less):
-	addl	$(VEC_SIZE * 4), %edx
-	cmpl	$(VEC_SIZE * 2), %edx
-	jbe	L(last_2x_vec)
+	.p2align 4,, 10
+L(more_1x_vec):
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x0)
 
-	vpcmpb	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1
-	kmovd	%k1, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x3)
+	/* Align rax (pointer to string).  */
+	andq	$-VEC_SIZE, %rax
 
-	vpcmpb	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k2
-	kmovd	%k2, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x2)
+	/* Recompute length after aligning.  */
+	movq	%rax, %rdx
 
-	vpcmpb	$0, VEC_SIZE(%rdi), %YMMMATCH, %k3
-	kmovd	%k3, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x1_check)
-	cmpl	$(VEC_SIZE * 3), %edx
-	jbe	L(zero)
+	/* Need no matter what.  */
+	vpcmpb	$0, -(VEC_SIZE)(%rax), %VECMATCH, %k0
+	kmovd	%k0, %ecx
 
-	vpcmpb	$0, (%rdi), %YMMMATCH, %k4
-	kmovd	%k4, %eax
-	testl	%eax, %eax
-	jz	L(zero)
-	bsrl	%eax, %eax
-	subq	$(VEC_SIZE * 4), %rdx
-	addq	%rax, %rdx
-	jl	L(zero)
-	addq	%rdi, %rax
-	ret
+	subq	%rdi, %rdx
 
-	.p2align 4
+	cmpq	$(VEC_SIZE * 2), %rdx
+	ja	L(more_2x_vec)
 L(last_2x_vec):
-	vpcmpb	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1
-	kmovd	%k1, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x3_check)
+
+	/* Must dec rax because L(ret_vec_x0_test) expects it.  */
+	decq	%rax
 	cmpl	$VEC_SIZE, %edx
-	jbe	L(zero)
-
-	vpcmpb	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k1
-	kmovd	%k1, %eax
-	testl	%eax, %eax
-	jz	L(zero)
-	bsrl	%eax, %eax
-	subq	$(VEC_SIZE * 2), %rdx
-	addq	%rax, %rdx
-	jl	L(zero)
-	addl	$(VEC_SIZE * 2), %eax
-	addq	%rdi, %rax
+	jbe	L(ret_vec_x0_test)
+
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x0)
+
+	/* Don't use rax for pointer here because EVEX has better encoding with
+	   offset % VEC_SIZE == 0.  */
+	vpcmpb	$0, -(VEC_SIZE * 2)(%rdi, %rdx), %VECMATCH, %k0
+	kmovd	%k0, %ecx
+	/* NB: 64-bit lzcnt. This will naturally add 32 to position.  */
+	lzcntq	%rcx, %rcx
+	cmpl	%ecx, %edx
+	jle	L(zero_0)
+	subq	%rcx, %rax
 	ret
 
-	.p2align 4
-L(last_vec_x0):
-	bsrl	%eax, %eax
-	addq	%rdi, %rax
+	/* Inexpensive place to put this regarding code size / target alignments
+	   / ICache NLP. Necessary for 2-byte encoding of jump to page cross
+	   case which in turn is necessary for hot path (len <= VEC_SIZE) to fit
+	   in first cache line.  */
+L(page_cross):
+	movq	%rax, %rsi
+	andq	$-VEC_SIZE, %rsi
+	vpcmpb	$0, (%rsi), %VECMATCH, %k0
+	kmovd	%k0, %r8d
+	/* Shift out negative alignment (because we are starting from endptr and
+	   working backwards).  */
+	movl	%eax, %ecx
+	/* notl because eax already has endptr - 1.  (-x = ~(x - 1)).  */
+	notl	%ecx
+	shlxl	%ecx, %r8d, %ecx
+	cmpq	%rdi, %rsi
+	ja	L(more_1x_vec)
+	lzcntl	%ecx, %ecx
+	cmpl	%ecx, %edx
+	jle	L(zero_1)
+	subq	%rcx, %rax
 	ret
 
-	.p2align 4
-L(last_vec_x1):
-	bsrl	%eax, %eax
-	addl	$VEC_SIZE, %eax
-	addq	%rdi, %rax
+	/* Continue creating zero labels that fit in aligning bytes and get
+	   2-byte encoding / are in the same cache line as condition.  */
+L(zero_1):
+	xorl	%eax, %eax
 	ret
 
-	.p2align 4
-L(last_vec_x2):
-	bsrl	%eax, %eax
-	addl	$(VEC_SIZE * 2), %eax
-	addq	%rdi, %rax
+	.p2align 4,, 8
+L(ret_vec_x1):
+	/* This will naturally add 32 to position.  */
+	bsrl	%ecx, %ecx
+	leaq	-(VEC_SIZE * 2)(%rcx, %rax), %rax
 	ret
 
-	.p2align 4
-L(last_vec_x3):
-	bsrl	%eax, %eax
-	addl	$(VEC_SIZE * 3), %eax
-	addq	%rdi, %rax
-	ret
+	.p2align 4,, 8
+L(more_2x_vec):
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x0_dec)
 
-	.p2align 4
-L(last_vec_x1_check):
-	bsrl	%eax, %eax
-	subq	$(VEC_SIZE * 3), %rdx
-	addq	%rax, %rdx
-	jl	L(zero)
-	addl	$VEC_SIZE, %eax
-	addq	%rdi, %rax
-	ret
+	vpcmpb	$0, -(VEC_SIZE * 2)(%rax), %VECMATCH, %k0
+	kmovd	%k0, %ecx
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x1)
 
-	.p2align 4
-L(last_vec_x3_check):
-	bsrl	%eax, %eax
-	subq	$VEC_SIZE, %rdx
-	addq	%rax, %rdx
-	jl	L(zero)
-	addl	$(VEC_SIZE * 3), %eax
-	addq	%rdi, %rax
-	ret
+	/* Need no matter what.  */
+	vpcmpb	$0, -(VEC_SIZE * 3)(%rax), %VECMATCH, %k0
+	kmovd	%k0, %ecx
 
-	.p2align 4
-L(zero):
-	xorl	%eax, %eax
+	subq	$(VEC_SIZE * 4), %rdx
+	ja	L(more_4x_vec)
+
+	cmpl	$(VEC_SIZE * -1), %edx
+	jle	L(ret_vec_x2_test)
+L(last_vec):
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x2)
+
+
+	/* Need no matter what.  */
+	vpcmpb	$0, -(VEC_SIZE * 4)(%rax), %VECMATCH, %k0
+	kmovd	%k0, %ecx
+	lzcntl	%ecx, %ecx
+	subq	$(VEC_SIZE * 3 + 1), %rax
+	subq	%rcx, %rax
+	cmpq	%rax, %rdi
+	ja	L(zero_1)
 	ret
 
-	.p2align 4
-L(last_vec_or_less_aligned):
-	movl	%edx, %ecx
-
-	vpcmpb	$0, (%rdi), %YMMMATCH, %k1
-
-	movl	$1, %edx
-	/* Support rdx << 32.  */
-	salq	%cl, %rdx
-	subq	$1, %rdx
-
-	kmovd	%k1, %eax
-
-	/* Remove the trailing bytes.  */
-	andl	%edx, %eax
-	testl	%eax, %eax
-	jz	L(zero)
-
-	bsrl	%eax, %eax
-	addq	%rdi, %rax
+	.p2align 4,, 8
+L(ret_vec_x2_test):
+	lzcntl	%ecx, %ecx
+	subq	$(VEC_SIZE * 2 + 1), %rax
+	subq	%rcx, %rax
+	cmpq	%rax, %rdi
+	ja	L(zero_1)
 	ret
 
-	.p2align 4
-L(last_vec_or_less):
-	addl	$VEC_SIZE, %edx
-
-	/* Check for zero length.  */
-	testl	%edx, %edx
-	jz	L(zero)
-
-	movl	%edi, %ecx
-	andl	$(VEC_SIZE - 1), %ecx
-	jz	L(last_vec_or_less_aligned)
-
-	movl	%ecx, %esi
-	movl	%ecx, %r8d
-	addl	%edx, %esi
-	andq	$-VEC_SIZE, %rdi
+	.p2align 4,, 8
+L(ret_vec_x2):
+	bsrl	%ecx, %ecx
+	leaq	-(VEC_SIZE * 3)(%rcx, %rax), %rax
+	ret
 
-	subl	$VEC_SIZE, %esi
-	ja	L(last_vec_2x_aligned)
+	.p2align 4,, 8
+L(ret_vec_x3):
+	bsrl	%ecx, %ecx
+	leaq	-(VEC_SIZE * 4)(%rcx, %rax), %rax
+	ret
 
-	/* Check the last VEC.  */
-	vpcmpb	$0, (%rdi), %YMMMATCH, %k1
-	kmovd	%k1, %eax
+	.p2align 4,, 8
+L(more_4x_vec):
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x2)
 
-	/* Remove the leading and trailing bytes.  */
-	sarl	%cl, %eax
-	movl	%edx, %ecx
+	vpcmpb	$0, -(VEC_SIZE * 4)(%rax), %VECMATCH, %k0
+	kmovd	%k0, %ecx
 
-	movl	$1, %edx
-	sall	%cl, %edx
-	subl	$1, %edx
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x3)
 
-	andl	%edx, %eax
-	testl	%eax, %eax
-	jz	L(zero)
+	/* Check if near end before re-aligning (otherwise might do an
+	   unnecessary loop iteration).  */
+	addq	$-(VEC_SIZE * 4), %rax
+	cmpq	$(VEC_SIZE * 4), %rdx
+	jbe	L(last_4x_vec)
 
-	bsrl	%eax, %eax
-	addq	%rdi, %rax
-	addq	%r8, %rax
-	ret
+	decq	%rax
+	andq	$-(VEC_SIZE * 4), %rax
+	movq	%rdi, %rdx
+	/* Get endptr for loop in rdx. NB: Can't just do while rax > rdi because
+	   lengths that overflow can be valid and break the comparison.  */
+	andq	$-(VEC_SIZE * 4), %rdx
 
 	.p2align 4
-L(last_vec_2x_aligned):
-	movl	%esi, %ecx
-
-	/* Check the last VEC.  */
-	vpcmpb	$0, VEC_SIZE(%rdi), %YMMMATCH, %k1
+L(loop_4x_vec):
+	/* Store 1 were not-equals and 0 where equals in k1 (used to mask later
+	   on).  */
+	vpcmpb	$4, (VEC_SIZE * 3)(%rax), %VECMATCH, %k1
+
+	/* VEC(2/3) will have zero-byte where we found a CHAR.  */
+	vpxorq	(VEC_SIZE * 2)(%rax), %VECMATCH, %VEC(2)
+	vpxorq	(VEC_SIZE * 1)(%rax), %VECMATCH, %VEC(3)
+	vpcmpb	$0, (VEC_SIZE * 0)(%rax), %VECMATCH, %k4
+
+	/* Combine VEC(2/3) with min and maskz with k1 (k1 has zero bit where
+	   CHAR is found and VEC(2/3) have zero-byte where CHAR is found.  */
+	vpminub	%VEC(2), %VEC(3), %VEC(3){%k1}{z}
+	vptestnmb %VEC(3), %VEC(3), %k2
+
+	/* Any 1s and we found CHAR.  */
+	kortestd %k2, %k4
+	jnz	L(loop_end)
+
+	addq	$-(VEC_SIZE * 4), %rax
+	cmpq	%rdx, %rax
+	jne	L(loop_4x_vec)
+
+	/* Need to re-adjust rdx / rax for L(last_4x_vec).  */
+	subq	$-(VEC_SIZE * 4), %rdx
+	movq	%rdx, %rax
+	subl	%edi, %edx
+L(last_4x_vec):
+
+	/* Used no matter what.  */
+	vpcmpb	$0, (VEC_SIZE * -1)(%rax), %VECMATCH, %k0
+	kmovd	%k0, %ecx
 
-	movl	$1, %edx
-	sall	%cl, %edx
-	subl	$1, %edx
+	cmpl	$(VEC_SIZE * 2), %edx
+	jbe	L(last_2x_vec)
 
-	kmovd	%k1, %eax
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x0_dec)
 
-	/* Remove the trailing bytes.  */
-	andl	%edx, %eax
 
-	testl	%eax, %eax
-	jnz	L(last_vec_x1)
+	vpcmpb	$0, (VEC_SIZE * -2)(%rax), %VECMATCH, %k0
+	kmovd	%k0, %ecx
 
-	/* Check the second last VEC.  */
-	vpcmpb	$0, (%rdi), %YMMMATCH, %k1
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x1)
 
-	movl	%r8d, %ecx
+	/* Used no matter what.  */
+	vpcmpb	$0, (VEC_SIZE * -3)(%rax), %VECMATCH, %k0
+	kmovd	%k0, %ecx
 
-	kmovd	%k1, %eax
+	cmpl	$(VEC_SIZE * 3), %edx
+	ja	L(last_vec)
 
-	/* Remove the leading bytes.  Must use unsigned right shift for
-	   bsrl below.  */
-	shrl	%cl, %eax
-	testl	%eax, %eax
-	jz	L(zero)
+	lzcntl	%ecx, %ecx
+	subq	$(VEC_SIZE * 2 + 1), %rax
+	subq	%rcx, %rax
+	cmpq	%rax, %rdi
+	jbe	L(ret_1)
+	xorl	%eax, %eax
+L(ret_1):
+	ret
 
-	bsrl	%eax, %eax
-	addq	%rdi, %rax
-	addq	%r8, %rax
+	.p2align 4,, 6
+L(loop_end):
+	kmovd	%k1, %ecx
+	notl	%ecx
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x0_end)
+
+	vptestnmb %VEC(2), %VEC(2), %k0
+	kmovd	%k0, %ecx
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x1_end)
+
+	kmovd	%k2, %ecx
+	kmovd	%k4, %esi
+	/* Combine last 2 VEC matches. If ecx (VEC3) is zero (no CHAR in VEC3)
+	   then it won't affect the result in esi (VEC4). If ecx is non-zero
+	   then CHAR in VEC3 and bsrq will use that position.  */
+	salq	$32, %rcx
+	orq	%rsi, %rcx
+	bsrq	%rcx, %rcx
+	addq	%rcx, %rax
+	ret
+	.p2align 4,, 4
+L(ret_vec_x0_end):
+	addq	$(VEC_SIZE), %rax
+L(ret_vec_x1_end):
+	bsrl	%ecx, %ecx
+	leaq	(VEC_SIZE * 2)(%rax, %rcx), %rax
 	ret
-END (__memrchr_evex)
+
+END(MEMRCHR)
 #endif