190885
From c17aa053d5b26520fddad8bfb590b521cb027280 Mon Sep 17 00:00:00 2001
190885
From: Noah Goldstein <goldstein.w.n@gmail.com>
190885
Date: Fri, 23 Apr 2021 15:56:25 -0400
190885
Subject: [PATCH] x86: Optimize strchr-evex.S
190885
190885
No bug. This commit optimizes strchr-evex.S. The optimizations are
190885
mostly small things such as saving an ALU in the alignment process,
190885
saving a few instructions in the loop return. The one significant
190885
change is saving 2 instructions in the 4x loop. test-strchr,
190885
test-strchrnul, test-wcschr, and test-wcschrnul are all passing.
190885
190885
Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
190885
(cherry picked from commit 7f3e7c262cab4e2401e4331a6ef29c428de02044)
190885
---
190885
 sysdeps/x86_64/multiarch/strchr-evex.S | 392 ++++++++++++++-----------
190885
 1 file changed, 218 insertions(+), 174 deletions(-)
190885
190885
diff --git a/sysdeps/x86_64/multiarch/strchr-evex.S b/sysdeps/x86_64/multiarch/strchr-evex.S
190885
index ddc86a70..7f9d4ee4 100644
190885
--- a/sysdeps/x86_64/multiarch/strchr-evex.S
190885
+++ b/sysdeps/x86_64/multiarch/strchr-evex.S
190885
@@ -32,13 +32,15 @@
190885
 #  define VPCMP		vpcmpd
190885
 #  define VPMINU	vpminud
190885
 #  define CHAR_REG	esi
190885
-#  define SHIFT_REG	r8d
190885
+#  define SHIFT_REG	ecx
190885
+#  define CHAR_SIZE	4
190885
 # else
190885
 #  define VPBROADCAST	vpbroadcastb
190885
 #  define VPCMP		vpcmpb
190885
 #  define VPMINU	vpminub
190885
 #  define CHAR_REG	sil
190885
-#  define SHIFT_REG	ecx
190885
+#  define SHIFT_REG	edx
190885
+#  define CHAR_SIZE	1
190885
 # endif
190885
 
190885
 # define XMMZERO	xmm16
190885
@@ -56,23 +58,20 @@
190885
 
190885
 # define VEC_SIZE 32
190885
 # define PAGE_SIZE 4096
190885
+# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
190885
 
190885
 	.section .text.evex,"ax",@progbits
190885
 ENTRY (STRCHR)
190885
-	movl	%edi, %ecx
190885
-# ifndef USE_AS_STRCHRNUL
190885
-	xorl	%edx, %edx
190885
-# endif
190885
-
190885
 	/* Broadcast CHAR to YMM0.	*/
190885
-	VPBROADCAST %esi, %YMM0
190885
-
190885
+	VPBROADCAST	%esi, %YMM0
190885
+	movl	%edi, %eax
190885
+	andl	$(PAGE_SIZE - 1), %eax
190885
 	vpxorq	%XMMZERO, %XMMZERO, %XMMZERO
190885
 
190885
-	/* Check if we cross page boundary with one vector load.  */
190885
-	andl	$(PAGE_SIZE - 1), %ecx
190885
-	cmpl	$(PAGE_SIZE - VEC_SIZE), %ecx
190885
-	ja  L(cross_page_boundary)
190885
+	/* Check if we cross page boundary with one vector load.
190885
+	   Otherwise it is safe to use an unaligned load.  */
190885
+	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
190885
+	ja	L(cross_page_boundary)
190885
 
190885
 	/* Check the first VEC_SIZE bytes. Search for both CHAR and the
190885
 	   null bytes.  */
190885
@@ -83,251 +82,296 @@ ENTRY (STRCHR)
190885
 	VPMINU	%YMM2, %YMM1, %YMM2
190885
 	/* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
190885
 	VPCMP	$0, %YMMZERO, %YMM2, %k0
190885
-	ktestd	%k0, %k0
190885
-	jz	L(more_vecs)
190885
 	kmovd	%k0, %eax
190885
+	testl	%eax, %eax
190885
+	jz	L(aligned_more)
190885
 	tzcntl	%eax, %eax
190885
-	/* Found CHAR or the null byte.	 */
190885
 # ifdef USE_AS_WCSCHR
190885
-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
190885
-	leaq	(%rdi, %rax, 4), %rax
190885
+	/* NB: Multiply wchar_t count by 4 to get the number of bytes.
190885
+	 */
190885
+	leaq	(%rdi, %rax, CHAR_SIZE), %rax
190885
 # else
190885
 	addq	%rdi, %rax
190885
 # endif
190885
 # ifndef USE_AS_STRCHRNUL
190885
-	cmp (%rax), %CHAR_REG
190885
-	cmovne	%rdx, %rax
190885
+	/* Found CHAR or the null byte.	 */
190885
+	cmp	(%rax), %CHAR_REG
190885
+	jne	L(zero)
190885
 # endif
190885
 	ret
190885
 
190885
-	.p2align 4
190885
-L(more_vecs):
190885
-	/* Align data for aligned loads in the loop.  */
190885
-	andq	$-VEC_SIZE, %rdi
190885
-L(aligned_more):
190885
-
190885
-	/* Check the next 4 * VEC_SIZE.	 Only one VEC_SIZE at a time
190885
-	   since data is only aligned to VEC_SIZE.	*/
190885
-	VMOVA	VEC_SIZE(%rdi), %YMM1
190885
-	addq	$VEC_SIZE, %rdi
190885
-
190885
-	/* Leaves only CHARS matching esi as 0.  */
190885
-	vpxorq	%YMM1, %YMM0, %YMM2
190885
-	VPMINU	%YMM2, %YMM1, %YMM2
190885
-	/* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
190885
-	VPCMP	$0, %YMMZERO, %YMM2, %k0
190885
-	kmovd	%k0, %eax
190885
-	testl	%eax, %eax
190885
-	jnz	L(first_vec_x0)
190885
-
190885
-	VMOVA	VEC_SIZE(%rdi), %YMM1
190885
-	/* Leaves only CHARS matching esi as 0.  */
190885
-	vpxorq	%YMM1, %YMM0, %YMM2
190885
-	VPMINU	%YMM2, %YMM1, %YMM2
190885
-	/* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
190885
-	VPCMP	$0, %YMMZERO, %YMM2, %k0
190885
-	kmovd	%k0, %eax
190885
-	testl	%eax, %eax
190885
-	jnz	L(first_vec_x1)
190885
-
190885
-	VMOVA	(VEC_SIZE * 2)(%rdi), %YMM1
190885
-	/* Leaves only CHARS matching esi as 0.  */
190885
-	vpxorq	%YMM1, %YMM0, %YMM2
190885
-	VPMINU	%YMM2, %YMM1, %YMM2
190885
-	/* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
190885
-	VPCMP	$0, %YMMZERO, %YMM2, %k0
190885
-	kmovd	%k0, %eax
190885
-	testl	%eax, %eax
190885
-	jnz	L(first_vec_x2)
190885
-
190885
-	VMOVA	(VEC_SIZE * 3)(%rdi), %YMM1
190885
-	/* Leaves only CHARS matching esi as 0.  */
190885
-	vpxorq	%YMM1, %YMM0, %YMM2
190885
-	VPMINU	%YMM2, %YMM1, %YMM2
190885
-	/* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
190885
-	VPCMP	$0, %YMMZERO, %YMM2, %k0
190885
-	ktestd	%k0, %k0
190885
-	jz	L(prep_loop_4x)
190885
-
190885
-	kmovd	%k0, %eax
190885
+	/* .p2align 5 helps keep performance more consistent if ENTRY()
190885
+	   alignment % 32 was either 16 or 0. This also makes the
190885
+	   alignment % 32 of the loop_4x_vec fixed which makes tuning it
190885
+	   easier.  */
190885
+	.p2align 5
190885
+L(first_vec_x3):
190885
 	tzcntl	%eax, %eax
190885
+# ifndef USE_AS_STRCHRNUL
190885
 	/* Found CHAR or the null byte.	 */
190885
-# ifdef USE_AS_WCSCHR
190885
-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
190885
-	leaq	(VEC_SIZE * 3)(%rdi, %rax, 4), %rax
190885
-# else
190885
-	leaq	(VEC_SIZE * 3)(%rdi, %rax), %rax
190885
+	cmp	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %CHAR_REG
190885
+	jne	L(zero)
190885
 # endif
190885
+	/* NB: Multiply sizeof char type (1 or 4) to get the number of
190885
+	   bytes.  */
190885
+	leaq	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
190885
+	ret
190885
+
190885
 # ifndef USE_AS_STRCHRNUL
190885
-	cmp (%rax), %CHAR_REG
190885
-	cmovne	%rdx, %rax
190885
-# endif
190885
+L(zero):
190885
+	xorl	%eax, %eax
190885
 	ret
190885
+# endif
190885
 
190885
 	.p2align 4
190885
-L(first_vec_x0):
190885
+L(first_vec_x4):
190885
+# ifndef USE_AS_STRCHRNUL
190885
+	/* Check to see if first match was CHAR (k0) or null (k1).  */
190885
+	kmovd	%k0, %eax
190885
 	tzcntl	%eax, %eax
190885
-	/* Found CHAR or the null byte.	 */
190885
-# ifdef USE_AS_WCSCHR
190885
-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
190885
-	leaq	(%rdi, %rax, 4), %rax
190885
+	kmovd	%k1, %ecx
190885
+	/* bzhil will not be 0 if first match was null.  */
190885
+	bzhil	%eax, %ecx, %ecx
190885
+	jne	L(zero)
190885
 # else
190885
-	addq	%rdi, %rax
190885
-# endif
190885
-# ifndef USE_AS_STRCHRNUL
190885
-	cmp (%rax), %CHAR_REG
190885
-	cmovne	%rdx, %rax
190885
+	/* Combine CHAR and null matches.  */
190885
+	kord	%k0, %k1, %k0
190885
+	kmovd	%k0, %eax
190885
+	tzcntl	%eax, %eax
190885
 # endif
190885
+	/* NB: Multiply sizeof char type (1 or 4) to get the number of
190885
+	   bytes.  */
190885
+	leaq	(VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
190885
 	ret
190885
 
190885
 	.p2align 4
190885
 L(first_vec_x1):
190885
 	tzcntl	%eax, %eax
190885
-	/* Found CHAR or the null byte.	 */
190885
-# ifdef USE_AS_WCSCHR
190885
-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
190885
-	leaq	VEC_SIZE(%rdi, %rax, 4), %rax
190885
-# else
190885
-	leaq	VEC_SIZE(%rdi, %rax), %rax
190885
-# endif
190885
 # ifndef USE_AS_STRCHRNUL
190885
-	cmp (%rax), %CHAR_REG
190885
-	cmovne	%rdx, %rax
190885
+	/* Found CHAR or the null byte.	 */
190885
+	cmp	(VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %CHAR_REG
190885
+	jne	L(zero)
190885
+
190885
 # endif
190885
+	/* NB: Multiply sizeof char type (1 or 4) to get the number of
190885
+	   bytes.  */
190885
+	leaq	(VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax
190885
 	ret
190885
 
190885
 	.p2align 4
190885
 L(first_vec_x2):
190885
+# ifndef USE_AS_STRCHRNUL
190885
+	/* Check to see if first match was CHAR (k0) or null (k1).  */
190885
+	kmovd	%k0, %eax
190885
 	tzcntl	%eax, %eax
190885
-	/* Found CHAR or the null byte.	 */
190885
-# ifdef USE_AS_WCSCHR
190885
-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
190885
-	leaq	(VEC_SIZE * 2)(%rdi, %rax, 4), %rax
190885
+	kmovd	%k1, %ecx
190885
+	/* bzhil will not be 0 if first match was null.  */
190885
+	bzhil	%eax, %ecx, %ecx
190885
+	jne	L(zero)
190885
 # else
190885
-	leaq	(VEC_SIZE * 2)(%rdi, %rax), %rax
190885
-# endif
190885
-# ifndef USE_AS_STRCHRNUL
190885
-	cmp (%rax), %CHAR_REG
190885
-	cmovne	%rdx, %rax
190885
+	/* Combine CHAR and null matches.  */
190885
+	kord	%k0, %k1, %k0
190885
+	kmovd	%k0, %eax
190885
+	tzcntl	%eax, %eax
190885
 # endif
190885
+	/* NB: Multiply sizeof char type (1 or 4) to get the number of
190885
+	   bytes.  */
190885
+	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
190885
 	ret
190885
 
190885
-L(prep_loop_4x):
190885
-	/* Align data to 4 * VEC_SIZE.	*/
190885
+	.p2align 4
190885
+L(aligned_more):
190885
+	/* Align data to VEC_SIZE.  */
190885
+	andq	$-VEC_SIZE, %rdi
190885
+L(cross_page_continue):
190885
+	/* Check the next 4 * VEC_SIZE. Only one VEC_SIZE at a time since
190885
+	   data is only aligned to VEC_SIZE. Use two alternating methods
190885
+	   for checking VEC to balance latency and port contention.  */
190885
+
190885
+	/* This method has higher latency but has better port
190885
+	   distribution.  */
190885
+	VMOVA	(VEC_SIZE)(%rdi), %YMM1
190885
+	/* Leaves only CHARS matching esi as 0.  */
190885
+	vpxorq	%YMM1, %YMM0, %YMM2
190885
+	VPMINU	%YMM2, %YMM1, %YMM2
190885
+	/* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
190885
+	VPCMP	$0, %YMMZERO, %YMM2, %k0
190885
+	kmovd	%k0, %eax
190885
+	testl	%eax, %eax
190885
+	jnz	L(first_vec_x1)
190885
+
190885
+	/* This method has lower latency but has worse port
190885
+	   distribution.  */
190885
+	VMOVA	(VEC_SIZE * 2)(%rdi), %YMM1
190885
+	/* Each bit in K0 represents a CHAR in YMM1.  */
190885
+	VPCMP	$0, %YMM1, %YMM0, %k0
190885
+	/* Each bit in K1 represents a null CHAR in YMM1.  */
190885
+	VPCMP	$0, %YMM1, %YMMZERO, %k1
190885
+	kortestd	%k0, %k1
190885
+	jnz	L(first_vec_x2)
190885
+
190885
+	VMOVA	(VEC_SIZE * 3)(%rdi), %YMM1
190885
+	/* Leaves only CHARS matching esi as 0.  */
190885
+	vpxorq	%YMM1, %YMM0, %YMM2
190885
+	VPMINU	%YMM2, %YMM1, %YMM2
190885
+	/* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
190885
+	VPCMP	$0, %YMMZERO, %YMM2, %k0
190885
+	kmovd	%k0, %eax
190885
+	testl	%eax, %eax
190885
+	jnz	L(first_vec_x3)
190885
+
190885
+	VMOVA	(VEC_SIZE * 4)(%rdi), %YMM1
190885
+	/* Each bit in K0 represents a CHAR in YMM1.  */
190885
+	VPCMP	$0, %YMM1, %YMM0, %k0
190885
+	/* Each bit in K1 represents a null CHAR in YMM1.  */
190885
+	VPCMP	$0, %YMM1, %YMMZERO, %k1
190885
+	kortestd	%k0, %k1
190885
+	jnz	L(first_vec_x4)
190885
+
190885
+	/* Align data to VEC_SIZE * 4 for the loop.  */
190885
+	addq	$VEC_SIZE, %rdi
190885
 	andq	$-(VEC_SIZE * 4), %rdi
190885
 
190885
 	.p2align 4
190885
 L(loop_4x_vec):
190885
-	/* Compare 4 * VEC at a time forward.  */
190885
+	/* Check 4x VEC at a time. No penalty to imm32 offset with evex
190885
+	   encoding.  */
190885
 	VMOVA	(VEC_SIZE * 4)(%rdi), %YMM1
190885
 	VMOVA	(VEC_SIZE * 5)(%rdi), %YMM2
190885
 	VMOVA	(VEC_SIZE * 6)(%rdi), %YMM3
190885
 	VMOVA	(VEC_SIZE * 7)(%rdi), %YMM4
190885
 
190885
-	/* Leaves only CHARS matching esi as 0.  */
190885
+	/* For YMM1 and YMM3 use xor to set the CHARs matching esi to
190885
+	   zero.  */
190885
 	vpxorq	%YMM1, %YMM0, %YMM5
190885
-	vpxorq	%YMM2, %YMM0, %YMM6
190885
+	/* For YMM2 and YMM4 cmp not equals to CHAR and store result in
190885
+	   k register. It's possible to save either 1 or 2 instructions
190885
+	   using cmp not equals method for either YMM1 or YMM1 and YMM3
190885
+	   respectively but bottleneck on p5 makes it not worth it.  */
190885
+	VPCMP	$4, %YMM0, %YMM2, %k2
190885
 	vpxorq	%YMM3, %YMM0, %YMM7
190885
-	vpxorq	%YMM4, %YMM0, %YMM8
190885
-
190885
-	VPMINU	%YMM5, %YMM1, %YMM5
190885
-	VPMINU	%YMM6, %YMM2, %YMM6
190885
-	VPMINU	%YMM7, %YMM3, %YMM7
190885
-	VPMINU	%YMM8, %YMM4, %YMM8
190885
-
190885
-	VPMINU	%YMM5, %YMM6, %YMM1
190885
-	VPMINU	%YMM7, %YMM8, %YMM2
190885
-
190885
-	VPMINU	%YMM1, %YMM2, %YMM1
190885
-
190885
-	/* Each bit in K0 represents a CHAR or a null byte.  */
190885
-	VPCMP	$0, %YMMZERO, %YMM1, %k0
190885
-
190885
-	addq	$(VEC_SIZE * 4), %rdi
190885
-
190885
-	ktestd	%k0, %k0
190885
+	VPCMP	$4, %YMM0, %YMM4, %k4
190885
+
190885
+	/* Use min to select all zeros (from either xor or end of string).
190885
+	 */
190885
+	VPMINU	%YMM1, %YMM5, %YMM1
190885
+	VPMINU	%YMM3, %YMM7, %YMM3
190885
+
190885
+	/* Use min + zeromask to select for zeros. k2 and k4 have 0 at
190885
+	   positions that matched CHAR, so the zero-masking sets the
190885
+	   corresponding destination elements in YMM2 / YMM4 to zero.
190885
+	 */
190885
+	VPMINU	%YMM1, %YMM2, %YMM2{%k2}{z}
190885
+	VPMINU	%YMM3, %YMM4, %YMM4
190885
+	VPMINU	%YMM2, %YMM4, %YMM4{%k4}{z}
190885
+
190885
+	VPCMP	$0, %YMMZERO, %YMM4, %k1
190885
+	kmovd	%k1, %ecx
190885
+	subq	$-(VEC_SIZE * 4), %rdi
190885
+	testl	%ecx, %ecx
190885
 	jz	L(loop_4x_vec)
190885
 
190885
-	/* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
190885
-	VPCMP	$0, %YMMZERO, %YMM5, %k0
190885
+	VPCMP	$0, %YMMZERO, %YMM1, %k0
190885
 	kmovd	%k0, %eax
190885
 	testl	%eax, %eax
190885
-	jnz	L(first_vec_x0)
190885
+	jnz	L(last_vec_x1)
190885
 
190885
-	/* Each bit in K1 represents a CHAR or a null byte in YMM2.  */
190885
-	VPCMP	$0, %YMMZERO, %YMM6, %k1
190885
-	kmovd	%k1, %eax
190885
+	VPCMP	$0, %YMMZERO, %YMM2, %k0
190885
+	kmovd	%k0, %eax
190885
 	testl	%eax, %eax
190885
-	jnz	L(first_vec_x1)
190885
-
190885
-	/* Each bit in K2 represents a CHAR or a null byte in YMM3.  */
190885
-	VPCMP	$0, %YMMZERO, %YMM7, %k2
190885
-	/* Each bit in K3 represents a CHAR or a null byte in YMM4.  */
190885
-	VPCMP	$0, %YMMZERO, %YMM8, %k3
190885
+	jnz	L(last_vec_x2)
190885
 
190885
+	VPCMP	$0, %YMMZERO, %YMM3, %k0
190885
+	kmovd	%k0, %eax
190885
+	/* Combine YMM3 matches (eax) with YMM4 matches (ecx).  */
190885
 # ifdef USE_AS_WCSCHR
190885
-	/* NB: Each bit in K2/K3 represents 4-byte element.  */
190885
-	kshiftlw $8, %k3, %k1
190885
+	sall	$8, %ecx
190885
+	orl	%ecx, %eax
190885
+	tzcntl	%eax, %eax
190885
 # else
190885
-	kshiftlq $32, %k3, %k1
190885
+	salq	$32, %rcx
190885
+	orq	%rcx, %rax
190885
+	tzcntq	%rax, %rax
190885
 # endif
190885
+# ifndef USE_AS_STRCHRNUL
190885
+	/* Check if match was CHAR or null.  */
190885
+	cmp	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %CHAR_REG
190885
+	jne	L(zero_end)
190885
+# endif
190885
+	/* NB: Multiply sizeof char type (1 or 4) to get the number of
190885
+	   bytes.  */
190885
+	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
190885
+	ret
190885
 
190885
-	/* Each bit in K1 represents a NULL or a mismatch.  */
190885
-	korq	%k1, %k2, %k1
190885
-	kmovq	%k1, %rax
190885
+# ifndef USE_AS_STRCHRNUL
190885
+L(zero_end):
190885
+	xorl	%eax, %eax
190885
+	ret
190885
+# endif
190885
 
190885
-	tzcntq  %rax, %rax
190885
-# ifdef USE_AS_WCSCHR
190885
-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
190885
-	leaq	(VEC_SIZE * 2)(%rdi, %rax, 4), %rax
190885
-# else
190885
-	leaq	(VEC_SIZE * 2)(%rdi, %rax), %rax
190885
+	.p2align 4
190885
+L(last_vec_x1):
190885
+	tzcntl	%eax, %eax
190885
+# ifndef USE_AS_STRCHRNUL
190885
+	/* Check if match was null.  */
190885
+	cmp	(%rdi, %rax, CHAR_SIZE), %CHAR_REG
190885
+	jne	L(zero_end)
190885
 # endif
190885
+	/* NB: Multiply sizeof char type (1 or 4) to get the number of
190885
+	   bytes.  */
190885
+	leaq	(%rdi, %rax, CHAR_SIZE), %rax
190885
+	ret
190885
+
190885
+	.p2align 4
190885
+L(last_vec_x2):
190885
+	tzcntl	%eax, %eax
190885
 # ifndef USE_AS_STRCHRNUL
190885
-	cmp (%rax), %CHAR_REG
190885
-	cmovne	%rdx, %rax
190885
+	/* Check if match was null.  */
190885
+	cmp	(VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %CHAR_REG
190885
+	jne	L(zero_end)
190885
 # endif
190885
+	/* NB: Multiply sizeof char type (1 or 4) to get the number of
190885
+	   bytes.  */
190885
+	leaq	(VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax
190885
 	ret
190885
 
190885
 	/* Cold case for crossing page with first load.	 */
190885
 	.p2align 4
190885
 L(cross_page_boundary):
190885
+	movq	%rdi, %rdx
190885
+	/* Align rdi.  */
190885
 	andq	$-VEC_SIZE, %rdi
190885
-	andl	$(VEC_SIZE - 1), %ecx
190885
-
190885
 	VMOVA	(%rdi), %YMM1
190885
-
190885
 	/* Leaves only CHARS matching esi as 0.  */
190885
 	vpxorq	%YMM1, %YMM0, %YMM2
190885
 	VPMINU	%YMM2, %YMM1, %YMM2
190885
 	/* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
190885
 	VPCMP	$0, %YMMZERO, %YMM2, %k0
190885
 	kmovd	%k0, %eax
190885
-	testl	%eax, %eax
190885
-
190885
+	/* Remove the leading bits.	 */
190885
 # ifdef USE_AS_WCSCHR
190885
+	movl	%edx, %SHIFT_REG
190885
 	/* NB: Divide shift count by 4 since each bit in K1 represent 4
190885
 	   bytes.  */
190885
-	movl	%ecx, %SHIFT_REG
190885
-	sarl    $2, %SHIFT_REG
190885
+	sarl	$2, %SHIFT_REG
190885
+	andl	$(CHAR_PER_VEC - 1), %SHIFT_REG
190885
 # endif
190885
-
190885
-	/* Remove the leading bits.	 */
190885
 	sarxl	%SHIFT_REG, %eax, %eax
190885
+	/* If eax is zero continue.  */
190885
 	testl	%eax, %eax
190885
-
190885
-	jz	L(aligned_more)
190885
+	jz	L(cross_page_continue)
190885
 	tzcntl	%eax, %eax
190885
-	addq	%rcx, %rdi
190885
+# ifndef USE_AS_STRCHRNUL
190885
+	/* Check to see if match was CHAR or null.  */
190885
+	cmp	(%rdx, %rax, CHAR_SIZE), %CHAR_REG
190885
+	jne	L(zero_end)
190885
+# endif
190885
 # ifdef USE_AS_WCSCHR
190885
-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
190885
-	leaq	(%rdi, %rax, 4), %rax
190885
+	/* NB: Multiply wchar_t count by 4 to get the number of
190885
+	   bytes.  */
190885
+	leaq	(%rdx, %rax, CHAR_SIZE), %rax
190885
 # else
190885
-	addq	%rdi, %rax
190885
-# endif
190885
-# ifndef USE_AS_STRCHRNUL
190885
-	cmp (%rax), %CHAR_REG
190885
-	cmovne	%rdx, %rax
190885
+	addq	%rdx, %rax
190885
 # endif
190885
 	ret
190885
 
190885
-- 
190885
GitLab
190885