commit 3c87383a20daff9a230439e31b778716bfed4d8b
Author: Noah Goldstein <goldstein.w.n@gmail.com>
Date:   Mon Jun 6 21:11:34 2022 -0700

    x86: Shrink code size of memchr-evex.S
    
    This is not meant as a performance optimization. The previous code was
    far too liberal in aligning targets and wasted code size unnecessarily.
    
    The total code size saving is: 64 bytes

    There are no non-negligible changes in the benchmarks.
    Geometric Mean of all benchmarks New / Old: 1.000

    Full xcheck passes on x86_64.
    Reviewed-by: H.J. Lu <hjl.tools@gmail.com>

    (cherry picked from commit 56da3fe1dd075285fa8186d44b3c28e68c687e62)
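
The mechanics of the saving, for reference: glibc's x86 ENTRY macro aligns a
function entry point (to 16 bytes by default), while ENTRY_P2ALIGN lets a file
request a specific power-of-two alignment; here 2^6 = 64 bytes, a full cache
line. The .p2align variants used in the diff below differ only in how much NOP
padding they are allowed to emit; a short GNU as sketch (illustration only,
not part of the patch):

	.p2align 5	/* Always align to 2^5 = 32 bytes; may emit up
			   to 31 bytes of NOP padding.  */
	.p2align 4	/* Always align to 2^4 = 16 bytes; up to 15
			   bytes of padding.  */
	.p2align 4,, 10	/* Align to 16 bytes only if that costs at most
			   10 bytes of padding; otherwise emit nothing.
			   This bounds the code-size cost of alignment.  */
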
diff --git a/sysdeps/x86_64/multiarch/memchr-evex.S b/sysdeps/x86_64/multiarch/memchr-evex.S
index 4d0ed6d136f099e1..68381c99a4948134 100644
--- a/sysdeps/x86_64/multiarch/memchr-evex.S
+++ b/sysdeps/x86_64/multiarch/memchr-evex.S
@@ -88,7 +88,7 @@
 # define PAGE_SIZE 4096
 
 	.section SECTION(.text),"ax",@progbits
-ENTRY (MEMCHR)
+ENTRY_P2ALIGN (MEMCHR, 6)
 # ifndef USE_AS_RAWMEMCHR
 	/* Check for zero length.  */
 	test	%RDX_LP, %RDX_LP
@@ -131,22 +131,24 @@ L(zero):
 	xorl	%eax, %eax
 	ret
 
-	.p2align 5
+	.p2align 4
 L(first_vec_x0):
-	/* Check if first match was before length.  */
-	tzcntl	%eax, %eax
-	xorl	%ecx, %ecx
-	cmpl	%eax, %edx
-	leaq	(%rdi, %rax, CHAR_SIZE), %rax
-	cmovle	%rcx, %rax
+	/* Check if first match was before length. NB: tzcnt has false data-
+	   dependency on destination. eax already had a data-dependency on esi
+	   so this should have no affect here.  */
+	tzcntl	%eax, %esi
+#  ifdef USE_AS_WMEMCHR
+	leaq	(%rdi, %rsi, CHAR_SIZE), %rdi
+#  else
+	addq	%rsi, %rdi
+#  endif
+	xorl	%eax, %eax
+	cmpl	%esi, %edx
+	cmovg	%rdi, %rax
 	ret
-# else
-	/* NB: first_vec_x0 is 17 bytes which will leave
-	   cross_page_boundary (which is relatively cold) close enough
-	   to ideal alignment. So only realign L(cross_page_boundary) if
-	   rawmemchr.  */
-	.p2align 4
 # endif
+
+	.p2align 4
 L(cross_page_boundary):
 	/* Save pointer before aligning as its original value is
 	   necessary for computer return address if byte is found or
@@ -400,10 +402,14 @@ L(last_2x_vec):
 L(zero_end):
 	ret
 
+L(set_zero_end):
+	xorl	%eax, %eax
+	ret
 
 	.p2align 4
 L(first_vec_x1_check):
-	tzcntl	%eax, %eax
+	/* eax must be non-zero. Use bsfl to save code size.  */
+	bsfl	%eax, %eax
 	/* Adjust length.  */
 	subl	$-(CHAR_PER_VEC * 4), %edx
 	/* Check if match within remaining length.  */
@@ -412,9 +418,6 @@ L(first_vec_x1_check):
 	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
 	leaq	VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax
 	ret
-L(set_zero_end):
-	xorl	%eax, %eax
-	ret
 
 	.p2align 4
 L(loop_4x_vec_end):
@@ -464,7 +467,7 @@ L(loop_4x_vec_end):
 # endif
 	ret
 
-	.p2align 4
+	.p2align 4,, 10
 L(last_vec_x1_return):
 	tzcntl	%eax, %eax
 # if defined USE_AS_WMEMCHR || RET_OFFSET != 0
@@ -496,6 +499,7 @@ L(last_vec_x3_return):
 # endif
 
 # ifndef USE_AS_RAWMEMCHR
+	.p2align 4,, 5
 L(last_4x_vec_or_less_cmpeq):
 	VPCMP	$0, (VEC_SIZE * 5)(%rdi), %YMMMATCH, %k0
 	kmovd	%k0, %eax
@@ -546,7 +550,7 @@ L(last_4x_vec):
 #  endif
 	andl	%ecx, %eax
 	jz	L(zero_end2)
-	tzcntl	%eax, %eax
+	bsfl	%eax, %eax
 	leaq	(VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
 L(zero_end2):
 	ret
@@ -562,6 +566,6 @@ L(last_vec_x3):
 	leaq	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
 	ret
 # endif
-
+	/* 7 bytes from next cache line.  */
 END (MEMCHR)
 #endif
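
A note on the tzcntl -> bsfl substitutions above: when the source is known to
be non-zero, bsf and tzcnt both produce the index of the lowest set bit, so
they are interchangeable at those sites (they differ only for a zero source).
The byte saved per site comes from the encoding, as tzcnt is an F3 (REP)
prefixed bsf. Sketch of the two encodings (illustration only, not part of the
patch):

	bsfl	%ecx, %eax	/* 0f bc c1    -- 3 bytes.  */
	tzcntl	%ecx, %eax	/* f3 0f bc c1 -- 4 bytes; executes as
				   bsf on CPUs without BMI1.  */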
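
On the destination change in L(first_vec_x0): as the new comment in the diff
says, on a number of Intel cores tzcnt (like lzcnt and popcnt) has a false
dependency on its destination register, so the instruction also waits for the
old value of the destination even though the result never uses it. Writing the
result into a register the surrounding code already depends on, as the patch
does with %esi, makes that false dependency free. A minimal sketch
(illustration only; register roles assumed, not taken from the patch):

	/* On affected cores this tzcnt also waits for the last write
	   to %eax, which may be an unrelated long-latency op.  */
	tzcntl	%ecx, %eax
	/* Here the extra (false) input is %esi, which the code is
	   already waiting on, so no latency is added.  */
	tzcntl	%ecx, %esi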