commit a910d7e164f1d9b8e77bbea35a2d2ab89a5e26cc
Author: Noah Goldstein <goldstein.w.n@gmail.com>
Date:   Mon Jun 6 21:11:33 2022 -0700

    x86: Shrink code size of memchr-avx2.S

    This is not meant as a performance optimization. The previous code was
    far too liberal in aligning targets and wasted code size unnecessarily.

    The total code size saving is 59 bytes.

    There are no major changes in the benchmarks.
    Geometric Mean of all benchmarks New / Old: 0.967

    Full xcheck passes on x86_64.
    Reviewed-by: H.J. Lu <hjl.tools@gmail.com>

    (cherry picked from commit 6dcbb7d95dded20153b12d76d2f4e0ef0cda4f35)

    x86: Fix page cross case in rawmemchr-avx2 [BZ #29234]

    commit 6dcbb7d95dded20153b12d76d2f4e0ef0cda4f35
    Author: Noah Goldstein <goldstein.w.n@gmail.com>
    Date:   Mon Jun 6 21:11:33 2022 -0700

        x86: Shrink code size of memchr-avx2.S

    The commit above changed how the page cross case aligned the string
    (rdi) in rawmemchr. This was incompatible with how
    `L(cross_page_continue)` expected the pointer to be aligned and
    would cause rawmemchr to read data starting before the beginning of
    the string. What it read was in valid memory but could contain CHAR
    matches, resulting in an incorrect return value.

    This commit fixes that issue by essentially reverting the changes to
    the L(page_cross) case, as they didn't really matter.

    Test cases were added; all pass with the new code (and were
    confirmed to fail with the old code).
    Reviewed-by: H.J. Lu <hjl.tools@gmail.com>

    (cherry picked from commit 2c9af8421d2b4a7fcce163e7bc81a118d22fd346)
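
For readers who want the failure mode in isolation, here is a minimal
standalone sketch of the scenario the new do_test_bz29234 test below
exercises, using plain mmap and rawmemchr instead of the glibc test
harness (the layout and constants are illustrative, not part of the
commit):

    #define _GNU_SOURCE
    #include <assert.h>
    #include <string.h>
    #include <sys/mman.h>

    int
    main (void)
    {
      /* Two anonymous pages; the string starts 8 bytes before the
         second page, mirroring ptr_start in the new test.  */
      char *buf = mmap (0, 8192, PROT_READ | PROT_WRITE,
                        MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
      assert (buf != MAP_FAILED);
      memset (buf, 0xff, 8192);

      char *s = buf + 4096 - 8;
      /* Matches in valid memory *before* the start of the string.  A
         page cross path that mis-aligns rdi can see these bytes and
         return a pointer below s.  */
      memset (s - 8, 0x1, 8);
      s[100] = 0x1;   /* The only in-range match.  */

      char *r = rawmemchr (s, 0x1);
      assert (r == s + 100);   /* Fails with the broken page cross case.  */

      munmap (buf, 8192);
      return 0;
    }
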
diff --git a/string/test-rawmemchr.c b/string/test-rawmemchr.c
index 085098aba8fdbc13..327c0654e69e7669 100644
--- a/string/test-rawmemchr.c
+++ b/string/test-rawmemchr.c
@@ -18,6 +18,7 @@
    <https://www.gnu.org/licenses/>.  */
 
 #include <assert.h>
+#include <support/xunistd.h>
 
 #define TEST_MAIN
 #define TEST_NAME "rawmemchr"
@@ -51,13 +52,45 @@ do_one_test (impl_t *impl, const char *s, int c, char *exp_res)
     }
 }
 
+static void
+do_test_bz29234 (void)
+{
+  size_t i, j;
+  char *ptr_start;
+  char *buf = xmmap (0, 8192, PROT_READ | PROT_WRITE,
+		     MAP_PRIVATE | MAP_ANONYMOUS, -1);
+
+  memset (buf, -1, 8192);
+
+  ptr_start = buf + 4096 - 8;
+
+  /* Out of range matches before the start of a page. */
+  memset (ptr_start - 8, 0x1, 8);
+
+  for (j = 0; j < 8; ++j)
+    {
+      for (i = 0; i < 128; ++i)
+	{
+	  ptr_start[i + j] = 0x1;
+
+	  FOR_EACH_IMPL (impl, 0)
+	    do_one_test (impl, (char *) (ptr_start + j), 0x1,
+			 ptr_start + i + j);
+
+	  ptr_start[i + j] = 0xff;
+	}
+    }
+
+  xmunmap (buf, 8192);
+}
+
 static void
 do_test (size_t align, size_t pos, size_t len, int seek_char)
 {
   size_t i;
   char *result;
 
-  align &= 7;
+  align &= getpagesize () - 1;
   if (align + len >= page_size)
     return;
 
@@ -115,6 +148,13 @@ do_random_tests (void)
 	    }
 	}
 
+      if (align)
+	{
+	  p[align - 1] = seek_char;
+	  if (align > 4)
+	    p[align - 4] = seek_char;
+	}
+
       assert (pos < len);
       size_t r = random ();
       if ((r & 31) == 0)
@@ -130,6 +170,13 @@ do_random_tests (void)
 		   result, p);
 	    ret = 1;
 	  }
+
+      if (align)
+	{
+	  p[align - 1] = seek_char;
+	  if (align > 4)
+	    p[align - 4] = seek_char;
+	}
     }
 }
 
@@ -151,14 +198,22 @@ test_main (void)
       do_test (i, 64, 256, 23);
       do_test (0, 16 << i, 2048, 0);
       do_test (i, 64, 256, 0);
+
+      do_test (getpagesize () - i, 64, 256, 23);
+      do_test (getpagesize () - i, 64, 256, 0);
     }
   for (i = 1; i < 32; ++i)
     {
       do_test (0, i, i + 1, 23);
       do_test (0, i, i + 1, 0);
+
+      do_test (getpagesize () - 7, i, i + 1, 23);
+      do_test (getpagesize () - i / 2, i, i + 1, 23);
+      do_test (getpagesize () - i, i, i + 1, 23);
     }
 
   do_random_tests ();
+  do_test_bz29234 ();
   return ret;
 }
 
diff --git a/sysdeps/x86_64/multiarch/memchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/memchr-avx2-rtm.S
index 87b076c7c403ba85..c4d71938c5a3ed24 100644
--- a/sysdeps/x86_64/multiarch/memchr-avx2-rtm.S
+++ b/sysdeps/x86_64/multiarch/memchr-avx2-rtm.S
@@ -2,6 +2,7 @@
 # define MEMCHR __memchr_avx2_rtm
 #endif
 
+#define COND_VZEROUPPER	COND_VZEROUPPER_XTEST
 #define ZERO_UPPER_VEC_REGISTERS_RETURN \
   ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
 
diff --git a/sysdeps/x86_64/multiarch/memchr-avx2.S b/sysdeps/x86_64/multiarch/memchr-avx2.S
index afdb95650232fdac..9e0b7dd1f4fe9909 100644
--- a/sysdeps/x86_64/multiarch/memchr-avx2.S
+++ b/sysdeps/x86_64/multiarch/memchr-avx2.S
@@ -57,7 +57,7 @@
 # define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)
 
 	.section SECTION(.text),"ax",@progbits
-ENTRY (MEMCHR)
+ENTRY_P2ALIGN (MEMCHR, 5)
 # ifndef USE_AS_RAWMEMCHR
 	/* Check for zero length.  */
 #  ifdef __ILP32__
@@ -87,12 +87,14 @@ ENTRY (MEMCHR)
 # endif
 	testl	%eax, %eax
 	jz	L(aligned_more)
-	tzcntl	%eax, %eax
+	bsfl	%eax, %eax
 	addq	%rdi, %rax
-	VZEROUPPER_RETURN
+L(return_vzeroupper):
+	ZERO_UPPER_VEC_REGISTERS_RETURN
+
 
 # ifndef USE_AS_RAWMEMCHR
-	.p2align 5
+	.p2align 4
 L(first_vec_x0):
 	/* Check if first match was before length.  */
 	tzcntl	%eax, %eax
@@ -100,58 +102,31 @@ L(first_vec_x0):
 	/* NB: Multiply length by 4 to get byte count.  */
 	sall	$2, %edx
 #  endif
-	xorl	%ecx, %ecx
+    COND_VZEROUPPER
+	/* Use branch instead of cmovcc so L(first_vec_x0) fits in one fetch
+	   block. branch here as opposed to cmovcc is not that costly. Common
+	   usage of memchr is to check if the return was NULL (if string was
+	   known to contain CHAR user would use rawmemchr). This branch will be
+	   highly correlated with the user branch and can be used by most
+	   modern branch predictors to predict the user branch.  */
 	cmpl	%eax, %edx
-	leaq	(%rdi, %rax), %rax
-	cmovle	%rcx, %rax
-	VZEROUPPER_RETURN
-
-L(null):
-	xorl	%eax, %eax
-	ret
-# endif
-	.p2align 4
-L(cross_page_boundary):
-	/* Save pointer before aligning as its original value is
-	   necessary for computer return address if byte is found or
-	   adjusting length if it is not and this is memchr.  */
-	movq	%rdi, %rcx
-	/* Align data to VEC_SIZE - 1. ALGN_PTR_REG is rcx for memchr
-	   and rdi for rawmemchr.  */
-	orq	$(VEC_SIZE - 1), %ALGN_PTR_REG
-	VPCMPEQ	-(VEC_SIZE - 1)(%ALGN_PTR_REG), %ymm0, %ymm1
-	vpmovmskb %ymm1, %eax
-# ifndef USE_AS_RAWMEMCHR
-	/* Calculate length until end of page (length checked for a
-	   match).  */
-	leaq	1(%ALGN_PTR_REG), %rsi
-	subq	%RRAW_PTR_REG, %rsi
-#  ifdef USE_AS_WMEMCHR
-	/* NB: Divide bytes by 4 to get wchar_t count.  */
-	shrl	$2, %esi
-#  endif
-# endif
-	/* Remove the leading bytes.  */
-	sarxl	%ERAW_PTR_REG, %eax, %eax
-# ifndef USE_AS_RAWMEMCHR
-	/* Check the end of data.  */
-	cmpq	%rsi, %rdx
-	jbe	L(first_vec_x0)
+    jle  L(null)
+	addq	%rdi, %rax
+    ret
 # endif
-	testl	%eax, %eax
-	jz	L(cross_page_continue)
-	tzcntl	%eax, %eax
-	addq	%RRAW_PTR_REG, %rax
-L(return_vzeroupper):
-	ZERO_UPPER_VEC_REGISTERS_RETURN
 
-	.p2align 4
+	.p2align 4,, 10
 L(first_vec_x1):
-	tzcntl	%eax, %eax
+	bsfl	%eax, %eax
 	incq	%rdi
 	addq	%rdi, %rax
 	VZEROUPPER_RETURN
-
+# ifndef USE_AS_RAWMEMCHR
+	/* First in aligning bytes here.  */
+L(null):
+	xorl	%eax, %eax
+	ret
+# endif
 	.p2align 4
 L(first_vec_x2):
 	tzcntl	%eax, %eax
@@ -340,7 +315,7 @@ L(first_vec_x1_check):
 	incq	%rdi
 	addq	%rdi, %rax
 	VZEROUPPER_RETURN
-	.p2align 4
+	.p2align 4,, 6
 L(set_zero_end):
 	xorl	%eax, %eax
 	VZEROUPPER_RETURN
@@ -428,5 +403,39 @@ L(last_vec_x3):
 	VZEROUPPER_RETURN
 # endif
 
+	.p2align 4
+L(cross_page_boundary):
+	/* Save pointer before aligning as its original value is necessary for
+	   computer return address if byte is found or adjusting length if it
+	   is not and this is memchr.  */
+	movq	%rdi, %rcx
+	/* Align data to VEC_SIZE - 1. ALGN_PTR_REG is rcx for memchr
+	   and rdi for rawmemchr.  */
+	orq	$(VEC_SIZE - 1), %ALGN_PTR_REG
+	VPCMPEQ	-(VEC_SIZE - 1)(%ALGN_PTR_REG), %ymm0, %ymm1
+	vpmovmskb %ymm1, %eax
+# ifndef USE_AS_RAWMEMCHR
+	/* Calculate length until end of page (length checked for a match).  */
+	leaq	1(%ALGN_PTR_REG), %rsi
+	subq	%RRAW_PTR_REG, %rsi
+#  ifdef USE_AS_WMEMCHR
+	/* NB: Divide bytes by 4 to get wchar_t count.  */
+	shrl	$2, %esi
+#  endif
+# endif
+	/* Remove the leading bytes.  */
+	sarxl	%ERAW_PTR_REG, %eax, %eax
+# ifndef USE_AS_RAWMEMCHR
+	/* Check the end of data.  */
+	cmpq	%rsi, %rdx
+	jbe	L(first_vec_x0)
+# endif
+	testl	%eax, %eax
+	jz	L(cross_page_continue)
+	bsfl	%eax, %eax
+	addq	%RRAW_PTR_REG, %rax
+	VZEROUPPER_RETURN
+
+
 END (MEMCHR)
 #endif
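
Two notes on the hunks above (background illustration, not part of the
commit):

The tzcntl -> bsfl changes each save a byte: bsf lacks tzcnt's 0xF3
prefix, and the two give the same answer whenever the source is known
to be nonzero, which holds on these paths because they are only
reached after a match has been found.

The relocated L(cross_page_boundary) block rounds the pointer up with
orq $(VEC_SIZE - 1) so that the VEC_SIZE-byte load ending at that
address is the aligned block containing the start of the string; an
aligned block never crosses a page, which is also why the stray reads
in BZ #29234 stayed in valid memory. It then shifts the vpmovmskb mask
right by the pointer's low bits to drop matches from bytes before the
string. A rough C model of the mask math, assuming VEC_SIZE == 32
(page_cross_mask is a made-up name for this sketch):

    #include <stdint.h>

    /* raw_mask has one bit per byte of the aligned 32-byte block that
       contains ptr (bit 0 = lowest address), as produced by vpmovmskb
       on the load from (ptr | 31) - 31.  */
    static uint32_t
    page_cross_mask (uintptr_t ptr, uint32_t raw_mask)
    {
      /* sarxl shifts by the count mod 32, i.e. ptr's low 5 bits, so
         the bits for the ptr & 31 bytes before the string fall off
         the bottom.  A logical shift is equivalent here: the low bits
         that survive are identical either way.  */
      return raw_mask >> (ptr & 31);
    }

The BZ #29234 regression came from aligning rdi differently in this
path for rawmemchr than L(cross_page_continue) expected; restoring the
original alignment, as the hunk above does, is the fix.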