513694
From 36926710d4ddab6f7d5fa9559cd5e70ccc95e13a Mon Sep 17 00:00:00 2001
513694
From: Noah Goldstein <goldstein.w.n@gmail.com>
513694
Date: Wed, 23 Mar 2022 16:57:22 -0500
513694
Subject: [PATCH] x86: Optimize strcspn and strpbrk in strcspn-c.c
513694
513694
Use _mm_cmpeq_epi8 and _mm_movemask_epi8 to get strlen instead of
513694
_mm_cmpistri. Also change offset to unsigned to avoid unnecessary
513694
sign extensions.
513694
513694
geometric_mean(N=20) of all benchmarks that don't fall back on
513694
sse2/strlen; New / Original: .928
513694
513694
All string/memory tests pass.
513694
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
513694
513694
(cherry picked from commit 30d627d477d7255345a4b713cf352ac32d644d61)
513694
---
513694
 sysdeps/x86_64/multiarch/strcspn-c.c | 83 +++++++++++++---------------
513694
 1 file changed, 37 insertions(+), 46 deletions(-)
513694
513694
diff --git a/sysdeps/x86_64/multiarch/strcspn-c.c b/sysdeps/x86_64/multiarch/strcspn-c.c
513694
index 857af104..6cce4296 100644
513694
--- a/sysdeps/x86_64/multiarch/strcspn-c.c
513694
+++ b/sysdeps/x86_64/multiarch/strcspn-c.c
513694
@@ -85,83 +85,74 @@ STRCSPN_SSE42 (const char *s, const char *a)
513694
     RETURN (NULL, strlen (s));
513694
 
513694
   const char *aligned;
513694
-  __m128i mask;
513694
-  int offset = (int) ((size_t) a & 15);
513694
+  __m128i mask, maskz, zero;
513694
+  unsigned int maskz_bits;
513694
+  unsigned int offset = (unsigned int) ((size_t) a & 15);
513694
+  zero = _mm_set1_epi8 (0);
513694
   if (offset != 0)
513694
     {
513694
       /* Load masks.  */
513694
       aligned = (const char *) ((size_t) a & -16L);
513694
       __m128i mask0 = _mm_load_si128 ((__m128i *) aligned);
513694
-
513694
-      mask = __m128i_shift_right (mask0, offset);
513694
+      maskz = _mm_cmpeq_epi8 (mask0, zero);
513694
 
513694
       /* Find where the NULL terminator is.  */
513694
-      int length = _mm_cmpistri (mask, mask, 0x3a);
513694
-      if (length == 16 - offset)
513694
-	{
513694
-	  /* There is no NULL terminator.  */
513694
-	  __m128i mask1 = _mm_load_si128 ((__m128i *) (aligned + 16));
513694
-	  int index = _mm_cmpistri (mask1, mask1, 0x3a);
513694
-	  length += index;
513694
-
513694
-	  /* Don't use SSE4.2 if the length of A > 16.  */
513694
-	  if (length > 16)
513694
-	    return STRCSPN_SSE2 (s, a);
513694
-
513694
-	  if (index != 0)
513694
-	    {
513694
-	      /* Combine mask0 and mask1.  We could play games with
513694
-		 palignr, but frankly this data should be in L1 now
513694
-		 so do the merge via an unaligned load.  */
513694
-	      mask = _mm_loadu_si128 ((__m128i *) a);
513694
-	    }
513694
-	}
513694
+      maskz_bits = _mm_movemask_epi8 (maskz) >> offset;
513694
+      if (maskz_bits != 0)
513694
+        {
513694
+          mask = __m128i_shift_right (mask0, offset);
513694
+          offset = (unsigned int) ((size_t) s & 15);
513694
+          if (offset)
513694
+            goto start_unaligned;
513694
+
513694
+          aligned = s;
513694
+          goto start_loop;
513694
+        }
513694
     }
513694
-  else
513694
-    {
513694
-      /* A is aligned.  */
513694
-      mask = _mm_load_si128 ((__m128i *) a);
513694
 
513694
-      /* Find where the NULL terminator is.  */
513694
-      int length = _mm_cmpistri (mask, mask, 0x3a);
513694
-      if (length == 16)
513694
-	{
513694
-	  /* There is no NULL terminator.  Don't use SSE4.2 if the length
513694
-	     of A > 16.  */
513694
-	  if (a[16] != 0)
513694
-	    return STRCSPN_SSE2 (s, a);
513694
-	}
513694
+  /* A is aligned, or its terminator is past the first aligned block.  */
513694
+  mask = _mm_loadu_si128 ((__m128i *) a);
513694
+  /* Find where the NULL terminator is.  */
513694
+  maskz = _mm_cmpeq_epi8 (mask, zero);
513694
+  maskz_bits = _mm_movemask_epi8 (maskz);
513694
+  if (maskz_bits == 0)
513694
+    {
513694
+      /* There is no NULL terminator.  Don't use SSE4.2 if the length
513694
+         of A > 16.  */
513694
+      if (a[16] != 0)
513694
+        return STRCSPN_SSE2 (s, a);
513694
     }
513694
 
513694
-  offset = (int) ((size_t) s & 15);
513694
+  aligned = s;
513694
+  offset = (unsigned int) ((size_t) s & 15);
513694
   if (offset != 0)
513694
     {
513694
+    start_unaligned:
513694
       /* Check partial string.  */
513694
       aligned = (const char *) ((size_t) s & -16L);
513694
       __m128i value = _mm_load_si128 ((__m128i *) aligned);
513694
 
513694
       value = __m128i_shift_right (value, offset);
513694
 
513694
-      int length = _mm_cmpistri (mask, value, 0x2);
513694
+      unsigned int length = _mm_cmpistri (mask, value, 0x2);
513694
       /* No need to check ZFlag since ZFlag is always 1.  */
513694
-      int cflag = _mm_cmpistrc (mask, value, 0x2);
513694
+      unsigned int cflag = _mm_cmpistrc (mask, value, 0x2);
513694
       if (cflag)
513694
 	RETURN ((char *) (s + length), length);
513694
       /* Find where the NULL terminator is.  */
513694
-      int index = _mm_cmpistri (value, value, 0x3a);
513694
+      unsigned int index = _mm_cmpistri (value, value, 0x3a);
513694
       if (index < 16 - offset)
513694
 	RETURN (NULL, index);
513694
       aligned += 16;
513694
     }
513694
-  else
513694
-    aligned = s;
513694
 
513694
+start_loop:
513694
   while (1)
513694
     {
513694
       __m128i value = _mm_load_si128 ((__m128i *) aligned);
513694
-      int index = _mm_cmpistri (mask, value, 0x2);
513694
-      int cflag = _mm_cmpistrc (mask, value, 0x2);
513694
-      int zflag = _mm_cmpistrz (mask, value, 0x2);
513694
+      unsigned int index = _mm_cmpistri (mask, value, 0x2);
513694
+      unsigned int cflag = _mm_cmpistrc (mask, value, 0x2);
513694
+      unsigned int zflag = _mm_cmpistrz (mask, value, 0x2);
513694
       if (cflag)
513694
 	RETURN ((char *) (aligned + index), (size_t) (aligned + index - s));
513694
       if (zflag)
513694
-- 
513694
GitLab
513694