076f82
commit 0a2da0111037b1cc214f8f40ca5bdebf36f35cbd
076f82
Author: Noah Goldstein <goldstein.w.n@gmail.com>
076f82
Date:   Wed Mar 23 16:57:24 2022 -0500
076f82
076f82
    x86: Optimize strspn in strspn-c.c
076f82
    
076f82
    Use _mm_cmpeq_epi8 and _mm_movemask_epi8 to get strlen instead of
076f82
    _mm_cmpistri. Also change offset to unsigned to avoid unnecessary
076f82
    sign extensions.
076f82
    
076f82
    geometric_mean(N=20) of all benchmarks that don't fall back on
076f82
    sse2; New / Original: .901
076f82
    
076f82
    All string/memory tests pass.
076f82
    Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
076f82
    
076f82
    (cherry picked from commit 412d10343168b05b8cf6c3683457cf9711d28046)
076f82
076f82
diff --git a/sysdeps/x86_64/multiarch/strspn-c.c b/sysdeps/x86_64/multiarch/strspn-c.c
076f82
index a17196296b9ebe52..3bcc479f1b52ff6a 100644
076f82
--- a/sysdeps/x86_64/multiarch/strspn-c.c
076f82
+++ b/sysdeps/x86_64/multiarch/strspn-c.c
076f82
@@ -63,81 +63,73 @@ __strspn_sse42 (const char *s, const char *a)
076f82
     return 0;
076f82
 
076f82
   const char *aligned;
076f82
-  __m128i mask;
076f82
-  int offset = (int) ((size_t) a & 15);
076f82
+  __m128i mask, maskz, zero;
076f82
+  unsigned int maskz_bits;
076f82
+  unsigned int offset = (int) ((size_t) a & 15);
076f82
+  zero = _mm_set1_epi8 (0);
076f82
   if (offset != 0)
076f82
     {
076f82
       /* Load masks.  */
076f82
       aligned = (const char *) ((size_t) a & -16L);
076f82
       __m128i mask0 = _mm_load_si128 ((__m128i *) aligned);
076f82
-
076f82
-      mask = __m128i_shift_right (mask0, offset);
076f82
+      maskz = _mm_cmpeq_epi8 (mask0, zero);
076f82
 
076f82
       /* Find where the NULL terminator is.  */
076f82
-      int length = _mm_cmpistri (mask, mask, 0x3a);
076f82
-      if (length == 16 - offset)
076f82
-	{
076f82
-	  /* There is no NULL terminator.  */
076f82
-	  __m128i mask1 = _mm_load_si128 ((__m128i *) (aligned + 16));
076f82
-	  int index = _mm_cmpistri (mask1, mask1, 0x3a);
076f82
-	  length += index;
076f82
-
076f82
-	  /* Don't use SSE4.2 if the length of A > 16.  */
076f82
-	  if (length > 16)
076f82
-	    return __strspn_sse2 (s, a);
076f82
-
076f82
-	  if (index != 0)
076f82
-	    {
076f82
-	      /* Combine mask0 and mask1.  We could play games with
076f82
-		 palignr, but frankly this data should be in L1 now
076f82
-		 so do the merge via an unaligned load.  */
076f82
-	      mask = _mm_loadu_si128 ((__m128i *) a);
076f82
-	    }
076f82
-	}
076f82
+      maskz_bits = _mm_movemask_epi8 (maskz) >> offset;
076f82
+      if (maskz_bits != 0)
076f82
+        {
076f82
+          mask = __m128i_shift_right (mask0, offset);
076f82
+          offset = (unsigned int) ((size_t) s & 15);
076f82
+          if (offset)
076f82
+            goto start_unaligned;
076f82
+
076f82
+          aligned = s;
076f82
+          goto start_loop;
076f82
+        }
076f82
     }
076f82
-  else
076f82
-    {
076f82
-      /* A is aligned.  */
076f82
-      mask = _mm_load_si128 ((__m128i *) a);
076f82
 
076f82
-      /* Find where the NULL terminator is.  */
076f82
-      int length = _mm_cmpistri (mask, mask, 0x3a);
076f82
-      if (length == 16)
076f82
-	{
076f82
-	  /* There is no NULL terminator.  Don't use SSE4.2 if the length
076f82
-	     of A > 16.  */
076f82
-	  if (a[16] != 0)
076f82
-	    return __strspn_sse2 (s, a);
076f82
-	}
076f82
+  /* A is aligned, or no NUL lies between A and the next 16-byte boundary.  */
076f82
+  mask = _mm_loadu_si128 ((__m128i *) a);
076f82
+
076f82
+  /* Find where the NULL terminator is.  */
076f82
+  maskz = _mm_cmpeq_epi8 (mask, zero);
076f82
+  maskz_bits = _mm_movemask_epi8 (maskz);
076f82
+  if (maskz_bits == 0)
076f82
+    {
076f82
+      /* There is no NULL terminator.  Don't use SSE4.2 if the length
076f82
+         of A > 16.  */
076f82
+      if (a[16] != 0)
076f82
+        return __strspn_sse2 (s, a);
076f82
     }
076f82
+  aligned = s;
076f82
+  offset = (unsigned int) ((size_t) s & 15);
076f82
 
076f82
-  offset = (int) ((size_t) s & 15);
076f82
   if (offset != 0)
076f82
     {
076f82
+    start_unaligned:
076f82
       /* Check partial string.  */
076f82
       aligned = (const char *) ((size_t) s & -16L);
076f82
       __m128i value = _mm_load_si128 ((__m128i *) aligned);
076f82
+      __m128i adj_value = __m128i_shift_right (value, offset);
076f82
 
076f82
-      value = __m128i_shift_right (value, offset);
076f82
-
076f82
-      int length = _mm_cmpistri (mask, value, 0x12);
076f82
+      unsigned int length = _mm_cmpistri (mask, adj_value, 0x12);
076f82
       /* No need to check CFlag since it is always 1.  */
076f82
       if (length < 16 - offset)
076f82
 	return length;
076f82
       /* Find where the NULL terminator is.  */
076f82
-      int index = _mm_cmpistri (value, value, 0x3a);
076f82
-      if (index < 16 - offset)
076f82
+      maskz = _mm_cmpeq_epi8 (value, zero);
076f82
+      maskz_bits = _mm_movemask_epi8 (maskz) >> offset;
076f82
+      if (maskz_bits != 0)
076f82
 	return length;
076f82
       aligned += 16;
076f82
     }
076f82
-  else
076f82
-    aligned = s;
076f82
 
076f82
+start_loop:
076f82
   while (1)
076f82
     {
076f82
       __m128i value = _mm_load_si128 ((__m128i *) aligned);
076f82
-      int index = _mm_cmpistri (mask, value, 0x12);
076f82
-      int cflag = _mm_cmpistrc (mask, value, 0x12);
076f82
+      unsigned int index = _mm_cmpistri (mask, value, 0x12);
076f82
+      unsigned int cflag = _mm_cmpistrc (mask, value, 0x12);
076f82
       if (cflag)
076f82
 	return (size_t) (aligned + index - s);
076f82
       aligned += 16;