08c3a6
commit 0a2da0111037b1cc214f8f40ca5bdebf36f35cbd
08c3a6
Author: Noah Goldstein <goldstein.w.n@gmail.com>
08c3a6
Date:   Wed Mar 23 16:57:24 2022 -0500
08c3a6
08c3a6
    x86: Optimize strspn in strspn-c.c
08c3a6
    
08c3a6
    Use _mm_cmpeq_epi8 and _mm_movemask_epi8 to get strlen instead of
08c3a6
    _mm_cmpistri. Also change offset to unsigned to avoid unnecessary
08c3a6
    sign extensions.
08c3a6
    
08c3a6
    geometric_mean(N=20) of all benchmarks that don't fall back on
08c3a6
    sse2; New / Original: .901
08c3a6
    
08c3a6
    All string/memory tests pass.
08c3a6
    Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
08c3a6
    
08c3a6
    (cherry picked from commit 412d10343168b05b8cf6c3683457cf9711d28046)
08c3a6
08c3a6
diff --git a/sysdeps/x86_64/multiarch/strspn-c.c b/sysdeps/x86_64/multiarch/strspn-c.c
08c3a6
index a17196296b9ebe52..3bcc479f1b52ff6a 100644
08c3a6
--- a/sysdeps/x86_64/multiarch/strspn-c.c
08c3a6
+++ b/sysdeps/x86_64/multiarch/strspn-c.c
08c3a6
@@ -63,81 +63,73 @@ __strspn_sse42 (const char *s, const char *a)
08c3a6
     return 0;
08c3a6
 
08c3a6
   const char *aligned;
08c3a6
-  __m128i mask;
08c3a6
-  int offset = (int) ((size_t) a & 15);
08c3a6
+  __m128i mask, maskz, zero;
08c3a6
+  unsigned int maskz_bits;
08c3a6
+  unsigned int offset = (int) ((size_t) a & 15);
08c3a6
+  zero = _mm_set1_epi8 (0);
08c3a6
   if (offset != 0)
08c3a6
     {
08c3a6
       /* Load masks.  */
08c3a6
       aligned = (const char *) ((size_t) a & -16L);
08c3a6
       __m128i mask0 = _mm_load_si128 ((__m128i *) aligned);
08c3a6
-
08c3a6
-      mask = __m128i_shift_right (mask0, offset);
08c3a6
+      maskz = _mm_cmpeq_epi8 (mask0, zero);
08c3a6
 
08c3a6
       /* Find where the NULL terminator is.  */
08c3a6
-      int length = _mm_cmpistri (mask, mask, 0x3a);
08c3a6
-      if (length == 16 - offset)
08c3a6
-	{
08c3a6
-	  /* There is no NULL terminator.  */
08c3a6
-	  __m128i mask1 = _mm_load_si128 ((__m128i *) (aligned + 16));
08c3a6
-	  int index = _mm_cmpistri (mask1, mask1, 0x3a);
08c3a6
-	  length += index;
08c3a6
-
08c3a6
-	  /* Don't use SSE4.2 if the length of A > 16.  */
08c3a6
-	  if (length > 16)
08c3a6
-	    return __strspn_sse2 (s, a);
08c3a6
-
08c3a6
-	  if (index != 0)
08c3a6
-	    {
08c3a6
-	      /* Combine mask0 and mask1.  We could play games with
08c3a6
-		 palignr, but frankly this data should be in L1 now
08c3a6
-		 so do the merge via an unaligned load.  */
08c3a6
-	      mask = _mm_loadu_si128 ((__m128i *) a);
08c3a6
-	    }
08c3a6
-	}
08c3a6
+      maskz_bits = _mm_movemask_epi8 (maskz) >> offset;
08c3a6
+      if (maskz_bits != 0)
08c3a6
+        {
08c3a6
+          mask = __m128i_shift_right (mask0, offset);
08c3a6
+          offset = (unsigned int) ((size_t) s & 15);
08c3a6
+          if (offset)
08c3a6
+            goto start_unaligned;
08c3a6
+
08c3a6
+          aligned = s;
08c3a6
+          goto start_loop;
08c3a6
+        }
08c3a6
     }
08c3a6
-  else
08c3a6
-    {
08c3a6
-      /* A is aligned.  */
08c3a6
-      mask = _mm_load_si128 ((__m128i *) a);
08c3a6
 
08c3a6
-      /* Find where the NULL terminator is.  */
08c3a6
-      int length = _mm_cmpistri (mask, mask, 0x3a);
08c3a6
-      if (length == 16)
08c3a6
-	{
08c3a6
-	  /* There is no NULL terminator.  Don't use SSE4.2 if the length
08c3a6
-	     of A > 16.  */
08c3a6
-	  if (a[16] != 0)
08c3a6
-	    return __strspn_sse2 (s, a);
08c3a6
-	}
08c3a6
+  /* A is aligned.  */
08c3a6
+  mask = _mm_loadu_si128 ((__m128i *) a);
08c3a6
+
08c3a6
+  /* Find where the NULL terminator is.  */
08c3a6
+  maskz = _mm_cmpeq_epi8 (mask, zero);
08c3a6
+  maskz_bits = _mm_movemask_epi8 (maskz);
08c3a6
+  if (maskz_bits == 0)
08c3a6
+    {
08c3a6
+      /* There is no NULL terminator.  Don't use SSE4.2 if the length
08c3a6
+         of A > 16.  */
08c3a6
+      if (a[16] != 0)
08c3a6
+        return __strspn_sse2 (s, a);
08c3a6
     }
08c3a6
+  aligned = s;
08c3a6
+  offset = (unsigned int) ((size_t) s & 15);
08c3a6
 
08c3a6
-  offset = (int) ((size_t) s & 15);
08c3a6
   if (offset != 0)
08c3a6
     {
08c3a6
+    start_unaligned:
08c3a6
       /* Check partial string.  */
08c3a6
       aligned = (const char *) ((size_t) s & -16L);
08c3a6
       __m128i value = _mm_load_si128 ((__m128i *) aligned);
08c3a6
+      __m128i adj_value = __m128i_shift_right (value, offset);
08c3a6
 
08c3a6
-      value = __m128i_shift_right (value, offset);
08c3a6
-
08c3a6
-      int length = _mm_cmpistri (mask, value, 0x12);
08c3a6
+      unsigned int length = _mm_cmpistri (mask, adj_value, 0x12);
08c3a6
       /* No need to check CFlag since it is always 1.  */
08c3a6
       if (length < 16 - offset)
08c3a6
 	return length;
08c3a6
       /* Find where the NULL terminator is.  */
08c3a6
-      int index = _mm_cmpistri (value, value, 0x3a);
08c3a6
-      if (index < 16 - offset)
08c3a6
+      maskz = _mm_cmpeq_epi8 (value, zero);
08c3a6
+      maskz_bits = _mm_movemask_epi8 (maskz) >> offset;
08c3a6
+      if (maskz_bits != 0)
08c3a6
 	return length;
08c3a6
       aligned += 16;
08c3a6
     }
08c3a6
-  else
08c3a6
-    aligned = s;
08c3a6
 
08c3a6
+start_loop:
08c3a6
   while (1)
08c3a6
     {
08c3a6
       __m128i value = _mm_load_si128 ((__m128i *) aligned);
08c3a6
-      int index = _mm_cmpistri (mask, value, 0x12);
08c3a6
-      int cflag = _mm_cmpistrc (mask, value, 0x12);
08c3a6
+      unsigned int index = _mm_cmpistri (mask, value, 0x12);
08c3a6
+      unsigned int cflag = _mm_cmpistrc (mask, value, 0x12);
08c3a6
       if (cflag)
08c3a6
 	return (size_t) (aligned + index - s);
08c3a6
       aligned += 16;