513694
From cdcf8794677acba1fc38ac101bcf52deee23d91d Mon Sep 17 00:00:00 2001
513694
From: Noah Goldstein <goldstein.w.n@gmail.com>
513694
Date: Wed, 23 Mar 2022 16:57:24 -0500
513694
Subject: [PATCH] x86: Optimize strspn in strspn-c.c
513694
513694
Use _mm_cmpeq_epi8 and _mm_movemask_epi8 to get strlen instead of
513694
_mm_cmpistri. Also change offset to unsigned to avoid unnecessary
513694
sign extensions.
513694
513694
geometric_mean(N=20) of all benchmarks that don't fall back on
513694
sse2; New / Original: .901
513694
513694
All string/memory tests pass.
513694
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
513694
513694
(cherry picked from commit 412d10343168b05b8cf6c3683457cf9711d28046)
513694
---
513694
 sysdeps/x86_64/multiarch/strspn-c.c | 86 +++++++++++++----------------
513694
 1 file changed, 39 insertions(+), 47 deletions(-)
513694
513694
diff --git a/sysdeps/x86_64/multiarch/strspn-c.c b/sysdeps/x86_64/multiarch/strspn-c.c
513694
index 4554cff0..87c5e4bf 100644
513694
--- a/sysdeps/x86_64/multiarch/strspn-c.c
513694
+++ b/sysdeps/x86_64/multiarch/strspn-c.c
513694
@@ -63,81 +63,73 @@ __strspn_sse42 (const char *s, const char *a)
513694
     return 0;
513694
 
513694
   const char *aligned;
513694
-  __m128i mask;
513694
-  int offset = (int) ((size_t) a & 15);
513694
+  __m128i mask, maskz, zero;
513694
+  unsigned int maskz_bits;
513694
+  unsigned int offset = (int) ((size_t) a & 15);
513694
+  zero = _mm_set1_epi8 (0);
513694
   if (offset != 0)
513694
     {
513694
       /* Load masks.  */
513694
       aligned = (const char *) ((size_t) a & -16L);
513694
       __m128i mask0 = _mm_load_si128 ((__m128i *) aligned);
513694
-
513694
-      mask = __m128i_shift_right (mask0, offset);
513694
+      maskz = _mm_cmpeq_epi8 (mask0, zero);
513694
 
513694
       /* Find where the NULL terminator is.  */
513694
-      int length = _mm_cmpistri (mask, mask, 0x3a);
513694
-      if (length == 16 - offset)
513694
-	{
513694
-	  /* There is no NULL terminator.  */
513694
-	  __m128i mask1 = _mm_load_si128 ((__m128i *) (aligned + 16));
513694
-	  int index = _mm_cmpistri (mask1, mask1, 0x3a);
513694
-	  length += index;
513694
-
513694
-	  /* Don't use SSE4.2 if the length of A > 16.  */
513694
-	  if (length > 16)
513694
-	    return __strspn_sse2 (s, a);
513694
-
513694
-	  if (index != 0)
513694
-	    {
513694
-	      /* Combine mask0 and mask1.  We could play games with
513694
-		 palignr, but frankly this data should be in L1 now
513694
-		 so do the merge via an unaligned load.  */
513694
-	      mask = _mm_loadu_si128 ((__m128i *) a);
513694
-	    }
513694
-	}
513694
+      maskz_bits = _mm_movemask_epi8 (maskz) >> offset;
513694
+      if (maskz_bits != 0)
513694
+        {
513694
+          mask = __m128i_shift_right (mask0, offset);
513694
+          offset = (unsigned int) ((size_t) s & 15);
513694
+          if (offset)
513694
+            goto start_unaligned;
513694
+
513694
+          aligned = s;
513694
+          goto start_loop;
513694
+        }
513694
     }
513694
-  else
513694
-    {
513694
-      /* A is aligned.  */
513694
-      mask = _mm_load_si128 ((__m128i *) a);
513694
 
513694
-      /* Find where the NULL terminator is.  */
513694
-      int length = _mm_cmpistri (mask, mask, 0x3a);
513694
-      if (length == 16)
513694
-	{
513694
-	  /* There is no NULL terminator.  Don't use SSE4.2 if the length
513694
-	     of A > 16.  */
513694
-	  if (a[16] != 0)
513694
-	    return __strspn_sse2 (s, a);
513694
-	}
513694
+  /* A is aligned, or no NUL follows it within its 16-byte block.  */
513694
+  mask = _mm_loadu_si128 ((__m128i *) a);
513694
+
513694
+  /* Find where the NULL terminator is.  */
513694
+  maskz = _mm_cmpeq_epi8 (mask, zero);
513694
+  maskz_bits = _mm_movemask_epi8 (maskz);
513694
+  if (maskz_bits == 0)
513694
+    {
513694
+      /* There is no NULL terminator.  Don't use SSE4.2 if the length
513694
+         of A > 16.  */
513694
+      if (a[16] != 0)
513694
+        return __strspn_sse2 (s, a);
513694
     }
513694
+  aligned = s;
513694
+  offset = (unsigned int) ((size_t) s & 15);
513694
 
513694
-  offset = (int) ((size_t) s & 15);
513694
   if (offset != 0)
513694
     {
513694
+    start_unaligned:
513694
       /* Check partial string.  */
513694
       aligned = (const char *) ((size_t) s & -16L);
513694
       __m128i value = _mm_load_si128 ((__m128i *) aligned);
513694
+      __m128i adj_value = __m128i_shift_right (value, offset);
513694
 
513694
-      value = __m128i_shift_right (value, offset);
513694
-
513694
-      int length = _mm_cmpistri (mask, value, 0x12);
513694
+      unsigned int length = _mm_cmpistri (mask, adj_value, 0x12);
513694
       /* No need to check CFlag since it is always 1.  */
513694
       if (length < 16 - offset)
513694
 	return length;
513694
       /* Find where the NULL terminator is.  */
513694
-      int index = _mm_cmpistri (value, value, 0x3a);
513694
-      if (index < 16 - offset)
513694
+      maskz = _mm_cmpeq_epi8 (value, zero);
513694
+      maskz_bits = _mm_movemask_epi8 (maskz) >> offset;
513694
+      if (maskz_bits != 0)
513694
 	return length;
513694
       aligned += 16;
513694
     }
513694
-  else
513694
-    aligned = s;
513694
 
513694
+start_loop:
513694
   while (1)
513694
     {
513694
       __m128i value = _mm_load_si128 ((__m128i *) aligned);
513694
-      int index = _mm_cmpistri (mask, value, 0x12);
513694
-      int cflag = _mm_cmpistrc (mask, value, 0x12);
513694
+      unsigned int index = _mm_cmpistri (mask, value, 0x12);
513694
+      unsigned int cflag = _mm_cmpistrc (mask, value, 0x12);
513694
       if (cflag)
513694
 	return (size_t) (aligned + index - s);
513694
       aligned += 16;
513694
-- 
513694
GitLab
513694